webamp/packages/skin-database/tasks/integrityCheck.ts
2025-07-09 21:05:54 -07:00

163 lines
4.7 KiB
TypeScript

import { knex } from "../db";
import { TWEET_SNOWFLAKE_REGEX } from "../utils";
function isNotGeneratedFile(file) {
switch (file.source) {
case "derivative":
case "metadata":
return false;
}
switch (file.format) {
case "Metadata":
case "Item Tile":
case "JPEG Thumb":
return false;
}
return true;
}
export async function checkInternetArchiveMetadata(): Promise<void> {
const results = await knex.raw(
'SELECT skin_md5, identifier, metadata FROM ia_items WHERE metadata != "";'
);
const tooMany: string[] = [];
const tooFew: string[] = [];
const missingSkin: string[] = [];
for (const item of results) {
const { identifier, metadata, skin_md5 } = item;
try {
const allFiles = JSON.parse(metadata).files;
const files = allFiles.filter(isNotGeneratedFile);
if (files.length > 2) {
tooMany.push(skin_md5);
continue;
console.warn("Too many files", { files, identifier, skin_md5 });
}
const skinFile = files.find((file) => file.md5 === skin_md5);
if (skinFile == null) {
missingSkin.push(skin_md5);
continue;
console.warn("No skin file", { identifier, skin_md5 });
}
if (files.length < 2) {
console.log({ skin_md5, identifier, length: files.length });
tooFew.push(skin_md5);
continue;
console.warn("Too few files", { identifier, skin_md5 });
}
} catch (_e) {
console.log(metadata);
}
}
console.table({
total: results.length,
tooMany: tooMany.length,
tooFew: tooFew.length,
missingSkin: missingSkin.length,
});
}
export async function integrityCheck(): Promise<void> {
await noDuplicateTweetIds();
await noDuplicateTweetMd5s();
await skinsWithoutFiles();
await filesWithoutSkins();
await reviewsWithoutSkins();
await tweetsHaveIdIfTheyHaveURl();
// There are still some
// await noDuplicateIaItems();
// There are still some
// await tweetsHaveIds();
// Not needed regularly
// await findHashesForTweets();
console.log("Done.");
}
async function skinsWithoutFiles(): Promise<void> {
console.log("Checking for skins without files...");
const result = await knex("skins")
.leftJoin("files", "skins.md5", "=", "files.skin_md5")
.where("files.skin_md5", null)
.select(["md5"]);
if (result.length > 0) {
console.warn(`Found ${result.length} skins without files`);
} else {
console.log("None found.");
}
}
async function filesWithoutSkins(): Promise<void> {
console.log("Checking for files without skins...");
const result = await knex("files")
.leftJoin("skins", "skins.md5", "=", "files.skin_md5")
.where("skins.md5", null)
.select(["files.skin_md5"]);
if (result.length > 0) {
console.warn(`Found ${result.length} files without skins`);
} else {
console.log("None found.");
}
}
async function reviewsWithoutSkins(): Promise<void> {
console.log("Checking for reviews without skins...");
const result = await knex("skin_reviews")
.leftJoin("skins", "skins.md5", "=", "skin_reviews.skin_md5")
.where("skins.md5", null)
.select(["skin_reviews.skin_md5"]);
if (result.length > 0) {
console.warn(`Found ${result.length} reviews without skins`);
} else {
console.log("None found.");
}
}
async function noDuplicateTweetIds(): Promise<void> {
console.log("Checking for duplicate Tweet ids...");
const result = await knex.raw(
"SELECT tweet_id, COUNT(*) c FROM tweets GROUP BY tweet_id HAVING c > 1;"
);
if (result.length > 0) {
console.warn(`Found ${result.length} tweets with duplicate tweet_id.`);
} else {
console.log("None found.");
}
}
async function noDuplicateTweetMd5s(): Promise<void> {
console.log("Checking for duplicate Tweet md5s...");
const result = await knex.raw(
"SELECT skin_md5, COUNT(*) c FROM tweets GROUP BY skin_md5 HAVING c > 1;"
);
if (result.length > 0) {
console.warn(`Found ${result.length} tweets with duplicate skin_md5.`);
} else {
console.log("None found.");
}
}
async function tweetsHaveIdIfTheyHaveURl(): Promise<void> {
console.log("Checking for Tweets with URLs but no ids...");
const results = await knex.raw(
"SELECT * FROM tweets WHERE tweet_id IS NULL AND url IS NOT NULL;"
);
if (results.length > 0) {
for (const row of results) {
const { id, url } = row;
const idMatch = url.match(TWEET_SNOWFLAKE_REGEX);
if (idMatch == null) {
throw new Error(`Can't extract id from tweet URL: ${url}`);
}
await knex("tweets").where({ id }).update({ tweet_id: idMatch[0] }, []);
console.log(`Updated tweet with url ${url}`);
}
} else {
console.log("None found.");
}
}