// srch/crawl.js

const cheerio = require("cheerio");
const db = require("./db");
const queue = require("./queue");
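// "./db" and "./queue" are local modules: db presumably wraps sqlite3 and exposes
// both callback (get, run) and promise (promiseGet) helpers, while queue() presumably
// resolves when the crawler is allowed to make its next request (rate limiting).
// createTables() sets up the pages table used below.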
db.createTables();
if (!process.argv[2]) {
    // No start URL given: resume from the most recently stored page.
    db.get("SELECT id, url FROM pages ORDER BY id DESC LIMIT 1", (err, row) => {
        if (err || !row) {
            console.log("[START FAIL] No stored pages to resume from; pass a URL as the first argument.");
            return;
        }
        crawl(row.url, true);
    });
} else {
    crawl(process.argv[2]);
}
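
// Fetch a page, store its title and text, then follow every link on it.
// ignoreExisting is used by the resume path above to re-crawl the newest stored page.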
async function crawl(url, ignoreExisting = false) {
    // Skip URLs that are already stored, unless the caller asks to re-crawl.
    if (!ignoreExisting && await db.promiseGet("SELECT url FROM pages WHERE url = ?", url))
        return;
    // Wait for a slot from the rate limiter before fetching.
    await queue();
    try {
        const res = await fetch(url);
        console.log(`[${res.status} ${res.statusText}] ${url}`);
        if (res.status === 200) {
            const html = await res.text();
            const $ = cheerio.load(html);
            const title = $("title").text();
            // Collapse whitespace so the stored page text is one searchable line.
            const text = $.text().replace(/\s+/g, " ").trim();
            console.log(`[TITLE] ${title} (${url})`);
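            // Store the URL, title, and extracted text in the pages table.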
            db.run(`INSERT INTO pages(url, title, text)
                    VALUES(?, ?, ?)`, url, title, text, (err) => {
                if (err) {
                    console.log(`[INSERT FAIL: ${err}] ${url}|${title}`);
                } else {
                    console.log(`[INSERT] ${url}|${title}`);
                }
            });
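            // Follow every link on the page; crawl() itself dedupes and rate-limits,
            // so the recursion here is fire-and-forget rather than awaited.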
            for (const anchor of $("a").toArray()) {
                const href = $(anchor).attr("href");
                if (!href) continue;
                // Resolve relative links against the current page; skip hrefs that
                // are not valid URLs instead of aborting the whole loop.
                try { crawl(new URL(href, url).href); } catch {}
            }
        }
    } catch (e) {
        console.log(`[FETCH FAIL] ${url}: ${e}`);
    }
}