2024-03-10 01:33:08 +13:00
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const db = require("./db");
|
|
|
|
const queue = require("./queue");
|
|
|
|
|
|
|
|
// Make sure the schema exists before anything reads or writes `pages`.
db.createTables();

// Entry point: with no URL argument, resume from the most recently stored
// page; otherwise start crawling at the URL given on the command line.
if (!process.argv[2]) {
  db.get("SELECT id, url FROM pages ORDER BY id DESC LIMIT 1", (err, row) => {
    if (err) {
      console.error(`[RESUME FAIL: ${err}] Could not look up the last crawled page`);
      process.exitCode = 1;
      return;
    }

    // No row means there is nothing stored yet, so there is nothing to resume.
    // NOTE(review): assumes db.get passes a falsy `row` for an empty result,
    // as node-sqlite3 does — confirm against ./db.
    if (!row) {
      console.error("[RESUME FAIL] No pages stored yet; pass a start URL as the first argument");
      process.exitCode = 1;
      return;
    }

    // Re-crawl the last stored page even though it already exists, so that
    // its links are followed again.
    crawl(row.url, true).catch((e) => {
      console.error(`[CRAWL FAIL: ${e}] ${row.url}`);
      process.exitCode = 1;
    });
  });
} else {
  crawl(process.argv[2]).catch((e) => {
    console.error(`[CRAWL FAIL: ${e}] ${process.argv[2]}`);
    process.exitCode = 1;
  });
}
|
|
|
|
|
|
|
|
/**
 * Resolve an anchor's `href` against the page it was found on.
 *
 * @param {string|undefined} href - Raw `href` attribute value; not a string
 *   when the anchor has no `href` attribute.
 * @param {string} base - URL of the page that contains the anchor.
 * @returns {string|null} Absolute http(s) URL, or `null` when the link should
 *   not be crawled (missing, malformed, or a non-http(s) scheme).
 */
function resolveLink(href, base) {
  // Anchors without an href would otherwise resolve to "<base>/undefined".
  if (typeof href !== "string") return null;

  let target;
  try {
    target = new URL(href, base);
  } catch {
    // One malformed href must not abort the rest of the page's links.
    return null;
  }

  // mailto:, javascript:, tel: and similar are not crawlable web pages.
  if (target.protocol !== "http:" && target.protocol !== "https:") return null;

  return target.href;
}

/**
 * Fetch a page, store its title and text in the `pages` table, then start a
 * crawl of every link found on it.
 *
 * Child crawls are started without being awaited, so the returned promise
 * settles once this page has been processed, not when the whole crawl ends.
 *
 * @param {string} url - Absolute URL of the page to crawl.
 * @param {boolean} [ignoreExisting=false] - When true, crawl the page even if
 *   its URL is already present in `pages`.
 * @returns {Promise<void>} Rejects only if the existence lookup or `queue()`
 *   rejects; fetch/parse errors are logged and swallowed.
 */
async function crawl(url, ignoreExisting = false) {
  // The lookup only matters when an existing row would make us skip the page.
  if (!ignoreExisting) {
    const existing = await db.promiseGet("SELECT url FROM pages WHERE url = ?", url);
    if (existing) return;
  }

  // Wait for our turn before fetching.
  // NOTE(review): presumably ./queue rate-limits requests — confirm.
  await queue();

  try {
    const res = await fetch(url);
    console.log(`[${res.status} ${res.statusText}] ${url}`);

    // Only successful responses are indexed and followed.
    if (res.status !== 200) return;

    const html = await res.text();
    const $ = cheerio.load(html);
    const title = $("title").text();
    // Collapse every run of whitespace so the stored text is one compact line.
    const text = $.text().replace(/\s+/g, " ").trim();

    console.log(`[TITLE] ${title} (${url})`);

    db.run(`INSERT INTO pages(url, title, text)
      VALUES(?, ?, ?)`, url, title, text, (err) => {
      if (err) {
        console.log(`[INSERT FAIL: ${err}] ${url}|${title}`);
      } else {
        console.log(`[INSERT] ${url}|${title}`);
      }
    });

    for (const anchor of $("a")) {
      const target = resolveLink($(anchor).attr("href"), url);
      if (target === null) continue;

      // Fire-and-forget: the child crawl runs on its own, but a rejection is
      // still logged instead of surfacing as an unhandled rejection.
      crawl(target).catch((e) => {
        console.log(`[CRAWL FAIL: ${e}] ${target}`);
      });
    }
  } catch (e) {
    // Include the cause: this also covers parse errors, not just fetch errors.
    console.log(`[FETCH FAIL] Fail for ${url}: ${e}`);
  }
}
|