const cheerio = require('cheerio');
const db = require("./db");
const queue = require("./queue");

// URLs that a crawl() call in this process has already taken ownership of.
// Recursive crawls are started without being awaited, so a URL that is linked
// many times would otherwise pass the database existence check repeatedly
// before its first INSERT lands, and be fetched more than once.
const claimedUrls = new Set();

db.createTables();

if (!process.argv[2]) {
  // No start URL on the command line: resume from the row with the highest id
  // in `pages`, re-crawling it even though it is already stored.
  db.get("SELECT id, url FROM pages ORDER BY id DESC LIMIT 1", (err, row) => {
    if (err) {
      console.log(`[START FAIL: ${err}] could not read the last stored page`);
      process.exitCode = 1;
      return;
    }
    if (!row) {
      // Empty table: there is nothing to resume from.
      console.log("[START FAIL] no stored pages to resume from; pass a start URL as the first argument");
      process.exitCode = 1;
      return;
    }
    crawlInBackground(row.url, true);
  });
} else {
  crawlInBackground(process.argv[2]);
}

/**
 * Starts crawl() without awaiting it and logs a rejection instead of leaving
 * it unhandled. crawl() can reject from the database lookup or the queue()
 * wait, both of which run outside its own try/catch.
 *
 * @param {string} url - Absolute URL to crawl.
 * @param {boolean} [ignoreExisting=false] - Forwarded to crawl().
 */
function crawlInBackground(url, ignoreExisting = false) {
  crawl(url, ignoreExisting).catch((err) => {
    console.log(`[CRAWL FAIL: ${err}] ${url}`);
  });
}

/**
 * Resolves an anchor's href against the page it was found on.
 *
 * @param {string|undefined} href - Raw `href` attribute; undefined when the
 *   anchor has none.
 * @param {string} base - Absolute URL of the page containing the anchor.
 * @returns {string|null} The absolute http(s) URL, or null when the href is
 *   missing, empty, unparsable, or uses another scheme (mailto:, javascript:,
 *   tel:, ...).
 */
function resolveLink(href, base) {
  // Without this guard `new URL(undefined, base)` stringifies the argument and
  // yields "<base>/undefined".
  if (!href) return null;

  let target;
  try {
    target = new URL(href, base);
  } catch {
    return null;
  }

  if (target.protocol !== "http:" && target.protocol !== "https:") return null;
  return target.href;
}

/**
 * Fetches `url`, stores its title and whitespace-collapsed text in `pages`
 * when the response status is 200, then starts a crawl of every http(s) link
 * found on the page. Linked pages are crawled in the background; this function
 * does not wait for them.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {boolean} [ignoreExisting=false] - When true, crawl the URL even if
 *   it is already stored in `pages` or was already claimed in this process.
 * @returns {Promise<void>} Settles once this page has been handled. Fetch and
 *   parse failures are logged, not thrown; failures of the database lookup or
 *   of queue() reject the promise.
 */
async function crawl(url, ignoreExisting = false) {
  if (!ignoreExisting) {
    if (claimedUrls.has(url)) return;
    // Claim before the first await so concurrent calls for the same URL bail out.
    claimedUrls.add(url);
    const stored = await db.promiseGet("SELECT url FROM pages WHERE url = ?", url);
    if (stored) return;
  } else {
    claimedUrls.add(url);
  }

  // NOTE(review): queue() is awaited before every fetch; presumably it throttles
  // requests — confirm against ./queue.
  await queue();

  try {
    const res = await fetch(url);
    console.log(`[${res.status} ${res.statusText}] ${url}`);
    if (res.status !== 200) return;

    const html = await res.text();
    const $ = cheerio.load(html);
    const title = $("title").text();
    const text = $.text().replace(/\s+/g, " ").trim();
    console.log(`[TITLE] ${title} (${url})`);

    // NOTE(review): when ignoreExisting is true the URL may already be stored,
    // so this INSERT can duplicate or fail depending on the table's
    // constraints — confirm against the schema in ./db.
    db.run(`INSERT INTO pages(url, title, text) VALUES(?, ?, ?)`, url, title, text, (err) => {
      if (err) {
        console.log(`[INSERT FAIL: ${err}] ${url}|${title}`);
      } else {
        console.log(`[INSERT] ${url}|${title}`);
      }
    });

    // fetch follows redirects, so relative links must be resolved against the
    // final response URL rather than the URL that was requested.
    const baseUrl = res.url || url;
    for (const anchor of $("a")) {
      const target = resolveLink($(anchor).attr("href"), baseUrl);
      if (target) crawlInBackground(target);
    }
  } catch (e) {
    console.log(`[FETCH FAIL] Fail for ${url}: ${e}`);
  }
}