From fcbe2db0389a8bc98a8d09c6dead86b3f886a180 Mon Sep 17 00:00:00 2001 From: sam Date: Sun, 10 Mar 2024 02:37:07 +1300 Subject: [PATCH] add text section --- crawl.js | 10 ++++++---- db.js | 12 ++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/crawl.js b/crawl.js index 7600436..b45970d 100644 --- a/crawl.js +++ b/crawl.js @@ -23,13 +23,15 @@ async function crawl(url, ignoreExisting = false) { console.log(`[${res.status} ${res.statusText}] ${url}`); if(res.status == 200) { - const $ = cheerio.load(await res.text()); + const html = await res.text(); + const $ = cheerio.load(html); const title = $("title").text(); - + const text = $.text().replace(/\s+/g, " ").trim(); + console.log(`[TITLE] ${title} (${url})`); - db.run(`INSERT INTO pages(url, title) - VALUES(?, ?)`, url, title, (err) => { + db.run(`INSERT INTO pages(url, title, text) + VALUES(?, ?, ?)`, url, title, text, (err) => { if(err) { console.log(`[INSERT FAIL: ${err}] ${url}|${title}`); } else { diff --git a/db.js b/db.js index f560088..0bce643 100644 --- a/db.js +++ b/db.js @@ -3,20 +3,20 @@ const db = new sqlite3.Database('index.db'); db.createTables = function() { this.exec(` -CREATE TABLE IF NOT EXISTS pages(id INTEGER PRIMARY KEY, url TEXT NOT NULL UNIQUE, title TEXT); -CREATE VIRTUAL TABLE IF NOT EXISTS page_search USING fts5(url, title, content=pages, content_rowid=id); +CREATE TABLE IF NOT EXISTS pages(id INTEGER PRIMARY KEY, url TEXT NOT NULL UNIQUE, title TEXT, text TEXT); +CREATE VIRTUAL TABLE IF NOT EXISTS page_search USING fts5(url, title, text, content=pages, content_rowid=id); CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN - INSERT INTO page_search(rowid, url, title) VALUES (new.id, new.url, new.title); + INSERT INTO page_search(rowid, url, title, text) VALUES (new.id, new.url, new.title, new.text); END; CREATE TRIGGER IF NOT EXISTS pages_ad AFTER DELETE ON pages BEGIN - INSERT INTO page_search(page_search, rowid, url, title) VALUES ('delete', old.id, old.url, old.title); + INSERT INTO page_search(page_search, rowid, url, title, text) VALUES ('delete', old.id, old.url, old.title, old.text); END; CREATE TRIGGER IF NOT EXISTS pages_au AFTER UPDATE ON pages BEGIN - INSERT INTO page_search(page_search, rowid, url, title) VALUES ('delete', old.id, old.url, old.title); - INSERT INTO page_search(rowid, url, title) VALUES (new.id, new.url, new.title); + INSERT INTO page_search(page_search, rowid, url, title, text) VALUES ('delete', old.id, old.url, old.title, old.text); + INSERT INTO page_search(rowid, url, title, text) VALUES (new.id, new.url, new.title, new.text); END; `); console.log("[SQL] Tables created.");