From 1bd53b890770a69575923999dab7ce3d8f96d6a0 Mon Sep 17 00:00:00 2001
From: sam
Date: Sun, 10 Mar 2024 01:33:08 +1300
Subject: [PATCH] first commit

---
 .gitignore   |  3 +++
 crawl.js     | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 db.js        | 40 ++++++++++++++++++++++++++++++++++
 package.json | 17 +++++++++++++++
 queue.js     | 12 ++++++++++++
 zrch.js      | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 182 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 crawl.js
 create mode 100644 db.js
 create mode 100644 package.json
 create mode 100644 queue.js
 create mode 100644 zrch.js

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7e977a4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+index.db
+node_modules/
+package-lock.json
diff --git a/crawl.js b/crawl.js
new file mode 100644
index 0000000..7600436
--- /dev/null
+++ b/crawl.js
@@ -0,0 +1,58 @@
+// Minimal recursive web crawler: fetches a page, stores its url/title in
+// sqlite (db.js), then follows its anchors. Rate-limited via queue.js.
+const cheerio = require('cheerio');
+const db = require("./db");
+const queue = require("./queue");
+
+db.createTables();
+
+if(!process.argv[2]) {
+  // No seed URL given: resume from the most recently inserted page.
+  db.get("SELECT id, url FROM pages ORDER BY id DESC LIMIT 1", (err, row) => {
+    // Guard: a fresh database has no row to resume from.
+    if(!err && row) crawl(row.url, true);
+  });
+} else {
+  crawl(process.argv[2]);
+}
+
+// Fetch `url`, record it, and recursively crawl its outgoing links.
+// `ignoreExisting` re-crawls a URL even when it is already indexed.
+async function crawl(url, ignoreExisting = false) {
+  if(!!await db.promiseGet("SELECT url FROM pages WHERE url = ?", url) && !ignoreExisting)
+    return;
+
+  // Wait for a slot from the rate-limiting queue before hitting the network.
+  await queue();
+
+  try {
+    const res = await fetch(url);
+    console.log(`[${res.status} ${res.statusText}] ${url}`);
+
+    if(res.status === 200) {
+      const $ = cheerio.load(await res.text());
+      const title = $("title").text();
+
+      console.log(`[TITLE] ${title} (${url})`);
+
+      db.run(`INSERT INTO pages(url, title)
+        VALUES(?, ?)`, url, title, (err) => {
+        if(err) {
+          console.log(`[INSERT FAIL: ${err}] ${url}|${title}`);
+        } else {
+          console.log(`[INSERT] ${url}|${title}`);
+        }
+      });
+
+      // Cheerio selections are plain sync iterables; for..of is correct here.
+      for (const anchor of $("a")) {
+        const href = $(anchor).attr("href");
+        // Skip href-less anchors; new URL() resolves relative links.
+        if(href) crawl(new URL(href, url).href);
+      }
+    }
+  } catch(e) {
+    // Include the actual error so failures are diagnosable.
+    console.log(`[FETCH FAIL] ${e} ${url}`);
+  }
+}
diff --git a/db.js b/db.js
new file mode 100644
index 0000000..f560088
--- /dev/null
+++ b/db.js
@@ -0,0 +1,40 @@
+// sqlite wrapper: opens index.db and adds schema setup plus a promisified get.
+const sqlite3 = require('sqlite3').verbose();
+const db = new sqlite3.Database('index.db');
+
+// Create the pages table, its FTS5 mirror (external-content table), and the
+// triggers that keep the full-text index in sync on insert/update/delete.
+db.createTables = function() {
+  this.exec(`
+CREATE TABLE IF NOT EXISTS pages(id INTEGER PRIMARY KEY, url TEXT NOT NULL UNIQUE, title TEXT);
+CREATE VIRTUAL TABLE IF NOT EXISTS page_search USING fts5(url, title, content=pages, content_rowid=id);
+
+CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
+  INSERT INTO page_search(rowid, url, title) VALUES (new.id, new.url, new.title);
+END;
+
+CREATE TRIGGER IF NOT EXISTS pages_ad AFTER DELETE ON pages BEGIN
+  INSERT INTO page_search(page_search, rowid, url, title) VALUES ('delete', old.id, old.url, old.title);
+END;
+
+CREATE TRIGGER IF NOT EXISTS pages_au AFTER UPDATE ON pages BEGIN
+  INSERT INTO page_search(page_search, rowid, url, title) VALUES ('delete', old.id, old.url, old.title);
+  INSERT INTO page_search(rowid, url, title) VALUES (new.id, new.url, new.title);
+END;
+  `);
+  console.log("[SQL] Tables created.");
+}
+
+// Promise wrapper around db.get(sql, ...params): resolves with the first row
+// (or undefined when there is no match) and rejects on error.
+db.promiseGet = function(...args) {
+  return new Promise((res, rej) => {
+    this.get(...args, (err, row) => {
+      // Settle exactly once: do not also resolve after rejecting.
+      if(err) return rej(err);
+      res(row);
+    });
+  });
+}
+
+module.exports = db;
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..69238f9
--- /dev/null
+++ b/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "crawler",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "cheerio": "^1.0.0-rc.12",
+    "express": "^4.18.3",
+    "sqlite3": "^5.1.7"
+  }
+}
diff --git a/queue.js b/queue.js
new file mode 100644
index 0000000..72658e6
--- /dev/null
+++ b/queue.js
@@ -0,0 +1,12 @@
+let queueCallbacks = [];
+
+setInterval(() => {
+  const cb = 
queueCallbacks.shift();
+  if(cb) cb();
+}, 500);
+
+// Returns a promise that resolves when the caller's turn comes up
+// (one waiter released every 500 ms) — a simple rate limiter.
+module.exports = () => {
+  return new Promise(res => {
+    queueCallbacks.push(res);
+  });
+}
diff --git a/zrch.js b/zrch.js
new file mode 100644
index 0000000..d192c20
--- /dev/null
+++ b/zrch.js
@@ -0,0 +1,70 @@
+// zrch: tiny express front-end over the FTS5 index built by crawl.js.
+const app = require("express")();
+const db = require("./db");
+const port = process.env.PORT || 8080;
+
+// Escape text for safe interpolation into HTML. Titles/urls come from
+// crawled pages and the query from the user — unescaped they are XSS vectors.
+function escapeHtml(s) {
+  return String(s)
+    .replaceAll("&", "&amp;")
+    .replaceAll("<", "&lt;")
+    .replaceAll(">", "&gt;")
+    .replaceAll('"', "&quot;");
+}
+
+const sep = `
+<hr>
+`;
+
+// Render one search hit: linked title (falling back to the URL) plus raw URL.
+function genListing(row) {
+  return `
+  <a href="${escapeHtml(row.url)}">${escapeHtml(row.title || row.url)}</a>
+  <br>
+  ${escapeHtml(row.url)}
+  `;
+}
+
+function searchBar(query = "") {
+  return `<form action="/search">
+  <input name="q" value="${escapeHtml(query)}">
+  <input type="submit" value="Search">
+  </form>`;
+}
+
+// Pagination link; the query is URL-encoded so special characters survive.
+function offsetButton(query, offset, limit, text) {
+  return `<a href="/search?q=${encodeURIComponent(query)}&o=${offset}&l=${limit}">${text}</a>`;
+}
+
+app.get("/", (req, res) => {
+  res.send(searchBar());
+});
+
+app.get("/search", (req, res) => {
+  const query = req.query.q;
+  const offset = Math.max(+req.query.o || 0, 0);
+  const limit = +req.query.l || 50;
+  const next = offsetButton(query, offset + limit, limit, "Next");
+  // Clamp so "Prev" can never produce a negative OFFSET.
+  const prev = offsetButton(query, Math.max(offset - limit, 0), limit, "Prev");
+  const page = (offset / limit) + 1;
+
+  db.all("SELECT url, title FROM page_search(?) ORDER BY rank LIMIT ?, ?",
+    req.query.q, offset, limit, (err, rows) => {
+      // On error (e.g. FTS5 query syntax) rows is undefined — don't .map it.
+      if(err) return res.status(500).send(`${searchBar(query)}${sep}${escapeHtml(String(err))}`);
+      res.send(`
+        ${searchBar(query)}
+        ${prev} ${page} ${next}
+        ${sep}
+        ${rows.map(genListing).join(sep)}
+        ${sep}
+        ${prev} ${page} ${next}
+      `);
+    });
+});
+
+app.listen(port, () => {
+  console.log(`Running at http://127.0.0.1:${port}/`);
+});