first commit

This commit is contained in:
sam 2024-03-10 01:33:08 +13:00
commit 1bd53b8907
6 changed files with 166 additions and 0 deletions

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
index.db
node_modules/
package-lock.json

48
crawl.js Normal file
View file

@ -0,0 +1,48 @@
const cheerio = require('cheerio');
const db = require("./db");
const queue = require("./queue");
// Entry point: make sure the schema exists, then start crawling.
db.createTables();

// Optional first CLI argument: the URL to start crawling from.
const startUrl = process.argv[2];

// crawl() is started without awaiting; report a rejection instead of
// letting it surface as an unhandled promise rejection.
const reportCrawlFailure = (err) => {
  console.log(`[CRAWL FAIL: ${err}]`);
};

if (startUrl) {
  crawl(startUrl).catch(reportCrawlFailure);
} else {
  // No URL given: resume from the page with the highest id. It is already
  // stored, so pass ignoreExisting=true to re-fetch it for its links.
  db.get("SELECT id, url FROM pages ORDER BY id DESC LIMIT 1", (err, row) => {
    if (err) {
      console.log(`[RESUME FAIL: ${err}]`);
      process.exit(1);
    }
    if (!row) {
      // Empty index: there is nothing to resume from.
      console.log("[RESUME FAIL] No pages stored yet. Usage: node crawl.js <url>");
      process.exit(1);
    }
    crawl(row.url, true).catch(reportCrawlFailure);
  });
}
/**
 * Turn an anchor's href into an absolute, crawlable URL.
 *
 * @param {string|undefined|null} href - Raw href attribute (may be missing).
 * @param {string} base - URL of the page the anchor was found on.
 * @returns {string|null} Absolute http(s) URL without fragment, or null when
 *   the link is missing, malformed, or uses another scheme (mailto:, javascript:, ...).
 */
function resolveLink(href, base) {
  if (!href) return null;
  let target;
  try {
    target = new URL(href, base);
  } catch {
    return null; // malformed href
  }
  if (target.protocol !== "http:" && target.protocol !== "https:") return null;
  // The fragment is never sent to the server, so "#a" and "#b" variants are
  // the same document; drop it so they share one pages.url value.
  target.hash = "";
  return target.href;
}

/**
 * Fetch a page, store its URL and <title>, and crawl every link found on it.
 *
 * Child crawls are started without being awaited; pacing comes from queue().
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {boolean} [ignoreExisting=false] - When true, fetch the page even if
 *   it is already stored in the pages table.
 * @returns {Promise<void>} Settles once this page has been handled (not its children).
 */
async function crawl(url, ignoreExisting = false) {
  // URLs already handed to crawl() in this process. The database check alone
  // is not enough: a link found on several pages is checked long before its
  // first INSERT happens, so it would be queued and fetched repeatedly.
  // Stored lazily on the function because crawl() is hoisted and may be
  // called before any later top-level declaration has been initialised.
  const seen = (crawl.seenUrls ??= new Set());
  if (seen.has(url)) return;
  seen.add(url);

  try {
    if (!ignoreExisting && await db.promiseGet("SELECT url FROM pages WHERE url = ?", url)) {
      return;
    }
  } catch (err) {
    console.log(`[DB FAIL: ${err}] ${url}`);
    return;
  }

  // Wait for our turn in the shared queue before fetching.
  await queue();
  try {
    const res = await fetch(url);
    console.log(`[${res.status} ${res.statusText}] ${url}`);
    if (res.status !== 200) return;

    const $ = cheerio.load(await res.text());
    const title = $("title").text();
    console.log(`[TITLE] ${title} (${url})`);
    db.run("INSERT INTO pages(url, title) VALUES(?, ?)", url, title, (err) => {
      if (err) {
        console.log(`[INSERT FAIL: ${err}] ${url}|${title}`);
      } else {
        console.log(`[INSERT] ${url}|${title}`);
      }
    });

    for (const anchor of $("a")) {
      const target = resolveLink($(anchor).attr("href"), url);
      // Intentionally not awaited: crawl() handles its own errors and the
      // queue serialises the actual fetches.
      if (target) crawl(target);
    }
  } catch (e) {
    console.log(`[FETCH FAIL] Fail for ${url}: ${e}`);
  }
}

34
db.js Normal file
View file

@ -0,0 +1,34 @@
// Load the sqlite3 binding with its verbose() mode switched on.
const sqlite3 = require('sqlite3').verbose();
// Single shared database handle. 'index.db' is a relative path, so it is
// presumably resolved against the process working directory — confirm
// before running the tools from another directory.
const db = new sqlite3.Database('index.db');
/**
 * Create the index schema if it does not exist yet:
 *  - `pages`: one row per crawled URL (unique) with its title;
 *  - `page_search`: an FTS5 table over url/title whose content lives in `pages`;
 *  - triggers `pages_ai` / `pages_ad` / `pages_au` that mirror every insert,
 *    delete and update on `pages` into `page_search`.
 *
 * The outcome is logged from the exec callback, so "Tables created." is only
 * printed after the statements have actually run successfully.
 */
db.createTables = function() {
  this.exec(`
    CREATE TABLE IF NOT EXISTS pages(id INTEGER PRIMARY KEY, url TEXT NOT NULL UNIQUE, title TEXT);
    CREATE VIRTUAL TABLE IF NOT EXISTS page_search USING fts5(url, title, content=pages, content_rowid=id);
    CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
      INSERT INTO page_search(rowid, url, title) VALUES (new.id, new.url, new.title);
    END;
    CREATE TRIGGER IF NOT EXISTS pages_ad AFTER DELETE ON pages BEGIN
      INSERT INTO page_search(page_search, rowid, url, title) VALUES ('delete', old.id, old.url, old.title);
    END;
    CREATE TRIGGER IF NOT EXISTS pages_au AFTER UPDATE ON pages BEGIN
      INSERT INTO page_search(page_search, rowid, url, title) VALUES ('delete', old.id, old.url, old.title);
      INSERT INTO page_search(rowid, url, title) VALUES (new.id, new.url, new.title);
    END;
  `, (err) => {
    if (err) {
      console.log(`[SQL] Table creation failed: ${err}`);
      // Fail fast: nothing else can work without the schema.
      throw err;
    }
    console.log("[SQL] Tables created.");
  });
};
/**
 * Promise flavour of `get`: forwards every argument (SQL text followed by
 * its bound parameters) and settles with the callback's outcome.
 *
 * @returns {Promise<object|undefined>} Resolves with the row passed to the
 *   callback (undefined when there is none); rejects with the callback's error.
 */
db.promiseGet = function (...queryArgs) {
  const handle = this;
  return new Promise(function settle(resolve, reject) {
    handle.get(...queryArgs, function onRow(error, row) {
      if (error) {
        reject(error);
      } else {
        resolve(row);
      }
    });
  });
}
module.exports = db;

17
package.json Normal file
View file

@ -0,0 +1,17 @@
{
"name": "crawler",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"cheerio": "^1.0.0-rc.12",
"express": "^4.18.3",
"sqlite3": "^5.1.7"
}
}

12
queue.js Normal file
View file

@ -0,0 +1,12 @@
let queueCallbacks = [];
setInterval(() => {
const cb = queueCallbacks.shift();
if(cb) cb();
}, 500);
module.exports = () => {
return new Promise(async res => {
queueCallbacks.push(res);
});
}

52
zrch.js Normal file
View file

@ -0,0 +1,52 @@
// Express application that serves the search UI.
const app = require("express")();
// Shared sqlite handle exported by ./db.
const db = require("./db");
// Listening port: the PORT environment variable when set, otherwise 8080.
const port = process.env.PORT || 8080;
// HTML spacer placed between result listings and around the pager.
const sep = `<br /><br />`;
/**
 * Escape a value for safe interpolation into HTML text or a quoted attribute.
 *
 * @param {*} value - Anything; it is converted with String() first.
 * @returns {string} The text with &, <, >, " and ' replaced by entities.
 */
function escapeHtml(value) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}

/**
 * Render one search hit as a link showing the title (or the URL when the
 * title is empty) above the URL.
 *
 * url and title come from crawled pages, so both are escaped.
 *
 * @param {{url: string, title: (string|null|undefined)}} row - Result row.
 * @returns {string} HTML fragment.
 */
function genListing(row) {
  const url = escapeHtml(row.url);
  const heading = escapeHtml(row.title || row.url);
  return `<a href="${url}">
<span style="font-size: 1.5em">${heading}</span>
<br />
<span style="font-size: 1em">${url}</span>
</a>`;
}

/**
 * Render the search form, pre-filled with the given query.
 *
 * @param {string} [query=""] - Current query text (user input; escaped here).
 * @returns {string} HTML fragment.
 */
function searchBar(query = "") {
  return `<form action="/search">
<input type="text" value="${escapeHtml(query)}" name="q" placeholder="search here!" />
</form>`;
}

/**
 * Render a pager link to another slice of the same search.
 *
 * @param {string} query - Search text; URL-encoded and escaped here.
 * @param {number} offset - Index of the first result of the target page.
 * @param {number} limit - Page size.
 * @param {string} text - Link label; inserted as-is, so pass trusted markup only.
 * @returns {string} HTML fragment.
 */
function offsetButton(query, offset, limit, text) {
  const params = new URLSearchParams({
    q: String(query),
    o: String(offset),
    l: String(limit),
  });
  return `<a href="?${escapeHtml(params.toString())}">${text}</a>`;
}
// Landing page: an empty search form and nothing else.
app.get("/", function showSearchForm(req, res) {
  const emptyForm = searchBar();
  res.send(emptyForm);
});
/**
 * GET /search — run a full-text query against page_search and render one
 * page of results with Prev/Next links.
 *
 * Query parameters:
 *   q — search text (required; a blank or missing q shows the empty form)
 *   o — index of the first result, >= 0 (default 0)
 *   l — page size, 1..MAX_LIMIT (default DEFAULT_LIMIT)
 */
app.get("/search", (req, res) => {
  const DEFAULT_LIMIT = 50;
  // Upper bound so one request cannot ask for the whole index.
  const MAX_LIMIT = 200;

  // Parse a whole number from a query parameter, or use the fallback.
  const toInteger = (raw, fallback) => {
    const n = Number(raw);
    return Number.isInteger(n) ? n : fallback;
  };

  // Repeated or nested q parameters are not strings; treat them as absent.
  const query = typeof req.query.q === "string" ? req.query.q : "";
  if (query.trim() === "") {
    res.send(searchBar());
    return;
  }

  // A negative LIMIT means "no limit" to SQLite, so reject non-positive sizes.
  const requestedLimit = toInteger(req.query.l, DEFAULT_LIMIT);
  const limit = requestedLimit >= 1 ? Math.min(requestedLimit, MAX_LIMIT) : DEFAULT_LIMIT;
  const offset = Math.max(toInteger(req.query.o, 0), 0);

  const next = offsetButton(query, offset + limit, limit, "Next");
  // No Prev link on the first page; never link to a negative offset.
  const prev = offset > 0
    ? offsetButton(query, Math.max(offset - limit, 0), limit, "Prev")
    : "";
  const page = Math.floor(offset / limit) + 1;

  db.all("SELECT url, title FROM page_search(?) ORDER BY rank LIMIT ?, ?",
    query, offset, limit, (err, rows) => {
      if (err) {
        // Without this guard `rows` is undefined and rendering would throw
        // inside the callback, taking the whole server down.
        console.log(`[SEARCH FAIL: ${err}] ${query}`);
        res.status(500).send(`
${searchBar(query)}
${sep}
Search failed. Please check the query and try again.
`);
        return;
      }
      res.send(`
${searchBar(query)}
${prev} ${page} ${next}
${sep}
${rows.map(genListing).join(sep)}
${sep}
${prev} ${page} ${next}
`);
    });
});
// Start the HTTP server and print the local address once it is listening.
app.listen(port, function onListening() {
  const address = `http://127.0.0.1:${port}/`;
  console.log(`Running at ${address}`);
});