first commit
This commit is contained in:
commit
1bd53b8907
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
index.db
|
||||||
|
node_modules/
|
||||||
|
package-lock.json
|
48
crawl.js
Normal file
48
crawl.js
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
const cheerio = require('cheerio');
|
||||||
|
const db = require("./db");
|
||||||
|
const queue = require("./queue");
|
||||||
|
|
||||||
|
// Ensure the schema exists before the crawler issues any lookups or inserts.
db.createTables();

// Optional first CLI argument: the URL to start crawling from.
const startUrl = process.argv[2];

if (startUrl) {
  crawl(startUrl).catch((err) => {
    console.error(`[CRAWL FAIL: ${err}] ${startUrl}`);
  });
} else {
  // No start URL given: resume from the most recently stored page.
  db.get("SELECT id, url FROM pages ORDER BY id DESC LIMIT 1", (err, row) => {
    if (err) {
      console.error(`[RESUME FAIL: ${err}] Could not read the last crawled page.`);
      process.exit(1);
    }

    if (!row) {
      // Empty table: there is nothing to resume from, so a start URL is required.
      console.error("[RESUME FAIL] No pages stored yet; pass a start URL: node crawl.js <url>");
      process.exit(1);
    }

    // The page is already stored, so bypass the "already crawled" check.
    crawl(row.url, true).catch((crawlErr) => {
      console.error(`[CRAWL FAIL: ${crawlErr}] ${row.url}`);
    });
  });
}
|
||||||
|
|
||||||
|
/**
 * Resolve an anchor's `href` against the page it was found on.
 *
 * @param {string|undefined} href - Raw `href` attribute value (undefined when the anchor has none).
 * @param {string} base - URL of the page that contains the anchor.
 * @returns {string|null} Absolute http(s) URL without its fragment, or null when the
 *   link is missing, empty, malformed, or uses a scheme that cannot be fetched as a page.
 */
function resolveLink(href, base) {
  if (typeof href !== "string" || href.trim() === "") return null;

  let target;
  try {
    target = new URL(href, base);
  } catch {
    // Malformed href: skip this link instead of aborting the whole page.
    return null;
  }

  if (target.protocol !== "http:" && target.protocol !== "https:") return null;

  // Fragments are never sent to the server, so "#section" variants are the same page.
  target.hash = "";
  return target.href;
}

/**
 * Fetch `url`, store its title in the `pages` table and crawl every link found on it.
 *
 * Each call waits for its turn from `queue()` before fetching. Linked pages are
 * crawled in the background (not awaited), so the returned promise settles once
 * this page has been processed, not when the whole crawl has finished.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @param {boolean} [ignoreExisting=false] - When true, crawl even if `url` is already in `pages`.
 *   The INSERT is still attempted in that case; a failure is only logged.
 * @returns {Promise<void>} Resolves when this page is done; failures are logged, not thrown.
 */
async function crawl(url, ignoreExisting = false) {
  try {
    if (!ignoreExisting) {
      const known = await db.promiseGet("SELECT url FROM pages WHERE url = ?", url);
      if (known) return;
    }
  } catch (e) {
    console.log(`[LOOKUP FAIL: ${e}] ${url}`);
    return;
  }

  await queue();

  try {
    const res = await fetch(url);
    console.log(`[${res.status} ${res.statusText}] ${url}`);

    if (res.status !== 200) return;

    const $ = cheerio.load(await res.text());
    const title = $("title").text();

    console.log(`[TITLE] ${title} (${url})`);

    db.run(
      `INSERT INTO pages(url, title)
       VALUES(?, ?)`,
      url,
      title,
      (err) => {
        if (err) {
          console.log(`[INSERT FAIL: ${err}] ${url}|${title}`);
        } else {
          console.log(`[INSERT] ${url}|${title}`);
        }
      },
    );

    // Collect each distinct target once so a page that repeats a link does not
    // queue the same URL many times before the first fetch has been stored.
    const links = new Set();
    for (const anchor of $("a")) {
      const link = resolveLink($(anchor).attr("href"), url);
      if (link) links.add(link);
    }
    links.delete(url); // a page linking to itself needs no second fetch

    for (const link of links) {
      // Deliberately not awaited: pacing is handled by queue() inside crawl().
      crawl(link).catch((e) => console.log(`[CRAWL FAIL: ${e}] ${link}`));
    }
  } catch (e) {
    console.log(`[FETCH FAIL] Fail for ${url} (${e})`);
  }
}
|
34
db.js
Normal file
34
db.js
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
const sqlite3 = require('sqlite3').verbose();
|
||||||
|
// Shared connection to the on-disk index file; helper methods are attached to this object further down.
const db = new sqlite3.Database('index.db');
|
||||||
|
|
||||||
|
/**
 * Create the `pages` table, the `page_search` FTS5 index over it, and the
 * triggers that mirror inserts, deletes and updates of `pages` into the index.
 * Every statement uses IF NOT EXISTS, so calling this on an existing database
 * does not recreate anything.
 *
 * The outcome is logged once the statements have actually run, instead of
 * announcing success before the database has done any work.
 *
 * @param {(err: Error|null) => void} [callback] - Optional; invoked after the
 *   statements finish, with the error if any of them failed.
 */
db.createTables = function(callback) {
  this.exec(`
    CREATE TABLE IF NOT EXISTS pages(id INTEGER PRIMARY KEY, url TEXT NOT NULL UNIQUE, title TEXT);
    CREATE VIRTUAL TABLE IF NOT EXISTS page_search USING fts5(url, title, content=pages, content_rowid=id);

    CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
      INSERT INTO page_search(rowid, url, title) VALUES (new.id, new.url, new.title);
    END;

    CREATE TRIGGER IF NOT EXISTS pages_ad AFTER DELETE ON pages BEGIN
      INSERT INTO page_search(page_search, rowid, url, title) VALUES ('delete', old.id, old.url, old.title);
    END;

    CREATE TRIGGER IF NOT EXISTS pages_au AFTER UPDATE ON pages BEGIN
      INSERT INTO page_search(page_search, rowid, url, title) VALUES ('delete', old.id, old.url, old.title);
      INSERT INTO page_search(rowid, url, title) VALUES (new.id, new.url, new.title);
    END;
  `, (err) => {
    if (err) {
      console.log(`[SQL] Table creation failed: ${err}`);
    } else {
      console.log("[SQL] Tables created.");
    }

    if (typeof callback === "function") callback(err);
  });
}
|
||||||
|
|
||||||
|
/**
 * Promise-returning wrapper around `this.get`.
 *
 * @param {...*} params - Forwarded unchanged to `get` (SQL text followed by bind values).
 * @returns {Promise<object|undefined>} Resolves with the row passed to the `get`
 *   callback; rejects with the error when the callback reports one.
 */
db.promiseGet = function(...params) {
  return new Promise((resolve, reject) => {
    this.get(...params, (err, row) => {
      if (err) {
        reject(err);
        return; // do not fall through and settle a second time
      }
      resolve(row);
    });
  });
}
|
||||||
|
|
||||||
|
// Export the connection object, including the createTables and promiseGet helpers assigned to it in this file.
module.exports = db;
|
17
package.json
Normal file
17
package.json
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
"name": "crawler",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "",
|
||||||
|
"main": "index.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"keywords": [],
|
||||||
|
"author": "",
|
||||||
|
"license": "ISC",
|
||||||
|
"dependencies": {
|
||||||
|
"cheerio": "^1.0.0-rc.12",
|
||||||
|
"express": "^4.18.3",
|
||||||
|
"sqlite3": "^5.1.7"
|
||||||
|
}
|
||||||
|
}
|
12
queue.js
Normal file
12
queue.js
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
let queueCallbacks = [];
|
||||||
|
|
||||||
|
setInterval(() => {
|
||||||
|
const cb = queueCallbacks.shift();
|
||||||
|
if(cb) cb();
|
||||||
|
}, 500);
|
||||||
|
|
||||||
|
module.exports = () => {
|
||||||
|
return new Promise(async res => {
|
||||||
|
queueCallbacks.push(res);
|
||||||
|
});
|
||||||
|
}
|
52
zrch.js
Normal file
52
zrch.js
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
const app = require("express")();
|
||||||
|
const db = require("./db");
|
||||||
|
// Listening port: the PORT environment variable when it is set to a non-empty value, otherwise 8080.
const port = process.env.PORT || 8080;
|
||||||
|
|
||||||
|
// Markup (two line breaks) inserted between sections and result listings of the results page.
const sep = `<br /><br />`;
|
||||||
|
|
||||||
|
// Characters that must be replaced before text is placed in HTML content or a quoted attribute.
const HTML_ESCAPES = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
};

/**
 * Escape a value for safe interpolation into HTML text or a quoted attribute.
 *
 * @param {*} value - Value to render; null/undefined become the empty string.
 * @returns {string} The stringified value with & < > " ' replaced by entities.
 */
function escapeHtml(value) {
  return String(value ?? "").replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
}

/**
 * Render one search result as a link showing the title (or the URL when the
 * title is empty) above the URL.
 *
 * Titles and URLs come from crawled pages, so both are escaped before being
 * placed in the markup.
 *
 * @param {{url: string, title?: string}} row - Result row.
 * @returns {string} HTML fragment.
 */
function genListing(row) {
  const url = escapeHtml(row.url);
  const heading = escapeHtml(row.title || row.url);

  return `<a href="${url}">
    <span style="font-size: 1.5em">${heading}</span>
    <br />
    <span style="font-size: 1em">${url}</span>
  </a>`;
}

/**
 * Render the search form, optionally pre-filled with the current query.
 *
 * @param {string} [query=""] - Text to show in the input; escaped because it is user input.
 * @returns {string} HTML fragment.
 */
function searchBar(query = "") {
  return `<form action="/search">
    <input type="text" value="${escapeHtml(query)}" name="q" placeholder="search here!" />
  </form>`;
}

/**
 * Render a pagination link that re-runs the search at another offset.
 *
 * @param {string} query - Search text (URL-encoded into the `q` parameter).
 * @param {number} offset - Value for the `o` parameter.
 * @param {number} limit - Value for the `l` parameter.
 * @param {string} text - Link label (plain text).
 * @returns {string} HTML fragment.
 */
function offsetButton(query, offset, limit, text) {
  // URLSearchParams percent-encodes the values; the result is then escaped for
  // the attribute so the "&" separators become "&amp;".
  const params = new URLSearchParams({ q: query ?? "", o: offset, l: limit });
  return `<a href="?${escapeHtml(params.toString())}">${escapeHtml(text)}</a>`;
}
|
||||||
|
|
||||||
|
// Landing page: respond with nothing but an empty search form.
app.get("/", function renderHome(_req, res) {
  const emptyForm = searchBar();
  res.send(emptyForm);
});
|
||||||
|
|
||||||
|
/**
 * GET /search — run a full-text query against `page_search` and render one page of results.
 *
 * Query parameters:
 *   q - search text; when missing, blank or not a single string, only the search form is sent.
 *   o - row offset; anything but a positive integer is treated as 0.
 *   l - page size; anything but a positive integer falls back to 50, and values
 *       above the cap below are reduced to it.
 *
 * A failing database query (for example one caused by query text the full-text
 * engine rejects) produces a 500 response instead of crashing the server.
 */
app.get("/search", (req, res) => {
  const maxLimit = 200; // upper bound so a single request cannot ask for an unbounded page
  const defaultLimit = 50;

  const query = typeof req.query.q === "string" ? req.query.q : "";
  if (query.trim() === "") {
    res.send(searchBar());
    return;
  }

  const requestedOffset = Number(req.query.o);
  const offset = Number.isSafeInteger(requestedOffset) && requestedOffset > 0
    ? requestedOffset
    : 0;

  const requestedLimit = Number(req.query.l);
  const limit = Number.isSafeInteger(requestedLimit) && requestedLimit > 0
    ? Math.min(requestedLimit, maxLimit)
    : defaultLimit;

  const page = Math.floor(offset / limit) + 1;

  db.all("SELECT url, title FROM page_search(?) ORDER BY rank LIMIT ?, ?",
    query, offset, limit, (err, rows) => {
      if (err) {
        console.log(`[SEARCH FAIL: ${err}] ${query}`);
        res.status(500).send(`
          ${searchBar(query)}
          ${sep}
          Search failed; check the query and try again.
        `);
        return;
      }

      // No "Prev" on the first page; no "Next" when this page came back short.
      const prev = offset > 0
        ? offsetButton(query, Math.max(offset - limit, 0), limit, "Prev")
        : "";
      const next = rows.length === limit
        ? offsetButton(query, offset + limit, limit, "Next")
        : "";

      res.send(`
        ${searchBar(query)}
        ${prev} ${page} ${next}
        ${sep}
        ${rows.map(genListing).join(sep)}
        ${sep}
        ${prev} ${page} ${next}
      `);
    });
});
|
||||||
|
|
||||||
|
// Start accepting HTTP connections and print the local address once listening.
app.listen(port, function onListening() {
  const banner = `Running at http://127.0.0.1:${port}/`;
  console.log(banner);
});
|
Loading…
Reference in a new issue