-
-
Notifications
You must be signed in to change notification settings - Fork 143
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement metadata fetching logic in the crawler
- Loading branch information
1 parent
e035c2f
commit 6b5ec51
Showing
24 changed files
with
218 additions
and
52 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Pass --always-make to make so every target below is treated as out of date and its recipe always runs. | ||
MAKEFLAGS += --always-make | ||
|
||
# Rewrite files in place with Prettier, then lint the repository with ESLint. | ||
format: | ||
bunx prettier . --write && bunx eslint . | ||
|
||
# From db/: run `prisma migrate dev`, then `prisma generate`. | ||
prisma: | ||
cd db; \ | ||
bunx prisma migrate dev; \ | ||
bunx prisma generate | ||
|
||
# From crawler/: run index.ts under bun in --watch mode. | ||
worker: | ||
cd crawler; \ | ||
bun --watch index.ts | ||
# From web/: run the `dev` script with bun. | ||
web: | ||
cd web; \ | ||
bun run dev | ||
|
||
# From db/: launch Prisma Studio. | ||
studio: | ||
cd db; \ | ||
bunx prisma studio |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,68 @@ | ||
import logger from "@remember/shared/logger"; | ||
import { | ||
ZCrawlLinkRequest, | ||
zCrawlLinkRequestSchema, | ||
} from "@remember/shared/queues"; | ||
import { Job } from "bullmq"; | ||
|
||
export default async function runCrawler(job: Job) { | ||
logger.info(`[Crawler] Got a new job: ${job.name}`); | ||
import prisma from "@remember/db"; | ||
|
||
// Build the metascraper instance from the description, image, logo, title and url rule packages. | ||
// It is called below with `{ url, html }` and awaited to obtain the page metadata. | ||
const metascraper = require("metascraper")([ | ||
require("metascraper-description")(), | ||
require("metascraper-image")(), | ||
require("metascraper-logo")(), | ||
require("metascraper-title")(), | ||
require("metascraper-url")(), | ||
]); | ||
|
||
/**
 * Queue processor that fetches a bookmarked link and stores its page metadata.
 *
 * The job payload is validated against `zCrawlLinkRequestSchema`. A malformed
 * payload is logged and the job returns normally (no throw), since the same
 * payload would fail validation again on any retry.
 *
 * @param job - Queue job whose `data` should contain `{ url, linkId }`.
 * @throws Error when the page responds with a non-2xx HTTP status, so the job
 *   is reported as failed instead of persisting an error page's metadata.
 *   Errors from `fetch`, metascraper and Prisma also propagate to the caller.
 */
export default async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
  const jobId = job.id || "unknown";

  const request = zCrawlLinkRequestSchema.safeParse(job.data);
  if (!request.success) {
    logger.error(
      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
    );
    return;
  }

  const { url, linkId } = request.data;

  logger.info(
    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${linkId}"`,
  );
  // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)

  const resp = await fetch(url);
  // fetch() only rejects on network errors; a 404/500 still resolves. Without
  // this check the error page's title/description would be saved for the link.
  if (!resp.ok) {
    throw new Error(
      `Failed to fetch "${url}": HTTP ${resp.status} ${resp.statusText}`,
    );
  }
  const respBody = await resp.text();

  const meta = await metascraper({
    url,
    html: respBody,
  });

  // The same fields are written whether the details row is created or updated.
  const details = {
    title: meta.title,
    description: meta.description,
    imageUrl: meta.image,
  };

  // The updated record is not used, so no relations are included in the result.
  await prisma.bookmarkedLink.update({
    where: {
      id: linkId,
    },
    data: {
      details: {
        upsert: {
          create: details,
          update: details,
        },
      },
    },
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import { Worker } from "bullmq"; | ||
|
||
import { | ||
LinkCrawlerQueue, | ||
ZCrawlLinkRequest, | ||
queueConnectionDetails, | ||
} from "@remember/shared/queues"; | ||
import logger from "@remember/shared/logger"; | ||
import runCrawler from "./crawler"; | ||
|
||
// Entry point of the crawler process: wires runCrawler to the link crawler queue. | ||
logger.info("Starting crawler worker ..."); | ||
|
||
// autorun is disabled, so the worker only starts processing at the explicit run() call at the bottom. | ||
const crawlerWorker = new Worker<ZCrawlLinkRequest, void>( | ||
LinkCrawlerQueue.name, | ||
runCrawler, | ||
{ | ||
connection: queueConnectionDetails, | ||
autorun: false, | ||
}, | ||
); | ||
|
||
// Log every job that finishes without throwing. | ||
crawlerWorker.on("completed", (job) => { | ||
const jobId = job?.id || "unknown"; | ||
logger.info(`[Crawler][${jobId}] Completed successfully`); | ||
}); | ||
|
||
// Log failed jobs; `job` is accessed with `?.`, so the id falls back to "unknown" when it is missing. | ||
crawlerWorker.on("failed", (job, error) => { | ||
const jobId = job?.id || "unknown"; | ||
logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`); | ||
}); | ||
|
||
// Start the worker and wait on the promise returned by run(). | ||
await Promise.all([crawlerWorker.run()]); |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"$schema": "https://json.schemastore.org/package.json", | ||
"name": "@remember/db", | ||
"version": "0.1.0", | ||
"private": true, | ||
"main": "index.ts", | ||
"dependencies": {} | ||
} |
File renamed without changes.
File renamed without changes.
16 changes: 16 additions & 0 deletions
16
db/prisma/migrations/20240206184532_add_favicon/migration.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
-- RedefineTables | ||
-- Adds a nullable "favicon" column by rebuilding "BookmarkedLinkDetails" under a temporary name. | ||
-- Foreign key enforcement is switched off while the table is dropped and recreated. | ||
PRAGMA foreign_keys=OFF; | ||
CREATE TABLE "new_BookmarkedLinkDetails" ( | ||
"id" TEXT NOT NULL PRIMARY KEY, | ||
"title" TEXT NOT NULL, | ||
"description" TEXT, | ||
"imageUrl" TEXT, | ||
"favicon" TEXT, | ||
"createdAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, | ||
CONSTRAINT "BookmarkedLinkDetails_id_fkey" FOREIGN KEY ("id") REFERENCES "BookmarkedLink" ("id") ON DELETE CASCADE ON UPDATE CASCADE | ||
); | ||
-- Copy the existing rows; "favicon" is not in the column list, so it is NULL for them. | ||
INSERT INTO "new_BookmarkedLinkDetails" ("createdAt", "description", "id", "imageUrl", "title") SELECT "createdAt", "description", "id", "imageUrl", "title" FROM "BookmarkedLinkDetails"; | ||
-- Swap the rebuilt table in under the original name. | ||
DROP TABLE "BookmarkedLinkDetails"; | ||
ALTER TABLE "new_BookmarkedLinkDetails" RENAME TO "BookmarkedLinkDetails"; | ||
-- Report any foreign key violations, then turn enforcement back on. | ||
PRAGMA foreign_key_check; | ||
PRAGMA foreign_keys=ON; |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
export * as Queues from './queues.ts'; | ||
export * as Queues from "./queues.ts"; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,15 @@ | ||
import winston from "winston"; | ||
|
||
// Shared winston logger: level comes from LOG_LEVEL (default "debug"); each entry is printed to the | ||
// console as "<timestamp> <colorized level>: <message>". | ||
const logger = winston.createLogger({ | ||
level: process.env.LOG_LEVEL || "debug", | ||
format: winston.format.combine( | ||
winston.format.timestamp(), | ||
winston.format.colorize(), | ||
winston.format.printf( | ||
(info) => `${info.timestamp} ${info.level}: ${info.message}`, | ||
), | ||
level: process.env.LOG_LEVEL || "debug", | ||
format: winston.format.combine( | ||
winston.format.timestamp(), | ||
winston.format.colorize(), | ||
winston.format.printf( | ||
(info) => `${info.timestamp} ${info.level}: ${info.message}`, | ||
), | ||
transports: [new winston.transports.Console()], | ||
), | ||
transports: [new winston.transports.Console()], | ||
}); | ||
|
||
export default logger; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
{ | ||
"$schema": "https://json.schemastore.org/package.json", | ||
"name": "@remember/shared", | ||
"version": "0.1.0", | ||
"private": true, | ||
"dependencies": { | ||
}, | ||
"dependencies": {}, | ||
"main": "index.ts" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,18 @@ | ||
import { Queue } from "bullmq"; | ||
import { z } from "zod"; | ||
|
||
// Redis connection settings for the queues: host and port are read from REDIS_HOST / REDIS_PORT, | ||
// falling back to "localhost" and 6379 when the variables are unset or empty. | ||
export const queueConnectionDetails = { | ||
host: process.env.REDIS_HOST || "localhost", | ||
port: parseInt(process.env.REDIS_PORT || "6379"), | ||
host: process.env.REDIS_HOST || "localhost", | ||
port: parseInt(process.env.REDIS_PORT || "6379"), | ||
}; | ||
|
||
export const LinkCrawlerQueue = new Queue("link_crawler_queue", { connection: queueConnectionDetails }); | ||
|
||
// Payload of a crawl job: a link id string and the URL to crawl (validated as a URL by zod). | ||
export const zCrawlLinkRequestSchema = z.object({ | ||
linkId: z.string(), | ||
url: z.string().url(), | ||
}); | ||
// Static type inferred from the schema above. | ||
export type ZCrawlLinkRequest = z.infer<typeof zCrawlLinkRequestSchema>; | ||
|
||
// Queue named "link_crawler_queue", typed with the crawl request as job data and void as the result. | ||
export const LinkCrawlerQueue = new Queue<ZCrawlLinkRequest, void>( | ||
"link_crawler_queue", | ||
{ connection: queueConnectionDetails }, | ||
); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.