Skip to content

Commit

Permalink
Implement metadata fetching logic in the crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
MohamedBassem committed Feb 6, 2024
1 parent e035c2f commit 6b5ec51
Show file tree
Hide file tree
Showing 24 changed files with 218 additions and 52 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ jobs:
run: bunx eslint .
- name: Format
run: bunx prettier . --check
- name: Prisma
working-directory: db
run: bunx prisma generate
- name: Build web app
working-directory: web
run: |
bunx prisma generate
bun run build
run: bun run build
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Several targets share a name with a workspace directory (e.g. `web`), so
# without this flag make would consider them already up to date and do nothing.
MAKEFLAGS += --always-make

# Rewrite files with Prettier, then lint with ESLint.
format:
	bunx prettier . --write && bunx eslint .

# Run `prisma migrate dev`, then regenerate the Prisma client.
# Steps are chained with `&&` so a failed `cd` or a failed migration stops the
# recipe instead of running the next command anyway (possibly in the wrong
# directory).
prisma:
	cd db && \
	bunx prisma migrate dev && \
	bunx prisma generate

# Start the crawler worker entry point with bun's `--watch` flag.
worker:
	cd crawler && \
	bun --watch index.ts

# Start the web app's `dev` script.
web:
	cd web && \
	bun run dev

# Open Prisma Studio from the db workspace.
studio:
	cd db && \
	bunx prisma studio
Binary file modified bun.lockb
Binary file not shown.
66 changes: 64 additions & 2 deletions crawler/crawler.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,68 @@
import logger from "@remember/shared/logger";
import {
ZCrawlLinkRequest,
zCrawlLinkRequestSchema,
} from "@remember/shared/queues";
import { Job } from "bullmq";

export default async function runCrawler(job: Job) {
logger.info(`[Crawler] Got a new job: ${job.name}`);
import prisma from "@remember/db";

// Load the metascraper factory first, then hand it the rule bundles that
// extract the description, image, logo, title and canonical URL of a page.
const createMetascraper = require("metascraper");
const metascraper = createMetascraper([
  require("metascraper-description")(),
  require("metascraper-image")(),
  require("metascraper-logo")(),
  require("metascraper-title")(),
  require("metascraper-url")(),
]);

/**
 * Queue processor for link-crawl jobs.
 *
 * Validates the job payload, downloads the page at `url`, extracts its
 * metadata with metascraper and upserts the result into the `details`
 * relation of the bookmarked link identified by `linkId`.
 *
 * @param job - Queue job whose `data` should match `zCrawlLinkRequestSchema`.
 * @throws Error when the page responds with a non-2xx HTTP status. Network
 *   and database errors from `fetch` / Prisma propagate unchanged.
 */
export default async function runCrawler(
  job: Job<ZCrawlLinkRequest, void>,
): Promise<void> {
  const jobId = job.id || "unknown";

  const request = zCrawlLinkRequestSchema.safeParse(job.data);
  if (!request.success) {
    // Log and return rather than throw: presumably because retrying a
    // malformed payload could never succeed.
    logger.error(
      `[Crawler][${jobId}] Got malformed job request: ${request.error.toString()}`,
    );
    return;
  }

  const { url, linkId } = request.data;

  logger.info(
    `[Crawler][${jobId}] Will crawl "${url}" for link with id "${linkId}"`,
  );
  // TODO(IMPORTANT): Run security validations on the input URL (e.g. deny localhost, etc)

  const resp = await fetch(url);
  if (!resp.ok) {
    // Without this check the body of an error page (404, 500, ...) would be
    // scraped and persisted as if it were the link's real metadata.
    throw new Error(
      `Failed to fetch "${url}": HTTP ${resp.status} ${resp.statusText}`,
    );
  }
  const respBody = await resp.text();

  const meta = await metascraper({
    url,
    html: respBody,
  });

  // The same values are written whether the details row is created or updated.
  // NOTE(review): the Prisma schema declares `title` as a required column;
  // confirm metascraper always yields a title, otherwise this write rejects.
  const details = {
    title: meta.title,
    description: meta.description,
    imageUrl: meta.image,
  };

  await prisma.bookmarkedLink.update({
    where: {
      id: linkId,
    },
    data: {
      details: {
        upsert: {
          create: details,
          update: details,
        },
      },
    },
  });
}
32 changes: 32 additions & 0 deletions crawler/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { Worker } from "bullmq";

import {
LinkCrawlerQueue,
ZCrawlLinkRequest,
queueConnectionDetails,
} from "@remember/shared/queues";
import logger from "@remember/shared/logger";
import runCrawler from "./crawler";

logger.info("Starting crawler worker ...");

// `autorun: false` keeps the worker idle until `run()` is called explicitly
// at the bottom of this file.
const workerOptions = {
  connection: queueConnectionDetails,
  autorun: false,
};

const crawlerWorker = new Worker<ZCrawlLinkRequest, void>(
  LinkCrawlerQueue.name,
  runCrawler,
  workerOptions,
);

/** Id used in log prefixes; falls back to "unknown" when the job or its id is missing. */
const resolveJobId = (job?: { id?: string }): string => job?.id || "unknown";

// Log the outcome of every processed job.
crawlerWorker.on("completed", (finishedJob) => {
  const jobId = resolveJobId(finishedJob);
  logger.info(`[Crawler][${jobId}] Completed successfully`);
});

crawlerWorker.on("failed", (failedJob, error) => {
  const jobId = resolveJobId(failedJob);
  logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
});

// Start processing and wait on every running worker (currently just one).
const runningWorkers = [crawlerWorker.run()];
await Promise.all(runningWorkers);
17 changes: 0 additions & 17 deletions crawler/main.ts

This file was deleted.

1 change: 1 addition & 0 deletions crawler/package.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"$schema": "https://json.schemastore.org/package.json",
"name": "@remember/crawler",
"version": "0.1.0",
"private": true,
Expand Down
File renamed without changes.
8 changes: 8 additions & 0 deletions db/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"$schema": "https://json.schemastore.org/package.json",
"name": "@remember/db",
"version": "0.1.0",
"private": true,
"main": "index.ts",
"dependencies": {}
}
16 changes: 16 additions & 0 deletions db/prisma/migrations/20240206184532_add_favicon/migration.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
-- RedefineTables
PRAGMA foreign_keys=OFF;
CREATE TABLE "new_BookmarkedLinkDetails" (
"id" TEXT NOT NULL PRIMARY KEY,
"title" TEXT NOT NULL,
"description" TEXT,
"imageUrl" TEXT,
"favicon" TEXT,
"createdAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT "BookmarkedLinkDetails_id_fkey" FOREIGN KEY ("id") REFERENCES "BookmarkedLink" ("id") ON DELETE CASCADE ON UPDATE CASCADE
);
INSERT INTO "new_BookmarkedLinkDetails" ("createdAt", "description", "id", "imageUrl", "title") SELECT "createdAt", "description", "id", "imageUrl", "title" FROM "BookmarkedLinkDetails";
DROP TABLE "BookmarkedLinkDetails";
ALTER TABLE "new_BookmarkedLinkDetails" RENAME TO "BookmarkedLinkDetails";
PRAGMA foreign_key_check;
PRAGMA foreign_keys=ON;
File renamed without changes.
5 changes: 3 additions & 2 deletions web/prisma/schema.prisma → db/prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,9 @@ model BookmarkedLink {
model BookmarkedLinkDetails {
id String @id
title String
description String
imageUrl String
description String?
imageUrl String?
favicon String?
createdAt DateTime @default(now())
link BookmarkedLink @relation(fields: [id], references: [id], onDelete: Cascade)
Expand Down
13 changes: 12 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,21 +1,32 @@
{
"$schema": "https://json.schemastore.org/package.json",
"name": "remember",
"version": "0.1.0",
"private": true,
"workspaces": [
"web",
"crawler",
"shared"
"shared",
"db"
],
"dependencies": {
"@next/eslint-plugin-next": "^14.1.0",
"@typescript-eslint/eslint-plugin": "^6.21.0",
"@typescript-eslint/parser": "^6.21.0",
"browserless": "^10.2.6",
"bullmq": "^5.1.9",
"class-variance-authority": "^0.7.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-react": "^7.33.2",
"eslint-plugin-react-hooks": "^4.6.0",
"prisma": "^5.9.1",
"metascraper": "^5.43.4",
"metascraper-description": "^5.43.4",
"metascraper-image": "^5.43.4",
"metascraper-logo": "^5.43.4",
"metascraper-title": "^5.43.4",
"metascraper-url": "^5.43.4",
"puppeteer": "^22.0.0",
"winston": "^3.11.0"
},
"devDependencies": {
Expand Down
2 changes: 1 addition & 1 deletion shared/index.ts
Original file line number Diff line number Diff line change
@@ -1 +1 @@
export * as Queues from './queues.ts';
export * as Queues from "./queues.ts";
16 changes: 8 additions & 8 deletions shared/logger.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import winston from "winston";

const logger = winston.createLogger({
level: process.env.LOG_LEVEL || "debug",
format: winston.format.combine(
winston.format.timestamp(),
winston.format.colorize(),
winston.format.printf(
(info) => `${info.timestamp} ${info.level}: ${info.message}`,
),
level: process.env.LOG_LEVEL || "debug",
format: winston.format.combine(
winston.format.timestamp(),
winston.format.colorize(),
winston.format.printf(
(info) => `${info.timestamp} ${info.level}: ${info.message}`,
),
transports: [new winston.transports.Console()],
),
transports: [new winston.transports.Console()],
});

export default logger;
4 changes: 2 additions & 2 deletions shared/package.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"$schema": "https://json.schemastore.org/package.json",
"name": "@remember/shared",
"version": "0.1.0",
"private": true,
"dependencies": {
},
"dependencies": {},
"main": "index.ts"
}
16 changes: 12 additions & 4 deletions shared/queues.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
import { Queue } from "bullmq";
import { z } from "zod";

export const queueConnectionDetails = {
host: process.env.REDIS_HOST || "localhost",
port: parseInt(process.env.REDIS_PORT || "6379"),
host: process.env.REDIS_HOST || "localhost",
port: parseInt(process.env.REDIS_PORT || "6379"),
};

export const LinkCrawlerQueue = new Queue("link_crawler_queue", { connection: queueConnectionDetails });

/**
 * Zod schema for the payload of a job on the link crawler queue.
 * The crawler validates `job.data` against it before doing any work.
 */
export const zCrawlLinkRequestSchema = z.object({
// Id of the bookmarked link whose details the crawler will upsert.
linkId: z.string(),
// Address of the page to crawl; must parse as a URL.
url: z.string().url(),
});
/** Payload type inferred from `zCrawlLinkRequestSchema`. */
export type ZCrawlLinkRequest = z.infer<typeof zCrawlLinkRequestSchema>;

export const LinkCrawlerQueue = new Queue<ZCrawlLinkRequest, void>(
"link_crawler_queue",
{ connection: queueConnectionDetails },
);
17 changes: 13 additions & 4 deletions web/app/api/v1/links/route.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { authOptions } from "@/lib/auth";
import prisma from "@/lib/prisma";
import { LinkCrawlerQueue } from "@remember/shared/queues";
import prisma from "@remember/db";

import {
ZNewBookmarkedLinkRequest,
zNewBookmarkedLinkRequestSchema,
ZGetLinksResponse,
ZBookmarkedLink,
} from "@/lib/types/api/links";
Expand All @@ -15,7 +17,9 @@ export async function POST(request: NextRequest) {
return new Response(null, { status: 401 });
}

const linkRequest = ZNewBookmarkedLinkRequest.safeParse(await request.json());
const linkRequest = zNewBookmarkedLinkRequestSchema.safeParse(
await request.json(),
);

if (!linkRequest.success) {
return NextResponse.json(
Expand All @@ -33,8 +37,13 @@ export async function POST(request: NextRequest) {
},
});

let response: ZBookmarkedLink = { ...link };
// Enqueue crawling request
await LinkCrawlerQueue.add("crawl", {
linkId: link.id,
url: link.url,
});

let response: ZBookmarkedLink = { ...link };
return NextResponse.json(response, { status: 201 });
}

Expand Down
12 changes: 12 additions & 0 deletions web/app/page.tsx
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
"use client";

import { useCallback } from "react";
import { LoginButton } from "../components/auth/login";
import { LogoutButton } from "../components/auth/logout";

export default function Home() {
const addUrl = useCallback(async () => {
await fetch("/api/v1/links", {
method: "POST",
body: JSON.stringify({ url: "https://mbassem.com" }),
});
}, []);
return (
<main className="flex min-h-screen flex-col items-center justify-between p-24">
<div>
<LoginButton />
<br />
<br />
<LogoutButton />
<br />
<br />
<button onClick={addUrl}>Add URL</button>
</div>
</main>
);
Expand Down
2 changes: 1 addition & 1 deletion web/lib/auth.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import NextAuth, { NextAuthOptions } from "next-auth";
import { PrismaAdapter } from "@next-auth/prisma-adapter";
import AuthentikProvider from "next-auth/providers/authentik";
import serverConfig from "@/lib/config";
import prisma from "@/lib/prisma";
import prisma from "@remember/db";

let providers = [];

Expand Down
14 changes: 8 additions & 6 deletions web/lib/types/api/links.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { z } from "zod";

export const ZBookmarkedLink = z.object({
export const zBookmarkedLinkSchema = z.object({
id: z.string(),
url: z.string().url(),
createdAt: z.coerce.date(),
Expand All @@ -13,13 +13,15 @@ export const ZBookmarkedLink = z.object({
})
.nullish(),
});
export type ZBookmarkedLink = z.infer<typeof ZBookmarkedLink>;
export type ZBookmarkedLink = z.infer<typeof zBookmarkedLinkSchema>;

// POST /v1/links
export const ZNewBookmarkedLinkRequest = ZBookmarkedLink.pick({ url: true });
export const zNewBookmarkedLinkRequestSchema = zBookmarkedLinkSchema.pick({
url: true,
});

// GET /v1/links
export const ZGetLinksResponse = z.object({
links: z.array(ZBookmarkedLink),
export const zGetLinksResponseSchema = z.object({
links: z.array(zBookmarkedLinkSchema),
});
export type ZGetLinksResponse = z.infer<typeof ZGetLinksResponse>;
export type ZGetLinksResponse = z.infer<typeof zGetLinksResponseSchema>;
Loading

0 comments on commit 6b5ec51

Please sign in to comment.