mirror of
https://github.com/soconnor0919/pdf2md.git
synced 2026-02-04 15:56:36 -05:00
feat: implement OCR fallback for PDF text extraction and enhance background UI with animated elements.
This commit is contained in:
7541
package-lock.json
generated
Normal file
7541
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
@@ -21,6 +21,7 @@
|
|||||||
"@radix-ui/react-slot": "^1.2.4",
|
"@radix-ui/react-slot": "^1.2.4",
|
||||||
"@radix-ui/react-tabs": "^1.1.13",
|
"@radix-ui/react-tabs": "^1.1.13",
|
||||||
"@t3-oss/env-nextjs": "^0.12.0",
|
"@t3-oss/env-nextjs": "^0.12.0",
|
||||||
|
"canvas": "^3.2.0",
|
||||||
"class-variance-authority": "^0.7.1",
|
"class-variance-authority": "^0.7.1",
|
||||||
"clsx": "^2.1.1",
|
"clsx": "^2.1.1",
|
||||||
"lucide-react": "^0.556.0",
|
"lucide-react": "^0.556.0",
|
||||||
@@ -30,6 +31,7 @@
|
|||||||
"react-dom": "19.2.1",
|
"react-dom": "19.2.1",
|
||||||
"tailwind-merge": "^3.4.0",
|
"tailwind-merge": "^3.4.0",
|
||||||
"tailwindcss-animate": "^1.0.7",
|
"tailwindcss-animate": "^1.0.7",
|
||||||
|
"tesseract.js": "^6.0.1",
|
||||||
"zod": "^3.24.2"
|
"zod": "^3.24.2"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|||||||
@@ -5,7 +5,15 @@ export default function HomePage() {
|
|||||||
return (
|
return (
|
||||||
<main className="relative min-h-screen w-full bg-background selection:bg-primary/10">
|
<main className="relative min-h-screen w-full bg-background selection:bg-primary/10">
|
||||||
{/* Background Pattern */}
|
{/* Background Pattern */}
|
||||||
<div className="absolute inset-0 -z-10 h-full w-full bg-white dark:bg-black bg-[radial-gradient(#e5e7eb_1px,transparent_1px)] [background-size:16px_16px] dark:bg-[radial-gradient(#ffffff33_1px,transparent_1px)]" />
|
{/* Background Pattern */}
|
||||||
|
<div className="fixed inset-0 -z-10 h-full w-full bg-white dark:bg-neutral-950 overflow-hidden">
|
||||||
|
<div className="absolute inset-0 bg-[linear-gradient(to_right,#80808012_1px,transparent_1px),linear-gradient(to_bottom,#80808012_1px,transparent_1px)] bg-[size:24px_24px]"></div>
|
||||||
|
|
||||||
|
{/* Animated Blobs */}
|
||||||
|
<div className="absolute top-0 left-1/4 w-72 h-72 bg-purple-300 dark:bg-purple-600 rounded-full mix-blend-multiply dark:mix-blend-screen filter blur-xl opacity-70 dark:opacity-40 animate-blob"></div>
|
||||||
|
<div className="absolute top-0 right-1/4 w-72 h-72 bg-yellow-300 dark:bg-yellow-600 rounded-full mix-blend-multiply dark:mix-blend-screen filter blur-xl opacity-70 dark:opacity-40 animate-blob animation-delay-2000"></div>
|
||||||
|
<div className="absolute -bottom-8 left-1/3 w-72 h-72 bg-pink-300 dark:bg-pink-600 rounded-full mix-blend-multiply dark:mix-blend-screen filter blur-xl opacity-70 dark:opacity-40 animate-blob animation-delay-4000"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<Navbar />
|
<Navbar />
|
||||||
|
|
||||||
|
|||||||
31
src/lib/ocr.ts
Normal file
31
src/lib/ocr.ts
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
import { type PDFPageProxy } from 'pdfjs-dist';
|
||||||
|
import { createWorker } from 'tesseract.js';
|
||||||
|
import { createCanvas } from 'canvas';
|
||||||
|
|
||||||
|
/**
 * Rasterizes a single PDF page and runs Tesseract OCR over the image.
 *
 * The page is rendered at 2x scale onto a node-canvas surface, encoded as
 * PNG, and handed to a freshly created English (`'eng'`) Tesseract worker.
 *
 * This is a best-effort helper: any failure (rendering, worker start-up,
 * recognition, or worker shutdown) is logged and reported as an empty string
 * rather than thrown.
 *
 * @param page - The pdf.js page proxy to render and recognize.
 * @returns The recognized text, or `''` if any step failed.
 */
export async function performOcrOnPage(page: PDFPageProxy): Promise<string> {
  try {
    // Scale up for better OCR accuracy.
    const viewport = page.getViewport({ scale: 2.0 });
    const canvas = createCanvas(viewport.width, viewport.height);
    const context = canvas.getContext('2d');

    // Render the PDF page onto the canvas.
    await page.render({
      // Type mismatch between the node-canvas context and the DOM canvas
      // context that pdf.js declares, hence the deliberate escape hatch.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      canvasContext: context as any,
      viewport,
    }).promise;

    // Convert the canvas to a PNG image buffer for Tesseract.
    const buffer = canvas.toBuffer('image/png');

    const worker = await createWorker('eng');
    try {
      const { data } = await worker.recognize(buffer);
      return data.text;
    } finally {
      // Always release the worker, even when recognition rejects; otherwise
      // every failed page would leave a live worker behind.
      await worker.terminate();
    }
  } catch (error: unknown) {
    console.error('OCR failed for page:', error);
    return '';
  }
}
|
||||||
@@ -18,6 +18,10 @@ interface TextItem {
|
|||||||
hasEOL?: boolean;
|
hasEOL?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
import { performOcrOnPage } from './ocr';
|
||||||
|
|
||||||
|
// ... (existing imports and setup)
|
||||||
|
|
||||||
export async function extractTextFromPdf(buffer: Buffer): Promise<string> {
|
export async function extractTextFromPdf(buffer: Buffer): Promise<string> {
|
||||||
const data = new Uint8Array(buffer);
|
const data = new Uint8Array(buffer);
|
||||||
|
|
||||||
@@ -43,6 +47,20 @@ export async function extractTextFromPdf(buffer: Buffer): Promise<string> {
|
|||||||
// Filter empty items
|
// Filter empty items
|
||||||
const contentItems = items.filter(item => item.str.trim().length > 0);
|
const contentItems = items.filter(item => item.str.trim().length > 0);
|
||||||
|
|
||||||
|
// OCR Fallback: If page has very little text, try OCR
|
||||||
|
if (contentItems.length < 5) { // Threshold: fewer than 5 text items
|
||||||
|
// Check total character count too, just in case
|
||||||
|
const totalChars = contentItems.reduce((acc, item) => acc + item.str.length, 0);
|
||||||
|
if (totalChars < 50) {
|
||||||
|
console.log(`Page ${i} seems to be an image/scanned. Attempting OCR...`);
|
||||||
|
const ocrText = await performOcrOnPage(page);
|
||||||
|
if (ocrText.trim().length > 0) {
|
||||||
|
fullText += `\n\n${ocrText}\n\n`;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (contentItems.length === 0) continue;
|
if (contentItems.length === 0) continue;
|
||||||
|
|
||||||
for (const item of contentItems) {
|
for (const item of contentItems) {
|
||||||
|
|||||||
@@ -124,4 +124,36 @@
|
|||||||
body {
|
body {
|
||||||
@apply bg-background text-foreground;
|
@apply bg-background text-foreground;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Utility classes used by the animated background blobs. */
@layer utilities {
  /* Plays the `blob` keyframes on a 7-second loop that never stops. */
  .animate-blob {
    animation: 7s infinite blob;
  }

  /* Start offsets so blobs sharing `animate-blob` do not move in lockstep. */
  .animation-delay-2000 {
    animation-delay: 2000ms;
  }

  .animation-delay-4000 {
    animation-delay: 4000ms;
  }
}
|
||||||
|
|
||||||
|
/*
 * Drift cycle for `.animate-blob`: starts and ends at rest, moving up-right
 * while growing, then down-left while shrinking, in between.
 */
@keyframes blob {
  0%,
  100% {
    transform: translate(0, 0) scale(1);
  }

  33% {
    transform: translate(30px, -50px) scale(1.1);
  }

  66% {
    transform: translate(-20px, 20px) scale(0.9);
  }
}
|
||||||
Reference in New Issue
Block a user