Skip to content

Parse PDF into text but left out the last page #18

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: Development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion client/next.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const nextConfig = {
"sharp$": false,
"onnxruntime-node$": false,
}

config.resolve.alias.canvas = false
config.resolve.alias.encoding = false
return config;
Expand Down
48 changes: 47 additions & 1 deletion client/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@
"@xenova/transformers": "^2.6.2",
"filepond": "^4.30.4",
"framer-motion": "^10.16.4",
"fs": "^0.0.1-security",
"next": "latest",
"pdf2json": "^3.0.4",
"react": "latest",
"react-dom": "latest",
"react-drag-drop-files": "^2.3.10",
"react-filepond": "^7.1.2",
"react-pdf": "^7.5.0"
"react-pdf": "^7.5.0",
"uuid": "^9.0.1"
},
"devDependencies": {
"@types/long": "^5.0.0",
Expand Down
Binary file added client/public/sample3.pdf
Binary file not shown.
60 changes: 60 additions & 0 deletions client/src/app/api/upload/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import { NextRequest, NextResponse } from 'next/server'; // To handle the request and response
import { promises as fs } from 'fs'; // To save the file temporarily
import { v4 as uuidv4 } from 'uuid'; // To generate a unique filename
import PDFParser from 'pdf2json'; // To parse the pdf

/**
 * Parses the PDF stored at `pdfPath` with pdf2json and resolves with its raw text.
 *
 * pdf2json reports its result through events, so the events are wrapped in a
 * Promise that the caller can await before building the HTTP response.
 *
 * @param pdfPath - Path of the PDF file on disk.
 * @returns The text returned by the parser's `getRawTextContent()`.
 * @throws Rejects with the parser error when `pdfParser_dataError` fires.
 */
function extractPdfText(pdfPath: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    // The type definitions shipped with pdf2json do not allow constructor
    // arguments, so the type check is bypassed here instead of patching them.
    // See the pdf2json docs for the meaning of the (null, 1) arguments.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const pdfParser = new (PDFParser as any)(null, 1);

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    pdfParser.on('pdfParser_dataError', (errData: any) =>
      reject(errData?.parserError ?? errData)
    );

    pdfParser.on('pdfParser_dataReady', () => {
      // Read the text exactly once, when the parser signals it is ready.
      resolve(pdfParser.getRawTextContent());
    });

    // loadPDF may itself return a promise (depends on the pdf2json version);
    // route a failure from it into the same rejection path.
    Promise.resolve(pdfParser.loadPDF(pdfPath)).catch(reject);
  });
}

/**
 * Handles a FilePond upload: stores the uploaded PDF in a temporary file,
 * extracts its text and returns that text as the response body.
 *
 * The generated (extension-less) file name is returned in the `FileName`
 * response header. When no usable file is uploaded, or parsing fails, the
 * body is an empty string, as before.
 *
 * @param req - Incoming multipart/form-data request with a `filepond` field.
 * @returns Response whose body is the parsed text of the uploaded PDF.
 */
export async function POST(req: NextRequest) {
  const formData: FormData = await req.formData();
  const uploadedFiles = formData.getAll('filepond');
  let fileName = '';
  let parsedText = '';

  if (uploadedFiles.length > 0) {
    // The original read index 1 unconditionally (presumably because FilePond
    // posts a metadata entry before the file - TODO confirm). Picking the
    // first File entry gives the same result in that case and also works when
    // the file is the only entry.
    const uploadedFile = uploadedFiles.find(
      (entry): entry is File => entry instanceof File
    );
    console.log('Uploaded file:', uploadedFile);

    if (uploadedFile) {
      // Generate a unique filename
      fileName = uuidv4();

      // The upload is persisted to a temporary file that pdf2json reads back.
      const tempFilePath = `/tmp/${fileName}.pdf`;

      // Convert ArrayBuffer to Buffer
      const fileBuffer = Buffer.from(await uploadedFile.arrayBuffer());

      try {
        await fs.writeFile(tempFilePath, fileBuffer);

        try {
          // Wait for the parser to finish; previously the response was sent
          // before 'pdfParser_dataReady' fired, so the body was always empty.
          parsedText = await extractPdfText(tempFilePath);
          console.log(parsedText);
        } catch (parseError: unknown) {
          // Keep the best-effort contract: log and answer with empty text.
          console.log(parseError);
        }
      } finally {
        // Always remove the temporary file; ignore a failure to delete it
        // (e.g. the write itself failed and nothing was created).
        await fs.unlink(tempFilePath).catch(() => undefined);
      }
    } else {
      console.log('Uploaded file is not in the expected format.');
    }
  } else {
    console.log('No files found.');
  }

  const response = new NextResponse(parsedText);
  response.headers.set('FileName', fileName);
  return response;
}
4 changes: 2 additions & 2 deletions client/src/app/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import {
CardBody,
} from "@nextui-org/react";

import FileUploader from "@components/DropFile";
import DropFile from "@components/DropFile";

import FileViewer from "@components/FileViewer";

Expand Down Expand Up @@ -86,7 +86,7 @@ const Home = () => {
</p>
</div>

<FileUploader />
<DropFile />

<FileViewer />
<Card>
Expand Down
15 changes: 8 additions & 7 deletions client/src/components/DropFile.tsx
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import React, { useState } from "react";
import { FileUploader } from "react-drag-drop-files";
"use client";

const fileTypes = ["PDF"];
import { FilePond } from "react-filepond";
import "filepond/dist/filepond.min.css";
import { useState } from "react";

const DropFile = () => {

export default const DropFile = () => {

return (
<div className="w-[25vw]">
Expand All @@ -15,7 +17,6 @@ const DropFile = () => {
className="max-w-max"
/>
</div>
);
};

export default DropFile;
);
}
80 changes: 36 additions & 44 deletions client/src/components/FileViewer.tsx
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"use client";

import React, { useCallback, useState } from "react";
import React, { useCallback, useEffect, useState } from "react";

import { Button } from "@nextui-org/react";

import { pdfjs, Document, Page } from "react-pdf";
// import "react-pdf/dist/Page/AnnotationLayer.css";
import "react-pdf/dist/Page/AnnotationLayer.css";
import "react-pdf/dist/Page/TextLayer.css";

pdfjs.GlobalWorkerOptions.workerSrc = new URL(
Expand All @@ -21,27 +21,16 @@ function highlightPattern(text: string, pattern: string) {
}

export default function FileViewer() {
const [searchText, setSearchText] = useState("");
const [searchText, setSearchText] = useState<string>("");

const [numPages, setNumPages] = useState(null);
const [pageNumber, setPageNumber] = useState(1);

// Display the first page
const [currentPageText, setCurrentPageText] = useState<string>("");
const [context, setContext] = useState<string>("");

// Get number of pages
function onDocumentLoadSuccess({ numPages }: any) {
setNumPages(numPages);
setPageNumber(1);
}

// Navigation
function changePage(offset: any) {
setPageNumber((prevPageNumber) => prevPageNumber + offset);
}
function previousPage() {
changePage(-1);
}

function nextPage() {
changePage(1);
}

// Highlighting given text
Expand All @@ -50,38 +39,31 @@ export default function FileViewer() {
[searchText],
);

// Get text from single page
const getTextPage = useCallback(
(e: any) =>
e.getTextContent().then((textContent: any) => {
let pageText: string = textContent.items
.map((s: any) => s.str)
.join("");
setCurrentPageText(pageText);
console.log("Current page: " + pageText);
}),
[],
);

// Update context (still missing the last page)
useEffect(() => {
setContext(context + " " + currentPageText);
console.log("Context: " + context);
}, [currentPageText]);

// Update search text
function onChange(event: any) {
setSearchText(event.target.value);
}

return (
<div className="justify-center flex-col items-center flex gap-[0.25vw]">
<div className="flex justify-center max-w-full flex-col items-center gap-[0.25vw]">
<p className="items-center justify-center flex-auto">
Page {pageNumber || (numPages ? 1 : "--")} of {numPages || "--"}
</p>
<div className="flex justify-between items-center w-[30vw]">
<Button
color="primary"
disabled={pageNumber <= 1}
onClick={previousPage}
>
Previous
</Button>
<Button
color="primary"
disabled={numPages === null || pageNumber >= numPages}
onClick={nextPage}
>
Next
</Button>
</div>
</div>

<Document file={"./sample2.pdf"} onLoadSuccess={onDocumentLoadSuccess}>
<Page pageNumber={pageNumber} customTextRenderer={textRenderer} />
</Document>
<div>
<label htmlFor="search">Search:</label>
<input
Expand All @@ -91,6 +73,16 @@ export default function FileViewer() {
onChange={onChange}
/>
</div>
<Document file={"./sample3.pdf"} onLoadSuccess={onDocumentLoadSuccess}>
{Array.from(new Array(numPages), (el, index) => (
<Page
key={`page_${index + 1}`}
pageNumber={index + 1}
onLoadSuccess={getTextPage}
customTextRenderer={textRenderer}
/>
))}
</Document>
</div>
);
}