From bd0b712af99a33f7fc0a051267ad495a17897659 Mon Sep 17 00:00:00 2001 From: raghvender1205 Date: Mon, 5 May 2025 20:37:40 +0530 Subject: [PATCH] feat: implemented unstructured for pdf parsing and tree-sitter java parsers --- synthetic_data_kit/core/context.py | 1 + synthetic_data_kit/core/ingest.py | 2 + synthetic_data_kit/parsers/java_parser.py | 49 +++++++++++++++++++++++ synthetic_data_kit/parsers/pdf_parser.py | 8 ++-- 4 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 synthetic_data_kit/parsers/java_parser.py diff --git a/synthetic_data_kit/core/context.py b/synthetic_data_kit/core/context.py index d18e788..d4b7977 100644 --- a/synthetic_data_kit/core/context.py +++ b/synthetic_data_kit/core/context.py @@ -33,6 +33,7 @@ def _ensure_data_dirs(self): "data/docx", "data/ppt", "data/txt", + "data/java", "data/output", "data/generated", "data/cleaned", diff --git a/synthetic_data_kit/core/ingest.py b/synthetic_data_kit/core/ingest.py index 863f460..2175dca 100644 --- a/synthetic_data_kit/core/ingest.py +++ b/synthetic_data_kit/core/ingest.py @@ -21,6 +21,7 @@ def determine_parser(file_path: str, config: Dict[str, Any]): from synthetic_data_kit.parsers.docx_parser import DOCXParser from synthetic_data_kit.parsers.ppt_parser import PPTParser from synthetic_data_kit.parsers.txt_parser import TXTParser + from synthetic_data_kit.parsers.java_parser import JavaParser # Check if it's a URL if file_path.startswith(('http://', 'https://')): @@ -42,6 +43,7 @@ def determine_parser(file_path: str, config: Dict[str, Any]): '.docx': DOCXParser(), '.pptx': PPTParser(), '.txt': TXTParser(), + '.java': JavaParser(), } if ext in parsers: diff --git a/synthetic_data_kit/parsers/java_parser.py b/synthetic_data_kit/parsers/java_parser.py new file mode 100644 index 0000000..c1ad97a --- /dev/null +++ b/synthetic_data_kit/parsers/java_parser.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +# Java parser logic using LangChain's JavaSegmenter + +import os +from typing import Dict, Any + +class JavaParser: + """Parser for Java source code using LangChain's JavaSegmenter""" + + def parse(self, file_path: str) -> str: + """ + Parse a Java file into its class and function segments + + Args: + file_path: Path to the java file + + Returns: + Extracted segments joined by blank lines + + Raises: + ImportError: If LangChain or its dependencies are missing + RuntimeError: If reading or segmenting the file fails + """ + try: + from langchain_community.document_loaders.parsers.language.java import JavaSegmenter + with open(file_path, 'r', encoding='utf-8') as f: + code = f.read() + segmenter = JavaSegmenter(code) + segments = segmenter.extract_functions_classes() + return "\n\n".join(segments) + except ImportError: + raise ImportError("LangChain and its dependencies are required. Install them with: pip install langchain langchain-community tree_sitter tree_sitter_languages") + except Exception as e: + raise RuntimeError(f"Error parsing Java file: {e}") from e + + def save(self, content: str, output_path: str) -> None: + """Save the extracted segments to a file + + Args: + content: Extracted content + output_path: Path to save the text + """ + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(content) \ No newline at end of file diff --git a/synthetic_data_kit/parsers/pdf_parser.py b/synthetic_data_kit/parsers/pdf_parser.py index 3d19be1..0f79862 100644 --- a/synthetic_data_kit/parsers/pdf_parser.py +++ b/synthetic_data_kit/parsers/pdf_parser.py @@ -20,10 +20,12 @@ def parse(self, file_path: str) -> str: Extracted text from the PDF """ try: - from pdfminer.high_level import extract_text - return extract_text(file_path) + from unstructured.partition.pdf import partition_pdf + elements = partition_pdf(filename=file_path) + 
+ return "\n".join([str(elem) for elem in elements]) except ImportError: - raise ImportError("pdfminer.six is required for PDF parsing. Install it with: pip install pdfminer.six") + raise ImportError("unstructured is required for PDF parsing. Install it with: pip install 'unstructured[pdf]'") def save(self, content: str, output_path: str) -> None: """Save the extracted text to a file