fix: megaparse sdk with nats (#3496)

chloedia · web-flow · commit e68b4f456988 · 2024-11-25T15:29:38.000+01:00
* Adapt deps: replace megaparse[all]==0.0.43 with megaparse-sdk==0.1.7 in core/pyproject.toml
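* Change the megaparse processor's process_file_inner to parse files through MegaParseNATSClient instead of a local UnstructuredParser/MegaParse instance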
diff --git a/core/pyproject.toml b/core/pyproject.toml
@@ -23,7 +23,7 @@ dependencies = [
     "faiss-cpu>=1.8.0.post1",
     "rapidfuzz>=3.10.1",
     "markupsafe>=2.1.5",
-    "megaparse[all]== 0.0.43",
+    "megaparse-sdk==0.1.7"
 ]
 readme = "README.md"
 requires-python = ">= 3.11"
diff --git a/core/quivr_core/processor/implementations/megaparse_processor.py b/core/quivr_core/processor/implementations/megaparse_processor.py
@@ -3,8 +3,8 @@
 import tiktoken
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
-from megaparse.core.megaparse import MegaParse
-from megaparse.core.parser.unstructured_parser import UnstructuredParser
+from megaparse_sdk.client import MegaParseNATSClient
+from megaparse_sdk.config import ClientNATSConfig
 
 from quivr_core.config import MegaparseConfig
 from quivr_core.files.file import QuivrFile
@@ -75,9 +75,9 @@ def processor_metadata(self):
 
     async def process_file_inner(self, file: QuivrFile) -> list[Document]:
         logger.info(f"Uploading file {file.path} to MegaParse")
-        parser = UnstructuredParser(**self.megaparse_config.model_dump())
-        megaparse = MegaParse(parser)
-        response = await megaparse.aload(file.path)
+        async with MegaParseNATSClient(ClientNATSConfig()) as client:
+            response = await client.parse_file(file=file.path)
+
         logger.info(f"File :  {response}")
         document = Document(
             page_content=response,
@@ -87,28 +87,3 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
         for doc in docs:
             doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
         return docs
-
-    # async def process_file_inner(self, file: QuivrFile) -> list[Document]:
-    #     api_key = str(os.getenv("MEGAPARSE_API_KEY"))
-    #     megaparse = MegaParseSDK(api_key)
-    #     logger.info(f"Uploading file {file.path} to MegaParse")
-    #     data = {
-    #         "method": self.megaparse_config.method,
-    #         "strategy": self.megaparse_config.strategy,
-    #         "check_table": self.megaparse_config.check_table,
-    #         "parsing_instruction": self.megaparse_config.parsing_instruction,
-    #         "model_name": self.megaparse_config.model_name,
-    #     }
-    #     response = await megaparse.file.upload(
-    #         file_path=str(file.path),
-    #         **data,
-    #     )
-    #     document = Document(
-    #         page_content=response["result"],
-    #     )
-    #     if len(response) > self.splitter_config.chunk_size:
-    #         docs = self.text_splitter.split_documents([document])
-    #         for doc in docs:
-    #             doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
-    #         return docs
-    #     return [document]
diff --git a/examples/simple_question_megaparse.py b/examples/simple_question_megaparse.py
@@ -11,7 +11,7 @@
 if __name__ == "__main__":
     brain = Brain.from_files(
         name="test_brain",
-        file_paths=["./tests/processor/docx/demo.docx"],
+        file_paths=["./tests/processor/pdf/sample.pdf"],
         llm=LLMEndpoint(
             llm_config=LLMEndpointConfig(model="gpt-4o"),
             llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ dependencies = [`
`23`	`23`	`"faiss-cpu>=1.8.0.post1",`
`24`	`24`	`"rapidfuzz>=3.10.1",`
`25`	`25`	`"markupsafe>=2.1.5",`
`26`		`- "megaparse[all]== 0.0.43",`
	`26`	`+ "megaparse-sdk==0.1.7"`
`27`	`27`	`]`
`28`	`28`	`readme = "README.md"`
`29`	`29`	`requires-python = ">= 3.11"`