Skip to content

Commit e68b4f4

Browse files
authored
fix: megaparse sdk with nats (#3496)
* Adapt deps
* Change megaparse processor inner file processing
1 parent a4e42b0 commit e68b4f4

File tree

3 files changed

+7
-32
lines changed

3 files changed

+7
-32
lines changed

core/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ dependencies = [
2323
"faiss-cpu>=1.8.0.post1",
2424
"rapidfuzz>=3.10.1",
2525
"markupsafe>=2.1.5",
26-
"megaparse[all]== 0.0.43",
26+
"megaparse-sdk==0.1.7"
2727
]
2828
readme = "README.md"
2929
requires-python = ">= 3.11"

core/quivr_core/processor/implementations/megaparse_processor.py

Lines changed: 5 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import tiktoken
44
from langchain_core.documents import Document
55
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
6-
from megaparse.core.megaparse import MegaParse
7-
from megaparse.core.parser.unstructured_parser import UnstructuredParser
6+
from megaparse_sdk.client import MegaParseNATSClient
7+
from megaparse_sdk.config import ClientNATSConfig
88

99
from quivr_core.config import MegaparseConfig
1010
from quivr_core.files.file import QuivrFile
@@ -75,9 +75,9 @@ def processor_metadata(self):
7575

7676
async def process_file_inner(self, file: QuivrFile) -> list[Document]:
7777
logger.info(f"Uploading file {file.path} to MegaParse")
78-
parser = UnstructuredParser(**self.megaparse_config.model_dump())
79-
megaparse = MegaParse(parser)
80-
response = await megaparse.aload(file.path)
78+
async with MegaParseNATSClient(ClientNATSConfig()) as client:
79+
response = await client.parse_file(file=file.path)
80+
8181
logger.info(f"File : {response}")
8282
document = Document(
8383
page_content=response,
@@ -87,28 +87,3 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
8787
for doc in docs:
8888
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
8989
return docs
90-
91-
# async def process_file_inner(self, file: QuivrFile) -> list[Document]:
92-
# api_key = str(os.getenv("MEGAPARSE_API_KEY"))
93-
# megaparse = MegaParseSDK(api_key)
94-
# logger.info(f"Uploading file {file.path} to MegaParse")
95-
# data = {
96-
# "method": self.megaparse_config.method,
97-
# "strategy": self.megaparse_config.strategy,
98-
# "check_table": self.megaparse_config.check_table,
99-
# "parsing_instruction": self.megaparse_config.parsing_instruction,
100-
# "model_name": self.megaparse_config.model_name,
101-
# }
102-
# response = await megaparse.file.upload(
103-
# file_path=str(file.path),
104-
# **data,
105-
# )
106-
# document = Document(
107-
# page_content=response["result"],
108-
# )
109-
# if len(response) > self.splitter_config.chunk_size:
110-
# docs = self.text_splitter.split_documents([document])
111-
# for doc in docs:
112-
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
113-
# return docs
114-
# return [document]

examples/simple_question_megaparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
if __name__ == "__main__":
1212
brain = Brain.from_files(
1313
name="test_brain",
14-
file_paths=["./tests/processor/docx/demo.docx"],
14+
file_paths=["./tests/processor/pdf/sample.pdf"],
1515
llm=LLMEndpoint(
1616
llm_config=LLMEndpointConfig(model="gpt-4o"),
1717
llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),

0 commit comments

Comments
 (0)