3
3
import tiktoken
4
4
from langchain_core .documents import Document
5
5
from langchain_text_splitters import RecursiveCharacterTextSplitter , TextSplitter
6
- from megaparse . core . megaparse import MegaParse
7
- from megaparse . core . parser . unstructured_parser import UnstructuredParser
6
+ from megaparse_sdk . client import MegaParseNATSClient
7
+ from megaparse_sdk . config import ClientNATSConfig
8
8
9
9
from quivr_core .config import MegaparseConfig
10
10
from quivr_core .files .file import QuivrFile
@@ -75,9 +75,9 @@ def processor_metadata(self):
75
75
76
76
async def process_file_inner (self , file : QuivrFile ) -> list [Document ]:
77
77
logger .info (f"Uploading file { file .path } to MegaParse" )
78
- parser = UnstructuredParser ( ** self . megaparse_config . model_dump ())
79
- megaparse = MegaParse ( parser )
80
- response = await megaparse . aload ( file . path )
78
+ async with MegaParseNATSClient ( ClientNATSConfig ()) as client :
79
+ response = await client . parse_file ( file = file . path )
80
+
81
81
logger .info (f"File : { response } " )
82
82
document = Document (
83
83
page_content = response ,
@@ -87,28 +87,3 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
87
87
for doc in docs :
88
88
doc .metadata = {"chunk_size" : len (self .enc .encode (doc .page_content ))}
89
89
return docs
90
-
91
- # async def process_file_inner(self, file: QuivrFile) -> list[Document]:
92
- # api_key = str(os.getenv("MEGAPARSE_API_KEY"))
93
- # megaparse = MegaParseSDK(api_key)
94
- # logger.info(f"Uploading file {file.path} to MegaParse")
95
- # data = {
96
- # "method": self.megaparse_config.method,
97
- # "strategy": self.megaparse_config.strategy,
98
- # "check_table": self.megaparse_config.check_table,
99
- # "parsing_instruction": self.megaparse_config.parsing_instruction,
100
- # "model_name": self.megaparse_config.model_name,
101
- # }
102
- # response = await megaparse.file.upload(
103
- # file_path=str(file.path),
104
- # **data,
105
- # )
106
- # document = Document(
107
- # page_content=response["result"],
108
- # )
109
- # if len(response) > self.splitter_config.chunk_size:
110
- # docs = self.text_splitter.split_documents([document])
111
- # for doc in docs:
112
- # doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
113
- # return docs
114
- # return [document]
0 commit comments