You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
1) I was using LlamaParse (the LlamaCloud parsing service) to read the content of scanned images in a PDF. LlamaParse was able to decode the text from the scanned images.
2) But starting today, LlamaParse is no longer able to decode the text; it is returning "NO_CONTENT_HERE".
Below is the code:
import nest_asyncio
nest_asyncio.apply()
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_parse import LlamaParse
from langchain.text_splitter import SpacyTextSplitter
import os
# Reproduction script: parse the PDFs under ./data with LlamaParse, split the
# returned markdown into pages on '---', and wrap every page in a Document.


class Document:
    """Hold the text of one page together with its metadata dict."""

    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata


os.environ["OPENAI_API_KEY"] = ""
print("hello")

parser = LlamaParse(
    api_key='&****',  # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type="markdown",  # "markdown" and "text" are available
)
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader("./data", file_extractor=file_extractor).load_data()
# Alternative: parse a single file directly instead of the whole directory.
# documents = parser.load_data("./data/LLP_27oct2008.pdf")

# Keep the text of every parsed document. The original loop reassigned
# `content` on each iteration, so only the last document survived. Joining
# with the same '---' marker that the split uses leaves the single-document
# case unchanged.
# NOTE(review): '---' can also occur inside markdown tables/rules, which would
# produce extra "pages" -- confirm this separator against the parser output.
content = "\n---\n".join(doc.text for doc in documents).split('---')
print(len(content))

# 1-based page number -> page text.
page_to_content_map = dict(enumerate(content, start=1))

# The 'page' metadata value is 0-based, hence the `- 1`.
documents = [
    Document(
        page_content=page_text,
        metadata={'page': page_no - 1, 'source': 'LLP_27oct2008.pdf'},
    )
    for page_no, page_text in page_to_content_map.items()
]
2) But starting today, LlamaParse is no longer able to decode the text; it is returning "NO_CONTENT_HERE".
Below is the code:
import nest_asyncio
nest_asyncio.apply()
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_parse import LlamaParse
from langchain.text_splitter import SpacyTextSplitter
import os
class Document:
    """Hold the text of one page together with its metadata dict.

    Args:
        page_content: Text of the page.
        metadata: Dict describing the page; the script below stores the
            keys 'page' and 'source' in it.
    """

    # The pasted listing showed `init`; the constructor must be `__init__`,
    # otherwise `Document(page_content=..., metadata=...)` raises TypeError.
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata
# Reproduction script: parse the PDFs under ./data with LlamaParse, split the
# returned markdown into pages on '---', wrap every page in a Document, and
# chunk the pages with SpacyTextSplitter.
os.environ["OPENAI_API_KEY"] = ""
print("hello")

parser = LlamaParse(
    api_key='&****',  # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type="markdown",  # "markdown" and "text" are available
)
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader("./data", file_extractor=file_extractor).load_data()
# Alternative: parse a single file directly instead of the whole directory.
# documents = parser.load_data("./data/LLP_27oct2008.pdf")

# Keep the text of every parsed document. The original loop reassigned
# `content` on each iteration, so only the last document survived. Joining
# with the same '---' marker that the split uses leaves the single-document
# case unchanged.
# NOTE(review): '---' can also occur inside markdown tables/rules, which would
# produce extra "pages" -- confirm this separator against the parser output.
content = "\n---\n".join(doc.text for doc in documents).split('---')
print(len(content))

# 1-based page number -> page text.
page_to_content_map = dict(enumerate(content, start=1))

# The 'page' metadata value is 0-based, hence the `- 1`.
documents = [
    Document(
        page_content=page_text,
        metadata={'page': page_no - 1, 'source': 'LLP_27oct2008.pdf'},
    )
    for page_no, page_text in page_to_content_map.items()
]

text_splitter = SpacyTextSplitter(chunk_size=500)
docs = text_splitter.split_documents(documents)
print(docs)
The PDF file I am parsing contains only scanned images.
The text was updated successfully, but these errors were encountered: