mirror of
https://github.com/lemeow125/DocManagerBackend.git
synced 2025-01-18 17:13:00 +08:00
Improve start_watcher OCR
This commit is contained in:
parent
9529560fed
commit
dba3f6df62
1 changed file with 59 additions and 11 deletions
|
@ -97,7 +97,7 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
encoded_image = base64.b64encode(
|
encoded_image = base64.b64encode(
|
||||||
img_buffer.getvalue()).decode()
|
img_buffer.getvalue()).decode()
|
||||||
|
|
||||||
# First LLM API call to determine category
|
# Determine category
|
||||||
class DocumentSchema(BaseModel):
|
class DocumentSchema(BaseModel):
|
||||||
category: str = "other"
|
category: str = "other"
|
||||||
explanation: Optional[str] = None
|
explanation: Optional[str] = None
|
||||||
|
@ -109,8 +109,6 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
|
|
||||||
Possible document types are: {possible_categories}. You are free to create a new one if none are suitable.
|
Possible document types are: {possible_categories}. You are free to create a new one if none are suitable.
|
||||||
|
|
||||||
If the document_type is Special Order or Memorandum, provide the sender of the document under sent_from.
|
|
||||||
|
|
||||||
Do all of this and return your output in JSON.
|
Do all of this and return your output in JSON.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -130,20 +128,14 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
response.message.content)
|
response.message.content)
|
||||||
document_type = result.category
|
document_type = result.category
|
||||||
|
|
||||||
# Second LLM API call to determine other details
|
# Determine sender
|
||||||
class DocumentSchema(BaseModel):
|
class DocumentSchema(BaseModel):
|
||||||
sent_from: str = "N/A"
|
sent_from: str = "N/A"
|
||||||
subject: str = "N/A"
|
|
||||||
document_date: Optional[date]
|
|
||||||
explanation: Optional[str] = None
|
explanation: Optional[str] = None
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
Determine who sent the document. Otherwise, return N/A.
|
Determine who sent the document. Otherwise, return N/A.
|
||||||
|
|
||||||
Identify the subject or possible title of the document.
|
|
||||||
|
|
||||||
Return the date of the document if it exists.
|
|
||||||
|
|
||||||
Do all of this and return your output in JSON.
|
Do all of this and return your output in JSON.
|
||||||
"""
|
"""
|
||||||
response = client.chat(
|
response = client.chat(
|
||||||
|
@ -162,6 +154,61 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
response.message.content)
|
response.message.content)
|
||||||
|
|
||||||
sent_from = result.sent_from
|
sent_from = result.sent_from
|
||||||
|
|
||||||
|
# Determine subject
|
||||||
|
class DocumentSchema(BaseModel):
|
||||||
|
subject: str = "N/A"
|
||||||
|
explanation: Optional[str] = None
|
||||||
|
|
||||||
|
prompt = f"""
|
||||||
|
Identify the subject of the document if it exists.
|
||||||
|
|
||||||
|
Do all of this and return your output in JSON.
|
||||||
|
"""
|
||||||
|
response = client.chat(
|
||||||
|
model=get_secret("OLLAMA_MODEL"),
|
||||||
|
messages=[
|
||||||
|
{"role": "user",
|
||||||
|
"content": prompt,
|
||||||
|
"images": [encoded_image]},
|
||||||
|
],
|
||||||
|
format=DocumentSchema.model_json_schema(),
|
||||||
|
options={
|
||||||
|
"temperature": 0
|
||||||
|
},
|
||||||
|
)
|
||||||
|
result = DocumentSchema.model_validate_json(
|
||||||
|
response.message.content)
|
||||||
|
|
||||||
|
document_subject = result.subject
|
||||||
|
|
||||||
|
# Determine date
|
||||||
|
class DocumentSchema(BaseModel):
|
||||||
|
document_date: Optional[date]
|
||||||
|
explanation: Optional[str] = None
|
||||||
|
|
||||||
|
prompt = f"""
|
||||||
|
Identify the date of the document if it exists.
|
||||||
|
|
||||||
|
If you are unable to determine the date, return nothing.
|
||||||
|
|
||||||
|
Do all of this and return your output in JSON.
|
||||||
|
"""
|
||||||
|
response = client.chat(
|
||||||
|
model=get_secret("OLLAMA_MODEL"),
|
||||||
|
messages=[
|
||||||
|
{"role": "user",
|
||||||
|
"content": prompt,
|
||||||
|
"images": [encoded_image]},
|
||||||
|
],
|
||||||
|
format=DocumentSchema.model_json_schema(),
|
||||||
|
options={
|
||||||
|
"temperature": 0
|
||||||
|
},
|
||||||
|
)
|
||||||
|
result = DocumentSchema.model_validate_json(
|
||||||
|
response.message.content)
|
||||||
|
|
||||||
document_date = result.document_date
|
document_date = result.document_date
|
||||||
|
|
||||||
if document_date:
|
if document_date:
|
||||||
|
@ -199,7 +246,8 @@ class PDFHandler(FileSystemEventHandler):
|
||||||
document_type=document_type,
|
document_type=document_type,
|
||||||
sent_from=sent_from,
|
sent_from=sent_from,
|
||||||
document_month=document_month,
|
document_month=document_month,
|
||||||
document_year=document_year
|
document_year=document_year,
|
||||||
|
subject=document_subject
|
||||||
)
|
)
|
||||||
|
|
||||||
DOCUMENT.file.save(
|
DOCUMENT.file.save(
|
||||||
|
|
Loading…
Reference in a new issue