Fixed warnings from pylint
db.py
@@ -1,3 +1,7 @@
+#pylint: disable=missing-class-docstring,invalid-name,broad-exception-caught
+"""
+Database module for semantic document search tool.
+"""
 import pickle
 from pathlib import Path
 from dataclasses import dataclass
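
Note: the module-wide pragma added above silences three checks for the entire file. Pylint also accepts line-scoped disables, which keep the suppression next to the code that needs it; an illustrative sketch, not part of this commit:

    try:
        _ = _embed("Test.")
        return True
    except Exception:  # pylint: disable=broad-exception-caught
        return False
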
@@ -6,13 +10,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 import numpy as np
 import pymupdf
-import ollama # TODO split to another file
+import ollama
 
 #
 # Types
 #
 
-type Vector = np.NDArray # np.NDArray[np.float32] ?
+type Vector = np.NDArray
 type VectorBytes = bytes
 
 
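Note: NumPy has no top-level np.NDArray; the generic array alias lives in numpy.typing, which the removed "# np.NDArray[np.float32] ?" comment was circling around. A sketch of what the alias could look like, assuming Python 3.12+ and NumPy 1.21+ (not part of this commit):

    import numpy as np
    import numpy.typing as npt

    type Vector = npt.NDArray[np.float32]  # element type pinned to float32
    type VectorBytes = bytes               # raw buffer, e.g. from Vector.tobytes()
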
@@ -90,12 +94,29 @@ def _vectorize_record(record: Record) -> tuple[Record, Vector]:
     return record, _embed(record.text)
 
 
+def test_embedding() -> bool:
+    """
+    Test if embedding functionality is available and working.
+
+    Returns:
+        bool: True if embedding is working, False otherwise
+    """
+    try:
+        _ = _embed("Test.")
+        return True
+    except Exception:
+        return False
+
+
 #
 # High-level (exported) functions
 #
 
 
 def create_dummy() -> Database:
     """
     Create a dummy database for testing purposes.
     """
     db_length: Final[int] = 10
     vectors = [np.array([i, 2 * i, 3 * i, 4 * i]) for i in range(db_length)]
     records = {
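
Note: the new test_embedding helper is a liveness probe: it deliberately swallows any exception and reduces the outcome to a bool, which is exactly what the broad-exception-caught disable covers. A sketch of how a caller might use it; the CLI wiring is illustrative, not from this file:

    import sys

    def main() -> int:
        # Fail fast with a readable message instead of a traceback
        # raised from deep inside the first real _embed() call.
        if not test_embedding():
            print("Embedding backend unavailable; is ollama running?", file=sys.stderr)
            return 1
        return 0

    if __name__ == "__main__":
        sys.exit(main())
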
@@ -250,13 +271,11 @@ def add_document(db: Database | Path, file: Path, max_workers: int = 4) -> None:
         records: list[Record] = []
         chunk_size = 1024
 
-        for page_num in range(len(doc)):
-            page = doc[page_num]
+        for page_num, page in enumerate(doc):
             text = page.get_text().strip()
             if not text:
                 print(f" Page {page_num + 1}: Skipped (empty)")
                 continue
 
             # Simple chunking - split text into chunks of specified size
             for chunk_idx, i in enumerate(range(0, len(text), chunk_size)):
                 chunk = text[i : i + chunk_size]
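
Note: the loop above pairs a running chunk index with a character offset via enumerate(range(0, len(text), chunk_size)). The same fixed-size chunking, extracted into a self-contained sketch (the function name is illustrative):

    def chunk_text(text: str, chunk_size: int = 1024) -> list[str]:
        """Split text into consecutive chunks of at most chunk_size characters.

        A fixed window can cut mid-word or mid-sentence; the diff's own
        comment acknowledges this by calling it "simple chunking".
        """
        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]

    assert chunk_text("abcdef", 4) == ["abcd", "ef"]
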
@@ -267,14 +286,14 @@ def add_document(db: Database | Path, file: Path, max_workers: int = 4) -> None:
                 )
         doc.close()
     except Exception as e:
-        raise RuntimeError(f"Error processing PDF {file}: {e}")
+        raise RuntimeError(f"Error processing PDF {file}: {e}") from e
 
     # Process chunks in parallel
     print(f"Processing {len(records)} chunks with {max_workers} workers...")
 
     db.documents.append(file)
 
-    # TODO measure with GIL disabled to check if multithreading actually helps
+    # NOTE this will only help with GIL disabled
     with ThreadPoolExecutor(max_workers=max_workers) as pool:
         futures = [pool.submit(_vectorize_record, r) for r in records]
         for f in as_completed(futures):
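
Note: the raise ... from e change fixes pylint's raise-missing-from (W0707). Chaining records the original error as __cause__, so tracebacks read "the direct cause of the following exception" instead of the misleading "during handling of the above exception, another exception occurred". A standalone sketch of the effect:

    def parse_port(raw: str) -> int:
        try:
            return int(raw)
        except ValueError as e:
            # "from e" preserves the original ValueError as __cause__
            raise RuntimeError(f"Bad port {raw!r}: {e}") from e

    try:
        parse_port("http")
    except RuntimeError as err:
        assert isinstance(err.__cause__, ValueError)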
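
Note: the vectorization step is a fan-out/fan-in over a thread pool, reduced here to a minimal sketch (work is a stand-in for _vectorize_record). One caveat on the diff's NOTE: the GIL serializes threads only for CPU-bound pure-Python work; if _embed mostly waits on I/O, such as HTTP calls to a local ollama server, the pool can overlap requests even with the GIL enabled.

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def work(n: int) -> int:  # stand-in for _vectorize_record
        return n * n

    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(work, n) for n in range(10)]
        # as_completed yields futures in completion order, not submission order
        results = [f.result() for f in as_completed(futures)]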