Multithread adding files (again)
db.py
@@ -81,8 +81,8 @@ def _embed(text: str) -> Vector:
     return np.array(ollama.embeddings(model=MODEL, prompt=text)["embedding"])
 
 
-def _vectorize_record(record: Record) -> Vector:
-    return _embed(record.text)
+def _vectorize_record(record: Record) -> tuple[Record, Vector]:
+    return record, _embed(record.text)
 
 
 #
 # High-level (exported) functions
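Why the signature change: as_completed() yields futures in completion order, not submission order, so a task that returned only the vector would leave the caller unable to tell which record produced it. Returning the (record, vector) pair keeps the two together, which is exactly what the second hunk below relies on. A minimal self-contained sketch of the pattern, where work() is a hypothetical stand-in for _vectorize_record:

# Sketch only, not from the commit: work() stands in for _vectorize_record.
from concurrent.futures import ThreadPoolExecutor, as_completed

def work(item: str) -> tuple[str, int]:
    # Return the input together with its result so the caller can pair
    # them up even though results arrive out of order.
    return item, len(item)

with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(work, s) for s in ["alpha", "be", "gamma"]]
    for future in as_completed(futures):
        item, result = future.result()  # completion order, not submission order
        print(item, result)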
@@ -260,40 +260,16 @@ def add_document(db: Database | Path, file: Path, max_workers: int = 4) -> None:
     print(f"Processing {len(records)} chunks with {max_workers} workers...")
 
     db.documents.append(file)
-    for record in records:
-        vector = _vectorize_record(record)
-        db.records[vector.tobytes()] = record
-        db.vectors.append(vector)
 
     # TODO measure with GIL disabled to check if multithreading actually helps
-    # with ThreadPoolExecutor(max_workers=max_workers) as executor:
-    #     # Submit all chunk processing tasks
-    #     vector_futures = {
-    #         executor.submit(_vectorize_record, r): r for r in records
-    #     }
-
-    #     # Collect results as they complete
-    #     completed_chunks = 0
-    #     for future in as_completed(vector_futures):
-    #         try:
-    #             vector = future.result()
-    #             # Convert vector to bytes for use as dictionary key
-    #             vector_bytes = vector.tobytes()
-    #             # Add to database
-    #             db.vectors.append(vector)
-    #             db.records[vector_bytes] = record
-
-    #             completed_chunks += 1
-    #             # if completed_chunks % 10 == 0 or completed_chunks == len(chunk_tasks):
-    #             #     print(f"  Completed {completed_chunks}/{len(chunk_tasks)} chunks")
-
-    #         except Exception as e:
-    #             chunk_data = future_to_chunk[future]
-    #             print(f"  Error processing chunk {chunk_data}: {e}")
-
-    print(f"Successfully processed {file}: {len(records)} chunks")
-
-
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        futures = [ pool.submit(_vectorize_record, r) for r in records ]
+        for f in as_completed(futures):
+            record, vector = f.result()
+            db.records[vector.tobytes()] = record
+            db.vectors.append(vector)
+
+    print(f"Successfully processed {file}: {len(records)} chunks")
 
     # Save database if we loaded it from file
     if save_to_file and database_file_path:
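On the TODO in the hunk above: ollama.embeddings blocks on an HTTP round-trip to the Ollama server, and CPython releases the GIL during blocking socket I/O, so the thread pool should already overlap requests on a standard GIL-enabled build; a free-threaded build would mainly matter if Python-side work dominated. A self-contained timing sketch one could use to check (all names here are hypothetical stand-ins, not from db.py; fake_vectorize simulates the blocking call with a sleep):

# Sketch only: compare the removed sequential loop against the new thread pool.
# fake_vectorize simulates the network-bound embedding call; swap in
# _vectorize_record from db.py to measure the real thing.
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_vectorize(record: str) -> tuple[str, int]:
    time.sleep(0.05)  # stand-in for the blocking ollama HTTP call
    return record, len(record)

def sequential(records):
    return [fake_vectorize(r) for r in records]

def threaded(records, max_workers=4):
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(fake_vectorize, r) for r in records]
        return [f.result() for f in as_completed(futures)]

records = [f"chunk-{i}" for i in range(40)]
for name, fn in [("sequential", sequential), ("threaded", threaded)]:
    start = time.perf_counter()
    fn(records)
    print(f"{name}: {time.perf_counter() - start:.2f}s")

A separate caveat visible in the new code: db.records is keyed by vector.tobytes(), so two chunks that embed to an identical vector overwrite each other. The threading does not introduce this; the removed sequential loop had the same behavior.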