Index Documents#
[1]:
import typing as T
import os
import shutil
from datetime import datetime, timezone, timedelta
from pathlib import Path
from whoosh import fields as F
from whoosh.index import exists_in, create_in, open_dir, Index
from whoosh import qparser, query, sorting
from rich import print as rprint
from rich.console import Console
[2]:
# Rich console handle for the notebook.
console = Console()

# The Whoosh index lives in a hidden ".whoosh_index" folder under the current
# working directory; create it up front (no error if it is already there).
dir_index = Path.cwd() / ".whoosh_index"
dir_index.mkdir(parents=True, exist_ok=True)
def clear_index():
    """Delete the on-disk index directory ``dir_index`` and everything in it.

    Errors raised while removing the tree are ignored.
    """
    shutil.rmtree(path=dir_index, ignore_errors=True)
def get_index(schema: F.SchemaClass):
    """Return the Whoosh index stored in ``dir_index``, creating it if needed.

    :param schema: schema used when a new index has to be created; it is not
        passed along when an index already exists and is simply opened.
    :return: the opened or newly created index object.
    """
    index_path = str(dir_index)
    if not exists_in(index_path):
        # No index yet: make sure the folder is there, then build a fresh one.
        dir_index.mkdir(parents=True, exist_ok=True)
        return create_in(dirname=index_path, schema=schema)
    return open_dir(index_path)
def result_to_docs(res) -> T.List[T.Dict[str, T.Any]]:
    """Collect the ``fields()`` of every hit in ``res`` into a list.

    :param res: iterable of hits, each exposing a ``fields()`` method.
    :return: one ``fields()`` result per hit, in iteration order.
    """
    docs = []
    for hit in res:
        docs.append(hit.fields())
    return docs
def search(idx: Index, q: query.Query):
    """Print a query in string and repr form, run it, and print the hits.

    Plain ``print`` banners separate the sections of the output.

    :param idx: index to search.
    :param q: query object to execute.
    """
    print("---------- Query ----------------------------")

    print("---------- equivalent query string ----------")
    print(q)

    print("---------- equivalent query object ----------")
    query_repr = repr(q)
    rprint(query_repr)

    print("---------- Result ----------------------------")
    with idx.searcher() as searcher:
        results = searcher.search(q)
        matched_docs = result_to_docs(results)
        rprint(matched_docs)
Store Arbitrary Data#
Whoosh can store an arbitrary dict containing int, float, string, binary, and datetime data.
[3]:
class DocumentSchema(F.SchemaClass):
    """Schema for the stored-data example: an ID plus an arbitrary payload."""

    # Document identifier; stored with the document and declared unique.
    doc_id = F.ID(stored=True, unique=True)
    # Payload kept in a STORED field (the example below puts a dict here).
    data = F.STORED()
schema = DocumentSchema()

# Fixed UTC-4 offset used for the sample birthday.
# NOTE(review): the name says "est", but US Eastern Standard Time is UTC-5 — confirm intent.
est = timezone(timedelta(hours=-4))

# Sample documents: the ``data`` field carries a dict mixing str, datetime and bytes values.
data = [
    dict(
        doc_id="id-1",
        data=dict(
            name="alice",
            birthday=datetime(1995, 1, 1, tzinfo=est),
            blob=b"hello",
        ),
    ),
]

# Wipe any existing index directory, then create a fresh index with this schema.
clear_index()
idx = get_index(schema)

# Use the writer as a context manager: it commits on a clean exit and cancels
# (releasing the write lock) if adding a document raises, so a failed cell
# does not leave the index locked for later cells.
with idx.writer() as writer:
    for doc in data:
        writer.add_document(**doc)
[4]:
# The query string names its field explicitly. The parser's default field list
# must still reference a field that exists in the schema, so it is ``doc_id``.
q_str = "doc_id:id-1"
q = qparser.MultifieldParser(["doc_id"], schema=schema).parse(q_str)
search(idx, q)
---------- Query ----------------------------
---------- equivalent query string ----------
doc_id:id-1
---------- equivalent query object ----------
Term('doc_id', 'id-1')
---------- Result ----------------------------
[ { 'data': { 'name': 'alice', 'birthday': datetime.datetime(1995, 1, 1, 0, 0, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))), 'blob': b'hello' }, 'doc_id': 'id-1' } ]
[ ]: