mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
Introduce SQLite FTS5-powered search backend
Use SQLite's FTS5 extension to power full-text search without any additional dependencies. FTS5 was introduced in SQLite 3.9.0, [released][1] in 2015 so should be available on most SQLite installations at this point in time. [1]: https://www.sqlite.org/changes.html#version_3_9_0
This commit is contained in:
parent
c53ec45a29
commit
8fe5faf4d0
2 changed files with 159 additions and 4 deletions
|
@ -213,6 +213,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
|
||||
'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
|
||||
'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
|
||||
# SQLite3 FTS5
|
||||
'FTS_SEPARATE_DATABASE': {'type': bool, 'default': True},
|
||||
'FTS_TOKENIZERS': {'type': str, 'default': 'porter unicode61 remove_diacritics 2'},
|
||||
# Default from https://www.sqlite.org/limits.html#max_length
|
||||
'FTS_SQLITE_MAX_LENGTH': {'type': int, 'default': int(1e9)},
|
||||
},
|
||||
|
||||
'DEPENDENCY_CONFIG': {
|
||||
|
@ -345,6 +350,7 @@ ALLOWED_IN_OUTPUT_DIR = {
|
|||
'yarn.lock',
|
||||
'static',
|
||||
'sonic',
|
||||
'search.sqlite3',
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
|
|
|
@ -1,15 +1,164 @@
|
|||
from typing import List, Generator
|
||||
import codecs
|
||||
from typing import List, Optional, Generator
|
||||
import sqlite3
|
||||
|
||||
from archivebox.util import enforce_types
|
||||
from archivebox.config import (
|
||||
FTS_SEPARATE_DATABASE,
|
||||
FTS_TOKENIZERS,
|
||||
FTS_SQLITE_MAX_LENGTH
|
||||
)
|
||||
|
||||
FTS_TABLE = "snapshot_fts"
|
||||
FTS_ID_TABLE = "snapshot_id_fts"
|
||||
FTS_COLUMN = "texts"
|
||||
|
||||
if FTS_SEPARATE_DATABASE:
|
||||
database = sqlite3.connect("search.sqlite3")
|
||||
# Make connection callable, because `django.db.connection.cursor()`
|
||||
# has to be called to get a context manager, but sqlite3.Connection
|
||||
# is a context manager without being called.
|
||||
def connection():
|
||||
return database
|
||||
SQLITE_BIND = "?"
|
||||
else:
|
||||
from django.db import connection as database
|
||||
connection = database.cursor
|
||||
SQLITE_BIND = "%s"
|
||||
|
||||
# Only Python >= 3.11 supports sqlite3.Connection.getlimit(),
|
||||
# so fall back to the default if the API to get the real value isn't present
|
||||
try:
|
||||
limit_id = sqlite3.SQLITE_LIMIT_LENGTH
|
||||
try:
|
||||
with database.temporary_connection() as cursor:
|
||||
SQLITE_LIMIT_LENGTH = cursor.connection.getlimit(limit_id)
|
||||
except AttributeError:
|
||||
SQLITE_LIMIT_LENGTH = database.getlimit(limit_id)
|
||||
except AttributeError:
|
||||
SQLITE_LIMIT_LENGTH = FTS_SQLITE_MAX_LENGTH
|
||||
|
||||
|
||||
def _escape_sqlite3(value: str, *, quote: str, errors='strict') -> str:
|
||||
assert isinstance(quote, str), "quote is not a str"
|
||||
assert len(quote) == 1, "quote must be a single character"
|
||||
|
||||
encodable = value.encode('utf-8', errors).decode('utf-8')
|
||||
|
||||
nul_index = encodable.find("\x00")
|
||||
if nul_index >= 0:
|
||||
error = UnicodeEncodeError("NUL-terminated utf-8", encodable,
|
||||
nul_index, nul_index + 1, "NUL not allowed")
|
||||
error_handler = codecs.lookup_error(errors)
|
||||
replacement, _ = error_handler(error)
|
||||
encodable = encodable.replace("\x00", replacement)
|
||||
|
||||
return quote + encodable.replace(quote, quote * 2) + quote
|
||||
|
||||
def _escape_sqlite3_value(value: str, errors='strict') -> str:
|
||||
return _escape_sqlite3(value, quote="'", errors=errors)
|
||||
|
||||
def _escape_sqlite3_identifier(value: str) -> str:
|
||||
return _escape_sqlite3(value, quote='"', errors='strict')
|
||||
|
||||
@enforce_types
|
||||
def _create_tables():
|
||||
table = _escape_sqlite3_identifier(FTS_TABLE)
|
||||
# Escape as value, because fts5() expects
|
||||
# string literal column names
|
||||
column = _escape_sqlite3_value(FTS_COLUMN)
|
||||
id_table = _escape_sqlite3_identifier(FTS_ID_TABLE)
|
||||
tokenizers = _escape_sqlite3_value(FTS_TOKENIZERS)
|
||||
trigger_name = _escape_sqlite3_identifier(f"{FTS_ID_TABLE}_ad")
|
||||
|
||||
with connection() as cursor:
|
||||
# Create a contentless-delete FTS5 table that indexes
|
||||
# but does not store the texts of snapshots
|
||||
cursor.execute(
|
||||
f"CREATE VIRTUAL TABLE {table}"
|
||||
f" USING fts5({column},"
|
||||
f" tokenize={tokenizers},"
|
||||
" content='', contentless_delete=1);"
|
||||
)
|
||||
# Create a one-to-one mapping between ArchiveBox snapshot_id
|
||||
# and FTS5 rowid, because the column type of rowid can't be
|
||||
# customized.
|
||||
cursor.execute(
|
||||
f"CREATE TABLE {id_table}("
|
||||
" rowid INTEGER PRIMARY KEY AUTOINCREMENT,"
|
||||
" snapshot_id char(32) NOT NULL UNIQUE"
|
||||
");"
|
||||
)
|
||||
# Create a trigger to delete items from the FTS5 index when
|
||||
# the snapshot_id is deleted from the mapping, to maintain
|
||||
# consistency and make the `flush()` query simpler.
|
||||
cursor.execute(
|
||||
f"CREATE TRIGGER {trigger_name}"
|
||||
f" AFTER DELETE ON {id_table} BEGIN"
|
||||
f" DELETE FROM {table} WHERE rowid=old.rowid;"
|
||||
" END;"
|
||||
)
|
||||
|
||||
@enforce_types
|
||||
def index(snapshot_id: str, texts: List[str]):
|
||||
pass
|
||||
text = ' '.join(texts)[:SQLITE_LIMIT_LENGTH]
|
||||
|
||||
table = _escape_sqlite3_identifier(FTS_TABLE)
|
||||
column = _escape_sqlite3_identifier(FTS_COLUMN)
|
||||
id_table = _escape_sqlite3_identifier(FTS_ID_TABLE)
|
||||
|
||||
with connection() as cursor:
|
||||
retries = 2
|
||||
while retries > 0:
|
||||
retries -= 1
|
||||
try:
|
||||
# If there is already an FTS index rowid to snapshot_id mapping,
|
||||
# then don't insert a new one, silently ignoring the operation.
|
||||
# {id_table}.rowid is AUTOINCREMENT, so will generate an unused
|
||||
# rowid for the index if it is an unindexed snapshot_id.
|
||||
cursor.execute(
|
||||
f"INSERT OR IGNORE INTO {id_table}(snapshot_id) VALUES({SQLITE_BIND})",
|
||||
[snapshot_id])
|
||||
# Fetch the FTS index rowid for the given snapshot_id
|
||||
id_res = cursor.execute(
|
||||
f"SELECT rowid FROM {id_table} WHERE snapshot_id = {SQLITE_BIND}",
|
||||
[snapshot_id])
|
||||
rowid = id_res.fetchone()[0]
|
||||
# (Re-)index the content
|
||||
cursor.execute(
|
||||
"INSERT OR REPLACE INTO"
|
||||
f" {table}(rowid, {column}) VALUES ({SQLITE_BIND}, {SQLITE_BIND})",
|
||||
[rowid, text])
|
||||
# All statements succeeded; break retry loop
|
||||
break
|
||||
except Exception as e:
|
||||
if str(e).startswith(f"no such table:"):
|
||||
_create_tables()
|
||||
else:
|
||||
raise
|
||||
|
||||
@enforce_types
|
||||
def search(text: str) -> List[str]:
|
||||
pass
|
||||
table = _escape_sqlite3_identifier(FTS_TABLE)
|
||||
id_table = _escape_sqlite3_identifier(FTS_ID_TABLE)
|
||||
|
||||
with connection() as cursor:
|
||||
res = cursor.execute(
|
||||
f"SELECT snapshot_id FROM {table}"
|
||||
f" INNER JOIN {id_table}"
|
||||
f" ON {id_table}.rowid = {table}.rowid"
|
||||
f" WHERE {table} MATCH {SQLITE_BIND}",
|
||||
[text])
|
||||
snap_ids = [row[0] for row in res.fetchall()]
|
||||
return snap_ids
|
||||
|
||||
@enforce_types
|
||||
def flush(snapshot_ids: Generator[str, None, None]):
|
||||
pass
|
||||
snapshot_ids = list(snapshot_ids)
|
||||
|
||||
id_table = _escape_sqlite3_identifier(FTS_ID_TABLE)
|
||||
|
||||
with connection() as cursor:
|
||||
cursor.executemany(
|
||||
f"DELETE FROM {id_table} WHERE snapshot_id={SQLITE_BIND}",
|
||||
[snapshot_ids])
|
||||
|
|
Loading…
Reference in a new issue