- # Copyright 2015, 2016 OpenMarket Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import enum
- import logging
- import re
- from collections import deque
- from dataclasses import dataclass
- from typing import (
- TYPE_CHECKING,
- Any,
- Collection,
- Iterable,
- List,
- Optional,
- Set,
- Tuple,
- Union,
- )
-
- import attr
-
- from synapse.api.errors import SynapseError
- from synapse.events import EventBase
- from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
- from synapse.storage.database import (
- DatabasePool,
- LoggingDatabaseConnection,
- LoggingTransaction,
- )
- from synapse.storage.databases.main.events_worker import EventRedactBehaviour
- from synapse.storage.engines import PostgresEngine, Sqlite3Engine
- from synapse.types import JsonDict
-
- if TYPE_CHECKING:
- from synapse.server import HomeServer
-
- logger = logging.getLogger(__name__)
-
-
- @attr.s(slots=True, frozen=True, auto_attribs=True)
- class SearchEntry:
- key: str
- value: str
- event_id: str
- room_id: str
- stream_ordering: Optional[int]
- origin_server_ts: int
-
-
- def _clean_value_for_search(value: str) -> str:
- """
- Replaces any null code points in the string with spaces, since
- neither Postgres nor SQLite accepts strings containing null code
- points in their full-text search tables.
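-
- Doctest-style illustration:
-
- >>> _clean_value_for_search("hello\u0000world")
- 'hello world'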
- """
- return value.replace("\u0000", " ")
-
-
- class SearchWorkerStore(SQLBaseStore):
- def store_search_entries_txn(
- self, txn: LoggingTransaction, entries: Iterable[SearchEntry]
- ) -> None:
- """Add entries to the search table
-
- Args:
- txn: The database transaction.
- entries: entries to be added to the table
- """
- if not self.hs.config.server.enable_search:
- return
- if isinstance(self.database_engine, PostgresEngine):
- sql = """
- INSERT INTO event_search
- (event_id, room_id, key, vector, stream_ordering, origin_server_ts)
- VALUES (?,?,?,to_tsvector('english', ?),?,?)
- """
-
- args = (
- (
- entry.event_id,
- entry.room_id,
- entry.key,
- _clean_value_for_search(entry.value),
- entry.stream_ordering,
- entry.origin_server_ts,
- )
- for entry in entries
- )
-
- txn.execute_batch(sql, args)
-
- elif isinstance(self.database_engine, Sqlite3Engine):
- self.db_pool.simple_insert_many_txn(
- txn,
- table="event_search",
- keys=("event_id", "room_id", "key", "value"),
- values=(
- (
- entry.event_id,
- entry.room_id,
- entry.key,
- _clean_value_for_search(entry.value),
- )
- for entry in entries
- ),
- )
-
- else:
- # This should be unreachable.
- raise Exception("Unrecognized database engine")
-
-
- class SearchBackgroundUpdateStore(SearchWorkerStore):
- EVENT_SEARCH_UPDATE_NAME = "event_search"
- EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order"
- EVENT_SEARCH_USE_GIN_POSTGRES_NAME = "event_search_postgres_gin"
- EVENT_SEARCH_DELETE_NON_STRINGS = "event_search_sqlite_delete_non_strings"
-
- def __init__(
- self,
- database: DatabasePool,
- db_conn: LoggingDatabaseConnection,
- hs: "HomeServer",
- ):
- super().__init__(database, db_conn, hs)
-
- self.db_pool.updates.register_background_update_handler(
- self.EVENT_SEARCH_UPDATE_NAME, self._background_reindex_search
- )
- self.db_pool.updates.register_background_update_handler(
- self.EVENT_SEARCH_ORDER_UPDATE_NAME, self._background_reindex_search_order
- )
-
- self.db_pool.updates.register_background_update_handler(
- self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME, self._background_reindex_gin_search
- )
-
- self.db_pool.updates.register_background_update_handler(
- self.EVENT_SEARCH_DELETE_NON_STRINGS, self._background_delete_non_strings
- )
-
- async def _background_reindex_search(
- self, progress: JsonDict, batch_size: int
- ) -> int:
- # we work through the events table from highest stream id to lowest
- target_min_stream_id = progress["target_min_stream_id_inclusive"]
- max_stream_id = progress["max_stream_id_exclusive"]
- rows_inserted = progress.get("rows_inserted", 0)
-
- TYPES = ["m.room.name", "m.room.message", "m.room.topic"]
-
- def reindex_search_txn(txn: LoggingTransaction) -> int:
- sql = """
- SELECT stream_ordering, event_id, room_id, type, json, origin_server_ts
- FROM events
- JOIN event_json USING (room_id, event_id)
- WHERE ? <= stream_ordering AND stream_ordering < ?
- AND (%s)
- ORDER BY stream_ordering DESC
- LIMIT ?
- """ % (
- " OR ".join("type = '%s'" % (t,) for t in TYPES),
- )
-
- txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size))
-
- # we could stream straight from the results into
- # store_search_entries_txn with a generator function, but that
- # would mean having two cursors open on the database at once.
- # Instead we just build a list of results.
- rows = self.db_pool.cursor_to_dict(txn)
- if not rows:
- return 0
-
- min_stream_id = rows[-1]["stream_ordering"]
-
- event_search_rows = []
- for row in rows:
- try:
- event_id = row["event_id"]
- room_id = row["room_id"]
- etype = row["type"]
- stream_ordering = row["stream_ordering"]
- origin_server_ts = row["origin_server_ts"]
- try:
- event_json = db_to_json(row["json"])
- content = event_json["content"]
- except Exception:
- continue
-
- if etype == "m.room.message":
- key = "content.body"
- value = content["body"]
- elif etype == "m.room.topic":
- key = "content.topic"
- value = content["topic"]
- elif etype == "m.room.name":
- key = "content.name"
- value = content["name"]
- else:
- raise Exception("unexpected event type %s" % etype)
- except (KeyError, AttributeError):
- # If the event is missing a necessary field then
- # skip over it.
- continue
-
- if not isinstance(value, str):
- # If the event body, name or topic isn't a string
- # then skip over it
- continue
-
- event_search_rows.append(
- SearchEntry(
- key=key,
- value=value,
- event_id=event_id,
- room_id=room_id,
- stream_ordering=stream_ordering,
- origin_server_ts=origin_server_ts,
- )
- )
-
- self.store_search_entries_txn(txn, event_search_rows)
-
- progress = {
- "target_min_stream_id_inclusive": target_min_stream_id,
- "max_stream_id_exclusive": min_stream_id,
- "rows_inserted": rows_inserted + len(event_search_rows),
- }
-
- self.db_pool.updates._background_update_progress_txn(
- txn, self.EVENT_SEARCH_UPDATE_NAME, progress
- )
-
- return len(event_search_rows)
-
- if self.hs.config.server.enable_search:
- result = await self.db_pool.runInteraction(
- self.EVENT_SEARCH_UPDATE_NAME, reindex_search_txn
- )
- else:
- # Don't index anything if search is not enabled.
- result = 0
-
- if not result:
- await self.db_pool.updates._end_background_update(
- self.EVENT_SEARCH_UPDATE_NAME
- )
-
- return result
-
- async def _background_reindex_gin_search(
- self, progress: JsonDict, batch_size: int
- ) -> int:
- """This handles old synapses which used GIST indexes, if any;
- converting them back to be GIN as per the actual schema.
- """
-
- def create_index(conn: LoggingDatabaseConnection) -> None:
- conn.rollback()
-
- # we have to set autocommit, because postgres refuses to run
- # CREATE INDEX CONCURRENTLY inside a transaction.
- conn.set_session(autocommit=True)
-
- try:
- c = conn.cursor()
-
- # if we skipped the conversion to GIST, we may already/still
- # have an event_search_fts_idx; unfortunately postgres 9.4
- # doesn't support CREATE INDEX IF NOT EXISTS, so we just catch the
- # exception and ignore it.
- import psycopg2
-
- try:
- c.execute(
- """
- CREATE INDEX CONCURRENTLY event_search_fts_idx
- ON event_search USING GIN (vector)
- """
- )
- except psycopg2.ProgrammingError as e:
- logger.warning(
- "Ignoring error %r when trying to switch from GIST to GIN", e
- )
-
- # we should now be able to delete the GIST index.
- c.execute("DROP INDEX IF EXISTS event_search_fts_idx_gist")
- finally:
- conn.set_session(autocommit=False)
-
- if isinstance(self.database_engine, PostgresEngine):
- await self.db_pool.runWithConnection(create_index)
-
- await self.db_pool.updates._end_background_update(
- self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME
- )
- return 1
-
- async def _background_reindex_search_order(
- self, progress: JsonDict, batch_size: int
- ) -> int:
- target_min_stream_id = progress["target_min_stream_id_inclusive"]
- max_stream_id = progress["max_stream_id_exclusive"]
- rows_inserted = progress.get("rows_inserted", 0)
- have_added_index = progress["have_added_indexes"]
-
- if not have_added_index:
-
- def create_index(conn: LoggingDatabaseConnection) -> None:
- conn.rollback()
- conn.set_session(autocommit=True)
- c = conn.cursor()
-
- # We create the indexes with NULLS FIRST so that when we search *backwards*
- # we get the rows with a non-null origin_server_ts *first*.
- c.execute(
- """
- CREATE INDEX CONCURRENTLY event_search_room_order
- ON event_search(room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)
- """
- )
- c.execute(
- """
- CREATE INDEX CONCURRENTLY event_search_order
- ON event_search(origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)
- """
- )
- conn.set_session(autocommit=False)
-
- await self.db_pool.runWithConnection(create_index)
-
- pg = dict(progress)
- pg["have_added_indexes"] = True
-
- await self.db_pool.runInteraction(
- self.EVENT_SEARCH_ORDER_UPDATE_NAME,
- self.db_pool.updates._background_update_progress_txn,
- self.EVENT_SEARCH_ORDER_UPDATE_NAME,
- pg,
- )
-
- def reindex_search_txn(txn: LoggingTransaction) -> Tuple[int, bool]:
- sql = """
- UPDATE event_search AS es
- SET stream_ordering = e.stream_ordering, origin_server_ts = e.origin_server_ts
- FROM events AS e
- WHERE e.event_id = es.event_id
- AND ? <= e.stream_ordering AND e.stream_ordering < ?
- RETURNING es.stream_ordering
- """
-
- min_stream_id = max_stream_id - batch_size
- txn.execute(sql, (min_stream_id, max_stream_id))
- rows = txn.fetchall()
-
- if min_stream_id < target_min_stream_id:
- # We've reached the end.
- return len(rows), False
-
- progress = {
- "target_min_stream_id_inclusive": target_min_stream_id,
- "max_stream_id_exclusive": min_stream_id,
- "rows_inserted": rows_inserted + len(rows),
- "have_added_indexes": True,
- }
-
- self.db_pool.updates._background_update_progress_txn(
- txn, self.EVENT_SEARCH_ORDER_UPDATE_NAME, progress
- )
-
- return len(rows), True
-
- num_rows, finished = await self.db_pool.runInteraction(
- self.EVENT_SEARCH_ORDER_UPDATE_NAME, reindex_search_txn
- )
-
- if not finished:
- await self.db_pool.updates._end_background_update(
- self.EVENT_SEARCH_ORDER_UPDATE_NAME
- )
-
- return num_rows
-
- async def _background_delete_non_strings(
- self, progress: JsonDict, batch_size: int
- ) -> int:
- """Deletes rows with non-string `value`s from `event_search` if using sqlite.
-
- Prior to Synapse 1.44.0, malformed events received over federation could cause integers
- to be inserted into the `event_search` table when using sqlite.
- """
-
- def delete_non_strings_txn(txn: LoggingTransaction) -> None:
- txn.execute("DELETE FROM event_search WHERE typeof(value) != 'text'")
-
- await self.db_pool.runInteraction(
- self.EVENT_SEARCH_DELETE_NON_STRINGS, delete_non_strings_txn
- )
-
- await self.db_pool.updates._end_background_update(
- self.EVENT_SEARCH_DELETE_NON_STRINGS
- )
- return 1
-
-
- class SearchStore(SearchBackgroundUpdateStore):
- def __init__(
- self,
- database: DatabasePool,
- db_conn: LoggingDatabaseConnection,
- hs: "HomeServer",
- ):
- super().__init__(database, db_conn, hs)
-
- async def search_msgs(
- self, room_ids: Collection[str], search_term: str, keys: Iterable[str]
- ) -> JsonDict:
- """Performs a full text search over events with given keys.
-
- Args:
- room_ids: List of room ids to search in
- search_term: Search term to search for
- keys: List of keys to search in, currently supports
- "content.body", "content.name", "content.topic"
-
- Returns:
- Dictionary of results
- """
- clauses = []
-
- args: List[Any] = []
-
- # Make sure we don't explode because the person is in too many rooms.
- # We filter the results below regardless.
- if len(room_ids) < 500:
- clause, args = make_in_list_sql_clause(
- self.database_engine, "room_id", room_ids
- )
- clauses = [clause]
-
- local_clauses = []
- for key in keys:
- local_clauses.append("key = ?")
- args.append(key)
-
- clauses.append("(%s)" % (" OR ".join(local_clauses),))
-
- count_args = args
- count_clauses = clauses
-
- if isinstance(self.database_engine, PostgresEngine):
- search_query = search_term
- sql = """
- SELECT ts_rank_cd(vector, websearch_to_tsquery('english', ?)) AS rank,
- room_id, event_id
- FROM event_search
- WHERE vector @@ websearch_to_tsquery('english', ?)
- """
- args = [search_query, search_query] + args
-
- count_sql = """
- SELECT room_id, count(*) as count FROM event_search
- WHERE vector @@ websearch_to_tsquery('english', ?)
- """
- count_args = [search_query] + count_args
- elif isinstance(self.database_engine, Sqlite3Engine):
- search_query = _parse_query_for_sqlite(search_term)
-
- sql = """
- SELECT rank(matchinfo(event_search)) as rank, room_id, event_id
- FROM event_search
- WHERE value MATCH ?
- """
- args = [search_query] + args
-
- count_sql = """
- SELECT room_id, count(*) as count FROM event_search
- WHERE value MATCH ?
- """
- count_args = [search_query] + count_args
- else:
- # This should be unreachable.
- raise Exception("Unrecognized database engine")
-
- for clause in clauses:
- sql += " AND " + clause
-
- for clause in count_clauses:
- count_sql += " AND " + clause
-
- # We add an arbitrary limit here to ensure we don't try to pull the
- # entire table from the database.
- sql += " ORDER BY rank DESC LIMIT 500"
-
- results = await self.db_pool.execute(
- "search_msgs", self.db_pool.cursor_to_dict, sql, *args
- )
-
- results = list(filter(lambda row: row["room_id"] in room_ids, results))
-
- # We set redact_behaviour to block here to prevent redacted events being returned in
- # search results (which is a data leak)
- events = await self.get_events_as_list( # type: ignore[attr-defined]
- [r["event_id"] for r in results],
- redact_behaviour=EventRedactBehaviour.block,
- )
-
- event_map = {ev.event_id: ev for ev in events}
-
- highlights = None
- if isinstance(self.database_engine, PostgresEngine):
- highlights = await self._find_highlights_in_postgres(search_query, events)
-
- count_sql += " GROUP BY room_id"
-
- count_results = await self.db_pool.execute(
- "search_rooms_count", self.db_pool.cursor_to_dict, count_sql, *count_args
- )
-
- count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)
- return {
- "results": [
- {"event": event_map[r["event_id"]], "rank": r["rank"]}
- for r in results
- if r["event_id"] in event_map
- ],
- "highlights": highlights,
- "count": count,
- }
-
- async def search_rooms(
- self,
- room_ids: Collection[str],
- search_term: str,
- keys: Iterable[str],
- limit: int,
- pagination_token: Optional[str] = None,
- ) -> JsonDict:
- """Performs a full text search over events with given keys.
-
- Args:
- room_ids: The room_ids to search in
- search_term: Search term to search for
- keys: List of keys to search in, currently supports "content.body",
- "content.name", "content.topic"
- pagination_token: A pagination token previously returned
-
- Returns:
- Each match as a dictionary.
- """
- clauses = []
- args: List[Any] = []
-
- # Make sure we don't explode because the person is in too many rooms.
- # We filter the results below regardless.
- if len(room_ids) < 500:
- clause, args = make_in_list_sql_clause(
- self.database_engine, "room_id", room_ids
- )
- clauses = [clause]
-
- local_clauses = []
- for key in keys:
- local_clauses.append("key = ?")
- args.append(key)
-
- clauses.append("(%s)" % (" OR ".join(local_clauses),))
-
- # take copies of the current args and clauses lists, before adding
- # pagination clauses to main query.
- count_args = list(args)
- count_clauses = list(clauses)
-
- if pagination_token:
- try:
- origin_server_ts_str, stream_str = pagination_token.split(",")
- origin_server_ts = int(origin_server_ts_str)
- stream = int(stream_str)
- except Exception:
- raise SynapseError(400, "Invalid pagination token")
-
- clauses.append(
- """
- (origin_server_ts < ? OR (origin_server_ts = ? AND stream_ordering < ?))
- """
- )
- args.extend([origin_server_ts, origin_server_ts, stream])
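-
- # For example (illustrative): a token of "1670000000000,54321" resumes the
- # search at results strictly before origin_server_ts 1670000000000 and, for
- # ties, stream_ordering 54321, matching the clause above.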
-
- if isinstance(self.database_engine, PostgresEngine):
- search_query = search_term
- sql = """
- SELECT ts_rank_cd(vector, websearch_to_tsquery('english', ?)) as rank,
- origin_server_ts, stream_ordering, room_id, event_id
- FROM event_search
- WHERE vector @@ websearch_to_tsquery('english', ?) AND
- """
- args = [search_query, search_query] + args
-
- count_sql = """
- SELECT room_id, count(*) as count FROM event_search
- WHERE vector @@ websearch_to_tsquery('english', ?) AND
- """
- count_args = [search_query] + count_args
- elif isinstance(self.database_engine, Sqlite3Engine):
- # We use CROSS JOIN here to ensure we use the right indexes.
- # https://sqlite.org/optoverview.html#crossjoin
- #
- # We want to use the full text search index on event_search to
- # extract all possible matches first, then lookup those matches
- # in the events table to get the topological ordering. We need
- # to use the indexes in this order because sqlite refuses to
- # MATCH unless it uses the full text search index
- sql = """
- SELECT
- rank(matchinfo) as rank, room_id, event_id, origin_server_ts, stream_ordering
- FROM (
- SELECT key, event_id, matchinfo(event_search) as matchinfo
- FROM event_search
- WHERE value MATCH ?
- )
- CROSS JOIN events USING (event_id)
- WHERE
- """
- search_query = _parse_query_for_sqlite(search_term)
- args = [search_query] + args
-
- count_sql = """
- SELECT room_id, count(*) as count FROM event_search
- WHERE value MATCH ? AND
- """
- count_args = [search_query] + count_args
- else:
- # This should be unreachable.
- raise Exception("Unrecognized database engine")
-
- sql += " AND ".join(clauses)
- count_sql += " AND ".join(count_clauses)
-
- # We add an arbitrary limit here to ensure we don't try to pull the
- # entire table from the database.
- if isinstance(self.database_engine, PostgresEngine):
- sql += """
- ORDER BY origin_server_ts DESC NULLS LAST, stream_ordering DESC NULLS LAST
- LIMIT ?
- """
- elif isinstance(self.database_engine, Sqlite3Engine):
- sql += " ORDER BY origin_server_ts DESC, stream_ordering DESC LIMIT ?"
- else:
- raise Exception("Unrecognized database engine")
-
- # mypy expects to append only a `str`, not an `int`
- args.append(limit)
-
- results = await self.db_pool.execute(
- "search_rooms", self.db_pool.cursor_to_dict, sql, *args
- )
-
- results = list(filter(lambda row: row["room_id"] in room_ids, results))
-
- # We set redact_behaviour to block here to prevent redacted events being returned in
- # search results (which is a data leak)
- events = await self.get_events_as_list( # type: ignore[attr-defined]
- [r["event_id"] for r in results],
- redact_behaviour=EventRedactBehaviour.block,
- )
-
- event_map = {ev.event_id: ev for ev in events}
-
- highlights = None
- if isinstance(self.database_engine, PostgresEngine):
- highlights = await self._find_highlights_in_postgres(search_query, events)
-
- count_sql += " GROUP BY room_id"
-
- count_results = await self.db_pool.execute(
- "search_rooms_count", self.db_pool.cursor_to_dict, count_sql, *count_args
- )
-
- count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)
-
- return {
- "results": [
- {
- "event": event_map[r["event_id"]],
- "rank": r["rank"],
- "pagination_token": "%s,%s"
- % (r["origin_server_ts"], r["stream_ordering"]),
- }
- for r in results
- if r["event_id"] in event_map
- ],
- "highlights": highlights,
- "count": count,
- }
-
- async def _find_highlights_in_postgres(
- self, search_query: str, events: List[EventBase]
- ) -> Set[str]:
- """Given a list of events and a search term, return a list of words
- that match from the content of the event.
-
- This is used to give a list of words that clients can match against to
- highlight the matching parts.
-
- Args:
- search_query: The search term.
- events: A list of events
-
- Returns:
- A set of strings.
- """
-
- def f(txn: LoggingTransaction) -> Set[str]:
- highlight_words = set()
- for event in events:
- # As a hack we simply join values of all possible keys. This is
- # fine since we're only using them to find possible highlights.
- values = []
- for key in ("body", "name", "topic"):
- v = event.content.get(key, None)
- if v:
- v = _clean_value_for_search(v)
- values.append(v)
-
- if not values:
- continue
-
- value = " ".join(values)
-
- # We need to find some values for StartSel and StopSel that
- # aren't in the value so that we can pick results out.
- start_sel = "<"
- stop_sel = ">"
-
- while start_sel in value:
- start_sel += "<"
- while stop_sel in value:
- stop_sel += ">"
-
- query = (
- "SELECT ts_headline(?, websearch_to_tsquery('english', ?), %s)"
- % (
- _to_postgres_options(
- {
- "StartSel": start_sel,
- "StopSel": stop_sel,
- "MaxFragments": "50",
- }
- )
- )
- )
- txn.execute(query, (value, search_query))
- (headline,) = txn.fetchall()[0]
-
- # Now we need to pick the possible highlights out of the headline
- # result.
- matcher_regex = "%s(.*?)%s" % (
- re.escape(start_sel),
- re.escape(stop_sel),
- )
-
- res = re.findall(matcher_regex, headline)
- highlight_words.update([r.lower() for r in res])
-
- return highlight_words
-
- return await self.db_pool.runInteraction("_find_highlights", f)
-
-
- def _to_postgres_options(options_dict: JsonDict) -> str:
- return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),)
-
-
- @dataclass
- class Phrase:
- phrase: List[str]
-
-
- class SearchToken(enum.Enum):
- Not = enum.auto()
- Or = enum.auto()
- And = enum.auto()
-
-
- Token = Union[str, Phrase, SearchToken]
- TokenList = List[Token]
-
-
- def _is_stop_word(word: str) -> bool:
- # TODO Pull these out of the dictionary:
- # https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop
- return word in {"the", "a", "you", "me", "and", "but"}
-
-
- def _tokenize_query(query: str) -> TokenList:
- """
- Convert the user-supplied `query` into a TokenList, which can be translated into
- some DB-specific syntax.
-
- The following constructs are supported:
-
- - phrase queries using "double quotes"
- - case-insensitive `or` and `and` operators
- - negation of a keyword via a unary hyphen (NOT), e.g. 'include -exclude'
-
- The following differs from websearch_to_tsquery:
-
- - Stop words are not removed.
- - Unclosed phrases are treated differently.
-
- """
- tokens: TokenList = []
-
- # Find phrases.
- in_phrase = False
- parts = deque(query.split('"'))
- for i, part in enumerate(parts):
- # The contents inside double quotes are treated as a phrase.
- in_phrase = bool(i % 2)
-
- # Pull out the individual words, discarding any non-word characters.
- words = deque(re.findall(r"([\w\-]+)", part, re.UNICODE))
-
- # Phrases have simplified handling of words.
- if in_phrase:
- # Skip stop words.
- phrase = [word for word in words if not _is_stop_word(word)]
-
- # Consecutive words are implicitly ANDed together.
- if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or):
- tokens.append(SearchToken.And)
-
- # Add the phrase.
- tokens.append(Phrase(phrase))
- continue
-
- # Otherwise, not in a phrase.
- while words:
- word = words.popleft()
-
- if word.startswith("-"):
- tokens.append(SearchToken.Not)
-
- # If there's more of the word left, put it back to be processed again.
- word = word[1:]
- if word:
- words.appendleft(word)
- elif word.lower() == "or":
- tokens.append(SearchToken.Or)
- else:
- # Skip stop words.
- if _is_stop_word(word):
- continue
-
- # Consecutive words are implicitly ANDed together.
- if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or):
- tokens.append(SearchToken.And)
-
- # Add the search term.
- tokens.append(word)
-
- return tokens
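-
- # A worked example of the tokenizer (illustrative):
- #
- # _tokenize_query('the quick "brown fox" -jumps or leaps')
- # == ['quick', SearchToken.And, Phrase(phrase=['brown', 'fox']),
- # SearchToken.Not, 'jumps', SearchToken.Or, 'leaps']
- # ("the" is dropped as a stop word; consecutive terms are implicitly ANDed.)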
-
-
- def _tokens_to_sqlite_match_query(tokens: TokenList) -> str:
- """
- Convert the list of tokens to a string suitable for passing to sqlite's MATCH.
- Assume sqlite was compiled with enhanced query syntax.
-
- Ref: https://www.sqlite.org/fts3.html#full_text_index_queries
- """
- match_query = []
- for token in tokens:
- if isinstance(token, str):
- match_query.append(token)
- elif isinstance(token, Phrase):
- match_query.append('"' + " ".join(token.phrase) + '"')
- elif token == SearchToken.Not:
- # TODO: SQLite treats NOT as a *binary* operator. Hopefully a search
- # term has already been added before this.
- match_query.append(" NOT ")
- elif token == SearchToken.Or:
- match_query.append(" OR ")
- elif token == SearchToken.And:
- match_query.append(" AND ")
- else:
- raise ValueError(f"unknown token {token}")
-
- return "".join(match_query)
-
-
- def _parse_query_for_sqlite(search_term: str) -> str:
- """Takes a plain unicode string from the user and converts it into a form
- that can be passed to sqllite's matchinfo().
- """
- return _tokens_to_sqlite_match_query(_tokenize_query(search_term))
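-
-
- # End-to-end illustration (assumes the enhanced FTS query syntax):
- #
- # _parse_query_for_sqlite('the quick "brown fox" -jumps or leaps')
- # == 'quick AND "brown fox" NOT jumps OR leaps'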