Skip to content

Commit

Permalink
feat: Case-insensitive search for non-ASCII messages (#5052)
Browse files Browse the repository at this point in the history
SQLite search with `LIKE` is case-insensitive only for ASCII chars. To make it case-insensitive for
all messages, create a new column `msgs.txt_normalized` defaulting to `NULL` (so we do not bump up
the database size in a migration) and storing lowercased/normalized text there when the row is
created/updated. When doing a search, search over `IFNULL(txt_normalized, txt)`.
  • Loading branch information
iequidoo committed Jun 17, 2024
1 parent a5d14b3 commit f6f4ccc
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 18 deletions.
27 changes: 18 additions & 9 deletions src/chat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -927,12 +927,13 @@ impl ChatId {
.sql
.execute(
"UPDATE msgs
SET timestamp=?,type=?,txt=?, param=?,mime_in_reply_to=?
SET timestamp=?,type=?,txt=?,txt_normalized=?,param=?,mime_in_reply_to=?
WHERE id=?;",
(
time(),
msg.viewtype,
&msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(),
msg.in_reply_to.as_deref().unwrap_or_default(),
msg.id,
Expand All @@ -956,17 +957,19 @@ impl ChatId {
type,
state,
txt,
txt_normalized,
param,
hidden,
mime_in_reply_to)
VALUES (?,?,?, ?,?,?,?,?,?);",
VALUES (?,?,?,?,?,?,?,?,?,?);",
(
self,
ContactId::SELF,
time(),
msg.viewtype,
MessageState::OutDraft,
&msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(),
1,
msg.in_reply_to.as_deref().unwrap_or_default(),
Expand Down Expand Up @@ -2075,7 +2078,7 @@ impl Chat {
.execute(
"UPDATE msgs
SET rfc724_mid=?, chat_id=?, from_id=?, to_id=?, timestamp=?, type=?,
state=?, txt=?, subject=?, param=?,
state=?, txt=?, txt_normalized=?, subject=?, param=?,
hidden=?, mime_in_reply_to=?, mime_references=?, mime_modified=?,
mime_headers=?, mime_compressed=1, location_id=?, ephemeral_timer=?,
ephemeral_timestamp=?
Expand All @@ -2089,6 +2092,7 @@ impl Chat {
msg.viewtype,
msg.state,
msg.text,
message::normalize_text(&msg.text),
&msg.subject,
msg.param.to_string(),
msg.hidden,
Expand Down Expand Up @@ -2117,6 +2121,7 @@ impl Chat {
type,
state,
txt,
txt_normalized,
subject,
param,
hidden,
Expand All @@ -2128,7 +2133,7 @@ impl Chat {
location_id,
ephemeral_timer,
ephemeral_timestamp)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1,?,?,?);",
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1,?,?,?);",
params_slice![
msg.rfc724_mid,
msg.chat_id,
Expand All @@ -2138,6 +2143,7 @@ impl Chat {
msg.viewtype,
msg.state,
msg.text,
message::normalize_text(&msg.text),
&msg.subject,
msg.param.to_string(),
msg.hidden,
Expand Down Expand Up @@ -4370,9 +4376,10 @@ pub async fn add_device_msg_with_importance(
timestamp_rcvd,
type,state,
txt,
txt_normalized,
param,
rfc724_mid)
VALUES (?,?,?,?,?,?,?,?,?,?,?);",
VALUES (?,?,?,?,?,?,?,?,?,?,?,?);",
(
chat_id,
ContactId::DEVICE,
Expand All @@ -4383,6 +4390,7 @@ pub async fn add_device_msg_with_importance(
msg.viewtype,
state,
&msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(),
rfc724_mid,
),
Expand Down Expand Up @@ -4486,8 +4494,8 @@ pub(crate) async fn add_info_msg_with_cmd(

let row_id =
context.sql.insert(
"INSERT INTO msgs (chat_id,from_id,to_id,timestamp,timestamp_sent,timestamp_rcvd,type,state,txt,rfc724_mid,ephemeral_timer, param,mime_in_reply_to)
VALUES (?,?,?, ?,?,?,?,?, ?,?,?, ?,?);",
"INSERT INTO msgs (chat_id,from_id,to_id,timestamp,timestamp_sent,timestamp_rcvd,type,state,txt,txt_normalized,rfc724_mid,ephemeral_timer,param,mime_in_reply_to)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?);",
(
chat_id,
from_id.unwrap_or(ContactId::INFO),
Expand All @@ -4498,6 +4506,7 @@ pub(crate) async fn add_info_msg_with_cmd(
Viewtype::Text,
MessageState::InNoticed,
text,
message::normalize_text(text),
rfc724_mid,
ephemeral_timer,
param.to_string(),
Expand Down Expand Up @@ -4542,8 +4551,8 @@ pub(crate) async fn update_msg_text_and_timestamp(
context
.sql
.execute(
"UPDATE msgs SET txt=?, timestamp=? WHERE id=?;",
(text, timestamp, msg_id),
"UPDATE msgs SET txt=?, txt_normalized=?, timestamp=? WHERE id=?;",
(text, message::normalize_text(text), timestamp, msg_id),
)
.await?;
context.emit_msgs_changed(chat_id, msg_id);
Expand Down
16 changes: 12 additions & 4 deletions src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1259,12 +1259,12 @@ impl Context {
Ok(list)
}

/// Searches for messages containing the query string.
/// Searches for messages containing the query string case-insensitively.
///
/// If `chat_id` is provided this searches only for messages in this chat, if `chat_id`
/// is `None` this searches messages from all chats.
pub async fn search_msgs(&self, chat_id: Option<ChatId>, query: &str) -> Result<Vec<MsgId>> {
let real_query = query.trim();
let real_query = query.trim().to_lowercase();
if real_query.is_empty() {
return Ok(Vec::new());
}
Expand All @@ -1280,7 +1280,7 @@ impl Context {
WHERE m.chat_id=?
AND m.hidden=0
AND ct.blocked=0
AND txt LIKE ?
AND IFNULL(txt_normalized, txt) LIKE ?
ORDER BY m.timestamp,m.id;",
(chat_id, str_like_in_text),
|row| row.get::<_, MsgId>("id"),
Expand Down Expand Up @@ -1316,7 +1316,7 @@ impl Context {
AND m.hidden=0
AND c.blocked!=1
AND ct.blocked=0
AND m.txt LIKE ?
AND IFNULL(txt_normalized, txt) LIKE ?
ORDER BY m.id DESC LIMIT 1000",
(str_like_in_text,),
|row| row.get::<_, MsgId>("id"),
Expand Down Expand Up @@ -1721,6 +1721,8 @@ mod tests {
msg2.set_text("barbaz".to_string());
send_msg(&alice, chat.id, &mut msg2).await?;

alice.send_text(chat.id, "Δ-Chat").await;

// Global search with a part of text finds the message.
let res = alice.search_msgs(None, "ob").await?;
assert_eq!(res.len(), 1);
Expand All @@ -1733,6 +1735,12 @@ mod tests {
assert_eq!(res.first(), Some(&msg2.id));
assert_eq!(res.get(1), Some(&msg1.id));

// Search is case-insensitive.
for chat_id in [None, Some(chat.id)] {
let res = alice.search_msgs(chat_id, "δ-chat").await?;
assert_eq!(res.len(), 1);
}

// Global search with longer text does not find any message.
let res = alice.search_msgs(None, "foobarbaz").await?;
assert!(res.is_empty());
Expand Down
2 changes: 1 addition & 1 deletion src/ephemeral.rs
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ pub(crate) async fn delete_expired_messages(context: &Context, now: i64) -> Resu
for (msg_id, chat_id, viewtype, location_id) in rows {
transaction.execute(
"UPDATE msgs
SET chat_id=?, txt='', subject='', txt_raw='',
SET chat_id=?, txt='', txt_normalized=NULL, subject='', txt_raw='',
mime_headers='', from_id=0, to_id=0, param=''
WHERE id=?",
(DC_CHAT_ID_TRASH, msg_id),
Expand Down
11 changes: 10 additions & 1 deletion src/message.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ impl MsgId {
r#"
UPDATE msgs
SET
chat_id=?, txt='',
chat_id=?, txt='', txt_normalized=NULL,
subject='', txt_raw='',
mime_headers='',
from_id=0, to_id=0,
Expand Down Expand Up @@ -2072,6 +2072,15 @@ impl Viewtype {
}
}

/// Returns text for storing in the `msgs.txt_normalized` column (to make case-insensitive search
/// possible for non-ASCII messages).
pub(crate) fn normalize_text(text: &str) -> Option<String> {
if text.is_ascii() {
return None;
};
Some(text.to_lowercase()).filter(|t| t != text)
}

#[cfg(test)]
mod tests {
use num_traits::FromPrimitive;
Expand Down
8 changes: 5 additions & 3 deletions src/receive_imf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1540,7 +1540,7 @@ INSERT INTO msgs
rfc724_mid, chat_id,
from_id, to_id, timestamp, timestamp_sent,
timestamp_rcvd, type, state, msgrmsg,
txt, subject, txt_raw, param, hidden,
txt, txt_normalized, subject, txt_raw, param, hidden,
bytes, mime_headers, mime_compressed, mime_in_reply_to,
mime_references, mime_modified, error, ephemeral_timer,
ephemeral_timestamp, download_state, hop_info
Expand All @@ -1550,15 +1550,16 @@ INSERT INTO msgs
?, ?, ?, ?,
?, ?, ?, ?,
?, ?, ?, ?, ?,
?, ?, ?, ?, 1,
?, ?, ?, ?, ?, 1,
?, ?, ?, ?,
?, ?, ?, ?
)
ON CONFLICT (id) DO UPDATE
SET rfc724_mid=excluded.rfc724_mid, chat_id=excluded.chat_id,
from_id=excluded.from_id, to_id=excluded.to_id, timestamp_sent=excluded.timestamp_sent,
type=excluded.type, msgrmsg=excluded.msgrmsg,
txt=excluded.txt, subject=excluded.subject, txt_raw=excluded.txt_raw, param=excluded.param,
txt=excluded.txt, txt_normalized=excluded.txt_normalized, subject=excluded.subject,
txt_raw=excluded.txt_raw, param=excluded.param,
hidden=excluded.hidden,bytes=excluded.bytes, mime_headers=excluded.mime_headers,
mime_compressed=excluded.mime_compressed, mime_in_reply_to=excluded.mime_in_reply_to,
mime_references=excluded.mime_references, mime_modified=excluded.mime_modified, error=excluded.error, ephemeral_timer=excluded.ephemeral_timer,
Expand All @@ -1578,6 +1579,7 @@ RETURNING id
state,
is_dc_message,
if trash { "" } else { msg },
if trash { None } else { message::normalize_text(msg) },
if trash { "" } else { &subject },
// txt_raw might contain invalid utf8
if trash { "" } else { &txt_raw },
Expand Down
5 changes: 5 additions & 0 deletions src/sql/migrations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,11 @@ CREATE INDEX msgs_status_updates_index2 ON msgs_status_updates (uid);
.await?;
}

if dbversion < 115 {
sql.execute_migration("ALTER TABLE msgs ADD COLUMN txt_normalized TEXT", 115)
.await?;
}

let new_version = sql
.get_raw_config_int(VERSION_CFG)
.await?
Expand Down

0 comments on commit f6f4ccc

Please sign in to comment.