Merge pull request #16 from clj-codes/feat/even-better-indexes
fix: consider full symbol as index
rafaeldelboni authored Feb 27, 2024
2 parents d9c61d6 + d301805 commit ea45b49
Showing 4 changed files with 57 additions and 18 deletions.
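
Why the change matters, as a rough sketch (the Datalevin calls below are the same ones used in the diff; the token tuples in the comments are assumed shapes, not recorded output): symbols made entirely of delimiter characters, such as ->, ->>, . and /, yield no tokens under the regexp tokenizer alone, so they could not be found by full-text search. Merging in a tokenizer that emits the whole input string as a single token makes the full symbol indexable.

(require '[datalevin.interpret :refer [inter-fn]]
         '[datalevin.search-utils :as su])

;; Tokenizer that keeps the whole input as one [term position offset] token.
(def whole-symbol-tokenizer (inter-fn [s] [[s 0 0]]))

;; The regexp tokenizer used throughout the diff; it splits on delimiter characters.
(def split-tokenizer
  (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))

(whole-symbol-tokenizer "->")              ;; => [["->" 0 0]]
(sequence (split-tokenizer "->"))          ;; likely empty: the symbol is all delimiters
(sequence (split-tokenizer "as->banana"))  ;; likely tokens "as" and "banana", never "as->banana"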
1 change: 1 addition & 0 deletions .clj-kondo/config.edn
@@ -0,0 +1 @@
{:lint-as {datalevin.interpret/inter-fn clojure.core/fn}}
3 changes: 2 additions & 1 deletion .lsp/config.edn
@@ -5,4 +5,5 @@
flow [[:block 1]]
flow-with-defaults [[:block 1]]
flow-as-of [[:block 1]]
flow-without-validation [[:block 1]]}}}
flow-without-validation [[:block 1]]
inter-fn [[:inner 0] [:inner 1]]}}}
54 changes: 39 additions & 15 deletions dev/playground.clj
Expand Up @@ -4,6 +4,7 @@
[codes.clj.docs.extractor.core :as core]
[codes.clj.docs.extractor.datalevin :as datalevin]
[datalevin.core :as d]
[datalevin.interpret :refer [inter-fn]]
[datalevin.search-utils :as su]
[datalevin.util :as util])
(:import [java.io File]))
@@ -58,7 +59,7 @@
db (d/db conn)

datoms (->> (d/fulltext-datoms db
"ass"
"."
{:top 30
:domains ["definition-name"
"namespace-name"
@@ -143,15 +144,19 @@

; tests with fulltext and analyzer
(let [query-analyzer (su/create-analyzer
{:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
{:tokenizer (datalevin/merge-tokenizers
(inter-fn [s] [[s 0 0]])
(su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
:token-filters [su/lower-case-token-filter]})

analyzer (su/create-analyzer
{:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
{:tokenizer (datalevin/merge-tokenizers
(inter-fn [s] [[s 0 0]])
(su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
:token-filters [su/lower-case-token-filter
su/prefix-token-filter]})

dir "/tmp/mydb"
dir (str "/tmp/mydb-" (random-uuid))
conn (d/create-conn dir
{:text {:db/valueType :db.type/string
:db/fulltext true
@@ -169,35 +174,46 @@
{:text "associative?"}
{:text "b"}
{:text "ba"}
{:text "bas"}]
{:text "bas"}
{:text "*"}
{:text "/"}
{:text "->"}
{:text "->>"}
{:text "as->"}
{:text "."}
{:text "as->banana"}]

_transact (d/transact! conn data)

result (->> (d/q '[:find ?i
result (->> (d/q '[:find ?e ?v
:in $ ?q
:where
[(fulltext $ ?q {:top 20}) [[?e]]]
[?e :text ?i]]
[(fulltext $ ?q {:top 20}) [[?e ?a ?v]]]]
(d/db conn)
"assoc-me")
"as")
doall)]

(d/close conn)
(util/delete-files dir)

result)

; tests with fulltext and analyzer on a raw query
; tests with fulltext and analyzer on a raw query
(let [query-analyzer (su/create-analyzer
{:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
{:tokenizer (datalevin/merge-tokenizers
(inter-fn [s] [[s 0 0]])
(su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
:token-filters [su/lower-case-token-filter]})

analyzer (su/create-analyzer
{:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
{:tokenizer (datalevin/merge-tokenizers
(inter-fn [s] [[s 0 0]])
(su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
:token-filters [su/lower-case-token-filter
su/prefix-token-filter]})

lmdb (d/open-kv "/tmp/mydb")
dir (str "/tmp/lmdb-" (random-uuid))
lmdb (d/open-kv dir)

engine (d/new-search-engine lmdb {:query-analyzer query-analyzer
:analyzer analyzer
@@ -213,13 +229,21 @@
7 "associative?"
8 "b"
9 "ba"
10 "bas"}
10 "bas"
11 "->"
12 "->>"
13 "as->"
14 "as->banana"
15 "/"
16 "*"
17 "."}

_transact (doseq [[k v] input]
(d/add-doc engine k v))

result (doall (d/search engine "assoc-m" {:top 20 :display :texts}))]
result (doall (d/search engine "->" {:top 20 :display :texts}))]

(d/close-kv lmdb)
(util/delete-files dir)

result))
17 changes: 15 additions & 2 deletions src/codes/clj/docs/extractor/datalevin.clj
@@ -1,5 +1,6 @@
(ns codes.clj.docs.extractor.datalevin
(:require [datalevin.core :as d]
[datalevin.interpret :refer [inter-fn]]
[datalevin.search-utils :as su]))

;; TODO: add id :db.unique/identity and ref :db.type/ref
@@ -78,12 +79,24 @@
(def db-schemas
(merge project-schema namespace-schema definition-schema))

(defn merge-tokenizers
"Merges the results of tokenizer a and b into one sequence."
[tokenizer-a tokenizer-b]
(inter-fn [^String s]
(into (sequence (tokenizer-a s))
(sequence (tokenizer-b s)))))

(defn bulk-transact! [datoms config]
(let [query-analyzer (su/create-analyzer
{:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
{:tokenizer (merge-tokenizers
(inter-fn [s] [[s 0 0]])
(su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
:token-filters [su/lower-case-token-filter]})

analyzer (su/create-analyzer
{:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
{:tokenizer (merge-tokenizers
(inter-fn [s] [[s 0 0]])
(su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
:token-filters [su/lower-case-token-filter
su/prefix-token-filter]})
conn (-> config :db :dir
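
A note on the analyzer pair used above, with a small illustrative sketch (the output lists in the comments are assumptions about the filter's behavior, not recorded results): only the indexing analyzer applies su/prefix-token-filter; the query analyzer does not. Indexed terms are expanded into all of their prefixes while queries are matched as typed, so a short query such as "as" can prefix-match a whole indexed symbol like "as->".

(require '[datalevin.interpret :refer [inter-fn]]
         '[datalevin.search-utils :as su])

;; Whole-symbol tokenizer only, to keep the example small.
(def index-analyzer
  (su/create-analyzer
   {:tokenizer (inter-fn [s] [[s 0 0]])
    :token-filters [su/lower-case-token-filter
                    su/prefix-token-filter]}))

(def query-analyzer
  (su/create-analyzer
   {:tokenizer (inter-fn [s] [[s 0 0]])
    :token-filters [su/lower-case-token-filter]}))

(map first (index-analyzer "as->"))  ;; likely ("a" "as" "as-" "as->")
(map first (query-analyzer "as->"))  ;; likely ("as->")

Keeping the prefix expansion on the indexing side only presumably keeps queries cheap: the query contributes a single term as typed, and all the prefix variants live in the index.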
