Line data Source code
1 : /**
2 : Copyright (c) 2017-2022 Roman Katuntsev <sbkarr@stappler.org>
3 : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
4 :
5 : Permission is hereby granted, free of charge, to any person obtaining a copy
6 : of this software and associated documentation files (the "Software"), to deal
7 : in the Software without restriction, including without limitation the rights
8 : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 : copies of the Software, and to permit persons to whom the Software is
10 : furnished to do so, subject to the following conditions:
11 :
12 : The above copyright notice and this permission notice shall be included in
13 : all copies or substantial portions of the Software.
14 :
15 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 : THE SOFTWARE.
22 : **/
23 :
24 : #ifndef STAPPLER_SEARCH_SPSEARCHINDEX_H_
25 : #define STAPPLER_SEARCH_SPSEARCHINDEX_H_
26 :
27 : #include "SPRef.h"
28 : #include "SPSearchDistance.h"
29 :
30 : namespace STAPPLER_VERSIONIZED stappler::search {
31 :
32 : class SearchIndex : public Ref {
33 : public:
34 : using DefaultSep = StringView::Compose<
35 : StringView::CharGroup<CharGroupId::WhiteSpace>,
36 : StringView::Chars<'=', '/', '(', ')', '.', ',', '-', '\'', '"', ':', ';', '?', '!', '@', '#', '$', '%', '^', '*', '\\', '+', '[', ']'>
37 : >;
38 :
39 : struct Slice;
40 : struct Node;
41 : struct Token;
42 : struct ResultToken;
43 : struct ResultNode;
44 : struct Result;
45 :
46 : enum TokenType {
47 : SearchNode,
48 : SearchRequest
49 : };
50 :
51 : using TokenCallback = Function<void(const StringView &)>;
52 : using TokenizerCallback = Function<void(const StringView &, const TokenCallback &, TokenType)>;
53 : using HeuristicCallback = Function<float(const SearchIndex &index, const ResultNode &result)>;
54 : using FilterCallback = Function<bool(const Node *)>;
55 :
56 : struct Slice {
57 : uint16_t start = 0; // start position in node's canonical string
58 : uint16_t size = 0; // length in node's canonical string
59 : };
60 :
61 : struct Node {
62 : int64_t id = 0;
63 : int64_t tag = 0;
64 : String canonical;
65 : Distance alignment;
66 : };
67 :
68 : struct Token {
69 : uint32_t index = 0; // node index
70 : Slice slice; // slice from canonical
71 : };
72 :
73 : struct ResultToken {
74 : uint32_t word = 0; // node index
75 : uint16_t match = 0; // node index
76 : Slice slice; // slice from canonical
77 : };
78 :
79 : struct ResultNode {
80 : float score = 0.0f;
81 : const Node *node;
82 : Vector<ResultToken> matches;
83 : };
84 :
85 : struct Result {
86 : Rc<SearchIndex> ref;
87 : Vector<ResultNode> nodes;
88 : };
89 :
90 : struct Heuristic {
91 : using TagCallback = Function<float(int64_t)>;
92 : using SizeCallback = Function<float(uint32_t, uint32_t)>;
93 :
94 75 : Heuristic() { }
95 : Heuristic(const TagCallback &cb, bool exclude = true) : excludeEqualMatches(exclude), tagScore(cb) { }
96 :
97 : float operator ()(const SearchIndex &index, const SearchIndex::ResultNode &result);
98 :
99 : bool excludeEqualMatches = true;
100 :
101 : // for every full token match score is incremented to fullMatchScore / countOfMatches
102 : float fullMatchScore = 2.0f;
103 :
104 : // all token score is multiplied by score of it's node tag
105 100 : TagCallback tagScore = [] (int64_t tag) -> float {
106 100 : return 1.0f;
107 : };
108 :
109 : // token score is better for longer word
110 150 : SizeCallback wordScore = [] (uint32_t match, uint32_t size) -> float {
111 150 : return (1.0f + log2f(size)) * (float(match) / float(size));
112 : };
113 :
114 : // token score is better if tokens match sequentially
115 150 : SizeCallback positionScore = [] (uint32_t prev, uint32_t current) -> float {
116 150 : if (prev != maxOf<uint32_t>()) {
117 50 : return (prev + 1 == current)?1.0f:((prev + 2 == current)?0.5f:0.0f);
118 : }
119 100 : return 0.0f;
120 : };
121 : };
122 :
123 : bool init(const TokenizerCallback & = nullptr);
124 :
125 : void reserve(size_t);
126 : void add(const StringView &, int64_t id, int64_t tag);
127 :
128 : Result performSearch(const StringView &, size_t minMatch, const HeuristicCallback & = Heuristic(),
129 : const FilterCallback & filter = nullptr);
130 :
131 : StringView resolveToken(const Node &, const ResultToken &) const;
132 : Slice convertToken(const Node &, const ResultToken &) const;
133 :
134 : void print(const Callback<void(StringView)> &out) const;
135 :
136 : protected:
137 : StringView makeStringView(const Token &) const;
138 : StringView makeStringView(uint32_t idx, const Slice &) const;
139 :
140 : void onToken(Vector<Token> &vec, const StringView &, uint32_t, const Slice &);
141 :
142 : Vector<Node> _nodes;
143 : Vector<Token> _tokens;
144 : TokenizerCallback _tokenizer;
145 : };
146 :
147 : }
148 :
149 : #endif /* STAPPLER_SEARCH_SPSEARCHINDEX_H_ */
|