Line data Source code
1 : /**
2 : Copyright (c) 2017-2022 Roman Katuntsev <sbkarr@stappler.org>
3 : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
4 :
5 : Permission is hereby granted, free of charge, to any person obtaining a copy
6 : of this software and associated documentation files (the "Software"), to deal
7 : in the Software without restriction, including without limitation the rights
8 : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 : copies of the Software, and to permit persons to whom the Software is
10 : furnished to do so, subject to the following conditions:
11 :
12 : The above copyright notice and this permission notice shall be included in
13 : all copies or substantial portions of the Software.
14 :
15 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 : THE SOFTWARE.
22 : **/
23 :
24 : #include "SPSearchIndex.h"
25 : #include "SPString.h"
26 :
27 : namespace STAPPLER_VERSIONIZED stappler::search {
28 :
29 25 : bool SearchIndex::init(const TokenizerCallback &tcb) {
30 25 : _tokenizer = tcb;
31 25 : return true;
32 : }
33 :
34 25 : void SearchIndex::reserve(size_t s) {
35 25 : _nodes.reserve(s);
36 25 : }
37 :
38 100 : void SearchIndex::add(const StringView &v, int64_t id, int64_t tag) {
39 100 : String origin(string::tolower<Interface>(v));
40 :
41 100 : _nodes.emplace_back(Node{id, tag});
42 :
43 100 : Node &node = _nodes.back();
44 100 : uint32_t idx = uint32_t(_nodes.size() - 1);
45 100 : auto &canonical = node.canonical;
46 :
47 41250 : auto tokenFn = [&, this] (const StringView &str) {
48 13750 : if (!str.empty()) {
49 13750 : if (!canonical.empty()) {
50 13650 : canonical.append(" ");
51 : }
52 13750 : auto s = canonical.size();
53 13750 : canonical.append(str.str<Interface>());
54 13750 : onToken(_tokens, str, idx, Slice{ uint16_t(s), uint16_t(str.size()) });
55 : }
56 13750 : };
57 :
58 100 : if (_tokenizer) {
59 100 : _tokenizer(origin, tokenFn, SearchNode);
60 : } else {
61 0 : StringView r(origin);
62 0 : r.split<DefaultSep>(tokenFn);
63 : }
64 :
65 100 : if (canonical.empty()) {
66 0 : _nodes.pop_back();
67 100 : } else if (canonical != origin) {
68 100 : node.alignment = Distance(origin, canonical);
69 : }
70 100 : }
71 :
72 75 : SearchIndex::Result SearchIndex::performSearch(const StringView &v, size_t minMatch, const HeuristicCallback &cb,
73 : const FilterCallback & filter) {
74 75 : String origin(string::tolower<Interface>(v));
75 :
76 75 : SearchIndex::Result res{this};
77 :
78 75 : uint32_t wordIndex = 0;
79 :
80 5600 : auto tokenFn = [&, this] (const StringView &str) {
81 : //std::cout << "Token: " << str << "\n";
82 100 : auto lb = std::lower_bound(_tokens.begin(), _tokens.end(), str, [&, this] (const Token &l, const StringView &r) {
83 900 : return string::compare_c(makeStringView(l.index, l.slice), r) < 0;
84 200 : });
85 :
86 100 : if (lb != _tokens.end()) {
87 100 : auto node = &_nodes.at(lb->index);
88 100 : StringView value = makeStringView(*lb);
89 : //std::cout << "Found: '" << value << "' from '" << _nodes.at(lb->index).canonical << "'\n";
90 :
91 1350 : while (lb != _tokens.end() && value.size() >= str.size() && String::traits_type::compare(value.data(), str.data(), str.size()) == 0) {
92 1250 : if (!filter || filter(node)) {
93 1250 : auto ret_it = std::lower_bound(res.nodes.begin(), res.nodes.end(), node,
94 2225 : [&] (const ResultNode &l, const Node *r) {
95 2225 : return l.node < r;
96 1250 : });
97 1250 : if (ret_it == res.nodes.end() || ret_it->node != node) {
98 100 : res.nodes.emplace(ret_it, ResultNode{ 0.0f, node, {ResultToken{wordIndex, uint16_t(str.size()), lb->slice}} });
99 : } else {
100 1150 : ret_it->matches.emplace_back(ResultToken{wordIndex, uint16_t(str.size()), lb->slice});
101 : }
102 : }
103 1250 : ++ lb;
104 1250 : if (lb != _tokens.end()) {
105 1250 : value = makeStringView(*lb);
106 1250 : node = &_nodes.at(lb->index);
107 : //std::cout << "Next: '" << value << "' from '" << _nodes.at(lb->index).canonical << "'\n";
108 : }
109 : }
110 : }
111 100 : wordIndex ++;
112 100 : };
113 :
114 75 : if (_tokenizer) {
115 75 : _tokenizer(origin, tokenFn, SearchRequest);
116 : } else {
117 0 : StringView r(origin);
118 0 : r.split<DefaultSep>(tokenFn);
119 : }
120 :
121 75 : if (cb) {
122 175 : for (auto &it : res.nodes) {
123 100 : it.score = cb(*this, it);
124 : }
125 :
126 75 : std::sort(res.nodes.begin(), res.nodes.end(), [] (const ResultNode &l, const ResultNode &r) {
127 50 : return l.score > r.score;
128 : });
129 : }
130 :
131 150 : return res;
132 75 : }
133 :
134 2500 : StringView SearchIndex::resolveToken(const Node &node, const ResultToken &token) const {
135 2500 : return StringView(node.canonical.data() + token.slice.start, token.match);
136 : }
137 :
138 1250 : SearchIndex::Slice SearchIndex::convertToken(const Node &node, const ResultToken &ret) const {
139 1250 : if (node.alignment.empty()) {
140 0 : return Slice{ret.slice.start, ret.match};
141 : } else {
142 1250 : auto start = ret.slice.start + node.alignment.diff_original(ret.slice.start);
143 1250 : auto end = ret.slice.start + ret.match;
144 1250 : end += node.alignment.diff_original(end, true);
145 1250 : return Slice{uint16_t(start), uint16_t(end - start)};
146 : }
147 : }
148 :
149 25 : void SearchIndex::print(const Callback<void(StringView)> &out) const {
150 13775 : for (auto &it : _tokens) {
151 13750 : out << it.index << " " << makeStringView(it) << " " << _nodes.at(it.index).id << "\n";
152 : }
153 25 : }
154 :
155 15100 : StringView SearchIndex::makeStringView(const Token &t) const {
156 15100 : return makeStringView(t.index, t.slice);
157 : }
158 :
159 121975 : StringView SearchIndex::makeStringView(uint32_t idx, const Slice &sl) const {
160 121975 : const Node &node = _nodes.at(idx);
161 121975 : return StringView(node.canonical.data() + sl.start, sl.size);
162 : }
163 :
164 13750 : void SearchIndex::onToken(Vector<Token> &vec, const StringView &rep, uint32_t idx, const Slice &sl) {
165 13750 : auto insert_it = std::lower_bound(vec.begin(), vec.end(), rep, [&, this] (const Token &l, const StringView &r) {
166 105975 : return string::compare_c(makeStringView(l.index, l.slice), r) < 0;
167 27500 : });
168 13750 : if (insert_it == vec.end()) {
169 125 : vec.emplace_back(Token{idx, sl});
170 : } else {
171 13625 : vec.emplace(insert_it, Token{idx, sl});
172 : }
173 13750 : }
174 :
175 100 : float SearchIndex::Heuristic::operator () (const SearchIndex &index, const SearchIndex::ResultNode &node) {
176 100 : float score = 0.0f;
177 100 : uint32_t idx = maxOf<uint32_t>();
178 :
179 100 : Vector<StringView> matches;
180 :
181 100 : float mod = tagScore(node.node->tag);
182 :
183 100 : float fullMatchCost = fullMatchScore / float(node.matches.size());
184 1350 : for (const SearchIndex::ResultToken &token_it : node.matches) {
185 1250 : if (excludeEqualMatches) {
186 1250 : auto t = index.resolveToken(*node.node, token_it);
187 1250 : if (std::find(matches.begin(), matches.end(), t) == matches.end()) {
188 150 : matches.push_back(t);
189 : } else {
190 1100 : continue;
191 : }
192 : }
193 :
194 150 : if (token_it.match == token_it.slice.size) {
195 150 : score += mod * fullMatchCost;
196 : }
197 :
198 150 : score += mod * wordScore(token_it.match, token_it.slice.size);
199 150 : score += mod * positionScore(idx, token_it.word);
200 150 : idx = token_it.word;
201 : }
202 100 : return score;
203 100 : }
204 :
205 : }
|