LCOV - code coverage report
Current view: top level - core/search - SPSearchIndex.cc (source / functions) Hit Total Coverage
Test: coverage.info Lines: 105 111 94.6 %
Date: 2024-05-12 00:16:13 Functions: 17 17 100.0 %

          Line data    Source code
       1             : /**
       2             : Copyright (c) 2017-2022 Roman Katuntsev <sbkarr@stappler.org>
       3             : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
       4             : 
       5             : Permission is hereby granted, free of charge, to any person obtaining a copy
       6             : of this software and associated documentation files (the "Software"), to deal
       7             : in the Software without restriction, including without limitation the rights
       8             : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
       9             : copies of the Software, and to permit persons to whom the Software is
      10             : furnished to do so, subject to the following conditions:
      11             : 
      12             : The above copyright notice and this permission notice shall be included in
      13             : all copies or substantial portions of the Software.
      14             : 
      15             : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      16             : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      17             : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      18             : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      19             : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
      20             : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
      21             : THE SOFTWARE.
      22             : **/
      23             : 
      24             : #include "SPSearchIndex.h"
      25             : #include "SPString.h"
      26             : 
      27             : namespace STAPPLER_VERSIONIZED stappler::search {
      28             : 
      29          25 : bool SearchIndex::init(const TokenizerCallback &tcb) {
      30          25 :         _tokenizer = tcb;
      31          25 :         return true;
      32             : }
      33             : 
      34          25 : void SearchIndex::reserve(size_t s) {
      35          25 :         _nodes.reserve(s);
      36          25 : }
      37             : 
      38         100 : void SearchIndex::add(const StringView &v, int64_t id, int64_t tag) {
      39         100 :         String origin(string::tolower<Interface>(v));
      40             : 
      41         100 :         _nodes.emplace_back(Node{id, tag});
      42             : 
      43         100 :         Node &node = _nodes.back();
      44         100 :         uint32_t idx = uint32_t(_nodes.size() - 1);
      45         100 :         auto &canonical = node.canonical;
      46             : 
      47       41250 :         auto tokenFn = [&, this] (const StringView &str) {
      48       13750 :                 if (!str.empty()) {
      49       13750 :                         if (!canonical.empty()) {
      50       13650 :                                 canonical.append(" ");
      51             :                         }
      52       13750 :                         auto s = canonical.size();
      53       13750 :                         canonical.append(str.str<Interface>());
      54       13750 :                         onToken(_tokens, str, idx, Slice{ uint16_t(s), uint16_t(str.size()) });
      55             :                 }
      56       13750 :         };
      57             : 
      58         100 :         if (_tokenizer) {
      59         100 :                 _tokenizer(origin, tokenFn, SearchNode);
      60             :         } else {
      61           0 :                 StringView r(origin);
      62           0 :                 r.split<DefaultSep>(tokenFn);
      63             :         }
      64             : 
      65         100 :         if (canonical.empty()) {
      66           0 :                 _nodes.pop_back();
      67         100 :         } else if (canonical != origin) {
      68         100 :                 node.alignment = Distance(origin, canonical);
      69             :         }
      70         100 : }
      71             : 
      72          75 : SearchIndex::Result SearchIndex::performSearch(const StringView &v, size_t minMatch, const HeuristicCallback &cb,
      73             :                 const FilterCallback & filter) {
      74          75 :         String origin(string::tolower<Interface>(v));
      75             : 
      76          75 :         SearchIndex::Result res{this};
      77             : 
      78          75 :         uint32_t wordIndex = 0;
      79             : 
      80        5600 :         auto tokenFn = [&, this] (const StringView &str) {
      81             :                 //std::cout << "Token: " << str << "\n";
      82         100 :                 auto lb = std::lower_bound(_tokens.begin(), _tokens.end(), str, [&, this] (const Token &l, const StringView &r) {
      83         900 :                         return string::compare_c(makeStringView(l.index, l.slice), r) < 0;
      84         200 :                 });
      85             : 
      86         100 :                 if (lb != _tokens.end()) {
      87         100 :                         auto node = &_nodes.at(lb->index);
      88         100 :                         StringView value = makeStringView(*lb);
      89             :                         //std::cout << "Found: '" << value << "' from '" << _nodes.at(lb->index).canonical << "'\n";
      90             : 
      91        1350 :                         while (lb != _tokens.end() && value.size() >= str.size() && String::traits_type::compare(value.data(), str.data(), str.size()) == 0) {
      92        1250 :                                 if (!filter || filter(node)) {
      93        1250 :                                         auto ret_it = std::lower_bound(res.nodes.begin(), res.nodes.end(), node,
      94        2225 :                                                         [&] (const ResultNode &l, const Node *r) {
      95        2225 :                                                 return l.node < r;
      96        1250 :                                         });
      97        1250 :                                         if (ret_it == res.nodes.end() || ret_it->node != node) {
      98         100 :                                                 res.nodes.emplace(ret_it, ResultNode{ 0.0f, node, {ResultToken{wordIndex, uint16_t(str.size()), lb->slice}} });
      99             :                                         } else {
     100        1150 :                                                 ret_it->matches.emplace_back(ResultToken{wordIndex, uint16_t(str.size()), lb->slice});
     101             :                                         }
     102             :                                 }
     103        1250 :                                 ++ lb;
     104        1250 :                                 if (lb != _tokens.end()) {
     105        1250 :                                         value = makeStringView(*lb);
     106        1250 :                                         node = &_nodes.at(lb->index);
     107             :                                         //std::cout << "Next: '" << value << "' from '" << _nodes.at(lb->index).canonical << "'\n";
     108             :                                 }
     109             :                         }
     110             :                 }
     111         100 :                 wordIndex ++;
     112         100 :         };
     113             : 
     114          75 :         if (_tokenizer) {
     115          75 :                 _tokenizer(origin, tokenFn, SearchRequest);
     116             :         } else {
     117           0 :                 StringView r(origin);
     118           0 :                 r.split<DefaultSep>(tokenFn);
     119             :         }
     120             : 
     121          75 :         if (cb) {
     122         175 :                 for (auto &it : res.nodes) {
     123         100 :                         it.score = cb(*this, it);
     124             :                 }
     125             : 
     126          75 :                 std::sort(res.nodes.begin(), res.nodes.end(), [] (const ResultNode &l, const ResultNode &r) {
     127          50 :                         return l.score > r.score;
     128             :                 });
     129             :         }
     130             : 
     131         150 :         return res;
     132          75 : }
     133             : 
     134        2500 : StringView SearchIndex::resolveToken(const Node &node, const ResultToken &token) const {
     135        2500 :         return StringView(node.canonical.data() + token.slice.start, token.match);
     136             : }
     137             : 
     138        1250 : SearchIndex::Slice SearchIndex::convertToken(const Node &node, const ResultToken &ret) const {
     139        1250 :         if (node.alignment.empty()) {
     140           0 :                 return Slice{ret.slice.start, ret.match};
     141             :         } else {
     142        1250 :                 auto start = ret.slice.start + node.alignment.diff_original(ret.slice.start);
     143        1250 :                 auto end = ret.slice.start + ret.match;
     144        1250 :                 end += node.alignment.diff_original(end, true);
     145        1250 :                 return Slice{uint16_t(start), uint16_t(end - start)};
     146             :         }
     147             : }
     148             : 
     149          25 : void SearchIndex::print(const Callback<void(StringView)> &out) const {
     150       13775 :         for (auto &it : _tokens) {
     151       13750 :                 out << it.index << " " << makeStringView(it) << " " << _nodes.at(it.index).id << "\n";
     152             :         }
     153          25 : }
     154             : 
     155       15100 : StringView SearchIndex::makeStringView(const Token &t) const {
     156       15100 :         return makeStringView(t.index, t.slice);
     157             : }
     158             : 
     159      121975 : StringView SearchIndex::makeStringView(uint32_t idx, const Slice &sl) const {
     160      121975 :         const Node &node = _nodes.at(idx);
     161      121975 :         return StringView(node.canonical.data() + sl.start, sl.size);
     162             : }
     163             : 
     164       13750 : void SearchIndex::onToken(Vector<Token> &vec, const StringView &rep, uint32_t idx, const Slice &sl) {
     165       13750 :         auto insert_it = std::lower_bound(vec.begin(), vec.end(), rep, [&, this] (const Token &l, const StringView &r) {
     166      105975 :                 return string::compare_c(makeStringView(l.index, l.slice), r) < 0;
     167       27500 :         });
     168       13750 :         if (insert_it ==  vec.end()) {
     169         125 :                 vec.emplace_back(Token{idx, sl});
     170             :         } else {
     171       13625 :                 vec.emplace(insert_it, Token{idx, sl});
     172             :         }
     173       13750 : }
     174             : 
     175         100 : float SearchIndex::Heuristic::operator () (const SearchIndex &index, const SearchIndex::ResultNode &node) {
     176         100 :         float score = 0.0f;
     177         100 :         uint32_t idx = maxOf<uint32_t>();
     178             : 
     179         100 :         Vector<StringView> matches;
     180             : 
     181         100 :         float mod = tagScore(node.node->tag);
     182             : 
     183         100 :         float fullMatchCost = fullMatchScore / float(node.matches.size());
     184        1350 :         for (const SearchIndex::ResultToken &token_it : node.matches) {
     185        1250 :                 if (excludeEqualMatches) {
     186        1250 :                         auto t = index.resolveToken(*node.node, token_it);
     187        1250 :                         if (std::find(matches.begin(), matches.end(), t) == matches.end()) {
     188         150 :                                 matches.push_back(t);
     189             :                         } else {
     190        1100 :                                 continue;
     191             :                         }
     192             :                 }
     193             : 
     194         150 :                 if (token_it.match == token_it.slice.size) {
     195         150 :                         score += mod * fullMatchCost;
     196             :                 }
     197             : 
     198         150 :                 score += mod * wordScore(token_it.match, token_it.slice.size);
     199         150 :                 score += mod * positionScore(idx, token_it.word);
     200         150 :                 idx = token_it.word;
     201             :         }
     202         100 :         return score;
     203         100 : }
     204             : 
     205             : }

Generated by: LCOV version 1.14