LCOV - code coverage report
Current view: top level - core/search - SPSearchIndex.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 9 9 100.0 %
Date: 2024-05-12 00:16:13 Functions: 4 4 100.0 %

          Line data    Source code
       1             : /**
       2             : Copyright (c) 2017-2022 Roman Katuntsev <sbkarr@stappler.org>
       3             : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
       4             : 
       5             : Permission is hereby granted, free of charge, to any person obtaining a copy
       6             : of this software and associated documentation files (the "Software"), to deal
       7             : in the Software without restriction, including without limitation the rights
       8             : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
       9             : copies of the Software, and to permit persons to whom the Software is
      10             : furnished to do so, subject to the following conditions:
      11             : 
      12             : The above copyright notice and this permission notice shall be included in
      13             : all copies or substantial portions of the Software.
      14             : 
      15             : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      16             : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      17             : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      18             : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      19             : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
      20             : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
      21             : THE SOFTWARE.
      22             : **/
      23             : 
      24             : #ifndef STAPPLER_SEARCH_SPSEARCHINDEX_H_
      25             : #define STAPPLER_SEARCH_SPSEARCHINDEX_H_
      26             : 
      27             : #include "SPRef.h"
      28             : #include "SPSearchDistance.h"
      29             : 
      30             : namespace STAPPLER_VERSIONIZED stappler::search {
      31             : 
      32             : class SearchIndex : public Ref {
      33             : public:
      34             :         using DefaultSep = StringView::Compose<
      35             :                         StringView::CharGroup<CharGroupId::WhiteSpace>,
      36             :                         StringView::Chars<'=', '/', '(', ')', '.', ',', '-', '\'', '"', ':', ';', '?', '!', '@', '#', '$', '%', '^', '*', '\\', '+', '[', ']'>
      37             :         >;
      38             : 
      39             :         struct Slice;
      40             :         struct Node;
      41             :         struct Token;
      42             :         struct ResultToken;
      43             :         struct ResultNode;
      44             :         struct Result;
      45             : 
      46             :         enum TokenType {
      47             :                 SearchNode,
      48             :                 SearchRequest
      49             :         };
      50             : 
      51             :         using TokenCallback = Function<void(const StringView &)>;
      52             :         using TokenizerCallback = Function<void(const StringView &, const TokenCallback &, TokenType)>;
      53             :         using HeuristicCallback = Function<float(const SearchIndex &index, const ResultNode &result)>;
      54             :         using FilterCallback = Function<bool(const Node *)>;
      55             : 
      56             :         struct Slice {
      57             :                 uint16_t start = 0; // start position in node's canonical string
      58             :                 uint16_t size = 0; // length in node's canonical string
      59             :         };
      60             : 
      61             :         struct Node {
      62             :                 int64_t id = 0;
      63             :                 int64_t tag = 0;
      64             :                 String canonical;
      65             :                 Distance alignment;
      66             :         };
      67             : 
      68             :         struct Token {
      69             :                 uint32_t index = 0; // node index
      70             :                 Slice slice; // slice from canonical
      71             :         };
      72             : 
      73             :         struct ResultToken {
      74             :                 uint32_t word = 0; // node index
      75             :                 uint16_t match = 0; // node index
      76             :                 Slice slice; // slice from canonical
      77             :         };
      78             : 
      79             :         struct ResultNode {
      80             :                 float score = 0.0f;
      81             :                 const Node *node;
      82             :                 Vector<ResultToken> matches;
      83             :         };
      84             : 
      85             :         struct Result {
      86             :                 Rc<SearchIndex> ref;
      87             :                 Vector<ResultNode> nodes;
      88             :         };
      89             : 
      90             :         struct Heuristic {
      91             :                 using TagCallback = Function<float(int64_t)>;
      92             :                 using SizeCallback = Function<float(uint32_t, uint32_t)>;
      93             : 
      94          75 :                 Heuristic() { }
      95             :                 Heuristic(const TagCallback &cb, bool exclude = true) : excludeEqualMatches(exclude), tagScore(cb) { }
      96             : 
      97             :                 float operator ()(const SearchIndex &index, const SearchIndex::ResultNode &result);
      98             : 
      99             :                 bool excludeEqualMatches = true;
     100             : 
     101             :                 // for every full token match score is incremented to fullMatchScore / countOfMatches
     102             :                 float fullMatchScore = 2.0f;
     103             : 
     104             :                 // all token score is multiplied by score of it's node tag
     105         100 :                 TagCallback tagScore = [] (int64_t tag) -> float {
     106         100 :                         return 1.0f;
     107             :                 };
     108             : 
     109             :                 // token score is better for longer word
     110         150 :                 SizeCallback wordScore = [] (uint32_t match, uint32_t size) -> float {
     111         150 :                         return (1.0f + log2f(size)) * (float(match) / float(size));
     112             :                 };
     113             : 
     114             :                 // token score is better if tokens match sequentially
     115         150 :                 SizeCallback positionScore = [] (uint32_t prev, uint32_t current) -> float {
     116         150 :                         if (prev != maxOf<uint32_t>()) {
     117          50 :                                 return (prev + 1 == current)?1.0f:((prev + 2 == current)?0.5f:0.0f);
     118             :                         }
     119         100 :                         return 0.0f;
     120             :                 };
     121             :         };
     122             : 
     123             :         bool init(const TokenizerCallback & = nullptr);
     124             : 
     125             :         void reserve(size_t);
     126             :         void add(const StringView &, int64_t id, int64_t tag);
     127             : 
     128             :         Result performSearch(const StringView &, size_t minMatch, const HeuristicCallback & = Heuristic(),
     129             :                         const FilterCallback & filter = nullptr);
     130             : 
     131             :         StringView resolveToken(const Node &, const ResultToken &) const;
     132             :         Slice convertToken(const Node &, const ResultToken &) const;
     133             : 
     134             :         void print(const Callback<void(StringView)> &out) const;
     135             : 
     136             : protected:
     137             :         StringView makeStringView(const Token &) const;
     138             :         StringView makeStringView(uint32_t idx, const Slice &) const;
     139             : 
     140             :         void onToken(Vector<Token> &vec, const StringView &, uint32_t, const Slice &);
     141             : 
     142             :         Vector<Node> _nodes;
     143             :         Vector<Token> _tokens;
     144             :         TokenizerCallback _tokenizer;
     145             : };
     146             : 
     147             : }
     148             : 
     149             : #endif /* STAPPLER_SEARCH_SPSEARCHINDEX_H_ */

Generated by: LCOV version 1.14