Line data Source code
1 : /**
2 : Copyright (c) 2014 Martin Šošić
3 : Copyright (c) 2017-2022 Roman Katuntsev <sbkarr@stappler.org>
4 : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
5 :
6 : Permission is hereby granted, free of charge, to any person obtaining a copy
7 : of this software and associated documentation files (the "Software"), to deal
8 : in the Software without restriction, including without limitation the rights
9 : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 : copies of the Software, and to permit persons to whom the Software is
11 : furnished to do so, subject to the following conditions:
12 :
13 : The above copyright notice and this permission notice shall be included in
14 : all copies or substantial portions of the Software.
15 :
16 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 : THE SOFTWARE.
23 : **/
24 :
25 : #include "SPCommon.h"
26 : #include "SPSearchDistance.h"
27 :
28 : namespace STAPPLER_VERSIONIZED stappler::search {
29 :
30 : // from https://github.com/Martinsos/edlib
31 :
32 : using bytes = memory::bytes;
33 :
34 : typedef uint64_t Word;
35 : static const int WORD_SIZE = sizeof(Word) * 8; // Size of Word in bits
36 : static const Word WORD_1 = static_cast<Word>(1);
37 : static const Word HIGH_BIT_MASK = WORD_1 << (WORD_SIZE - 1); // 100..00
38 : static const int MAX_UCHAR = 255;
39 :
40 : // Status codes
41 : #define EDLIB_STATUS_OK 0
42 : #define EDLIB_STATUS_ERROR 1
43 :
44 : // Data needed to find alignment.
45 : struct AlignmentData {
46 : Word* Ps;
47 : Word* Ms;
48 : int* scores;
49 : int* firstBlocks;
50 : int* lastBlocks;
51 :
52 5325 : AlignmentData(int maxNumBlocks, int targetLength) {
53 : // We build a complete table and mark first and last block for each column
54 : // (because algorithm is banded so only part of each columns is used).
55 : // TODO: do not build a whole table, but just enough blocks for each column.
56 5325 : Ps = new Word[maxNumBlocks * targetLength];
57 5325 : Ms = new Word[maxNumBlocks * targetLength];
58 5325 : scores = new int[maxNumBlocks * targetLength];
59 5325 : firstBlocks = new int[targetLength];
60 5325 : lastBlocks = new int[targetLength];
61 5325 : }
62 :
63 5325 : ~AlignmentData() {
64 5325 : delete[] Ps;
65 5325 : delete[] Ms;
66 5325 : delete[] scores;
67 5325 : delete[] firstBlocks;
68 5325 : delete[] lastBlocks;
69 5325 : }
70 : };
71 :
72 : struct Block {
73 : Word P; // Pvin
74 : Word M; // Mvin
75 : int score; // score of last cell in block;
76 :
77 688125 : Block() {}
78 11650 : Block(Word p, Word m, int s) :P(p), M(m), score(s) {}
79 : };
80 : /**
81 : * Alignment methods - how should Edlib treat gaps before and after query?
82 : */
83 : typedef enum {
84 : EDLIB_MODE_NW,
85 : } EdlibAlignMode;
86 :
87 : /**
88 : * Alignment tasks - what do you want Edlib to do?
89 : */
90 : typedef enum {
91 : EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations.
92 : EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations.
93 : EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path.
94 : } EdlibAlignTask;
95 :
96 : /**
97 : * Describes cigar format.
98 : * @see http://samtools.github.io/hts-specs/SAMv1.pdf
99 : * @see http://drive5.com/usearch/manual/cigar.html
100 : */
101 : typedef enum {
102 : EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
103 : EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
104 : } EdlibCigarFormat;
105 :
106 : // Edit operations.
107 : #define EDLIB_EDOP_MATCH 0 //!< Match.
108 : #define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query.
109 : #define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query.
110 : #define EDLIB_EDOP_MISMATCH 3 //!< Mismatch.
111 :
112 : /**
113 : * Container for results of alignment done by edlibAlign() function.
114 : */
115 : typedef struct {
116 : /**
117 : * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values.
118 : */
119 : int status;
120 :
121 : /**
122 : * -1 if k is non-negative and edit distance is larger than k.
123 : */
124 : int editDistance;
125 :
126 : /**
127 : * Array of zero-based positions in target where optimal alignment paths end.
128 : * If gap after query is penalized, gap counts as part of query (NW), otherwise not.
129 : * Set to NULL if edit distance is larger than k.
130 : * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
131 : */
132 : int *endLocations;
133 :
134 : /**
135 : * Array of zero-based positions in target where optimal alignment paths start,
136 : * they correspond to endLocations.
137 : * If gap before query is penalized, gap counts as part of query (NW), otherwise not.
138 : * Set to NULL if not calculated or if edit distance is larger than k.
139 : * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
140 : */
141 : int *startLocations;
142 :
143 : /**
144 : * Number of end (and start) locations.
145 : */
146 : int numLocations;
147 :
148 : /**
149 : * Alignment is found for first pair of start and end locations.
150 : * Set to NULL if not calculated.
151 : * Alignment is sequence of numbers: 0, 1, 2, 3.
152 : * 0 stands for match.
153 : * 1 stands for insertion to target.
154 : * 2 stands for insertion to query.
155 : * 3 stands for mismatch.
156 : * Alignment aligns query to target from begining of query till end of query.
157 : * If gaps are not penalized, they are not in alignment.
158 : * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
159 : */
160 : unsigned char *alignment;
161 :
162 : /**
163 : * Length of alignment.
164 : */
165 : int alignmentLength;
166 :
167 : /**
168 : * Number of different characters in query and target together.
169 : */
170 : int alphabetLength;
171 : } EdlibAlignResult;
172 :
173 : typedef struct {
174 : char first;
175 : char second;
176 : } EdlibEqualityPair;
177 :
178 : /**
179 : * @brief Configuration object for edlibAlign() function.
180 : */
181 : typedef struct {
182 : /**
183 : * Set k to non-negative value to tell edlib that edit distance is not larger than k.
184 : * Smaller k can significantly improve speed of computation.
185 : * If edit distance is larger than k, edlib will set edit distance to -1.
186 : * Set k to negative value and edlib will internally auto-adjust k until score is found.
187 : */
188 : int k;
189 :
190 : /**
191 : * Alignment method.
192 : * EDLIB_MODE_NW: global (Needleman-Wunsch)
193 : * EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
194 : * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
195 : */
196 : EdlibAlignMode mode;
197 :
198 : /**
199 : * Alignment task - tells Edlib what to calculate. Less to calculate, faster it is.
200 : * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
201 : * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
202 : * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
203 : */
204 : EdlibAlignTask task;
205 : } EdlibAlignConfig;
206 :
207 : /**
208 : * Defines equality relation on alphabet characters.
209 : * By default each character is always equal only to itself, but you can also provide additional equalities.
210 : */
211 : class EqualityDefinition {
212 : private:
213 : bool matrix[MAX_UCHAR + 1][MAX_UCHAR + 1];
214 : public:
215 450 : EqualityDefinition(const std::string& alphabet) {
216 7600 : for (int i = 0; i < static_cast<int>(alphabet.size()); i++) {
217 235250 : for (int j = 0; j < static_cast<int>(alphabet.size()); j++) {
218 228100 : matrix[i][j] = (i == j);
219 : }
220 : }
221 450 : }
222 :
223 : /**
224 : * @param a Element from transformed sequence.
225 : * @param b Element from transformed sequence.
226 : * @return True if a and b are defined as equal, false otherwise.
227 : */
228 755832350 : bool areEqual(unsigned char a, unsigned char b) const {
229 755832350 : return matrix[a][b];
230 : }
231 : };
232 :
233 : static int myersCalcEditDistanceNW(const Word* Peq, int W, int maxNumBlocks,
234 : int queryLength,
235 : const unsigned char* target, int targetLength,
236 : int k, int* bestScore_,
237 : int* position_, bool findAlignment,
238 : AlignmentData** alignData, int targetStopPosition);
239 :
240 :
241 : static int obtainAlignment(
242 : const unsigned char* query, const unsigned char* rQuery, int queryLength,
243 : const unsigned char* target, const unsigned char* rTarget, int targetLength,
244 : const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore,
245 : unsigned char** alignment, int* alignmentLength);
246 :
247 : static int obtainAlignmentHirschberg(
248 : const unsigned char* query, const unsigned char* rQuery, int queryLength,
249 : const unsigned char* target, const unsigned char* rTarget, int targetLength,
250 : const EqualityDefinition& equalityDefinition, int alphabetLength, int bestScore,
251 : unsigned char** alignment, int* alignmentLength);
252 :
253 : static int obtainAlignmentTraceback(int queryLength, int targetLength,
254 : int bestScore, const AlignmentData* alignData,
255 : unsigned char** alignment, int* alignmentLength);
256 :
257 : static std::string transformSequences(const char* queryOriginal, int queryLength,
258 : const char* targetOriginal, int targetLength,
259 : unsigned char** queryTransformed,
260 : unsigned char** targetTransformed);
261 :
262 : static inline int ceilDiv(int x, int y);
263 :
264 : static inline unsigned char* createReverseCopy(const unsigned char* seq, int length);
265 :
266 : static inline Word* buildPeq(const int alphabetLength,
267 : const unsigned char* query,
268 : const int queryLength,
269 : const EqualityDefinition& equalityDefinition);
270 :
271 :
272 : /**
273 : * Main edlib method.
274 : */
275 500 : static EdlibAlignResult edlibAlign(const char* const queryOriginal, const int queryLength,
276 : const char* const targetOriginal, const int targetLength,
277 : const EdlibAlignConfig config) {
278 : EdlibAlignResult result;
279 500 : result.status = EDLIB_STATUS_OK;
280 500 : result.editDistance = -1;
281 500 : result.endLocations = result.startLocations = NULL;
282 500 : result.numLocations = 0;
283 500 : result.alignment = NULL;
284 500 : result.alignmentLength = 0;
285 500 : result.alphabetLength = 0;
286 :
287 : /*------------ TRANSFORM SEQUENCES AND RECOGNIZE ALPHABET -----------*/
288 : unsigned char* query, * target;
289 : std::string alphabet = transformSequences(queryOriginal, queryLength, targetOriginal, targetLength,
290 500 : &query, &target);
291 500 : result.alphabetLength = static_cast<int>(alphabet.size());
292 : /*-------------------------------------------------------*/
293 :
294 : // Handle special situation when at least one of the sequences has length 0.
295 500 : if (queryLength == 0 || targetLength == 0) {
296 50 : result.editDistance = std::max(queryLength, targetLength);
297 50 : result.endLocations = static_cast<int *>(malloc(sizeof(int) * 1));
298 50 : result.endLocations[0] = targetLength - 1;
299 50 : result.numLocations = 1;
300 :
301 50 : free(query);
302 50 : free(target);
303 50 : return result;
304 : }
305 :
306 : /*--------------------- INITIALIZATION ------------------*/
307 450 : int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE); // bmax in Myers
308 450 : int W = maxNumBlocks * WORD_SIZE - queryLength; // number of redundant cells in last level blocks
309 450 : EqualityDefinition equalityDefinition(alphabet);
310 450 : Word* Peq = buildPeq(static_cast<int>(alphabet.size()), query, queryLength, equalityDefinition);
311 : /*-------------------------------------------------------*/
312 :
313 : /*------------------ MAIN CALCULATION -------------------*/
314 : // TODO: Store alignment data only after k is determined? That could make things faster.
315 : int positionNW; // Used only when mode is NW.
316 450 : AlignmentData* alignData = NULL;
317 450 : bool dynamicK = false;
318 450 : int k = config.k;
319 450 : if (k < 0) { // If valid k is not given, auto-adjust k until solution is found.
320 450 : dynamicK = true;
321 450 : k = WORD_SIZE; // Gives better results than smaller k.
322 : }
323 :
324 : do {
325 900 : myersCalcEditDistanceNW(Peq, W, maxNumBlocks, queryLength, target, targetLength,
326 : k, &(result.editDistance), &positionNW, false, &alignData, -1);
327 900 : k *= 2;
328 900 : } while(dynamicK && result.editDistance == -1);
329 :
330 450 : if (result.editDistance >= 0) { // If there is solution.
331 : // If NW mode, set end location explicitly.
332 450 : if (config.mode == EDLIB_MODE_NW) {
333 450 : result.endLocations = static_cast<int *>(malloc(sizeof(int) * 1));
334 450 : result.endLocations[0] = targetLength - 1;
335 450 : result.numLocations = 1;
336 : }
337 :
338 : // Find starting locations.
339 450 : if (config.task == EDLIB_TASK_LOC || config.task == EDLIB_TASK_PATH) {
340 450 : result.startLocations = static_cast<int *>(malloc(result.numLocations * sizeof(int)));
341 900 : for (int i = 0; i < result.numLocations; i++) {
342 450 : result.startLocations[i] = 0;
343 : }
344 : }
345 :
346 : // Find alignment -> all comes down to finding alignment for NW.
347 : // Currently we return alignment only for first pair of locations.
348 450 : if (config.task == EDLIB_TASK_PATH) {
349 450 : int alnStartLocation = result.startLocations[0];
350 450 : int alnEndLocation = result.endLocations[0];
351 450 : const unsigned char* alnTarget = target + alnStartLocation;
352 450 : const int alnTargetLength = alnEndLocation - alnStartLocation + 1;
353 450 : const unsigned char* rAlnTarget = createReverseCopy(alnTarget, alnTargetLength);
354 450 : const unsigned char* rQuery = createReverseCopy(query, queryLength);
355 450 : obtainAlignment(query, rQuery, queryLength,
356 : alnTarget, rAlnTarget, alnTargetLength,
357 450 : equalityDefinition, static_cast<int>(alphabet.size()), result.editDistance,
358 : &(result.alignment), &(result.alignmentLength));
359 450 : delete[] rAlnTarget;
360 450 : delete[] rQuery;
361 : }
362 : }
363 : /*-------------------------------------------------------*/
364 :
365 : //--- Free memory ---//
366 450 : delete[] Peq;
367 450 : free(query);
368 450 : free(target);
369 450 : if (alignData) delete alignData;
370 : //-------------------//
371 :
372 450 : return result;
373 500 : }
374 :
375 : /**
376 : * Build Peq table for given query and alphabet.
377 : * Peq is table of dimensions alphabetLength+1 x maxNumBlocks.
378 : * Bit i of Peq[s * maxNumBlocks + b] is 1 if i-th symbol from block b of query equals symbol s, otherwise it is 0.
379 : * NOTICE: free returned array with delete[]!
380 : */
381 5775 : static inline Word* buildPeq(const int alphabetLength,
382 : const unsigned char* const query,
383 : const int queryLength,
384 : const EqualityDefinition& equalityDefinition) {
385 5775 : int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
386 : // table of dimensions alphabetLength+1 x maxNumBlocks. Last symbol is wildcard.
387 5775 : Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks];
388 :
389 : // Build Peq (1 is match, 0 is mismatch). NOTE: last column is wildcard(symbol that matches anything) with just 1s
390 134300 : for (int symbol = 0; symbol <= alphabetLength; symbol++) {
391 12530450 : for (int b = 0; b < maxNumBlocks; b++) {
392 12401925 : if (symbol < alphabetLength) {
393 11866050 : Peq[symbol * maxNumBlocks + b] = 0;
394 771293250 : for (int r = (b+1) * WORD_SIZE - 1; r >= b * WORD_SIZE; r--) {
395 759427200 : Peq[symbol * maxNumBlocks + b] <<= 1;
396 : // NOTE: We pretend like query is padded at the end with W wildcard symbols
397 759427200 : if (r >= queryLength || equalityDefinition.areEqual(query[r], symbol))
398 37712300 : Peq[symbol * maxNumBlocks + b] += 1;
399 : }
400 : } else { // Last symbol is wildcard, so it is all 1s
401 535875 : Peq[symbol * maxNumBlocks + b] = static_cast<Word>(-1);
402 : }
403 : }
404 : }
405 :
406 5775 : return Peq;
407 : }
408 :
409 :
410 : /**
411 : * Returns new sequence that is reverse of given sequence.
412 : * Free returned array with delete[].
413 : */
414 900 : static inline unsigned char* createReverseCopy(const unsigned char* const seq, const int length) {
415 900 : unsigned char* rSeq = new unsigned char[length];
416 5144400 : for (int i = 0; i < length; i++) {
417 5143500 : rSeq[i] = seq[length - i - 1];
418 : }
419 900 : return rSeq;
420 : }
421 :
422 : /**
423 : * Corresponds to Advance_Block function from Myers.
424 : * Calculates one word(block), which is part of a column.
425 : * Highest bit of word (one most to the left) is most bottom cell of block from column.
426 : * Pv[i] and Mv[i] define vin of cell[i]: vin = cell[i] - cell[i-1].
427 : * @param [in] Pv Bitset, Pv[i] == 1 if vin is +1, otherwise Pv[i] == 0.
428 : * @param [in] Mv Bitset, Mv[i] == 1 if vin is -1, otherwise Mv[i] == 0.
429 : * @param [in] Eq Bitset, Eq[i] == 1 if match, 0 if mismatch.
430 : * @param [in] hin Will be +1, 0 or -1.
431 : * @param [out] PvOut Bitset, PvOut[i] == 1 if vout is +1, otherwise PvOut[i] == 0.
432 : * @param [out] MvOut Bitset, MvOut[i] == 1 if vout is -1, otherwise MvOut[i] == 0.
433 : * @param [out] hout Will be +1, 0 or -1.
434 : */
435 356436175 : static inline int calculateBlock(Word Pv, Word Mv, Word Eq, const int hin,
436 : Word &PvOut, Word &MvOut) {
437 : // hin can be 1, -1 or 0.
438 : // 1 -> 00...01
439 : // 0 -> 00...00
440 : // -1 -> 11...11 (2-complement)
441 :
442 356436175 : Word hinIsNeg = static_cast<Word>(hin >> 2) & WORD_1; // 00...001 if hin is -1, 00...000 if 0 or 1
443 :
444 356436175 : Word Xv = Eq | Mv;
445 : // This is instruction below written using 'if': if (hin < 0) Eq |= (Word)1;
446 356436175 : Eq |= hinIsNeg;
447 356436175 : Word Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq;
448 :
449 356436175 : Word Ph = Mv | ~(Xh | Pv);
450 356436175 : Word Mh = Pv & Xh;
451 :
452 356436175 : int hout = 0;
453 : // This is instruction below written using 'if': if (Ph & HIGH_BIT_MASK) hout = 1;
454 356436175 : hout = (Ph & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
455 : // This is instruction below written using 'if': if (Mh & HIGH_BIT_MASK) hout = -1;
456 356436175 : hout -= (Mh & HIGH_BIT_MASK) >> (WORD_SIZE - 1);
457 :
458 356436175 : Ph <<= 1;
459 356436175 : Mh <<= 1;
460 :
461 : // This is instruction below written using 'if': if (hin < 0) Mh |= (Word)1;
462 356436175 : Mh |= hinIsNeg;
463 : // This is instruction below written using 'if': if (hin > 0) Ph |= (Word)1;
464 356436175 : Ph |= static_cast<Word>((hin + 1) >> 1);
465 :
466 356436175 : PvOut = Mh | ~(Xv | Ph);
467 356436175 : MvOut = Ph & Xv;
468 :
469 356436175 : return hout;
470 : }
471 :
472 : /**
473 : * Does ceiling division x / y.
474 : * Note: x and y must be non-negative and x + y must not overflow.
475 : */
476 19600 : static inline int ceilDiv(const int x, const int y) {
477 19600 : return x % y ? x / y + 1 : x / y;
478 : }
479 :
480 21957225 : static inline int min(const int x, const int y) {
481 21957225 : return x < y ? x : y;
482 : }
483 :
484 21945275 : static inline int max(const int x, const int y) {
485 21945275 : return x > y ? x : y;
486 : }
487 :
488 :
489 : /**
490 : * @param [in] block
491 : * @return Values of cells in block, starting with bottom cell in block.
492 : */
493 28700 : static inline std::vector<int> getBlockCellValues(const Block block) {
494 28700 : std::vector<int> scores(WORD_SIZE);
495 28700 : int score = block.score;
496 28700 : Word mask = HIGH_BIT_MASK;
497 1836800 : for (int i = 0; i < WORD_SIZE - 1; i++) {
498 1808100 : scores[i] = score;
499 1808100 : if (block.P & mask) score--;
500 1808100 : if (block.M & mask) score++;
501 1808100 : mask >>= 1;
502 : }
503 28700 : scores[WORD_SIZE - 1] = score;
504 28700 : return scores;
505 : }
506 :
507 : /**
508 : * Writes values of cells in block into given array, starting with first/top cell.
509 : * @param [in] block
510 : * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE.
511 : */
512 5825 : static inline void readBlock(const Block block, int* const dest) {
513 5825 : int score = block.score;
514 5825 : Word mask = HIGH_BIT_MASK;
515 372800 : for (int i = 0; i < WORD_SIZE - 1; i++) {
516 366975 : dest[WORD_SIZE - 1 - i] = score;
517 366975 : if (block.P & mask) score--;
518 366975 : if (block.M & mask) score++;
519 366975 : mask >>= 1;
520 : }
521 5825 : dest[0] = score;
522 5825 : }
523 :
524 : /**
525 : * Writes values of cells in block into given array, starting with last/bottom cell.
526 : * @param [in] block
527 : * @param [out] dest Array into which cell values are written. Must have size of at least WORD_SIZE.
528 : */
529 5825 : static inline void readBlockReverse(const Block block, int* const dest) {
530 5825 : int score = block.score;
531 5825 : Word mask = HIGH_BIT_MASK;
532 372800 : for (int i = 0; i < WORD_SIZE - 1; i++) {
533 366975 : dest[i] = score;
534 366975 : if (block.P & mask) score--;
535 366975 : if (block.M & mask) score++;
536 366975 : mask >>= 1;
537 : }
538 5825 : dest[WORD_SIZE - 1] = score;
539 5825 : }
540 :
541 : /**
542 : * @param [in] block
543 : * @param [in] k
544 : * @return True if all cells in block have value larger than k, otherwise false.
545 : */
546 : static inline bool allBlockCellsLarger(const Block block, const int k) {
547 : std::vector<int> scores = getBlockCellValues(block);
548 : for (int i = 0; i < WORD_SIZE; i++) {
549 : if (scores[i] <= k) return false;
550 : }
551 : return true;
552 : }
553 :
554 :
555 : /**
556 : * Uses Myers' bit-vector algorithm to find edit distance for global(NW) alignment method.
557 : * @param [in] Peq Query profile.
558 : * @param [in] W Size of padding in last block.
559 : * TODO: Calculate this directly from query, instead of passing it.
560 : * @param [in] maxNumBlocks Number of blocks needed to cover the whole query.
561 : * TODO: Calculate this directly from query, instead of passing it.
562 : * @param [in] queryLength
563 : * @param [in] target
564 : * @param [in] targetLength
565 : * @param [in] k
566 : * @param [out] bestScore_ Edit distance.
567 : * @param [out] position_ 0-indexed position in target at which best score was found.
568 : * @param [in] findAlignment If true, whole matrix is remembered and alignment data is returned.
569 : * Quadratic amount of memory is consumed.
570 : * @param [out] alignData Data needed for alignment traceback (for reconstruction of alignment).
571 : * Set only if findAlignment is set to true, otherwise it is NULL.
572 : * Make sure to free this array using delete[].
573 : * @param [out] targetStopPosition If set to -1, whole calculation is performed normally, as expected.
574 : * If set to p, calculation is performed up to position p in target (inclusive)
575 : * and column p is returned as the only column in alignData.
576 : * @return Status.
577 : */
578 6225 : static int myersCalcEditDistanceNW(const Word* const Peq, const int W, const int maxNumBlocks,
579 : const int queryLength,
580 : const unsigned char* const target, const int targetLength,
581 : int k, int* const bestScore_,
582 : int* const position_, const bool findAlignment,
583 : AlignmentData** const alignData, const int targetStopPosition) {
584 6225 : if (targetStopPosition > -1 && findAlignment) {
585 : // They can not be both set at the same time!
586 0 : return EDLIB_STATUS_ERROR;
587 : }
588 :
589 : // Each STRONG_REDUCE_NUM column is reduced in more expensive way.
590 6225 : const int STRONG_REDUCE_NUM = 2048; // TODO: Choose this number dinamically (based on query and target lengths?), so it does not affect speed of computation
591 :
592 6225 : if (k < abs(targetLength - queryLength)) {
593 250 : *bestScore_ = *position_ = -1;
594 250 : return EDLIB_STATUS_OK;
595 : }
596 :
597 5975 : k = min(k, max(queryLength, targetLength)); // Upper bound for k
598 :
599 : // firstBlock is 0-based index of first block in Ukkonen band.
600 : // lastBlock is 0-based index of last block in Ukkonen band.
601 5975 : int firstBlock = 0;
602 : // This is optimal now, by my formula.
603 5975 : int lastBlock = min(maxNumBlocks, ceilDiv(min(k, (k + queryLength - targetLength) / 2) + 1, WORD_SIZE)) - 1;
604 : Block* bl; // Current block
605 :
606 694100 : Block* blocks = new Block[maxNumBlocks];
607 :
608 : // Initialize P, M and score
609 5975 : bl = blocks;
610 20625 : for (int b = 0; b <= lastBlock; b++) {
611 14650 : bl->score = (b + 1) * WORD_SIZE;
612 14650 : bl->P = static_cast<Word>(-1); // All 1s
613 14650 : bl->M = static_cast<Word>(0);
614 14650 : bl++;
615 : }
616 :
617 : // If we want to find alignment, we have to store needed data.
618 5975 : if (findAlignment)
619 2075 : *alignData = new AlignmentData(maxNumBlocks, targetLength);
620 3900 : else if (targetStopPosition > -1)
621 3250 : *alignData = new AlignmentData(maxNumBlocks, 1);
622 : else
623 650 : *alignData = NULL;
624 :
625 5975 : const unsigned char* targetChar = target;
626 21940225 : for (int c = 0; c < targetLength; c++) { // for each column
627 21937675 : const Word* Peq_c = Peq + *targetChar * maxNumBlocks;
628 :
629 : //----------------------- Calculate column -------------------------//
630 21937675 : int hout = 1;
631 21937675 : bl = blocks + firstBlock;
632 356552600 : for (int b = firstBlock; b <= lastBlock; b++) {
633 334614925 : hout = calculateBlock(bl->P, bl->M, Peq_c[b], hout, bl->P, bl->M);
634 334614925 : bl->score += hout;
635 334614925 : bl++;
636 : }
637 21937675 : bl--;
638 : //------------------------------------------------------------------//
639 : // bl now points to last block
640 :
641 : // Update k. I do it only on end of column because it would slow calculation too much otherwise.
642 : // NOTICE: I add W when in last block because it is actually result from W cells to the left and W cells up.
643 21937675 : k = min(k, bl->score
644 21937675 : + max(targetLength - c - 1, queryLength - ((1 + lastBlock) * WORD_SIZE - 1) - 1)
645 21937675 : + (lastBlock == maxNumBlocks - 1 ? W : 0));
646 :
647 : //---------- Adjust number of blocks according to Ukkonen ----------//
648 : //--- Adjust last block ---//
649 : // If block is not beneath band, calculate next block. Only next because others are certainly beneath band.
650 21937675 : if (lastBlock + 1 < maxNumBlocks
651 21821250 : && !(//score[lastBlock] >= k + WORD_SIZE || // NOTICE: this condition could be satisfied if above block also!
652 21821250 : ((lastBlock + 1) * WORD_SIZE - 1
653 21821250 : > k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength))) {
654 21821250 : lastBlock++; bl++;
655 21821250 : bl->P = static_cast<Word>(-1); // All 1s
656 21821250 : bl->M = static_cast<Word>(0);
657 21821250 : int newHout = calculateBlock(bl->P, bl->M, Peq_c[lastBlock], hout, bl->P, bl->M);
658 21821250 : bl->score = (bl - 1)->score - hout + WORD_SIZE + newHout;
659 21821250 : hout = newHout;
660 : }
661 :
662 : // While block is out of band, move one block up.
663 : // NOTE: Condition used here is more loose than the one from the article, since I simplified the max() part of it.
664 : // I could consider adding that max part, for optimal performance.
665 21937675 : while (lastBlock >= firstBlock
666 43422725 : && (bl->score >= k + WORD_SIZE
667 40152450 : || ((lastBlock + 1) * WORD_SIZE - 1 >
668 : // TODO: Does not work if do not put +1! Why???
669 40152450 : k - bl->score + 2 * WORD_SIZE - 2 - targetLength + c + queryLength + 1))) {
670 21485050 : lastBlock--; bl--;
671 : }
672 : //-------------------------//
673 :
674 : //--- Adjust first block ---//
675 : // While outside of band, advance block
676 21937675 : while (firstBlock <= lastBlock
677 22274275 : && (blocks[firstBlock].score >= k + WORD_SIZE
678 22274100 : || ((firstBlock + 1) * WORD_SIZE - 1 <
679 22274100 : blocks[firstBlock].score - k - targetLength + queryLength + c))) {
680 336600 : firstBlock++;
681 : }
682 : //--------------------------/
683 :
684 :
685 : // TODO: consider if this part is useful, it does not seem to help much
686 21937675 : if (c % STRONG_REDUCE_NUM == 0) { // Every some columns do more expensive but more efficient reduction
687 13075 : while (lastBlock >= firstBlock) {
688 : // If all cells outside of band, remove block
689 13075 : std::vector<int> scores = getBlockCellValues(*bl);
690 13075 : int numCells = lastBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
691 13075 : int r = lastBlock * WORD_SIZE + numCells - 1;
692 13075 : bool reduce = true;
693 386800 : for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
694 : // TODO: Does not work if do not put +1! Why???
695 386800 : if (scores[i] <= k && r <= k - scores[i] - targetLength + c + queryLength + 1) {
696 13075 : reduce = false;
697 13075 : break;
698 : }
699 373725 : r--;
700 : }
701 13075 : if (!reduce) break;
702 0 : lastBlock--; bl--;
703 13075 : }
704 :
705 13075 : while (firstBlock <= lastBlock) {
706 : // If all cells outside of band, remove block
707 13075 : std::vector<int> scores = getBlockCellValues(blocks[firstBlock]);
708 13075 : int numCells = firstBlock == maxNumBlocks - 1 ? WORD_SIZE - W : WORD_SIZE;
709 13075 : int r = firstBlock * WORD_SIZE + numCells - 1;
710 13075 : bool reduce = true;
711 33000 : for (int i = WORD_SIZE - numCells; i < WORD_SIZE; i++) {
712 33000 : if (scores[i] <= k && r >= scores[i] - k - targetLength + c + queryLength) {
713 13075 : reduce = false;
714 13075 : break;
715 : }
716 19925 : r--;
717 : }
718 13075 : if (!reduce) break;
719 0 : firstBlock++;
720 13075 : }
721 : }
722 :
723 :
724 : // If band stops to exist finish
725 21937675 : if (lastBlock < firstBlock) {
726 175 : *bestScore_ = *position_ = -1;
727 175 : delete[] blocks;
728 175 : return EDLIB_STATUS_OK;
729 : }
730 : //------------------------------------------------------------------//
731 :
732 :
733 : //---- Save column so it can be used for reconstruction ----//
734 21937500 : if (findAlignment && c < targetLength) {
735 2574900 : bl = blocks + firstBlock;
736 6254650 : for (int b = firstBlock; b <= lastBlock; b++) {
737 3679750 : (*alignData)->Ps[maxNumBlocks * c + b] = bl->P;
738 3679750 : (*alignData)->Ms[maxNumBlocks * c + b] = bl->M;
739 3679750 : (*alignData)->scores[maxNumBlocks * c + b] = bl->score;
740 3679750 : bl++;
741 : }
742 2574900 : (*alignData)->firstBlocks[c] = firstBlock;
743 2574900 : (*alignData)->lastBlocks[c] = lastBlock;
744 : }
745 : //----------------------------------------------------------//
746 : //---- If this is stop column, save it and finish ----//
747 21937500 : if (c == targetStopPosition) {
748 14900 : for (int b = firstBlock; b <= lastBlock; b++) {
749 11650 : (*alignData)->Ps[b] = (blocks + b)->P;
750 11650 : (*alignData)->Ms[b] = (blocks + b)->M;
751 11650 : (*alignData)->scores[b] = (blocks + b)->score;
752 : }
753 3250 : (*alignData)->firstBlocks[0] = firstBlock;
754 3250 : (*alignData)->lastBlocks[0] = lastBlock;
755 3250 : *bestScore_ = -1;
756 3250 : *position_ = targetStopPosition;
757 3250 : delete[] blocks;
758 3250 : return EDLIB_STATUS_OK;
759 : }
760 : //----------------------------------------------------//
761 :
762 21934250 : targetChar++;
763 : }
764 :
765 2550 : if (lastBlock == maxNumBlocks - 1) { // If last block of last column was calculated
766 : // Obtain best score from block -> it is complicated because query is padded with W cells
767 2550 : int bestScore = getBlockCellValues(blocks[lastBlock])[W];
768 2550 : if (bestScore <= k) {
769 2525 : *bestScore_ = bestScore;
770 2525 : *position_ = targetLength - 1;
771 2525 : delete[] blocks;
772 2525 : return EDLIB_STATUS_OK;
773 : }
774 : }
775 :
776 25 : *bestScore_ = *position_ = -1;
777 25 : delete[] blocks;
778 25 : return EDLIB_STATUS_OK;
779 : }
780 :
781 :
782 : /**
783 : * Finds one possible alignment that gives optimal score by moving back through the dynamic programming matrix,
784 : * that is stored in alignData. Consumes large amount of memory: O(queryLength * targetLength).
785 : * @param [in] queryLength Normal length, without W.
786 : * @param [in] targetLength Normal length, without W.
787 : * @param [in] bestScore Best score.
788 : * @param [in] alignData Data obtained during finding best score that is useful for finding alignment.
789 : * @param [out] alignment Alignment.
790 : * @param [out] alignmentLength Length of alignment.
791 : * @return Status code.
792 : */
793 2075 : static int obtainAlignmentTraceback(const int queryLength, const int targetLength,
794 : const int bestScore, const AlignmentData* const alignData,
795 : unsigned char** const alignment, int* const alignmentLength) {
796 2075 : const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
797 2075 : const int W = maxNumBlocks * WORD_SIZE - queryLength;
798 :
799 2075 : *alignment = static_cast<unsigned char*>(malloc((queryLength + targetLength - 1) * sizeof(unsigned char)));
800 2075 : *alignmentLength = 0;
801 2075 : int c = targetLength - 1; // index of column
802 2075 : int b = maxNumBlocks - 1; // index of block in column
803 2075 : int currScore = bestScore; // Score of current cell
804 2075 : int lScore = -1; // Score of left cell
805 2075 : int uScore = -1; // Score of upper cell
806 2075 : int ulScore = -1; // Score of upper left cell
807 2075 : Word currP = alignData->Ps[c * maxNumBlocks + b]; // P of current block
808 2075 : Word currM = alignData->Ms[c * maxNumBlocks + b]; // M of current block
809 : // True if block to left exists and is in band
810 2075 : bool thereIsLeftBlock = c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1];
811 : // We set initial values of lP and lM to 0 only to avoid compiler warnings, they should not affect the
812 : // calculation as both lP and lM should be initialized at some moment later (but compiler can not
813 : // detect it since this initialization is guaranteed by "business" logic).
814 2075 : Word lP = 0, lM = 0;
815 2075 : if (thereIsLeftBlock) {
816 2075 : lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // P of block to the left
817 2075 : lM = alignData->Ms[(c - 1) * maxNumBlocks + b]; // M of block to the left
818 : }
819 2075 : currP <<= W;
820 2075 : currM <<= W;
821 2075 : int blockPos = WORD_SIZE - W - 1; // 0 based index of current cell in blockPos
822 :
823 : // TODO(martin): refactor this whole piece of code. There are too many if-else statements,
824 : // it is too easy for a bug to hide and to hard to effectively cover all the edge-cases.
825 : // We need better separation of logic and responsibilities.
826 : while (true) {
827 2625825 : if (c == 0) {
828 2000 : thereIsLeftBlock = true;
829 2000 : lScore = b * WORD_SIZE + blockPos + 1;
830 2000 : ulScore = lScore - 1;
831 : }
832 :
833 : // TODO: improvement: calculate only those cells that are needed,
834 : // for example if I calculate upper cell and can move up,
835 : // there is no need to calculate left and upper left cell
836 : //---------- Calculate scores ---------//
837 2625825 : if (lScore == -1 && thereIsLeftBlock) {
838 2571800 : lScore = alignData->scores[(c - 1) * maxNumBlocks + b]; // score of block to the left
839 84554900 : for (int i = 0; i < WORD_SIZE - blockPos - 1; i++) {
840 81983100 : if (lP & HIGH_BIT_MASK) lScore--;
841 81983100 : if (lM & HIGH_BIT_MASK) lScore++;
842 81983100 : lP <<= 1;
843 81983100 : lM <<= 1;
844 : }
845 : }
846 2625825 : if (ulScore == -1) {
847 2623825 : if (lScore != -1) {
848 2622950 : ulScore = lScore;
849 2622950 : if (lP & HIGH_BIT_MASK) ulScore--;
850 2622950 : if (lM & HIGH_BIT_MASK) ulScore++;
851 : }
852 875 : else if (c > 0 && b-1 >= alignData->firstBlocks[c-1] && b-1 <= alignData->lastBlocks[c-1]) {
853 : // This is the case when upper left cell is last cell in block,
854 : // and block to left is not in band so lScore is -1.
855 875 : ulScore = alignData->scores[(c - 1) * maxNumBlocks + b - 1];
856 : }
857 : }
858 2625825 : if (uScore == -1) {
859 2568525 : uScore = currScore;
860 2568525 : if (currP & HIGH_BIT_MASK) uScore--;
861 2568525 : if (currM & HIGH_BIT_MASK) uScore++;
862 2568525 : currP <<= 1;
863 2568525 : currM <<= 1;
864 : }
865 : //-------------------------------------//
866 :
867 : // TODO: should I check if there is upper block?
868 :
869 : //-------------- Move --------------//
870 : // Move up - insertion to target - deletion from query
871 2625825 : if (uScore != -1 && uScore + 1 == currScore) {
872 51225 : currScore = uScore;
873 51225 : lScore = ulScore;
874 51225 : uScore = ulScore = -1;
875 51225 : if (blockPos == 0) { // If entering new (upper) block
876 950 : if (b == 0) { // If there are no cells above (only boundary cells)
877 0 : (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT; // Move up
878 0 : for (int i = 0; i < c + 1; i++) // Move left until end
879 0 : (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
880 0 : break;
881 : } else {
882 950 : blockPos = WORD_SIZE - 1;
883 950 : b--;
884 950 : currP = alignData->Ps[c * maxNumBlocks + b];
885 950 : currM = alignData->Ms[c * maxNumBlocks + b];
886 950 : if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
887 950 : thereIsLeftBlock = true;
888 950 : lP = alignData->Ps[(c - 1) * maxNumBlocks + b]; // TODO: improve this, too many operations
889 950 : lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
890 : } else {
891 0 : thereIsLeftBlock = false;
892 : // TODO(martin): There may not be left block, but there can be left boundary - do we
893 : // handle this correctly then? Are l and ul score set correctly? I should check that / refactor this.
894 : }
895 : }
896 : } else {
897 50275 : blockPos--;
898 50275 : lP <<= 1;
899 50275 : lM <<= 1;
900 : }
901 : // Mark move
902 51225 : (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
903 51225 : }
904 : // Move left - deletion from target - insertion to query
905 2574600 : else if (lScore != -1 && lScore + 1 == currScore) {
906 57300 : currScore = lScore;
907 57300 : uScore = ulScore;
908 57300 : lScore = ulScore = -1;
909 57300 : c--;
910 57300 : if (c == -1) { // If there are no cells to the left (only boundary cells)
911 0 : (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE; // Move left
912 0 : int numUp = b * WORD_SIZE + blockPos + 1;
913 0 : for (int i = 0; i < numUp; i++) // Move up until end
914 0 : (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
915 0 : break;
916 : }
917 57300 : currP = lP;
918 57300 : currM = lM;
919 57300 : if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
920 57300 : thereIsLeftBlock = true;
921 57300 : lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
922 57300 : lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
923 : } else {
924 0 : if (c == 0) { // If there are no cells to the left (only boundary cells)
925 0 : thereIsLeftBlock = true;
926 0 : lScore = b * WORD_SIZE + blockPos + 1;
927 0 : ulScore = lScore - 1;
928 : } else {
929 0 : thereIsLeftBlock = false;
930 : }
931 : }
932 : // Mark move
933 57300 : (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
934 57300 : }
935 : // Move up left - (mis)match
936 2517300 : else if (ulScore != -1) {
937 2517300 : unsigned char moveCode = ulScore == currScore ? EDLIB_EDOP_MATCH : EDLIB_EDOP_MISMATCH;
938 2517300 : currScore = ulScore;
939 2517300 : uScore = lScore = ulScore = -1;
940 2517300 : c--;
941 2517300 : if (c == -1) { // If there are no cells to the left (only boundary cells)
942 1925 : (*alignment)[(*alignmentLength)++] = moveCode; // Move left
943 1925 : int numUp = b * WORD_SIZE + blockPos;
944 2000 : for (int i = 0; i < numUp; i++) // Move up until end
945 75 : (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_INSERT;
946 1925 : break;
947 : }
948 2515375 : if (blockPos == 0) { // If entering upper left block
949 38500 : if (b == 0) { // If there are no more cells above (only boundary cells)
950 150 : (*alignment)[(*alignmentLength)++] = moveCode; // Move up left
951 450 : for (int i = 0; i < c + 1; i++) // Move left until end
952 300 : (*alignment)[(*alignmentLength)++] = EDLIB_EDOP_DELETE;
953 150 : break;
954 : }
955 38350 : blockPos = WORD_SIZE - 1;
956 38350 : b--;
957 38350 : currP = alignData->Ps[c * maxNumBlocks + b];
958 38350 : currM = alignData->Ms[c * maxNumBlocks + b];
959 : } else { // If entering left block
960 2476875 : blockPos--;
961 2476875 : currP = lP;
962 2476875 : currM = lM;
963 2476875 : currP <<= 1;
964 2476875 : currM <<= 1;
965 : }
966 : // Set new left block
967 2515225 : if (c > 0 && b >= alignData->firstBlocks[c-1] && b <= alignData->lastBlocks[c-1]) {
968 2512425 : thereIsLeftBlock = true;
969 2512425 : lP = alignData->Ps[(c - 1) * maxNumBlocks + b];
970 2512425 : lM = alignData->Ms[(c - 1) * maxNumBlocks + b];
971 : } else {
972 2800 : if (c == 0) { // If there are no cells to the left (only boundary cells)
973 1925 : thereIsLeftBlock = true;
974 1925 : lScore = b * WORD_SIZE + blockPos + 1;
975 1925 : ulScore = lScore - 1;
976 : } else {
977 875 : thereIsLeftBlock = false;
978 : }
979 : }
980 : // Mark move
981 2515225 : (*alignment)[(*alignmentLength)++] = moveCode;
982 : } else {
983 : // Reached end - finished!
984 0 : break;
985 : }
986 : //----------------------------------//
987 2623750 : }
988 :
989 2075 : *alignment = static_cast<unsigned char*>(realloc(*alignment, (*alignmentLength) * sizeof(unsigned char)));
990 2075 : std::reverse(*alignment, *alignment + (*alignmentLength));
991 2075 : return EDLIB_STATUS_OK;
992 : }
993 :
994 :
995 : /**
996 : * Finds one possible alignment that gives optimal score (bestScore).
997 : * It will split problem into smaller problems using Hirschberg's algorithm and when they are small enough,
998 : * it will solve them using traceback algorithm.
999 : * @param [in] query
1000 : * @param [in] rQuery Reversed query.
1001 : * @param [in] queryLength
1002 : * @param [in] target
1003 : * @param [in] rTarget Reversed target.
1004 : * @param [in] targetLength
1005 : * @param [in] equalityDefinition
1006 : * @param [in] alphabetLength
1007 : * @param [in] bestScore Best(optimal) score.
1008 : * @param [out] alignment Sequence of edit operations that make target equal to query.
1009 : * @param [out] alignmentLength Length of alignment.
1010 : * @return Status code.
1011 : */
1012 3700 : static int obtainAlignment(
1013 : const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
1014 : const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
1015 : const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
1016 : unsigned char** const alignment, int* const alignmentLength) {
1017 :
1018 : // Handle special case when one of sequences has length of 0.
1019 3700 : if (queryLength == 0 || targetLength == 0) {
1020 0 : *alignmentLength = targetLength + queryLength;
1021 0 : *alignment = static_cast<unsigned char*>(malloc((*alignmentLength) * sizeof(unsigned char)));
1022 0 : for (int i = 0; i < *alignmentLength; i++) {
1023 0 : (*alignment)[i] = queryLength == 0 ? EDLIB_EDOP_DELETE : EDLIB_EDOP_INSERT;
1024 : }
1025 0 : return EDLIB_STATUS_OK;
1026 : }
1027 :
1028 3700 : const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
1029 3700 : const int W = maxNumBlocks * WORD_SIZE - queryLength;
1030 : int statusCode;
1031 :
1032 : // TODO: think about reducing number of memory allocations in alignment functions, probably
1033 : // by sharing some memory that is allocated only once. That refers to: Peq, columns in Hirschberg,
1034 : // and it could also be done for alignments - we could have one big array for alignment that would be
1035 : // sparsely populated by each of steps in recursion, and at the end we would just consolidate those results.
1036 :
1037 : // If estimated memory consumption for traceback algorithm is smaller than 1MB use it,
1038 : // otherwise use Hirschberg's algorithm. By running few tests I choose boundary of 1MB as optimal.
1039 3700 : long long alignmentDataSize = (2ll * sizeof(Word) + sizeof(int)) * maxNumBlocks * targetLength
1040 3700 : + 2ll * sizeof(int) * targetLength;
1041 3700 : if (alignmentDataSize < 1024 * 1024) {
1042 : int score_, endLocation_; // Used only to call function.
1043 2075 : AlignmentData* alignData = NULL;
1044 2075 : Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
1045 2075 : myersCalcEditDistanceNW(Peq, W, maxNumBlocks,
1046 : queryLength,
1047 : target, targetLength,
1048 : bestScore,
1049 : &score_, &endLocation_, true, &alignData, -1);
1050 : //assert(score_ == bestScore);
1051 : //assert(endLocation_ == targetLength - 1);
1052 :
1053 2075 : statusCode = obtainAlignmentTraceback(queryLength, targetLength,
1054 : bestScore, alignData, alignment, alignmentLength);
1055 2075 : delete alignData;
1056 2075 : delete[] Peq;
1057 : } else {
1058 1625 : statusCode = obtainAlignmentHirschberg(query, rQuery, queryLength,
1059 : target, rTarget, targetLength,
1060 : equalityDefinition, alphabetLength, bestScore,
1061 : alignment, alignmentLength);
1062 : }
1063 3700 : return statusCode;
1064 : }
1065 :
1066 :
1067 : /**
1068 : * Finds one possible alignment that gives optimal score (bestScore).
1069 : * Uses Hirschberg's algorithm to split problem into two sub-problems, solve them and combine them together.
1070 : * @param [in] query
1071 : * @param [in] rQuery Reversed query.
1072 : * @param [in] queryLength
1073 : * @param [in] target
1074 : * @param [in] rTarget Reversed target.
1075 : * @param [in] targetLength
1076 : * @param [in] alphabetLength
1077 : * @param [in] bestScore Best(optimal) score.
1078 : * @param [out] alignment Sequence of edit operations that make target equal to query.
1079 : * @param [out] alignmentLength Length of alignment.
1080 : * @return Status code.
1081 : */
1082 1625 : static int obtainAlignmentHirschberg(
1083 : const unsigned char* const query, const unsigned char* const rQuery, const int queryLength,
1084 : const unsigned char* const target, const unsigned char* const rTarget, const int targetLength,
1085 : const EqualityDefinition& equalityDefinition, const int alphabetLength, const int bestScore,
1086 : unsigned char** const alignment, int* const alignmentLength) {
1087 :
1088 1625 : const int maxNumBlocks = ceilDiv(queryLength, WORD_SIZE);
1089 1625 : const int W = maxNumBlocks * WORD_SIZE - queryLength;
1090 :
1091 1625 : Word* Peq = buildPeq(alphabetLength, query, queryLength, equalityDefinition);
1092 1625 : Word* rPeq = buildPeq(alphabetLength, rQuery, queryLength, equalityDefinition);
1093 :
1094 : // Used only to call functions.
1095 : int score_, endLocation_;
1096 :
1097 : // Divide dynamic matrix into two halfs, left and right.
1098 1625 : const int leftHalfWidth = targetLength / 2;
1099 1625 : const int rightHalfWidth = targetLength - leftHalfWidth;
1100 :
1101 : // Calculate left half.
1102 1625 : AlignmentData* alignDataLeftHalf = NULL;
1103 1625 : int leftHalfCalcStatus = myersCalcEditDistanceNW(
1104 : Peq, W, maxNumBlocks, queryLength, target, targetLength, bestScore,
1105 : &score_, &endLocation_, false, &alignDataLeftHalf, leftHalfWidth - 1);
1106 :
1107 : // Calculate right half.
1108 1625 : AlignmentData* alignDataRightHalf = NULL;
1109 1625 : int rightHalfCalcStatus = myersCalcEditDistanceNW(
1110 : rPeq, W, maxNumBlocks, queryLength, rTarget, targetLength, bestScore,
1111 : &score_, &endLocation_, false, &alignDataRightHalf, rightHalfWidth - 1);
1112 :
1113 1625 : delete[] Peq;
1114 1625 : delete[] rPeq;
1115 :
1116 1625 : if (leftHalfCalcStatus == EDLIB_STATUS_ERROR || rightHalfCalcStatus == EDLIB_STATUS_ERROR) {
1117 0 : if (alignDataLeftHalf) delete alignDataLeftHalf;
1118 0 : if (alignDataRightHalf) delete alignDataRightHalf;
1119 0 : return EDLIB_STATUS_ERROR;
1120 : }
1121 :
1122 : // Unwrap the left half.
1123 1625 : int firstBlockIdxLeft = alignDataLeftHalf->firstBlocks[0];
1124 1625 : int lastBlockIdxLeft = alignDataLeftHalf->lastBlocks[0];
1125 : // TODO: avoid this allocation by using some shared array?
1126 : // scoresLeft contains scores from left column, starting with scoresLeftStartIdx row (query index)
1127 : // and ending with scoresLeftEndIdx row (0-indexed).
1128 1625 : int scoresLeftLength = (lastBlockIdxLeft - firstBlockIdxLeft + 1) * WORD_SIZE;
1129 1625 : int* scoresLeft = new int[scoresLeftLength];
1130 7450 : for (int blockIdx = firstBlockIdxLeft; blockIdx <= lastBlockIdxLeft; blockIdx++) {
1131 5825 : Block block(alignDataLeftHalf->Ps[blockIdx], alignDataLeftHalf->Ms[blockIdx],
1132 5825 : alignDataLeftHalf->scores[blockIdx]);
1133 5825 : readBlock(block, scoresLeft + (blockIdx - firstBlockIdxLeft) * WORD_SIZE);
1134 : }
1135 1625 : int scoresLeftStartIdx = firstBlockIdxLeft * WORD_SIZE;
1136 : // If last block contains padding, shorten the length of scores for the length of padding.
1137 1625 : if (lastBlockIdxLeft == maxNumBlocks - 1) {
1138 0 : scoresLeftLength -= W;
1139 : }
1140 :
1141 : // Unwrap the right half (I also reverse it while unwraping).
1142 1625 : int firstBlockIdxRight = alignDataRightHalf->firstBlocks[0];
1143 1625 : int lastBlockIdxRight = alignDataRightHalf->lastBlocks[0];
1144 1625 : int scoresRightLength = (lastBlockIdxRight - firstBlockIdxRight + 1) * WORD_SIZE;
1145 1625 : int* scoresRight = new int[scoresRightLength];
1146 1625 : int* scoresRightOriginalStart = scoresRight;
1147 7450 : for (int blockIdx = firstBlockIdxRight; blockIdx <= lastBlockIdxRight; blockIdx++) {
1148 5825 : Block block(alignDataRightHalf->Ps[blockIdx], alignDataRightHalf->Ms[blockIdx],
1149 5825 : alignDataRightHalf->scores[blockIdx]);
1150 5825 : readBlockReverse(block, scoresRight + (lastBlockIdxRight - blockIdx) * WORD_SIZE);
1151 : }
1152 1625 : int scoresRightStartIdx = queryLength - (lastBlockIdxRight + 1) * WORD_SIZE;
1153 : // If there is padding at the beginning of scoresRight (that can happen because of reversing that we do),
1154 : // move pointer forward to remove the padding (that is why we remember originalStart).
1155 1625 : if (scoresRightStartIdx < 0) {
1156 : //assert(scoresRightStartIdx == -1 * W);
1157 0 : scoresRight += W;
1158 0 : scoresRightStartIdx += W;
1159 0 : scoresRightLength -= W;
1160 : }
1161 :
1162 1625 : delete alignDataLeftHalf;
1163 1625 : delete alignDataRightHalf;
1164 :
1165 : //--------------------- Find the best move ----------------//
1166 : // Find the query/row index of cell in left column which together with its lower right neighbour
1167 : // from right column gives the best score (when summed). We also have to consider boundary cells
1168 : // (those cells at -1 indexes).
1169 : // x|
1170 : // -+-
1171 : // |x
1172 1625 : int queryIdxLeftStart = max(scoresLeftStartIdx, scoresRightStartIdx - 1);
1173 1625 : int queryIdxLeftEnd = min(scoresLeftStartIdx + scoresLeftLength - 1,
1174 1625 : scoresRightStartIdx + scoresRightLength - 2);
1175 1625 : int leftScore = -1, rightScore = -1;
1176 1625 : int queryIdxLeftAlignment = -1; // Query/row index of cell in left column where alignment is passing through.
1177 1625 : bool queryIdxLeftAlignmentFound = false;
1178 161350 : for (int queryIdx = queryIdxLeftStart; queryIdx <= queryIdxLeftEnd; queryIdx++) {
1179 161350 : leftScore = scoresLeft[queryIdx - scoresLeftStartIdx];
1180 161350 : rightScore = scoresRight[queryIdx + 1 - scoresRightStartIdx];
1181 161350 : if (leftScore + rightScore == bestScore) {
1182 1625 : queryIdxLeftAlignment = queryIdx;
1183 1625 : queryIdxLeftAlignmentFound = true;
1184 1625 : break;
1185 : }
1186 : }
1187 : // Check boundary cells.
1188 1625 : if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx == 0 && scoresRightStartIdx == 0) {
1189 0 : leftScore = leftHalfWidth;
1190 0 : rightScore = scoresRight[0];
1191 0 : if (leftScore + rightScore == bestScore) {
1192 0 : queryIdxLeftAlignment = -1;
1193 0 : queryIdxLeftAlignmentFound = true;
1194 : }
1195 : }
1196 1625 : if (!queryIdxLeftAlignmentFound && scoresLeftStartIdx + scoresLeftLength == queryLength
1197 0 : && scoresRightStartIdx + scoresRightLength == queryLength) {
1198 0 : leftScore = scoresLeft[scoresLeftLength - 1];
1199 0 : rightScore = rightHalfWidth;
1200 0 : if (leftScore + rightScore == bestScore) {
1201 0 : queryIdxLeftAlignment = queryLength - 1;
1202 0 : queryIdxLeftAlignmentFound = true;
1203 : }
1204 : }
1205 :
1206 1625 : delete[] scoresLeft;
1207 1625 : delete[] scoresRightOriginalStart;
1208 :
1209 1625 : if (queryIdxLeftAlignmentFound == false) {
1210 : // If there was no move that is part of optimal alignment, then there is no such alignment
1211 : // or given bestScore is not correct!
1212 0 : return EDLIB_STATUS_ERROR;
1213 : }
1214 : //----------------------------------------------------------//
1215 :
1216 : // Calculate alignments for upper half of left half (upper left - ul)
1217 : // and lower half of right half (lower right - lr).
1218 1625 : const int ulHeight = queryIdxLeftAlignment + 1;
1219 1625 : const int lrHeight = queryLength - ulHeight;
1220 1625 : const int ulWidth = leftHalfWidth;
1221 1625 : const int lrWidth = rightHalfWidth;
1222 1625 : unsigned char* ulAlignment = NULL; int ulAlignmentLength;
1223 3250 : int ulStatusCode = obtainAlignment(query, rQuery + lrHeight, ulHeight,
1224 1625 : target, rTarget + lrWidth, ulWidth,
1225 : equalityDefinition, alphabetLength, leftScore,
1226 : &ulAlignment, &ulAlignmentLength);
1227 1625 : unsigned char* lrAlignment = NULL; int lrAlignmentLength;
1228 3250 : int lrStatusCode = obtainAlignment(query + ulHeight, rQuery, lrHeight,
1229 1625 : target + ulWidth, rTarget, lrWidth,
1230 : equalityDefinition, alphabetLength, rightScore,
1231 : &lrAlignment, &lrAlignmentLength);
1232 1625 : if (ulStatusCode == EDLIB_STATUS_ERROR || lrStatusCode == EDLIB_STATUS_ERROR) {
1233 0 : if (ulAlignment) free(ulAlignment);
1234 0 : if (lrAlignment) free(lrAlignment);
1235 0 : return EDLIB_STATUS_ERROR;
1236 : }
1237 :
1238 : // Build alignment by concatenating upper left alignment with lower right alignment.
1239 1625 : *alignmentLength = ulAlignmentLength + lrAlignmentLength;
1240 1625 : *alignment = static_cast<unsigned char*>(malloc((*alignmentLength) * sizeof(unsigned char)));
1241 1625 : memcpy(*alignment, ulAlignment, ulAlignmentLength);
1242 1625 : memcpy(*alignment + ulAlignmentLength, lrAlignment, lrAlignmentLength);
1243 :
1244 1625 : free(ulAlignment);
1245 1625 : free(lrAlignment);
1246 1625 : return EDLIB_STATUS_OK;
1247 : }
1248 :
1249 :
1250 : /**
1251 : * Takes char query and char target, recognizes alphabet and transforms them into unsigned char sequences
1252 : * where elements in sequences are not any more letters of alphabet, but their index in alphabet.
1253 : * Most of internal edlib functions expect such transformed sequences.
1254 : * This function will allocate queryTransformed and targetTransformed, so make sure to free them when done.
1255 : * Example:
1256 : * Original sequences: "ACT" and "CGT".
1257 : * Alphabet would be recognized as "ACTG". Alphabet length = 4.
1258 : * Transformed sequences: [0, 1, 2] and [1, 3, 2].
1259 : * @param [in] queryOriginal
1260 : * @param [in] queryLength
1261 : * @param [in] targetOriginal
1262 : * @param [in] targetLength
1263 : * @param [out] queryTransformed It will contain values in range [0, alphabet length - 1].
1264 : * @param [out] targetTransformed It will contain values in range [0, alphabet length - 1].
1265 : * @return Alphabet as a string of unique characters, where index of each character is its value in transformed
1266 : * sequences.
1267 : */
1268 500 : static std::string transformSequences(const char* const queryOriginal, const int queryLength,
1269 : const char* const targetOriginal, const int targetLength,
1270 : unsigned char** const queryTransformed,
1271 : unsigned char** const targetTransformed) {
1272 : // Alphabet is constructed from letters that are present in sequences.
1273 : // Each letter is assigned an ordinal number, starting from 0 up to alphabetLength - 1,
1274 : // and new query and target are created in which letters are replaced with their ordinal numbers.
1275 : // This query and target are used in all the calculations later.
1276 500 : *queryTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * queryLength));
1277 500 : *targetTransformed = static_cast<unsigned char *>(malloc(sizeof(unsigned char) * targetLength));
1278 :
1279 500 : std::string alphabet = "";
1280 :
1281 : // Alphabet information, it is constructed on fly while transforming sequences.
1282 : // letterIdx[c] is index of letter c in alphabet.
1283 : unsigned char letterIdx[MAX_UCHAR + 1];
1284 : bool inAlphabet[MAX_UCHAR + 1]; // inAlphabet[c] is true if c is in alphabet
1285 128500 : for (int i = 0; i < MAX_UCHAR + 1; i++) inAlphabet[i] = false;
1286 :
1287 2582775 : for (int i = 0; i < queryLength; i++) {
1288 2582275 : unsigned char c = static_cast<unsigned char>(queryOriginal[i]);
1289 2582275 : if (!inAlphabet[c]) {
1290 7625 : inAlphabet[c] = true;
1291 7625 : letterIdx[c] = static_cast<unsigned char>(alphabet.size());
1292 7625 : alphabet += queryOriginal[i];
1293 : }
1294 2582275 : (*queryTransformed)[i] = letterIdx[c];
1295 : }
1296 2589075 : for (int i = 0; i < targetLength; i++) {
1297 2588575 : unsigned char c = static_cast<unsigned char>(targetOriginal[i]);
1298 2588575 : if (!inAlphabet[c]) {
1299 1125 : inAlphabet[c] = true;
1300 1125 : letterIdx[c] = static_cast<unsigned char>(alphabet.size());
1301 1125 : alphabet += targetOriginal[i];
1302 : }
1303 2588575 : (*targetTransformed)[i] = letterIdx[c];
1304 : }
1305 :
1306 1000 : return alphabet;
1307 0 : }
1308 :
1309 :
1310 : class MemPoolHolder {
1311 : public:
1312 25 : MemPoolHolder() {
1313 25 : memory::pool::initialize();
1314 25 : }
1315 :
1316 25 : ~MemPoolHolder() {
1317 25 : if (pool) {
1318 25 : memory::pool::destroy(pool);
1319 : }
1320 25 : memory::pool::terminate();
1321 25 : }
1322 :
1323 500 : memory::pool_t * getPool() {
1324 500 : if (!pool) {
1325 25 : pool = memory::pool::create((memory::pool_t *)nullptr);
1326 : }
1327 500 : return pool;
1328 : }
1329 :
1330 : private:
1331 : memory::pool_t *pool = nullptr;
1332 : };
1333 :
1334 : thread_local MemPoolHolder tl_pool;
1335 :
1336 500 : static void edlibFreeAlignResult(EdlibAlignResult &result) {
1337 500 : if (result.endLocations) free(result.endLocations);
1338 500 : if (result.startLocations) free(result.startLocations);
1339 500 : if (result.alignment) free(result.alignment);
1340 500 : }
1341 :
1342 500 : Distance::Distance(const StringView &origin, const StringView &canonical, size_t maxDistance) {
1343 500 : auto orig = memory::pool::acquire();
1344 :
1345 500 : auto pool = tl_pool.getPool();
1346 500 : memory::pool::push(pool);
1347 :
1348 : EdlibAlignConfig cfg;
1349 500 : cfg.k = (maxDistance == maxOf<size_t>()) ? -1 : int(maxDistance);
1350 500 : cfg.mode = EDLIB_MODE_NW;
1351 500 : cfg.task = EDLIB_TASK_PATH;
1352 :
1353 500 : auto res = edlibAlign(origin.data(), origin.size(), canonical.data(), canonical.size(), cfg);
1354 :
1355 500 : memory::pool::push(orig);
1356 500 : _distance = res.editDistance;
1357 500 : _storage.reserve(res.alignmentLength);
1358 2626700 : for (int i = 0; i < res.alignmentLength; ++ i) {
1359 2626200 : _storage.emplace_back(Value(res.alignment[i]));
1360 : }
1361 500 : memory::pool::pop();
1362 :
1363 500 : edlibFreeAlignResult(res);
1364 :
1365 500 : memory::pool::clear(pool);
1366 500 : memory::pool::pop();
1367 500 : }
1368 :
1369 : }
|