libpappsomspp
Library for mass spectrometry
Loading...
Searching...
No Matches
semiglobalalignment.cpp
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/specpeptidoms/semiglobalalignment.cpp
3 * \date 24/03/2025
4 * \author Aurélien Berthier
5 * \brief protein to spectrum alignment
6 *
7 * C++ implementation of the SpecPeptidOMS algorithm described in :
8 * (1) Benoist, É.; Jean, G.; Rogniaux, H.; Fertin, G.; Tessier, D. SpecPeptidOMS Directly and
9 * Rapidly Aligns Mass Spectra on Whole Proteomes and Identifies Peptides That Are Not Necessarily
10 * Tryptic: Implications for Peptidomics. J. Proteome Res. 2025.
11 * https://doi.org/10.1021/acs.jproteome.4c00870.
12 */
13
14/*
15 * Copyright (c) 2025 Aurélien Berthier
16 * <aurelien.berthier@ls2n.fr>
17 *
18 *
19 * This program is free software: you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation, either version 3 of the License, or
22 * (at your option) any later version.
23 *
24 * This program is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU General Public License for more details.
28 *
29 * You should have received a copy of the GNU General Public License
30 * along with this program. If not, see <http://www.gnu.org/licenses/>.
31 */
32
33#include <QString>
34#include <QStringRef>
35#include <algorithm>
36#include <qtypes.h>
37#include "semiglobalalignment.h"
40#include "correctiontree.h"
43
44void
46{
47 peaks.clear();
48 m_peptideModel.reset();
49 score = 0;
50 begin_shift = 0.0;
51 end_shift = 0.0;
52 shifts.clear();
53 SPC = 0;
54 beginning = 0;
55 end = 0;
56}
57
58
59QString
60pappso::specpeptidoms::Alignment::getPeptideString(const QString &protein_sequence) const
61{
62 return protein_sequence.mid(beginning, end - beginning);
63}
64
65double
67{
68 double sum_of_elems = std::accumulate(shifts.begin(), shifts.end(), 0);
69 return begin_shift + sum_of_elems + end_shift;
70}
71
72
73std::size_t
78
79
81 const ScoreValues &score_values,
82 const pappso::PrecisionPtr precision_ptr,
83 const pappso::AaCode &aaCode)
84 : m_scorevalues(score_values), m_aaCode(aaCode)
85{
86 m_precision_ptr = precision_ptr;
87
88 KeyCell key_cell_init_first;
89 key_cell_init_first.n_row = 0;
90 key_cell_init_first.score = 0;
91 key_cell_init_first.beginning = 0;
92 key_cell_init_first.tree_id = 0;
93 m_interest_cells.push_back(key_cell_init_first);
94}
95
96const std::vector<pappso::specpeptidoms::KeyCell> &
101
102void
104 const SpOMSProtein *protein_ptr)
105{
106 std::size_t sequence_length = protein_ptr->size();
107
108 initFastAlign(spectrum);
109
110 for(std::size_t row_number = 1; row_number <= sequence_length; row_number++)
111 {
112 updateAlignmentMatrix(*protein_ptr,
113 row_number,
114 spectrum.getAaPositions(protein_ptr->at(row_number - 1).code),
115 spectrum,
116 true,
117 protein_ptr);
118 }
119}
120
121void
123{
124 // m_scenario.clear();
125 // TODO don't forget to reset any important variable
126 // m_best_alignment.reset();
127 // m_best_corrected_alignment.reset();
128 // m_best_post_processed_alignment.reset();
129
130 KeyCell key_cell_init;
131 key_cell_init.n_row = 0;
132 key_cell_init.score = m_scorevalues.get(ScoreType::init);
133 key_cell_init.beginning = 0;
134 key_cell_init.tree_id = 0;
135
136 m_interest_cells.resize(spectrum.size());
137 std::fill(m_interest_cells.begin(), m_interest_cells.end(), key_cell_init);
138
139 m_interest_cells.at(0).score = 0;
140
141 // m_location_saver.resetLocationSaver();
142 m_updated_cells.clear();
143}
144
145void
147 std::size_t length2)
148{
149 m_scenario.reserve(length2 + 1, spectrum.size());
150 m_interest_cells.reserve(spectrum.size());
151 m_interest_cells.at(0).n_row = 0;
152 m_interest_cells.at(0).score = 0;
153 m_interest_cells.at(0).beginning = 0;
154 m_interest_cells.at(0).tree_id = 0;
155 for(std::size_t i = 1; i < m_interest_cells.size(); i++)
156 {
157 m_interest_cells.at(i).n_row = 0;
159 m_interest_cells.at(i).beginning = 0;
160 m_interest_cells.at(i).tree_id = 0;
161 }
162 for(std::size_t iter = m_interest_cells.size(); iter < spectrum.size(); iter++)
163 {
164 m_interest_cells.push_back({0, m_scorevalues.get(ScoreType::init), 0, 0});
165 }
166 m_scenario.resetScenario();
167}
168
169void
171 const SpOMSProtein *protein_ptr,
172 const std::size_t beginning,
173 const std::size_t length)
174{
175 try
176 {
177 qDebug();
178 const QString &protein_seq = protein_ptr->getSequence();
179 std::size_t length2;
180 if((qsizetype)(beginning + length) <= protein_seq.size())
181 {
182 length2 = length;
183 }
184 else
185 {
186 length2 = protein_seq.size() - beginning;
187 }
188
189 qDebug();
190 QString sequence_str = protein_seq.sliced(protein_seq.size() - beginning - length2, length2);
191
192 SpOMSProtein sequence("sub_sequence", sequence_str, m_aaCode);
193
194 // std::reverse(sequence.begin(), sequence.end());
195 std::vector<AaPosition> aa_positions;
196 CorrectionTree correction_tree;
197
198 initpreciseAlign(spectrum, length2);
199
200 for(std::size_t row_number = 1; row_number <= length2; row_number++)
201 {
202
203 qDebug() << "row_number - 1=" << row_number - 1 << " sequence.size()=" << sequence.size();
204 // aa = Aa(sequence[row_number - 1].unicode());
205 updateAlignmentMatrix(sequence,
206 row_number,
207 spectrum.getAaPositions(sequence[row_number - 1].code),
208 spectrum,
209 false,
210 protein_ptr);
211 }
212 qDebug();
213 saveBestAlignment(sequence, spectrum, protein_seq.size() - beginning);
214
215 qDebug() << m_scenario.getBestScore() << " " << MIN_ALIGNMENT_SCORE;
216 // Correction : if complementary peaks are used, corrected spectra without one of the two
217 // peaks are generated and aligned. The best alignment is kept.
218 if(m_scenario.getBestScore() >
219 MIN_ALIGNMENT_SCORE) // We only correct alignments with acceptable scores
220 {
221 // we only correct alignment if the sequence has a minimum amino acid diversity
222 if(checkSequenceDiversity(sequence.getSequence(), 5, 2))
223 {
224
225 qDebug();
227 for(std::size_t iter : m_best_alignment.peaks)
228 {
229 if(iter > spectrum.getComplementaryPeak(iter))
230 {
231 break;
232 }
233 else if(std::find(m_best_alignment.peaks.begin(),
234 m_best_alignment.peaks.end(),
235 spectrum.getComplementaryPeak(iter)) !=
236 m_best_alignment.peaks.end())
237 {
238 correction_tree.addPeaks(iter, spectrum.getComplementaryPeak(iter));
239 }
240 }
241 qDebug();
242 std::vector<std::vector<std::size_t>> corrections = correction_tree.getPeaks();
243 if(corrections.size() > 0)
244 {
245 m_best_alignment.score =
246 0; // Reset the best alignment score (we dont want to keep
247 // the original alignment if corrections are needed)
248 qDebug();
249 for(auto peaks_to_remove : corrections)
250 {
251 qDebug();
252 correctAlign(sequence,
253 protein_ptr,
254 spectrum,
255 peaks_to_remove,
256 protein_seq.size() - beginning);
257 qDebug();
258 }
259 qDebug();
261 }
262 }
263 }
264 else
265 {
266 // this sequence has too much redundancy
267 // we have to lower the score
268 m_best_alignment.score = 0;
269 }
270 qDebug();
271 }
272 catch(const pappso::PappsoException &err)
273 {
275 QObject::tr("SemiGlobalAlignment::preciseAlign failed :\n%1").arg(err.qwhat()));
276 }
277}
278
279void
281 const SpOMSProtein *protein_ptr,
282 const SpOMSSpectrum &spectrum,
283 std::vector<std::size_t> &peaks_to_remove,
284 std::size_t offset)
285{
286 std::vector<AaPosition> aa_positions;
287 CorrectionTree correction_tree;
288 std::vector<std::size_t> final_peaks_to_remove;
289
290 KeyCell key_cell_init;
291 key_cell_init.beginning = 0;
292 key_cell_init.n_row = 0;
293 key_cell_init.score = m_scorevalues.get(ScoreType::init);
294 key_cell_init.tree_id = 0;
295
296 std::fill(m_interest_cells.begin(), m_interest_cells.end(), key_cell_init);
297
298 m_interest_cells.at(0).score = 0;
299
300 m_scenario.resetScenario();
301 qDebug();
302 for(qsizetype row_number = 1; row_number <= (qsizetype)sequence.size(); row_number++)
303 {
304 qDebug() << row_number - 1 << " " << sequence.size();
305 qDebug() << "sequence[row_number - 1].aa" << (char)sequence[row_number - 1].aa;
306 qDebug();
307 aa_positions = spectrum.getAaPositions(sequence[row_number - 1].code, peaks_to_remove);
308 qDebug();
309 updateAlignmentMatrix(sequence, row_number, aa_positions, spectrum, false, protein_ptr);
310 qDebug();
311 }
312
313 qDebug();
314 // Correction : if complementary peaks are used, corrected spectra without one of the two peaks
315 // are generated and aligned. The best alignment is kept.
316 qDebug() << m_scenario.getBestScore();
317 if(m_scenario.getBestScore() >
318 MIN_ALIGNMENT_SCORE) // We only correct alignments with acceptable scores
319 {
320 qDebug();
321 qDebug() << sequence.getSequence();
322 qDebug() << offset;
323 qDebug() << spectrum.getPrecursorCharge();
324 saveBestAlignment(sequence, spectrum, offset);
325 qDebug();
326 for(std::size_t iter : m_best_alignment.peaks)
327 {
328 qDebug() << "iter:" << iter << "comp:" << spectrum.getComplementaryPeak(iter);
329 if(iter == spectrum.getComplementaryPeak(iter))
330 {
331 continue;
332 }
333 else if(iter > spectrum.getComplementaryPeak(iter))
334 {
335 break;
336 }
337 else if(std::find(m_best_alignment.peaks.begin(),
338 m_best_alignment.peaks.end(),
339 spectrum.getComplementaryPeak(iter)) != m_best_alignment.peaks.end())
340 {
341 correction_tree.addPeaks(iter, spectrum.getComplementaryPeak(iter));
342 }
343 }
344 std::vector<std::vector<std::size_t>> corrections = correction_tree.getPeaks();
345 if(corrections.size() > 0)
346 {
347 for(auto new_peaks_to_remove : corrections)
348 {
349 final_peaks_to_remove = std::vector<std::size_t>(new_peaks_to_remove);
350 final_peaks_to_remove.insert(
351 final_peaks_to_remove.end(), peaks_to_remove.begin(), peaks_to_remove.end());
352 correctAlign(sequence, protein_ptr, spectrum, final_peaks_to_remove, offset);
353 }
354 }
355 else if(m_scenario.getBestScore() > m_best_corrected_alignment.score)
356 {
358 }
359 }
360 qDebug();
361}
362
363void
365 const SpOMSProtein *protein_ptr,
366 std::size_t beginning,
367 std::size_t length,
368 const std::vector<double> &shifts)
369{
370 std::size_t current_SPC = m_best_alignment.SPC;
371 int current_best_score = m_best_alignment.score;
373 for(double precursor_mass_error : shifts)
374 {
375 SpOMSSpectrum corrected_spectrum(spectrum, precursor_mass_error);
376 preciseAlign(corrected_spectrum, protein_ptr, beginning, length);
378 {
380 }
381 }
382 if(m_best_post_processed_alignment.SPC > current_SPC &&
383 m_best_post_processed_alignment.score >= current_best_score)
384 {
386 }
387}
388
389void
392 const std::size_t row_number,
393 const std::vector<AaPosition> &aa_positions,
394 const SpOMSSpectrum &spectrum,
395 const bool fast_align,
396 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
397{
398 int where = 0;
399 try
400 {
401 int score_found, score_shift, best_score, alt_score, tree_id;
402 uint32_t condition = 0;
403 std::size_t best_column, shift, beginning, missing_aas, length, perfect_shift_origin;
404 KeyCell *current_cell_ptr, *tested_cell_ptr;
405 AlignType alignment_type, temp_align_type;
406
407 double smallest_aa_mass = m_aaCode.getMass((std::uint8_t)1);
408
409 m_updated_cells.reserve(aa_positions.size());
410 where = 1;
411 // Computation of the threePeaks condition, see spomsspectrum.h for more details.
412 if(fast_align)
413 {
414 condition = 3;
415 if(row_number > 1)
416 {
417 qDebug() << (char)sequence.at(row_number - 2).aa;
418 qDebug() << "condition" << condition << "aa" << (char)sequence.at(row_number - 2).aa
419 << sequence.at(row_number - 2).code;
420 condition += 2 << sequence.at(row_number - 2).code;
421 qDebug();
422 qDebug() << "condition" << condition;
423 }
424 }
425 where = 2;
426 for(std::vector<AaPosition>::const_iterator aa_position = aa_positions.begin();
427 aa_position != aa_positions.end();
428 aa_position++)
429 {
430 qDebug() << "l_peak" << aa_position->l_peak << "r_peak" << aa_position->r_peak << "l_mass"
431 << aa_position->l_mass << "l_support" << aa_position->l_support << "condition"
432 << aa_position->condition;
433
434 where = 3;
435 if(((condition & aa_position->condition) != 0) ||
436 !fast_align) // Verification of the threePeaks condition (only during first alignment).
437 {
438 if(fast_align)
439 {
440 qDebug() << "threePeaks condition verified";
441 }
442
443 current_cell_ptr = &m_interest_cells.at(aa_position->r_peak);
444 if(spectrum.peakType(aa_position->r_peak) ==
445 specglob::ExperimentalSpectrumDataPointType::both) // The score used are different
446 // depending on the right peak
447 // type (simple or double).
448 {
449 qDebug() << "double peak";
450 score_found = m_scorevalues.get(ScoreType::foundDouble);
452 }
453 else
454 {
455 qDebug() << "single peak";
456 score_found = m_scorevalues.get(ScoreType::found);
457 score_shift = m_scorevalues.get(ScoreType::foundShift);
458 }
459
460 // not found case (always computed)
461 best_column = aa_position->r_peak;
462 best_score = current_cell_ptr->score + (row_number - current_cell_ptr->n_row) *
464 beginning = current_cell_ptr->beginning;
465 tree_id = current_cell_ptr->tree_id;
466 alignment_type = AlignType::notFound;
467 qDebug() << "not found" << best_score;
468
469 // found case (Can only happen if the left peak is supported)
470 if(aa_position->l_support)
471 {
472 tested_cell_ptr = &m_interest_cells.at(aa_position->l_peak);
473 if(aa_position->l_peak == 0)
474 {
475 alt_score = tested_cell_ptr->score + score_found;
476 }
477 else
478 {
479 if(tested_cell_ptr->n_row == row_number - 1)
480 {
481 alt_score = tested_cell_ptr->score +
482 (row_number - tested_cell_ptr->n_row - 1) *
484 score_found;
485 }
486 else
487 {
488 alt_score = tested_cell_ptr->score +
489 (row_number - tested_cell_ptr->n_row - 1) *
491 score_shift;
492 }
493 }
494 if(alt_score >= best_score) // If the found case improves the score, the new value
495 // of the key cell is computed.
496 {
497 alignment_type = AlignType::found;
498 best_score = alt_score;
499 best_column = aa_position->l_peak;
500 if(best_column == 0)
501 {
502 if(row_number < ALIGNMENT_SURPLUS)
503 {
504 beginning = 0;
505 }
506 else
507 {
508 beginning = std::max((std::size_t)(row_number - ALIGNMENT_SURPLUS),
509 (std::size_t)0);
510 }
511 if(fast_align)
512 {
513 tree_id = m_location_saver.getNextTree();
514 }
515 }
516 else
517 {
518 beginning = tested_cell_ptr->beginning;
519 tree_id = tested_cell_ptr->tree_id;
520 }
521 qDebug() << "found" << best_score << "from" << best_column << beginning
522 << tree_id;
523 }
524 }
525
526 where = 4;
527 // generic shift case (all shifts are tested)
528 if(aa_position->l_support)
529 {
530 shift = 1;
531 }
532 else
533 {
534 shift = 0;
535 }
536 while(shift < aa_position->l_peak)
537 {
538 tested_cell_ptr = &m_interest_cells.at(aa_position->l_peak - shift);
539 // verification saut parfait
540 if(perfectShiftPossible(sequence,
541 spectrum,
542 tested_cell_ptr->n_row,
543 row_number,
544 aa_position->l_peak - shift,
545 aa_position->r_peak) &&
547 {
548 alt_score = tested_cell_ptr->score +
549 (row_number - tested_cell_ptr->n_row - 1) *
551 score_found;
552 temp_align_type = AlignType::perfectShift;
553 }
554 else
555 {
556 alt_score = tested_cell_ptr->score +
557 (row_number - tested_cell_ptr->n_row - 1) *
559 score_shift;
560 temp_align_type = AlignType::shift;
561 }
562 if(alt_score > best_score)
563 {
564 alignment_type = temp_align_type;
565 best_score = alt_score;
566 best_column = aa_position->l_peak - shift;
567 beginning = tested_cell_ptr->beginning;
568 tree_id = tested_cell_ptr->tree_id;
569 qDebug() << "shift" << best_score << "from" << best_column;
570 }
571 shift++;
572 }
573
574 where = 5;
575 // case shift from column 0 (no penalties if all precedent amino acids are missed)
576 tested_cell_ptr = &m_interest_cells.at(0);
577 // verification saut parfait
578 if(aa_position->r_peak <= TOL_PEAKS_MISSING_FIRST_COLUMN)
579 {
580 perfect_shift_origin =
581 perfectShiftPossibleFrom0(sequence, spectrum, row_number, aa_position->r_peak);
582 }
583 else
584 {
585 perfect_shift_origin = row_number;
586 }
587
588 if(perfect_shift_origin != row_number)
589 {
590 alt_score = tested_cell_ptr->score + score_found;
591 temp_align_type = AlignType::perfectShift;
592 }
593 else
594 {
595 alt_score = tested_cell_ptr->score + score_shift;
596 temp_align_type = AlignType::shift;
597 }
598
599 where = 6;
600 if(alt_score > best_score)
601 {
602 qDebug() << "shift" << alt_score << "from 0";
603 alignment_type = temp_align_type;
604 best_score = alt_score;
605 best_column = 0;
606 missing_aas =
607 std::floor((aa_position->l_mass -
609 smallest_aa_mass);
610 qDebug() << "missing aas" << missing_aas;
611 if(row_number < ALIGNMENT_SURPLUS + missing_aas)
612 {
613 beginning = 0;
614 }
615 else
616 {
617 beginning =
618 std::max((std::size_t)(row_number - missing_aas - ALIGNMENT_SURPLUS),
619 (std::size_t)0);
620 }
621 where = 7;
622 if(fast_align)
623 {
624 tree_id = m_location_saver.getNextTree();
625 }
626 }
627
628 where = 8;
629 if(best_column != aa_position->r_peak)
630 {
631 m_updated_cells.push_back(
632 {aa_position->r_peak, {row_number, best_score, beginning, tree_id}});
633 }
634
635 where = 9;
636 if(best_score > m_location_saver.getMinScore(tree_id) && fast_align)
637 {
638 length =
639 row_number - beginning + 1 +
640 std::ceil(spectrum.getMissingMass(aa_position->r_peak) / smallest_aa_mass) +
642 where = 10;
643 m_location_saver.addLocation(beginning, length, tree_id, best_score, protein_ptr);
644 }
645 else if(!fast_align)
646 {
647
648 where = 11;
649 if(alignment_type == AlignType::perfectShift && best_column == 0)
650 {
651 m_scenario.saveOrigin(row_number,
652 aa_position->r_peak,
653 perfect_shift_origin,
654 0,
655 best_score,
657 }
658 else
659 {
660 m_scenario.saveOrigin(row_number,
661 aa_position->r_peak,
662 m_interest_cells.at(best_column).n_row,
663 best_column,
664 best_score,
665 alignment_type);
666 }
667 }
668 }
669 }
670
671 where = 30;
672 // Update row number in column 0
673 m_updated_cells.push_back({0, {row_number, 0, 0, 0}});
674
675 // Save updated key cells in the matrix
676 while(m_updated_cells.size() > 0)
677 {
678 qDebug() << m_interest_cells.size() << " " << m_updated_cells.back().first
679 << m_updated_cells.back().second.n_row << m_updated_cells.back().second.score
680 << m_updated_cells.back().second.beginning
681 << m_updated_cells.back().second.tree_id;
682 m_interest_cells.at(m_updated_cells.back().first) = m_updated_cells.back().second;
683 m_updated_cells.pop_back();
684 }
685 where++;
686 }
687 catch(const pappso::PappsoException &err)
688 {
690 QObject::tr("updateAlignmentMatrix failed :\n%1").arg(err.qwhat()));
691 }
692 catch(const std::exception &error)
693 {
695 QObject::tr("updateAlignmentMatrix failed std::exception :\n%1 %2")
696 .arg(where)
697 .arg(error.what()));
698 }
699}
700
701bool
704 const SpOMSSpectrum &spectrum,
705 const std::size_t origin_row,
706 const std::size_t current_row,
707 const std::size_t l_peak,
708 const std::size_t r_peak) const
709{
710 try
711 {
712 double missing_mass = 0;
713 auto it_end = sequence.begin() + current_row;
714 for(auto iter = sequence.begin() + origin_row; (iter != it_end) && (iter != sequence.end());
715 iter++)
716 {
717 missing_mass += iter->mass; // Aa(iter->unicode()).getMass();
718 }
719 if(missing_mass > 0)
720 {
721 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
722 return mz_range.contains(spectrum.getMZShift(l_peak, r_peak));
723 }
724 else
725 {
726 return false;
727 }
728 }
729 catch(const pappso::PappsoException &err)
730 {
732 QObject::tr("perfectShiftPossible failed :\n%1").arg(err.qwhat()));
733 }
734 catch(const std::exception &error)
735 {
737 QObject::tr("perfectShiftPossible failed std exception:\n%1").arg(error.what()));
738 }
739}
740
741std::size_t
744 const SpOMSSpectrum &spectrum,
745 const std::size_t current_row,
746 const std::size_t r_peak) const
747{
748 std::size_t perfect_shift_origin = current_row;
749 double missing_mass = spectrum.getMZShift(0, r_peak);
750 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
751 double aa_mass = 0;
752 while(aa_mass < missing_mass && perfect_shift_origin > 0 && !mz_range.contains(aa_mass))
753 {
754 aa_mass += sequence.at(perfect_shift_origin - 1)
755 .mass; // Aa(sequence.at(perfect_shift_origin - 1).unicode()).getMass();
756 perfect_shift_origin--;
757 }
758 if(mz_range.contains(aa_mass))
759 {
760 return perfect_shift_origin;
761 }
762 else
763 {
764 return current_row;
765 }
766}
767
768std::size_t
771 const SpOMSSpectrum &spectrum,
772 std::size_t end_row,
773 std::size_t end_peak) const
774{
775 try
776 {
777 std::size_t perfect_shift_end = end_row + 1;
778 double missing_mass = spectrum.getMissingMass(end_peak);
779 pappso::MzRange mz_range(missing_mass, m_precision_ptr);
780 double aa_mass = 0;
781 while(aa_mass < missing_mass && perfect_shift_end < (std::size_t)sequence.size() &&
782 !mz_range.contains(aa_mass))
783 {
784 aa_mass += sequence.at(perfect_shift_end - 1)
785 .mass; // Aa(sequence.at(perfect_shift_end - 1).unicode()).getMass();
786 perfect_shift_end++;
787 }
788 if(mz_range.contains(aa_mass))
789 {
790 return perfect_shift_end - 1;
791 }
792 else
793 {
794 return end_row;
795 }
796 }
797 catch(const pappso::PappsoException &err)
798 {
800 QObject::tr("perfectShiftPossibleEnd failed :\n%1").arg(err.qwhat()));
801 }
802}
803
807
813
819
820void
823 const SpOMSSpectrum &spectrum,
824 std::size_t offset)
825{
826 qDebug();
827 m_best_alignment.m_peptideModel.reset();
828 m_best_alignment.peaks.clear();
829 m_best_alignment.shifts.clear();
830 std::size_t previous_row = 0;
831 std::size_t previous_column = 0;
832 std::size_t perfect_shift_end;
833 std::pair<std::vector<ScenarioCell>, int> best_alignment = m_scenario.getBestAlignment();
834 m_best_alignment.score = best_alignment.second;
835 std::vector<SpOMSAa> skipped_aa;
836 double skipped_mass;
837 // Retrieving beginning and end
838 if(best_alignment.first.front().previous_row > offset)
839 {
841 QString("best_alignment.first.front().previous_row > offset %1 %2")
842 .arg(offset)
843 .arg(best_alignment.first.front().previous_row));
844 }
845 if(best_alignment.first.back().previous_row > offset)
846 {
848 QString("best_alignment.first.back().previous_row > offset %1 %2")
849 .arg(offset)
850 .arg(best_alignment.first.back().previous_row));
851 }
852 m_best_alignment.beginning = offset - best_alignment.first.front().previous_row;
853 m_best_alignment.end = offset - best_alignment.first.back().previous_row - 1;
854
855 qDebug();
856 AminoAcidModel aa_model;
857 aa_model.m_massDifference = 0;
858 // Filling temp_interpretation and peaks vectors
859 for(auto cell : best_alignment.first)
860 {
861 switch(cell.alignment_type)
862 {
863 case AlignType::found:
864 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
865 aa_model.m_massDifference = 0;
866 aa_model.m_skipped = false;
867 m_best_alignment.m_peptideModel.push_back(aa_model);
868 if(previous_row > cell.previous_row + 1)
869 {
870 skipped_mass = sequence.at(previous_row - 1)
871 .mass; // Aa(sequence.at(previous_row - 1).unicode()).getMass();
872 skipped_aa =
873 sequence.sliced(cell.previous_row, previous_row - cell.previous_row - 1);
874 aa_model.m_massDifference = 0;
875 aa_model.m_skipped = true;
876 for(auto aa : skipped_aa)
877 {
878 aa_model.m_aminoAcid = aa.aa;
879 m_best_alignment.m_peptideModel.push_back(aa_model);
880 skipped_mass += aa.mass; // Aa(aa.unicode()).getMass();
881 }
882 m_best_alignment.m_peptideModel.back().m_massDifference =
883 spectrum.getMZShift(cell.previous_column, previous_column) - skipped_mass;
884 }
885 m_best_alignment.peaks.push_back(cell.previous_column);
886 break;
888 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
889 aa_model.m_massDifference = 0;
890 aa_model.m_skipped = true;
891 m_best_alignment.m_peptideModel.push_back(aa_model);
892 break;
893 case AlignType::shift:
894
895 aa_model.m_aminoAcid = sequence.at(previous_row - 1).aa;
896 aa_model.m_massDifference = spectrum.getMZShift(cell.previous_column, previous_column) -
897 aa_model.m_aminoAcid.getMass();
898 aa_model.m_skipped = false;
899 m_best_alignment.m_peptideModel.push_back(aa_model);
900 m_best_alignment.peaks.push_back(cell.previous_column);
901 m_best_alignment.shifts.push_back(
902 spectrum.getMZShift(cell.previous_column, previous_column) -
903 sequence.at(previous_row - 1).mass);
904 break;
906 m_best_alignment.peaks.push_back(cell.previous_column);
907 skipped_aa = sequence.sliced(cell.previous_row, previous_row - cell.previous_row);
908 std::reverse(skipped_aa.begin(), skipped_aa.end());
909 aa_model.m_massDifference = 0;
910 aa_model.m_skipped = false;
911 for(auto aa : skipped_aa)
912 {
913 aa_model.m_aminoAcid = aa.aa;
914 m_best_alignment.m_peptideModel.push_back(aa_model);
915 }
916 break;
917 case AlignType::init:
918 previous_row = cell.previous_row;
919 previous_column = cell.previous_column;
920 m_best_alignment.peaks.push_back(cell.previous_column);
921 break;
922 }
923 previous_row = cell.previous_row;
924 previous_column = cell.previous_column;
925 }
926 std::reverse(m_best_alignment.peaks.begin(), m_best_alignment.peaks.end());
927 std::reverse(m_best_alignment.m_peptideModel.begin(), m_best_alignment.m_peptideModel.end());
928
929 qDebug();
930 // Compute begin_shift and end_shift
931 MzRange zero(0, m_precision_ptr);
932 m_best_alignment.begin_shift = spectrum.getMZShift(0, m_best_alignment.peaks.front());
933 m_best_alignment.end_shift = spectrum.getMissingMass(m_best_alignment.peaks.back());
934 if(zero.contains(m_best_alignment.end_shift))
935 {
936 m_best_alignment.end_shift = 0;
937 }
938
939 qDebug();
940 // Computing SPC
941 m_best_alignment.SPC = 0;
942 for(auto peak : m_best_alignment.peaks)
943 {
944 switch(spectrum.at(peak).type)
945 {
947 qDebug() << peak << "native";
948 m_best_alignment.SPC += 1;
949 break;
951 qDebug() << peak << "both";
952 m_best_alignment.SPC += 2;
953 break;
955 qDebug() << peak << "synthetic";
956 break;
958 qDebug() << peak << "symmetric";
959 m_best_alignment.SPC += 1;
960 break;
961 }
962 }
963
964 qDebug();
965 // Final check of the end shift
966 if(m_best_alignment.end_shift > 0)
967 {
968 perfect_shift_end = perfectShiftPossibleEnd(sequence,
969 spectrum,
970 best_alignment.first.front().previous_row,
971 m_best_alignment.peaks.back());
972 if(perfect_shift_end != best_alignment.first.front().previous_row)
973 {
974 skipped_aa =
975 sequence.sliced(best_alignment.first.front().previous_row,
976 perfect_shift_end - best_alignment.first.front().previous_row);
977 aa_model.m_massDifference = 0;
978 aa_model.m_skipped = true;
979 for(auto aa = skipped_aa.begin(); aa != skipped_aa.end(); aa++)
980 {
981 aa_model.m_aminoAcid = aa->aa;
982 m_best_alignment.m_peptideModel.push_back(aa_model);
983 }
984 m_best_alignment.beginning = offset - perfect_shift_end;
985 m_best_alignment.end_shift = 0;
986 }
987 else
988 {
990 }
991 }
992
993 qDebug();
994 // Writing final interpretation
995 if(m_best_alignment.end_shift > 0)
996 {
997 m_best_alignment.m_peptideModel.setNterShift(m_best_alignment.end_shift);
998 }
999
1000 std::reverse(m_best_alignment.m_peptideModel.begin(), m_best_alignment.m_peptideModel.end());
1001 if(m_best_alignment.begin_shift > 0)
1002 {
1003 m_best_alignment.m_peptideModel.setCterShift(m_best_alignment.begin_shift);
1004 }
1005
1006 m_best_alignment.m_peptideModel.setPrecursorMass(spectrum.getPrecursorMass());
1007 qDebug();
1008}
1009
1015
1016std::vector<double>
1018 const Alignment &alignment,
1019 const QString &protein_seq)
1020{
1021 // qDebug() << protein_seq;
1022 if(alignment.end > (std::size_t)protein_seq.size())
1023 {
1024 throw pappso::ExceptionOutOfRange(QString("alignment.end > protein_seq.size() %1 %2")
1025 .arg(alignment.end)
1026 .arg(protein_seq.size()));
1027 }
1028 std::vector<double> potential_mass_errors(alignment.shifts);
1029 double shift = alignment.end_shift;
1030 std::size_t index;
1031 if(alignment.beginning > 0)
1032 { // -1 on unsigned int makes it wrong
1033 index = alignment.beginning - 1;
1034 while(shift > 0 && index > 0)
1035 {
1036 potential_mass_errors.push_back(shift);
1037 // qDebug() << " shift=" << shift << " index=" << index
1038 // << " letter=" << protein_seq.at(index).toLatin1();
1039 shift -= aa_code.getMass(
1040 protein_seq.at(index).toLatin1()); // Aa(protein_seq.at(index).unicode()).getMass();
1041 index--;
1042 }
1043 }
1044
1045 // qDebug() << "second";
1046 shift = alignment.begin_shift;
1047 index = alignment.end + 1;
1048 while(shift > 0 && index < (std::size_t)protein_seq.size())
1049 {
1050 potential_mass_errors.push_back(shift);
1051 qDebug() << " shift=" << shift << " index=" << index
1052 << " letter=" << protein_seq.at(index).toLatin1();
1053 shift -= aa_code.getMass(
1054 protein_seq.at(index).toLatin1()); // Aa(protein_seq.at(index).unicode()).getMass();
1055 index++;
1056 }
1057 // qDebug();
1058 return potential_mass_errors;
1059}
1060
1061bool
1063 std::size_t window,
1064 std::size_t minimum_aa_diversity)
1065{
1066 qDebug() << "sequence=" << sequence << " window=" << window
1067 << " minimum_aa_diversity=" << minimum_aa_diversity;
1068 if(sequence.size() < (qsizetype)window)
1069 return false;
1070 auto it_begin = sequence.begin();
1071 auto it_end = sequence.begin() + window;
1072 QString window_copy(sequence.mid(0, window));
1073 while(it_end != sequence.end())
1074 {
1075 std::partial_sort_copy(it_begin, it_end, window_copy.begin(), window_copy.end());
1076
1077 qDebug() << window_copy;
1078 std::size_t uniqueCount =
1079 std::unique(window_copy.begin(), window_copy.end()) - window_copy.begin();
1080
1081 qDebug() << uniqueCount;
1082 if(uniqueCount < minimum_aa_diversity)
1083 return false;
1084 it_begin++;
1085 it_end++;
1086 }
1087 return true;
1088}
1089
1090const std::vector<pappso::specpeptidoms::KeyCell> &
1093 const std::size_t row_number,
1094 const std::vector<AaPosition> &aa_positions,
1095 const SpOMSSpectrum &spectrum,
1096 const bool fast_align,
1097 const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
1098{
1099 updateAlignmentMatrix(sequence, row_number, aa_positions, spectrum, fast_align, protein_ptr);
1100 return getInterestCells();
1101}
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by there mass (...
Definition aacode.h:44
double getMass(uint8_t aa_code) const
get the mass of the amino acid given its integer code the amino acid can bear some modification (if a...
Definition aacode.cpp:224
pappso_double getMass() const override
Definition aa.cpp:90
bool contains(pappso_double) const
Definition mzrange.cpp:115
virtual const QString & qwhat() const
std::vector< std::vector< std::size_t > > getPeaks() const
void addPeaks(std::size_t peak1, std::size_t peak2)
void initpreciseAlign(const SpOMSSpectrum &spectrum, std::size_t length2)
function made for testing the preciseAlign process, initiate the variables for alignment
const Alignment & getBestAlignment() const
Returns a const ref to m_best_alignment.
Scenario getScenario() const
Returns a copy of m_scenario.
std::size_t perfectShiftPossibleEnd(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t end_row, std::size_t end_peak) const
indicates if a perfect shift is possible between the provided positions
void updateAlignmentMatrix(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
updates the scores of the alignment matrix for a given amino acid as well as the location heap/scenar...
void postProcessingAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
void preciseAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum.
void correctAlign(const SpOMSProtein &protein_subseq, const SpOMSProtein *protein_ptr, const SpOMSSpectrum &spectrum, std::vector< std::size_t > &peaks_to_remove, std::size_t offset)
Recursively performs the correction of the alignment.
const std::vector< KeyCell > & oneAlignStep(const pappso::specpeptidoms::SpOMSProtein &sequence, const std::size_t row_number, const std::vector< AaPosition > &aa_positions, const SpOMSSpectrum &spectrum, const bool fast_align, const pappso::specpeptidoms::SpOMSProtein *protein_ptr)
function made for testing the fastAlign process, process one line and return the alignment matrix
const std::vector< KeyCell > & getInterestCells() const
convenient function for degub purpose
bool perfectShiftPossible(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t origin_row, const std::size_t current_row, const std::size_t l_peak, const std::size_t r_peak) const
indicates if a perfect shift is possible between the provided positions
std::vector< std::pair< std::size_t, KeyCell > > m_updated_cells
void fastAlign(const SpOMSSpectrum &spectrum, const SpOMSProtein *protein_ptr)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
static bool checkSequenceDiversity(const QString &sequence, std::size_t window, std::size_t minimum_aa_diversity)
check that the sequence has a minimum of amino acid checkSequenceDiversity
std::size_t perfectShiftPossibleFrom0(const pappso::specpeptidoms::SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, const std::size_t current_row, const std::size_t r_peak) const
indicates if a perfect shift is possible from the spectrum beginning to the provided peak....
void initFastAlign(const SpOMSSpectrum &spectrum)
function made for testing the fastAlign process, initiate the variables for alignment
static std::vector< double > getPotentialMassErrors(const pappso::AaCode &aa_code, const Alignment &alignment, const QString &protein_seq)
Returns a list of the potential mass errors corresponding to the provided alignment in the provided p...
void saveBestAlignment(const SpOMSProtein &sequence, const SpOMSSpectrum &spectrum, std::size_t offset)
Stores the best alignment from m_scenario in m_best_alignment.
SemiGlobalAlignment(const ScoreValues &score_values, const pappso::PrecisionPtr precision_ptr, const AaCode &aaCode)
LocationSaver getLocationSaver() const
Returns a copy of m_location_saver.
const QString & getSequence() const
std::vector< SpOMSAa > sliced(std::size_t position, std::size_t length) const
double getMZShift(std::size_t l_peak, std::size_t r_peak) const
Returns the mz difference between two peaks.
uint getPrecursorCharge() const
Returns the spectrum's precursor's charge.
double getMissingMass(std::size_t peak) const
Returns the missing mass between a peak and the precursor's mass (shift at the end).
std::size_t getComplementaryPeak(std::size_t peak) const
const std::vector< AaPosition > & getAaPositions(std::uint8_t aa_code) const
Returns the list of aa_positions for a given amino acid code.
specglob::ExperimentalSpectrumDataPointType peakType(std::size_t indice) const
Returns the type of one of the spectrum's peaks.
@ synthetic
does not correspond to existing peak, for computational purpose
Definition types.h:85
@ both
both, the ion and the complement exists in the original spectrum
Definition types.h:83
@ symmetric
new peak : computed symmetric mass from a corresponding native peak
Definition types.h:81
const uint ALIGNMENT_SURPLUS(5)
const int MIN_ALIGNMENT_SCORE(15)
const uint TOL_PEAKS_MISSING(4)
const uint TOL_PEAKS_MISSING_FIRST_COLUMN(5)
const pappso_double MHPLUS(1.007276466879)
const pappso_double MPROTIUM(1.007825032241)
const pappso_double MASSOXYGEN(15.99491461956)
const PrecisionBase * PrecisionPtr
Definition precision.h:122
void reset()
reinitialize to default score_values
QString getPeptideString(const QString &protein_sequence) const
convenient function to get peptide sequence from location
double getNonAlignedMass() const
convenient function to get the remaining non explained mass shift
std::vector< std::size_t > peaks
std::size_t getPositionStart() const
get position of start on the protein sequence