diff --git a/.gitignore b/.gitignore index 4a36fb52..2a64db69 100644 --- a/.gitignore +++ b/.gitignore @@ -5,12 +5,6 @@ MSToolkit/extern/expat-2.2.9/ # user-specific Claude Code settings .claude/settings.local.json -# ignore dynamically generated files -MSToolkit/include/expat.h -MSToolkit/include/expat_external.h -MSToolkit/include/zconf.h -MSToolkit/include/zlib.h -MSToolkit/*.mri .DS_Store .idea @@ -157,4 +151,4 @@ ipch/ *.msp # Artifact of CodeQL -_codeql_detected_source_root \ No newline at end of file +_codeql_detected_source_root diff --git a/CLAUDE.md b/CLAUDE.md index c7cf3bcf..6c5e0ae1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -217,3 +217,22 @@ Rules for Claude Code: A `.gitattributes` file at the repo root enforces CRLF for all tracked source files at the git level, providing a second safety net. + + +## Development Workflows + +### Code Review Protocol (Copilot Mode) +When requested to perform a code review, always execute the following multi-step workflow before writing your feedback: +1. **Tooling Check:** Run the project's respective testing commands to gather concrete diagnostic data. +2. **Analysis:** Review the uncommitted files, staged changes, or the specified branch diff. +3. **Report Generation:** Structure the review using the exact template below. + +## Code Review Template +Provide feedback using this exact format: +1. **Summary:** A 1-2 sentence overview of the changes. +2. **Critical Issues:** Bugs, security vulnerabilities, or breaking changes. Provide the file path, exact line numbers, and the core issue. +3. **Code Quality & Maintainability:** Poor practices, anti-patterns, or missing tests. +4. **Actionable Improvements:** Specific refactoring suggestions accompanied by concise code snippets. + +*Constraint:* Keep critiques technical, objective, and ranked by severity. Avoid generic praise. 
+ diff --git a/Comet.cpp b/Comet.cpp index 8740a2a9..c3b5e894 100644 --- a/Comet.cpp +++ b/Comet.cpp @@ -692,7 +692,7 @@ void LoadParameters(char* pszParamsFile, enzymeInformation.szSampleEnzymeBreakAA, enzymeInformation.szSampleEnzymeNoBreakAA); } - fgets(szParamBuf, SIZE_BUF, fp); + (void)fgets(szParamBuf, SIZE_BUF, fp); } fclose(fp); diff --git a/CometSearch/CometDataInternal.h b/CometSearch/CometDataInternal.h index 61d354ee..4eaa0fc9 100644 --- a/CometSearch/CometDataInternal.h +++ b/CometSearch/CometDataInternal.h @@ -12,1543 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Compatibility shim: existing .cpp files continue to include this header +// unchanged. All content has moved to the three focused headers below. +// New code should include the specific header it needs instead of this one. #ifndef _COMETDATAINTERNAL_H_ #define _COMETDATAINTERNAL_H_ -#include -#include -#include -#include "CometData.h" -#include "Threading.h" -#include "AScoreOptions.h" -#include "AScoreCentroid.h" -#include "AScoreAPI.h" -#include "AScoreFactory.h" -#include "AScoreDllInterface.h" - - -class CometSearchManager; - -#define PROTON_MASS 1.00727646688 -#define C13_DIFF 1.00335483 - -#define FLOAT_ZERO 1e-6 // 0.000001 - -#define MIN_PEPTIDE_LEN 1 // min # of AA for a petpide -#define MAX_PEPTIDE_LEN 51 // max # of AA for a peptide; one more than actual # to account for terminating char -#define MAX_PEPTIDE_LEN_P2 53 // max # of AA for a peptide plus 2 for N/C-term - -#define FRAGINDEX_MIN_IONS_SCORE 3 // min # of matched ions for peptide to register for E-value xcorr histogram -#define FRAGINDEX_MIN_IONS_REPORT 3 // min # of matched ions for peptide to be reported -#define FRAGINDEX_MIN_MASS 200.0 // minimum fragment ion mass used to generate fragment index -#define FRAGINDEX_MAX_MASS 2000.0 // maximum fragment ion mass used to generate fragment index -#define FRAGINDEX_MAX_BATCHSIZE 1000 // maximum number 
of spectra loaded when querying fragment index -#define FRAGINDEX_MAX_NUMPEAKS 150 // number of spectrum peaks used to query fragment index -#define FRAGINDEX_MAX_NUMSCORED 100 // for each fragment index spectrum query, score up to this many peptides -#define FRAGINDEX_MAX_COMBINATIONS 2000 -#define FRAGINDEX_MAX_MODS_PER_MOD 5 -#define FRAGINDEX_KEEP_ALL_PEPTIDES 1 // 1 = consider up to FRAGINDEX_MAX_COMBINATIONS of peptides; 0 = ignore all mods for peptide that exceed FRAGINDEX_MAX_COMBINATIONS - -#define MS1_MIN_MASS 0.0 // only parse up to this mass in MS1 scans for MS1 library searches -#define MS1_MAX_MASS 3000.0 // only parse up to this mass in MS1 scans for MS1 library searches -#define MS1_RT_HISTORY_SIZE 250 // size of MS1 RT history kept for recent history linear regression -#define MS1_RT_OUTLIER_THRESHOLD 2.0 // # stdev outlier threshold for MS1 RT history - -#define MAX_PEFFMOD_LEN 16 -#define SIZE_MASS 128 // ascii value size -#define SIZE_NATIVEID 256 // max length of nativeID string -#define NUM_SP_IONS 1000 // num ions for preliminary scoring -#define NUM_ION_SERIES 7 // a,b,c,x,y,z,z1 -#define EXPECT_DECOY_SIZE 3000 // number of decoy entries in CometDecoys.h - -#define WIDTH_REFERENCE 256 // length of the protein accession field to store -#define MAX_PROTEINS 50 // maximum number of proteins to return for each query; for index search only right now - -#define HISTO_SIZE 152 // some number greater than 150 - -#define NO_PEFF_VARIANT -127 - -#define ASCORE_CUTOFF_TO_ACCEPT 13.0 // minimum AScore value to accept localization - -#define FRAGINDEX_VMODS 5 // only parse first five variable mods for fragment ion index searches - // if this is ever larger than 16, need to extend range of siVarModProteinFilter - -#define VMODS 15 // also "VMODS+1" is 4th dimension of uiBinnedIonMasses to cover unmodified ions (0), mod NL (1-15) -#define COMPOUNDMODS_OFFSET 100 // piVarModSites values >= 100 encode compound mods; index = value - 100 -#define VMOD_1_INDEX 
0 -#define VMOD_2_INDEX 1 -#define VMOD_3_INDEX 2 -#define VMOD_4_INDEX 3 -#define VMOD_5_INDEX 4 -#define VMOD_6_INDEX 5 -#define VMOD_7_INDEX 6 -#define VMOD_8_INDEX 7 -#define VMOD_9_INDEX 8 -#define VMOD_10_INDEX 9 -#define VMOD_11_INDEX 10 -#define VMOD_12_INDEX 11 -#define VMOD_13_INDEX 12 -#define VMOD_14_INDEX 13 -#define VMOD_15_INDEX 14 - -#define ENZYME_SINGLE_TERMINI 1 -#define ENZYME_DOUBLE_TERMINI 2 -#define ENZYME_N_TERMINI 8 -#define ENZYME_C_TERMINI 9 - -#define ION_SERIES_A 0 -#define ION_SERIES_B 1 -#define ION_SERIES_C 2 -#define ION_SERIES_X 3 -#define ION_SERIES_Y 4 -#define ION_SERIES_Z 5 -#define ION_SERIES_Z1 6 //z+1 - -#ifdef CRUX -#define XCORR_CUTOFF -999.0 -#else -#define XCORR_CUTOFF 1E-8 // some near-zero cutoff -#endif - -#define SPECLIB_CUTOFF -999.9 - -struct Options -{ - int iNumPeptideOutputLines; - int iWhichReadingFrame; - int iEnzymeTermini; - int iNumStored; // # of search results to store for xcorr analysis - int iMaxDuplicateProteins; // maximum number of duplicate proteins to report or store in idx file - int iSpectrumBatchSize; // # of spectra to search at a time within the scan range - int iStartCharge; - int iEndCharge; - int iMaxFragmentCharge; - int iMinPrecursorCharge; - int iMaxPrecursorCharge; - int iMSLevel; // filter query scans in raw/mzML/mzXML input by ms level (aka MS2, MS3) - int iSpecLibMSLevel; // filter speclib scans in raw/mzML/mzXML input by ms level (aka MS2, MS3) - int iMinPeaks; - int iRemovePrecursor; // 0=no, 1=yes, 2=ETD precursors, 3=phosphate neutral loss - int iDecoySearch; // 0=no, 1=concatenated search, 2=separate decoy search - int iNumThreads; // 0=poll CPU else set # threads to spawn - int iNumFragmentThreads; // # threads used for fragment indexing - bool bResolveFullPaths; // 0=do not resolve full paths; 1=resolve paths (default) - bool bOutputSqtStream; - bool bOutputSqtFile; - bool bOutputTxtFile; - bool bOutputPepXMLFile; - int iOutputMzIdentMLFile; - bool bOutputPercolatorFile; - 
bool bClipNtermMet; // 0=leave protein sequences alone; 1=also consider w/o N-term methionine - bool bClipNtermAA; // 0=leave peptide sequences as-is; 1=clip N-term amino acid from every peptide - bool bMango; // 0=normal; 1=Mango x-link ms2 input - bool bScaleFragmentNL; // 0=no; 1=scale fragment NL for each modified residue contained in fragment - bool bCreateFragmentIndex; // 0=normal search; 1=create fragment ion index plain peptide file - bool bCreatePeptideIndex; // 0=normal search; 1=create peptide index file; only one of bCreateFragmentIndex and bCreatePeptideIndex can be 1 - bool bFastPlainPeptideIdx; // 0=legacy RunSearch path; 1=use PepGenTuple per-thread buffers (avoids heap alloc) - bool bVerboseOutput; - bool bExplicitDeltaCn; // if set to 1, do not use sequence similarity logic - bool bPrintExpectScore; - bool bExportAdditionalScoresPepXML; // if 1, also report lnrSp, lnExpect, IonFrac, lnNumSP to pepXML output - bool bCorrectMass; // use selectionMZ instead of monoMZ if monoMZ is outside selection window - bool bTreatSameIL; - int iPrintAScoreProScore; // 0=no, otherwise specify variable_modXX number e.g. 
1 for variable_mod01 - int iMaxIndexRunTime; // max run time of index search in milliseconds - int iFragIndexMinIonsScore; // minimum matched fragment index ions for scoring - int iFragIndexMinIonsReport; // minimum matched fragment index ions for reporting - int iFragIndexNumSpectrumPeaks; // # of peaks from spectrum to use for querying fragment index - int iFragIndexSkipReadPrecursors; // if true, skips reading precursors step - int iOverrideCharge; - long lMaxIterations; // max # of modification permutations for each iStart position - double dMinIntensity; // intensity cutoff for each peak - double dMinPercentageIntensity; // intensity cutoff for each peak as % of base peak - double dRemovePrecursorTol; - double dPeptideMassLow; // MH+ mass - double dPeptideMassHigh; // MH+ mass - double dMinimumXcorr; // set the minimum xcorr to report (default is 1e-8) - double dFragIndexMaxMass; // fragment index maximum fragment mass - double dFragIndexMinMass; // fragment index minimum fragment mass - double dMS1MinMass; // low mass cutoff in MS1 query/library spectra - double dMS1MaxMass; // high mass cutoff in MS1 query/library spectra - IntRange scanRange; - IntRange peptideLengthRange; - DoubleRange clearMzRange; - char szActivationMethod[24]; // mzXML only - string sPinProteinDelimiter; // PIN file protein delimiter; default tab - - Options& operator=(Options& a) - { - iNumPeptideOutputLines = a.iNumPeptideOutputLines; - iWhichReadingFrame = a.iWhichReadingFrame; - iEnzymeTermini = a.iEnzymeTermini; - iNumStored = a.iNumStored; - iMaxDuplicateProteins = a.iMaxDuplicateProteins; - iSpectrumBatchSize = a.iSpectrumBatchSize; - iStartCharge = a.iStartCharge; - iEndCharge = a.iEndCharge; - iMaxFragmentCharge = a.iMaxFragmentCharge; - iMinPrecursorCharge = a.iMinPrecursorCharge; - iMaxPrecursorCharge = a.iMaxPrecursorCharge ; - iMSLevel = a.iMSLevel; - iMinPeaks = a.iMinPeaks; - iRemovePrecursor = a.iRemovePrecursor; - iDecoySearch = a.iDecoySearch; - iNumThreads = 
a.iNumThreads; - bResolveFullPaths = a.bResolveFullPaths; - bOutputSqtStream = a.bOutputSqtStream; - bOutputSqtFile = a.bOutputSqtFile; - bOutputTxtFile = a.bOutputTxtFile; - bOutputPepXMLFile = a.bOutputPepXMLFile; - iOutputMzIdentMLFile = a.iOutputMzIdentMLFile; - bOutputPercolatorFile = a.bOutputPercolatorFile; - bClipNtermMet = a.bClipNtermMet; - bClipNtermAA = a.bClipNtermAA; - bMango = a.bMango; - bScaleFragmentNL = a.bScaleFragmentNL; - bCreatePeptideIndex = a.bCreatePeptideIndex; - bCreateFragmentIndex = a.bCreateFragmentIndex; - bFastPlainPeptideIdx = a.bFastPlainPeptideIdx; - bVerboseOutput = a.bVerboseOutput; - bExplicitDeltaCn = a.bExplicitDeltaCn; - bPrintExpectScore = a.bPrintExpectScore; - iPrintAScoreProScore = a.iPrintAScoreProScore; - bExportAdditionalScoresPepXML = a.bExportAdditionalScoresPepXML; - iOverrideCharge = a.iOverrideCharge; - bCorrectMass = a.bCorrectMass; - bTreatSameIL = a.bTreatSameIL; - iMaxIndexRunTime = a.iMaxIndexRunTime; - lMaxIterations = a.lMaxIterations; - dMinIntensity = a.dMinIntensity; - dMinPercentageIntensity = a.dMinPercentageIntensity; - dRemovePrecursorTol = a.dRemovePrecursorTol; - dPeptideMassLow = a.dPeptideMassLow; - dPeptideMassHigh = a.dPeptideMassHigh; - dMinimumXcorr = a.dMinimumXcorr; - scanRange = a.scanRange; - peptideLengthRange = a.peptideLengthRange; - clearMzRange = a.clearMzRange; - strcpy(szActivationMethod, a.szActivationMethod); - sPinProteinDelimiter = a.sPinProteinDelimiter; - - dFragIndexMinMass = a.dFragIndexMinMass; - dFragIndexMaxMass = a.dFragIndexMaxMass; - iFragIndexMinIonsScore = a.iFragIndexMinIonsScore; - iFragIndexMinIonsReport = a.iFragIndexMinIonsReport ; - iFragIndexNumSpectrumPeaks = a.iFragIndexNumSpectrumPeaks; - iFragIndexSkipReadPrecursors = a.iFragIndexSkipReadPrecursors; - - dMS1MinMass = a.dMS1MinMass; - dMS1MaxMass = a.dMS1MaxMass; - - return *this; - } -}; - -struct Results -{ - double dPepMass; - double dExpect; - float fScoreSp; - float fXcorr; - float fDeltaCn; - float 
fLastDeltaCn; - float fAScorePro; // AScorePro score - unsigned short usiRankXcorr; - unsigned short usiLenPeptide; - unsigned short usiRankSp; - unsigned short usiMatchedIons; - unsigned short usiTotalIons; - comet_fileoffset_t lProteinFilePosition; // for indexdb, this is the entry in g_pvProteinsList - long lWhichProtein; // which entry in g_pvProteinsList[] contains the matched proteins - int piVarModSites[MAX_PEPTIDE_LEN_P2]; // store variable mods encoding, +2 to accomodate N/C-term - double pdVarModSites[MAX_PEPTIDE_LEN_P2]; // store variable mods mass diffs, +2 to accomodate N/C-term - char pszMod[MAX_PEPTIDE_LEN][MAX_PEFFMOD_LEN]; // store PEFF mod string - char szPeptide[MAX_PEPTIDE_LEN]; - char cPrevAA; // stores prev flanking AA - char cNextAA; // stores following flanking AA - bool bClippedM; // true if new N-term protein due to clipped methionine - char cHasVariableMod; // HasVariableModType enum: 0 = no variable mod, 1 = has variable mod, 2 = has AScorePro mod - string sPeffOrigResidues; // original residue(s) of a PEFF variant - string sAScoreProSiteScores; // AScorePro site scores as comma-separated string - int iPeffOrigResiduePosition; // position of PEFF variant substitution; -1 = n-term, iLenPeptide = c-term; -9=unused - int iPeffNewResidueCount; // more than 0 new residues is a substitution (if iPeffOrigResidueCount=1) or insertion (if iPeffOrigResidueCount>1) - vector pWhichProtein; // file positions of matched protein entries - vector pWhichDecoyProtein; // keep separate decoy list (used for separate decoy matches and combined results) -}; - -struct SpecLibResults // MS2 spec lib -{ - unsigned int iWhichSpecLib; // the matched spectral library entry - float fSpecLibScore; - float fXcorr; // use xcorr for now - float fCn; // speclib score - float fRTtime; // retention time in seconds of the matched entry -}; - -struct SpecLibResultsMS1 // MS1 spec lib -{ - unsigned int iWhichSpecLib; // the matched spectral library entry - float fDotProduct; 
// unit vector dot product aka cosine similarity - float fRTime; // retention time in seconds of the matched entry -}; - -struct PepMassInfo -{ - double dCalcPepMass; - double dExpPepMass; // protonated MH+ experimental mass - double dPeptideMassToleranceLow; // mass tolerance low in amu from experimental mass - double dPeptideMassToleranceHigh; // mass tolerance high in amu from experimental mass - double dPeptideMassToleranceMinus; // low end of mass tolerance range including isotope offsets - double dPeptideMassTolerancePlus; // high end of mass tolerance range including isotope offsets -}; - -struct SpectrumInfoInternal -{ - int iArraySize; // m/z versus intensity array - int iHighestIon; - int iScanNumber; - unsigned short usiChargeState; - unsigned short usiMaxFragCharge; - double dTotalIntensity; - float fRTime; - char szMango[32]; // Mango encoding - char szNativeID[SIZE_NATIVEID]; // nativeID string from mzML -}; - -// The minimum and maximum mass range of all peptides to consider -// i.e. lowestPepMass - tolerance to highestPepMass + tolerance -struct MassRange -{ - double dMinMass; - double dMaxMass; - unsigned short usiMaxFragmentCharge; // global maximum fragment charge - bool bNarrowMassRange; // used to determine how to parse peptides in SearchForPeptides - unsigned int uiMaxFragmentArrayIndex; // BIN(dFragIndexMaxMass); used as fragment array index -}; - -extern MassRange g_massRange; - -// PreprocessStruct stores information used in preprocessing -// each spectrum. 
Information not kept around otherwise -struct PreprocessStruct -{ - int iHighestIon; - double dHighestIntensity; -}; - -struct OBOStruct // stores info read from OBO file -{ - double dMassDiffAvg; // this is looked up from strMod string from OBO - double dMassDiffMono; - string strMod; // mod string, PSI-MOD, Unimod or custom - - bool operator<(const OBOStruct& a) const - { - return (strMod < a.strMod); - } -}; - -struct ProteinEntryStruct -{ - comet_fileoffset_t lWhichProtein; // file pointer to protein - int iStartResidue; // start residue position in protein (1-based) - char cPrevAA; - char cNextAA; - - bool operator<(const ProteinEntryStruct& a) const - { - return (lWhichProtein < a.lWhichProtein); - } -}; - -struct PeffModStruct // stores info read from PEFF header -{ - double dMassDiffAvg; // this is looked up from strMod string from OBO - double dMassDiffMono; - int iPosition; // position of modification - char szMod[MAX_PEFFMOD_LEN]; - - bool operator<(const PeffModStruct& a) const - { - return (iPosition < a.iPosition); - } -}; - -struct PeffVariantSimpleStruct // stores info read from PEFF header -{ - int iPosition; // position of variant - char cResidue; // new variant - - bool operator<(const PeffVariantSimpleStruct& a) const - { - return (iPosition < a.iPosition); - } -}; - -struct PeffVariantComplexStruct // stores info read from PEFF header -{ - int iPositionA; // start position of variant - int iPositionB; // end position of variant - string sResidues; // if !empty(), insertion replacing aa from pos A to B; - // if empty(), deletion of aa from pos A to B - - bool operator<(const PeffVariantComplexStruct& a) const - { - return (iPositionA < a.iPositionA); - } -}; - -struct PeffProcessedStruct -{ - int iBeginResidue; - int iEndResidue; -}; - -struct PeffPositionStruct // collate PEFF mods by position in sequence -{ - int iPosition; // position within the sequence - vector vectorWhichPeff; // which specific peff entry from PeffModStruct - vector 
vectorMassDiffAvg; - vector vectorMassDiffMono; -}; - -struct PeffSearchStruct // variant info passed to SearchForPeptides -{ - int iPosition; - bool bBeginCleavage; - bool bEndCleavage; - char cOrigResidue; -}; - -//-->MH -typedef struct sDBEntry -{ - string strName; // might be able to delete this here - string strSeq; - comet_fileoffset_t lProteinFilePosition; - vector vectorPeffMod; - vector vectorPeffVariantSimple; - vector vectorPeffVariantComplex; - vector vectorPeffProcessed; -} sDBEntry; - -struct DBInfo -{ - char szDatabase[SIZE_FILE]; - char szFileName[SIZE_FILE]; - int iTotalNumProteins; - unsigned long int uliTotAACount; - - DBInfo& operator=(DBInfo& a) - { - strcpy(szDatabase, a.szDatabase); - strcpy(szFileName, a.szFileName); - iTotalNumProteins = a.iTotalNumProteins; - uliTotAACount = a.uliTotAACount; - - return *this; - } -}; - -struct DBIndex -{ - vector pcVarModSites; // empty = unmodified; else [iLen+2] encoding var mods - comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList - double dPepMass; // MH+ pep mass - unsigned short siVarModProteinFilter; // bitwise representation of mmapProtein - char cPrevAA; - char cNextAA; - char sPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated - - bool operator==(const DBIndex& rhs) const - { - if (strcmp(sPeptide, rhs.sPeptide) != 0) - return false; - - if (fabs(dPepMass - rhs.dPepMass) > FLOAT_ZERO) - return false; - - int iLen = (int)strlen(sPeptide) + 2; - for (int i = 0; i < iLen; ++i) - { - char l = pcVarModSites.empty() ? 0 : pcVarModSites[i]; - char r = rhs.pcVarModSites.empty() ? 
0 : rhs.pcVarModSites[i]; - if (l != r) - return false; - } - - return true; - } - - bool operator<(const DBIndex& rhs) const - { - int cmp = strcmp(sPeptide, rhs.sPeptide); - if (cmp != 0) - return cmp < 0; - - if (fabs(dPepMass - rhs.dPepMass) > FLOAT_ZERO) - return dPepMass < rhs.dPepMass; - - int iLen = (int)strlen(sPeptide) + 2; - for (int i = 0; i < iLen; ++i) - { - char l = pcVarModSites.empty() ? 0 : pcVarModSites[i]; - char r = rhs.pcVarModSites.empty() ? 0 : rhs.pcVarModSites[i]; - if (l != r) - return l < r; - } - - // FINAL tie-breaker: lowest protein index first in order - // to grab flanking residues from the first protein - return lIndexProteinFilePosition < rhs.lIndexProteinFilePosition; - } -}; - -// Compact fixed-size tuple used during plain-peptide index generation. -// Replaces heap-heavy DBIndex entries during the per-thread collection phase. -struct PepGenTuple -{ - char sPeptide[MAX_PEPTIDE_LEN]; // original AA letters (or L->I canonical), null-terminated - double dPepMass; // MH+ mass - comet_fileoffset_t lProteinFileOffset;// FASTA byte offset of the source protein - uint16_t siVarModProteinFilter; - char cPrevAA; - char cNextAA; -}; - -// --------------------------------------------------------------------------- -// 5-bit amino acid encoding for per-length short-peptide key packing. -// AAs are mapped in ASCII sort order (A=1, C=2, ..., Y=20) so that sorting -// packed uint64 keys is equivalent to lexicographic sort of sequences within -// a given peptide length. 
-// --------------------------------------------------------------------------- -static constexpr uint8_t kAA5bit[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0-15 - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 16-31 - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 32-47 - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 48-63 - 0, // 64 '@' - 1, // 65 'A' - 0, // 66 'B' - 2, // 67 'C' - 3, // 68 'D' - 4, // 69 'E' - 5, // 70 'F' - 6, // 71 'G' - 7, // 72 'H' - 8, // 73 'I' (canonical for I/L when bTreatSameIL) - 0, // 74 'J' - 9, // 75 'K' - 10, // 76 'L' (remapped to 8 when bTreatSameIL) - 11, // 77 'M' - 12, // 78 'N' - 0, // 79 'O' - 13, // 80 'P' - 14, // 81 'Q' - 15, // 82 'R' - 16, // 83 'S' - 17, // 84 'T' - 0, // 85 'U' - 18, // 86 'V' - 19, // 87 'W' - 0, // 88 'X' - 20, // 89 'Y' - 0, // 90 'Z' - // 91-255: all zeros - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0 -}; - -// Reverse map: 5-bit code -> amino acid character. -// Code 8 always decodes to 'I' (canonical; L maps to code 8 when bTreatSameIL). -static constexpr char k5bitAA[32] = { - '\0','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R', - 'S', 'T','V','W','Y','\0','\0','\0','\0','\0','\0','\0','\0','\0','\0','\0' -}; - -// Pack up to 12 amino acids into a uint64 key (5 bits each, 60 bits total). -// When bTreatSameIL is true, L encodes identically to I. 
-inline uint64_t PackPeptide(const char* seq, int iLen, bool bTreatSameIL) -{ - uint64_t key = 0; - for (int i = 0; i < iLen; ++i) - { - char c = seq[i]; - if (bTreatSameIL && c == 'L') c = 'I'; - key |= ((uint64_t)kAA5bit[(unsigned char)c] << (55 - i * 5)); - } - return key; -} - -// Decode a packed key back to a null-terminated sequence of iLen characters. -inline void UnpackPeptide(uint64_t key, int iLen, char* seq) -{ - for (int i = 0; i < iLen; ++i) - seq[i] = k5bitAA[(key >> (55 - i * 5)) & 0x1F]; - seq[iLen] = '\0'; -} - -// Compact per-thread tuple for short peptides (len <= 12) during index generation. -// 32 bytes on 64-bit (8-byte alignment); uILMask occupies 2 of the 4 trailing pad bytes. -struct PepGenTupleShort -{ - uint64_t uPackedPep; // canonical 5-bit-encoded sequence (L treated as I when bTreatSameIL) - double dPepMass; - comet_fileoffset_t lProteinFileOffset; - uint16_t siVarModProteinFilter; - char cPrevAA; - char cNextAA; - uint16_t uILMask; // bitmask: bit k = 1 means position k was 'L' in FASTA original -}; - -// This is used for fragment indexing; plain peptides are stored in index -// file and read in to this data struct. Same as DBIndex w/o pcVarModSites[] -struct PlainPeptideIndexStruct -{ - comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList - double dPepMass; // MH+ pep mass, unmodified mass; modified mass in FragmentPeptidesStruct - unsigned short siVarModProteinFilter; // bitwise representation of mmapProtein - char cPrevAA; - char cNextAA; - char szPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated - - bool operator==(const PlainPeptideIndexStruct &rhs) const - { - return strcmp(szPeptide, rhs.szPeptide) == 0; - } -}; - -struct FragmentPeptidesStruct -{ - size_t iWhichPeptide; // reference to raw peptide (sequence, proteins, etc.) 
in PlainPeptideIndexStruct - int modNumIdx; - double dPepMass; // peptide mass (modified or unmodified) after permuting mods - char cNtermMod; - char cCtermMod; - - bool operator<(const FragmentPeptidesStruct& a) const - { - return dPepMass < a.dPepMass; - } -}; - -struct SpecLibInfo // why a struct for just a string??? -{ - string strSpecLibFile; -}; - -struct SpecLibStruct -{ - string strName; // any string associated with speclib entry - unsigned int iLibEntry; // a reference number associated with speclib entry - unsigned int iNumPeaks; - int iSpecLibCharge; // precursor charge; not relevant for MS1 speclib - double dSpecLibMW; // if a peptide, store neutral mass - float fRTime; - float fScaleMinInten; // min intensity of data prior to encoding to pccSparseFastXcorrData; 0.0 for unit vector - float fScaleMaxInten; // max intensity of data prior to encoding to ppcSparseFastXcorrData - vector> vSpecLibPeaks; - float* pfUnitVector; - unsigned int uiArraySizeMS1; -}; - -// for MS1 alignment -struct RetentionMatch -{ - double dQueryTime; - double dReferenceTime; - int iSpectrumIndex; - - RetentionMatch(double dQueryTime, double dReferenceTime, int iSpectrumIndex); -}; -extern std::deque RetentionMatchHistory; - -extern unsigned int* g_iFragmentIndex; // CSR flat data: all posting lists concatenated [g_iFragmentIndexOffset[bin]..g_iFragmentIndexOffset[bin+1]) -extern uint64_t* g_iFragmentIndexOffset; // CSR offsets [uiMaxFragmentArrayIndex+1]: cumulative entry counts, can exceed UINT_MAX for large non-enzymatic searches -extern vector g_vFragmentPeptides; -extern vector g_vRawPeptides; -extern bool* g_bIndexPrecursors; // allocate an array of BIN(max_precursor, protonated) and use a bool to indicate if that precursor is present in input file(s) -extern vector g_vSpecLib; -extern vector> g_vulSpecLibPrecursorIndex; // this will be an vector of vectors - -struct IndexProteinStruct // for indexed database -{ - char szProt[WIDTH_REFERENCE]; - comet_fileoffset_t 
lProteinFilePosition; - int iWhichProtein; -}; - -struct PEFFInfo -{ - char szPeffOBO[SIZE_FILE]; - int iPeffSearch; // 0=no, 1=PSI-MOD, 2=Unimod, 3=PSI-MOD only, 4=Unimod only, 5=variants only -}; - -struct StaticMod -{ - double dAddCterminusPeptide; - double dAddNterminusPeptide; - double dAddCterminusProtein; - double dAddNterminusProtein; - double pdStaticMods[SIZE_MASS]; - - StaticMod& operator=(StaticMod& a) - { - dAddCterminusPeptide = a.dAddCterminusPeptide; - dAddNterminusPeptide = a.dAddNterminusPeptide; - dAddCterminusProtein = a.dAddCterminusProtein; - dAddNterminusProtein = a.dAddNterminusProtein; - - for (int i = 0; i < SIZE_MASS; ++i) - { - pdStaticMods[i] = a.pdStaticMods[i]; - } - - return *this; - } -}; - -struct PrecalcMasses -{ - double dNtermProton; // dAddNterminusPeptide + PROTON_MASS - double dCtermOH2Proton; // dAddCterminusPeptide + dOH2fragment + PROTON_MASS - double dOH2ProtonCtermNterm; // dOH2parent + PROTON_MASS + dAddCterminusPeptide + dAddNterminusPeptide - int iMinus17; // BIN'd value of mass(NH3) - int iMinus18; // BIN'd value of mass(H2O) - - PrecalcMasses& operator=(PrecalcMasses& a) - { - dNtermProton = a.dNtermProton; - dCtermOH2Proton = a.dCtermOH2Proton; - dOH2ProtonCtermNterm = a.dOH2ProtonCtermNterm; - iMinus17 = a.iMinus17; - iMinus18 = a.iMinus18; - - return *this; - } -}; - -struct VarModParams -{ - bool bVarModSearch; // set to true if variable mods are specified - bool bVarTermModSearch; // set to true if any n-term/c-term variable mods are specified - bool bVarProteinNTermMod; // set to true if a protein n-term variable mod specified - bool bVarProteinCTermMod; // set to true if a protein c-term variable mod specified - bool bBinaryModSearch; // set to true if any of the variable mods are of binary mod variety - bool bUseFragmentNeutralLoss; // set to true if any custom NL is set; applied only to 1+ and 2+ fragments - bool bRareVarModPresent; // set to true if any of iRequireThisMod == -1 - bool bVarModProteinFilter; 
// set to trueif protein mods list is applied - int iRequireVarMod; // 0=no; else use bits to determine which varmods are required - int iMaxVarModPerPeptide; - int iMaxPermutations; - VarMods varModList[VMODS]; - char cModCode[VMODS]; // mod characters - string sProteinLModsListFile; // file containing list of proteins to restrict application of varmods to - multimap mmapProteinModsList; // vector read from sProteinModsListFile if present - string sCompoundModsFile; // path to compound mods mass file; empty = disabled - vector vdCompoundMasses; // sorted, deduplicated list of masses read from sCompoundModsFile - unsigned int uiNumCompoundMasses; // vdCompoundMasses.size(); 0 when feature is disabled - - VarModParams& operator=(VarModParams& a) - { - bVarModSearch = a.bVarModSearch; - bVarTermModSearch = a.bVarTermModSearch; - bVarProteinNTermMod = a.bVarProteinNTermMod; - bVarProteinCTermMod = a.bVarProteinCTermMod; - bBinaryModSearch = a.bBinaryModSearch; - bUseFragmentNeutralLoss = a.bUseFragmentNeutralLoss; - bRareVarModPresent = a.bRareVarModPresent; - bVarModProteinFilter = a.bVarModProteinFilter; - iRequireVarMod = a.iRequireVarMod; - iMaxVarModPerPeptide = a.iMaxVarModPerPeptide; - iMaxPermutations = a.iMaxPermutations; - - for (int i = 0; i < VMODS; ++i) - { - varModList[i] = a.varModList[i]; - cModCode[i] = a.cModCode[i]; - } - - sCompoundModsFile = a.sCompoundModsFile; - vdCompoundMasses = a.vdCompoundMasses; - uiNumCompoundMasses = a.uiNumCompoundMasses; - - return *this; - } -}; - -struct MassUtil -{ - int bMonoMassesParent; - int bMonoMassesFragment; - double dCO; - double dNH3; - double dNH2; - double dH2O; - double dCOminusH2; - double dOH2fragment; - double dOH2parent; - double pdAAMassParent[SIZE_MASS]; - double pdAAMassFragment[SIZE_MASS]; - double pdAAMassUser[SIZE_MASS]; // user defined default amino acid masses - - MassUtil& operator=(MassUtil& a) - { - bMonoMassesParent = a.bMonoMassesParent; - bMonoMassesFragment = a.bMonoMassesFragment; - 
dCO = a.dCO; - dNH3 = a.dNH3; - dNH2 = a.dNH2; - dH2O = a.dH2O; - dCOminusH2 = a.dCOminusH2; - dOH2fragment = a.dOH2fragment; - dOH2parent = a.dOH2parent; - - for (int i = 0; i < SIZE_MASS; ++i) - { - pdAAMassParent[i] = a.pdAAMassParent[i]; - pdAAMassFragment[i] = a.pdAAMassFragment[i]; - pdAAMassUser[i] = a.pdAAMassUser[i]; - } - - return *this; - } -}; - -struct ToleranceParams -{ - int iMassToleranceUnits; // 0=amu, 1=mmu, else ppm (2) - int iMassToleranceType; // 0=MH+ (default), 1=precursor m/z; only valid if iMassToleranceUnits > 0 - int iIsotopeError; - double dInputToleranceMinus; // raw tolerance value from param file, lower bound; gets converted to dPeptideMassToleranceMinus - double dInputTolerancePlus; // raw tolerance value from param file, upper bound; gets converted to dPeptideMassTolerancePlus - double dFragmentBinSize; - double dFragmentBinStartOffset; - double dMS1BinSize; - double dMS1BinStartOffset; - - ToleranceParams& operator=(ToleranceParams& a) - { - iMassToleranceUnits = a.iMassToleranceUnits; - iMassToleranceType = a.iMassToleranceType; - iIsotopeError = a.iIsotopeError; - dInputToleranceMinus = a.dInputToleranceMinus; - dInputTolerancePlus = a.dInputTolerancePlus; - dFragmentBinSize = a.dFragmentBinSize; - dFragmentBinStartOffset = a.dFragmentBinStartOffset; - dMS1BinSize = a.dMS1BinSize; - dMS1BinStartOffset = a.dMS1BinStartOffset; - - return *this; - } -}; - -struct IonInfo -{ - int iNumIonSeriesUsed; - int piSelectedIonSeries[NUM_ION_SERIES]; - bool bUseWaterAmmoniaLoss; // ammonia, water loss - int iTheoreticalFragmentIons; - int iIonVal[NUM_ION_SERIES]; - - IonInfo& operator=(IonInfo& a) - { - iNumIonSeriesUsed = a.iNumIonSeriesUsed; - bUseWaterAmmoniaLoss = a.bUseWaterAmmoniaLoss; - iTheoreticalFragmentIons = a.iTheoreticalFragmentIons; - - for (int i = 0; i < NUM_ION_SERIES; ++i) - { - piSelectedIonSeries[i] = a.piSelectedIonSeries[i]; - iIonVal[i] = a.iIonVal[i]; - } - - return *this; - } -}; - -// Identifies which type of 
database is being searched. -// Defined before StaticParams so iDbType can use DbType. -enum class DbType -{ - FASTA_DB = 0, // normal FASTA sequence database - FI_DB = 1, // fragment ion index (.idx) - PI_DB = 2 // peptide index (.idx) -}; - -// static user params, won't change per thread - can make global! -struct StaticParams -{ - string sHostName; - char szMod[512]; // used for sqt output - char szDecoyPrefix[256]; // used for prefix to indicate decoys - string sDecoyPrefix; // escaped version of szDecoyPrefix for output within XML files - char szOutputSuffix[256]; // used for suffix to append to output file base names - char szTxtFileExt[256]; // text file extension; default "txt" - int iElapseTime; - char szDate[32]; - Options options; - DBInfo databaseInfo; - SpecLibInfo speclibInfo; - PEFFInfo peffInfo; - InputFileInfo inputFile; - int bPrintDuplReferences; - VarModParams variableModParameters; - ToleranceParams tolerances; - StaticMod staticModifications; - PrecalcMasses precalcMasses; - EnzymeInfo enzymeInformation; - MassUtil massUtility; - double dInverseBinWidth; // this is used in BIN() many times so use inverse binWidth to do multiply vs. 
divide - int iArraySizeGlobal; // (int)((g_staticParams.options.dPeptideMassHigh + plus_tol_in_daltons + buffer) * g_staticParams.dInverseBinWidth) - // for MS1 library search, use dMS1MaxMass instead of dPeptideMassHigh - double dOneMinusBinOffset; // this is used in BIN() many times so calculate once - IonInfo ionInformation; - int iXcorrProcessingOffset; - DbType iDbType; // FASTA_DB = normal fasta; FI_DB = fragment ion indexed; PI_DB = peptide index - vector vectorMassOffsets; - vector precursorNLIons; - int iPrecursorNLSize; - int iOldModsEncoding; - bool bSkipToStartScan; - std::chrono::high_resolution_clock::time_point tRealTimeStart; // track run time of real-time index search - - StaticParams() - { - RestoreDefaults(); - } - - StaticParams& operator=(StaticParams& a) - { - sHostName = a.sHostName; - strcpy(szMod, a.szMod); - strcpy(szDecoyPrefix, a.szDecoyPrefix); - strcpy(szOutputSuffix, a.szOutputSuffix); - strcpy(szTxtFileExt, a.szTxtFileExt); - vectorMassOffsets = a.vectorMassOffsets; - precursorNLIons= a.precursorNLIons; - iPrecursorNLSize = a.iPrecursorNLSize; - iOldModsEncoding = a.iOldModsEncoding; - iElapseTime = a.iElapseTime; - strcpy(szDate, a.szDate); - options = a.options; - databaseInfo = a.databaseInfo; - speclibInfo = a.speclibInfo; - inputFile = a.inputFile; - bPrintDuplReferences = a.bPrintDuplReferences; - variableModParameters = a.variableModParameters; - tolerances = a.tolerances; - staticModifications = a.staticModifications; - precalcMasses = a.precalcMasses; - enzymeInformation = a.enzymeInformation; - massUtility = a.massUtility; - dInverseBinWidth = a.dInverseBinWidth; - iArraySizeGlobal = a.iArraySizeGlobal; - dOneMinusBinOffset = a.dOneMinusBinOffset; - iXcorrProcessingOffset = a.iXcorrProcessingOffset; - ionInformation = a.ionInformation; - return *this; - } - - void RestoreDefaults() - { - int i; - - inputFile.iInputType = InputType_MS2; - - szMod[0] = '\0'; - - iXcorrProcessingOffset = 75; - iDbType = DbType::FASTA_DB; - - 
databaseInfo.szDatabase[0] = '\0'; - speclibInfo.strSpecLibFile.clear(); - - strcpy(szDecoyPrefix, "DECOY_"); - strcpy(szTxtFileExt, "txt"); - szOutputSuffix[0] = '\0'; - - peffInfo.szPeffOBO[0] = '\0'; - peffInfo.iPeffSearch = 0; - - variableModParameters.sCompoundModsFile = ""; - variableModParameters.vdCompoundMasses.clear(); - variableModParameters.uiNumCompoundMasses = 0; - - iPrecursorNLSize = 0; - - for (i = 0; i < SIZE_MASS; ++i) - { - massUtility.pdAAMassParent[i] = 999999.; - massUtility.pdAAMassFragment[i] = 999999.; - massUtility.pdAAMassUser[i] = 0.0; - staticModifications.pdStaticMods[i] = 0.0; - } - - massUtility.bMonoMassesFragment = 1; - massUtility.bMonoMassesParent = 1; - -#ifdef CRUX - staticModifications.pdStaticMods[(int)'C'] = 57.021464; -#endif - - - enzymeInformation.iAllowedMissedCleavage = 2; - - for (i = 0; i < VMODS; ++i) - { - variableModParameters.varModList[i].iMaxNumVarModAAPerMod = 3; - variableModParameters.varModList[i].iMinNumVarModAAPerMod = 0; - variableModParameters.varModList[i].iBinaryMod = 0; - variableModParameters.varModList[i].iRequireThisMod = 0; - variableModParameters.varModList[i].iVarModTermDistance = -1; // distance from N or C-term distance - variableModParameters.varModList[i].iWhichTerm = 0; // specify N (0) or C-term (1) - variableModParameters.varModList[i].dVarModMass = 0.0; - variableModParameters.varModList[i].dNeutralLoss = 0.0; - variableModParameters.varModList[i].dNeutralLoss2 = 0.0; - strcpy(variableModParameters.varModList[i].szVarModChar, "X"); - -#ifdef CRUX - if (i==0) - { - variableModParameters.varModList[i].dVarModMass = 15.9949; - strcpy(variableModParameters.varModList[i].szVarModChar, "M"); - } -#endif - } - - variableModParameters.cModCode[0] = '*'; - variableModParameters.cModCode[1] = '#'; - variableModParameters.cModCode[2] = '@'; - variableModParameters.cModCode[3] = '^'; - variableModParameters.cModCode[4] = '~'; - variableModParameters.cModCode[5] = '$'; - 
variableModParameters.cModCode[6] = '%'; - variableModParameters.cModCode[7] = '!'; - variableModParameters.cModCode[8] = '+'; - for (int i = 9; i < VMODS; ++i) - { - int iAscii = 88 + i; //start with lower case 'a' ASCII 97 - if (iAscii <= 125) // thru '}' which is ASCII 125 - variableModParameters.cModCode[i] = (char)(iAscii); - else - variableModParameters.cModCode[i] = '_'; - } - - variableModParameters.iMaxVarModPerPeptide = 5; - variableModParameters.iMaxPermutations = MAX_PERMUTATIONS; - variableModParameters.bUseFragmentNeutralLoss = false; - variableModParameters.iRequireVarMod = 0; - - ionInformation.bUseWaterAmmoniaLoss = false; - ionInformation.iTheoreticalFragmentIons = 1; // 0 = flanking peaks; 1 = no flanking peaks - ionInformation.iIonVal[ION_SERIES_A] = 0; - ionInformation.iIonVal[ION_SERIES_B] = 1; - ionInformation.iIonVal[ION_SERIES_C] = 0; - ionInformation.iIonVal[ION_SERIES_X] = 0; - ionInformation.iIonVal[ION_SERIES_Y] = 1; - ionInformation.iIonVal[ION_SERIES_Z] = 0; - ionInformation.iIonVal[ION_SERIES_Z1] = 0; - - options.iNumPeptideOutputLines = 5; - options.iWhichReadingFrame = 0; - options.iEnzymeTermini = 2; - options.iNumStored = 100; // default # of search results to store for xcorr analysis. 
- options.iMaxDuplicateProteins = 20; // maximum number of duplicate proteins to report or store in idx file - - options.bExplicitDeltaCn = false; - options.bPrintExpectScore = true; - options.iPrintAScoreProScore = 0; - options.bExportAdditionalScoresPepXML = false; - options.bCorrectMass = false; - options.bTreatSameIL = true; - options.iOverrideCharge = 0; - options.iMaxIndexRunTime = 0; // index run time limit in milliseconds; 0=no time limit - options.iRemovePrecursor = 0; - options.dRemovePrecursorTol = 1.5; - - options.bOutputSqtStream = false; - options.bOutputSqtFile = false; - options.bOutputTxtFile = false; - options.bOutputPepXMLFile = true; - options.iOutputMzIdentMLFile = false; - options.bOutputPercolatorFile = false; - - options.bResolveFullPaths = true; - - options.bMango = false; - options.bScaleFragmentNL = false; - options.bCreatePeptideIndex = false; - options.bCreateFragmentIndex = false; - options.bFastPlainPeptideIdx = false; - options.bVerboseOutput = false; - options.iDecoySearch = 0; - options.iNumThreads = 4; - options.iNumFragmentThreads = 4; - options.bClipNtermMet = false; - options.bClipNtermAA = false; - - options.lMaxIterations = 0; - - // These parameters affect mzXML/RAMP spectra only. - options.scanRange.iStart = 0; - options.scanRange.iEnd = 0; - options.iSpectrumBatchSize = 0; - options.iMinPeaks = 10; - options.iStartCharge = 0; - options.iEndCharge = 0; - options.iMaxFragmentCharge = 3; - options.iMinPrecursorCharge = 1; - options.iMaxPrecursorCharge = 6; - options.iMSLevel = 2; - options.dMinIntensity = 0.0; - options.dMinPercentageIntensity = 0.0; - options.dPeptideMassLow = 600.0; - options.dPeptideMassHigh = 5000.0; - options.dMinimumXcorr = XCORR_CUTOFF; - options.dFragIndexMaxMass = FRAGINDEX_MAX_MASS; - options.dFragIndexMinMass = FRAGINDEX_MIN_MASS; - strcpy(options.szActivationMethod, "ALL"); - // End of mzXML specific parameters. 
- - options.sPinProteinDelimiter = '\t'; - - options.dFragIndexMinMass = FRAGINDEX_MIN_MASS; - options.dFragIndexMaxMass = FRAGINDEX_MAX_MASS; - options.iFragIndexMinIonsScore = FRAGINDEX_MIN_IONS_SCORE; - options.iFragIndexMinIonsReport = FRAGINDEX_MIN_IONS_REPORT; - options.iFragIndexNumSpectrumPeaks = FRAGINDEX_MAX_NUMPEAKS; - options.iFragIndexSkipReadPrecursors = 1; // skip reading precursors by default - - options.dMS1MinMass = MS1_MIN_MASS; - options.dMS1MaxMass = MS1_MAX_MASS; - - options.clearMzRange.dStart = 0.0; - options.clearMzRange.dEnd = 0.0; - - options.peptideLengthRange.iStart = MIN_PEPTIDE_LEN; - options.peptideLengthRange.iEnd = MAX_PEPTIDE_LEN - 1; // -1 as MAX_PEPTIDE_LEN number includes terminating char - - staticModifications.dAddCterminusPeptide = 0.0; - staticModifications.dAddNterminusPeptide = 0.0; - staticModifications.dAddCterminusProtein = 0.0; - staticModifications.dAddNterminusProtein = 0.0; - - tolerances.iMassToleranceUnits = 0; - tolerances.iMassToleranceType = 0; - tolerances.iIsotopeError = 0; - tolerances.dInputToleranceMinus = -3.0; // peptide_mass_tolerance minus - tolerances.dInputTolerancePlus = 3.0; // peptide_mass_tolerance plus - tolerances.dFragmentBinSize = 1.0005; - tolerances.dFragmentBinStartOffset = 0.4; - tolerances.dMS1BinSize = 1.0005; - - bSkipToStartScan = true; - } -}; - -extern StaticParams g_staticParams; - -extern vector g_pvDBIndex; // used in both peptide index and fragment ion index; latter to store plain peptides -// Per-length, per-thread generation buffers. Outer index = (iLen - iMinLen) for short, -// (iLen - 13) for long. Inner index = thread slot. -extern vector>> g_vvvPepGenShort; // lengths <= 12 -extern vector>> g_vvvPepGenLong; // lengths > 12 -extern map g_pvProteinNames; // indexed database protein names and file positions - -// Flat CSR-style storage for the per-peptide protein list. 
-// Replaces vector> to eliminate the ~190M -// individual heap allocations (one per inner vector) that caused a -// ~6-minute free-time tail when building an MHC .idx file. -// External interface mirrors vector> so -// existing call sites need no changes. -class ProteinsListCSR -{ -public: - // Read-only proxy for a single row (one peptide's protein offsets). - struct Row - { - const comet_fileoffset_t* ptr; - size_t n; - - size_t size() const { return n; } - bool empty() const { return n == 0; } - - const comet_fileoffset_t& operator[](size_t j) const { return ptr[j]; } - comet_fileoffset_t at(size_t j) const { return ptr[j]; } - - const comet_fileoffset_t* begin() const { return ptr; } - const comet_fileoffset_t* end() const { return ptr + n; } - }; - - // Size / state - size_t size() const { return m_off.empty() ? 0 : m_off.size() - 1; } - bool empty() const { return size() == 0; } - - // Modifiers - void clear() - { - vector().swap(m_flat); - vector().swap(m_off); - } - - void reserve(size_t n) { m_off.reserve(n + 1); } - - void push_back(const vector& v) - { - if (m_off.empty()) m_off.push_back(0); - m_flat.insert(m_flat.end(), v.begin(), v.end()); - m_off.push_back(m_flat.size()); - } - - void push_back(vector&& v) - { - if (m_off.empty()) m_off.push_back(0); - m_flat.insert(m_flat.end(), v.begin(), v.end()); - m_off.push_back(m_flat.size()); - vector().swap(v); // release source buffer immediately - } - - // Batch-append from pre-built flat storage. - // flat: all protein file offsets for this block, concatenated in row order - // cnt: number of offsets per row (max value bounded by iMaxDuplicateProteins) - // Bulk-copies both arrays into m_flat/m_off with two insert() calls, then - // releases the source buffers. Replaces N individual push_back(vector&&) - // calls, each of which required one heap free() -- this reduces N free()s - // to 2 (one for flat, one for cnt) regardless of how many rows are in the block. 
- void append_flat(vector& flat, vector& cnt) - { - if (flat.empty()) - return; - if (m_off.empty()) - m_off.push_back(0); - m_flat.insert(m_flat.end(), flat.begin(), flat.end()); - for (uint32_t n : cnt) - m_off.push_back(m_off.back() + n); - vector().swap(flat); - vector().swap(cnt); - } - - // Element access - Row operator[](size_t i) const - { - return {m_flat.data() + m_off[i], - static_cast(m_off[i + 1] - m_off[i])}; - } - - Row at(size_t i) const { return (*this)[i]; } - - // Range-based for -- yields Row values - struct Iterator - { - const ProteinsListCSR* self; - size_t i; - - Row operator*() const { return (*self)[i]; } - Iterator& operator++() { ++i; return *this; } - bool operator!=(const Iterator& o) const { return i != o.i; } - }; - - Iterator begin() const { return {this, 0}; } - Iterator end() const { return {this, size()}; } - -private: - vector m_flat; // all protein offsets concatenated - vector m_off; // [N+1] CSR offsets; row i spans [m_off[i], m_off[i+1]) -}; - -extern ProteinsListCSR g_pvProteinsList; -extern unordered_map g_pvProteinNameCache; // file offset -> protein name string; populated at index load - -extern std::condition_variable g_searchPoolCV; // notified when a pool slot is released - -extern AScoreProCpp::AScoreOptions g_AScoreOptions; // AScore options -extern AScoreProCpp::AScoreDllInterface* g_AScoreInterface; - -struct ModificationNumber -{ -// int modificationNumber; - int modStringLen; // FIX: need to confirm if not needed (MOD_SEQS.at(modSeqIdx)).size(); - char* modifications; -}; - -extern vector MOD_NUMBERS; -extern vector MOD_SEQS; // Unique modifiable sequences. -extern int* MOD_SEQ_MOD_NUM_START; // Start index in the MOD_NUMBERS vector for a modifiable sequence; -1 if no modification numbers were generated -extern int* MOD_SEQ_MOD_NUM_CNT; // Total modifications numbers for a modifiable sequence. 
- -// Index into the MOD_SEQS vector -// -1 for peptides that have no modifiable amino acids -// -2 for peptides with no modifiable amino acids but contain n/c-term mods -extern int* PEPTIDE_MOD_SEQ_IDXS; - -extern int MOD_NUM; -extern bool g_bPlainPeptideIndexRead; // set to true if plain peptide index file is read (and fragment index generated) - // poor choice of name for the fragment index .idx given peptide index is back -extern std::atomic g_bPeptideIndexRead; // set to true if peptide index file is read -extern bool g_bSpecLibRead; // set to true if spectral library file is read - -extern bool g_bPerformSpecLibSearch; // set to true if doing spectral library search -extern bool g_bPerformDatabaseSearch; // set to true if doing database search - -extern bool g_bCometPreprocessMemoryAllocated; // set to true when memory has been allocated -extern bool g_bCometSearchMemoryAllocated; // set to true when memory has been allocated - -extern bool g_bIdxNoFasta; // set to true when .idx file being search but corresponding .fasta not present - // used in mzid output to skip sequence retrieval - -// Query stores information for peptide scoring and results -// This struct is allocated for each spectrum/charge combination -struct Query -{ - int iXcorrHistogram[HISTO_SIZE]; - unsigned int uiHistogramCount; // # of entries in histogram - float fPar[4]; // parameters of LMA regression - - int iMatchPeptideCount; // # of peptides that get stored (i.e. are greater than lowest score) - int iDecoyMatchPeptideCount; // # of decoy peptides that get stored (i.e. 
are greater than lowest score) - - short siMaxXcorr; // index of maximum correlation score in iXcorrHistogram - - short siLowestXcorrScoreIndex; - short siLowestDecoyXcorrScoreIndex; - - double dLowestXcorrScore; - double dLowestDecoyXcorrScore; - - float fLowestSpecLibScore; - - int iMinXcorrHisto; // min xcorr score for xcorr histogram to address good E-values for poor/sparse spectra - - double dMangoIndex; // scan number decimal precursor value i.e. 2401.001 for scan 2401, first precursor/z pair - - unsigned long int _uliNumMatchedPeptides; // # of peptides that get scored - unsigned long int _uliNumMatchedDecoyPeptides; - - // When true, sparse child arrays (float[SPARSE_MATRIX_SIZE]) belong to the - // thread-local RtsScratch pool and must NOT be delete[]'d by the destructor. - // Set only by PreprocessSingleSpectrumThreadLocal via PreprocessSingleSpectrumCore. - bool bSparseFromPool; - - // Sparse matrix representation of data - int iSpScoreData; //size of sparse matrix - int iFastXcorrDataSize; - float **ppfSparseSpScoreData; - float **ppfSparseFastXcorrData; - float **ppfSparseFastXcorrDataNL; // ppfSparseFastXcorrData with NH3, H2O contributions - - // Store raw peaks for AScorePro - - // List of ms/ms masses for fragment index search; intensity not important at this stage - vector vfRawFragmentPeakMass; - // Consider replacing vfRawFragmentPeakMass with a vector> to store - // both mass and intensity if AScorePro is used - vector vRawFragmentPeakMassIntensity; - - - PepMassInfo _pepMassInfo; - SpectrumInfoInternal _spectrumInfoInternal; - Results* _pResults; - Results* _pDecoys; - SpecLibResults* _pSpecLibResults; - - std::chrono::high_resolution_clock::time_point tSearchStart; // per-query search start time for iMaxIndexRunTime timeout - - Mutex accessMutex; - - Query() - { - memset(iXcorrHistogram, 0, sizeof(iXcorrHistogram)); - - iMatchPeptideCount = 0; - iDecoyMatchPeptideCount = 0; - uiHistogramCount = 0; - iMinXcorrHisto = 0; - - fPar[0]=0.0; - 
fPar[1]=0.0; - fPar[2]=0.0; - fPar[3]=0.0; - - siMaxXcorr = 0; // index of maximum correlation score in iXcorrHistogram - siLowestXcorrScoreIndex = 0; - siLowestDecoyXcorrScoreIndex = 0; - - dLowestXcorrScore = XCORR_CUTOFF; - dLowestDecoyXcorrScore = XCORR_CUTOFF; - - fLowestSpecLibScore = SPECLIB_CUTOFF; - - dMangoIndex = 0.0; - - _uliNumMatchedPeptides = 0; - _uliNumMatchedDecoyPeptides = 0; - - bSparseFromPool = false; - - ppfSparseSpScoreData = NULL; - ppfSparseFastXcorrData = NULL; - ppfSparseFastXcorrDataNL = NULL; // ppfSparseFastXcorrData with NH3, H2O contributions - - vfRawFragmentPeakMass.clear(); - vRawFragmentPeakMassIntensity.clear(); - - _pepMassInfo.dCalcPepMass = 0.0; - _pepMassInfo.dExpPepMass = 0.0; - _pepMassInfo.dPeptideMassToleranceLow = 0.0; - _pepMassInfo.dPeptideMassToleranceHigh = 0.0; - _pepMassInfo.dPeptideMassToleranceMinus = 0.0; - _pepMassInfo.dPeptideMassTolerancePlus = 0.0; - - _spectrumInfoInternal.dTotalIntensity = 0.0; - _spectrumInfoInternal.iArraySize = 0; - _spectrumInfoInternal.iHighestIon = 0; - _spectrumInfoInternal.iScanNumber = 0; - _spectrumInfoInternal.dTotalIntensity = 0.0; - - _pResults = NULL; - _pDecoys = NULL; - _pSpecLibResults = NULL; - - Threading::InitMutex(&accessMutex); - } - - ~Query() - { - int i; - if (!bSparseFromPool) - { - for (i = 0; i < iSpScoreData; ++i) - { - if (ppfSparseSpScoreData[i] != NULL) - delete[] ppfSparseSpScoreData[i]; - } - } - delete[] ppfSparseSpScoreData; - ppfSparseSpScoreData = NULL; - - if (g_staticParams.ionInformation.bUseWaterAmmoniaLoss - && (g_staticParams.ionInformation.iIonVal[ION_SERIES_A] - || g_staticParams.ionInformation.iIonVal[ION_SERIES_B] - || g_staticParams.ionInformation.iIonVal[ION_SERIES_Y])) - { - if (!bSparseFromPool) - { - for (i = 0; i < iFastXcorrDataSize; ++i) - { - if (ppfSparseFastXcorrData[i] != NULL) - delete[] ppfSparseFastXcorrData[i]; - if (ppfSparseFastXcorrDataNL[i]!=NULL) - delete[] ppfSparseFastXcorrDataNL[i]; - } - } - delete[] 
ppfSparseFastXcorrDataNL; - ppfSparseFastXcorrDataNL = NULL; - } - else - { - if (!bSparseFromPool) - { - for (i = 0; i < iFastXcorrDataSize; ++i) - { - if (ppfSparseFastXcorrData[i] != NULL) - delete[] ppfSparseFastXcorrData[i]; - } - } - } - delete[] ppfSparseFastXcorrData; - ppfSparseFastXcorrData = NULL; - - if (_pResults != NULL) - { - _pResults->pWhichProtein.clear(); - if (g_staticParams.options.iDecoySearch == 1) - _pResults->pWhichDecoyProtein.clear(); - delete[] _pResults; - _pResults = NULL; - } - - if (g_staticParams.options.iDecoySearch == 2 && _pDecoys != NULL) - { - _pDecoys->pWhichDecoyProtein.clear(); - delete[] _pDecoys; - _pDecoys = NULL; - } - - Threading::DestroyMutex(accessMutex); - } -}; - -struct QueryMS1 -{ - // short siLowestSpecLibIndex; - // float fLowestXcorr; - unsigned int uiMatchMS1Count; // # of peptides that get stored (i.e. are greater than lowest score) - unsigned int iArraySizeMS1; // dimension of pcFastXcorrData - - // Standard array representation of data - // Library spectra are fast xcorr manipulated so non need to do so with query MS1 - float* pfFastXcorrData; - - SpecLibResultsMS1 _pSpecLibResultsMS1; - - Mutex accessMutex; - - QueryMS1() - { - // siLowestSpecLibIndex = 0; - // fLowestXcorr = SPECLIB_CUTOFF; - uiMatchMS1Count = 0; - pfFastXcorrData = NULL; - _pSpecLibResultsMS1.fDotProduct = 0.0; - _pSpecLibResultsMS1.fRTime = 0.0; - - Threading::InitMutex(&accessMutex); - } - - ~QueryMS1() - { - //FIX delete _pSepcLibResults - - Threading::DestroyMutex(accessMutex); - } -}; - -extern vector g_pvQuery; -extern vector g_pvQueryMS1; -extern vector g_pvInputFiles; -extern Mutex g_pvQueryMutex; -extern Mutex g_pvDBIndexMutex; -extern Mutex g_preprocessMemoryPoolMutex; -extern Mutex g_searchMemoryPoolMutex; -extern Mutex g_dbIndexMutex; -extern Mutex g_vSpecLibMutex; - -struct IonSeriesStruct // defines which fragment ion series are considered -{ - int bPreviousMatch[8]; -}; - - -struct MatchedIonsStruct // for 
SingleSpectrumSearch -{ - double dMass; - double dInten; - - bool operator<(const MatchedIonsStruct& a) const - { - return dInten > a.dInten; - } -}; +#include "core/Constants.h" +#include "core/Params.h" +#include "core/Types.h" #endif // _COMETDATAINTERNAL_H_ diff --git a/CometSearch/CometFragmentIndex.cpp b/CometSearch/CometFragmentIndex.cpp index db2671a7..1f81adc4 100644 --- a/CometSearch/CometFragmentIndex.cpp +++ b/CometSearch/CometFragmentIndex.cpp @@ -244,7 +244,7 @@ void CometFragmentIndex::GenerateFragmentIndex(ThreadPool *tp) void CometFragmentIndex::AddFragmentsThreadProc(bool bCountOnly, - ThreadPool *tp) + ThreadPool* /*tp*/) { size_t iWhichFragmentPeptide = 0; // unused here for counting only @@ -602,7 +602,8 @@ bool CometFragmentIndex::GeneratePlainPeptideIndex(ThreadPool* tp, vector emptyQueries; + bool bSucceeded = CometSearch::RunSearch(0, 0, tp, emptyQueries); g_staticParams.options.bCreateFragmentIndex = false; g_staticParams.options.bFastPlainPeptideIdx = false; @@ -1102,64 +1103,40 @@ bool CometFragmentIndex::WriteFIPlainPeptideIndex(ThreadPool *tp) // Destruction is O(n) for pcVarModSites in g_pvDBIndex but trivial for // g_vRawPeptides; order no longer matters. 
{ - auto tClear = chrono::steady_clock::now(); vector().swap(g_vRawPeptides); -// printf(" - freed g_vRawPeptides: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); g_pvDBIndex.clear(); // DBIndex::sPeptide strings freed after g_vRawPeptides // to keep the allocator bins warm for the string frees above -// printf(" - freed g_pvDBIndex: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); g_pvProteinsList.clear(); // CSR flat layout: 2 free() calls instead of ~190M -// printf(" - freed g_pvProteinsList: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); g_pvProteinNames.clear(); -// printf(" - freed g_pvProteinNames: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); vector().swap(MOD_SEQS); -// printf(" - freed MOD_SEQS: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); vector().swap(g_vFragmentPeptides); -// printf(" - freed g_vFragmentPeptides: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); delete[] PEPTIDE_MOD_SEQ_IDXS; PEPTIDE_MOD_SEQ_IDXS = nullptr; -// printf(" - freed PEPTIDE_MOD_SEQ_IDXS: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); delete[] MOD_SEQ_MOD_NUM_START; MOD_SEQ_MOD_NUM_START = nullptr; delete[] MOD_SEQ_MOD_NUM_CNT; MOD_SEQ_MOD_NUM_CNT = nullptr; -// printf(" - freed MOD_SEQ_MOD_NUM_START/CNT: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - 
tClear).count()); } fflush(stdout); @@ -1412,7 +1389,7 @@ bool CometFragmentIndex::ReadPlainPeptideIndex(void) { size_t pepSectionSize = (size_t)(clProteinsFilePos - clPeptidesFilePos) - sizeof(size_t); vector pepBuf(pepSectionSize); - fread(pepBuf.data(), 1, pepSectionSize, fp); + (void)fread(pepBuf.data(), 1, pepSectionSize, fp); const char* p = pepBuf.data(); struct PlainPeptideIndexStruct sTmp; @@ -1437,7 +1414,7 @@ bool CometFragmentIndex::ReadPlainPeptideIndex(void) { size_t protSectionSize = (size_t)(clPermutationsFilePos - clProteinsFilePos); vector protBuf(protSectionSize); - fread(protBuf.data(), 1, protSectionSize, fp); + (void)fread(protBuf.data(), 1, protSectionSize, fp); const char* p = protBuf.data(); size_t tSize; @@ -1505,7 +1482,7 @@ bool CometFragmentIndex::ReadPlainPeptideIndex(void) comet_fileoffset_t varDataStart = comet_ftell(fp); size_t varDataSize = (size_t)(clFooterPos - varDataStart); vector varBuf(varDataSize); - fread(varBuf.data(), 1, varDataSize, fp); + (void)fread(varBuf.data(), 1, varDataSize, fp); const char* p = varBuf.data(); int iTmp; diff --git a/CometSearch/CometInterfaces.h b/CometSearch/CometInterfaces.h index 8bcf095d..8347c779 100644 --- a/CometSearch/CometInterfaces.h +++ b/CometSearch/CometInterfaces.h @@ -85,7 +85,7 @@ namespace CometInterfaces ICometSearchManager *GetCometSearchManager(); void ReleaseCometSearchManager(); - static ThreadPool* _tp; + [[maybe_unused]] static ThreadPool* _tp; } #endif // _COMETINTERFACES_H_ diff --git a/CometSearch/CometMassSpecUtils.cpp b/CometSearch/CometMassSpecUtils.cpp index 7857a9d4..a06b5f38 100644 --- a/CometSearch/CometMassSpecUtils.cpp +++ b/CometSearch/CometMassSpecUtils.cpp @@ -190,7 +190,8 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, bool bReturnFullProteinString, // 0 = return accession only, 1 = return full description line unsigned int *uiNumTotProteins, // matched protein count vector& vProteinTargets, // the target protein names - vector& vProteinDecoys) 
// the decoy protein names if applicable + vector& vProteinDecoys, // the decoy protein names if applicable + const vector& queries) { char szProteinName[WIDTH_REFERENCE]; @@ -209,9 +210,9 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, Results* pOutput; if (iPrintTargetDecoy != 2) - pOutput = g_pvQuery.at(iWhichQuery)->_pResults; + pOutput = queries.at(iWhichQuery)->_pResults; else - pOutput = g_pvQuery.at(iWhichQuery)->_pDecoys; + pOutput = queries.at(iWhichQuery)->_pDecoys; int iPrintDuplicateProteinCt = 0; // track # proteins, exit when at iMaxDuplicateProteins @@ -284,9 +285,9 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, Results* pOutput; if (iPrintTargetDecoy != 2) - pOutput = g_pvQuery.at(iWhichQuery)->_pResults; + pOutput = queries.at(iWhichQuery)->_pResults; else - pOutput = g_pvQuery.at(iWhichQuery)->_pDecoys; + pOutput = queries.at(iWhichQuery)->_pDecoys; int iPrintDuplicateProteinCt = 0; // track # proteins, exit when at iMaxDuplicateProteins diff --git a/CometSearch/CometMassSpecUtils.h b/CometSearch/CometMassSpecUtils.h index b75700ff..04b141b7 100644 --- a/CometSearch/CometMassSpecUtils.h +++ b/CometSearch/CometMassSpecUtils.h @@ -61,7 +61,8 @@ class CometMassSpecUtils bool bReturnFullProteinString, // 0 = return accession only, 1 = return full description line unsigned int *iNumTotProteins, // matched protein count vector& vProteinTargets, // the target protein names - vector& vProteinDecoys); // the decoy protein names if applicable + vector& vProteinDecoys, // the decoy protein names if applicable + const vector& queries); static string GetField(std::string *s, unsigned int n, diff --git a/CometSearch/CometPeptideIndex.cpp b/CometSearch/CometPeptideIndex.cpp index 4dc547b4..9ac446f0 100644 --- a/CometSearch/CometPeptideIndex.cpp +++ b/CometSearch/CometPeptideIndex.cpp @@ -88,25 +88,24 @@ bool CometPeptideIndex::ReadPeptideIndex(void) comet_fileoffset_t lEndOfPeptides; comet_fileoffset_t clProteinsFilePos; - size_t 
tTmpRead; - tTmpRead = fread(&lEndOfPeptides, clSizeCometFileOffset, 1, fp); - tTmpRead = fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); + (void)fread(&lEndOfPeptides, clSizeCometFileOffset, 1, fp); + (void)fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); // --- Read the mass index and peptide count from lEndOfPeptides position --- comet_fseek(fp, lEndOfPeptides, SEEK_SET); int iMinMass, iMaxMass; uint64_t tNumPeptides; - tTmpRead = fread(&iMinMass, sizeof(int), 1, fp); - tTmpRead = fread(&iMaxMass, sizeof(int), 1, fp); - tTmpRead = fread(&tNumPeptides, sizeof(uint64_t), 1, fp); + (void)fread(&iMinMass, sizeof(int), 1, fp); + (void)fread(&iMaxMass, sizeof(int), 1, fp); + (void)fread(&tNumPeptides, sizeof(uint64_t), 1, fp); int iMaxPeptideMass10 = iMaxMass * 10; // Read the mass index array: lIndex[0..iMaxPeptideMass10-1] // Each entry is a file offset to the first peptide at that 0.1 Da mass bin comet_fileoffset_t* lIndex = new comet_fileoffset_t[iMaxPeptideMass10]; - tTmpRead = fread(lIndex, clSizeCometFileOffset, iMaxPeptideMass10, fp); + (void)fread(lIndex, clSizeCometFileOffset, iMaxPeptideMass10, fp); // --- Read protein names --- // Protein names are stored between end-of-header and clProteinsFilePos @@ -125,7 +124,7 @@ bool CometPeptideIndex::ReadPeptideIndex(void) comet_fseek(fp, clProteinsFilePos, SEEK_SET); size_t tNumProteinEntries; - tTmpRead = fread(&tNumProteinEntries, clSizeCometFileOffset, 1, fp); + (void)fread(&tNumProteinEntries, clSizeCometFileOffset, 1, fp); g_pvProteinsList.clear(); g_pvProteinsList.reserve(tNumProteinEntries); @@ -133,11 +132,11 @@ bool CometPeptideIndex::ReadPeptideIndex(void) for (size_t i = 0; i < tNumProteinEntries; ++i) { size_t tNumProteins; - tTmpRead = fread(&tNumProteins, clSizeCometFileOffset, 1, fp); + (void)fread(&tNumProteins, clSizeCometFileOffset, 1, fp); vector vTmp(tNumProteins); for (size_t j = 0; j < tNumProteins; ++j) - tTmpRead = fread(&vTmp[j], clSizeCometFileOffset, 1, fp); + 
(void)fread(&vTmp[j], clSizeCometFileOffset, 1, fp); g_pvProteinsList.push_back(std::move(vTmp)); } @@ -235,7 +234,8 @@ bool CometPeptideIndex::WritePeptideIndex(ThreadPool* tp) if (bSucceeded) { - bSucceeded = CometSearch::RunSearch(0, 0, tp); + vector emptyQueries; + bSucceeded = CometSearch::RunSearch(0, 0, tp, emptyQueries); } if (!bSucceeded) diff --git a/CometSearch/CometPostAnalysis.cpp b/CometSearch/CometPostAnalysis.cpp index f3e60334..00a899d7 100644 --- a/CometSearch/CometPostAnalysis.cpp +++ b/CometSearch/CometPostAnalysis.cpp @@ -176,18 +176,18 @@ CometPostAnalysis::~CometPostAnalysis() } -bool CometPostAnalysis::PostAnalysis(ThreadPool* tp) +bool CometPostAnalysis::PostAnalysis(ThreadPool* tp, const vector& queries) { bool bSucceeded = true; //Reuse existing ThreadPool ThreadPool *pPostAnalysisThreadPool = tp; - for (int i=0; i<(int)g_pvQuery.size(); ++i) + for (int i=0; i<(int)queries.size(); ++i) { - if (g_pvQuery.at(i)->iMatchPeptideCount > 0 || g_pvQuery.at(i)->iDecoyMatchPeptideCount > 0) + if (queries.at(i)->iMatchPeptideCount > 0 || queries.at(i)->iDecoyMatchPeptideCount > 0) { - PostAnalysisThreadData* pThreadData = new PostAnalysisThreadData(i); + PostAnalysisThreadData* pThreadData = new PostAnalysisThreadData(i, &queries); pPostAnalysisThreadPool->doJob(std::bind(PostAnalysisThreadProc, pThreadData, pPostAnalysisThreadPool)); @@ -223,7 +223,7 @@ void CometPostAnalysis::PostAnalysisThreadProc(PostAnalysisThreadData *pThreadDa (void)tp; // suppress unused parameter warning int iQueryIndex = pThreadData->iQueryIndex; - Query* pQuery = g_pvQuery.at(iQueryIndex); + Query* pQuery = pThreadData->pQueries->at(iQueryIndex); AnalyzeSP(pQuery); diff --git a/CometSearch/CometPostAnalysis.h b/CometSearch/CometPostAnalysis.h index aa001ced..50b926a1 100644 --- a/CometSearch/CometPostAnalysis.h +++ b/CometSearch/CometPostAnalysis.h @@ -24,15 +24,18 @@ struct PostAnalysisThreadData { int iQueryIndex; + const vector* pQueries; PostAnalysisThreadData() { 
iQueryIndex = -1; + pQueries = nullptr; } - PostAnalysisThreadData(int iQueryIndex_in) + PostAnalysisThreadData(int iQueryIndex_in, const vector* pQueries_in) { iQueryIndex = iQueryIndex_in; + pQueries = pQueries_in; } }; @@ -41,7 +44,7 @@ class CometPostAnalysis public: CometPostAnalysis(); ~CometPostAnalysis(); - static bool PostAnalysis(ThreadPool* tp); + static bool PostAnalysis(ThreadPool* tp, const vector& queries); static void PostAnalysisThreadProc(PostAnalysisThreadData* pThreadData, ThreadPool* tp); // Query*-based overloads, the only versions now diff --git a/CometSearch/CometPreprocess.cpp b/CometSearch/CometPreprocess.cpp index 867e2388..241804f3 100644 --- a/CometSearch/CometPreprocess.cpp +++ b/CometSearch/CometPreprocess.cpp @@ -646,7 +646,8 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, int iFirstScan, int iLastScan, int iAnalysisType, - ThreadPool* tp) + ThreadPool* tp, + SearchSession& session) { int iFileLastScan = -1; // The actual last scan in the file. 
int iScanNumber = 0; @@ -761,15 +762,16 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, if (CheckActivationMethodFilter(mstSpectrum.getActivationMethod())) { - // add this hack when 1 thread is specified otherwise g_pvQuery.size() returns 0 + // add this hack when 1 thread is specified otherwise session.queries.size() returns 0 if (g_staticParams.options.iNumThreads == 1) pPreprocessThreadPool->wait_on_threads(); - Threading::LockMutex(g_pvQueryMutex); - // this needed because processing can add multiple spectra at a time - iNumSpectraLoaded = (int)g_pvQuery.size(); - iNumSpectraLoaded++; - Threading::UnlockMutex(g_pvQueryMutex); + { + std::lock_guard lk(session.queriesMutex); + // this needed because processing can add multiple spectra at a time + iNumSpectraLoaded = (int)session.queries.size(); + iNumSpectraLoaded++; + } pPreprocessThreadPool->wait_for_available_thread(); @@ -778,6 +780,7 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, //run filter here. 
PreprocessThreadData *pPreprocessThreadData = new PreprocessThreadData(mstSpectrum, iAnalysisType, iFileLastScan); + pPreprocessThreadData->pSession = &session; pPreprocessThreadPool->doJob(std::bind(PreprocessThreadProc, pPreprocessThreadData, pPreprocessThreadPool)); } @@ -804,22 +807,18 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, } } - Threading::LockMutex(g_pvQueryMutex); - - if (CheckExit(iAnalysisType, - iScanNumber, - iTotalScans, - iLastScan, - mstReader.getLastScan(), - iNumSpectraLoaded, - 0)) - { - Threading::UnlockMutex(g_pvQueryMutex); - break; - } - else { - Threading::UnlockMutex(g_pvQueryMutex); + std::lock_guard lk(session.queriesMutex); + if (CheckExit(iAnalysisType, + iScanNumber, + iTotalScans, + iLastScan, + mstReader.getLastScan(), + iNumSpectraLoaded, + 0)) + { + break; + } } } @@ -836,7 +835,7 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, void CometPreprocess::PreprocessThreadProc(PreprocessThreadData *pPreprocessThreadData, - ThreadPool* tp) + ThreadPool* /*tp*/) { // This returns false if it fails, but the errors are already logged // so no need to check the return value here. @@ -882,7 +881,8 @@ void CometPreprocess::PreprocessThreadProc(PreprocessThreadData *pPreprocessThre ppdTmpCorrelationDataArr[i], ppfFastXcorrData[i], ppfFastXcorrDataNL[i], - ppfSpScoreData[i]); + ppfSpScoreData[i], + pPreprocessThreadData->pSession); delete pPreprocessThreadData; pPreprocessThreadData = NULL; @@ -890,7 +890,7 @@ void CometPreprocess::PreprocessThreadProc(PreprocessThreadData *pPreprocessThre void CometPreprocess::PreprocessThreadProcMS1(PreprocessThreadData* pPreprocessThreadDataMS1, - ThreadPool* tp, + ThreadPool* /*tp*/, const double dMaxQueryRT, const double dMaxSpecLibRT) { @@ -1866,23 +1866,22 @@ double* CometPreprocess::GetRtsRawDataBuffer() } -// Original public entry point: builds Query* via Core, then pushes into g_pvQuery. -// Preserves backward compatibility with existing callers. 
+// Original public entry point: builds Query* via Core, then pushes into session.queries. bool CometPreprocess::PreprocessSingleSpectrum(int iPrecursorCharge, double dMZ, double *pdMass, double *pdInten, int iNumPeaks, - double *pdTmpSpectrum) + double *pdTmpSpectrum, + SearchSession& session) { Query* pScoring = PreprocessSingleSpectrumCore(iPrecursorCharge, dMZ, pdMass, pdInten, iNumPeaks, pdTmpSpectrum); if (pScoring == nullptr) return false; - Threading::LockMutex(g_pvQueryMutex); - g_pvQuery.push_back(pScoring); - Threading::UnlockMutex(g_pvQueryMutex); + std::lock_guard lk(session.queriesMutex); + session.queries.push_back(pScoring); return true; } @@ -2023,7 +2022,8 @@ bool CometPreprocess::PreprocessSpectrum(Spectrum &spec, double *pdTmpCorrelationData, float *pfFastXcorrData, float *pfFastXcorrDataNL, - float *pfSpScoreData) + float *pfSpScoreData, + SearchSession* pSession) { int iScanNumber = spec.getScanNumber(); int iSpectrumCharge = 0; @@ -2236,9 +2236,8 @@ bool CometPreprocess::PreprocessSpectrum(Spectrum &spec, return false; } - Threading::LockMutex(g_pvQueryMutex); - g_pvQuery.push_back(pScoring); - Threading::UnlockMutex(g_pvQueryMutex); + std::lock_guard lk(pSession->queriesMutex); + pSession->queries.push_back(pScoring); } } } @@ -2804,7 +2803,8 @@ bool CometPreprocess::IsValidInputType(int inputType) bool CometPreprocess::PreprocessMS1SingleSpectrum(double* pdMass, double* pdInten, - int iNumPeaks) + int iNumPeaks, + SearchSession& session) { QueryMS1* pScoringMS1 = new QueryMS1(); @@ -2866,7 +2866,8 @@ bool CometPreprocess::PreprocessMS1SingleSpectrum(double* pdMass, pScoringMS1->iArraySizeMS1 = iArraySizeMS1; - g_pvQueryMS1.push_back(pScoringMS1); + std::lock_guard lk(session.queriesMutex); + session.ms1Queries.push_back(pScoringMS1); return true; } @@ -2949,7 +2950,7 @@ QueryMS1* CometPreprocess::PreprocessMS1SingleSpectrumThreadLocal(double* pdMass // Fused FI_DB batch worker: preprocess + RunSearch + post-analysis for one spectrum. 
// Uses per-thread g_rtsScratch scratch buffers (no shared batch pool contention). // iSlot is this worker thread's pre-assigned _ppbDuplFragmentArr index. -void CometPreprocess::FusedSearchSpectrum(Spectrum spec, int iSlot) +void CometPreprocess::FusedSearchSpectrum(Spectrum spec, int iSlot, SearchSession& session) { int iScanNumber = spec.getScanNumber(); int iSpectrumCharge = 0; @@ -3231,9 +3232,8 @@ void CometPreprocess::FusedSearchSpectrum(Spectrum spec, int iSlot) pScoring->vfRawFragmentPeakMass.clear(); pScoring->vfRawFragmentPeakMass.shrink_to_fit(); - Threading::LockMutex(g_pvQueryMutex); - g_pvQuery.push_back(pScoring); - Threading::UnlockMutex(g_pvQueryMutex); + std::lock_guard lk(session.queriesMutex); + session.queries.push_back(pScoring); } } } @@ -3247,12 +3247,12 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, int iFirstScan, int iLastScan, int iAnalysisType, - ThreadPool* tp) + ThreadPool* tp, + SearchSession& session) { int iFileLastScan = -1; int iScanNumber = 0; int iTotalScans = 0; - int iNumSpectraLoaded = 0; int iTmpCount = 0; Spectrum mstSpectrum; @@ -3269,11 +3269,11 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, for (int t = 0; t < iNumSlots; ++t) { - tp->doJob([&queue, t]() + tp->doJob([&queue, t, &session]() { Spectrum spec; while (queue.pop(spec)) - FusedSearchSpectrum(std::move(spec), t); + FusedSearchSpectrum(std::move(spec), t, session); }); } @@ -3359,7 +3359,6 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, if (CheckActivationMethodFilter(mstSpectrum.getActivationMethod())) { queue.push(std::move(mstSpectrum)); - iNumSpectraLoaded++; } } @@ -3380,14 +3379,14 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, } } - Threading::LockMutex(g_pvQueryMutex); - if (CheckExit(iAnalysisType, iScanNumber, iTotalScans, iLastScan, - mstReader.getLastScan(), iNumSpectraLoaded, 0)) { - Threading::UnlockMutex(g_pvQueryMutex); - break; + std::lock_guard 
lk(session.queriesMutex); + if (CheckExit(iAnalysisType, iScanNumber, iTotalScans, iLastScan, + mstReader.getLastScan(), (int)session.queries.size(), 0)) + { + break; + } } - Threading::UnlockMutex(g_pvQueryMutex); } Threading::DestroyMutex(_maxChargeMutex); diff --git a/CometSearch/CometPreprocess.h b/CometSearch/CometPreprocess.h index 67b664ea..c605cdf2 100644 --- a/CometSearch/CometPreprocess.h +++ b/CometSearch/CometPreprocess.h @@ -17,6 +17,7 @@ #define _COMETPREPROCESS_H_ #include "ThreadPool.h" +#include "search/SearchSession.h" struct PreprocessThreadData { @@ -24,16 +25,17 @@ struct PreprocessThreadData int iAnalysisType; int iFileLastScan; bool *pbMemoryPool; //MH: Manages active memory pool + SearchSession* pSession; PreprocessThreadData() - : mstSpectrum(), iAnalysisType(0), iFileLastScan(0), pbMemoryPool(nullptr) + : mstSpectrum(), iAnalysisType(0), iFileLastScan(0), pbMemoryPool(nullptr), pSession(nullptr) { } PreprocessThreadData(Spectrum& spec_in, int iAnalysisType_in, int iFileLastScan_in) - : mstSpectrum(spec_in), iAnalysisType(iAnalysisType_in), iFileLastScan(iFileLastScan_in), pbMemoryPool(nullptr) + : mstSpectrum(spec_in), iAnalysisType(iAnalysisType_in), iFileLastScan(iFileLastScan_in), pbMemoryPool(nullptr), pSession(nullptr) { } @@ -69,7 +71,8 @@ class CometPreprocess int iFirstScan, int iLastScan, int iAnalysisType, - ThreadPool* tp); + ThreadPool* tp, + SearchSession& session); static void PreprocessThreadProc(PreprocessThreadData *pPreprocessThreadData, ThreadPool* tp); static void PreprocessThreadProcMS1(PreprocessThreadData* pPreprocessThreadDataMS1, @@ -84,7 +87,8 @@ class CometPreprocess double *pdMass, double *pdInten, int iNumPeaks, - double *pdTmpSpectrum); + double *pdTmpSpectrum, + SearchSession& session); // Thread-local version: returns Query* without touching g_pvQuery. // Caller owns the returned Query* and must delete it when done. 
@@ -97,7 +101,8 @@ class CometPreprocess static bool PreprocessMS1SingleSpectrum(double* pdMass, double* pdInten, - int iNumPeaks); + int iNumPeaks, + SearchSession& session); // Thread-local version: returns QueryMS1* without touching g_pvQueryMS1. // Caller owns the returned QueryMS1* and must delete it when done. static QueryMS1* PreprocessMS1SingleSpectrumThreadLocal(double* pdMass, @@ -109,7 +114,7 @@ class CometPreprocess // Fused FI_DB batch path: preprocess + search + post-analysis for one spectrum // in a single pass using thread-local scratch buffers. iSlot is this worker's // pre-assigned _ppbDuplFragmentArr index. - static void FusedSearchSpectrum(Spectrum spec, int iSlot); + static void FusedSearchSpectrum(Spectrum spec, int iSlot, SearchSession& session); // Fused FI_DB batch path: stream spectra through a bounded producer/consumer // queue into FusedSearchSpectrum workers. Replaces LoadAndPreprocessSpectra + @@ -118,7 +123,8 @@ class CometPreprocess int iFirstScan, int iLastScan, int iAnalysisType, - ThreadPool* tp); + ThreadPool* tp, + SearchSession& session); // Returns the thread-local raw-data buffer used by PreprocessSingleSpectrumThreadLocal. 
// The buffer is sized to g_staticParams.iArraySizeGlobal and its content after a @@ -149,7 +155,8 @@ class CometPreprocess double *pdTmpCorrelationData, float *pfFastXcorrData, float *pfFastXcorrDataNL, - float *pfSpScoreData); + float *pfSpScoreData, + SearchSession* pSession); static bool AdjustMassTol(struct Query *pScoring); static bool CheckActivationMethodFilter(MSActivation act); static bool Preprocess(struct Query *pScoring, diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index 9e5a9170..aecbb874 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -15,14 +15,24 @@ #include "Common.h" #include "CometSearch.h" #include "CometFragmentIndexReader.h" +#include "threading/SearchMemoryPool.h" +#include #include #define BINARYSEARCHCUTOFF 20 // do linear search through FI if # entries is this or less -bool* CometSearch::_pbSearchMemoryPool = nullptr; bool** CometSearch::_ppbDuplFragmentArr = nullptr; +// Module-local pool instance. Owns the same scratch arrays as the +// legacy _pbSearchMemoryPool/_ppbDuplFragmentArr statics above. +// Both representations are kept in sync during the transition: +// AllocateMemory populates both; AcquirePoolSlot/releaseSlot use s_pool. +// TODO(Phase N): s_pool is a file-static singleton. Move it into a +// per-instance context (RtsContext / CometSearchManager member) before +// multiple concurrent RTS instances are viable. 
+static SearchMemoryPool s_pool; + extern comet_fileoffset_t clSizeCometFileOffset; @@ -44,50 +54,46 @@ CometSearch::~CometSearch() bool CometSearch::AllocateMemory(int maxNumThreads) { - if (g_bCometSearchMemoryAllocated) // already allocated + if (g_bCometSearchMemoryAllocated) return true; + if (!s_pool.allocate(maxNumThreads, g_staticParams.iArraySizeGlobal)) + return false; + try { - _pbSearchMemoryPool = new bool[maxNumThreads](); - _ppbDuplFragmentArr = new bool* [maxNumThreads]; - + _ppbDuplFragmentArr = new bool*[maxNumThreads]; for (int i = 0; i < maxNumThreads; ++i) - _ppbDuplFragmentArr[i] = new bool[g_staticParams.iArraySizeGlobal](); - - g_bCometSearchMemoryAllocated = true; - - return true; + _ppbDuplFragmentArr[i] = s_pool.duplFragmentArr(i); } catch (const std::bad_alloc& ba) { - string strErrorMsg = " Error - memory allocation failed. bad_alloc: " + std::string(ba.what()) + ".\n"; + string strErrorMsg = " Error - AllocateMemory alias arrays failed. bad_alloc: " + std::string(ba.what()) + ".\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg); - + s_pool.deallocate(); g_bCometSearchMemoryAllocated = false; - return false; } + + g_bCometSearchMemoryAllocated = true; + return true; } -bool CometSearch::DeallocateMemory(int maxNumThreads) +bool CometSearch::DeallocateMemory(int /*maxNumThreads*/) { if (!g_bCometSearchMemoryAllocated) return true; - delete [] _pbSearchMemoryPool; - - for (int i = 0; i < maxNumThreads; ++i) - { - delete [] _ppbDuplFragmentArr[i]; - } + s_pool.deallocate(); - delete [] _ppbDuplFragmentArr; + // _ppbDuplFragmentArr holds pointers into s_pool's scratch arrays; those + // are already freed by s_pool.deallocate(). Only free the alias array itself. 
+ delete[] _ppbDuplFragmentArr; + _ppbDuplFragmentArr = nullptr; g_bCometSearchMemoryAllocated = false; - return true; } @@ -96,23 +102,7 @@ bool CometSearch::DeallocateMemory(int maxNumThreads) // Returns the slot index (0..iNumThreads-1), or -1 on timeout. int CometSearch::AcquirePoolSlot() { - int i = -1; - std::unique_lock lock(g_searchMemoryPoolMutex); - - bool found = g_searchPoolCV.wait_for(lock, std::chrono::seconds(240), [&i]() { - for (int j = 0; j < g_staticParams.options.iNumThreads; ++j) - { - if (_pbSearchMemoryPool[j] == false) - { - _pbSearchMemoryPool[j] = true; - i = j; - return true; - } - } - return false; - }); - - return found ? i : -1; + return s_pool.acquireSlot(); } @@ -135,9 +125,8 @@ bool CometSearch::RunSearch(Query* pQuery) logerr(" Error - could not acquire memory pool slot for thread-local FI search.\n"); return false; } + SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; SearchFragmentIndex(pQuery, _ppbDuplFragmentArr[iSlot]); - { std::lock_guard lk(g_searchMemoryPoolMutex); _pbSearchMemoryPool[iSlot] = false; } - g_searchPoolCV.notify_one(); } else if (g_staticParams.iDbType == DbType::PI_DB) // peptide index { @@ -178,9 +167,8 @@ bool CometSearch::RunSearch(Query* pQuery) logerr(" Error - could not acquire memory pool slot for thread-local PI search.\n"); return false; } + SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; SearchPeptideIndex(pQuery, _ppbDuplFragmentArr[iSlot]); - { std::lock_guard lk(g_searchMemoryPoolMutex); _pbSearchMemoryPool[iSlot] = false; } - g_searchPoolCV.notify_one(); } else { @@ -202,87 +190,52 @@ bool CometSearch::RunSearch(Query* pQuery, int iSlot) } -// called by DoSingleSpectrumSearchMultiResults -bool CometSearch::RunSearch(ThreadPool *tp) -{ - CometSearch sqSearch; - size_t iWhichQuery = 0; - - if (g_staticParams.iDbType == DbType::FI_DB) // fragment ion index - { - if (!g_bPlainPeptideIndexRead) - { - CometFragmentIndex sqFI; - sqFI.ReadPlainPeptideIndex(); - sqFI.CreateFragmentIndex(tp); - } - - int 
iSlot = AcquirePoolSlot(); - if (iSlot < 0) - { - logerr(" Error - could not acquire memory pool slot for single-query FI search.\n"); - return false; - } - SearchFragmentIndex(g_pvQuery.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); - { std::lock_guard lk(g_searchMemoryPoolMutex); _pbSearchMemoryPool[iSlot] = false; } - g_searchPoolCV.notify_one(); - } - else if (g_staticParams.iDbType == DbType::PI_DB) // peptide index - { - sqSearch.SearchPeptideIndex(tp); - } - else - { - string strErrorMsg = " Error - index search but iDbType = " + std::to_string(static_cast(g_staticParams.iDbType)) + "\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - - return true; -} - - bool CometSearch::RunSearch(int iPercentStart, int iPercentEnd, - ThreadPool* tp) + ThreadPool* tp, + vector& queries) { bool bSucceeded = true; if (g_staticParams.iDbType == DbType::FI_DB) { - CometFragmentIndex* sqFI = new CometFragmentIndex(); - CometSearch* sqSearch = new CometSearch(); + CometFragmentIndex sqFI; if (!g_bPlainPeptideIndexRead) { - sqFI->ReadPlainPeptideIndex(); - sqFI->CreateFragmentIndex(tp); + sqFI.ReadPlainPeptideIndex(); + sqFI.CreateFragmentIndex(tp); } - delete sqFI; - ThreadPool* pSearchThreadPool = tp; - size_t iEnd = g_pvQuery.size(); + size_t iEnd = queries.size(); + std::atomic bAllSlotsAcquired(true); for (size_t iWhichQuery = 0; iWhichQuery < iEnd; ++iWhichQuery) { - pSearchThreadPool->doJob([iWhichQuery]() { + pSearchThreadPool->doJob([iWhichQuery, &queries, &bAllSlotsAcquired]() { int iSlot = AcquirePoolSlot(); if (iSlot < 0) { logerr(" Error - could not acquire memory pool slot for batch FI search thread.\n"); + bAllSlotsAcquired = false; return; } - SearchFragmentIndex(g_pvQuery.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); - { std::lock_guard lk(g_searchMemoryPoolMutex); _pbSearchMemoryPool[iSlot] = false; } - g_searchPoolCV.notify_one(); + SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; + 
SearchFragmentIndex(queries.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); }); } pSearchThreadPool->wait_on_threads(); + if (!bAllSlotsAcquired) + { + string strErrorMsg = " Error - one or more batch FI search queries could not acquire a memory pool slot.\n"; + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + bSucceeded = false; + } + if (!g_staticParams.options.bOutputSqtStream && !(g_staticParams.databaseInfo.iTotalNumProteins % 500)) { char szTmp[128]; @@ -292,14 +245,12 @@ bool CometSearch::RunSearch(int iPercentStart, logout("\b\b\b\b"); } - delete sqSearch; return bSucceeded; } else if (g_staticParams.iDbType == DbType::PI_DB) { - CometSearch* sqSearch = new CometSearch(); - sqSearch->SearchPeptideIndex(tp); - delete sqSearch; + CometSearch sqSearch; + sqSearch.SearchPeptideIndex(tp, queries); return bSucceeded; } else @@ -935,7 +886,7 @@ bool CometSearch::RunSearch(int iPercentStart, // Now search sequence entry; add threading here so that // each protein sequence is passed to a separate thread. 
- SearchThreadData *pSearchThreadData = new SearchThreadData(dbe); + SearchThreadData *pSearchThreadData = new SearchThreadData(dbe, &queries); pSearchThreadPool->doJob(std::bind(SearchThreadProc, pSearchThreadData, pSearchThreadPool)); @@ -1003,24 +954,17 @@ bool CometSearch::RunSearch(int iPercentStart, } -bool CometSearch::RunSpecLibSearch(ThreadPool* tp) -{ - printf("OK in RunSpecLib\n"); - - return true; -} - - -bool CometSearch::RunSpecLibSearch(int iPercentStart, - int iPercentEnd, - ThreadPool* tp) +bool CometSearch::RunSpecLibSearch(int /*iPercentStart*/, + int /*iPercentEnd*/, + ThreadPool* /*tp*/, + vector& queries) { // to fill g_vulSpecLibPrecursorIndex, set // binmin = BINPREC(expmass - tol) // binmax = BINPREC(expmass + tol) // then for (i=binmin; i<=binmax; ++i) {g_vulSpecLibPrecursorIndex[i].push_back(entry)} - for (vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) + for (vector::iterator it = queries.begin(); it != queries.end(); ++it) { int iBinExpMass = BINPREC((*it)->_pepMassInfo.dExpPepMass); @@ -1047,16 +991,18 @@ bool CometSearch::RunMS1Search(ThreadPool* tp, double dRT, double dMaxMS1RTDiff, const double dMaxSpecLibRT, - const double dMaxQueryRT) + const double dMaxQueryRT, + vector& ms1Queries) { ThreadPool* pRunMS1SearchThreadPool = tp; - for (size_t iWhichMS1Query = 0; iWhichMS1Query < g_pvQueryMS1.size(); ++iWhichMS1Query) + for (size_t iWhichMS1Query = 0; iWhichMS1Query < ms1Queries.size(); ++iWhichMS1Query) { + QueryMS1* pMS1Query = ms1Queries.at(iWhichMS1Query); // for each query, thread the search by segmenting the library for (int iWhichThread = 0; iWhichThread < g_staticParams.options.iNumThreads; ++iWhichThread) { - pRunMS1SearchThreadPool->doJob(std::bind(SearchMS1Library, iWhichMS1Query, iWhichThread, dRT, + pRunMS1SearchThreadPool->doJob(std::bind(SearchMS1Library, pMS1Query, iWhichThread, dRT, dMaxMS1RTDiff, dMaxSpecLibRT, dMaxQueryRT, pRunMS1SearchThreadPool)); } } @@ -1070,7 +1016,7 @@ bool 
CometSearch::RunMS1Search(ThreadPool* tp, // the read-only g_vSpecLib entries within the RT window. Populates the output // scores vector with up to topN best matches. Zero shared mutable state. bool CometSearch::RunMS1Search(QueryMS1* pQueryMS1, - const int topN, + const int /*topN*/, double dRT, double dMaxMS1RTDiff, const double dMaxSpecLibRT, @@ -1261,44 +1207,24 @@ bool CometSearch::MapOBO(string strMod, void CometSearch::SearchThreadProc(SearchThreadData *pSearchThreadData, - ThreadPool* tp) + ThreadPool* /*tp*/) { - int i = -1; - - // Grab available array from shared memory pool. - { - std::unique_lock lock(g_searchMemoryPoolMutex); - bool found = g_searchPoolCV.wait_for(lock, std::chrono::seconds(240), [&i]() { - for (int j = 0; j < g_staticParams.options.iNumThreads; ++j) - { - if (_pbSearchMemoryPool[j] == false) - { - _pbSearchMemoryPool[j] = true; - i = j; - return true; - } - } - return false; - }); - if (!found) - i = g_staticParams.options.iNumThreads; // sentinel: timeout - } + int i = AcquirePoolSlot(); - if (i < 0 || i == g_staticParams.options.iNumThreads) + if (i < 0) { logerr(" Error - could not find available memory pool for MS2 search thread.\n"); return; } - // Give memory manager access to the thread. - pSearchThreadData->pbSearchMemoryPool = &_pbSearchMemoryPool[i]; + SearchMemoryPoolSlotGuard guard{s_pool, i}; // Heap-allocate to avoid thread stack overflow: CometSearch has ~295 KB of // member arrays (_uiBinnedIonMasses, etc.) that would exhaust the 1 MB thread // stack in debug builds when combined with the deep DoSearch call chain. 
CometSearch* sqSearch = new CometSearch(); sqSearch->_iSlot = i; - sqSearch->DoSearch(pSearchThreadData->dbEntry, _ppbDuplFragmentArr[i]); + sqSearch->DoSearch(pSearchThreadData->dbEntry, _ppbDuplFragmentArr[i], *pSearchThreadData->pQueries); delete sqSearch; delete pSearchThreadData; @@ -1307,8 +1233,11 @@ void CometSearch::SearchThreadProc(SearchThreadData *pSearchThreadData, bool CometSearch::DoSearch(sDBEntry dbe, - bool *pbDuplFragment) + bool *pbDuplFragment, + const vector& queries) { + _pQueries = &queries; + if (g_staticParams.options.bFastPlainPeptideIdx) { _seenShort.clear(); @@ -1877,12 +1806,16 @@ void CometSearch::SearchFragmentIndex(Query* pQuery, } -bool CometSearch::SearchPeptideIndex(ThreadPool* tp) +bool CometSearch::SearchPeptideIndex(ThreadPool* /*tp*/, vector& queries) { comet_fileoffset_t lEndOfStruct; FILE* fp; - size_t tTmp; + // BinarySearchMass() and AnalyzePeptideIndex() read the query list through + // _pQueries rather than a parameter (mirroring CometSearch::DoSearch()); without + // this assignment _pQueries stays nullptr on a freshly constructed CometSearch + // instance and the first dereference below segfaults. 
+ _pQueries = &queries; CometPostAnalysis cpa; @@ -1929,15 +1862,15 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp) comet_fileoffset_t clProteinsFilePos; comet_fseek(fp, -clSizeCometFileOffset * 2, SEEK_END); - tTmp = fread(&lEndOfStruct, clSizeCometFileOffset, 1, fp); - tTmp = fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); + (void)fread(&lEndOfStruct, clSizeCometFileOffset, 1, fp); + (void)fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); if (!g_bPeptideIndexRead) { // now read in: vector> g_pvProteinsList comet_fseek(fp, clProteinsFilePos, SEEK_SET); size_t tSize; - tTmp = fread(&tSize, clSizeCometFileOffset, 1, fp); + (void)fread(&tSize, clSizeCometFileOffset, 1, fp); vector vTmp; g_pvProteinsList.clear(); @@ -1945,12 +1878,12 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp) for (size_t it = 0; it < tSize; ++it) { size_t tNumProteinOffsets; - tTmp = fread(&tNumProteinOffsets, clSizeCometFileOffset, 1, fp); + (void)fread(&tNumProteinOffsets, clSizeCometFileOffset, 1, fp); vTmp.clear(); for (size_t it2 = 0; it2 < tNumProteinOffsets; ++it2) { - tTmp = fread(&clTmp, clSizeCometFileOffset, 1, fp); + (void)fread(&clTmp, clSizeCometFileOffset, 1, fp); vTmp.push_back(clTmp); } g_pvProteinsList.push_back(vTmp); @@ -1969,9 +1902,9 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp) // seek to index comet_fseek(fp, lEndOfStruct, SEEK_SET); - tTmp = fread(&iMinMass, sizeof(int), 1, fp); - tTmp = fread(&iMaxMass, sizeof(int), 1, fp); - tTmp = fread(&tNumPeptides, sizeof(uint64_t), 1, fp); + (void)fread(&iMinMass, sizeof(int), 1, fp); + (void)fread(&iMaxMass, sizeof(int), 1, fp); + (void)fread(&tNumPeptides, sizeof(uint64_t), 1, fp); // sanity checks if (iMinMass < 0 || iMinMass > 20000 || iMaxMass < 0 || iMaxMass > 20000) @@ -1988,7 +1921,7 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp) for (int i = 0; i < iMaxPeptideMass10; ++i) lReadIndex[i] = -1; - tTmp = fread(lReadIndex, sizeof(comet_fileoffset_t), iMaxPeptideMass10, fp); + 
(void)fread(lReadIndex, sizeof(comet_fileoffset_t), iMaxPeptideMass10, fp); int iStart = (int)(g_massRange.dMinMass - 0.5); // smallest mass/index start int iEnd = (int)(g_massRange.dMaxMass + 0.5); // largest mass/index end @@ -2034,16 +1967,14 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp) // compatibility with standard search in StorePeptide dbe.lProteinFilePosition = sDBI.lIndexProteinFilePosition; - ThreadPool* pSearchThreadPool = tp; - while ((int)(sDBI.dPepMass * 10) <= iEnd10) { if (sDBI.dPepMass > g_massRange.dMaxMass) break; - int iWhichQuery = BinarySearchMass(0, (int)g_pvQuery.size(), sDBI.dPepMass); + int iWhichQuery = BinarySearchMass(0, (int)queries.size(), sDBI.dPepMass); - while (iWhichQuery > 0 && g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= sDBI.dPepMass) + while (iWhichQuery > 0 && queries.at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= sDBI.dPepMass) iWhichQuery--; // Do the search @@ -2072,25 +2003,6 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp) } } -/* - for (vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) - { - int iNumMatchedPeptides = (*it)->iMatchPeptideCount; - if (iNumMatchedPeptides > g_staticParams.options.iNumStored) - iNumMatchedPeptides = g_staticParams.options.iNumStored; - - for (int x = 0; x < iNumMatchedPeptides; x++) - { - printf("OK %d scan %d, pep %s, xcorr %f, mass %f, matchcount %d\n", x, - (*it)->_spectrumInfoInternal.iScanNumber, - (*it)->_pResults[x].szPeptide, - (*it)->_pResults[x].fXcorr, - (*it)->_pResults[x].dPepMass, - (*it)->iMatchPeptideCount; fflush(stdout); - } - } -*/ - delete[] lReadIndex; std::fclose(fp); return true; @@ -2768,9 +2680,9 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, } // Compare calculated fragment ions against all matching query spectra. 
- while (iWhichQuery < (int)g_pvQuery.size()) + while (iWhichQuery < (int)_pQueries->size()) { - if (sDBI.dPepMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + if (sDBI.dPepMass < _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) { // If calculated mass is smaller than low mass range. break; @@ -2910,7 +2822,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -2981,7 +2893,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3133,7 +3045,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3204,7 +3116,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, 
// Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3266,22 +3178,22 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, } -void CometSearch::SearchMS1Library(size_t iWhichMS1Query, +void CometSearch::SearchMS1Library(QueryMS1* pMS1Query, const int iWhichThread, const double dRT, const double dMaxMS1RTDiff, const double dMaxSpecLibRT, const double dMaxQueryRT, - ThreadPool* tp) + ThreadPool* /*tp*/) { unsigned int iStart = BINPREC(g_staticParams.options.dMS1MinMass); - // Given iWhichMS1Query, this search will run through a subset of the library entries + // Given pMS1Query, this search will run through a subset of the library entries for (size_t iWhichMS1LibEntry = iWhichThread; iWhichMS1LibEntry < g_vSpecLib.size(); iWhichMS1LibEntry += g_staticParams.options.iNumThreads) { double dScore = 0.0; - unsigned int uiArrayLimit = g_pvQueryMS1.at(iWhichMS1Query)->iArraySizeMS1; + unsigned int uiArrayLimit = pMS1Query->iArraySizeMS1; if (uiArrayLimit > g_vSpecLib.at(iWhichMS1LibEntry).uiArraySizeMS1) uiArrayLimit = g_vSpecLib.at(iWhichMS1LibEntry).uiArraySizeMS1; @@ -3289,20 +3201,20 @@ void CometSearch::SearchMS1Library(size_t iWhichMS1Query, { for (unsigned int i = iStart; i < uiArrayLimit; ++i) { - dScore += g_pvQueryMS1.at(iWhichMS1Query)->pfFastXcorrData[i] * g_vSpecLib.at(iWhichMS1LibEntry).pfUnitVector[i]; + dScore += pMS1Query->pfFastXcorrData[i] * g_vSpecLib.at(iWhichMS1LibEntry).pfUnitVector[i]; } - if (dScore > g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.fDotProduct) + if (dScore > pMS1Query->_pSpecLibResultsMS1.fDotProduct) { - 
Threading::LockMutex(g_pvQueryMutex); - if (dScore > g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.fDotProduct) + Threading::LockMutex(pMS1Query->accessMutex); + if (dScore > pMS1Query->_pSpecLibResultsMS1.fDotProduct) { - g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.fDotProduct = (float)dScore; + pMS1Query->_pSpecLibResultsMS1.fDotProduct = (float)dScore; // scale back to reference RT - g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.fRTime = (float)(g_vSpecLib.at(iWhichMS1LibEntry).fRTime * dMaxSpecLibRT / dMaxQueryRT); - g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.iWhichSpecLib = g_vSpecLib.at(iWhichMS1LibEntry).iLibEntry; + pMS1Query->_pSpecLibResultsMS1.fRTime = (float)(g_vSpecLib.at(iWhichMS1LibEntry).fRTime * dMaxSpecLibRT / dMaxQueryRT); + pMS1Query->_pSpecLibResultsMS1.iWhichSpecLib = g_vSpecLib.at(iWhichMS1LibEntry).iLibEntry; } - Threading::UnlockMutex(g_pvQueryMutex); + Threading::UnlockMutex(pMS1Query->accessMutex); } } else if (g_vSpecLib.at(iWhichMS1LibEntry).fRTime > dRT + dMaxMS1RTDiff) @@ -3758,9 +3670,9 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, bool bFirstTimeThroughLoopForPeptide = true; // Compare calculated fragment ions against all matching query spectra. - while (iWhichQuery < (int)g_pvQuery.size()) + while (iWhichQuery < (int)_pQueries->size()) { - if (dCalcPepMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + if (dCalcPepMass < _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) { // If calculated mass is smaller than low mass range. 
break; @@ -3827,7 +3739,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3867,7 +3779,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3979,7 +3891,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -4020,7 +3932,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { 
double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -4333,11 +4245,11 @@ int CometSearch::WithinMassTolerance(double dCalcPepMass, // proper enzyme termini, check if within mass tolerance of any given entry. // Do a binary search on list of input queries to find matching mass. - int iPos = BinarySearchMass(0, (int)g_pvQuery.size(), dCalcPepMass); + int iPos = BinarySearchMass(0, (int)_pQueries->size(), dCalcPepMass); // Seek back to first peptide entry that matches mass tolerance in case binary // search doesn't hit the first entry. - while (iPos > 0 && g_pvQuery.at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) + while (iPos > 0 && _pQueries->at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) iPos--; if (iPos != -1) @@ -4395,11 +4307,11 @@ bool CometSearch::WithinMassTolerancePeff(double dCalcPepMass, // of any entry. If so, simply return true here and will repeat the PEFF permutations later. // Do a binary search on list of input queries to find matching mass. - int iPos = BinarySearchMass(0, (int)g_pvQuery.size(), dCalcPepMass + dMassAddition); + int iPos = BinarySearchMass(0, (int)_pQueries->size(), dCalcPepMass + dMassAddition); // Seek back to first peptide entry that matches mass tolerance in case binary // search doesn't hit the first entry. 
- while (iPos > 0 && g_pvQuery.at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) + while (iPos > 0 && _pQueries->at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass + dMassAddition) iPos--; if (iPos != -1) @@ -4604,18 +4516,18 @@ int CometSearch::BinarySearchMass(int start, double dCalcPepMass) const { auto it = std::lower_bound( - g_pvQuery.begin() + start, - g_pvQuery.begin() + end, + _pQueries->begin() + start, + _pQueries->begin() + end, dCalcPepMass, [](const Query* query, double mass) { return query->_pepMassInfo.dPeptideMassTolerancePlus < mass; }); - if (it != g_pvQuery.begin() + end + if (it != _pQueries->begin() + end && (*it)->_pepMassInfo.dPeptideMassToleranceMinus <= dCalcPepMass && dCalcPepMass <= (*it)->_pepMassInfo.dPeptideMassTolerancePlus) { - return static_cast(std::distance(g_pvQuery.begin(), it)); + return static_cast(std::distance(_pQueries->begin(), it)); } return -1; @@ -4668,7 +4580,7 @@ size_t CometSearch::BinarySearchIndexMass(size_t start, bool CometSearch::CheckMassMatch(size_t iWhichQuery, double dCalcPepMass) { - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = _pQueries->at(iWhichQuery); int iMassOffsetsSize = (int)g_staticParams.vectorMassOffsets.size(); @@ -5055,7 +4967,7 @@ void CometSearch::XcorrScore(char* szProteinSeq, int iWhichIonSeries; bool bUseWaterAmmoniaNLPeaks = false; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = _pQueries->at(iWhichQuery); float** ppSparseFastXcorrData; // use this if bSparseMatrix @@ -5135,7 +5047,7 @@ void CometSearch::XcorrScore(char* szProteinSeq, ppSparseFastXcorrData = pQuery->ppfSparseFastXcorrData; for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (int ctZ = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctZ >= 1; --ctZ) + for (int ctZ = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctZ >= 1; --ctZ) { bin = *(*(*p_uiBinnedPrecursorNL + ctNL) + ctZ); @@ -5232,7 +5144,7 @@ 
void CometSearch::StorePeptide(size_t iWhichQuery, int i; int iLenPeptide; int iLenPeptide2; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = _pQueries->at(iWhichQuery); if (dXcorr < g_staticParams.options.dMinimumXcorr) return; @@ -5681,7 +5593,7 @@ int CometSearch::CheckDuplicate(int iWhichQuery, int iLenPeptide = iEndPos - iStartPos + 1; int iLenProteinMinus1 = (int)strlen(szProteinSeq) - 1; int bIsDuplicate = 0; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = _pQueries->at(iWhichQuery); if (g_staticParams.options.iDecoySearch == 2 && bDecoyPep) { @@ -7526,11 +7438,11 @@ bool CometSearch::MergeVarMods(char* szProteinSeq, // Need to check if mass is ok // Do a binary search on list of input queries to find matching mass. - iWhichQuery = BinarySearchMass(0, (int)g_pvQuery.size(), dTmpCalcPepMass); + iWhichQuery = BinarySearchMass(0, (int)_pQueries->size(), dTmpCalcPepMass); // Seek back to first peptide entry that matches mass tolerance in case binary // search doesn't hit the first entry. 
- while (iWhichQuery > 0 && g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) + while (iWhichQuery > 0 && _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) iWhichQuery--; // Only if this PEFF mod (plus possible variable mods) is within mass tolerance, continue @@ -7655,9 +7567,9 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, // Compare calculated fragment ions against all matching query spectra - while (iWhichQuery < (int)g_pvQuery.size()) + while (iWhichQuery < (int)_pQueries->size()) { - if (dCalcPepMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + if (dCalcPepMass < _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) { // if calculated mass is smaller than low mass range, it // means we reached candidate peptides that are too big @@ -7861,7 +7773,7 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, // initialize precursorNL for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -7944,7 +7856,7 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; @@ -8197,7 +8109,7 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, 
// initialize precursorNL for decoy for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -8269,7 +8181,7 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -8651,114 +8563,217 @@ void CometSearch::StorePeptideI(Query* pQuery, int iLenPeptide = iEndPos - iStartPos + 1; int iLenProteinMinus1 = (int)strlen(szProteinSeq) - 1; - short siLowestXcorrScoreIndex = pQuery->siLowestXcorrScoreIndex; + int iSizepiVarModSites = sizeof(int) * MAX_PEPTIDE_LEN_P2; + int iSizepdVarModSites = sizeof(double) * MAX_PEPTIDE_LEN_P2; - pQuery->iMatchPeptideCount++; - pQuery->_pResults[siLowestXcorrScoreIndex].usiLenPeptide = iLenPeptide; + if (g_staticParams.options.iDecoySearch == 2 && bDecoyPep) + { + short siLowestDecoyXcorrScoreIndex = pQuery->siLowestDecoyXcorrScoreIndex; - memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide, szProteinSeq + iStartPos, iLenPeptide * sizeof(char)); - pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide[iLenPeptide] = '\0'; - pQuery->_pResults[siLowestXcorrScoreIndex].dPepMass = dCalcPepMass; + pQuery->iDecoyMatchPeptideCount++; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].usiLenPeptide = 
iLenPeptide; - if (pQuery->_spectrumInfoInternal.usiChargeState > 2) - { - pQuery->_pResults[siLowestXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) - * pQuery->_spectrumInfoInternal.usiMaxFragCharge - * g_staticParams.ionInformation.iNumIonSeriesUsed; - } - else - { - pQuery->_pResults[siLowestXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) - * g_staticParams.ionInformation.iNumIonSeriesUsed; - } + memcpy(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].szPeptide, szProteinSeq + iStartPos, iLenPeptide * sizeof(char)); + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].szPeptide[iLenPeptide] = '\0'; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].dPepMass = dCalcPepMass; - pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr = (float)dXcorr; - pQuery->_pResults[siLowestXcorrScoreIndex].bClippedM = false; + if (pQuery->_spectrumInfoInternal.usiChargeState > 2) + { + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) + * pQuery->_spectrumInfoInternal.usiMaxFragCharge + * g_staticParams.ionInformation.iNumIonSeriesUsed; + } + else + { + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) + * g_staticParams.ionInformation.iNumIonSeriesUsed; + } - if (iStartPos == 0) - pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = '-'; - else - pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = szProteinSeq[iStartPos - 1]; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].fXcorr = (float)dXcorr; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].bClippedM = false; - if (iEndPos == iLenProteinMinus1) - pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = '-'; - else - pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = szProteinSeq[iEndPos + 1]; + if (iStartPos == 0) + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cPrevAA = '-'; + else + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cPrevAA = szProteinSeq[iStartPos - 1]; + + if (iEndPos == iLenProteinMinus1) + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cNextAA = '-'; + 
else + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cNextAA = szProteinSeq[iEndPos + 1]; - pQuery->_pResults[siLowestXcorrScoreIndex].iPeffOrigResiduePosition = NO_PEFF_VARIANT; - pQuery->_pResults[siLowestXcorrScoreIndex].sPeffOrigResidues.clear(); - pQuery->_pResults[siLowestXcorrScoreIndex].iPeffNewResidueCount = 0; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].iPeffOrigResiduePosition = NO_PEFF_VARIANT; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].sPeffOrigResidues.clear(); + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].iPeffNewResidueCount = 0; - pQuery->_pResults[siLowestXcorrScoreIndex].pWhichProtein.clear(); - pQuery->_pResults[siLowestXcorrScoreIndex].pWhichDecoyProtein.clear(); - pQuery->_pResults[siLowestXcorrScoreIndex].lProteinFilePosition = dbe->lProteinFilePosition; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pWhichProtein.clear(); + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pWhichDecoyProtein.clear(); + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].lProteinFilePosition = dbe->lProteinFilePosition; - pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_None; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cHasVariableMod = HasVariableModType_None; - int iSizepiVarModSites = sizeof(int) * MAX_PEPTIDE_LEN_P2; - int iSizepdVarModSites = sizeof(double) * MAX_PEPTIDE_LEN_P2; + if (g_staticParams.variableModParameters.bVarModSearch) + { + if (!iFoundVariableMod) + { + memset(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); + memset(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + } + else + { + memcpy(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].piVarModSites, piVarModSites, iSizepiVarModSites); + + int iVal; + for (int i = 0; i < iLenPeptide + 2; ++i) + { + iVal = pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].piVarModSites[i]; + + if (iVal > 0) + { + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pdVarModSites[i] = 
g_staticParams.variableModParameters.varModList[iVal - 1].dVarModMass; - if (g_staticParams.variableModParameters.bVarModSearch) + if (g_staticParams.options.iPrintAScoreProScore == -1 + || (g_staticParams.options.iPrintAScoreProScore > 0 && iVal == g_AScoreOptions.getSymbol() - '0')) + { + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cHasVariableMod = HasVariableModType_AScorePro; + } + else if (pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cHasVariableMod == HasVariableModType_None) + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cHasVariableMod = HasVariableModType_True; + } + else + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pdVarModSites[i] = 0.0; + } + } + } + else + { + memset(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); + memset(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + } + + // Get new lowest decoy score. + pQuery->dLowestDecoyXcorrScore = pQuery->_pDecoys[0].fXcorr; + siLowestDecoyXcorrScoreIndex = 0; + + for (short siA = (short)(g_staticParams.options.iNumStored - 1); siA > 0; --siA) + { + if (pQuery->_pDecoys[siA].fXcorr < pQuery->dLowestDecoyXcorrScore || pQuery->_pDecoys[siA].usiLenPeptide == 0) + { + pQuery->dLowestDecoyXcorrScore = pQuery->_pDecoys[siA].fXcorr; + siLowestDecoyXcorrScoreIndex = siA; + } + } + + pQuery->siLowestDecoyXcorrScoreIndex = siLowestDecoyXcorrScoreIndex; + } + else { - if (!iFoundVariableMod) + short siLowestXcorrScoreIndex = pQuery->siLowestXcorrScoreIndex; + + pQuery->iMatchPeptideCount++; + pQuery->_pResults[siLowestXcorrScoreIndex].usiLenPeptide = iLenPeptide; + + memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide, szProteinSeq + iStartPos, iLenPeptide * sizeof(char)); + pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide[iLenPeptide] = '\0'; + pQuery->_pResults[siLowestXcorrScoreIndex].dPepMass = dCalcPepMass; + + if (pQuery->_spectrumInfoInternal.usiChargeState > 2) { - 
memset(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); - memset(pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + pQuery->_pResults[siLowestXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) + * pQuery->_spectrumInfoInternal.usiMaxFragCharge + * g_staticParams.ionInformation.iNumIonSeriesUsed; } else { - memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, piVarModSites, iSizepiVarModSites); + pQuery->_pResults[siLowestXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) + * g_staticParams.ionInformation.iNumIonSeriesUsed; + } + + pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr = (float)dXcorr; + pQuery->_pResults[siLowestXcorrScoreIndex].bClippedM = false; + + if (iStartPos == 0) + pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = '-'; + else + pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = szProteinSeq[iStartPos - 1]; + + if (iEndPos == iLenProteinMinus1) + pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = '-'; + else + pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = szProteinSeq[iEndPos + 1]; + + pQuery->_pResults[siLowestXcorrScoreIndex].iPeffOrigResiduePosition = NO_PEFF_VARIANT; + pQuery->_pResults[siLowestXcorrScoreIndex].sPeffOrigResidues.clear(); + pQuery->_pResults[siLowestXcorrScoreIndex].iPeffNewResidueCount = 0; + + pQuery->_pResults[siLowestXcorrScoreIndex].pWhichProtein.clear(); + pQuery->_pResults[siLowestXcorrScoreIndex].pWhichDecoyProtein.clear(); + pQuery->_pResults[siLowestXcorrScoreIndex].lProteinFilePosition = dbe->lProteinFilePosition; + + pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_None; - for (int i = 0; i < iLenPeptide + 2; ++i) + if (g_staticParams.variableModParameters.bVarModSearch) + { + if (!iFoundVariableMod) { - if (piVarModSites[i] > 0) - pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[piVarModSites[i] - 1].dVarModMass; - else - 
pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = 0.0; + memset(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); + memset(pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); } - - int iVal; - for (int i = 0; i < iLenPeptide + 2; ++i) + else { - iVal = pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites[i]; + memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, piVarModSites, iSizepiVarModSites); - if (iVal > 0) + for (int i = 0; i < iLenPeptide + 2; ++i) { - pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[iVal - 1].dVarModMass; + if (piVarModSites[i] > 0) + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[piVarModSites[i] - 1].dVarModMass; + else + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = 0.0; + } + + int iVal; + for (int i = 0; i < iLenPeptide + 2; ++i) + { + iVal = pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites[i]; - if (g_staticParams.options.iPrintAScoreProScore == -1 - || (g_staticParams.options.iPrintAScoreProScore > 0 && iVal == g_AScoreOptions.getSymbol() - '0')) + if (iVal > 0) { - pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_AScorePro; + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[iVal - 1].dVarModMass; + + if (g_staticParams.options.iPrintAScoreProScore == -1 + || (g_staticParams.options.iPrintAScoreProScore > 0 && iVal == g_AScoreOptions.getSymbol() - '0')) + { + pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_AScorePro; + } + else if (pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod == HasVariableModType_None) + pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_True; } - else if (pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod == 
HasVariableModType_None) - pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_True; + else + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = 0.0; } - else - pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = 0.0; } } - } - else - { - memset(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); - memset(pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); - } + else + { + memset(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); + memset(pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + } - // Get new lowest score. - pQuery->dLowestXcorrScore = pQuery->_pResults[0].fXcorr; - siLowestXcorrScoreIndex = 0; + // Get new lowest score. + pQuery->dLowestXcorrScore = pQuery->_pResults[0].fXcorr; + siLowestXcorrScoreIndex = 0; - for (int i = g_staticParams.options.iNumStored - 1; i > 0; --i) - { - if (pQuery->_pResults[i].fXcorr < pQuery->dLowestXcorrScore || pQuery->_pResults[i].usiLenPeptide == 0) + for (int i = g_staticParams.options.iNumStored - 1; i > 0; --i) { - pQuery->dLowestXcorrScore = pQuery->_pResults[i].fXcorr; - siLowestXcorrScoreIndex = i; + if (pQuery->_pResults[i].fXcorr < pQuery->dLowestXcorrScore || pQuery->_pResults[i].usiLenPeptide == 0) + { + pQuery->dLowestXcorrScore = pQuery->_pResults[i].fXcorr; + siLowestXcorrScoreIndex = i; + } } - } - pQuery->siLowestXcorrScoreIndex = siLowestXcorrScoreIndex; + pQuery->siLowestXcorrScoreIndex = siLowestXcorrScoreIndex; + } } @@ -8882,9 +8897,9 @@ void CometSearch::CompoundModSearch(char *szProteinSeq, bool bFirstTime = true; - while (iWhichQuery < (int)g_pvQuery.size()) + while (iWhichQuery < (int)_pQueries->size()) { - if (dModMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + if (dModMass < _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) break; if (CheckMassMatch(iWhichQuery, dModMass)) diff 
--git a/CometSearch/CometSearch.h b/CometSearch/CometSearch.h index 8301cac7..6f1a3f64 100644 --- a/CometSearch/CometSearch.h +++ b/CometSearch/CometSearch.h @@ -39,22 +39,15 @@ struct SearchThreadData { sDBEntry dbEntry; - bool* pbSearchMemoryPool; ThreadPool* tp; + const vector* pQueries; - SearchThreadData() = default; - SearchThreadData(const sDBEntry& dbEntry_in) - : dbEntry(dbEntry_in), pbSearchMemoryPool(nullptr), tp(nullptr) { + SearchThreadData(const sDBEntry& dbEntry_in, const vector* pQueries_in) + : dbEntry(dbEntry_in), tp(nullptr), pQueries(pQueries_in) { } ~SearchThreadData() { - if (pbSearchMemoryPool) - { - { std::lock_guard lk(g_searchMemoryPoolMutex); *pbSearchMemoryPool = false; } - g_searchPoolCV.notify_one(); - pbSearchMemoryPool = nullptr; - } dbEntry.vectorPeffMod.clear(); dbEntry.vectorPeffVariantSimple.clear(); } @@ -73,8 +66,8 @@ class CometSearch static bool RunSearch(int iPercentStart, int iPercentEnd, - ThreadPool* tp); - static bool RunSearch(ThreadPool* tp); + ThreadPool* tp, + vector& queries); // Task 1.3: Thread-local overload: searches a caller-owned Query* without // touching g_pvQuery. Allocates its own pbDuplFragment scratch buffer. @@ -86,13 +79,14 @@ class CometSearch static bool RunSpecLibSearch(int iPercentStart, int iPercentEnd, - ThreadPool* tp); - static bool RunSpecLibSearch(ThreadPool* tp); + ThreadPool* tp, + vector& queries); static bool RunMS1Search(ThreadPool* tp, double dRT, double dMaxMS1RTDiff, const double dMaxSpecLibRT, - const double dMaxQueryRT); + const double dMaxQueryRT, + vector& ms1Queries); // Thread-local overload: searches a caller-owned QueryMS1* against read-only g_vSpecLib. // No global mutable state accessed. 
static bool RunMS1Search(QueryMS1* pQueryMS1, @@ -106,7 +100,7 @@ class CometSearch static void SearchThreadProc(SearchThreadData* pSearchThreadData, ThreadPool* tp); - bool DoSearch(sDBEntry dbe, bool* pbDuplFragment); + bool DoSearch(sDBEntry dbe, bool* pbDuplFragment, const vector& queries); // Performance: Mark as const where possible bool CheckEnzymeTermini(const char* szProteinSeq, @@ -120,13 +114,13 @@ class CometSearch int BinarySearchMass(int start, int end, double dCalcPepMass) const; - static bool CheckMassMatch(size_t iWhichQuery, - double dCalcPepMass); + bool CheckMassMatch(size_t iWhichQuery, + double dCalcPepMass); // Task 1.2: Thread-local overload accepting Query* directly. static bool CheckMassMatch(Query* pQuery, double dCalcPepMass); - bool SearchPeptideIndex(ThreadPool* tp); + bool SearchPeptideIndex(ThreadPool* tp, vector& queries); struct ProteinInfo { @@ -311,7 +305,7 @@ class CometSearch bool TranslateNA2AA(int* frame, int iDirection, char* sDNASequence); - static void SearchMS1Library(size_t iWhichMS1Query, + static void SearchMS1Library(QueryMS1* pMS1Query, const int iWhichThread, const double dRT, const double dMaxMS1RTDiff, @@ -378,10 +372,10 @@ class CometSearch static int AcquirePoolSlot(); // Spin-wait for a free slot; returns index or -1 on timeout - static bool *_pbSearchMemoryPool; // Pool of memory to be shared by search threads static bool **_ppbDuplFragmentArr; // Number of arrays equals number of threads int _iSlot = -1; // pool slot index; set by SearchThreadProc before DoSearch + const vector* _pQueries = nullptr; // batch query list; set before FASTA/PI search std::unordered_set _seenShort; // per-protein dedup for len <= 12 (bFastPlainPeptideIdx) std::unordered_set _seenLong; // per-protein dedup for len > 12 (bFastPlainPeptideIdx) }; diff --git a/CometSearch/CometSearch.vcxproj b/CometSearch/CometSearch.vcxproj index a2068b26..81952f45 100644 --- a/CometSearch/CometSearch.vcxproj +++ b/CometSearch/CometSearch.vcxproj @@ 
-47,7 +47,7 @@ Level3 Disabled - ..\MSToolkit\include;..\MSToolkit\include\extern;..\AScorePro\include + $(ProjectDir);..\MSToolkit\include;..\MSToolkit\include\extern;..\AScorePro\include Fast WIN32;WIN64;_WIN64;_MBCS;_CRT_SECURE_NO_DEPRECATE;_NOSQLITE;NOMINMAX;_HAS_STD_BYTE=0;RTS_TIMING_OFF;%(PreprocessorDefinitions) ProgramDatabase @@ -64,7 +64,7 @@ MaxSpeed false true - ..\MSToolkit\include;..\MSToolkit\include\extern;..\AScorePro\include + $(ProjectDir);..\MSToolkit\include;..\MSToolkit\include\extern;..\AScorePro\include AnySuitable Speed true @@ -116,6 +116,23 @@ + + + + + + + + + + + + + + + + + @@ -136,6 +153,12 @@ + + + + + + diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 61d4c606..a696eee3 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -23,6 +23,12 @@ #include "CometWritePepXML.h" #include "CometWriteMzIdentML.h" #include "CometWritePercolator.h" +#include "output/IResultWriter.h" +#include "output/SqtWriter.h" +#include "output/TxtWriter.h" +#include "output/PepXmlWriter.h" +#include "output/MzIdentMlWriter.h" +#include "output/PercolatorWriter.h" #include "CometDataInternal.h" #include "CometSearchManager.h" #include "CometStatus.h" @@ -32,6 +38,13 @@ #include "CometAlignment.h" #include "AScoreOptions.h" #include "AScoreFactory.h" +#include "search/SearchSession.h" +#include "search/SearchUtils.h" +#include "search/ISearchStrategy.h" +#include "search/FiStrategy.h" +#include "search/FastaStrategy.h" +#include "search/PiStrategy.h" +#include "search/Pipeline.h" #include #include @@ -40,14 +53,6 @@ extern comet_fileoffset_t clSizeCometFileOffset; -std::vector g_pvQuery; - -// g_pvQueryMS1: BATCH PATH ONLY - used by RunMS1Search(ThreadPool*,...) and -// PreprocessMS1SingleSpectrum(). The single-spectrum MS1 search path -// (DoMS1SearchMultiResults) uses thread-local QueryMS1* objects and never -// reads or writes this vector. 
Do not access from concurrent search threads. -std::vector g_pvQueryMS1; - std::vector g_pvInputFiles; StaticParams g_staticParams; vector g_pvDBIndex; @@ -57,14 +62,12 @@ MassRange g_massRange; Mutex g_pvQueryMutex; Mutex g_pvDBIndexMutex; Mutex g_preprocessMemoryPoolMutex; -Mutex g_searchMemoryPoolMutex; Mutex g_ms1AlignerMutex; CometStatus g_cometStatus; string g_sCometVersion; map g_pvProteinNames; // for either db index unordered_map g_pvProteinNameCache; // populated at index load; eliminates per-spectrum fopen in RTS path -std::condition_variable g_searchPoolCV; // signaled when a pool slot is released AScoreProCpp::AScoreOptions g_AScoreOptions; // AScore options // Thread-safety note - g_AScoreInterface is shared across PostAnalysis threads. @@ -146,242 +149,6 @@ static std::string GetHostName() return {}; } -static InputType GetInputType(const char *pszFileName) -{ - int iLen = (int)strlen(pszFileName); - - if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 6, ".mzXML") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 5, ".mzML") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 9, ".mzXML.gz") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 8, ".mzML.gz")) - - { - return InputType_MZXML; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".raw")) - { - return InputType_RAW; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".ms2") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 5, ".cms2")) - { - return InputType_MS2; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".mgf")) - { - return InputType_MGF; - } - - return InputType_UNKNOWN; -} - -static bool UpdateInputFile(InputFileInfo *pFileInfo) -{ - bool bUpdateBaseName = false; - char szTmpBaseName[SIZE_FILE]; - - // Make sure not set on command line OR more than 1 input file - // Need to do this check here before g_staticParams.inputFile is set to *pFileInfo - if (g_staticParams.inputFile.szBaseName[0] =='\0' || g_pvInputFiles.size()>1) - bUpdateBaseName = true; - else - strcpy(szTmpBaseName, 
g_staticParams.inputFile.szBaseName); - - g_staticParams.inputFile = *pFileInfo; - - g_staticParams.inputFile.iInputType = GetInputType(g_staticParams.inputFile.szFileName); - - if (InputType_UNKNOWN == g_staticParams.inputFile.iInputType) - { - return false; - } - - // per request, perform quick check to validate file still exists - // to avoid creating stub output files in these cases. - FILE *fp; - if ( (fp=fopen(g_staticParams.inputFile.szFileName, "r"))==NULL) - { - string strErrorMsg = " Error - cannot read input file \"" + string(g_staticParams.inputFile.szFileName) + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - else - { - fclose(fp); - } - -#ifndef CRUX - if (bUpdateBaseName) // set individual basename from input file - { - char *pStr; - int iLen = (int)strlen(g_staticParams.inputFile.szFileName); - - strcpy(g_staticParams.inputFile.szBaseName, g_staticParams.inputFile.szFileName); - - if ( (pStr = strrchr(g_staticParams.inputFile.szBaseName, '.'))) - *pStr = '\0'; - - if (!STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 9, ".mzXML.gz") - || !STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 8, ".mzML.gz")) - { - if ( (pStr = strrchr(g_staticParams.inputFile.szBaseName, '.'))) - *pStr = '\0'; - } - } - else - { - strcpy(g_staticParams.inputFile.szBaseName, szTmpBaseName); // set basename from command line - } -#endif - - return true; -} - -static void SetMSLevelFilter(MSReader &mstReader) -{ - vector msLevel; - - if (g_staticParams.options.iMSLevel == 3) - msLevel.push_back(MS3); - else if (g_staticParams.options.iMSLevel == 2) - msLevel.push_back(MS2); - else if (g_staticParams.options.iMSLevel == 1) - msLevel.push_back(MS1); - - mstReader.setFilter(msLevel); -} - -// Allocate memory for the _pResults struct for each g_pvQuery entry. 
-static bool AllocateResultsMem() -{ - for (std::vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) - { - Query* pQuery = *it; - - try - { - pQuery->_pResults = new Results[g_staticParams.options.iNumStored]; - } - catch (std::bad_alloc& ba) - { - string strErrorMsg = " Error - new(_pResults[]). bad_alloc: \"" + std::string(ba.what()) + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - - if (g_staticParams.options.iDecoySearch==2) - { - try - { - pQuery->_pDecoys = new Results[g_staticParams.options.iNumStored]; - } - catch (std::bad_alloc& ba) - { - string strErrorMsg = " Error - new(_pDecoys[]). bad_alloc: " + std::string(ba.what()) + "\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - } - - pQuery->iMatchPeptideCount = 0; - pQuery->iDecoyMatchPeptideCount = 0; - - for (int j=0; j_pResults[j].dPepMass = 0.0; - pQuery->_pResults[j].dExpect = 999; - pQuery->_pResults[j].fScoreSp = 0.0; - pQuery->_pResults[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; - pQuery->_pResults[j].fAScorePro = 0.0; - pQuery->_pResults[j].usiLenPeptide = 0; - pQuery->_pResults[j].usiRankSp = 0; - pQuery->_pResults[j].usiMatchedIons = 0; - pQuery->_pResults[j].usiTotalIons = 0; - pQuery->_pResults[j].szPeptide[0] = '\0'; - pQuery->_pResults[j].sAScoreProSiteScores.clear(); - pQuery->_pResults[j].pWhichProtein.clear(); - pQuery->_pResults[j].sPeffOrigResidues.clear(); - pQuery->_pResults[j].iPeffOrigResiduePosition = -9; - memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram)); - - if (g_staticParams.options.iDecoySearch) - pQuery->_pResults[j].pWhichDecoyProtein.clear(); - - if (g_staticParams.options.iDecoySearch==2) - { - pQuery->_pDecoys[j].dPepMass = 0.0; - pQuery->_pDecoys[j].dExpect = 999; - pQuery->_pDecoys[j].fScoreSp = 0.0; - pQuery->_pDecoys[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; - 
pQuery->_pDecoys[j].fAScorePro = 0.0; - pQuery->_pDecoys[j].usiLenPeptide = 0; - pQuery->_pDecoys[j].usiRankSp = 0; - pQuery->_pDecoys[j].usiMatchedIons = 0; - pQuery->_pDecoys[j].usiTotalIons = 0; - pQuery->_pDecoys[j].szPeptide[0] = '\0'; - pQuery->_pDecoys[j].sAScoreProSiteScores.clear(); - pQuery->_pDecoys[j].pWhichProtein.clear(); - pQuery->_pDecoys[j].sPeffOrigResidues.clear(); - pQuery->_pDecoys[j].iPeffOrigResiduePosition = -9; - } - } - } - - return true; -} - -// Allocate memory for the _pSpecLibResults struct for each g_pvQueryMS1 entry. -static bool AllocateResultsMemMS1() -{ -/* - for (std::vector::iterator it = g_pvQueryMS1.begin(); it != g_pvQueryMS1.end(); ++it) - { - QueryMS1* pQueryMS1 = *it; - - try - { - pQueryMS1->_pSpecLibResultsMS1 = new SpecLibResultsMS1[g_staticParams.options.iNumStored]; - } - catch (std::bad_alloc& ba) - { - string strErrorMsg = " Error - new(_pSpecLibResults[]). bad_alloc: " + std::string(ba.what()) + "\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - for (int j=0; j_pSpecLibResultsMS1[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; - pQueryMS1->_pSpecLibResultsMS1[j].fCn = 0; - pQueryMS1->_pSpecLibResultsMS1[j].fRTime = 0; - } - - } -*/ - return true; -} - -static bool compareByPeptideMass(Query const* a, Query const* b) -{ - return (a->_pepMassInfo.dExpPepMass < b->_pepMassInfo.dExpPepMass); -} - -static bool compareByMangoIndex(Query const* a, Query const* b) -{ - return (a->dMangoIndex < b->dMangoIndex); -} - -static bool compareByScanNumber(Query const* a, Query const* b) -{ - // sort by charge state if same scan number - if (a->_spectrumInfoInternal.iScanNumber == b->_spectrumInfoInternal.iScanNumber) - return (a->_spectrumInfoInternal.usiChargeState < b->_spectrumInfoInternal.usiChargeState); - return (a->_spectrumInfoInternal.iScanNumber < b->_spectrumInfoInternal.iScanNumber); -} - static bool ValidateOutputFormat() { if 
(!g_staticParams.options.bOutputSqtStream @@ -578,9 +345,6 @@ CometSearchManager::CometSearchManager() : // Initialize the mutex we'll use to protect the preprocess memory pool Threading::InitMutex(&g_preprocessMemoryPoolMutex); - // Initialize the mutex we'll use to protect the search memory pool - Threading::InitMutex(&g_searchMemoryPoolMutex); - // Initialize the mutex we'll use to protect the MS1 RT aligner Threading::InitMutex(&g_ms1AlignerMutex); @@ -597,7 +361,7 @@ CometSearchManager::CometSearchManager() : CometSearchManager::~CometSearchManager() { - // Destroy the mutex we used to protect g_pvQuery. + // Destroy the mutex we used to protect g_pvQueryMutex. Threading::DestroyMutex(g_pvQueryMutex); // Destroy the mutex we used to protect g_pvDBIndex. @@ -606,9 +370,6 @@ CometSearchManager::~CometSearchManager() // Destroy the mutex we used to protect the preprocess memory pool Threading::DestroyMutex(g_preprocessMemoryPoolMutex); - // Destroy the mutex we used to protect the search memory pool - Threading::DestroyMutex(g_searchMemoryPoolMutex); - // Destroy the mutex we used to protect the MS1 RT aligner Threading::DestroyMutex(g_ms1AlignerMutex); @@ -2182,8 +1943,6 @@ bool CometSearchManager::DoSearch() ThreadPool *tp = _tp; - auto tGlobalStartTime = chrono::steady_clock::now(); - if (!InitializeStaticParams()) return false; @@ -2347,936 +2106,43 @@ bool CometSearchManager::DoSearch() return bSucceeded; // index written; caller (InitializeSingleSpectrumSearch) will load it } - bool bBlankSearchFile = false; - - if (g_bPerformDatabaseSearch && g_staticParams.iDbType == DbType::FI_DB) - { - if (!g_staticParams.options.iFragIndexSkipReadPrecursors) - { - // read precursors before creating fragment index - auto tTime1 = chrono::steady_clock::now(); - if (!g_staticParams.options.bOutputSqtStream) - { - cout << " - read precursors ... 
"; - fflush(stdout); - } - - for (int i = 0; i < (int)g_pvInputFiles.size(); ++i) - { - bSucceeded = UpdateInputFile(g_pvInputFiles.at(i)); - if (!bSucceeded) - break; - - // For file access using MSToolkit. - MSReader mstReader; - - // We want to read only MS2/MS3 scans. - SetMSLevelFilter(mstReader); - - CometPreprocess::Reset(); - - bSucceeded = CometPreprocess::ReadPrecursors(mstReader); - } - - if (!g_staticParams.options.bOutputSqtStream) - cout << CometMassSpecUtils::ElapsedTime(tTime1) << endl; - } - } + // AScore initialization happens inside Pipeline::run(), after the strategy has + // loaded its database/index -- see the comment there for why the ordering matters. if (g_bPerformSpecLibSearch) - { CometSpecLib::LoadSpecLib(g_staticParams.speclibInfo.strSpecLibFile); - } - - bool bPerformAScoreInitialization = true; - - for (int i = 0; i < (int)g_pvInputFiles.size(); ++i) - { - bSucceeded = UpdateInputFile(g_pvInputFiles.at(i)); - if (!bSucceeded) - break; - - time_t tStartTime; - time(&tStartTime); - strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tStartTime)); - - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - strOut = " Search start: " + string(g_staticParams.szDate) + "\n"; - strOut += " - Input file: " + string(g_staticParams.inputFile.szFileName) + "\n"; - logout(strOut); - fflush(stdout); - } - - int iFirstScan = g_staticParams.inputFile.iFirstScan; // First scan to search specified by user. - int iLastScan = g_staticParams.inputFile.iLastScan; // Last scan to search specified by user. 
- int iPercentStart = 0; // percentage within input file for start scan of batch - int iPercentEnd = 0; // percentage within input file for end scan of batch - int iAnalysisType = g_staticParams.inputFile.iAnalysisType; // 1=dta (retired), - // 2=specific scan, - // 3=specific scan + charge, - // 4=scan range, - // 5=entire file - - // For SQT & pepXML output file, check if they can be written to before doing anything else. - FILE *fpout_sqt=NULL; - FILE *fpoutd_sqt=NULL; - FILE *fpout_pepxml=NULL; - FILE *fpoutd_pepxml=NULL; - FILE *fpout_mzidentml=NULL; - FILE *fpoutd_mzidentml=NULL; - FILE *fpout_mzidentmltmp=NULL; - FILE *fpoutd_mzidentmltmp=NULL; - FILE *fpout_percolator=NULL; - FILE *fpout_txt=NULL; - FILE *fpoutd_txt=NULL; - - std::string sOutputSQT; - std::string sOutputDecoySQT; - std::string sOutputPepXML; - std::string sOutputDecoyPepXML; - std::string sOutputMzIdentML; - std::string sOutputDecoyMzIdentML; - std::string sOutputMzIdentMLtmp; // temporary file used to hold mzIdentML output before finalizing - std::string sOutputDecoyMzIdentMLtmp; // temporary file used to hold decoy mzIdentML output before finalizing - std::string sOutputPercolator; - std::string sOutputTxt; - std::string sOutputDecoyTxt; - - if (g_staticParams.options.bOutputSqtFile) - { - if (iAnalysisType == AnalysisType_EntireFile) - { - sOutputSQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".sqt"; - -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - { - sOutputSQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".target.sqt"; - } -#endif - } - else - { - sOutputSQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".sqt"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputSQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." 
+ std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".target.sqt"; -#endif - } - - if ((fpout_sqt = fopen(sOutputSQT.c_str(), "w")) == NULL) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputSQT + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - CometWriteSqt::PrintSqtHeader(fpout_sqt, *this); - - if (bSucceeded && (g_staticParams.options.iDecoySearch == 2)) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputDecoySQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".decoy.sqt"; - else - sOutputDecoySQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".decoy.sqt"; - - if ((fpoutd_sqt = fopen(sOutputDecoySQT.c_str(), "w")) == NULL) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoySQT + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - CometWriteSqt::PrintSqtHeader(fpoutd_sqt, *this); - } - } - - if (bSucceeded && g_staticParams.options.bOutputTxtFile) - { - if (iAnalysisType == AnalysisType_EntireFile) - { - sOutputTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + "." + g_staticParams.szTxtFileExt; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".target." + g_staticParams.szTxtFileExt; -#endif - } - else - { - sOutputTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + "." 
+ g_staticParams.szTxtFileExt; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".target." + g_staticParams.szTxtFileExt; -#endif - } - - if ((fpout_txt = fopen(sOutputTxt.c_str(), "w")) == NULL) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputTxt + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - CometWriteTxt::PrintTxtHeader(fpout_txt); - fflush(fpout_txt); - - if (bSucceeded && (g_staticParams.options.iDecoySearch == 2)) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputDecoyTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".decoy." + g_staticParams.szTxtFileExt; - else - sOutputDecoyTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".decoy." 
+ g_staticParams.szTxtFileExt; - - fpoutd_txt = fopen(sOutputDecoyTxt.c_str(), "w"); - if (!fpoutd_txt) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoyTxt + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - CometWriteTxt::PrintTxtHeader(fpoutd_txt); - } - } - - if (bSucceeded && g_staticParams.options.bOutputPepXMLFile) - { - if (iAnalysisType == AnalysisType_EntireFile) - { - sOutputPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".pep.xml"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".target.pep.xml"; -#endif - } - else - { - sOutputPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".pep.xml"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".target.pep.xml"; -#endif - } - - fpout_pepxml = fopen(sOutputPepXML.c_str(), "w"); - if (!fpout_pepxml) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputPepXML + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - if (bSucceeded) - bSucceeded = CometWritePepXML::WritePepXMLHeader(fpout_pepxml, *this); - - if (bSucceeded && (g_staticParams.options.iDecoySearch == 2)) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputDecoyPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".decoy.pep.xml"; - else - sOutputDecoyPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." 
+ std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".decoy.pep.xml"; - - fpoutd_pepxml = fopen(sOutputDecoyPepXML.c_str(), "w"); - if (!fpoutd_pepxml) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoyPepXML + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - if (bSucceeded) - bSucceeded = CometWritePepXML::WritePepXMLHeader(fpoutd_pepxml, *this); - } - } - - if (bSucceeded && g_staticParams.options.iOutputMzIdentMLFile) - { - if (iAnalysisType == AnalysisType_EntireFile) - { - sOutputMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".mzid"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".target.mzid"; -#endif - } - else - { - sOutputMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".mzid"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".target.mzid"; -#endif - } - - fpout_mzidentml = fopen(sOutputMzIdentML.c_str(), "w"); - if (!fpout_mzidentml) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputMzIdentML + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } + // Build search session with run-level flags. 
+ SearchSession session(g_cometStatus); + session.bPerformDatabaseSearch = g_bPerformDatabaseSearch; + session.bPerformSpecLibSearch = g_bPerformSpecLibSearch; - sOutputMzIdentMLtmp = sOutputMzIdentML + ".XXXXXX"; -#ifdef _WIN32 - errno_t err = _mktemp_s(&sOutputMzIdentMLtmp[0], sOutputMzIdentMLtmp.size() + 1); - if (err != 0) - { - string strErrorMsg = " Error - cannot create temporary file \"" + sOutputMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } -#else - int iRet = mkstemp(&sOutputMzIdentMLtmp[0]); - if (iRet == -1) - { - string strErrorMsg = " Error - cannot create temporary file \"" + sOutputMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } -#endif + // Select strategy and create writers, then run the pipeline. + std::unique_ptr pStrategy; + if (g_staticParams.iDbType == DbType::FI_DB) + pStrategy = std::make_unique(); + else if (g_staticParams.iDbType == DbType::PI_DB) + pStrategy = std::make_unique(); + else + pStrategy = std::make_unique(); + + // PepXML, mzIdentML, Percolator, Txt first; SQT last (WriteSqt modifies szMod). 
+ std::vector> vWriters; + if (g_staticParams.options.bOutputPepXMLFile) + vWriters.push_back(std::make_unique()); + if (g_staticParams.options.iOutputMzIdentMLFile) + vWriters.push_back(std::make_unique(this)); + if (g_staticParams.options.bOutputPercolatorFile) + vWriters.push_back(std::make_unique()); + if (g_staticParams.options.bOutputTxtFile) + vWriters.push_back(std::make_unique()); + if (g_staticParams.options.bOutputSqtFile || g_staticParams.options.bOutputSqtStream) + vWriters.push_back(std::make_unique()); + + Pipeline pipeline(std::move(pStrategy), std::move(vWriters), this); + bSucceeded = pipeline.run(session, g_pvInputFiles, *tp); - fpout_mzidentmltmp = fopen(sOutputMzIdentMLtmp.c_str(), "w"); - if (!fpout_mzidentmltmp) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - if (bSucceeded && (g_staticParams.options.iDecoySearch == 2)) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputDecoyMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".decoy.mzid"; - else - sOutputDecoyMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." 
+ std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".decoy.mzid"; - - fpoutd_mzidentml = fopen(sOutputDecoyMzIdentML.c_str(), "w"); - if (!fpoutd_mzidentml) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoyMzIdentML + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - sOutputDecoyMzIdentMLtmp = sOutputDecoyMzIdentML + ".XXXXXX"; -#ifdef _WIN32 - errno_t err = _mktemp_s(&sOutputDecoyMzIdentMLtmp[0], sOutputDecoyMzIdentMLtmp.size() + 1); - if (err != 0) - { - string strErrorMsg = " Error - cannot create temporary file \"" + sOutputDecoyMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } -#else - int iRet = mkstemp(&sOutputDecoyMzIdentMLtmp[0]); - if (iRet == -1) - { - string strErrorMsg = " Error - cannot create temporary file \"" + sOutputDecoyMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } -#endif - fpoutd_mzidentmltmp = fopen(sOutputDecoyMzIdentMLtmp.c_str(), "w"); - if (!fpoutd_mzidentmltmp) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoyMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - } - } - - if (bSucceeded && g_staticParams.options.bOutputPercolatorFile) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputPercolator = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".pin"; - else - sOutputPercolator = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." 
+ std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".pin"; - - fpout_percolator = fopen(sOutputPercolator.c_str(), "w"); - if (!fpout_percolator) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputPercolator + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - if (bSucceeded) - CometWritePercolator::WritePercolatorHeader(fpout_percolator); - } - - int iTotalSpectraSearched = 0; - if (bSucceeded) - { - //MH: Allocate memory shared by threads during spectral processing. - bSucceeded = CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads); - if (!bSucceeded) - break; - - // Allocate memory shared by threads during search - bSucceeded = CometSearch::AllocateMemory(g_staticParams.options.iNumThreads); - if (!bSucceeded) - break; - - // For file access using MSToolkit. - MSReader mstReader; - - // We want to read only MS2/MS3 scans. - SetMSLevelFilter(mstReader); - - // We need to reset some of the static variables in-between input files - CometPreprocess::Reset(); - - FILE* fpfasta = NULL; // pointer to FASTA file; if .idx search, FASTA is used to retrieve sequences (mzid output) - FILE* fpidx = NULL; // pointer to .idx file if used - - if (g_bPerformDatabaseSearch) - { - string sTmpDB = g_staticParams.databaseInfo.szDatabase; - - if (g_staticParams.iDbType != DbType::FASTA_DB) - { - // .idx db so first open .idx file - if ((fpidx = fopen(sTmpDB.c_str(), "r")) == NULL) - { - string strErrorMsg = " Error (1a) - cannot read .idx file \"" + sTmpDB + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - - // .idx db so next check if FASTA is present (not required) - sTmpDB = sTmpDB.erase(sTmpDB.size() - 4); // need plain fasta if indexdb input - if ((fpfasta = fopen(sTmpDB.c_str(), "r")) == NULL) - { - g_bIdxNoFasta = true; - fpfasta = NULL; - } - } - else - { - // FASTA search only - fpidx = NULL; 
- - if ((fpfasta = fopen(sTmpDB.c_str(), "r")) == NULL) - { - string strErrorMsg = " Error (1b) - cannot read sequence database file \"" + sTmpDB + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - } - } - - if (g_staticParams.options.iSpectrumBatchSize == 0 && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(" - Reading all spectra into memory; set \"spectrum_batch_size\" if search terminates here.\n"); - fflush(stdout); - } - - CometFragmentIndex sqSearch; - - if (g_bPerformDatabaseSearch && g_staticParams.iDbType == DbType::FI_DB) - { - if (!g_bPlainPeptideIndexRead) - { - auto tStartTime = chrono::steady_clock::now(); - if (!g_staticParams.options.bOutputSqtStream) - { - cout << " - read .idx ... "; - fflush(stdout); - } - - sqSearch.ReadPlainPeptideIndex(); - - if (!g_staticParams.options.bOutputSqtStream) - { - cout << CometMassSpecUtils::ElapsedTime(tStartTime) << endl; - } - - sqSearch.CreateFragmentIndex(tp); - } - } - - if (g_staticParams.options.iPrintAScoreProScore && bPerformAScoreInitialization) - { - SetAScoreOptions(g_AScoreOptions); -// PrintAScoreOptions(g_AScoreOptions); - - // Create the AScoreDllInterface using the factory function - g_AScoreInterface = CreateAScoreDllInterface(); - if (!g_AScoreInterface) - { - std::cerr << "Failed to create AScore interface." << std::endl; - exit(1); - } - - bPerformAScoreInitialization = false; - } - - auto tBeginTime = chrono::steady_clock::now(); - if (g_staticParams.iDbType != DbType::FASTA_DB) - { - printf(" - searching \"%s\" ... 
", g_staticParams.inputFile.szBaseName); - fflush(stdout); - } - - FILE* fpdb = NULL; - if (g_bPerformDatabaseSearch) - { - if (g_staticParams.iDbType != DbType::FASTA_DB) - fpdb = fpidx; - else - fpdb = fpfasta; - } - - int iBatchNum = 0; - while (!CometPreprocess::DoneProcessingAllSpectra()) // Loop through iMaxSpectraPerSearch - { - iBatchNum++; - - // Fused FI_DB path: read + preprocess + search + post-analysis per spectrum - // in one pass using per-thread scratch buffers and a lock-free dispatch loop. - // Excludes Mango and spectral-library paths which rely on legacy ordering. - bool bFusedFIDB = (g_staticParams.iDbType == DbType::FI_DB - && g_bPerformDatabaseSearch - && !g_staticParams.options.bMango - && !g_bPerformSpecLibSearch); - - if (bFusedFIDB) - { - // IMPORTANT: From this point onwards, because we've loaded some - // spectra, we MUST "goto cleanup_results" before exiting the loop, - // or we will create a memory leak! - g_cometStatus.SetStatusMsg(string("Running fused FI_DB search...")); - - bSucceeded = CometPreprocess::FusedLoadAndSearchSpectra(mstReader, iFirstScan, iLastScan, iAnalysisType, tp); - - if (!bSucceeded) - goto cleanup_results; - - iPercentStart = iPercentEnd; - iPercentEnd = mstReader.getPercent(); - - if (g_pvQuery.empty()) - continue; - - iTotalSpectraSearched += (int)g_pvQuery.size(); - } - else - { - // Legacy three-sweep path: LoadAndPreprocess -> AllocateResults -> - // sort-by-mass -> RunSearch -> PostAnalysis. - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(" - Load spectra:"); - fflush(stdout); - } - - g_cometStatus.SetStatusMsg(string("Loading and processing input spectra")); - - // IMPORTANT: From this point onwards, because we've loaded some - // spectra, we MUST "goto cleanup_results" before exiting the loop, - // or we will create a memory leak! 
- - bSucceeded = CometPreprocess::LoadAndPreprocessSpectra(mstReader, iFirstScan, iLastScan, iAnalysisType, tp); - - if (!bSucceeded) - goto cleanup_results; - - iPercentStart = iPercentEnd; - iPercentEnd = mstReader.getPercent(); - - if (g_pvQuery.empty()) - continue; //FIX make sure continue instead of break makes sense - else // possible no spectrum in batch passes filters; do not want to break in that case; - iTotalSpectraSearched += (int)g_pvQuery.size(); - - bSucceeded = AllocateResultsMem(); - - if (!bSucceeded) - goto cleanup_results; - - { // need strStatusMsg in it's own scope due to goto statement above - string strStatusMsg = " " + std::to_string(g_pvQuery.size()) + string("\n"); - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(strStatusMsg); - } - g_cometStatus.SetStatusMsg(strStatusMsg); - } - - if (g_staticParams.options.bMango) - { - int iCurrentScanNumber = 0; // used to track multiple Mango precursors from same scan number - int iMangoIndex=0; - - // sort back to original spectrum order in MS2 scan in order to associate pairs - // based on sequential order of precursors for each scan - std::sort(g_pvQuery.begin(), g_pvQuery.end(), compareByMangoIndex); - - for (std::vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) - { - if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) - { - iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; - iMangoIndex = 0; - } - else - iMangoIndex++; - - sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", (int)iMangoIndex/2, (iMangoIndex % 2)?'B':'A'); - } - } - - // Sort g_pvQuery vector by dExpPepMass. 
- std::sort(g_pvQuery.begin(), g_pvQuery.end(), compareByPeptideMass); - - g_massRange.dMinMass = g_pvQuery.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; - g_massRange.dMaxMass = g_pvQuery.at(g_pvQuery.size()-1)->_pepMassInfo.dPeptideMassTolerancePlus; - - if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) - g_massRange.bNarrowMassRange = true; - else - g_massRange.bNarrowMassRange = false; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - goto cleanup_results; - - g_cometStatus.SetStatusMsg(string("Running search...")); - - // Now that spectra are loaded to memory and sorted, do search. - if (g_bPerformDatabaseSearch) - bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp); - if (g_bPerformSpecLibSearch) - bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp); - - if (!bSucceeded) - goto cleanup_results; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - goto cleanup_results; - - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(" - Post analysis:"); - fflush(stdout); - } - - if (g_bPerformDatabaseSearch) - { - g_cometStatus.SetStatusMsg(string("Performing post-search analysis ...")); - - // Sort each entry by xcorr, calculate E-values, etc. - bSucceeded = CometPostAnalysis::PostAnalysis(tp); - } - - if (!bSucceeded) - goto cleanup_results; - } - - // Sort g_pvQuery vector by scan (shared by both paths). 
- std::sort(g_pvQuery.begin(), g_pvQuery.end(), compareByScanNumber); - - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(" done\n"); - fflush(stdout); - } - - if (g_staticParams.options.bOutputPepXMLFile) - CometWritePepXML::WritePepXML(fpout_pepxml, fpoutd_pepxml, fpdb, iTotalSpectraSearched - (int)g_pvQuery.size()); - - // For mzid output, dump psms as tab-delimited text first then collate results to - // mzid file at very end due to requirements of this format. - if (g_staticParams.options.iOutputMzIdentMLFile) - CometWriteMzIdentML::WriteMzIdentMLTmp(fpout_mzidentmltmp, fpoutd_mzidentmltmp, iBatchNum); - - if (g_staticParams.options.bOutputPercolatorFile) - { - bSucceeded = CometWritePercolator::WritePercolator(fpout_percolator, fpdb); - if (!bSucceeded) - goto cleanup_results; - } - - if (g_staticParams.options.bOutputTxtFile) - { - CometWriteTxt::WriteTxt(fpout_txt, fpoutd_txt, fpdb); - } - - // Write SQT last as I destroy the g_staticParams.szMod string during that process - if (g_staticParams.options.bOutputSqtStream || g_staticParams.options.bOutputSqtFile) - CometWriteSqt::WriteSqt(fpout_sqt, fpoutd_sqt, fpdb); - -cleanup_results: - - // Deleting each Query object in the vector calls its destructor, which - // frees the spectral memory (see definition for Query in CometDataInternal.h). 
- for (auto it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) - delete (*it); - - g_pvQuery.clear(); - - if (!bSucceeded) - break; - } - - if (bSucceeded) - { - if (iTotalSpectraSearched == 0) - logout(" Warning - no spectra searched.\n"); - - if (NULL != fpout_pepxml) - CometWritePepXML::WritePepXMLEndTags(fpout_pepxml); - - if (NULL != fpoutd_pepxml) - CometWritePepXML::WritePepXMLEndTags(fpoutd_pepxml); - - if (NULL != fpout_mzidentml) - { - fclose(fpout_mzidentmltmp); // close for writing and re-open for reading - - if ((fpout_mzidentmltmp = fopen(sOutputMzIdentMLtmp.c_str(), "r")) == NULL) - { - string strErrorMsg = " Error - cannot read temporary file \"" + sOutputMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - // now read tmp file and write mzIdentML - CometWriteMzIdentML::WriteMzIdentML(fpout_mzidentml, fpdb, sOutputMzIdentMLtmp.c_str(), *this); - - fclose(fpout_mzidentmltmp); - remove(sOutputMzIdentMLtmp.c_str()); - } - - if (NULL != fpoutd_mzidentml) - { - fclose(fpoutd_mzidentmltmp); // close for writing and re-open for reading - - if ((fpoutd_mzidentmltmp = fopen(sOutputDecoyMzIdentMLtmp.c_str(), "r")) == NULL) - { - string strErrorMsg = " Error - cannot read temporary file \"" + sOutputDecoyMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - // now read tmp file and write mzIdentML - CometWriteMzIdentML::WriteMzIdentML(fpoutd_mzidentml, fpdb, sOutputDecoyMzIdentMLtmp.c_str(), *this); - - fclose(fpoutd_mzidentmltmp); - remove(sOutputDecoyMzIdentMLtmp.c_str()); - } - - if (!g_staticParams.options.bOutputSqtStream) - { - const auto duration = chrono::duration_cast(chrono::steady_clock::now() - tBeginTime); - double dTimePerSpectra = (double)duration.count() / (double)iTotalSpectraSearched; - - if (g_staticParams.iDbType == DbType::FASTA_DB) - strOut = " - Run stats: "; - else - 
strOut = ""; - - char buf[128]; - - std::snprintf(buf, sizeof(buf), "%.2f", dTimePerSpectra); - strOut += CometMassSpecUtils::ElapsedTime(tBeginTime) + " (" + std::to_string(iTotalSpectraSearched) + " spectra, " - + std::string(buf) + "ms/spec, "; - - std::snprintf(buf, sizeof(buf), "%.0f", 1000.0 / dTimePerSpectra); - strOut += std::string(buf) + "Hz"; - - if (g_staticParams.iDbType == DbType::FASTA_DB) - strOut += ", " + CometMassSpecUtils::GetPeakMemory(); - - strOut += ")\n"; - - logout(strOut); - } - - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - time_t tEndTime; - - time(&tEndTime); - - strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tEndTime)); - strOut = " Search end: " + string(g_staticParams.szDate) + " (" + CometMassSpecUtils::ElapsedTime(tGlobalStartTime) + ", " + CometMassSpecUtils::GetPeakMemory() + ")\n\n"; - logout(strOut); - } - } - - if (fpidx != NULL) - fclose(fpidx); - if (fpfasta != NULL) - fclose(fpfasta); - } - - //MH: Deallocate spectral processing memory. 
- CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads); - - // Deallocate search memory - CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads); - - if (NULL != fpout_pepxml) - { - fclose(fpout_pepxml); - fpout_pepxml = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputPepXML.c_str()); - } - - if (NULL != fpoutd_pepxml) - { - fclose(fpoutd_pepxml); - fpoutd_pepxml = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputDecoyPepXML.c_str()); - } - - if (NULL != fpout_mzidentml) - { - fclose(fpout_mzidentml); - fpout_mzidentml = NULL; - if (iTotalSpectraSearched == 0) - { - remove(sOutputMzIdentML.c_str()); - remove(sOutputMzIdentMLtmp.c_str()); - } - } - - if (NULL != fpoutd_mzidentml) - { - fclose(fpoutd_mzidentml); - fpoutd_mzidentml = NULL; - if (iTotalSpectraSearched == 0) - { - remove(sOutputDecoyMzIdentML.c_str()); - remove(sOutputDecoyMzIdentMLtmp.c_str()); - } - } - - if (NULL != fpout_percolator) - { - fclose(fpout_percolator); - fpout_percolator = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputPercolator.c_str()); - } - - if (NULL != fpout_sqt) - { - fclose(fpout_sqt); - fpout_sqt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputSQT.c_str()); - } - - if (NULL != fpoutd_sqt) - { - fclose(fpoutd_sqt); - fpoutd_sqt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputDecoySQT.c_str()); - } - - if (NULL != fpoutd_sqt) - { - fclose(fpoutd_sqt); - fpoutd_sqt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputDecoySQT.c_str()); - } - - if (NULL != fpout_txt) - { - fclose(fpout_txt); - fpout_txt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputTxt.c_str()); - } - - if (NULL != fpoutd_txt) - { - fclose(fpoutd_txt); - fpoutd_txt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputDecoyTxt.c_str()); - } - - if (iTotalSpectraSearched == 0) - bBlankSearchFile = true; - - g_staticParams.inputFile.szBaseName[0] = '\0'; - - if (!bSucceeded) - break; - } - - if (g_staticParams.iDbType == 
DbType::FI_DB) // clean fragment ion index - { - free(g_bIndexPrecursors); // allocated in InitializeStaticParams - - delete[] g_iFragmentIndex; - delete[] g_iFragmentIndexOffset; - } - - if (g_staticParams.iDbType != DbType::FASTA_DB) // for either index search - { - strOut = " - done. (" + CometMassSpecUtils::ElapsedTime(tGlobalStartTime); - - string strMemUse = CometMassSpecUtils::GetPeakMemory(); - if (!strMemUse.empty()) - strOut += ", " + strMemUse + ")"; - else - strOut += ")"; - - strOut += "\n\n"; - - logout(strOut); - } - - if (g_staticParams.options.iPrintAScoreProScore) - DeleteAScoreDllInterface(g_AScoreInterface); - - if (bBlankSearchFile) - return false; - else - return bSucceeded; + return bSucceeded; } @@ -3574,7 +2440,7 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, // the binned sqrt-intensity spectrum needed for fragment-ion matching below. double* pdTmpSpectrum = CometPreprocess::GetRtsRawDataBuffer(); - // Step 1: Preprocess into a thread-local Query* (does NOT touch g_pvQuery) + // Step 1: Preprocess into a thread-local Query* (does NOT touch session.queries) #ifdef RTS_TIMING tTimingMark = hrc::now(); #endif @@ -3615,7 +2481,7 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, // Step 3: Run the fragment index search on the thread-local Query* // This uses the new RunSearch(Query*) overload that allocates its own - // pbDuplFragment and never touches g_pvQuery or _ppbDuplFragmentArr. + // pbDuplFragment and never touches session.queries or _ppbDuplFragmentArr. 
#ifdef RTS_TIMING tTimingMark = hrc::now(); #endif @@ -3658,7 +2524,7 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, if (takeSearchResultsN > iSize) takeSearchResultsN = iSize; - // Step 4: Post-analysis using Query* overloads (no g_pvQuery access) + // Step 4: Post-analysis using Query* overloads (no session.queries access) if (pQuery->iMatchPeptideCount > 0) { if (g_staticParams.options.iMaxIndexRunTime > 0) diff --git a/CometSearch/CometSpecLib.cpp b/CometSearch/CometSpecLib.cpp index b6b337c3..c40ead94 100644 --- a/CometSearch/CometSpecLib.cpp +++ b/CometSearch/CometSpecLib.cpp @@ -110,7 +110,7 @@ bool CometSpecLib::LoadSpecLib(string strSpecLibFile) } -bool CometSpecLib::ReadSpecLibSqlite(string strSpecLibFile) +bool CometSpecLib::ReadSpecLibSqlite(string /*strSpecLibFile*/) { printf(" Error - sqlite/.db files as spectral libraries are not supported yet.\n"); @@ -192,7 +192,7 @@ bool CometSpecLib::ReadSpecLibSqlite(string strSpecLibFile) } -bool CometSpecLib::ReadSpecLibRaw(string strSpecLibFile) +bool CometSpecLib::ReadSpecLibRaw(string /*strSpecLibFile*/) { printf(" Error - raw files as spectral libraries are not supported yet.\n"); exit(1); @@ -556,7 +556,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, ThreadPool* pLoadSpecThreadPool = tp; bool bFirstScan = true; - bool bDoneProcessingAllSpectra = false; printf(" - loading MS1 scan (%d, mass range %0.1lf - %0.1lf): ", iFileLastScan, g_staticParams.options.dMS1MinMass, g_staticParams.options.dMS1MaxMass); @@ -602,7 +601,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, if ((iFileLastScan != -1) && (iFileLastScan < iFirstScan)) { - bDoneProcessingAllSpectra = true; break; } @@ -621,7 +619,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, if (iScanNumber > iFileLastScan) { - bDoneProcessingAllSpectra = true; break; } @@ -629,7 +626,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, { if (iScanNumber > iFileLastScan) { - bDoneProcessingAllSpectra 
= true; break; } @@ -648,7 +644,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, } else if (CometPreprocess::IsValidInputType(iSpecLibInputType)) { - bDoneProcessingAllSpectra = true; break; } else @@ -657,7 +652,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, if (iTmpCount > iFileLastScan) { - bDoneProcessingAllSpectra = true; break; } } @@ -729,7 +723,7 @@ double CometSpecLib::ScoreSpecLib(Query *pQuery, // SpecLib entries that are matched to that "bin". This allows a mass query to walk through // and score against all entries in the vector. void CometSpecLib::SetSpecLibPrecursorIndex(double dNeutralMass, - int iSpecLibCharge, + int /*iSpecLibCharge*/, size_t iWhichSpecLib) { double dProtonatedMass = dNeutralMass + PROTON_MASS; diff --git a/CometSearch/CometWriteMzIdentML.cpp b/CometSearch/CometWriteMzIdentML.cpp index a24bace2..aa558fc3 100644 --- a/CometSearch/CometWriteMzIdentML.cpp +++ b/CometSearch/CometWriteMzIdentML.cpp @@ -40,22 +40,23 @@ CometWriteMzIdentML::~CometWriteMzIdentML() void CometWriteMzIdentML::WriteMzIdentMLTmp(FILE *fpout, FILE *fpoutd, - int iBatchNum) + int iBatchNum, + const vector& queries) { int i; // Print temporary results in tab-delimited file if (g_staticParams.options.iDecoySearch == 2) { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintTmpPSM(i, 1, iBatchNum, fpout); - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintTmpPSM(i, 2, iBatchNum, fpoutd); + for (i=0; i<(int)queries.size(); ++i) + PrintTmpPSM(i, 1, iBatchNum, fpout, queries); + for (i=0; i<(int)queries.size(); ++i) + PrintTmpPSM(i, 2, iBatchNum, fpoutd, queries); } else { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintTmpPSM(i, 0, iBatchNum, fpout); + for (i=0; i<(int)queries.size(); ++i) + PrintTmpPSM(i, 0, iBatchNum, fpout, queries); } } @@ -63,12 +64,13 @@ void CometWriteMzIdentML::WriteMzIdentMLTmp(FILE *fpout, void CometWriteMzIdentML::WriteMzIdentML(FILE *fpout, FILE *fpdb, string sTmpFile, - CometSearchManager &searchMgr) + CometSearchManager 
&searchMgr, + bool bIdxNoFasta) { WriteMzIdentMLHeader(fpout); // now loop through sTmpFile file, wr - ParseTmpFile(fpout, fpdb, sTmpFile, searchMgr); + ParseTmpFile(fpout, fpdb, sTmpFile, searchMgr, bIdxNoFasta); fprintf(fpout, "\n"); } @@ -112,7 +114,8 @@ bool CometWriteMzIdentML::WriteMzIdentMLHeader(FILE *fpout) bool CometWriteMzIdentML::ParseTmpFile(FILE *fpout, FILE *fpdb, string sTmpFile, - CometSearchManager &searchMgr) + CometSearchManager &searchMgr, + bool bIdxNoFasta) { std::vector vMzidTmp; // vector to store entire tmp output std::vector vProteinTargets; // store vector of target protein file offsets @@ -314,7 +317,7 @@ bool CometWriteMzIdentML::ParseTmpFile(FILE *fpout, CometMassSpecUtils::EscapeString(strProteinName); fprintf(fpout, " 0) @@ -1373,12 +1376,13 @@ void CometWriteMzIdentML::WriteSpectrumIdentificationList(FILE* fpout, void CometWriteMzIdentML::PrintTmpPSM(int iWhichQuery, int iPrintTargetDecoy, int iBatchNum, - FILE *fpout) + FILE *fpout, + const vector& queries) { - if ((iPrintTargetDecoy != 2 && g_pvQuery.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) - || (iPrintTargetDecoy == 2 && g_pvQuery.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) + if ((iPrintTargetDecoy != 2 && queries.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) + || (iPrintTargetDecoy == 2 && queries.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) { - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); Results *pOutput; int iNumPrintLines; diff --git a/CometSearch/CometWriteMzIdentML.h b/CometSearch/CometWriteMzIdentML.h index 3c3ffa46..75d42e91 100644 --- a/CometSearch/CometWriteMzIdentML.h +++ b/CometSearch/CometWriteMzIdentML.h @@ -53,12 +53,14 @@ class CometWriteMzIdentML static void WriteMzIdentMLTmp(FILE *fpout, FILE *fpoutd, - int iBatchNum); + int iBatchNum, + const vector& queries); static void 
WriteMzIdentML(FILE *fpout, FILE *fpdb, string sTmpFile, - CometSearchManager &searchMgr); + CometSearchManager &searchMgr, + bool bIdxNoFasta); private: @@ -67,7 +69,8 @@ class CometWriteMzIdentML static void PrintTmpPSM(int iWhichQuery, int iPrintTargetDecoy, int iBatchNum, - FILE *fpOut); + FILE *fpOut, + const vector& queries); static void WriteMods(FILE *fpout, CometSearchManager &searchMgr); @@ -103,7 +106,8 @@ class CometWriteMzIdentML static bool ParseTmpFile(FILE *fpout, FILE *fpdb, string ssTmpFile, - CometSearchManager &searchMgr); + CometSearchManager &searchMgr, + bool bIdxNoFasta); }; #endif diff --git a/CometSearch/CometWritePepXML.cpp b/CometSearch/CometWritePepXML.cpp index c109c26c..2f089951 100644 --- a/CometSearch/CometWritePepXML.cpp +++ b/CometSearch/CometWritePepXML.cpp @@ -37,22 +37,23 @@ CometWritePepXML::~CometWritePepXML() void CometWritePepXML::WritePepXML(FILE *fpout, FILE *fpoutd, FILE *fpdb, - int iNumSpectraSearched) + int iNumSpectraSearched, + const vector& queries) { int i; // Print out the separate decoy hits. 
if (g_staticParams.options.iDecoySearch == 2) { - for (i = 0; i < (int)g_pvQuery.size(); ++i) - PrintResults(i, 1, fpout, fpdb, iNumSpectraSearched); - for (i = 0; i < (int)g_pvQuery.size(); ++i) - PrintResults(i, 2, fpoutd, fpdb, iNumSpectraSearched); + for (i = 0; i < (int)queries.size(); ++i) + PrintResults(i, 1, fpout, fpdb, iNumSpectraSearched, queries); + for (i = 0; i < (int)queries.size(); ++i) + PrintResults(i, 2, fpoutd, fpdb, iNumSpectraSearched, queries); } else { - for (i = 0; i < (int)g_pvQuery.size(); ++i) - PrintResults(i, 0, fpout, fpdb, iNumSpectraSearched); + for (i = 0; i < (int)queries.size(); ++i) + PrintResults(i, 0, fpout, fpdb, iNumSpectraSearched, queries); } } @@ -416,14 +417,15 @@ void CometWritePepXML::PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpout, FILE *fpdb, - int iNumSpectraSearched) + int iNumSpectraSearched, + const vector& queries) { int i, iNumPrintLines, iMinLength; char *pStr; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); // look for either \ or / separator so valid for Windows or Linux if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '\\')) == NULL @@ -500,7 +502,7 @@ void CometWritePepXML::PrintResults(int iWhichQuery, for (int iWhichResult=0; iWhichResult g_staticParams.options.dMinimumXcorr) - PrintPepXMLSearchHit(iWhichQuery, iWhichResult, iPrintTargetDecoy, pOutput, fpout, fpdb); + PrintPepXMLSearchHit(iWhichQuery, iWhichResult, iPrintTargetDecoy, pOutput, fpout, fpdb, queries); } fprintf(fpout, " \n"); @@ -513,14 +515,15 @@ void CometWritePepXML::PrintPepXMLSearchHit(int iWhichQuery, int iPrintTargetDecoy, Results *pOutput, FILE *fpout, - FILE *fpdb) + FILE *fpdb, + const vector& queries) { int i; int iNTT; int iNMC; unsigned int uiNumTotProteins = 0; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); CalcNTTNMC(pOutput, iWhichResult, &iNTT, &iNMC); @@ -529,7 +532,7 @@ void CometWritePepXML::PrintPepXMLSearchHit(int 
iWhichQuery, std::vector::iterator it; bool bReturnFulProteinString = false; - CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys); + CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys, queries); fprintf(fpout, " & queries); static void WritePepXMLEndTags(FILE *fpout); @@ -47,14 +48,16 @@ class CometWritePepXML int iPrintTargetDecoy, FILE *fpOut, FILE *fpdb, - int iNumSpectraSearched); + int iNumSpectraSearched, + const vector& queries); static void PrintPepXMLSearchHit(int iWhichQuery, int iWhichResult, int iPrintTargetDecoy, Results *pOutput, FILE *fpOut, - FILE *fpdb); + FILE *fpdb, + const vector& queries); static void GetVal(char *szElement, char *szAttribute, diff --git a/CometSearch/CometWritePercolator.cpp b/CometSearch/CometWritePercolator.cpp index 55940804..ddf9d78b 100644 --- a/CometSearch/CometWritePercolator.cpp +++ b/CometSearch/CometWritePercolator.cpp @@ -32,22 +32,23 @@ CometWritePercolator::~CometWritePercolator() bool CometWritePercolator::WritePercolator(FILE *fpout, - FILE *fpdb) + FILE *fpdb, + const vector& queries) { int i; int iLenDecoyPrefix = (int)strlen(g_staticParams.szDecoyPrefix); // Print results. 
- for (i=0; i<(int)g_pvQuery.size(); ++i) + for (i=0; i<(int)queries.size(); ++i) { - if (g_pvQuery.at(i)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) + if (queries.at(i)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) { - PrintResults(i, fpout, fpdb, 0, iLenDecoyPrefix); // print search hit (could be decoy if g_staticParams.options.iDecoySearch=1) + PrintResults(i, fpout, fpdb, 0, iLenDecoyPrefix, queries); } - if (g_staticParams.options.iDecoySearch == 2 && g_pvQuery.at(i)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr) + if (g_staticParams.options.iDecoySearch == 2 && queries.at(i)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr) { - PrintResults(i, fpout, fpdb, 2, iLenDecoyPrefix); // print decoy hit + PrintResults(i, fpout, fpdb, 2, iLenDecoyPrefix, queries); } } @@ -89,11 +90,12 @@ bool CometWritePercolator::PrintResults(int iWhichQuery, FILE *fpout, FILE *fpdb, int iPrintTargetDecoy, - int iLenDecoyPrefix) + int iLenDecoyPrefix, + const vector& queries) { int iNumPrintLines; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); Results *pOutput; @@ -127,7 +129,7 @@ bool CometWritePercolator::PrintResults(int iWhichQuery, unsigned int uiNumTotProteins = 0; // unused in pin bool bReturnFulProteinString = false; - CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys); + CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys, queries); if (g_staticParams.options.iDecoySearch) // using Comet's internal decoys { @@ -164,7 +166,7 @@ bool CometWritePercolator::PrintResults(int iWhichQuery, fprintf(fpout, "%0.6f\t", pQuery->_pepMassInfo.dExpPepMass); //ExpMass fprintf(fpout, "%0.6f\t", pOutput[iWhichResult].dPepMass); //CalcMass - 
PrintPercolatorSearchHit(iWhichQuery, iWhichResult, iPrintTargetDecoy, pOutput, fpout, vProteinTargets, vProteinDecoys); + PrintPercolatorSearchHit(iWhichQuery, iWhichResult, iPrintTargetDecoy, pOutput, fpout, vProteinTargets, vProteinDecoys, queries); } return true; @@ -176,15 +178,15 @@ void CometWritePercolator::PrintPercolatorSearchHit(int iWhichQuery, int iPrintTargetDecoy, Results *pOutput, FILE *fpout, - - vector vProteinTargets, - vector vProteinDecoys) + const vector& vProteinTargets, + const vector& vProteinDecoys, + const vector& queries) { int iNterm; int iCterm; int iNMC; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); CalcNTTNMC(pOutput, iWhichResult, &iNterm, &iCterm, &iNMC); @@ -273,7 +275,7 @@ void CometWritePercolator::PrintPercolatorSearchHit(int iWhichQuery, else fprintf(fpout, "%c.%s.%c\t", pOutput[iWhichResult].cPrevAA, pOutput[iWhichResult].szPeptide, pOutput[iWhichResult].cNextAA); - std::vector::iterator it; + std::vector::const_iterator it; bool bPrintTab = false; if (iPrintTargetDecoy != 2) // if not decoy only, print target proteins diff --git a/CometSearch/CometWritePercolator.h b/CometSearch/CometWritePercolator.h index 035efc31..59b8c639 100644 --- a/CometSearch/CometWritePercolator.h +++ b/CometSearch/CometWritePercolator.h @@ -24,7 +24,8 @@ class CometWritePercolator ~CometWritePercolator(); static void WritePercolatorHeader(FILE *fpout); static bool WritePercolator(FILE *fpout, - FILE *fpdb); + FILE *fpdb, + const vector& queries); private: @@ -32,14 +33,16 @@ class CometWritePercolator FILE *fpOut, FILE *fpdb, int iPrintTargetDecoy, - int iLenDecoyPrefix); + int iLenDecoyPrefix, + const vector& queries); static void PrintPercolatorSearchHit(int iWhichQuery, int iWhichResult, int iPrintTargetDecoy, Results *pOutput, FILE *fpOut, - vector vProteinTargets, - vector vProteinDecoys); + const vector& vProteinTargets, + const vector& vProteinDecoys, + const vector& queries); static void 
CalcNTTNMC(Results *pOutput, int iWhichQuery, int *iNterm, diff --git a/CometSearch/CometWriteSqt.cpp b/CometSearch/CometWriteSqt.cpp index db117740..b7858fec 100644 --- a/CometSearch/CometWriteSqt.cpp +++ b/CometSearch/CometWriteSqt.cpp @@ -31,22 +31,23 @@ CometWriteSqt::~CometWriteSqt() void CometWriteSqt::WriteSqt(FILE *fpout, FILE *fpoutd, - FILE *fpdb) + FILE *fpdb, + const vector& queries) { int i; // Print out the separate decoy hits. if (g_staticParams.options.iDecoySearch == 2) { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 1, fpout, fpdb); - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 2, fpoutd, fpdb); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 1, fpout, fpdb, queries); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 2, fpoutd, fpdb, queries); } else { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 0, fpout, fpdb); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 0, fpout, fpdb, queries); } } @@ -164,13 +165,14 @@ void CometWriteSqt::PrintSqtHeader(FILE *fpout, void CometWriteSqt::PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpout, - FILE *fpdb) + FILE *fpdb, + const vector& queries) { int i, iNumPrintLines; std::ostringstream oss; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); Results *pOutput; @@ -212,7 +214,7 @@ void CometWriteSqt::PrintResults(int iWhichQuery, for (i=0; i g_staticParams.options.dMinimumXcorr) - PrintSqtLine(iWhichQuery, i, pOutput, fpout, fpdb, iPrintTargetDecoy); + PrintSqtLine(iWhichQuery, i, pOutput, fpout, fpdb, iPrintTargetDecoy, queries); } } @@ -222,7 +224,8 @@ void CometWriteSqt::PrintSqtLine(int iWhichQuery, Results *pOutput, FILE *fpout, FILE *fpdb, - int iPrintTargetDecoy) + int iPrintTargetDecoy, + const vector& queries) { int i; std::ostringstream oss; @@ -325,7 +328,7 @@ void CometWriteSqt::PrintSqtLine(int iWhichQuery, bool bReturnFulProteinString = false; 
CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, - bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys); + bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys, queries); if (iPrintTargetDecoy != 2) // if not decoy only, print target proteins { diff --git a/CometSearch/CometWriteSqt.h b/CometSearch/CometWriteSqt.h index e02aa3ba..9e4482db 100644 --- a/CometSearch/CometWriteSqt.h +++ b/CometSearch/CometWriteSqt.h @@ -25,7 +25,8 @@ class CometWriteSqt static void WriteSqt(FILE *fpout, FILE *fpoutd, - FILE *fpdb); + FILE *fpdb, + const vector& queries); static void PrintSqtHeader(FILE *fpout, CometSearchManager &searchMgr); @@ -34,13 +35,15 @@ class CometWriteSqt static void PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpOut, - FILE *fpdb); + FILE *fpdb, + const vector& queries); static void PrintSqtLine(int iWhichQuery, int iWhichResult, Results *pOutput, FILE *fpOut, FILE *fpdb, - int iPrintTargetDecoy); + int iPrintTargetDecoy, + const vector& queries); }; #endif diff --git a/CometSearch/CometWriteTxt.cpp b/CometSearch/CometWriteTxt.cpp index 24e48e14..b9b95a74 100644 --- a/CometSearch/CometWriteTxt.cpp +++ b/CometSearch/CometWriteTxt.cpp @@ -31,22 +31,23 @@ CometWriteTxt::~CometWriteTxt() void CometWriteTxt::WriteTxt(FILE *fpout, FILE *fpoutd, - FILE *fpdb) + FILE *fpdb, + const vector& queries) { int i; // Print out the separate decoy hits. 
if (g_staticParams.options.iDecoySearch == 2) { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 1, fpout, fpdb); - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 2, fpoutd, fpdb); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 1, fpout, fpdb, queries); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 2, fpoutd, fpdb, queries); } else { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 0, fpout, fpdb); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 0, fpout, fpdb, queries); } } @@ -115,13 +116,14 @@ void CometWriteTxt::PrintTxtHeader(FILE *fpout) void CometWriteTxt::PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpout, - FILE *fpdb) //fpdb is file pointer for either FASTA or .idx file + FILE *fpdb, + const vector& queries) //fpdb is file pointer for either FASTA or .idx file { #ifdef CRUX - if ((iPrintTargetDecoy != 2 && g_pvQuery.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) - || (iPrintTargetDecoy == 2 && g_pvQuery.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) + if ((iPrintTargetDecoy != 2 && queries.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) + || (iPrintTargetDecoy == 2 && queries.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) { - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); int charge = pQuery->_spectrumInfoInternal.usiChargeState; double spectrum_neutral_mass = pQuery->_pepMassInfo.dExpPepMass - PROTON_MASS; @@ -211,7 +213,7 @@ void CometWriteTxt::PrintResults(int iWhichQuery, unsigned int uiNumTotProteins = 0; // print protein list - PrintProteins(fpout, fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, &uiNumTotProteins); + PrintProteins(fpout, fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, &uiNumTotProteins, queries); // Cleavage type fprintf(fpout, "\t%c%c\t", pOutput[iWhichResult].cPrevAA, 
pOutput[iWhichResult].cNextAA); @@ -227,10 +229,10 @@ void CometWriteTxt::PrintResults(int iWhichQuery, } #else - if ((iPrintTargetDecoy != 2 && g_pvQuery.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) - || (iPrintTargetDecoy == 2 && g_pvQuery.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) + if ((iPrintTargetDecoy != 2 && queries.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) + || (iPrintTargetDecoy == 2 && queries.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) { - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); Results *pOutput; int iNumPrintLines; @@ -377,7 +379,7 @@ void CometWriteTxt::PrintResults(int iWhichQuery, unsigned int uiNumTotProteins = 0; // print protein list - PrintProteins(fpout, fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, &uiNumTotProteins); + PrintProteins(fpout, fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, &uiNumTotProteins, queries); fprintf(fpout, "\t%u\t", uiNumTotProteins); @@ -409,7 +411,8 @@ void CometWriteTxt::PrintProteins(FILE *fpout, int iWhichQuery, int iWhichResult, int iPrintTargetDecoy, - unsigned int *uiNumTotProteins) + unsigned int *uiNumTotProteins, + const vector& queries) { std::vector vProteinTargets; // store vector of target protein names std::vector vProteinDecoys; // store vector of decoy protein names @@ -417,7 +420,7 @@ void CometWriteTxt::PrintProteins(FILE *fpout, bool bReturnFulProteinString = false; - CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, uiNumTotProteins, vProteinTargets, vProteinDecoys); + CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, uiNumTotProteins, vProteinTargets, vProteinDecoys, queries); bool bPrintComma = false; diff --git a/CometSearch/CometWriteTxt.h b/CometSearch/CometWriteTxt.h 
index db048fd4..f921b760 100644 --- a/CometSearch/CometWriteTxt.h +++ b/CometSearch/CometWriteTxt.h @@ -24,7 +24,8 @@ class CometWriteTxt ~CometWriteTxt(); static void WriteTxt(FILE *fpout, FILE *fpoutd, - FILE *fpdb); + FILE *fpdb, + const vector& queries); static void PrintTxtHeader(FILE *fpout); static void PrintModifications(FILE *fpout, @@ -35,13 +36,15 @@ class CometWriteTxt int iWhichQuery, int iWhichResult, int iPrintTargetDecoy, - unsigned int *uiNumTotProteins); + unsigned int *uiNumTotProteins, + const vector& queries); private: static void PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpOut, - FILE *fpdb); + FILE *fpdb, + const vector& queries); }; #endif diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 8ec0a970..fb0b6463 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,9 +14,9 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-write-strings -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 -I../$(ASCOREPRO)/include + override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-write-strings -Wno-unknown-pragmas -Wno-char-subscripts -Wno-unused-result -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 -I../$(ASCOREPRO)/include else - override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-write-strings -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. 
-I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 -I../$(ASCOREPRO)/include + override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-write-strings -Wno-unknown-pragmas -Wno-char-subscripts -Wno-unused-result -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 -I../$(ASCOREPRO)/include endif # dependency generation (gcc/clang) @@ -27,7 +27,13 @@ COMETSEARCH_SRC = Threading CometInterfaces CometSearch CometPreprocess CometPos CometWriteSqt CometWritePepXML CometWriteMzIdentML CometWritePercolator CometWriteTxt CometSearchManager \ CombinatoricsUtils CometModificationsPermuter CometFragmentIndex CometPeptideIndex CometSpecLib CometAlignment -COMETSEARCH_OBJ = $(addprefix $(OBJDIR)/, $(addsuffix .o, $(COMETSEARCH_SRC))) +THREADING_SRC = threading/SearchMemoryPool + +SEARCH_SRC = search/SearchUtils search/FiStrategy search/FastaStrategy search/PiStrategy search/Pipeline + +COMETSEARCH_OBJ = $(addprefix $(OBJDIR)/, $(addsuffix .o, $(COMETSEARCH_SRC))) \ + $(addprefix $(OBJDIR)/, $(addsuffix .o, $(THREADING_SRC))) \ + $(addprefix $(OBJDIR)/, $(addsuffix .o, $(SEARCH_SRC))) all: libcometsearch.a @@ -38,6 +44,7 @@ $(OBJDIR): mkdir -p $(OBJDIR) $(OBJDIR)/%.o: %.cpp %.h Common.h CometDataInternal.h BS_thread_pool.hpp | $(OBJDIR) + @mkdir -p $(dir $@) ${CXX} ${CXXFLAGS} $< -c -o $@ # Add specific dependency rules for object files that require multiple headers @@ -57,5 +64,13 @@ $(OBJDIR)/CometPreprocess.o: CometPreprocess.cpp Common.h CometData.h CometDataI $(OBJDIR)/CometMassSpecUtils.o: CometMassSpecUtils.cpp Common.h CometData.h CometDataInternal.h CometSearch.h CometSearchManager.h CometMassSpecUtils.h CometInterfaces.h BS_thread_pool.hpp | $(OBJDIR) ${CXX} ${CXXFLAGS} ${DEPFLAGS} CometMassSpecUtils.cpp -c -o $@ +$(OBJDIR)/threading/%.o: threading/%.cpp threading/%.h | $(OBJDIR) + @mkdir -p $(OBJDIR)/threading + 
${CXX} ${CXXFLAGS} -I. $< -c -o $@ + +$(OBJDIR)/search/%.o: search/%.cpp | $(OBJDIR) + @mkdir -p $(OBJDIR)/search + ${CXX} ${CXXFLAGS} -I. $< -c -o $@ + clean: rm -rf $(OBJDIR) *.a diff --git a/CometSearch/Threading.cpp b/CometSearch/Threading.cpp index 8bfafcb9..e1d6381c 100644 --- a/CometSearch/Threading.cpp +++ b/CometSearch/Threading.cpp @@ -52,7 +52,7 @@ void Threading::UnlockMutex(Mutex& mutex) mutex.unlock(); } -void Threading::DestroyMutex(Mutex& mutex) +void Threading::DestroyMutex(Mutex& /*mutex*/) { // std::mutex destructor handles cleanup automatically // Ensure mutex is unlocked before destruction diff --git a/CometSearch/core/Constants.h b/CometSearch/core/Constants.h new file mode 100644 index 00000000..368a8cef --- /dev/null +++ b/CometSearch/core/Constants.h @@ -0,0 +1,109 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef _COMETCONSTANTS_H_ +#define _COMETCONSTANTS_H_ + +#define PROTON_MASS 1.00727646688 +#define C13_DIFF 1.00335483 + +#define FLOAT_ZERO 1e-6 // 0.000001 + +#define MIN_PEPTIDE_LEN 1 // min # of AA for a peptide +#define MAX_PEPTIDE_LEN 51 // max # of AA for a peptide; one more than actual # to account for terminating char +#define MAX_PEPTIDE_LEN_P2 53 // max # of AA for a peptide plus 2 for N/C-term + +#define FRAGINDEX_MIN_IONS_SCORE 3 // min # of matched ions for peptide to register for E-value xcorr histogram +#define FRAGINDEX_MIN_IONS_REPORT 3 // min # of matched ions for peptide to be reported +#define FRAGINDEX_MIN_MASS 200.0 // minimum fragment ion mass used to generate fragment index +#define FRAGINDEX_MAX_MASS 2000.0 // maximum fragment ion mass used to generate fragment index +#define FRAGINDEX_MAX_BATCHSIZE 1000 // maximum number of spectra loaded when querying fragment index +#define FRAGINDEX_MAX_NUMPEAKS 150 // number of spectrum peaks used to query fragment index +#define FRAGINDEX_MAX_NUMSCORED 100 // for each fragment index spectrum query, score up to this many peptides +#define FRAGINDEX_MAX_COMBINATIONS 2000 +#define FRAGINDEX_MAX_MODS_PER_MOD 5 +#define FRAGINDEX_KEEP_ALL_PEPTIDES 1 // 1 = consider up to FRAGINDEX_MAX_COMBINATIONS of peptides; 0 = ignore all mods for peptide that exceed FRAGINDEX_MAX_COMBINATIONS + +#define MS1_MIN_MASS 0.0 // only parse from this mass upward in MS1 scans for MS1 library searches +#define MS1_MAX_MASS 3000.0 // only parse up to this mass in MS1 scans for MS1 library searches +#define MS1_RT_HISTORY_SIZE 250 // size of MS1 RT history kept for recent history linear regression +#define MS1_RT_OUTLIER_THRESHOLD 2.0 // # stdev outlier threshold for MS1 RT history + +#define MAX_PEFFMOD_LEN 16 +#define SIZE_MASS 128 // ascii value size +#define SIZE_NATIVEID 256 // max length of nativeID string +#define NUM_SP_IONS 1000 // num ions for preliminary scoring +#define NUM_ION_SERIES 7 // a,b,c,x,y,z,z1 +#define
EXPECT_DECOY_SIZE 3000 // number of decoy entries in CometDecoys.h + +#define WIDTH_REFERENCE 256 // length of the protein accession field to store +#define MAX_PROTEINS 50 // maximum number of proteins to return for each query; for index search only right now + +#define HISTO_SIZE 152 // some number greater than 150 + +#define NO_PEFF_VARIANT -127 + +#define ASCORE_CUTOFF_TO_ACCEPT 13.0 // minimum AScore value to accept localization + +#define FRAGINDEX_VMODS 5 // only parse first five variable mods for fragment ion index searches + // if this is ever larger than 16, need to extend range of siVarModProteinFilter + +#define VMODS 15 // also "VMODS+1" is 4th dimension of uiBinnedIonMasses to cover unmodified ions (0), mod NL (1-15) +#define COMPOUNDMODS_OFFSET 100 // piVarModSites values >= 100 encode compound mods; index = value - 100 +#define VMOD_1_INDEX 0 +#define VMOD_2_INDEX 1 +#define VMOD_3_INDEX 2 +#define VMOD_4_INDEX 3 +#define VMOD_5_INDEX 4 +#define VMOD_6_INDEX 5 +#define VMOD_7_INDEX 6 +#define VMOD_8_INDEX 7 +#define VMOD_9_INDEX 8 +#define VMOD_10_INDEX 9 +#define VMOD_11_INDEX 10 +#define VMOD_12_INDEX 11 +#define VMOD_13_INDEX 12 +#define VMOD_14_INDEX 13 +#define VMOD_15_INDEX 14 + +#define ENZYME_SINGLE_TERMINI 1 +#define ENZYME_DOUBLE_TERMINI 2 +#define ENZYME_N_TERMINI 8 +#define ENZYME_C_TERMINI 9 + +#define ION_SERIES_A 0 +#define ION_SERIES_B 1 +#define ION_SERIES_C 2 +#define ION_SERIES_X 3 +#define ION_SERIES_Y 4 +#define ION_SERIES_Z 5 +#define ION_SERIES_Z1 6 //z+1 + +#ifdef CRUX +#define XCORR_CUTOFF -999.0 +#else +#define XCORR_CUTOFF 1E-8 // some near-zero cutoff +#endif + +#define SPECLIB_CUTOFF -999.9 + +// Identifies which type of database is being searched. 
+enum class DbType +{ + FASTA_DB = 0, // normal FASTA sequence database + FI_DB = 1, // fragment ion index (.idx) + PI_DB = 2 // peptide index (.idx) +}; + +#endif // _COMETCONSTANTS_H_ diff --git a/CometSearch/core/Params.h b/CometSearch/core/Params.h new file mode 100644 index 00000000..37f1aebe --- /dev/null +++ b/CometSearch/core/Params.h @@ -0,0 +1,450 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Parameter structs: Options, DBInfo, StaticParams, and their sub-structs. 
+// Depends on: core/Constants.h, CometData.h + +#ifndef _COMETPARAMS_H_ +#define _COMETPARAMS_H_ + +#include +#include +#include +#include +#include "core/Constants.h" +#include "CometData.h" + +using std::string; +using std::vector; +using std::multimap; + +class CometSearchManager; + +struct Options +{ + int iNumPeptideOutputLines; + int iWhichReadingFrame; + int iEnzymeTermini; + int iNumStored; // # of search results to store for xcorr analysis + int iMaxDuplicateProteins; // maximum number of duplicate proteins to report or store in idx file + int iSpectrumBatchSize; // # of spectra to search at a time within the scan range + int iStartCharge; + int iEndCharge; + int iMaxFragmentCharge; + int iMinPrecursorCharge; + int iMaxPrecursorCharge; + int iMSLevel; // filter query scans in raw/mzML/mzXML input by ms level (aka MS2, MS3) + int iSpecLibMSLevel; // filter speclib scans in raw/mzML/mzXML input by ms level (aka MS2, MS3) + int iMinPeaks; + int iRemovePrecursor; // 0=no, 1=yes, 2=ETD precursors, 3=phosphate neutral loss + int iDecoySearch; // 0=no, 1=concatenated search, 2=separate decoy search + int iNumThreads; // 0=poll CPU else set # threads to spawn + int iNumFragmentThreads; // # threads used for fragment indexing + bool bResolveFullPaths; // 0=do not resolve full paths; 1=resolve paths (default) + bool bOutputSqtStream; + bool bOutputSqtFile; + bool bOutputTxtFile; + bool bOutputPepXMLFile; + int iOutputMzIdentMLFile; + bool bOutputPercolatorFile; + bool bClipNtermMet; // 0=leave protein sequences alone; 1=also consider w/o N-term methionine + bool bClipNtermAA; // 0=leave peptide sequences as-is; 1=clip N-term amino acid from every peptide + bool bMango; // 0=normal; 1=Mango x-link ms2 input + bool bScaleFragmentNL; // 0=no; 1=scale fragment NL for each modified residue contained in fragment + bool bCreateFragmentIndex; // 0=normal search; 1=create fragment ion index plain peptide file + bool bCreatePeptideIndex; // 0=normal search; 1=create peptide 
index file; only one of bCreateFragmentIndex and bCreatePeptideIndex can be 1 + bool bFastPlainPeptideIdx; // 0=legacy RunSearch path; 1=use PepGenTuple per-thread buffers (avoids heap alloc) + bool bVerboseOutput; + bool bExplicitDeltaCn; // if set to 1, do not use sequence similarity logic + bool bPrintExpectScore; + bool bExportAdditionalScoresPepXML; // if 1, also report lnrSp, lnExpect, IonFrac, lnNumSP to pepXML output + bool bCorrectMass; // use selectionMZ instead of monoMZ if monoMZ is outside selection window + bool bTreatSameIL; + int iPrintAScoreProScore; // 0=no, otherwise specify variable_modXX number e.g. 1 for variable_mod01 + int iMaxIndexRunTime; // max run time of index search in milliseconds + int iFragIndexMinIonsScore; // minimum matched fragment index ions for scoring + int iFragIndexMinIonsReport; // minimum matched fragment index ions for reporting + int iFragIndexNumSpectrumPeaks; // # of peaks from spectrum to use for querying fragment index + int iFragIndexSkipReadPrecursors; // if true, skips reading precursors step + int iOverrideCharge; + long lMaxIterations; // max # of modification permutations for each iStart position + double dMinIntensity; // intensity cutoff for each peak + double dMinPercentageIntensity; // intensity cutoff for each peak as % of base peak + double dRemovePrecursorTol; + double dPeptideMassLow; // MH+ mass + double dPeptideMassHigh; // MH+ mass + double dMinimumXcorr; // set the minimum xcorr to report (default is 1e-8) + double dFragIndexMaxMass; // fragment index maximum fragment mass + double dFragIndexMinMass; // fragment index minimum fragment mass + double dMS1MinMass; // low mass cutoff in MS1 query/library spectra + double dMS1MaxMass; // high mass cutoff in MS1 query/library spectra + IntRange scanRange; + IntRange peptideLengthRange; + DoubleRange clearMzRange; + char szActivationMethod[24]; // mzXML only + string sPinProteinDelimiter; // PIN file protein delimiter; default tab +}; + +// The minimum 
and maximum mass range of all peptides to consider +// i.e. lowestPepMass - tolerance to highestPepMass + tolerance +struct MassRange +{ + double dMinMass; + double dMaxMass; + unsigned short usiMaxFragmentCharge; // global maximum fragment charge + bool bNarrowMassRange; // used to determine how to parse peptides in SearchForPeptides + unsigned int uiMaxFragmentArrayIndex; // BIN(dFragIndexMaxMass); used as fragment array index +}; + +extern MassRange g_massRange; + +struct DBInfo +{ + char szDatabase[SIZE_FILE]; + char szFileName[SIZE_FILE]; + int iTotalNumProteins; + unsigned long int uliTotAACount; +}; + +struct SpecLibInfo // why a struct for just a string??? +{ + string strSpecLibFile; +}; + +struct PEFFInfo +{ + char szPeffOBO[SIZE_FILE]; + int iPeffSearch; // 0=no, 1=PSI-MOD, 2=Unimod, 3=PSI-MOD only, 4=Unimod only, 5=variants only +}; + +struct StaticMod +{ + double dAddCterminusPeptide; + double dAddNterminusPeptide; + double dAddCterminusProtein; + double dAddNterminusProtein; + double pdStaticMods[SIZE_MASS]; +}; + +struct PrecalcMasses +{ + double dNtermProton; // dAddNterminusPeptide + PROTON_MASS + double dCtermOH2Proton; // dAddCterminusPeptide + dOH2fragment + PROTON_MASS + double dOH2ProtonCtermNterm; // dOH2parent + PROTON_MASS + dAddCterminusPeptide + dAddNterminusPeptide + int iMinus17; // BIN'd value of mass(NH3) + int iMinus18; // BIN'd value of mass(H2O) +}; + +struct VarModParams +{ + bool bVarModSearch; // set to true if variable mods are specified + bool bVarTermModSearch; // set to true if any n-term/c-term variable mods are specified + bool bVarProteinNTermMod; // set to true if a protein n-term variable mod specified + bool bVarProteinCTermMod; // set to true if a protein c-term variable mod specified + bool bBinaryModSearch; // set to true if any of the variable mods are of binary mod variety + bool bUseFragmentNeutralLoss; // set to true if any custom NL is set; applied only to 1+ and 2+ fragments + bool bRareVarModPresent; // set to 
true if any of iRequireThisMod == -1 + bool bVarModProteinFilter; // set to true if protein mods list is applied + int iRequireVarMod; // 0=no; else use bits to determine which varmods are required + int iMaxVarModPerPeptide; + int iMaxPermutations; + VarMods varModList[VMODS]; + char cModCode[VMODS]; // mod characters + string sProteinLModsListFile; // file containing list of proteins to restrict application of varmods to + multimap mmapProteinModsList; // multimap read from sProteinLModsListFile if present + string sCompoundModsFile; // path to compound mods mass file; empty = disabled + vector vdCompoundMasses; // sorted, deduplicated list of masses read from sCompoundModsFile + unsigned int uiNumCompoundMasses; // vdCompoundMasses.size(); 0 when feature is disabled +}; + +struct MassUtil +{ + int bMonoMassesParent; + int bMonoMassesFragment; + double dCO; + double dNH3; + double dNH2; + double dH2O; + double dCOminusH2; + double dOH2fragment; + double dOH2parent; + double pdAAMassParent[SIZE_MASS]; + double pdAAMassFragment[SIZE_MASS]; + double pdAAMassUser[SIZE_MASS]; // user defined default amino acid masses +}; + +struct ToleranceParams +{ + int iMassToleranceUnits; // 0=amu, 1=mmu, else ppm (2) + int iMassToleranceType; // 0=MH+ (default), 1=precursor m/z; only valid if iMassToleranceUnits > 0 + int iIsotopeError; + double dInputToleranceMinus; // raw tolerance value from param file, lower bound; gets converted to dPeptideMassToleranceMinus + double dInputTolerancePlus; // raw tolerance value from param file, upper bound; gets converted to dPeptideMassTolerancePlus + double dFragmentBinSize; + double dFragmentBinStartOffset; + double dMS1BinSize; + double dMS1BinStartOffset; +}; + +struct IonInfo +{ + int iNumIonSeriesUsed; + int piSelectedIonSeries[NUM_ION_SERIES]; + bool bUseWaterAmmoniaLoss; // ammonia, water loss + int iTheoreticalFragmentIons; + int iIonVal[NUM_ION_SERIES]; +}; + +// static user params, won't change per thread - can make global!
+struct StaticParams +{ + string sHostName; + char szMod[512]; // used for sqt output + char szDecoyPrefix[256]; // used for prefix to indicate decoys + string sDecoyPrefix; // escaped version of szDecoyPrefix for output within XML files + char szOutputSuffix[256]; // used for suffix to append to output file base names + char szTxtFileExt[256]; // text file extension; default "txt" + int iElapseTime; + char szDate[32]; + Options options; + DBInfo databaseInfo; + SpecLibInfo speclibInfo; + PEFFInfo peffInfo; + InputFileInfo inputFile; + int bPrintDuplReferences; + VarModParams variableModParameters; + ToleranceParams tolerances; + StaticMod staticModifications; + PrecalcMasses precalcMasses; + EnzymeInfo enzymeInformation; + MassUtil massUtility; + double dInverseBinWidth; // this is used in BIN() many times so use inverse binWidth to do multiply vs. divide + int iArraySizeGlobal; // (int)((g_staticParams.options.dPeptideMassHigh + plus_tol_in_daltons + buffer) * g_staticParams.dInverseBinWidth) + // for MS1 library search, use dMS1MaxMass instead of dPeptideMassHigh + double dOneMinusBinOffset; // this is used in BIN() many times so calculate once + IonInfo ionInformation; + int iXcorrProcessingOffset; + DbType iDbType; // FASTA_DB = normal fasta; FI_DB = fragment ion indexed; PI_DB = peptide index + vector vectorMassOffsets; + vector precursorNLIons; + int iPrecursorNLSize; + int iOldModsEncoding; + bool bSkipToStartScan; + std::chrono::high_resolution_clock::time_point tRealTimeStart; // track run time of real-time index search + + StaticParams() + { + RestoreDefaults(); + } + + void RestoreDefaults() + { + int i; + + inputFile.iInputType = InputType_MS2; + + szMod[0] = '\0'; + + iXcorrProcessingOffset = 75; + iDbType = DbType::FASTA_DB; + + databaseInfo.szDatabase[0] = '\0'; + speclibInfo.strSpecLibFile.clear(); + + strcpy(szDecoyPrefix, "DECOY_"); + strcpy(szTxtFileExt, "txt"); + szOutputSuffix[0] = '\0'; + + peffInfo.szPeffOBO[0] = '\0'; + peffInfo.iPeffSearch 
= 0; + + variableModParameters.sCompoundModsFile = ""; + variableModParameters.vdCompoundMasses.clear(); + variableModParameters.uiNumCompoundMasses = 0; + + iPrecursorNLSize = 0; + + for (i = 0; i < SIZE_MASS; ++i) + { + massUtility.pdAAMassParent[i] = 999999.; + massUtility.pdAAMassFragment[i] = 999999.; + massUtility.pdAAMassUser[i] = 0.0; + staticModifications.pdStaticMods[i] = 0.0; + } + + massUtility.bMonoMassesFragment = 1; + massUtility.bMonoMassesParent = 1; + +#ifdef CRUX + staticModifications.pdStaticMods[(int)'C'] = 57.021464; +#endif + + + enzymeInformation.iAllowedMissedCleavage = 2; + + for (i = 0; i < VMODS; ++i) + { + variableModParameters.varModList[i].iMaxNumVarModAAPerMod = 3; + variableModParameters.varModList[i].iMinNumVarModAAPerMod = 0; + variableModParameters.varModList[i].iBinaryMod = 0; + variableModParameters.varModList[i].iRequireThisMod = 0; + variableModParameters.varModList[i].iVarModTermDistance = -1; // distance from N or C-term distance + variableModParameters.varModList[i].iWhichTerm = 0; // specify N (0) or C-term (1) + variableModParameters.varModList[i].dVarModMass = 0.0; + variableModParameters.varModList[i].dNeutralLoss = 0.0; + variableModParameters.varModList[i].dNeutralLoss2 = 0.0; + strcpy(variableModParameters.varModList[i].szVarModChar, "X"); + +#ifdef CRUX + if (i==0) + { + variableModParameters.varModList[i].dVarModMass = 15.9949; + strcpy(variableModParameters.varModList[i].szVarModChar, "M"); + } +#endif + } + + variableModParameters.cModCode[0] = '*'; + variableModParameters.cModCode[1] = '#'; + variableModParameters.cModCode[2] = '@'; + variableModParameters.cModCode[3] = '^'; + variableModParameters.cModCode[4] = '~'; + variableModParameters.cModCode[5] = '$'; + variableModParameters.cModCode[6] = '%'; + variableModParameters.cModCode[7] = '!'; + variableModParameters.cModCode[8] = '+'; + for (int i = 9; i < VMODS; ++i) + { + int iAscii = 88 + i; //start with lower case 'a' ASCII 97 + if (iAscii <= 125) // thru 
'}' which is ASCII 125 + variableModParameters.cModCode[i] = (char)(iAscii); + else + variableModParameters.cModCode[i] = '_'; + } + + variableModParameters.iMaxVarModPerPeptide = 5; + variableModParameters.iMaxPermutations = MAX_PERMUTATIONS; + variableModParameters.bUseFragmentNeutralLoss = false; + variableModParameters.iRequireVarMod = 0; + + ionInformation.bUseWaterAmmoniaLoss = false; + ionInformation.iTheoreticalFragmentIons = 1; // 0 = flanking peaks; 1 = no flanking peaks + ionInformation.iIonVal[ION_SERIES_A] = 0; + ionInformation.iIonVal[ION_SERIES_B] = 1; + ionInformation.iIonVal[ION_SERIES_C] = 0; + ionInformation.iIonVal[ION_SERIES_X] = 0; + ionInformation.iIonVal[ION_SERIES_Y] = 1; + ionInformation.iIonVal[ION_SERIES_Z] = 0; + ionInformation.iIonVal[ION_SERIES_Z1] = 0; + + options.iNumPeptideOutputLines = 5; + options.iWhichReadingFrame = 0; + options.iEnzymeTermini = 2; + options.iNumStored = 100; // default # of search results to store for xcorr analysis. + options.iMaxDuplicateProteins = 20; // maximum number of duplicate proteins to report or store in idx file + + options.bExplicitDeltaCn = false; + options.bPrintExpectScore = true; + options.iPrintAScoreProScore = 0; + options.bExportAdditionalScoresPepXML = false; + options.bCorrectMass = false; + options.bTreatSameIL = true; + options.iOverrideCharge = 0; + options.iMaxIndexRunTime = 0; // index run time limit in milliseconds; 0=no time limit + options.iRemovePrecursor = 0; + options.dRemovePrecursorTol = 1.5; + + options.bOutputSqtStream = false; + options.bOutputSqtFile = false; + options.bOutputTxtFile = false; + options.bOutputPepXMLFile = true; + options.iOutputMzIdentMLFile = false; + options.bOutputPercolatorFile = false; + + options.bResolveFullPaths = true; + + options.bMango = false; + options.bScaleFragmentNL = false; + options.bCreatePeptideIndex = false; + options.bCreateFragmentIndex = false; + options.bFastPlainPeptideIdx = false; + options.bVerboseOutput = false; + 
options.iDecoySearch = 0; + options.iNumThreads = 4; + options.iNumFragmentThreads = 4; + options.bClipNtermMet = false; + options.bClipNtermAA = false; + + options.lMaxIterations = 0; + + // These parameters affect mzXML/RAMP spectra only. + options.scanRange.iStart = 0; + options.scanRange.iEnd = 0; + options.iSpectrumBatchSize = 0; + options.iMinPeaks = 10; + options.iStartCharge = 0; + options.iEndCharge = 0; + options.iMaxFragmentCharge = 3; + options.iMinPrecursorCharge = 1; + options.iMaxPrecursorCharge = 6; + options.iMSLevel = 2; + options.dMinIntensity = 0.0; + options.dMinPercentageIntensity = 0.0; + options.dPeptideMassLow = 600.0; + options.dPeptideMassHigh = 5000.0; + options.dMinimumXcorr = XCORR_CUTOFF; + options.dFragIndexMaxMass = FRAGINDEX_MAX_MASS; + options.dFragIndexMinMass = FRAGINDEX_MIN_MASS; + strcpy(options.szActivationMethod, "ALL"); + // End of mzXML specific parameters. + + options.sPinProteinDelimiter = '\t'; + + options.dFragIndexMinMass = FRAGINDEX_MIN_MASS; + options.dFragIndexMaxMass = FRAGINDEX_MAX_MASS; + options.iFragIndexMinIonsScore = FRAGINDEX_MIN_IONS_SCORE; + options.iFragIndexMinIonsReport = FRAGINDEX_MIN_IONS_REPORT; + options.iFragIndexNumSpectrumPeaks = FRAGINDEX_MAX_NUMPEAKS; + options.iFragIndexSkipReadPrecursors = 1; // skip reading precursors by default + + options.dMS1MinMass = MS1_MIN_MASS; + options.dMS1MaxMass = MS1_MAX_MASS; + + options.clearMzRange.dStart = 0.0; + options.clearMzRange.dEnd = 0.0; + + options.peptideLengthRange.iStart = MIN_PEPTIDE_LEN; + options.peptideLengthRange.iEnd = MAX_PEPTIDE_LEN - 1; // -1 as MAX_PEPTIDE_LEN number includes terminating char + + staticModifications.dAddCterminusPeptide = 0.0; + staticModifications.dAddNterminusPeptide = 0.0; + staticModifications.dAddCterminusProtein = 0.0; + staticModifications.dAddNterminusProtein = 0.0; + + tolerances.iMassToleranceUnits = 0; + tolerances.iMassToleranceType = 0; + tolerances.iIsotopeError = 0; + tolerances.dInputToleranceMinus = 
-3.0; // peptide_mass_tolerance minus + tolerances.dInputTolerancePlus = 3.0; // peptide_mass_tolerance plus + tolerances.dFragmentBinSize = 1.0005; + tolerances.dFragmentBinStartOffset = 0.4; + tolerances.dMS1BinSize = 1.0005; + + bSkipToStartScan = true; + } +}; + +extern StaticParams g_staticParams; + +#endif // _COMETPARAMS_H_ diff --git a/CometSearch/core/Types.h b/CometSearch/core/Types.h new file mode 100644 index 00000000..2f5fec6d --- /dev/null +++ b/CometSearch/core/Types.h @@ -0,0 +1,837 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-spectrum and index runtime data structs: Results, Query, QueryMS1, etc. 
+// Depends on: core/Constants.h, core/Params.h, CometData.h, Threading.h, AScore headers + +#ifndef _COMETTYPES_H_ +#define _COMETTYPES_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "core/Constants.h" +#include "core/Params.h" +#include "Threading.h" +#include "AScoreOptions.h" +#include "AScoreCentroid.h" +#include "AScoreAPI.h" +#include "AScoreFactory.h" +#include "AScoreDllInterface.h" + +using std::string; +using std::vector; +using std::map; + +class CometSearchManager; + +struct Results +{ + double dPepMass; + double dExpect; + float fScoreSp; + float fXcorr; + float fDeltaCn; + float fLastDeltaCn; + float fAScorePro; // AScorePro score + unsigned short usiRankXcorr; + unsigned short usiLenPeptide; + unsigned short usiRankSp; + unsigned short usiMatchedIons; + unsigned short usiTotalIons; + comet_fileoffset_t lProteinFilePosition; // for indexdb, this is the entry in g_pvProteinsList + long lWhichProtein; // which entry in g_pvProteinsList[] contains the matched proteins + int piVarModSites[MAX_PEPTIDE_LEN_P2]; // store variable mods encoding, +2 to accommodate N/C-term + double pdVarModSites[MAX_PEPTIDE_LEN_P2]; // store variable mods mass diffs, +2 to accommodate N/C-term + char pszMod[MAX_PEPTIDE_LEN][MAX_PEFFMOD_LEN]; // store PEFF mod string + char szPeptide[MAX_PEPTIDE_LEN]; + char cPrevAA; // stores prev flanking AA + char cNextAA; // stores following flanking AA + bool bClippedM; // true if new N-term protein due to clipped methionine + char cHasVariableMod; // HasVariableModType enum: 0 = no variable mod, 1 = has variable mod, 2 = has AScorePro mod + string sPeffOrigResidues; // original residue(s) of a PEFF variant + string sAScoreProSiteScores; // AScorePro site scores as comma-separated string + int iPeffOrigResiduePosition; // position of PEFF variant substitution; -1 = n-term, iLenPeptide = c-term; -9=unused + int iPeffNewResidueCount; // more than 0 new residues is a substitution (if
iPeffOrigResidueCount=1) or insertion (if iPeffOrigResidueCount>1) + vector pWhichProtein; // file positions of matched protein entries + vector pWhichDecoyProtein; // keep separate decoy list (used for separate decoy matches and combined results) +}; + +struct SpecLibResults // MS2 spec lib +{ + unsigned int iWhichSpecLib; // the matched spectral library entry + float fSpecLibScore; + float fXcorr; // use xcorr for now + float fCn; // speclib score + float fRTtime; // retention time in seconds of the matched entry +}; + +struct SpecLibResultsMS1 // MS1 spec lib +{ + unsigned int iWhichSpecLib; // the matched spectral library entry + float fDotProduct; // unit vector dot product aka cosine similarity + float fRTime; // retention time in seconds of the matched entry +}; + +struct PepMassInfo +{ + double dCalcPepMass; + double dExpPepMass; // protonated MH+ experimental mass + double dPeptideMassToleranceLow; // mass tolerance low in amu from experimental mass + double dPeptideMassToleranceHigh; // mass tolerance high in amu from experimental mass + double dPeptideMassToleranceMinus; // low end of mass tolerance range including isotope offsets + double dPeptideMassTolerancePlus; // high end of mass tolerance range including isotope offsets +}; + +struct SpectrumInfoInternal +{ + int iArraySize; // m/z versus intensity array + int iHighestIon; + int iScanNumber; + unsigned short usiChargeState; + unsigned short usiMaxFragCharge; + double dTotalIntensity; + float fRTime; + char szMango[32]; // Mango encoding + char szNativeID[SIZE_NATIVEID]; // nativeID string from mzML +}; + +// PreprocessStruct stores information used in preprocessing +// each spectrum. 
Information not kept around otherwise +struct PreprocessStruct +{ + int iHighestIon; + double dHighestIntensity; +}; + +struct OBOStruct // stores info read from OBO file +{ + double dMassDiffAvg; // this is looked up from strMod string from OBO + double dMassDiffMono; + string strMod; // mod string, PSI-MOD, Unimod or custom + + bool operator<(const OBOStruct& a) const + { + return (strMod < a.strMod); + } +}; + +struct ProteinEntryStruct +{ + comet_fileoffset_t lWhichProtein; // file pointer to protein + int iStartResidue; // start residue position in protein (1-based) + char cPrevAA; + char cNextAA; + + bool operator<(const ProteinEntryStruct& a) const + { + return (lWhichProtein < a.lWhichProtein); + } +}; + +struct PeffModStruct // stores info read from PEFF header +{ + double dMassDiffAvg; // this is looked up from strMod string from OBO + double dMassDiffMono; + int iPosition; // position of modification + char szMod[MAX_PEFFMOD_LEN]; + + bool operator<(const PeffModStruct& a) const + { + return (iPosition < a.iPosition); + } +}; + +struct PeffVariantSimpleStruct // stores info read from PEFF header +{ + int iPosition; // position of variant + char cResidue; // new variant + + bool operator<(const PeffVariantSimpleStruct& a) const + { + return (iPosition < a.iPosition); + } +}; + +struct PeffVariantComplexStruct // stores info read from PEFF header +{ + int iPositionA; // start position of variant + int iPositionB; // end position of variant + string sResidues; // if !empty(), insertion replacing aa from pos A to B; + // if empty(), deletion of aa from pos A to B + + bool operator<(const PeffVariantComplexStruct& a) const + { + return (iPositionA < a.iPositionA); + } +}; + +struct PeffProcessedStruct +{ + int iBeginResidue; + int iEndResidue; +}; + +struct PeffPositionStruct // collate PEFF mods by position in sequence +{ + int iPosition; // position within the sequence + vector vectorWhichPeff; // which specific peff entry from PeffModStruct + vector 
vectorMassDiffAvg; + vector vectorMassDiffMono; +}; + +struct PeffSearchStruct // variant info passed to SearchForPeptides +{ + int iPosition; + bool bBeginCleavage; + bool bEndCleavage; + char cOrigResidue; +}; + +//-->MH +typedef struct sDBEntry +{ + string strName; // might be able to delete this here + string strSeq; + comet_fileoffset_t lProteinFilePosition; + vector vectorPeffMod; + vector vectorPeffVariantSimple; + vector vectorPeffVariantComplex; + vector vectorPeffProcessed; +} sDBEntry; + +struct DBIndex +{ + vector pcVarModSites; // empty = unmodified; else [iLen+2] encoding var mods + comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList + double dPepMass; // MH+ pep mass + unsigned short siVarModProteinFilter = 0; // bitwise representation of mmapProtein + char cPrevAA; + char cNextAA; + char sPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated + + bool operator==(const DBIndex& rhs) const + { + if (strcmp(sPeptide, rhs.sPeptide) != 0) + return false; + + if (fabs(dPepMass - rhs.dPepMass) > FLOAT_ZERO) + return false; + + int iLen = (int)strlen(sPeptide) + 2; + for (int i = 0; i < iLen; ++i) + { + char l = pcVarModSites.empty() ? 0 : pcVarModSites[i]; + char r = rhs.pcVarModSites.empty() ? 0 : rhs.pcVarModSites[i]; + if (l != r) + return false; + } + + return true; + } + + bool operator<(const DBIndex& rhs) const + { + int cmp = strcmp(sPeptide, rhs.sPeptide); + if (cmp != 0) + return cmp < 0; + + if (fabs(dPepMass - rhs.dPepMass) > FLOAT_ZERO) + return dPepMass < rhs.dPepMass; + + int iLen = (int)strlen(sPeptide) + 2; + for (int i = 0; i < iLen; ++i) + { + char l = pcVarModSites.empty() ? 0 : pcVarModSites[i]; + char r = rhs.pcVarModSites.empty() ? 
0 : rhs.pcVarModSites[i]; + if (l != r) + return l < r; + } + + // FINAL tie-breaker: lowest protein index first in order + // to grab flanking residues from the first protein + return lIndexProteinFilePosition < rhs.lIndexProteinFilePosition; + } +}; + +// Compact fixed-size tuple used during plain-peptide index generation. +// Replaces heap-heavy DBIndex entries during the per-thread collection phase. +struct PepGenTuple +{ + char sPeptide[MAX_PEPTIDE_LEN]; // original AA letters (or L->I canonical), null-terminated + double dPepMass; // MH+ mass + comet_fileoffset_t lProteinFileOffset;// FASTA byte offset of the source protein + uint16_t siVarModProteinFilter; + char cPrevAA; + char cNextAA; +}; + +// --------------------------------------------------------------------------- +// 5-bit amino acid encoding for per-length short-peptide key packing. +// AAs are mapped in ASCII sort order (A=1, C=2, ..., Y=20) so that sorting +// packed uint64 keys is equivalent to lexicographic sort of sequences within +// a given peptide length. 
+// --------------------------------------------------------------------------- +static constexpr uint8_t kAA5bit[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0-15 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 16-31 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 32-47 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 48-63 + 0, // 64 '@' + 1, // 65 'A' + 0, // 66 'B' + 2, // 67 'C' + 3, // 68 'D' + 4, // 69 'E' + 5, // 70 'F' + 6, // 71 'G' + 7, // 72 'H' + 8, // 73 'I' (canonical for I/L when bTreatSameIL) + 0, // 74 'J' + 9, // 75 'K' + 10, // 76 'L' (remapped to 8 when bTreatSameIL) + 11, // 77 'M' + 12, // 78 'N' + 0, // 79 'O' + 13, // 80 'P' + 14, // 81 'Q' + 15, // 82 'R' + 16, // 83 'S' + 17, // 84 'T' + 0, // 85 'U' + 18, // 86 'V' + 19, // 87 'W' + 0, // 88 'X' + 20, // 89 'Y' + 0, // 90 'Z' + // 91-255: all zeros + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0 +}; + +// Reverse map: 5-bit code -> amino acid character. +// Code 8 always decodes to 'I' (canonical; L maps to code 8 when bTreatSameIL). +static constexpr char k5bitAA[32] = { + '\0','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R', + 'S', 'T','V','W','Y','\0','\0','\0','\0','\0','\0','\0','\0','\0','\0','\0' +}; + +// Pack up to 12 amino acids into a uint64 key (5 bits each, 60 bits total). +// When bTreatSameIL is true, L encodes identically to I. 
+inline uint64_t PackPeptide(const char* seq, int iLen, bool bTreatSameIL) +{ + uint64_t key = 0; + for (int i = 0; i < iLen; ++i) + { + char c = seq[i]; + if (bTreatSameIL && c == 'L') c = 'I'; + key |= ((uint64_t)kAA5bit[(unsigned char)c] << (55 - i * 5)); + } + return key; +} + +// Decode a packed key back to a null-terminated sequence of iLen characters. +inline void UnpackPeptide(uint64_t key, int iLen, char* seq) +{ + for (int i = 0; i < iLen; ++i) + seq[i] = k5bitAA[(key >> (55 - i * 5)) & 0x1F]; + seq[iLen] = '\0'; +} + +// Compact per-thread tuple for short peptides (len <= 12) during index generation. +// 32 bytes on 64-bit (8-byte alignment); uILMask occupies 2 of the 4 trailing pad bytes. +struct PepGenTupleShort +{ + uint64_t uPackedPep; // canonical 5-bit-encoded sequence (L treated as I when bTreatSameIL) + double dPepMass; + comet_fileoffset_t lProteinFileOffset; + uint16_t siVarModProteinFilter; + char cPrevAA; + char cNextAA; + uint16_t uILMask; // bitmask: bit k = 1 means position k was 'L' in FASTA original +}; + +// This is used for fragment indexing; plain peptides are stored in index +// file and read in to this data struct. Same as DBIndex w/o pcVarModSites[] +struct PlainPeptideIndexStruct +{ + comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList + double dPepMass; // MH+ pep mass, unmodified mass; modified mass in FragmentPeptidesStruct + unsigned short siVarModProteinFilter; // bitwise representation of mmapProtein + char cPrevAA; + char cNextAA; + char szPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated + + bool operator==(const PlainPeptideIndexStruct &rhs) const + { + return strcmp(szPeptide, rhs.szPeptide) == 0; + } +}; + +struct FragmentPeptidesStruct +{ + size_t iWhichPeptide; // reference to raw peptide (sequence, proteins, etc.) 
in PlainPeptideIndexStruct + int modNumIdx; + double dPepMass; // peptide mass (modified or unmodified) after permuting mods + char cNtermMod; + char cCtermMod; + + bool operator<(const FragmentPeptidesStruct& a) const + { + return dPepMass < a.dPepMass; + } +}; + +struct SpecLibStruct +{ + string strName; // any string associated with speclib entry + unsigned int iLibEntry; // a reference number associated with speclib entry + unsigned int iNumPeaks; + int iSpecLibCharge; // precursor charge; not relevant for MS1 speclib + double dSpecLibMW; // if a peptide, store neutral mass + float fRTime; + float fScaleMinInten; // min intensity of data prior to encoding to pccSparseFastXcorrData; 0.0 for unit vector + float fScaleMaxInten; // max intensity of data prior to encoding to ppcSparseFastXcorrData + vector> vSpecLibPeaks; + float* pfUnitVector; + unsigned int uiArraySizeMS1; +}; + +// for MS1 alignment +struct RetentionMatch +{ + double dQueryTime; + double dReferenceTime; + int iSpectrumIndex; + + RetentionMatch(double dQueryTime, double dReferenceTime, int iSpectrumIndex); +}; +extern std::deque RetentionMatchHistory; + +extern unsigned int* g_iFragmentIndex; // CSR flat data: all posting lists concatenated [g_iFragmentIndexOffset[bin]..g_iFragmentIndexOffset[bin+1]) +extern uint64_t* g_iFragmentIndexOffset; // CSR offsets [uiMaxFragmentArrayIndex+1]: cumulative entry counts, can exceed UINT_MAX for large non-enzymatic searches +extern vector g_vFragmentPeptides; +extern vector g_vRawPeptides; +extern bool* g_bIndexPrecursors; // allocate an array of BIN(max_precursor, protonated) and use a bool to indicate if that precursor is present in input file(s) +extern vector g_vSpecLib; +extern vector> g_vulSpecLibPrecursorIndex; // this will be an vector of vectors + +struct IndexProteinStruct // for indexed database +{ + char szProt[WIDTH_REFERENCE]; + comet_fileoffset_t lProteinFilePosition; + int iWhichProtein; +}; + +// Flat CSR-style storage for the per-peptide 
protein list. +// Replaces vector> to eliminate the ~190M +// individual heap allocations (one per inner vector) that caused a +// ~6-minute free-time tail when building an MHC .idx file. +// External interface mirrors vector> so +// existing call sites need no changes. +class ProteinsListCSR +{ +public: + // Read-only proxy for a single row (one peptide's protein offsets). + struct Row + { + const comet_fileoffset_t* ptr; + size_t n; + + size_t size() const { return n; } + bool empty() const { return n == 0; } + + const comet_fileoffset_t& operator[](size_t j) const { return ptr[j]; } + comet_fileoffset_t at(size_t j) const { return ptr[j]; } + + const comet_fileoffset_t* begin() const { return ptr; } + const comet_fileoffset_t* end() const { return ptr + n; } + }; + + // Size / state + size_t size() const { return m_off.empty() ? 0 : m_off.size() - 1; } + bool empty() const { return size() == 0; } + + // Modifiers + void clear() + { + vector().swap(m_flat); + vector().swap(m_off); + } + + void reserve(size_t n) { m_off.reserve(n + 1); } + + void push_back(const vector& v) + { + if (m_off.empty()) m_off.push_back(0); + m_flat.insert(m_flat.end(), v.begin(), v.end()); + m_off.push_back(m_flat.size()); + } + + void push_back(vector&& v) + { + if (m_off.empty()) m_off.push_back(0); + m_flat.insert(m_flat.end(), v.begin(), v.end()); + m_off.push_back(m_flat.size()); + vector().swap(v); // release source buffer immediately + } + + // Batch-append from pre-built flat storage. + // flat: all protein file offsets for this block, concatenated in row order + // cnt: number of offsets per row (max value bounded by iMaxDuplicateProteins) + // Bulk-copies both arrays into m_flat/m_off with two insert() calls, then + // releases the source buffers. Replaces N individual push_back(vector&&) + // calls, each of which required one heap free() -- this reduces N free()s + // to 2 (one for flat, one for cnt) regardless of how many rows are in the block. 
+ void append_flat(vector& flat, vector& cnt) + { + if (flat.empty()) + return; + if (m_off.empty()) + m_off.push_back(0); + m_flat.insert(m_flat.end(), flat.begin(), flat.end()); + for (uint32_t n : cnt) + m_off.push_back(m_off.back() + n); + vector().swap(flat); + vector().swap(cnt); + } + + // Element access + Row operator[](size_t i) const + { + return {m_flat.data() + m_off[i], + static_cast(m_off[i + 1] - m_off[i])}; + } + + Row at(size_t i) const { return (*this)[i]; } + + // Range-based for -- yields Row values + struct Iterator + { + const ProteinsListCSR* self; + size_t i; + + Row operator*() const { return (*self)[i]; } + Iterator& operator++() { ++i; return *this; } + bool operator!=(const Iterator& o) const { return i != o.i; } + }; + + Iterator begin() const { return {this, 0}; } + Iterator end() const { return {this, size()}; } + +private: + vector m_flat; // all protein offsets concatenated + vector m_off; // [N+1] CSR offsets; row i spans [m_off[i], m_off[i+1]) +}; + +extern ProteinsListCSR g_pvProteinsList; +extern std::unordered_map g_pvProteinNameCache; // file offset -> protein name string; populated at index load + +extern AScoreProCpp::AScoreOptions g_AScoreOptions; // AScore options +extern AScoreProCpp::AScoreDllInterface* g_AScoreInterface; + +struct ModificationNumber +{ +// int modificationNumber; + int modStringLen; // FIX: need to confirm if not needed (MOD_SEQS.at(modSeqIdx)).size(); + char* modifications; +}; + +extern vector MOD_NUMBERS; +extern vector MOD_SEQS; // Unique modifiable sequences. +extern int* MOD_SEQ_MOD_NUM_START; // Start index in the MOD_NUMBERS vector for a modifiable sequence; -1 if no modification numbers were generated +extern int* MOD_SEQ_MOD_NUM_CNT; // Total modifications numbers for a modifiable sequence. 
+ +// Index into the MOD_SEQS vector +// -1 for peptides that have no modifiable amino acids +// -2 for peptides with no modifiable amino acids but contain n/c-term mods +extern int* PEPTIDE_MOD_SEQ_IDXS; + +extern int MOD_NUM; +extern bool g_bPlainPeptideIndexRead; // set to true if plain peptide index file is read (and fragment index generated) + // poor choice of name for the fragment index .idx given peptide index is back +extern std::atomic g_bPeptideIndexRead; // set to true if peptide index file is read +extern bool g_bSpecLibRead; // set to true if spectral library file is read + +// g_bPerformSpecLibSearch, g_bPerformDatabaseSearch, g_bIdxNoFasta moved to SearchSession +// (Phase 4: batch path only -- see search/SearchSession.h) + +extern bool g_bCometPreprocessMemoryAllocated; // set to true when memory has been allocated +extern bool g_bCometSearchMemoryAllocated; // set to true when memory has been allocated + +// Query stores information for peptide scoring and results +// This struct is allocated for each spectrum/charge combination +struct Query +{ + int iXcorrHistogram[HISTO_SIZE]; + unsigned int uiHistogramCount; // # of entries in histogram + float fPar[4]; // parameters of LMA regression + + int iMatchPeptideCount; // # of peptides that get stored (i.e. are greater than lowest score) + int iDecoyMatchPeptideCount; // # of decoy peptides that get stored (i.e. are greater than lowest score) + + short siMaxXcorr; // index of maximum correlation score in iXcorrHistogram + + short siLowestXcorrScoreIndex; + short siLowestDecoyXcorrScoreIndex; + + double dLowestXcorrScore; + double dLowestDecoyXcorrScore; + + float fLowestSpecLibScore; + + int iMinXcorrHisto; // min xcorr score for xcorr histogram to address good E-values for poor/sparse spectra + + double dMangoIndex; // scan number decimal precursor value i.e. 
2401.001 for scan 2401, first precursor/z pair + + unsigned long int _uliNumMatchedPeptides; // # of peptides that get scored + unsigned long int _uliNumMatchedDecoyPeptides; + + // When true, sparse child arrays (float[SPARSE_MATRIX_SIZE]) belong to the + // thread-local RtsScratch pool and must NOT be delete[]'d by the destructor. + // Set only by PreprocessSingleSpectrumThreadLocal via PreprocessSingleSpectrumCore. + bool bSparseFromPool; + + // Sparse matrix representation of data + int iSpScoreData; //size of sparse matrix + int iFastXcorrDataSize; + float **ppfSparseSpScoreData; + float **ppfSparseFastXcorrData; + float **ppfSparseFastXcorrDataNL; // ppfSparseFastXcorrData with NH3, H2O contributions + + // Store raw peaks for AScorePro + + // List of ms/ms masses for fragment index search; intensity not important at this stage + vector vfRawFragmentPeakMass; + // Consider replacing vfRawFragmentPeakMass with a vector> to store + // both mass and intensity if AScorePro is used + vector vRawFragmentPeakMassIntensity; + + + PepMassInfo _pepMassInfo; + SpectrumInfoInternal _spectrumInfoInternal; + Results* _pResults; + Results* _pDecoys; + SpecLibResults* _pSpecLibResults; + + std::chrono::high_resolution_clock::time_point tSearchStart; // per-query search start time for iMaxIndexRunTime timeout + + Mutex accessMutex; + + Query() + { + memset(iXcorrHistogram, 0, sizeof(iXcorrHistogram)); + + iMatchPeptideCount = 0; + iDecoyMatchPeptideCount = 0; + uiHistogramCount = 0; + iMinXcorrHisto = 0; + + fPar[0]=0.0; + fPar[1]=0.0; + fPar[2]=0.0; + fPar[3]=0.0; + + siMaxXcorr = 0; // index of maximum correlation score in iXcorrHistogram + siLowestXcorrScoreIndex = 0; + siLowestDecoyXcorrScoreIndex = 0; + + dLowestXcorrScore = XCORR_CUTOFF; + dLowestDecoyXcorrScore = XCORR_CUTOFF; + + fLowestSpecLibScore = SPECLIB_CUTOFF; + + dMangoIndex = 0.0; + + _uliNumMatchedPeptides = 0; + _uliNumMatchedDecoyPeptides = 0; + + bSparseFromPool = false; + + ppfSparseSpScoreData = NULL; + 
ppfSparseFastXcorrData = NULL; + ppfSparseFastXcorrDataNL = NULL; // ppfSparseFastXcorrData with NH3, H2O contributions + + vfRawFragmentPeakMass.clear(); + vRawFragmentPeakMassIntensity.clear(); + + _pepMassInfo.dCalcPepMass = 0.0; + _pepMassInfo.dExpPepMass = 0.0; + _pepMassInfo.dPeptideMassToleranceLow = 0.0; + _pepMassInfo.dPeptideMassToleranceHigh = 0.0; + _pepMassInfo.dPeptideMassToleranceMinus = 0.0; + _pepMassInfo.dPeptideMassTolerancePlus = 0.0; + + _spectrumInfoInternal.dTotalIntensity = 0.0; + _spectrumInfoInternal.iArraySize = 0; + _spectrumInfoInternal.iHighestIon = 0; + _spectrumInfoInternal.iScanNumber = 0; + _spectrumInfoInternal.dTotalIntensity = 0.0; + + _pResults = NULL; + _pDecoys = NULL; + _pSpecLibResults = NULL; + + Threading::InitMutex(&accessMutex); + } + + ~Query() + { + int i; + if (!bSparseFromPool) + { + for (i = 0; i < iSpScoreData; ++i) + { + if (ppfSparseSpScoreData[i] != NULL) + delete[] ppfSparseSpScoreData[i]; + } + } + delete[] ppfSparseSpScoreData; + ppfSparseSpScoreData = NULL; + + if (g_staticParams.ionInformation.bUseWaterAmmoniaLoss + && (g_staticParams.ionInformation.iIonVal[ION_SERIES_A] + || g_staticParams.ionInformation.iIonVal[ION_SERIES_B] + || g_staticParams.ionInformation.iIonVal[ION_SERIES_Y])) + { + if (!bSparseFromPool) + { + for (i = 0; i < iFastXcorrDataSize; ++i) + { + if (ppfSparseFastXcorrData[i] != NULL) + delete[] ppfSparseFastXcorrData[i]; + if (ppfSparseFastXcorrDataNL[i]!=NULL) + delete[] ppfSparseFastXcorrDataNL[i]; + } + } + delete[] ppfSparseFastXcorrDataNL; + ppfSparseFastXcorrDataNL = NULL; + } + else + { + if (!bSparseFromPool) + { + for (i = 0; i < iFastXcorrDataSize; ++i) + { + if (ppfSparseFastXcorrData[i] != NULL) + delete[] ppfSparseFastXcorrData[i]; + } + } + } + delete[] ppfSparseFastXcorrData; + ppfSparseFastXcorrData = NULL; + + if (_pResults != NULL) + { + _pResults->pWhichProtein.clear(); + if (g_staticParams.options.iDecoySearch == 1) + _pResults->pWhichDecoyProtein.clear(); + delete[] 
_pResults; + _pResults = NULL; + } + + if (g_staticParams.options.iDecoySearch == 2 && _pDecoys != NULL) + { + _pDecoys->pWhichDecoyProtein.clear(); + delete[] _pDecoys; + _pDecoys = NULL; + } + + Threading::DestroyMutex(accessMutex); + } +}; + +struct QueryMS1 +{ + // short siLowestSpecLibIndex; + // float fLowestXcorr; + unsigned int uiMatchMS1Count; // # of peptides that get stored (i.e. are greater than lowest score) + unsigned int iArraySizeMS1; // dimension of pcFastXcorrData + + // Standard array representation of data + // Library spectra are fast xcorr manipulated so non need to do so with query MS1 + float* pfFastXcorrData; + + SpecLibResultsMS1 _pSpecLibResultsMS1; + + Mutex accessMutex; + + QueryMS1() + { + // siLowestSpecLibIndex = 0; + // fLowestXcorr = SPECLIB_CUTOFF; + uiMatchMS1Count = 0; + pfFastXcorrData = NULL; + _pSpecLibResultsMS1.fDotProduct = 0.0; + _pSpecLibResultsMS1.fRTime = 0.0; + + Threading::InitMutex(&accessMutex); + } + + ~QueryMS1() + { + //FIX delete _pSepcLibResults + + Threading::DestroyMutex(accessMutex); + } +}; + +// g_pvQuery and g_pvQueryMS1 moved to SearchSession.queries / SearchSession.ms1Queries +// (Phase 4: batch path only -- see search/SearchSession.h) +extern vector g_pvInputFiles; +extern Mutex g_pvQueryMutex; +extern Mutex g_pvDBIndexMutex; +extern Mutex g_preprocessMemoryPoolMutex; +extern Mutex g_dbIndexMutex; +extern Mutex g_vSpecLibMutex; + +extern vector g_pvDBIndex; // used in both peptide index and fragment ion index; latter to store plain peptides +// Per-length, per-thread generation buffers. Outer index = (iLen - iMinLen) for short, +// (iLen - 13) for long. Inner index = thread slot. 
+extern vector>> g_vvvPepGenShort; // lengths <= 12 +extern vector>> g_vvvPepGenLong; // lengths > 12 +extern std::map g_pvProteinNames; // indexed database protein names and file positions + +struct IonSeriesStruct // defines which fragment ion series are considered +{ + int bPreviousMatch[8]; +}; + + +struct MatchedIonsStruct // for SingleSpectrumSearch +{ + double dMass; + double dInten; + + bool operator<(const MatchedIonsStruct& a) const + { + return dInten > a.dInten; + } +}; + +#endif // _COMETTYPES_H_ diff --git a/CometSearch/output/IResultWriter.h b/CometSearch/output/IResultWriter.h new file mode 100644 index 00000000..d0d6d0cf --- /dev/null +++ b/CometSearch/output/IResultWriter.h @@ -0,0 +1,106 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _IRESULTWRITER_H_ +#define _IRESULTWRITER_H_ + +#include +#include +#include + +class CometSearchManager; +class CometStatus; +struct Query; + +// Parameters passed to each writer's open() method. +struct WriterOpenCtx +{ + // pStatus is required: every writer's open() dereferences it unconditionally on + // the file-open-failure path with no null check, so making it constructor-only + // (rather than a default-nullptr field set later like the rest of this struct) + // turns "forgot to set pStatus" from a runtime null-pointer crash into a compile + // error at the construction site. 
+ explicit WriterOpenCtx(CometStatus& status) : pStatus(&status) {} + + const char* szBaseName = nullptr; + const char* szOutputSuffix = nullptr; + const char* szTxtFileExt = nullptr; // TxtWriter only + bool bEntireFile = false; // true => no scan-range suffix on output name + int iFirstScan = 0; + int iLastScan = 0; + int iDecoySearch = 0; // 0=off, 1=concat, 2=separate + bool bIdxNoFasta = false; // .idx DB with no companion .fasta (mzIdentML) + CometSearchManager* pMgr = nullptr; // for format headers that need ICometSearchManager + CometStatus* const pStatus; // session error/cancel state; never null, set once above +}; + +// Parameters passed to each writer's write() method (per-batch). +struct WriterWriteCtx +{ + FILE* fpdb; + int iScanOffset; // iTotalSpectraSearched - queries.size(); pepXML only + int iBatchNum; // mzIdentML only + const std::vector* pQueries; // batch query results for this write call +}; + +class IResultWriter +{ +public: + virtual ~IResultWriter() = default; + + // Open output file(s) and write format header. + // Returns false on error. + virtual bool open(const WriterOpenCtx& ctx) = 0; + + // Write all results in ctx.pQueries (the current batch's session.queries) for one batch. + // Returns false on error. + virtual bool write(const WriterWriteCtx& ctx) = 0; + + // Write format footer (if any), close file(s), and optionally remove + // them (bEmpty = iTotalSpectraSearched == 0). + // + // Contract: must be safe to call even if open() was never called, or returned + // false partway through. Pipeline::run() calls close(false, false) on every + // writer in the vector -- including ones after the one whose open() failed -- + // when any writer's open() fails, so implementations must null-check their file + // handles before touching them. + virtual void close(bool bSucceeded, bool bEmpty) = 0; + +protected: + // Shared output-filename builder used by all format writers. 
+ static void BuildNames(const WriterOpenCtx& ctx, + const char* ext, + const char* extDecoy, + std::string& sTarget, + std::string& sDecoy, + const char* extTargetCrux = nullptr) + { + std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; + std::string range; + if (!ctx.bEntireFile) + range = "." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); +#ifdef CRUX + if (ctx.iDecoySearch == 2) + { sTarget = base + range + (extTargetCrux ? extTargetCrux : ext); sDecoy = base + range + extDecoy; } + else + sTarget = base + range + ext; +#else + (void)extTargetCrux; + sTarget = base + range + ext; + if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; +#endif + } +}; + +#endif // _IRESULTWRITER_H_ diff --git a/CometSearch/output/MzIdentMlWriter.h b/CometSearch/output/MzIdentMlWriter.h new file mode 100644 index 00000000..c4a48a51 --- /dev/null +++ b/CometSearch/output/MzIdentMlWriter.h @@ -0,0 +1,147 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef _MZIDENTMLWRITER_H_ +#define _MZIDENTMLWRITER_H_ + +#include "output/IResultWriter.h" +#include "CometWriteMzIdentML.h" +#include "CometStatus.h" +#include "Common.h" + +class MzIdentMlWriter : public IResultWriter +{ +public: + explicit MzIdentMlWriter(CometSearchManager* pMgr) : _pMgr(pMgr) {} + + bool open(const WriterOpenCtx& ctx) override + { + _bIdxNoFasta = ctx.bIdxNoFasta; + _pStatus = ctx.pStatus; + BuildNames(ctx, ".mzid", ".decoy.mzid", _sTarget, _sDecoy, ".target.mzid"); + + _fpout = fopen(_sTarget.c_str(), "w"); + if (!_fpout) + { + std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + if (!OpenTmp(_sTarget, _sTgtTmp, _fpoutTmp, ctx.pStatus)) return false; + + if (ctx.iDecoySearch == 2) + { + _fpoutd = fopen(_sDecoy.c_str(), "w"); + if (!_fpoutd) + { + std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + if (!OpenTmp(_sDecoy, _sDecTmp, _fpoutdTmp, ctx.pStatus)) return false; + } + return true; + } + + bool write(const WriterWriteCtx& ctx) override + { + _fpdb = ctx.fpdb; // remember for close() + CometWriteMzIdentML::WriteMzIdentMLTmp(_fpoutTmp, _fpoutdTmp, ctx.iBatchNum, *ctx.pQueries); + return true; + } + + void close(bool bSucceeded, bool bEmpty) override + { + FinalizeOne(_fpout, _fpoutTmp, _sTgtTmp, bSucceeded, bEmpty); + FinalizeOne(_fpoutd, _fpoutdTmp, _sDecTmp, bSucceeded, bEmpty); + if (bEmpty) + { + if (!_sTarget.empty()) remove(_sTarget.c_str()); + if (!_sDecoy.empty()) remove(_sDecoy.c_str()); + if (!_sTgtTmp.empty()) remove(_sTgtTmp.c_str()); + if (!_sDecTmp.empty()) remove(_sDecTmp.c_str()); + } + } + +private: + CometSearchManager* _pMgr = nullptr; + CometStatus* _pStatus = nullptr; + FILE* _fpout = nullptr; + FILE* _fpoutd = nullptr; + FILE* _fpoutTmp = nullptr; + FILE* _fpoutdTmp = nullptr; + FILE* _fpdb = 
nullptr; + bool _bIdxNoFasta = false; + std::string _sTarget, _sDecoy, _sTgtTmp, _sDecTmp; + + bool OpenTmp(const std::string& sBase, std::string& sTmp, FILE*& fp, CometStatus* pStatus) + { + sTmp = sBase + ".XXXXXX"; + bool bTmpOk; +#ifdef _WIN32 + bTmpOk = (_mktemp_s(&sTmp[0], sTmp.size() + 1) == 0); +#else + { + int fd = mkstemp(&sTmp[0]); + if (fd != -1) ::close(fd); // release kernel fd; fopen below opens its own handle + bTmpOk = (fd != -1); + } +#endif + if (!bTmpOk) + { + std::string msg = " Error - cannot create temporary file \"" + sTmp + "\".\n"; + pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + fp = fopen(sTmp.c_str(), "w"); + if (!fp) + { + std::string msg = " Error - cannot write to temporary file \"" + sTmp + "\".\n"; + pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + return true; + } + + void FinalizeOne(FILE*& fpFinal, FILE*& fpTmp, const std::string& sTmp, + bool bSucceeded, bool bEmpty) + { + if (!fpFinal) return; + if (bSucceeded && fpTmp) + { + fclose(fpTmp); + fpTmp = fopen(sTmp.c_str(), "r"); + if (fpTmp) + { + CometWriteMzIdentML::WriteMzIdentML(fpFinal, _fpdb, sTmp, *_pMgr, _bIdxNoFasta); + fclose(fpTmp); fpTmp = nullptr; + if (!bEmpty) remove(sTmp.c_str()); + } + else + { + std::string msg = " Error - cannot reopen temporary mzIdentML file \"" + sTmp + "\" for merge.\n"; + _pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); + } + } + else if (fpTmp) + { + fclose(fpTmp); fpTmp = nullptr; + if (!bEmpty) remove(sTmp.c_str()); + } + fclose(fpFinal); fpFinal = nullptr; + } + +}; + +#endif // _MZIDENTMLWRITER_H_ diff --git a/CometSearch/output/PepXmlWriter.h b/CometSearch/output/PepXmlWriter.h new file mode 100644 index 00000000..8130010c --- /dev/null +++ b/CometSearch/output/PepXmlWriter.h @@ -0,0 +1,83 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _PEPXMLWRITER_H_ +#define _PEPXMLWRITER_H_ + +#include "output/IResultWriter.h" +#include "CometWritePepXML.h" +#include "CometStatus.h" +#include "Common.h" + +class PepXmlWriter : public IResultWriter +{ +public: + bool open(const WriterOpenCtx& ctx) override + { + BuildNames(ctx, ".pep.xml", ".decoy.pep.xml", _sTarget, _sDecoy, ".target.pep.xml"); + + if ((_fpout = fopen(_sTarget.c_str(), "w")) == NULL) + { + std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + if (!CometWritePepXML::WritePepXMLHeader(_fpout, *ctx.pMgr)) + return false; + + if (ctx.iDecoySearch == 2) + { + if ((_fpoutd = fopen(_sDecoy.c_str(), "w")) == NULL) + { + std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + if (!CometWritePepXML::WritePepXMLHeader(_fpoutd, *ctx.pMgr)) + return false; + } + return true; + } + + bool write(const WriterWriteCtx& ctx) override + { + CometWritePepXML::WritePepXML(_fpout, _fpoutd, ctx.fpdb, ctx.iScanOffset, *ctx.pQueries); + return true; + } + + void close(bool bSucceeded, bool bEmpty) override + { + if (_fpout) + { + if (bSucceeded) CometWritePepXML::WritePepXMLEndTags(_fpout); + fclose(_fpout); _fpout = nullptr; + if (bEmpty) remove(_sTarget.c_str()); + } + if (_fpoutd) + { + if (bSucceeded) CometWritePepXML::WritePepXMLEndTags(_fpoutd); + fclose(_fpoutd); _fpoutd = nullptr; + if (bEmpty && !_sDecoy.empty()) 
remove(_sDecoy.c_str());
+      }
+   }
+
+private:
+   FILE* _fpout = nullptr;
+   FILE* _fpoutd = nullptr;
+   std::string _sTarget;
+   std::string _sDecoy;
+
+};
+
+#endif // _PEPXMLWRITER_H_
diff --git a/CometSearch/output/PercolatorWriter.h b/CometSearch/output/PercolatorWriter.h
new file mode 100644
index 00000000..8626ee82
--- /dev/null
+++ b/CometSearch/output/PercolatorWriter.h
@@ -0,0 +1,61 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _PERCOLATORWRITER_H_
+#define _PERCOLATORWRITER_H_
+
+#include "output/IResultWriter.h"
+#include "CometWritePercolator.h"
+#include "CometStatus.h"
+#include "Common.h"
+
+// Result writer for Percolator ".pin" output.
+// Holds a single output stream; there is no separate decoy file.
+class PercolatorWriter : public IResultWriter
+{
+public:
+   // Builds the output path with BuildNames(), opens it for writing and emits
+   // the Percolator header. On fopen failure the error is recorded on
+   // ctx.pStatus, logged, and false is returned.
+   bool open(const WriterOpenCtx& ctx) override
+   {
+      std::string sUnused;   // decoy-name slot of BuildNames(); not used by this writer
+      BuildNames(ctx, ".pin", ".pin", _sPath, sUnused, ".pin");
+
+      _fpout = fopen(_sPath.c_str(), "w");
+      if (!_fpout)
+      {
+         std::string msg = " Error - cannot write to file \"" + _sPath + "\".\n";
+         ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg);
+         return false;
+      }
+      CometWritePercolator::WritePercolatorHeader(_fpout);
+      return true;
+   }
+
+   // Writes one batch of queries; returns whatever WritePercolator() reports.
+   bool write(const WriterWriteCtx& ctx) override
+   {
+      return CometWritePercolator::WritePercolator(_fpout, ctx.fpdb, *ctx.pQueries);
+   }
+
+   // Closes the stream if it is open; when bEmpty is set the output file is
+   // deleted afterwards. The success flag is not consulted.
+   void close(bool /*bSucceeded*/, bool bEmpty) override
+   {
+      if (_fpout)
+      {
+         fclose(_fpout); _fpout = nullptr;
+         if (bEmpty) remove(_sPath.c_str());
+      }
+   }
+
+private:
+   FILE* _fpout = nullptr;   // output stream; nullptr while closed
+   std::string _sPath;       // path produced by BuildNames() in open()
+};
+
+#endif // _PERCOLATORWRITER_H_
diff --git a/CometSearch/output/SqtWriter.h b/CometSearch/output/SqtWriter.h
new file mode 100644
index 00000000..bc2e8263
--- /dev/null
+++ b/CometSearch/output/SqtWriter.h
@@ -0,0 +1,82 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _SQTWRITER_H_
+#define _SQTWRITER_H_
+
+#include "output/IResultWriter.h"
+#include "CometWriteSqt.h"
+#include "CometStatus.h"
+#include "Common.h"
+
+// Result writer for SQT output, with an optional separate decoy file.
+class SqtWriter : public IResultWriter
+{
+public:
+   // Opens the target file (and the decoy file when ctx.iDecoySearch == 2) and
+   // prints the SQT header to each. Files are only opened when
+   // g_staticParams.options.bOutputSqtFile is set; otherwise both handles stay
+   // nullptr and open() still returns true.
+   // NOTE(review): write() then passes null handles to WriteSqt() -- presumably
+   // that is the stream-only mode; confirm against CometWriteSqt.
+   bool open(const WriterOpenCtx& ctx) override
+   {
+      if (g_staticParams.options.bOutputSqtFile)
+      {
+         BuildNames(ctx, ".sqt", ".decoy.sqt", _sTarget, _sDecoy, ".target.sqt");
+
+         if ((_fpout = fopen(_sTarget.c_str(), "w")) == NULL)
+         {
+            std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n";
+            ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg);
+            return false;
+         }
+         CometWriteSqt::PrintSqtHeader(_fpout, *ctx.pMgr);
+
+         if (ctx.iDecoySearch == 2)
+         {
+            if ((_fpoutd = fopen(_sDecoy.c_str(), "w")) == NULL)
+            {
+               std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n";
+               ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg);
+               return false;
+            }
+            CometWriteSqt::PrintSqtHeader(_fpoutd, *ctx.pMgr);
+         }
+      }
+      return true;
+   }
+
+   // Forwards one batch to WriteSqt(); always reports success.
+   bool write(const WriterWriteCtx& ctx) override
+   {
+      CometWriteSqt::WriteSqt(_fpout, _fpoutd, ctx.fpdb, *ctx.pQueries);
+      return true;
+   }
+
+   // Closes whichever streams are open; when bEmpty is set the corresponding
+   // files are deleted afterwards. The success flag is not consulted.
+   void close(bool /*bSucceeded*/, bool bEmpty) override
+   {
+      if (_fpout)
+      {
+         fclose(_fpout); _fpout = nullptr;
+         if (bEmpty) remove(_sTarget.c_str());
+      }
+      if (_fpoutd)
+      {
+         fclose(_fpoutd); _fpoutd = nullptr;
+         if (bEmpty && !_sDecoy.empty()) remove(_sDecoy.c_str());
+      }
+   }
+
+private:
+   FILE* _fpout = nullptr;    // target stream
+   FILE* _fpoutd = nullptr;   // decoy stream; only opened when iDecoySearch == 2
+   std::string _sTarget;
+   std::string _sDecoy;
+
+};
+
+#endif // _SQTWRITER_H_
diff --git a/CometSearch/output/TxtWriter.h b/CometSearch/output/TxtWriter.h
new file mode 100644
index 00000000..1d5f1014
--- /dev/null
+++ b/CometSearch/output/TxtWriter.h
@@ -0,0 +1,83 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _TXTWRITER_H_
+#define _TXTWRITER_H_
+
+#include "output/IResultWriter.h"
+#include "CometWriteTxt.h"
+#include "CometStatus.h"
+#include "Common.h"
+
+// Result writer for the text output format, with an optional separate decoy
+// file. The file extension comes from ctx.szTxtFileExt.
+class TxtWriter : public IResultWriter
+{
+public:
+   // Opens the target file (and the decoy file when ctx.iDecoySearch == 2) and
+   // prints the text header to each. Returns false, after recording and logging
+   // the error, when either file cannot be opened for writing.
+   bool open(const WriterOpenCtx& ctx) override
+   {
+      const std::string sExt(ctx.szTxtFileExt);
+      const std::string sPlainExt  = "." + sExt;
+      const std::string sDecoyExt  = ".decoy." + sExt;
+      const std::string sTargetExt = ".target." + sExt;
+      BuildNames(ctx, sPlainExt.c_str(), sDecoyExt.c_str(), _sTarget, _sDecoy, sTargetExt.c_str());
+
+      _fpout = fopen(_sTarget.c_str(), "w");
+      if (_fpout == NULL)
+         return reportOpenFailure(ctx, " Error - cannot write to file \"" + _sTarget + "\".\n");
+
+      CometWriteTxt::PrintTxtHeader(_fpout);
+      fflush(_fpout);
+
+      // Only the separate-decoy mode gets a second file.
+      if (ctx.iDecoySearch != 2)
+         return true;
+
+      _fpoutd = fopen(_sDecoy.c_str(), "w");
+      if (_fpoutd == NULL)
+         return reportOpenFailure(ctx, " Error - cannot write to decoy file \"" + _sDecoy + "\".\n");
+
+      CometWriteTxt::PrintTxtHeader(_fpoutd);
+      return true;
+   }
+
+   // Forwards one batch to WriteTxt(); always reports success.
+   bool write(const WriterWriteCtx& ctx) override
+   {
+      CometWriteTxt::WriteTxt(_fpout, _fpoutd, ctx.fpdb, *ctx.pQueries);
+      return true;
+   }
+
+   // Closes whichever streams are open; when bEmpty is set the corresponding
+   // files are deleted afterwards. The success flag is not consulted.
+   void close(bool /*bSucceeded*/, bool bEmpty) override
+   {
+      if (_fpout != nullptr)
+      {
+         fclose(_fpout);
+         _fpout = nullptr;
+         if (bEmpty)
+            remove(_sTarget.c_str());
+      }
+
+      if (_fpoutd != nullptr)
+      {
+         fclose(_fpoutd);
+         _fpoutd = nullptr;
+         if (bEmpty && !_sDecoy.empty())
+            remove(_sDecoy.c_str());
+      }
+   }
+
+private:
+   // Records the failure on the status object, logs it, and yields the
+   // `false` that open() returns to its caller.
+   static bool reportOpenFailure(const WriterOpenCtx& ctx, std::string sMsg)
+   {
+      ctx.pStatus->SetStatus(CometResult_Failed, sMsg); logerr(sMsg);
+      return false;
+   }
+
+   FILE* _fpout = nullptr;
+   FILE* _fpoutd = nullptr;
+   std::string _sTarget;
+   std::string _sDecoy;
+
+};
+
+#endif // _TXTWRITER_H_
diff --git a/CometSearch/search/FastaStrategy.cpp b/CometSearch/search/FastaStrategy.cpp
new file mode 100644
index 00000000..dc0ab427
--- /dev/null
+++ b/CometSearch/search/FastaStrategy.cpp
@@ -0,0 +1,79 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache
License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Common.h"
+#include "FastaStrategy.h"
+#include "SearchUtils.h"
+#include "CometPreprocess.h"
+#include "CometSearch.h"
+#include "CometPostAnalysis.h"
+#include "CometSearchManager.h"
+#include "MSReader.h"
+
+// Allocates the per-thread preprocess and search memory pools.
+// Returns false as soon as either allocation fails.
+bool FastaStrategy::initialize(SearchSession& /*session*/, ThreadPool* /*tp*/)
+{
+   if (!CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads))
+      return false;
+
+   if (!CometSearch::AllocateMemory(g_staticParams.options.iNumThreads))
+      return false;
+
+   return true;
+}
+
+// Opens the FASTA database for reading and hands the same handle back as fpdb.
+// fpidx is always nullptr for this strategy. When the session does no database
+// search, nothing is opened and all three handles stay nullptr.
+// Returns false (status set, error logged) when the FASTA file cannot be opened.
+bool FastaStrategy::openFiles(const std::string& szDatabase,
+                              FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                              SearchSession& session)
+{
+   fpfasta = nullptr;
+   fpidx = nullptr;
+   fpdb = nullptr;
+
+   if (!session.bPerformDatabaseSearch)
+      return true;
+
+   if ((fpfasta = fopen(szDatabase.c_str(), "r")) == nullptr)
+   {
+      string strErrorMsg = " Error (1b) - cannot read sequence database file \"" + szDatabase + "\".\n";
+      session.statusRef.SetStatus(CometResult_Failed, strErrorMsg);
+      logerr(strErrorMsg);
+      return false;
+   }
+
+   // session.bIdxNoFasta is deliberately left untouched: it stays false for
+   // FASTA searches.
+   fpdb = fpfasta;
+
+   return true;
+}
+
+// Runs one batch through the shared legacy three-sweep helper.
+bool FastaStrategy::executeBatch(MSToolkit::MSReader& mstReader,
+                                 int iFirstScan, int iLastScan, int iAnalysisType,
+                                 int& iPercentStart, int& iPercentEnd,
+                                 ThreadPool* tp, SearchSession& session)
+{
+   return executeBatchLegacy(mstReader, iFirstScan, iLastScan, iAnalysisType,
+                             iPercentStart, iPercentEnd, tp, session, true);
+}
+
+// Closes the FASTA handle opened by openFiles().
+void FastaStrategy::closeFiles(FILE* fpfasta, FILE* fpidx)
+{
+   (void)fpidx;  // always nullptr for FASTA searches
+   if (fpfasta != nullptr) fclose(fpfasta);
+}
+
+// Releases the memory pools allocated by initialize().
+void FastaStrategy::finalize()
+{
+   CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads);
+   CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads);
+}
diff --git a/CometSearch/search/FastaStrategy.h b/CometSearch/search/FastaStrategy.h
new file mode 100644
index 00000000..9368c37f
--- /dev/null
+++ b/CometSearch/search/FastaStrategy.h
@@ -0,0 +1,39 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "ISearchStrategy.h"
+
+// Search strategy for FASTA_DB (classic three-sweep) batch searches.
+//
+// initialize(): allocates search and preprocess memory pools.
+// executeBatch(): LoadAndPreprocessSpectra -> RunSearch -> PostAnalysis.
+// finalize(): frees memory pools.
+class FastaStrategy : public ISearchStrategy
+{
+public:
+   // Allocates the preprocess/search memory pools.
+   bool initialize(SearchSession& session, ThreadPool* tp) override;
+   // Opens the FASTA file; fpdb aliases fpfasta and fpidx stays nullptr.
+   bool openFiles(const std::string& szDatabase,
+                  FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                  SearchSession& session) override;
+   // Delegates to the shared legacy batch helper.
+   bool executeBatch(MSToolkit::MSReader& mstReader,
+                     int iFirstScan, int iLastScan, int iAnalysisType,
+                     int& iPercentStart, int& iPercentEnd,
+                     ThreadPool* tp, SearchSession& session) override;
+   // Closes the FASTA handle; fpidx is ignored.
+   void closeFiles(FILE* fpfasta, FILE* fpidx) override;
+   // Frees the memory pools.
+   void finalize() override;
+   // FASTA searches use the verbose per-file progress banners.
+   bool isIndexBased() const override { return false; }
+};
diff --git a/CometSearch/search/FiStrategy.cpp b/CometSearch/search/FiStrategy.cpp
new file mode 100644
index 00000000..7b766289
--- /dev/null
+++ b/CometSearch/search/FiStrategy.cpp
@@ -0,0 +1,178 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Common.h"
+#include "FiStrategy.h"
+#include "SearchUtils.h"
+#include "CometFragmentIndex.h"
+#include "CometPreprocess.h"
+#include "CometSearch.h"
+#include "CometPostAnalysis.h"
+#include "CometMassSpecUtils.h"
+#include "MSReader.h"
+
+// NOTE(review): the element type was missing from this declaration as received;
+// InputFileInfo* is inferred from the UpdateInputFile() call below -- confirm
+// against the defining translation unit.
+extern std::vector<InputFileInfo*> g_pvInputFiles;
+extern bool g_bPlainPeptideIndexRead;
+extern unsigned int* g_iFragmentIndex;
+extern uint64_t* g_iFragmentIndexOffset;
+extern bool* g_bIndexPrecursors;
+
+// Allocates the memory pools, optionally pre-reads precursors from every input
+// file, then loads the plain peptide index and builds the fragment index.
+// Returns false when an allocation, an input-file update or a precursor read
+// fails.
+bool FiStrategy::initialize(SearchSession& session, ThreadPool* tp)
+{
+   if (!CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads))
+      return false;
+
+   if (!CometSearch::AllocateMemory(g_staticParams.options.iNumThreads))
+      return false;
+
+   // Pre-read precursors across all input files before building the index.
+   if (session.bPerformDatabaseSearch && !g_staticParams.options.iFragIndexSkipReadPrecursors)
+   {
+      auto tTime1 = chrono::steady_clock::now();
+      if (!g_staticParams.options.bOutputSqtStream)
+      {
+         cout << " - read precursors ... ";
+         fflush(stdout);
+      }
+
+      for (int i = 0; i < (int)g_pvInputFiles.size(); ++i)
+      {
+         if (!UpdateInputFile(g_pvInputFiles.at(i)))
+            return false;
+
+         MSReader mstReader;
+         SetMSLevelFilter(mstReader);
+         CometPreprocess::Reset();
+
+         if (!CometPreprocess::ReadPrecursors(mstReader))
+            return false;
+      }
+
+      if (!g_staticParams.options.bOutputSqtStream)
+         cout << CometMassSpecUtils::ElapsedTime(tTime1) << endl;
+   }
+
+   // Load plain peptide index (.idx) and build the in-memory fragment index.
+   if (session.bPerformDatabaseSearch && !g_bPlainPeptideIndexRead)
+   {
+      auto tStartTime = chrono::steady_clock::now();
+      if (!g_staticParams.options.bOutputSqtStream)
+      {
+         cout << " - read .idx ... ";
+         fflush(stdout);
+      }
+
+      // NOTE(review): the results of ReadPlainPeptideIndex() and
+      // CreateFragmentIndex() are not checked here -- confirm whether they can
+      // report failure.
+      CometFragmentIndex sqSearch;
+      sqSearch.ReadPlainPeptideIndex();
+
+      if (!g_staticParams.options.bOutputSqtStream)
+         cout << CometMassSpecUtils::ElapsedTime(tStartTime) << endl;
+
+      sqSearch.CreateFragmentIndex(tp);
+   }
+
+   return true;
+}
+
+// Opens the .idx file (required) and its companion .fasta (optional).
+// fpdb aliases fpidx. session.bIdxNoFasta is set when no companion .fasta can
+// be opened. When the session does no database search, nothing is opened.
+// Returns false (status set, error logged) when the .idx file cannot be opened.
+bool FiStrategy::openFiles(const std::string& szDatabase,
+                           FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                           SearchSession& session)
+{
+   fpfasta = nullptr;
+   fpidx = nullptr;
+   fpdb = nullptr;
+
+   if (!session.bPerformDatabaseSearch)
+      return true;
+
+   string sTmpDB = szDatabase;
+
+   if ((fpidx = fopen(sTmpDB.c_str(), "r")) == nullptr)
+   {
+      string strErrorMsg = " Error (1a) - cannot read .idx file \"" + sTmpDB + "\".\n";
+      session.statusRef.SetStatus(CometResult_Failed, strErrorMsg);
+      logerr(strErrorMsg);
+      return false;
+   }
+
+   // Try to open the companion .fasta (not required for FI_DB search).
+   // Its name is the .idx name minus the 4-character extension. Check the
+   // length first: erase(size() - 4) on a shorter name wraps the unsigned
+   // position and throws std::out_of_range. A name of 4 characters or fewer
+   // leaves nothing to open, so it is treated as "no companion .fasta".
+   if (sTmpDB.size() > 4)
+   {
+      sTmpDB.erase(sTmpDB.size() - 4);  // strip .idx
+      fpfasta = fopen(sTmpDB.c_str(), "r");
+   }
+
+   if (fpfasta == nullptr)
+      session.bIdxNoFasta = true;
+
+   fpdb = fpidx;
+
+   return true;
+}
+
+// Runs one batch: the fused single-pass path when possible, otherwise the
+// shared legacy helper.
+bool FiStrategy::executeBatch(MSToolkit::MSReader& mstReader,
+                              int iFirstScan, int iLastScan, int iAnalysisType,
+                              int& iPercentStart, int& iPercentEnd,
+                              ThreadPool* tp, SearchSession& session)
+{
+   // Fused path: per-spectrum read+preprocess+search+post-analysis in one pass.
+   // Disabled for Mango or speclib runs (those require the legacy ordering).
+   bool bFused = session.bPerformDatabaseSearch
+              && !g_staticParams.options.bMango
+              && !session.bPerformSpecLibSearch;
+
+   if (bFused)
+   {
+      session.statusRef.SetStatusMsg(string("Running fused FI_DB search..."));
+
+      bool bSucceeded = CometPreprocess::FusedLoadAndSearchSpectra(
+            mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session);
+
+      iPercentStart = iPercentEnd;
+      iPercentEnd = mstReader.getPercent();
+
+      return bSucceeded;
+   }
+
+   // Legacy three-sweep path: LoadAndPreprocess -> AllocateResults ->
+   // sort-by-mass -> RunSearch -> PostAnalysis.
+   return executeBatchLegacy(mstReader, iFirstScan, iLastScan, iAnalysisType,
+                             iPercentStart, iPercentEnd, tp, session, false);
+}
+
+// Closes the handles opened by openFiles().
+void FiStrategy::closeFiles(FILE* fpfasta, FILE* fpidx)
+{
+   if (fpidx != nullptr) fclose(fpidx);
+   if (fpfasta != nullptr) fclose(fpfasta);
+}
+
+// Frees the fragment index arrays (FI_DB only) and the memory pools.
+void FiStrategy::finalize()
+{
+   if (g_staticParams.iDbType == DbType::FI_DB)
+   {
+      free(g_bIndexPrecursors);
+      delete[] g_iFragmentIndex;
+      delete[] g_iFragmentIndexOffset;
+
+      // Reset so a subsequent DoSearch() in the same process (batch run after an
+      // RTS session, or a second batch run) rebuilds the index instead of reusing
+      // these now-freed pointers; g_bPlainPeptideIndexRead gates that rebuild in
+      // FiStrategy::initialize() and is otherwise never reset to false.
+      g_bIndexPrecursors = nullptr;
+      g_iFragmentIndex = nullptr;
+      g_iFragmentIndexOffset = nullptr;
+      g_bPlainPeptideIndexRead = false;
+   }
+
+   CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads);
+   CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads);
+}
diff --git a/CometSearch/search/FiStrategy.h b/CometSearch/search/FiStrategy.h
new file mode 100644
index 00000000..c2136ff4
--- /dev/null
+++ b/CometSearch/search/FiStrategy.h
@@ -0,0 +1,41 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "ISearchStrategy.h"
+
+// Search strategy for FI_DB (fragment ion index) batch searches.
+//
+// initialize(): pre-reads precursors, loads the .idx plain-peptide table,
+// builds the in-memory fragment ion index.
+// executeBatch(): uses the fused FI path (FusedLoadAndSearchSpectra) when
+// possible; falls back to the legacy three-sweep path for
+// Mango or speclib runs where the fused path is unavailable.
+// finalize(): frees the fragment index arrays and memory pools.
+class FiStrategy : public ISearchStrategy
+{
+public:
+   // Allocates memory pools, pre-reads precursors, loads/builds the index.
+   bool initialize(SearchSession& session, ThreadPool* tp) override;
+   // Opens the .idx file and, if present, its companion .fasta; fpdb aliases fpidx.
+   bool openFiles(const std::string& szDatabase,
+                  FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                  SearchSession& session) override;
+   // Fused path when possible, legacy helper otherwise.
+   bool executeBatch(MSToolkit::MSReader& mstReader,
+                     int iFirstScan, int iLastScan, int iAnalysisType,
+                     int& iPercentStart, int& iPercentEnd,
+                     ThreadPool* tp, SearchSession& session) override;
+   // Closes both handles when non-null.
+   void closeFiles(FILE* fpfasta, FILE* fpidx) override;
+   // Frees the fragment index arrays and memory pools.
+   void finalize() override;
+   // Index searches use the compact progress line.
+   bool isIndexBased() const override { return true; }
+};
diff --git a/CometSearch/search/ISearchStrategy.h b/CometSearch/search/ISearchStrategy.h
new file mode 100644
index 00000000..f979d027
--- /dev/null
+++ b/CometSearch/search/ISearchStrategy.h
@@ -0,0 +1,77 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "SearchSession.h"
+#include "ThreadPool.h"
+// NOTE(review): the two standard-header names below were missing as received;
+// they are restored from what this header uses (FILE, std::string) -- confirm.
+#include <cstdio>
+#include <string>
+
+namespace MSToolkit { class MSReader; }
+
+// Abstract search strategy. One concrete implementation per database type:
+// FiStrategy -- FI_DB (fragment ion index, fused + fallback legacy path)
+// FastaStrategy -- FASTA_DB (classic three-sweep path)
+// PiStrategy -- PI_DB (plain peptide index)
+//
+// Pipeline selects the correct one at startup and holds it for the entire run.
+class ISearchStrategy
+{
+public:
+   virtual ~ISearchStrategy() = default;
+
+   // Called once before the per-file loop.
+   // Allocates search/preprocess memory pools and performs any database-type
+   // specific start-up work (FI_DB: pre-reads precursors and loads/builds the
+   // fragment index).
+   // Returns false on error.
+   virtual bool initialize(SearchSession& session, ThreadPool* tp) = 0;
+
+   // Called once per input file.
+   // Opens database file handles (fpfasta, fpidx) and sets fpdb to whichever
+   // handle writers use for sequence retrieval.
+   // Sets session.bIdxNoFasta = true when an .idx search has no companion .fasta.
+   // Returns false on error.
+   virtual bool openFiles(const std::string& szDatabase,
+                          FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                          SearchSession& session) = 0;
+
+   // Called once per batch within a file.
+   // Fills session.queries with fully scored Query* results (preprocess + search
+   // + post-analysis, all done here). May return with session.queries empty
+   // if no spectra passed the filters in this batch.
+   // Updates iPercentStart/iPercentEnd after loading (before RunSearch) so that
+   // RunSearch receives the file-position range for this batch.
+   // Returns false on error or cancel.
+   virtual bool executeBatch(MSToolkit::MSReader& mstReader,
+                             int iFirstScan, int iLastScan, int iAnalysisType,
+                             int& iPercentStart, int& iPercentEnd,
+                             ThreadPool* tp, SearchSession& session) = 0;
+
+   // Called once per input file after all batches.
+   // Closes the file handles opened by openFiles().
+   virtual void closeFiles(FILE* fpfasta, FILE* fpidx) = 0;
+
+   // Called once after all files.
+   // Frees memory pools and (for FI_DB) the fragment index arrays.
+   virtual void finalize() = 0;
+
+   // Returns true for index-based searches (FI_DB, PI_DB), false for FASTA_DB.
+   // Pipeline::run() is the only consumer, and uses it solely to choose between the
+   // compact index-style progress line ("- searching ... done") and the verbose
+   // FASTA-style per-file banners ("Search start:"/"Search end:", "done" per batch).
+   // This flag carries no other semantics -- it must not be used to gate actual
+   // search behavior; that belongs in the strategy's own initialize()/executeBatch().
+   virtual bool isIndexBased() const = 0;
+};
diff --git a/CometSearch/search/PiStrategy.cpp b/CometSearch/search/PiStrategy.cpp
new file mode 100644
index 00000000..d4344764
--- /dev/null
+++ b/CometSearch/search/PiStrategy.cpp
@@ -0,0 +1,94 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Common.h"
+#include "PiStrategy.h"
+#include "SearchUtils.h"
+#include "CometPreprocess.h"
+#include "CometSearch.h"
+#include "CometPostAnalysis.h"
+#include "MSReader.h"
+
+// Allocates the per-thread preprocess and search memory pools.
+// Returns false as soon as either allocation fails.
+bool PiStrategy::initialize(SearchSession& session, ThreadPool* tp)
+{
+   (void)session;
+   (void)tp;
+
+   // The peptide index is loaded lazily on first access inside
+   // CometSearch::RunSearch -> SearchPeptideIndex. No explicit
+   // ReadPeptideIndex() call is needed here.
+
+   if (!CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads))
+      return false;
+
+   if (!CometSearch::AllocateMemory(g_staticParams.options.iNumThreads))
+      return false;
+
+   return true;
+}
+
+// Opens the .idx file (required) and its companion .fasta (optional).
+// fpdb aliases fpidx. session.bIdxNoFasta is set when no companion .fasta can
+// be opened. When the session does no database search, nothing is opened.
+// Returns false (status set, error logged) when the .idx file cannot be opened.
+bool PiStrategy::openFiles(const std::string& szDatabase,
+                           FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                           SearchSession& session)
+{
+   fpfasta = nullptr;
+   fpidx = nullptr;
+   fpdb = nullptr;
+
+   if (!session.bPerformDatabaseSearch)
+      return true;
+
+   string sTmpDB = szDatabase;
+
+   if ((fpidx = fopen(sTmpDB.c_str(), "r")) == nullptr)
+   {
+      string strErrorMsg = " Error (1a) - cannot read .idx file \"" + sTmpDB + "\".\n";
+      session.statusRef.SetStatus(CometResult_Failed, strErrorMsg);
+      logerr(strErrorMsg);
+      return false;
+   }
+
+   // Try to open the companion .fasta (not required for PI_DB search).
+   // Its name is the .idx name minus the 4-character extension. Check the
+   // length first: erase(size() - 4) on a shorter name wraps the unsigned
+   // position and throws std::out_of_range. A name of 4 characters or fewer
+   // leaves nothing to open, so it is treated as "no companion .fasta".
+   if (sTmpDB.size() > 4)
+   {
+      sTmpDB.erase(sTmpDB.size() - 4);  // strip .idx
+      fpfasta = fopen(sTmpDB.c_str(), "r");
+   }
+
+   if (fpfasta == nullptr)
+      session.bIdxNoFasta = true;
+
+   fpdb = fpidx;
+
+   return true;
+}
+
+// Runs one batch through the shared legacy helper.
+bool PiStrategy::executeBatch(MSToolkit::MSReader& mstReader,
+                              int iFirstScan, int iLastScan, int iAnalysisType,
+                              int& iPercentStart, int& iPercentEnd,
+                              ThreadPool* tp, SearchSession& session)
+{
+   return executeBatchLegacy(mstReader, iFirstScan, iLastScan, iAnalysisType,
+                             iPercentStart, iPercentEnd, tp, session, false);
+}
+
+// Closes the handles opened by openFiles().
+void PiStrategy::closeFiles(FILE* fpfasta, FILE* fpidx)
+{
+   if (fpidx != nullptr) fclose(fpidx);
+   if (fpfasta != nullptr) fclose(fpfasta);
+}
+
+// Releases the memory pools allocated by initialize().
+void PiStrategy::finalize()
+{
+   CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads);
+   CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads);
+}
diff --git a/CometSearch/search/PiStrategy.h b/CometSearch/search/PiStrategy.h
new file mode 100644
index 00000000..5634b693
--- /dev/null
+++ b/CometSearch/search/PiStrategy.h
@@ -0,0 +1,39 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "ISearchStrategy.h"
+
+// Search strategy for PI_DB (plain peptide index) batch searches.
+//
+// initialize(): allocates search and preprocess memory pools; the peptide
+// index itself is loaded lazily by SearchPeptideIndex on first use.
+// executeBatch(): LoadAndPreprocessSpectra -> RunSearch (PI path) -> PostAnalysis.
+// finalize(): frees memory pools.
+class PiStrategy : public ISearchStrategy
+{
+public:
+   // Allocates the preprocess/search memory pools.
+   bool initialize(SearchSession& session, ThreadPool* tp) override;
+   // Opens the .idx file and, if present, its companion .fasta; fpdb aliases fpidx.
+   bool openFiles(const std::string& szDatabase,
+                  FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                  SearchSession& session) override;
+   // Delegates to the shared legacy batch helper.
+   bool executeBatch(MSToolkit::MSReader& mstReader,
+                     int iFirstScan, int iLastScan, int iAnalysisType,
+                     int& iPercentStart, int& iPercentEnd,
+                     ThreadPool* tp, SearchSession& session) override;
+   // Closes both handles when non-null.
+   void closeFiles(FILE* fpfasta, FILE* fpidx) override;
+   // Frees the memory pools.
+   void finalize() override;
+   // Index searches use the compact progress line.
+   bool isIndexBased() const override { return true; }
+};
diff --git a/CometSearch/search/Pipeline.cpp b/CometSearch/search/Pipeline.cpp
new file mode 100644
index 00000000..4407beec
--- /dev/null
+++ b/CometSearch/search/Pipeline.cpp
@@ -0,0 +1,304 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Common.h"
+#include "Pipeline.h"
+#include "SearchUtils.h"
+#include "CometPreprocess.h"
+#include "CometMassSpecUtils.h"
+#include "MSReader.h"
+#include "AScoreFactory.h"
+
+// NOTE(review): template arguments in this file and in Pipeline.h were missing
+// as received (e.g. "std::unique_ptr strategy", "std::vector> writers"). They
+// are restored from usage: ISearchStrategy / IResultWriter from the members'
+// call sites, InputFileInfo* from UpdateInputFile(pFile), and
+// chrono::milliseconds from the "ms/spec" arithmetic. Confirm before merging.
+
+// Takes ownership of the strategy and the writers; pMgr is borrowed.
+Pipeline::Pipeline(std::unique_ptr<ISearchStrategy> strategy,
+                   std::vector<std::unique_ptr<IResultWriter>> writers,
+                   CometSearchManager* pMgr)
+   : _strategy(std::move(strategy))
+   , _writers(std::move(writers))
+   , _pMgr(pMgr)
+{
+}
+
+// Drives the whole batch search: strategy initialize -> per-file loop (open
+// database and writers, batch loop, close) -> strategy finalize.
+//
+// session: per-run mutable state; session.queries / session.ms1Queries are
+//          filled by the strategy and released here after every batch.
+// files:   input files, searched in order; the loop stops at the first failure.
+// tp:      thread pool handed to the strategy.
+//
+// Returns false when any step fails, or when no spectra were searched across
+// all files.
+bool Pipeline::run(SearchSession& session,
+                   const std::vector<InputFileInfo*>& files,
+                   ThreadPool& tp)
+{
+   auto tGlobalStart = chrono::steady_clock::now();
+
+   if (!_strategy->initialize(session, &tp))
+   {
+      _strategy->finalize();
+      return false;
+   }
+
+   // AScore initialization happens here -- after the strategy has loaded its
+   // database/index -- rather than earlier in DoSearch(), because FI_DB's
+   // ReadPlainPeptideIndex() (called from FiStrategy::initialize() above) overwrites
+   // g_staticParams.variableModParameters.varModList[] from the .idx file's
+   // VariableMod: header. SetAScoreOptions() reads those same fields to build its
+   // differential-mod list, so it must run after the index load, not before, or it
+   // configures AScore from stale/default mod values.
+   if (g_staticParams.options.iPrintAScoreProScore)
+   {
+      _pMgr->SetAScoreOptions(g_AScoreOptions);
+      g_AScoreInterface = CreateAScoreDllInterface();
+      if (!g_AScoreInterface)
+      {
+         std::cerr << "Failed to create AScore interface." << std::endl;
+         _strategy->finalize();
+         return false;
+      }
+   }
+
+   bool bSucceeded = true;
+   int iTotalAllFiles = 0;  // spectra searched across all files (for blank-file check)
+
+   for (auto pFile : files)
+   {
+      if (!UpdateInputFile(pFile))
+      {
+         bSucceeded = false;
+         break;
+      }
+
+      int iFirstScan = g_staticParams.inputFile.iFirstScan;
+      int iLastScan = g_staticParams.inputFile.iLastScan;
+      int iPercentStart = 0;
+      int iPercentEnd = 0;
+      int iAnalysisType = g_staticParams.inputFile.iAnalysisType;
+
+      // Print search-start banner for FASTA searches.
+      if (!g_staticParams.options.bOutputSqtStream && !_strategy->isIndexBased())
+      {
+         time_t tStartTime;
+         time(&tStartTime);
+         strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tStartTime));
+
+         string strOut = " Search start: " + string(g_staticParams.szDate) + "\n";
+         strOut += " - Input file: " + string(g_staticParams.inputFile.szFileName) + "\n";
+         logout(strOut);
+         fflush(stdout);
+      }
+
+      // Open database file handles (strategy-specific: .idx or .fasta).
+      FILE* fpfasta = nullptr;
+      FILE* fpidx = nullptr;
+      FILE* fpdb = nullptr;
+
+      if (!_strategy->openFiles(g_staticParams.databaseInfo.szDatabase,
+                                fpfasta, fpidx, fpdb, session))
+      {
+         bSucceeded = false;
+         break;
+      }
+
+      if (g_staticParams.options.iSpectrumBatchSize == 0 && !_strategy->isIndexBased())
+      {
+         logout(" - Reading all spectra into memory; set \"spectrum_batch_size\" if search terminates here.\n");
+         fflush(stdout);
+      }
+
+      // Open writers (after openFiles so session.bIdxNoFasta is correctly set).
+      WriterOpenCtx woctx(session.statusRef);
+      woctx.szBaseName = g_staticParams.inputFile.szBaseName;
+      woctx.szOutputSuffix = g_staticParams.szOutputSuffix;
+      woctx.szTxtFileExt = g_staticParams.szTxtFileExt;
+      woctx.bEntireFile = (iAnalysisType == AnalysisType_EntireFile);
+      woctx.iFirstScan = iFirstScan;
+      woctx.iLastScan = iLastScan;
+      woctx.iDecoySearch = g_staticParams.options.iDecoySearch;
+      woctx.bIdxNoFasta = session.bIdxNoFasta;
+      woctx.pMgr = _pMgr;
+
+      for (auto& pw : _writers)
+      {
+         if (!pw->open(woctx))
+         {
+            bSucceeded = false;
+            break;
+         }
+      }
+
+      if (!bSucceeded)
+      {
+         // close() is safe on writers that never opened, so close them all.
+         for (auto& pw : _writers) pw->close(false, false);
+         _strategy->closeFiles(fpfasta, fpidx);
+         break;
+      }
+
+      // MSReader setup.
+      MSReader mstReader;
+      SetMSLevelFilter(mstReader);
+      CometPreprocess::Reset();
+
+      // Print "searching..." message for index-based searches.
+      auto tBeginTime = chrono::steady_clock::now();
+      if (_strategy->isIndexBased())
+      {
+         printf(" - searching \"%s\" ... ", g_staticParams.inputFile.szBaseName);
+         fflush(stdout);
+      }
+
+      int iTotalSpectraSearched = 0;
+      int iBatchNum = 0;
+
+      // Releases every Query owned by the session for the current batch.
+      auto cleanupBatch = [&]()
+      {
+         for (auto* q : session.queries) delete q;
+         session.queries.clear();
+         for (auto* q : session.ms1Queries) delete q;
+         session.ms1Queries.clear();
+      };
+
+      while (!CometPreprocess::DoneProcessingAllSpectra())
+      {
+         iBatchNum++;
+
+         bSucceeded = _strategy->executeBatch(mstReader,
+                                              iFirstScan, iLastScan, iAnalysisType,
+                                              iPercentStart, iPercentEnd,
+                                              &tp, session);
+
+         if (!bSucceeded)
+         {
+            cleanupBatch();
+            break;
+         }
+
+         if (session.queries.empty())
+         {
+            // Nothing to write for this batch, but the strategy may still have
+            // left MS1 queries behind; release them so they neither leak nor
+            // carry over into the next batch.
+            cleanupBatch();
+            continue;
+         }
+
+         iTotalSpectraSearched += (int)session.queries.size();
+
+         // Sort by scan number (shared by all paths; SQT writes last, which modifies szMod).
+         std::sort(session.queries.begin(), session.queries.end(), compareByScanNumber);
+
+         if (!g_staticParams.options.bOutputSqtStream && !_strategy->isIndexBased())
+         {
+            logout(" done\n");
+            fflush(stdout);
+         }
+
+         // Per-batch write.
+         {
+            WriterWriteCtx wwctx;
+            wwctx.fpdb = fpdb;
+            wwctx.iScanOffset = iTotalSpectraSearched - (int)session.queries.size();
+            wwctx.iBatchNum = iBatchNum;
+            wwctx.pQueries = &session.queries;
+
+            for (auto& pw : _writers)
+            {
+               if (!pw->write(wwctx))
+               {
+                  bSucceeded = false;
+                  break;
+               }
+            }
+         }
+
+         cleanupBatch();
+
+         if (!bSucceeded)
+            break;
+      }
+
+      // Per-file timing and run-stats message.
+      if (bSucceeded)
+      {
+         if (iTotalSpectraSearched == 0)
+            logout(" Warning - no spectra searched.\n");
+
+         if (!g_staticParams.options.bOutputSqtStream)
+         {
+            const auto duration = chrono::duration_cast<chrono::milliseconds>(
+                  chrono::steady_clock::now() - tBeginTime);
+            double dTimePerSpectra = (iTotalSpectraSearched > 0)
+               ? (double)duration.count() / (double)iTotalSpectraSearched
+               : 0.0;
+
+            string strOut;
+            char buf[128];
+
+            if (!_strategy->isIndexBased())
+               strOut = " - Run stats: ";
+            else
+               strOut = "";
+
+            std::snprintf(buf, sizeof(buf), "%.2f", dTimePerSpectra);
+            strOut += CometMassSpecUtils::ElapsedTime(tBeginTime)
+                    + " (" + std::to_string(iTotalSpectraSearched) + " spectra, "
+                    + std::string(buf) + "ms/spec, ";
+
+            std::snprintf(buf, sizeof(buf), "%.0f", (dTimePerSpectra > 0.0) ? 1000.0 / dTimePerSpectra : 0.0);
+            strOut += std::string(buf) + "Hz";
+
+            if (!_strategy->isIndexBased())
+               strOut += ", " + CometMassSpecUtils::GetPeakMemory();
+
+            strOut += ")\n";
+            logout(strOut);
+         }
+
+         if (!g_staticParams.options.bOutputSqtStream && !_strategy->isIndexBased())
+         {
+            time_t tEndTime;
+            time(&tEndTime);
+            strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tEndTime));
+            string strOut = " Search end: " + string(g_staticParams.szDate)
+                          + " (" + CometMassSpecUtils::ElapsedTime(tGlobalStart)
+                          + ", " + CometMassSpecUtils::GetPeakMemory() + ")\n\n";
+            logout(strOut);
+         }
+      }
+
+      _strategy->closeFiles(fpfasta, fpidx);
+
+      // Finalize and close writers.
+      {
+         bool bEmpty = (iTotalSpectraSearched == 0);
+         for (auto& pw : _writers)
+            pw->close(bSucceeded, bEmpty);
+      }
+
+      iTotalAllFiles += iTotalSpectraSearched;
+      g_staticParams.inputFile.szBaseName[0] = '\0';
+
+      if (!bSucceeded)
+         break;
+   }
+
+   _strategy->finalize();
+
+   if (g_staticParams.options.iPrintAScoreProScore)
+      DeleteAScoreDllInterface(g_AScoreInterface);
+
+   // Print overall "done" banner for index-based searches.
+   if (_strategy->isIndexBased())
+   {
+      string strOut = " - done. (" + CometMassSpecUtils::ElapsedTime(tGlobalStart);
+      string strMemUse = CometMassSpecUtils::GetPeakMemory();
+      if (!strMemUse.empty())
+         strOut += ", " + strMemUse + ")";
+      else
+         strOut += ")";
+      strOut += "\n\n";
+      logout(strOut);
+   }
+
+   // Return false if no spectra were searched across all files (blank-file sentinel).
+   if (iTotalAllFiles == 0)
+      return false;
+
+   return bSucceeded;
+}
diff --git a/CometSearch/search/Pipeline.h b/CometSearch/search/Pipeline.h
new file mode 100644
index 00000000..38cc258e
--- /dev/null
+++ b/CometSearch/search/Pipeline.h
@@ -0,0 +1,47 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "ISearchStrategy.h"
+#include "../output/IResultWriter.h"
+#include "../CometSearchManager.h"
+// NOTE(review): the two standard-header names below were missing as received;
+// they are restored from what this header uses (std::unique_ptr, std::vector).
+#include <memory>
+#include <vector>
+
+// Pipeline drives the batch search for all input files.
+// It owns the strategy (which provides the per-batch search implementation)
+// and the result writers (which serialize results to disk).
+//
+// Typical call sequence from CometSearchManager::DoSearch():
+// Pipeline pipeline(std::move(strategy), std::move(writers), pMgr);
+// pipeline.run(session, g_pvInputFiles, *tp);
+class Pipeline
+{
+public:
+   Pipeline(std::unique_ptr<ISearchStrategy> strategy,
+            std::vector<std::unique_ptr<IResultWriter>> writers,
+            CometSearchManager* pMgr);
+
+   // Drives initialize -> per-file loop (open, batch-loop, close) -> finalize.
+   // Returns false if any file fails or no spectra are found across all files.
+   bool run(SearchSession& session,
+            const std::vector<InputFileInfo*>& files,
+            ThreadPool& tp);
+
+private:
+   std::unique_ptr<ISearchStrategy> _strategy;
+   std::vector<std::unique_ptr<IResultWriter>> _writers;
+   CometSearchManager* _pMgr;  // not owned
+};
diff --git a/CometSearch/search/SearchSession.h b/CometSearch/search/SearchSession.h
new file mode 100644
index 00000000..cc1f9e2f
--- /dev/null
+++ b/CometSearch/search/SearchSession.h
@@ -0,0 +1,74 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Owns all mutable state for one batch search run.
+// Created at the top of CometSearchManager::DoSearch() per input-file iteration.
+// Passed by reference to pipeline functions that read or write per-run state.
+//
+// Read-only index globals (g_iFragmentIndex, g_vFragmentPeptides, g_vRawPeptides,
+// g_vSpecLib, g_pvProteinsList, g_pvProteinNameCache, g_pvDBIndex, ...) are NOT moved
+// here -- they are large, initialised once, and shared read-only across all threads.
+//
+// g_pvQueryMutex, g_bPlainPeptideIndexRead, and g_bSpecLibRead remain as globals,
+// not SearchSession members, and this is permanent rather than a pending migration
+// step: they are also read/written by the RTS path (InitializeSingleSpectrumSearch /
+// DoSingleSpectrumSearchMultiResults), which is intentionally not moved into the
+// strategy/Pipeline pattern (see docs/20260612_architecture_migration.md, "RTS path" --
+// the RTS entry points are wrapper-compatibility-sensitive and out of scope for the
+// migration).
Since a single process can serve both RTS and batch requests, this +// once-per-process init state must stay process-global so both paths observe the same +// value; it cannot move into a per-batch-run SearchSession. SearchSession does not +// shadow these globals; all code reads the globals directly. +// +// g_cometStatus is exposed here as statusRef: a reference to the process-wide +// singleton. Pipeline and strategy code use session.statusRef so they are not +// coupled to the global name; deep core files (CometSearch.cpp, CometPreprocess.cpp, +// etc.) still reference g_cometStatus directly because they have no SearchSession +// in scope. Both spellings touch the same object. + +#ifndef _SEARCHSESSION_H_ +#define _SEARCHSESSION_H_ + +#include "core/Params.h" +#include "core/Types.h" +#include "CometStatus.h" +#include +#include + +struct SearchSession +{ + // Per-batch MS2 result accumulator. + // Guarded by queriesMutex in the batch path. + std::vector queries; + + // Per-batch MS1 result accumulator (batch path only). + std::vector ms1Queries; + + // Mutex protecting queries and ms1Queries during parallel spectrum loading. + std::mutex queriesMutex; + + // Run-time flags (replace the batch-path-only globals). + bool bPerformDatabaseSearch = false; + bool bPerformSpecLibSearch = false; + bool bIdxNoFasta = false; + + // Reference to the process-wide status singleton (g_cometStatus). 
+ CometStatus& statusRef; + + explicit SearchSession(CometStatus& st) : statusRef(st) {} + SearchSession(const SearchSession&) = delete; + SearchSession& operator=(const SearchSession&) = delete; +}; + +#endif // _SEARCHSESSION_H_ diff --git a/CometSearch/search/SearchUtils.cpp b/CometSearch/search/SearchUtils.cpp new file mode 100644 index 00000000..ec88eb57 --- /dev/null +++ b/CometSearch/search/SearchUtils.cpp @@ -0,0 +1,317 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "SearchUtils.h"
+
+// Returns true when pszFileName (whose length is iLen) ends with pszSuffix,
+// compared with STRCMP_IGNORE_CASE. A suffix longer than the name never matches.
+static bool HasSuffixIgnoreCase(const char* pszFileName, int iLen, const char* pszSuffix)
+{
+   int iSuffixLen = (int)strlen(pszSuffix);
+
+   return iLen >= iSuffixLen && !STRCMP_IGNORE_CASE(pszFileName + iLen - iSuffixLen, pszSuffix);
+}
+
+
+// Maps a file name to an InputType based on its extension.
+// Returns InputType_UNKNOWN when none of the recognised suffixes match.
+static InputType GetInputType(const char* pszFileName)
+{
+   int iLen = (int)strlen(pszFileName);
+
+   if (HasSuffixIgnoreCase(pszFileName, iLen, ".mzXML")
+         || HasSuffixIgnoreCase(pszFileName, iLen, ".mzML")
+         || HasSuffixIgnoreCase(pszFileName, iLen, ".mzXML.gz")
+         || HasSuffixIgnoreCase(pszFileName, iLen, ".mzML.gz"))
+   {
+      return InputType_MZXML;
+   }
+   else if (HasSuffixIgnoreCase(pszFileName, iLen, ".raw"))
+   {
+      return InputType_RAW;
+   }
+   else if (HasSuffixIgnoreCase(pszFileName, iLen, ".ms2")
+         || HasSuffixIgnoreCase(pszFileName, iLen, ".cms2"))
+   {
+      return InputType_MS2;
+   }
+   else if (HasSuffixIgnoreCase(pszFileName, iLen, ".mgf"))
+   {
+      return InputType_MGF;
+   }
+
+   return InputType_UNKNOWN;
+}
+
+
+// Makes *pFileInfo the current input file in g_staticParams.inputFile.
+//
+// Sets iInputType from the file extension and verifies the file can be opened
+// for reading. Returns false when the extension is not recognised (no status is
+// set in that case) or when the file cannot be opened (status set to
+// CometResult_Failed and the message logged).
+//
+// Unless built with CRUX defined, szBaseName is recomputed from the file name
+// (extension removed; both parts of ".mzXML.gz"/".mzML.gz") when it was empty on
+// entry or more than one input file is queued; otherwise the base name that was
+// set before the call is restored after the struct copy overwrites it.
+bool UpdateInputFile(InputFileInfo* pFileInfo)
+{
+   bool bUpdateBaseName = false;
+   char szTmpBaseName[SIZE_FILE];
+
+   if (g_staticParams.inputFile.szBaseName[0] == '\0' || g_pvInputFiles.size() > 1)
+      bUpdateBaseName = true;
+   else
+      strcpy(szTmpBaseName, g_staticParams.inputFile.szBaseName);
+
+   g_staticParams.inputFile = *pFileInfo;
+   g_staticParams.inputFile.iInputType = GetInputType(g_staticParams.inputFile.szFileName);
+
+   if (InputType_UNKNOWN == g_staticParams.inputFile.iInputType)
+      return false;
+
+   // Open/close only to confirm the file is readable before the reader touches it.
+   FILE* fp;
+   if ((fp = fopen(g_staticParams.inputFile.szFileName, "r")) == NULL)
+   {
+      string strErrorMsg = " Error - cannot read input file \"" + string(g_staticParams.inputFile.szFileName) + "\".\n";
+      g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+      logerr(strErrorMsg);
+      return false;
+   }
+   else
+   {
+      fclose(fp);
+   }
+
+#ifndef CRUX
+   if (bUpdateBaseName)
+   {
+      char* pStr;
+      int iLen = (int)strlen(g_staticParams.inputFile.szFileName);
+
+      strcpy(g_staticParams.inputFile.szBaseName, g_staticParams.inputFile.szFileName);
+
+      if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '.')))
+         *pStr = '\0';
+
+      // Gzipped inputs carry two extensions; strip the second one as well.
+      if (HasSuffixIgnoreCase(g_staticParams.inputFile.szFileName, iLen, ".mzXML.gz")
+            || HasSuffixIgnoreCase(g_staticParams.inputFile.szFileName, iLen, ".mzML.gz"))
+      {
+         if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '.')))
+            *pStr = '\0';
+      }
+   }
+   else
+   {
+      strcpy(g_staticParams.inputFile.szBaseName, szTmpBaseName);
+   }
+#endif
+
+   return true;
+}
+
+
+// Restricts mstReader to the single MS level selected by
+// g_staticParams.options.iMSLevel (1, 2 or 3). Any other value passes an empty
+// filter list to setFilter().
+void SetMSLevelFilter(MSReader& mstReader)
+{
+   vector<MSSpectrumType> msLevel;
+
+   if (g_staticParams.options.iMSLevel == 3)
+      msLevel.push_back(MS3);
+   else if (g_staticParams.options.iMSLevel == 2)
+      msLevel.push_back(MS2);
+   else if (g_staticParams.options.iMSLevel == 1)
+      msLevel.push_back(MS1);
+
+   mstReader.setFilter(msLevel);
+}
+
+
+// Resets the fields of one Results slot that are shared by the target and decoy
+// arrays to their pre-search defaults.
+static void InitResultsEntry(Results& r)
+{
+   r.dPepMass = 0.0;
+   r.dExpect = 999;
+   r.fScoreSp = 0.0;
+   r.fXcorr = (float)g_staticParams.options.dMinimumXcorr;
+   r.fAScorePro = 0.0;
+   r.usiLenPeptide = 0;
+   r.usiRankSp = 0;
+   r.usiMatchedIons = 0;
+   r.usiTotalIons = 0;
+   r.szPeptide[0] = '\0';
+   r.sAScoreProSiteScores.clear();
+   r.pWhichProtein.clear();
+   r.sPeffOrigResidues.clear();
+   r.iPeffOrigResiduePosition = -9;
+}
+
+
+// Allocates and initialises the per-query result arrays.
+//
+// Every query receives iNumStored Results in _pResults; when iDecoySearch == 2 a
+// second array of the same size is allocated in _pDecoys. Match counters and the
+// xcorr histogram are zeroed. Returns false, with g_cometStatus set to
+// CometResult_Failed, on the first allocation failure.
+bool AllocateResultsMem(std::vector<Query*>& queries)
+{
+   for (Query* pQuery : queries)
+   {
+      try
+      {
+         pQuery->_pResults = new Results[g_staticParams.options.iNumStored];
+      }
+      catch (std::bad_alloc& ba)
+      {
+         string strErrorMsg = " Error - new(_pResults[]). bad_alloc: \"" + std::string(ba.what()) + "\".\n";
+         g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+         logerr(strErrorMsg);
+         return false;
+      }
+
+      if (g_staticParams.options.iDecoySearch == 2)
+      {
+         try
+         {
+            pQuery->_pDecoys = new Results[g_staticParams.options.iNumStored];
+         }
+         catch (std::bad_alloc& ba)
+         {
+            string strErrorMsg = " Error - new(_pDecoys[]). bad_alloc: " + std::string(ba.what()) + "\n";
+            g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+            logerr(strErrorMsg);
+            return false;
+         }
+      }
+
+      pQuery->iMatchPeptideCount = 0;
+      pQuery->iDecoyMatchPeptideCount = 0;
+      memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram));
+
+      for (int j = 0; j < g_staticParams.options.iNumStored; ++j)
+      {
+         InitResultsEntry(pQuery->_pResults[j]);
+
+         // The decoy protein list lives on the target entry for any decoy mode.
+         if (g_staticParams.options.iDecoySearch)
+            pQuery->_pResults[j].pWhichDecoyProtein.clear();
+
+         if (g_staticParams.options.iDecoySearch == 2)
+            InitResultsEntry(pQuery->_pDecoys[j]);
+      }
+   }
+
+   return true;
+}
+
+
+// Runs the search and post-analysis for the queries currently held in session.
+//
+// Sorts session.queries by experimental peptide mass, stores the batch mass
+// window in g_massRange, runs the database and/or spectral-library search
+// selected by the session flags, and then post-analysis when a database search
+// was performed. For Mango runs, each query is first labelled "NNN_A"/"NNN_B"
+// by its position within its scan.
+//
+// Returns true immediately for an empty batch. Returns false when a search step
+// fails or when session.statusRef reports an error or cancellation.
+bool RunSearchAndPostAnalysis(int iPercentStart, int iPercentEnd,
+                              ThreadPool* tp, SearchSession& session,
+                              bool bLogPrePostAnalysis)
+{
+   // Nothing to search; also avoids indexing the first/last element of an empty vector below.
+   if (session.queries.empty())
+      return true;
+
+   if (g_staticParams.options.bMango)
+   {
+      int iCurrentScanNumber = 0;
+      int iMangoIndex = 0;
+
+      std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex);
+
+      for (Query* pQuery : session.queries)
+      {
+         if (pQuery->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber)
+         {
+            iCurrentScanNumber = pQuery->_spectrumInfoInternal.iScanNumber;
+            iMangoIndex = 0;
+         }
+         else
+         {
+            iMangoIndex++;
+         }
+         sprintf(pQuery->_spectrumInfoInternal.szMango, "%03d_%c",
+               (int)iMangoIndex / 2, (iMangoIndex % 2) ? 'B' : 'A');
+      }
+   }
+
+   std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass);
+
+   // After the mass sort the first/last queries bound the batch mass window.
+   g_massRange.dMinMass = session.queries.front()->_pepMassInfo.dPeptideMassToleranceMinus;
+   g_massRange.dMaxMass = session.queries.back()->_pepMassInfo.dPeptideMassTolerancePlus;
+   g_massRange.bNarrowMassRange = (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass);
+
+   bool bSucceeded = !session.statusRef.IsError() && !session.statusRef.IsCancel();
+   if (!bSucceeded)
+      return false;
+
+   session.statusRef.SetStatusMsg(string("Running search..."));
+
+   if (session.bPerformDatabaseSearch)
+      bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries);
+   if (bSucceeded && session.bPerformSpecLibSearch)
+      bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries);
+   // TODO(batch-MS1): CometSearch::RunMS1Search(tp, dRT, dMaxMS1RTDiff, dMaxSpecLibRT,
+   // dMaxQueryRT, session.ms1Queries) must be called here when the batch MS1 speclib
+   // path is implemented. It requires a second reader pass over the file at
+   // iSpecLibMSLevel to populate session.ms1Queries, plus per-file RT range values
+   // from CometSpecLib::LoadSpecLibMS1Raw. Neither exists in the batch pipeline yet.
+
+   if (!bSucceeded)
+      return false;
+
+   // Re-check status: an error or cancel may have been raised during the search.
+   bSucceeded = !session.statusRef.IsError() && !session.statusRef.IsCancel();
+   if (!bSucceeded)
+      return false;
+
+   if (bLogPrePostAnalysis && !g_staticParams.options.bOutputSqtStream)
+   {
+      logout(" - Post analysis:");
+      fflush(stdout);
+   }
+
+   if (session.bPerformDatabaseSearch)
+   {
+      session.statusRef.SetStatusMsg(string("Performing post-search analysis ..."));
+      bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries);
+   }
+
+   return bSucceeded;
+}
+
+
+// Legacy batch body: load and preprocess spectra, allocate result storage, then
+// search and post-analyse.
+//
+// iPercentStart/iPercentEnd are updated in place: the start takes the previous
+// end, and the end takes the reader's current percent. Returns true without
+// searching when the batch loaded no queries. bVerbose enables the console
+// progress lines (suppressed when bOutputSqtStream is set).
+bool executeBatchLegacy(MSToolkit::MSReader& mstReader,
+                        int iFirstScan, int iLastScan, int iAnalysisType,
+                        int& iPercentStart, int& iPercentEnd,
+                        ThreadPool* tp, SearchSession& session,
+                        bool bVerbose)
+{
+   if (bVerbose && !g_staticParams.options.bOutputSqtStream)
+   {
+      logout(" - Load spectra:");
+      fflush(stdout);
+   }
+
+   session.statusRef.SetStatusMsg(string("Loading and processing input spectra"));
+
+   bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra(
+         mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session);
+
+   iPercentStart = iPercentEnd;
+   iPercentEnd = mstReader.getPercent();
+
+   if (!bSucceeded)
+      return false;
+
+   if (session.queries.empty())
+      return true;
+
+   bSucceeded = AllocateResultsMem(session.queries);
+   if (!bSucceeded)
+      return false;
+
+   {
+      string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n");
+      if (bVerbose && !g_staticParams.options.bOutputSqtStream)
+         logout(strStatusMsg);
+      session.statusRef.SetStatusMsg(strStatusMsg);
+   }
+
+   return RunSearchAndPostAnalysis(iPercentStart, iPercentEnd, tp, session, bVerbose);
+}
diff --git a/CometSearch/search/SearchUtils.h b/CometSearch/search/SearchUtils.h new file mode 100644 index 00000000..e0c3d6e3 --- /dev/null +++ b/CometSearch/search/SearchUtils.h @@ -0,0 +1,62 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "Common.h"
+#include "CometDataInternal.h"
+#include "CometSearch.h"
+#include "CometPostAnalysis.h"
+#include "CometPreprocess.h"
+#include "MSReader.h"
+#include "SearchSession.h"
+
+// Shared utilities used by Pipeline and strategy classes.
+
+// Makes *pFileInfo the current input file; returns false if its type is unknown
+// or it cannot be opened for reading.
+bool UpdateInputFile(InputFileInfo* pFileInfo);
+
+// Restricts mstReader to the MS level configured in g_staticParams.options.iMSLevel.
+void SetMSLevelFilter(MSReader& mstReader);
+
+// Allocates and initialises the result arrays of every query; false on bad_alloc.
+bool AllocateResultsMem(std::vector<Query*>& queries);
+
+// Sorts session.queries by peptide mass, runs the configured searches and
+// post-analysis. bLogPrePostAnalysis enables the "Post analysis" console line.
+bool RunSearchAndPostAnalysis(int iPercentStart, int iPercentEnd,
+                              ThreadPool* tp, SearchSession& session,
+                              bool bLogPrePostAnalysis = false);
+
+// Legacy three-sweep batch body: LoadAndPreprocess -> AllocateResults ->
+// RunSearchAndPostAnalysis. Used by FiStrategy (non-fused fallback),
+// FastaStrategy, and PiStrategy. Pass bVerbose=true for FASTA-path
+// console progress output.
+bool executeBatchLegacy(MSToolkit::MSReader& mstReader,
+                        int iFirstScan, int iLastScan, int iAnalysisType,
+                        int& iPercentStart, int& iPercentEnd,
+                        ThreadPool* tp, SearchSession& session,
+                        bool bVerbose);
+
+// -----------------------------------------------------------------------
+// Query sort comparators -- kept inline; each is a short "less than"
+// predicate suitable for std::sort over Query pointers.
+// -----------------------------------------------------------------------
+
+// Orders by ascending experimental peptide mass.
+inline bool compareByPeptideMass(Query const* a, Query const* b)
+{
+   return (a->_pepMassInfo.dExpPepMass < b->_pepMassInfo.dExpPepMass);
+}
+
+// Orders by ascending dMangoIndex.
+inline bool compareByMangoIndex(Query const* a, Query const* b)
+{
+   return (a->dMangoIndex < b->dMangoIndex);
+}
+
+// Orders by ascending scan number; ties are broken by ascending charge state.
+inline bool compareByScanNumber(Query const* a, Query const* b)
+{
+   if (a->_spectrumInfoInternal.iScanNumber == b->_spectrumInfoInternal.iScanNumber)
+      return (a->_spectrumInfoInternal.usiChargeState < b->_spectrumInfoInternal.usiChargeState);
+   return (a->_spectrumInfoInternal.iScanNumber < b->_spectrumInfoInternal.iScanNumber);
+}
diff --git a/CometSearch/threading/SearchMemoryPool.cpp b/CometSearch/threading/SearchMemoryPool.cpp new file mode 100644 index 00000000..5fecabf7 --- /dev/null +++ b/CometSearch/threading/SearchMemoryPool.cpp @@ -0,0 +1,92 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+
+#include "threading/SearchMemoryPool.h"
+#include "Common.h"
+#include "CometStatus.h"
+// NOTE(review): the target of the one angle-bracket include here was unreadable;
+// these are the standard headers this file uses directly.
+#include <chrono>
+#include <new>
+#include <string>
+
+
+// Allocates nSlots scratch arrays of iArraySize zero-initialised bools and marks
+// every slot free. A call on an already-allocated pool is a no-op that returns
+// true (the requested sizes are not compared with the existing ones).
+// Returns false, with g_cometStatus set to CometResult_Failed, on bad_alloc;
+// anything allocated before the failure is released.
+bool SearchMemoryPool::allocate(int nSlots, int iArraySize)
+{
+   if (_allocated)
+      return true;
+
+   try
+   {
+      _pool = new bool*[nSlots]();   // value-init to nullptr so partial allocs are safe to delete[]
+      for (int i = 0; i < nSlots; ++i)
+         _pool[i] = new bool[iArraySize]();
+      _freeSlots.reserve(nSlots);
+      for (int i = 0; i < nSlots; ++i)
+         _freeSlots.push_back(i);
+      _nSlots = nSlots;
+      _allocated = true;
+      return true;
+   }
+   catch (const std::bad_alloc& ba)
+   {
+      // Free whatever was allocated before the throw.
+      if (_pool)
+      {
+         for (int k = 0; k < nSlots; ++k)
+            delete[] _pool[k];   // safe: unset slots are nullptr after value-init above
+         delete[] _pool;
+         _pool = nullptr;
+      }
+      _freeSlots.clear();
+      std::string strErrorMsg = " Error - SearchMemoryPool::allocate failed. bad_alloc: " + std::string(ba.what()) + ".\n";
+      g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+      logerr(strErrorMsg);
+      _allocated = false;
+      return false;
+   }
+}
+
+
+// Frees the first nSlots scratch arrays and the slot table, and returns the pool
+// to its unallocated state.
+void SearchMemoryPool::_deallocate(int nSlots)
+{
+   for (int i = 0; i < nSlots; ++i)
+      delete[] _pool[i];
+   delete[] _pool;
+   _pool = nullptr;
+   _freeSlots.clear();
+   // Reset the count too, so slotCount() and the range checks keyed on _nSlots
+   // stop accepting indices into a pool that no longer exists.
+   _nSlots = 0;
+   _allocated = false;
+}
+
+
+// Frees all scratch arrays; does nothing when the pool is not allocated.
+void SearchMemoryPool::deallocate()
+{
+   if (_allocated)
+      _deallocate(_nSlots);
+}
+
+
+// Waits up to 240 seconds for a free slot and claims it.
+// Returns the claimed slot index, or -1 if no slot became free in time.
+int SearchMemoryPool::acquireSlot()
+{
+   std::unique_lock<std::mutex> lock(_mutex);
+   bool found = _cv.wait_for(lock, std::chrono::seconds(240), [this]() { return !_freeSlots.empty(); });
+   if (!found)
+      return -1;
+   int slot = _freeSlots.back();
+   _freeSlots.pop_back();
+   return slot;
+}
+
+
+// Returns a slot to the free list and wakes one waiting acquireSlot() caller.
+// Indices outside [0, _nSlots) are ignored: this covers the -1 that acquireSlot()
+// returns on timeout, which must never be handed out later as a real slot.
+void SearchMemoryPool::releaseSlot(int slot)
+{
+   {
+      std::lock_guard<std::mutex> lk(_mutex);
+      if (slot < 0 || slot >= _nSlots)
+         return;
+      _freeSlots.push_back(slot);
+   }
+   _cv.notify_one();
+}
diff --git a/CometSearch/threading/SearchMemoryPool.h b/CometSearch/threading/SearchMemoryPool.h new file mode 100644 index 00000000..47477b87 --- /dev/null +++ b/CometSearch/threading/SearchMemoryPool.h @@ -0,0 +1,80 @@ +// 
Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Owns the per-thread duplicate-fragment scratch arrays used during FI/PI search.
+// Extracted from CometSearch static members (_pbSearchMemoryPool,
+// _ppbDuplFragmentArr, AllocateMemory, DeallocateMemory, AcquirePoolSlot)
+// and the paired globals g_searchMemoryPoolMutex, g_searchPoolCV,
+// g_bCometSearchMemoryAllocated.
+
+#ifndef _SEARCHMEMORYPOOL_H_
+#define _SEARCHMEMORYPOOL_H_
+
+#include <cassert>
+#include <condition_variable>
+#include <mutex>
+#include <vector>
+
+// Fixed-size pool of bool scratch arrays handed out one slot at a time.
+class SearchMemoryPool
+{
+public:
+   SearchMemoryPool() = default;
+   ~SearchMemoryPool() { if (_allocated) _deallocate(_nSlots); }
+
+   // Allocates nSlots scratch arrays each of iArraySize bools.
+   bool allocate(int nSlots, int iArraySize);
+
+   // Frees all scratch arrays.
+   void deallocate();
+
+   // Blocks up to 240 s until a slot is free.
+   // Returns slot index [0, nSlots) or -1 on timeout.
+   int acquireSlot();
+
+   // Releases the slot and wakes one waiting acquireSlot() caller.
+   void releaseSlot(int slot);
+
+   // Returns the duplicate-fragment scratch array for a claimed slot.
+   // The slot index is only range-checked by assert (debug builds).
+   bool* duplFragmentArr(int slot) const { assert(slot >= 0 && slot < _nSlots); return _pool[slot]; }
+
+   bool isAllocated() const { return _allocated; }
+   int  slotCount() const { return _nSlots; }
+
+private:
+   void _deallocate(int nSlots);
+
+   int _nSlots = 0;
+   bool** _pool = nullptr;   // [_nSlots][iArraySize]: scratch buffers
+   bool _allocated = false;
+
+   // Stack of currently-free slot indices. A slot's presence here (rather than a
+   // separate bool[] scanned linearly) is the sole source of truth for "is free",
+   // so acquire/release are O(1) instead of O(nSlots) regardless of pool size.
+   std::vector<int> _freeSlots;
+   std::mutex _mutex;
+   std::condition_variable _cv;
+};
+
+// RAII guard for a slot acquired via SearchMemoryPool::acquireSlot(). Releases the
+// slot on scope exit (normal return or exception unwind) so a throw out of the
+// search body never leaks the slot and stalls the next acquireSlot() caller for up
+// to 240 s. Construct only after checking the acquired slot is >= 0.
+//
+// Non-copyable: a copy would release the same slot a second time when it is
+// destroyed. The two-argument constructor keeps `SearchMemoryPoolSlotGuard g{pool, slot};`
+// working now that the type is no longer an aggregate.
+struct SearchMemoryPoolSlotGuard
+{
+   SearchMemoryPool& pool;
+   int slot;
+
+   SearchMemoryPoolSlotGuard(SearchMemoryPool& p, int s) : pool(p), slot(s) {}
+   SearchMemoryPoolSlotGuard(const SearchMemoryPoolSlotGuard&) = delete;
+   SearchMemoryPoolSlotGuard& operator=(const SearchMemoryPoolSlotGuard&) = delete;
+   ~SearchMemoryPoolSlotGuard() { if (slot >= 0) pool.releaseSlot(slot); }
+};
+
+#endif // _SEARCHMEMORYPOOL_H_
diff --git a/MSToolkit/.gitignore b/MSToolkit/.gitignore index fa724387..a38003a7 100644 --- a/MSToolkit/.gitignore +++ b/MSToolkit/.gitignore @@ -1,4 +1,10 @@ +include/expat.h include/expat_config.h +include/expat_external.h +include/zconf.h +include/zlib.h +include/zutil.h build/ .vs/ -*.json \ No newline at end of file +*.mri +*.json diff --git a/Makefile b/Makefile index fd7e0319..38928698 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,9 @@ COMETSEARCH = CometSearch UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-char-subscripts -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/extern/expat-2.2.9/lib -I$(MSTOOLKIT)/extern/zlib-1.2.11 -I$(COMETSEARCH) -I$(ASCOREPRO)/include + override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-char-subscripts -Wno-unused-result -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/extern/expat-2.2.9/lib -I$(MSTOOLKIT)/extern/zlib-1.2.11 -I$(COMETSEARCH) -I$(ASCOREPRO)/include else - override CXXFLAGS += -O3 -static -std=c++20 -fpermissive -Wall -Wextra -Wno-char-subscripts -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/extern/expat-2.2.9/lib -I$(MSTOOLKIT)/extern/zlib-1.2.11 -I$(COMETSEARCH) -I$(ASCOREPRO)/include + override CXXFLAGS += -O3 -static -std=c++20 -fpermissive -Wall -Wextra -Wno-char-subscripts -Wno-unused-result -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/extern/expat-2.2.9/lib -I$(MSTOOLKIT)/extern/zlib-1.2.11 -I$(COMETSEARCH) -I$(ASCOREPRO)/include endif EXECNAME = comet.exe diff --git a/data/comet_canonical.params 
b/data/comet_phospho_internaldecoy1.params similarity index 91% rename from data/comet_canonical.params rename to data/comet_phospho_internaldecoy1.params index 8b89258f..7b9f4a58 100644 --- a/data/comet_canonical.params +++ b/data/comet_phospho_internaldecoy1.params @@ -2,8 +2,13 @@ # Comet MS/MS search engine parameters file. # Everything following the '#' symbol is treated as a comment. # -database_name = C:\\Work\\Comet-master\\data\\human.canonical.fasta -decoy_search = 0 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate +# Variant of comet_phospho.params for tests/regression/run_regression.py's +# "internaldecoy1" decoy variant: decoy_search=1 (internal decoy, concatenated). +# Used for the fasta and pi modes only -- FI searches do not support Comet's +# internal decoy generation. +# +database_name = human.target-decoy.fasta +decoy_search = 1 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate num_threads = 0 # 0=poll CPU to set num threads; else specify num threads directly (max 128) @@ -12,7 +17,7 @@ print_ascorepro_score = -1 # 0=no, 0 to 5 to localize variable_mod0 # # masses # -peptide_mass_tolerance_upper = -20.0 # upper bound of the precursor mass tolerance +peptide_mass_tolerance_upper = 20.0 # upper bound of the precursor mass tolerance peptide_mass_tolerance_lower = -20.0 # lower bound of the precursor mass tolerance; USUALLY NEGATIVE TO BE LOWER THAN 0 peptide_mass_units = 2 # 0=amu, 1=mmu, 2=ppm precursor_tolerance_type = 1 # 0=MH+ (default), 1=precursor m/z; only valid for amu/mmu tolerances @@ -21,9 +26,9 @@ isotope_error = 0 # 0=off, 1=0/1 (C13 error), 2=0/1/2, 3=0/ # # search enzyme # -search_enzyme_number = 0 # choose from list at end of this params file +search_enzyme_number = 1 # choose from list at end of this params file search_enzyme2_number = 0 # second enzyme; set to 0 if no second enzyme -sample_enzyme_number = 0 # specifies the sample enzyme which is possibly different than the one 
applied to the search; +sample_enzyme_number = 1 # specifies the sample enzyme which is possibly different than the one applied to the search; # used by PeptideProphet to calculate NTT & NMC in pepXML output (default=1 for trypsin). num_enzyme_termini = 2 # 1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search @@ -33,8 +38,8 @@ allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search # format: <0=variable/else binary> # e.g. 79.966331 STY 0 3 -1 0 0 97.976896 # -#variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 -#variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 +variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 +variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 variable_mod03 = 0.0 X 0 3 -1 0 0 0.0 variable_mod04 = 0.0 X 0 3 -1 0 0 0.0 variable_mod05 = 0.0 X 0 3 -1 0 0 0.0 @@ -64,7 +69,7 @@ use_NL_ions = 0 # 0=no, 1=yes to consider NH3/H2O neutral # output_sqtfile = 0 # 0=no, 1=yes write sqt file output_txtfile = 1 # 0=no, 1=yes, 2=Crux-formatted write tab-delimited txt file -output_pepxmlfile = 1 # 0=no, 1=yes write pepXML file +output_pepxmlfile = 0 # 0=no, 1=yes write pepXML file output_mzidentmlfile = 0 # 0=no, 1=yes write mzIdentML file output_percolatorfile = 0 # 0=no, 1=yes write Percolator pin file num_output_lines = 1 # num peptide results to show @@ -82,7 +87,7 @@ activation_method = ALL # activation method; used if activation m # misc parameters # digest_mass_range = 800.0 5000.0 # MH+ peptide mass range to analyze -peptide_length_range = 8 25 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) +peptide_length_range = 8 50 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) max_duplicate_proteins = 10 # maximum number of additional duplicate protein names to report for each peptide ID; -1 reports all duplicates max_fragment_charge = 3 # set maximum fragment charge state to analyze (allowed 
max 5) min_precursor_charge = 1 # set minimum precursor charge state to analyze (1 if missing) diff --git a/data/comet.params b/data/comet_phospho_internaldecoy2.params similarity index 92% rename from data/comet.params rename to data/comet_phospho_internaldecoy2.params index a0facab0..32fc99b0 100644 --- a/data/comet.params +++ b/data/comet_phospho_internaldecoy2.params @@ -2,8 +2,13 @@ # Comet MS/MS search engine parameters file. # Everything following the '#' symbol is treated as a comment. # +# Variant of comet_phospho.params for tests/regression/run_regression.py's +# "internaldecoy2" decoy variant: decoy_search=2 (internal decoy, separate). +# Used for the fasta and pi modes only -- FI searches do not support Comet's +# internal decoy generation. +# database_name = human.target-decoy.fasta -decoy_search = 0 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate +decoy_search = 2 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate num_threads = 0 # 0=poll CPU to set num threads; else specify num threads directly (max 128) @@ -12,7 +17,7 @@ print_ascorepro_score = -1 # 0=no, 0 to 5 to localize variable_mod0 # # masses # -peptide_mass_tolerance_upper = -20.0 # upper bound of the precursor mass tolerance +peptide_mass_tolerance_upper = 20.0 # upper bound of the precursor mass tolerance peptide_mass_tolerance_lower = -20.0 # lower bound of the precursor mass tolerance; USUALLY NEGATIVE TO BE LOWER THAN 0 peptide_mass_units = 2 # 0=amu, 1=mmu, 2=ppm precursor_tolerance_type = 1 # 0=MH+ (default), 1=precursor m/z; only valid for amu/mmu tolerances @@ -21,9 +26,9 @@ isotope_error = 0 # 0=off, 1=0/1 (C13 error), 2=0/1/2, 3=0/ # # search enzyme # -search_enzyme_number = 0 # choose from list at end of this params file +search_enzyme_number = 1 # choose from list at end of this params file search_enzyme2_number = 0 # second enzyme; set to 0 if no second enzyme -sample_enzyme_number = 0 # specifies the sample enzyme which is 
possibly different than the one applied to the search; +sample_enzyme_number = 1 # specifies the sample enzyme which is possibly different than the one applied to the search; # used by PeptideProphet to calculate NTT & NMC in pepXML output (default=1 for trypsin). num_enzyme_termini = 2 # 1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search @@ -33,8 +38,8 @@ allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search # format: <0=variable/else binary> # e.g. 79.966331 STY 0 3 -1 0 0 97.976896 # -#variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 -#variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 +variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 +variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 variable_mod03 = 0.0 X 0 3 -1 0 0 0.0 variable_mod04 = 0.0 X 0 3 -1 0 0 0.0 variable_mod05 = 0.0 X 0 3 -1 0 0 0.0 @@ -64,7 +69,7 @@ use_NL_ions = 0 # 0=no, 1=yes to consider NH3/H2O neutral # output_sqtfile = 0 # 0=no, 1=yes write sqt file output_txtfile = 1 # 0=no, 1=yes, 2=Crux-formatted write tab-delimited txt file -output_pepxmlfile = 1 # 0=no, 1=yes write pepXML file +output_pepxmlfile = 0 # 0=no, 1=yes write pepXML file output_mzidentmlfile = 0 # 0=no, 1=yes write mzIdentML file output_percolatorfile = 0 # 0=no, 1=yes write Percolator pin file num_output_lines = 1 # num peptide results to show @@ -82,7 +87,7 @@ activation_method = ALL # activation method; used if activation m # misc parameters # digest_mass_range = 800.0 5000.0 # MH+ peptide mass range to analyze -peptide_length_range = 8 25 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) +peptide_length_range = 8 50 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) max_duplicate_proteins = 10 # maximum number of additional duplicate protein names to report for each peptide ID; -1 reports all duplicates max_fragment_charge = 3 # set maximum fragment 
charge state to analyze (allowed max 5) min_precursor_charge = 1 # set minimum precursor charge state to analyze (1 if missing) diff --git a/data/comet_small.params b/data/comet_small.params deleted file mode 100644 index 62976967..00000000 --- a/data/comet_small.params +++ /dev/null @@ -1,158 +0,0 @@ -# comet_version 2026.01 rev. 0 -# Comet MS/MS search engine parameters file. -# Everything following the '#' symbol is treated as a comment. -# -database_name = C:\\Work\\Comet-master\\data\\human.small.fasta -decoy_search = 0 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate - -num_threads = 0 # 0=poll CPU to set num threads; else specify num threads directly (max 128) - -print_ascorepro_score = -1 # 0=no, 0 to 5 to localize variable_mod01 to _mod05; -1 to localize all variable mods - -# -# masses -# -peptide_mass_tolerance_upper = -20.0 # upper bound of the precursor mass tolerance -peptide_mass_tolerance_lower = -20.0 # lower bound of the precursor mass tolerance; USUALLY NEGATIVE TO BE LOWER THAN 0 -peptide_mass_units = 2 # 0=amu, 1=mmu, 2=ppm -precursor_tolerance_type = 1 # 0=MH+ (default), 1=precursor m/z; only valid for amu/mmu tolerances -isotope_error = 0 # 0=off, 1=0/1 (C13 error), 2=0/1/2, 3=0/1/2/3, 4=-1/0/1/2/3, 5=-1/0/1 - -# -# search enzyme -# -search_enzyme_number = 0 # choose from list at end of this params file -search_enzyme2_number = 0 # second enzyme; set to 0 if no second enzyme -sample_enzyme_number = 0 # specifies the sample enzyme which is possibly different than the one applied to the search; - # used by PeptideProphet to calculate NTT & NMC in pepXML output (default=1 for trypsin). 
-num_enzyme_termini = 2 # 1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific -allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search - -# -# Up to 15 variable_mod entries are supported for a standard search; manually add additional entries as needed -# format: <0=variable/else binary> -# e.g. 79.966331 STY 0 3 -1 0 0 97.976896 -# -#variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 -#variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 -variable_mod03 = 0.0 X 0 3 -1 0 0 0.0 -variable_mod04 = 0.0 X 0 3 -1 0 0 0.0 -variable_mod05 = 0.0 X 0 3 -1 0 0 0.0 -max_variable_mods_in_peptide = 4 -require_variable_mod = 0 - -# -# fragment ions -# -# ion trap ms/ms: 1.0005 tolerance, 0.4 offset (mono masses), theoretical_fragment_ions = 1 -# high res ms/ms: 0.02 tolerance, 0.0 offset (mono masses), theoretical_fragment_ions = 0, spectrum_batch_size = 15000 -# -fragment_bin_tol = 0.02 # binning to use on fragment ions -fragment_bin_offset = 0.0 # offset position to start the binning (0.0 to 1.0) -theoretical_fragment_ions = 0 # 0=use flanking peaks, 1=M peak only -use_A_ions = 0 -use_B_ions = 1 -use_C_ions = 0 -use_X_ions = 0 -use_Y_ions = 1 -use_Z_ions = 0 -use_Z1_ions = 0 -use_NL_ions = 0 # 0=no, 1=yes to consider NH3/H2O neutral loss peaks - -# -# output -# -output_sqtfile = 0 # 0=no, 1=yes write sqt file -output_txtfile = 1 # 0=no, 1=yes, 2=Crux-formatted write tab-delimited txt file -output_pepxmlfile = 1 # 0=no, 1=yes write pepXML file -output_mzidentmlfile = 0 # 0=no, 1=yes write mzIdentML file -output_percolatorfile = 0 # 0=no, 1=yes write Percolator pin file -num_output_lines = 1 # num peptide results to show - -# -# mzXML/mzML/raw file parameters -# -scan_range = 0 0 # start and end scan range to search; either entry can be set independently -precursor_charge = 0 0 # precursor charge range to analyze; does not override any existing charge; 0 as 1st entry ignores parameter -override_charge = 0 # 0=no, 1=override precursor charge 
states, 2=ignore precursor charges outside precursor_charge range, 3=see online -ms_level = 2 # MS level to analyze, valid are levels 2 (default) or 3 -activation_method = ALL # activation method; used if activation method set; allowed ALL, CID, ECD, ETD, ETD+SA, PQD, HCD, IRMPD, SID - -# -# misc parameters -# -digest_mass_range = 800.0 5000.0 # MH+ peptide mass range to analyze -peptide_length_range = 8 25 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) -max_duplicate_proteins = 10 # maximum number of additional duplicate protein names to report for each peptide ID; -1 reports all duplicates -max_fragment_charge = 3 # set maximum fragment charge state to analyze (allowed max 5) -min_precursor_charge = 1 # set minimum precursor charge state to analyze (1 if missing) -max_precursor_charge = 6 # set maximum precursor charge state to analyze (allowed max 9) -clip_nterm_methionine = 0 # 0=leave protein sequences as-is; 1=also consider sequence w/o N-term methionine -spectrum_batch_size = 75000 # max. # of spectra to search at a time; 0 to search the entire scan range in one loop -decoy_prefix = DECOY_ # decoy entries are denoted by this string which is pre-pended to each protein accession -equal_I_and_L = 1 # 0=treat I and L as different; 1=treat I and L as same -mass_offsets = # one or more mass offsets to search (values substracted from deconvoluted precursor mass) - -# -# spectral processing -# -minimum_peaks = 10 # required minimum number of peaks in spectrum to search (default 10) -minimum_intensity = 0 # minimum intensity value to read in -remove_precursor_peak = 0 # 0=no, 1=yes, 2=all charge reduced precursor peaks (for ETD), 3=phosphate neutral loss peaks -remove_precursor_tolerance = 1.5 # +- Da tolerance for precursor removal -clear_mz_range = 0.0 0.0 # clear out all peaks in the specified m/z range e.g. remove reporter ion region of TMT spectra -percentage_base_peak = 0.0 # specify a percentage (e.g. 
"0.05" for 5%) of the base peak intensity as a minimum intensity threshold - -# -# static modifications -# -add_Cterm_peptide = 0.0 -add_Nterm_peptide = 0.0 -add_Cterm_protein = 0.0 -add_Nterm_protein = 0.0 - -add_G_glycine = 0.0000 # added to G - avg. 57.0513, mono. 57.02146 -add_A_alanine = 0.0000 # added to A - avg. 71.0779, mono. 71.03711 -add_S_serine = 0.0000 # added to S - avg. 87.0773, mono. 87.03203 -add_P_proline = 0.0000 # added to P - avg. 97.1152, mono. 97.05276 -add_V_valine = 0.0000 # added to V - avg. 99.1311, mono. 99.06841 -add_T_threonine = 0.0000 # added to T - avg. 101.1038, mono. 101.04768 -add_C_cysteine = 57.021464 # added to C - avg. 103.1429, mono. 103.00918 -add_L_leucine = 0.0000 # added to L - avg. 113.1576, mono. 113.08406 -add_I_isoleucine = 0.0000 # added to I - avg. 113.1576, mono. 113.08406 -add_N_asparagine = 0.0000 # added to N - avg. 114.1026, mono. 114.04293 -add_D_aspartic_acid = 0.0000 # added to D - avg. 115.0874, mono. 115.02694 -add_Q_glutamine = 0.0000 # added to Q - avg. 128.1292, mono. 128.05858 -add_K_lysine = 0.0000 # added to K - avg. 128.1723, mono. 128.09496 -add_E_glutamic_acid = 0.0000 # added to E - avg. 129.1140, mono. 129.04259 -add_M_methionine = 0.0000 # added to M - avg. 131.1961, mono. 131.04048 -add_H_histidine = 0.0000 # added to H - avg. 137.1393, mono. 137.05891 -add_F_phenylalanine = 0.0000 # added to F - avg. 147.1739, mono. 147.06841 -add_U_selenocysteine = 0.0000 # added to U - avg. 150.0379, mono. 150.95363 -add_R_arginine = 0.0000 # added to R - avg. 156.1857, mono. 156.10111 -add_Y_tyrosine = 0.0000 # added to Y - avg. 163.0633, mono. 163.06333 -add_W_tryptophan = 0.0000 # added to W - avg. 186.0793, mono. 186.07931 -add_O_pyrrolysine = 0.0000 # added to O - avg. 237.2982, mono 237.14773 -add_B_user_amino_acid = 0.0000 # added to B - avg. 0.0000, mono. 0.00000 -add_J_user_amino_acid = 0.0000 # added to J - avg. 0.0000, mono. 0.00000 -add_X_user_amino_acid = 0.0000 # added to X - avg. 
0.0000, mono. 0.00000 -add_Z_user_amino_acid = 0.0000 # added to Z - avg. 0.0000, mono. 0.00000 - -# -# COMET_ENZYME_INFO _must_ be at the end of this parameters file -# Enzyme entries can be added/deleted/edited -# -[COMET_ENZYME_INFO] -0. Cut_everywhere 0 - - -1. Trypsin 1 KR P -2. Trypsin/P 1 KR - -3. Lys_C 1 K P -4. Lys_N 0 K - -5. Arg_C 1 R P -6. Asp_N 0 DN - -7. CNBr 1 M - -8. Asp-N_ambic 1 DE - -9. PepsinA 1 FL - -10. Chymotrypsin 1 FWYL P -11. No_cut 1 @ @ - diff --git a/docs/20260612_architecture_migration.md b/docs/20260612_architecture_migration.md new file mode 100644 index 00000000..df81cae5 --- /dev/null +++ b/docs/20260612_architecture_migration.md @@ -0,0 +1,965 @@ +# Architecture Migration Plan + +**Date**: 2026-06-12 +**Scope**: `CometSearch/` library only +**Goal**: Separate concerns, reduce coupling, increase modularity. +Behavior is unchanged at every step; each phase is independently compilable and testable. + +--- + +## Status (as of 2026-06-13) + +| Phase | Description | State | Commit | +|-------|-------------|-------|--------| +| 1 | Split `CometDataInternal.h` | **Complete** | `4337ee8d` | +| 2 | Extract `SearchMemoryPool` | **Complete** | `4337ee8d` | +| 3 | Extract `IResultWriter` | **Complete** | `4337ee8d` | +| 4 | Introduce `SearchSession` | **Complete** | `00e0655f` | +| 5 | Extract `ISearchStrategy` + `Pipeline` | **Complete** | uncommitted | +| 6+ | Further decomposition (index/, spectrum/, scoring/) | Planned | — | + +All phases verified: 17/17 unit tests pass; HeLa FI_DB batch parity confirmed at +each phase boundary (zero PSM diff at 1 % and 5 % FDR, xcorr and e-value). + +--- + +## Background + +The codebase has six structural pathologies that this plan addresses in order of +increasing invasiveness: + +1. `CometDataInternal.h` (1,554 lines) is a monolith — constants, parameter structs, + result structs, index structs, and scoring structs all in one file. A one-line + change rebuilds every translation unit. + +2. 
Pool slot management (`_pbSearchMemoryPool`, `_ppbDuplFragmentArr`, + `AcquirePoolSlot`) is buried in `CometSearch` static members and bleeds into + `CometSearchManager` and `SearchThreadData` with no clear ownership. + +3. Five result writers (`CometWriteTxt`, `CometWriteSqt`, `CometWritePepXML`, + `CometWriteMzIdentML`, `CometWritePercolator`) are called via sequential `if` + chains in `DoSearch()` and access `g_pvQuery` / `g_staticParams` directly. + There is no shared interface. + +4. Twenty-eight mutable globals act as the implicit API between all modules: + `g_staticParams`, `g_pvQuery`, `g_pvQueryMS1`, `g_cometStatus`, + `g_searchMemoryPoolMutex`, `g_searchPoolCV`, `g_bPlainPeptideIndexRead`, etc. + Any file can write any global at any time. + +5. `CometSearchManager::DoSearch()` (~1,100 lines) mixes parameter validation, + index loading, per-file loop logic, file handle management, search dispatch, + result writing, and progress reporting. + +6. Search-path selection (`if iDbType == FI_DB ... else if PI_DB ...`) appears in + `DoSearch()`, `RunSearch()`, `LoadAndPreprocessSpectra()`, + `FusedLoadAndSearchSpectra()`, and `InitializeSingleSpectrumSearch()`. Adding + a new index type requires edits in eight or more places. + +--- + +## Target Folder Structure + +``` +CometSearch/ +├── core/ +│ ├── Constants.h # All compile-time constants (split from CometDataInternal.h) +│ ├── Types.h # Results, Query, PepMassInfo, scoring data structs +│ └── Params.h # StaticParams and all sub-structs +│ +├── params/ +│ ├── ParamLoader.h/.cpp # File/map -> StaticParams (from CometSearchManager lines 625-1862) +│ └── ParamValidator.h/.cpp # ValidateOutputFormat, ValidateScanRange, etc. 
+│ +├── index/ +│ ├── ISearchIndex.h # Abstract interface: Load(), GetType(), IsLoaded() +│ ├── fragment/ +│ │ ├── FragmentIndex.h/.cpp # Runtime state + query +│ │ └── FragmentIndexBuilder.h/.cpp # WriteFIPlainPeptideIndex +│ ├── peptide/ +│ │ ├── PeptideIndex.h/.cpp +│ │ └── PeptideIndexBuilder.h/.cpp +│ └── speclib/ +│ ├── SpecLib.h/.cpp +│ └── Alignment.h/.cpp +│ +├── spectrum/ +│ ├── ISpectrumSource.h # Interface: next(Spectrum&)->bool, scanCount(), seekTo() +│ ├── MSReaderSource.h/.cpp # MSReader-backed implementation +│ ├── Preprocessor.h/.cpp # Binning, xcorr prep -- pure computation, no I/O +│ └── BoundedQueue.h # BoundedSpectrumQueue (moved from CometPreprocess.cpp) +│ +├── scoring/ +│ ├── XcorrScorer.h/.cpp # SearchFragmentIndex, XcorrScore +│ ├── SpScorer.h/.cpp # CalculateSP +│ └── EValueScorer.h/.cpp # CalculateEValue, CalculateDeltaCn +│ +├── search/ +│ ├── SearchSession.h # Owns mutable run state (replaces g_pvQuery etc.) +│ ├── ISearchStrategy.h # Pure virtual: initialize / execute / finalize +│ ├── FastaStrategy.h/.cpp # FASTA_DB path +│ ├── FiStrategy.h/.cpp # FI_DB batch + RTS paths +│ ├── PiStrategy.h/.cpp # PI_DB path +│ └── Pipeline.h/.cpp # Selects strategy, drives per-file loop +│ +├── output/ +│ ├── IResultWriter.h # Pure virtual: write(results, params) +│ ├── TxtWriter.h/.cpp +│ ├── SqtWriter.h/.cpp +│ ├── PepXmlWriter.h/.cpp +│ ├── MzIdentMlWriter.h/.cpp +│ └── PercolatorWriter.h/.cpp +│ +├── threading/ +│ ├── ThreadPool.h # Unchanged +│ └── SearchMemoryPool.h/.cpp # Extracted from CometSearch statics +│ +└── SearchManager.h/.cpp # Thin ICometSearchManager impl -- delegates to Pipeline +``` + +--- + +## Phase 1 — Split `CometDataInternal.h` + +**Status**: Complete — committed `4337ee8d` + +**Effort**: ~1 day **Risk**: Low (mechanical split, no logic changes) + +### Problem + +`CometDataInternal.h` is included by every `.cpp` in the library. 
It contains: +- Physical/algorithmic constants (`#define` macros, lines 33–114) +- Fourteen parameter sub-structs (`Options`, `ToleranceParams`, `IonInfo`, + `MassUtil`, `VarModParams`, `StaticMod`, `PrecalcMasses`, `DBInfo`, + `SpecLibInfo`, `PEFFInfo`, `EnzymeInfo`, `MassRange`, lines 116–980) +- `StaticParams` aggregate (lines 890–1172) +- Result/query structs (`Results`, `Query`, `QueryMS1`, `PepMassInfo`, + `SpectrumInfoInternal`, `PreprocessStruct`, `SpecLibResults`, lines 248–1490) +- Index-related structs (`PlainPeptideIndexStruct`, `FragmentPeptidesStruct`, + `DBIndex`, `PepGenTuple`, `PepGenTupleShort`, `IndexProteinStruct`, + `ProteinsListCSR`, lines 454–1277) +- PEFF structs (`PeffModStruct`, `PeffVariantSimpleStruct`, + `PeffVariantComplexStruct`, `PeffPositionStruct`, `PeffSearchStruct`, + lines 340–424) +- Scoring/output structs (`MatchedIonsStruct`, `IonSeriesStruct`, + `ModificationNumber`, lines 1278–1555) +- `DbType` enum (line 882) + +### Action + +Create `CometSearch/core/` and split into three headers: + +**`core/Constants.h`** — all `#define` constants replaced with `constexpr`: + +``` +Source lines in CometDataInternal.h: 33-114 +Contents: + PROTON_MASS, C13_DIFF, FLOAT_ZERO + MIN/MAX_PEPTIDE_LEN, MAX_PEPTIDE_LEN_P2 + FRAGINDEX_* (8 constants) + MS1_* (4 constants) + MAX_PEFFMOD_LEN, SIZE_MASS, SIZE_NATIVEID + NUM_SP_IONS, NUM_ION_SERIES, VMODS, HISTO_SIZE + WIDTH_REFERENCE, MAX_PROTEINS, EXPECT_DECOY_SIZE + NO_PEFF_VARIANT, ASCORE_CUTOFF_TO_ACCEPT, FRAGINDEX_VMODS + COMPOUNDMODS_OFFSET, VMOD_*_INDEX (15 constants) + ENZYME_* (4 constants) + ION_SERIES_* (7 constants) + XCORR_CUTOFF, SPECLIB_CUTOFF + DbType enum (move from line 882) +Change: #define -> constexpr int/double. DbType moves here from line 882. 
+``` + +**`core/Params.h`** — all parameter structs that StaticParams aggregates: + +``` +Source lines in CometDataInternal.h: 116-246 (Options) + 828-854 (ToleranceParams) + 856-878 (IonInfo) + 721-789 (PrecalcMasses) + 790-826 (MassUtil) + 697-720 (StaticMod) + 741-789 (VarModParams) + 436-453 (DBInfo) + 645-649 (SpecLibInfo) + 691-696 (PEFFInfo) + 890-1172 (StaticParams) + 321-333 (MassRange) +Also includes: EnzymeInfo (from CometData.h -- leave in place, just #include it) +Depends on: core/Constants.h, CometData.h +``` + +**`core/Types.h`** — runtime data structs (per-spectrum, per-query): + +``` +Source lines in CometDataInternal.h: + 248-278 Results + 280-295 SpecLibResults, SpecLibResultsMS1 + 296-320 PepMassInfo, SpectrumInfoInternal + 334-339 PreprocessStruct + 340-424 PEFF structs (5 structs) + 425-435 sDBEntry + 454-602 DBIndex, PepGenTuple, PepGenTupleShort + 510-602 PepGenTuple / PepGenTupleShort + 603-644 PlainPeptideIndexStruct, FragmentPeptidesStruct + 650-689 SpecLibStruct, RetentionMatch + 684-690 IndexProteinStruct + 1175-1277 ProteinsListCSR + 1278-1310 ModificationNumber + 1312-1491 Query + 1492-1536 QueryMS1 + 1537-1555 IonSeriesStruct, MatchedIonsStruct + 352-364 ProteinEntryStruct + 365-424 Peff structs +Depends on: core/Constants.h, core/Params.h, CometData.h, Threading.h, AScore headers +``` + +### Transition + +Keep `CometDataInternal.h` as a compatibility shim that just includes the three +new headers. This means zero changes to existing `.cpp` files in Phase 1: + +```cpp +// CometDataInternal.h after Phase 1 -- pure forwarding +#pragma once +#include "core/Constants.h" +#include "core/Params.h" +#include "core/Types.h" +``` + +In Phase 2+, files that only need one of the three headers update their own +`#include` to the specific header. `CometDataInternal.h` can be retired once +no `.cpp` includes it directly. 
+ +### Verification + +``` +make cclean && make # must compile clean +python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass +``` + +--- + +## Phase 2 — Extract `SearchMemoryPool` + +**Status**: Complete — committed `4337ee8d` + +**Effort**: ~1 day **Risk**: Low (self-contained, well-tested at runtime) + +### Problem + +The duplicate-fragment scratch arrays and the pool-slot semaphore are spread +across three locations with no single owner: + +| Location | What it does | +|----------|-------------| +| `CometSearch.cpp` lines 23-24 | Defines `_pbSearchMemoryPool`, `_ppbDuplFragmentArr` as class statics | +| `CometSearch.cpp` lines 45-116 | `AllocateMemory()`, `DeallocateMemory()`, `AcquirePoolSlot()` | +| `CometSearch.h` lines 50-59 | `SearchThreadData::~SearchThreadData()` releases the slot directly | +| `CometSearch.cpp` lines 139-140, 182-183, 227-228, 272+ | Inline slot release at each `RunSearch` call site | +| `CometSearchManager.cpp` lines 2741-2748 | Calls `AllocateMemory` / `DeallocateMemory` | +| `CometSearchManager.cpp` line 60 | Defines `g_searchMemoryPoolMutex` | +| `CometSearchManager.cpp` line 67 | Defines `g_searchPoolCV` | +| `CometSearchManager.cpp` line 94 | Defines `g_bCometSearchMemoryAllocated` | + +### New File: `threading/SearchMemoryPool.h` + +```cpp +#pragma once +#include <mutex> +#include <condition_variable> + +// Owns the per-thread duplicate-fragment scratch arrays used during search. +// Replaces CometSearch::_pbSearchMemoryPool, _ppbDuplFragmentArr, +// AllocateMemory(), DeallocateMemory(), AcquirePoolSlot() and the paired globals +// g_searchMemoryPoolMutex, g_searchPoolCV, g_bCometSearchMemoryAllocated. +class SearchMemoryPool +{ +public: + SearchMemoryPool() = default; + ~SearchMemoryPool() { if (_allocated) deallocate(_nSlots); } + + // Allocates nSlots scratch arrays each of size iArraySize bools. + // Corresponds to CometSearch::AllocateMemory(nThreads). + bool allocate(int nSlots, int iArraySize); + + // Frees all scratch arrays. 
+ // Corresponds to CometSearch::DeallocateMemory(nThreads). + void deallocate(int nSlots); + + // Blocks up to 240 s until a slot is free. Returns index in [0, nSlots) + // or -1 on timeout. Corresponds to CometSearch::AcquirePoolSlot(). + int acquireSlot(); + + // Returns the slot and signals one waiting acquireSlot() caller. + // Corresponds to the inline release blocks in CometSearch::RunSearch. + void releaseSlot(int slot); + + // Direct access to the scratch array for a claimed slot. + bool* duplFragmentArr(int slot) const { return _pool[slot]; } + + int slotCount() const { return _nSlots; } + +private: + int _nSlots = 0; + bool* _inUse = nullptr; // was _pbSearchMemoryPool + bool** _pool = nullptr; // was _ppbDuplFragmentArr + bool _allocated = false; + + std::mutex _mutex; + std::condition_variable _cv; +}; +``` + +### New File: `threading/SearchMemoryPool.cpp` + +The implementations are direct ports of the existing functions: + +``` +allocate() <- CometSearch::AllocateMemory() lines 45-72 + reads: g_staticParams.iArraySizeGlobal (pass as parameter instead) + writes: g_bCometSearchMemoryAllocated (becomes _allocated member) + +deallocate() <- CometSearch::DeallocateMemory() lines 75-92 + reads: g_bCometSearchMemoryAllocated (becomes _allocated member) + +acquireSlot() <- CometSearch::AcquirePoolSlot() lines 97-116 + reads: g_staticParams.options.iNumThreads (becomes _nSlots) + uses: g_searchMemoryPoolMutex -> _mutex + g_searchPoolCV -> _cv + _pbSearchMemoryPool -> _inUse + +releaseSlot() <- inline blocks at CometSearch.cpp lines 139, 182, 227, 272+ + uses: g_searchMemoryPoolMutex -> _mutex + g_searchPoolCV -> _cv + _pbSearchMemoryPool -> _inUse +``` + +### `SearchThreadData` update + +`CometSearch.h` `SearchThreadData::~SearchThreadData()` (lines 50-59) currently +releases the slot directly into globals. 
Update it to hold a `SearchMemoryPool*` +and call `releaseSlot()`: + +```cpp +struct SearchThreadData +{ + sDBEntry dbEntry; + int iPoolSlot = -1; + SearchMemoryPool* pPool = nullptr; + ThreadPool* tp = nullptr; + + ~SearchThreadData() + { + if (pPool && iPoolSlot >= 0) + { + pPool->releaseSlot(iPoolSlot); + iPoolSlot = -1; + } + dbEntry.vectorPeffMod.clear(); + dbEntry.vectorPeffVariantSimple.clear(); + } +}; +``` + +### Call-site changes + +Every call site that calls `CometSearch::AllocateMemory()`, +`CometSearch::DeallocateMemory()`, or `CometSearch::AcquirePoolSlot()` is +updated to use the `SearchMemoryPool` object. The object is constructed in +`CometSearchManager::DoSearch()` and passed by reference to all functions that +need it: + +``` +CometSearchManager.cpp line 2741: CometPreprocess::AllocateMemory() -- unchanged +CometSearchManager.cpp line 2746: CometSearch::AllocateMemory() -> pool.allocate(n, arraySize) +CometSearchManager.cpp ~line 2332: CometSearch::DeallocateMemory() -> pool.deallocate(n) +CometSearch.cpp line 132: AcquirePoolSlot() -> pool.acquireSlot() +CometSearch.cpp line 139: inline release -> pool.releaseSlot(iSlot) +CometSearch.cpp line 175: (PI_DB path) same pattern +CometSearch.cpp lines 220, 227, 272+: (batch path) same pattern +CometPreprocess.cpp FusedLoadAndSearchSpectra: pool.duplFragmentArr(t) replaces + _ppbDuplFragmentArr[t] +``` + +Globals retired after this phase: `g_searchMemoryPoolMutex`, `g_searchPoolCV`, +`g_bCometSearchMemoryAllocated`. 
+ +### Verification + +``` +make cclean && make +python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass +# run HeLa mzXML batch search and confirm PSM count matches pre-change baseline +``` + +--- + +## Phase 3 — Extract `IResultWriter` + +**Status**: Complete — committed `4337ee8d` + +**Effort**: ~2 days **Risk**: Medium (touches writer internals) + +### Problem + +Five writer classes are dispatched from `DoSearch()` via 300+ lines of sequential +`if (bOutputXxx)` blocks (lines 2446–2900 in `CometSearchManager.cpp`). Each +writer reads `g_pvQuery` and `g_staticParams` directly. There is no shared +interface, so the dispatch cannot be driven polymorphically. + +### New File: `output/IResultWriter.h` + +```cpp +#pragma once +#include "core/Types.h" +#include "core/Params.h" +#include <vector> + +// Abstract result serializer. One concrete implementation per output format. +// Replaces the sequential if (bOutputTxtFile) / if (bOutputPepXMLFile) / ... +// dispatch in CometSearchManager::DoSearch(). +class IResultWriter +{ +public: + virtual ~IResultWriter() = default; + + // Open output file(s) and write format header. + // baseName: g_staticParams.inputFile.szBaseName + szOutputSuffix + // Called once per input file, before any spectra are searched. + virtual bool open(const std::string& baseName, const StaticParams& params) = 0; + + // Write all results for one batch of spectra. + // results is sorted by scan number (compareByScanNumber already applied). + // Called once per spectrum batch within a file. + virtual void write(const std::vector<Query*>& results, + const StaticParams& params) = 0; + + // Flush and close output file(s). Write format footer if needed (e.g. pepXML). + // Called once per input file, after all batches are complete. + virtual void close(const StaticParams& params) = 0; +}; +``` + +### Writer refactoring + +Each existing writer becomes a concrete `IResultWriter`. 
The key behavioral +change is: instead of reading `g_pvQuery` directly, receive `results` as a +parameter. `g_staticParams` access is replaced by the `params` parameter. + +**`CometWriteTxt` -> `output/TxtWriter`** + +``` +Current: void CometWriteTxt::PrintResults(int iWhichQuery, bool bDecoy, + FILE* fpout, FILE* fpoutd, int iPrintTargetDecoy) + reads g_pvQuery.at(iWhichQuery) and g_staticParams directly + +After: write() iterates over the results vector instead of g_pvQuery. + The file handles (fpout, fpoutd) become private members opened in open(). + g_staticParams references become params parameter. + +open() <- file open + PrintTxtHeader() call (lines 2500-2550 in SearchManager) +write() <- current PrintResults() loop body, receiving vector<Query*> +close() <- fclose(fpout); fclose(fpoutd); +``` + +**`CometWriteSqt` -> `output/SqtWriter`** + +``` +open() <- file open + PrintSqtHeader() call (lines 2446-2498 in SearchManager) +write() <- existing PrintResults() but receiving vector<Query*> +close() <- fclose(fpout); fclose(fpoutd); +``` + +**`CometWritePepXML` -> `output/PepXmlWriter`** + +``` +open() <- file open + WritePepXMLHeader() (lines 2553-2627 in SearchManager) +write() <- existing PrintPepXMLResults() receiving vector<Query*> +close() <- WritePepXMLFooter() + fclose +Note: pepXML has a two-pass pattern (tmp file + finalize). The tmp-file logic + (currently lines 2659-2724) moves into close(). 
+``` + +**`CometWriteMzIdentML` -> `output/MzIdentMlWriter`** + +``` +open() <- file open + header (lines 2628-2724 in SearchManager) +write() <- existing per-scan output +close() <- footer + tmp file merge + fclose +``` + +**`CometWritePercolator` -> `output/PercolatorWriter`** + +``` +open() <- file open + WritePercolatorHeader() (lines 2724-2734 in SearchManager) +write() <- existing PrintPercolatorResults() receiving vector<Query*> +close() <- fclose +``` + +### `DoSearch()` dispatch replacement + +The 300-line dispatch block in `DoSearch()` (lines 2446-2900) becomes a factory +that builds a `vector<unique_ptr<IResultWriter>>` once per input file: + +```cpp +// In DoSearch() -- replaces lines 2446-2734 +vector<unique_ptr<IResultWriter>> writers; +if (g_staticParams.options.bOutputTxtFile) + writers.push_back(make_unique<TxtWriter>()); +if (g_staticParams.options.bOutputSqtFile || g_staticParams.options.bOutputSqtStream) + writers.push_back(make_unique<SqtWriter>()); +if (g_staticParams.options.bOutputPepXMLFile) + writers.push_back(make_unique<PepXmlWriter>()); +if (g_staticParams.options.iOutputMzIdentMLFile) + writers.push_back(make_unique<MzIdentMlWriter>()); +if (g_staticParams.options.bOutputPercolatorFile) + writers.push_back(make_unique<PercolatorWriter>()); + +// open all writers before first search batch +for (auto& w : writers) + if (!w->open(baseName, g_staticParams)) { /* handle error */ } + +// after each batch sort+write: +for (auto& w : writers) + w->write(g_pvQuery, g_staticParams); + +// after all batches: +for (auto& w : writers) + w->close(g_staticParams); +``` + +Note: `g_pvQuery` and `g_staticParams` are still globals at this phase. That +coupling is eliminated in Phase 4. Phase 3 only introduces the interface and +moves file-handle lifetime into the writer objects. 
+ +### Verification + +``` +make cclean && make +python3 tests/unit/run_tests.py --comet comet.exe +# Run HeLa mzXML; diff txt output against pre-Phase-3 baseline -- must be identical +# (header line timestamp will differ; all PSM data must match exactly) +``` + +--- + +## Phase 4 — Introduce `SearchSession` + +**Status**: Complete — committed `00e0655f` + +**Effort**: ~3 days **Risk**: Medium-high (many call sites) + +### Problem + +The mutable state for one search run is scattered across 28 globals. Any code +can modify any of them without any indication of ownership or lifetime. + +### New File: `search/SearchSession.h` + +```cpp +#pragma once +#include "core/Params.h" +#include "core/Types.h" +#include "CometStatus.h" +#include <vector> +#include <mutex> + +// Owns mutable state for one search run. +// Created at the start of DoSearch() / InitializeSingleSpectrumSearch(). +// Passed by reference to all pipeline functions that write results. +// Read-only index state (g_iFragmentIndex, g_vFragmentPeptides, g_vSpecLib, +// g_vRawPeptides, g_pvProteinsList, g_pvProteinNameCache) is NOT moved here -- +// those are large, initialized once, and shared read-only across all searches. +// They remain as const globals. (See note on pragmatic globals below.) +struct SearchSession +{ + // Run parameters -- set once before searching, then read-only. + // The params reference outlives the session (owned by CometSearchManager). + const StaticParams& params; + + // Per-batch result accumulator. + // Guarded by queriesMutex in the batch path; not accessed concurrently in RTS. + std::vector<Query*> queries; + std::vector<QueryMS1*> ms1Queries; + std::mutex queriesMutex; + + // Run-time flags (currently globals) + bool bPerformDatabaseSearch = false; + bool bPerformSpecLibSearch = false; + bool bIdxNoFasta = false; + bool bPlainPeptideIndexRead = false; + bool bSpecLibRead = false; + + // Error / cancel state for this run. + // Replaces g_cometStatus for per-run isolation. 
+ CometStatus status; + + explicit SearchSession(const StaticParams& p) : params(p) {} + SearchSession(const SearchSession&) = delete; + SearchSession& operator=(const SearchSession&) = delete; +}; +``` + +### Globals replaced by SearchSession + +``` +Global (CometSearchManager.cpp) -> SearchSession member +------------------------------------------------------- +g_pvQuery -> session.queries +g_pvQueryMS1 -> session.ms1Queries +g_pvQueryMutex -> session.queriesMutex +g_bPerformDatabaseSearch -> session.bPerformDatabaseSearch +g_bPerformSpecLibSearch -> session.bPerformSpecLibSearch +g_bIdxNoFasta -> session.bIdxNoFasta +g_bPlainPeptideIndexRead -> session.bPlainPeptideIndexRead +g_bSpecLibRead -> session.bSpecLibRead +g_cometStatus -> session.status +``` + +### Globals intentionally NOT moved (pragmatic globals) + +The following globals remain as globals. They are large, allocated once, +read-only after initialization, and shared by concurrent threads. Moving them +into a session object would require reference or pointer threading through +hundreds of scoring call sites with no correctness benefit: + +``` +g_staticParams -- read-only after DoSearch() init; replace with session.params +g_iFragmentIndex -- read-only after index load; stays global +g_iFragmentIndexOffset -- same +g_vFragmentPeptides -- same +g_vRawPeptides -- same +g_pvProteinsList -- same +g_pvProteinNameCache -- same +g_vSpecLib -- same +g_pvDBIndex -- read-only after FASTA scan; stays global +g_vvvPepGenShort/.Long -- same +g_massRange -- derived from params; can be computed on demand +g_pvProteinNames -- read-only after load; stays global +g_pvInputFiles -- owned by CometSearchManager; stays +g_sCometVersion -- constant after init; stays +g_AScoreOptions -- constant after init; stays +g_AScoreInterface -- constant after init; stays +g_bPeptideIndexRead -- atomic, read-only after set; stays +RetentionMatchHistory -- deque used by alignment; keep as module-local in Alignment +``` + +### Migration 
strategy + +Introduce `SearchSession` alongside the existing globals. In Phase 4, both exist +in parallel. Each function signature that currently reads a global gets a +`SearchSession&` parameter added. The global is then read from `session.member` +instead of the global directly. Once all reads/writes go through the session, +the global definition is removed. + +Recommended order within Phase 4 (lowest risk first): + +``` +Step 4a: Add session to DoSearch() and the per-file loop. Pass to writer open()/write()/close(). +Step 4b: Thread session into CometPreprocess::LoadAndPreprocessSpectra() and + FusedLoadAndSearchSpectra(). Remove g_pvQuery push under mutex; use + session.queries.push_back() under session.queriesMutex. +Step 4c: Thread session into CometSearch::RunSearch() overloads. RunSearch(Query*) + and RunSearch(int, int, ThreadPool*) no longer read g_pvQuery directly. +Step 4d: Thread session into CometPostAnalysis. PostAnalysisThreadProc currently + iterates g_pvQuery; replace with session.queries. +Step 4e: Remove global definitions for the nine replaced globals. Compiler errors + will identify any remaining direct accesses. 
+``` + +### Verification + +After each step 4a-4e: +``` +make cclean && make +python3 tests/unit/run_tests.py --comet comet.exe +# batch HeLa mzXML diff against Phase 3 baseline +``` + +--- + +## Phase 5 — Extract `ISearchStrategy` and `Pipeline` + +**Status**: Complete — uncommitted (working tree on `batch_FI_optimization`) + +**Effort**: ~1 week **Risk**: High (most invasive refactor) + +### Problem + +`DoSearch()` selects the search path via cascading `if (iDbType == FI_DB)` chains +that appear in at minimum these locations: + +``` +CometSearchManager.cpp ~line 2252: bCreatePeptideIndex path +CometSearchManager.cpp ~line 2324: bCreateFragmentIndex path +CometSearchManager.cpp ~line 2352: FI_DB precursor pre-read +CometSearchManager.cpp ~line 2808: FI_DB index load +CometSearchManager.cpp ~line 2900: FASTA_DB vs FI/PI_DB file opens +CometSearch.cpp line 122: RunSearch(Query*) dispatch +CometSearch.cpp line 206: RunSearch(ThreadPool*) dispatch +CometPreprocess.cpp: LoadAndPreprocessSpectra vs FusedLoadAndSearchSpectra +CometSearchManager.cpp ~line 3283: InitializeSingleSpectrumSearch dispatch +``` + +### New File: `search/ISearchStrategy.h` + +```cpp +#pragma once +#include "SearchSession.h" +#include "threading/SearchMemoryPool.h" +#include "ThreadPool.h" + +struct InputFileInfo; + +// One implementation per database type: FastaStrategy, FiStrategy, PiStrategy. +// Pipeline selects the correct one at startup and holds it for the run. +class ISearchStrategy +{ +public: + virtual ~ISearchStrategy() = default; + + // Called once before the first input file. + // Responsible for index loading / building (e.g. ReadPlainPeptideIndex, + // CreateFragmentIndex, WriteFIPlainPeptideIndex, WritePeptideIndex). + // Returns false on error. + virtual bool initialize(SearchSession& session, ThreadPool& pool) = 0; + + // Called once per input file. Opens the spectrum source, reads/searches + // all batches, appends fully scored Query* objects to session.queries. 
+ // Returns false on error or cancel. + virtual bool execute(const InputFileInfo& file, + SearchSession& session, + SearchMemoryPool& pool, + ThreadPool& tp) = 0; + + // Called once after all files. Cleanup (index dealloc, etc.). + virtual void finalize(SearchSession& session, ThreadPool& pool) = 0; +}; +``` + +### Strategy implementations + +**`search/FiStrategy.h/.cpp`** — FI_DB batch path + +``` +initialize(): + If bCreateFragmentIndex: call WriteFIPlainPeptideIndex(tp) then return. + Else: pre-read precursors (if !iFragIndexSkipReadPrecursors), + call ReadPlainPeptideIndex() + CreateFragmentIndex(tp). + Source: DoSearch() lines 2324-2414. + +execute(): + Opens MSReader, calls FusedLoadAndSearchSpectra() in batch loop. + Source: DoSearch() lines 2808-3220 (FI_DB branch). + +finalize(): + CometSearch::DeallocateMemory(), CometPreprocess::DeallocateMemory(). +``` + +**`search/FastaStrategy.h/.cpp`** — FASTA_DB path + +``` +initialize(): + ReadProteinVarModFilterFile() if configured. + CometSearch::AllocateMemory(). + Source: DoSearch() lines 2252-2277. + +execute(): + Opens MSReader and FASTA file handle. + Runs LoadAndPreprocessSpectra() + RunSearch() in batch loop. + Source: DoSearch() lines 2800-3220 (FASTA_DB branch). + +finalize(): + DeallocateMemory(). +``` + +**`search/PiStrategy.h/.cpp`** — PI_DB path + +``` +initialize(): + If bCreatePeptideIndex: call WritePeptideIndex(tp) then return. + Else: load peptide index. + Source: DoSearch() lines 2245-2252. + +execute(): + Same loop structure as FiStrategy but calls SearchPeptideIndex(). + +finalize(): + DeallocateMemory(). +``` + +### New File: `search/Pipeline.h/.cpp` + +``` +// Pipeline.h +class Pipeline +{ +public: + Pipeline(unique_ptr<ISearchStrategy> strategy, + vector<unique_ptr<IResultWriter>> writers); + + // Drives the full batch search for all files. + // Replaces the main body of CometSearchManager::DoSearch(). 
+ bool run(SearchSession& session, + const vector<InputFileInfo*>& files, + ThreadPool& pool); + +private: + void flushAndWrite(SearchSession& session); + unique_ptr<ISearchStrategy> _strategy; + vector<unique_ptr<IResultWriter>> _writers; +}; +``` + +### Strategy factory + +A free function in `SearchManager.cpp` selects the right strategy based on +`g_staticParams.iDbType` and the index-build flags. This is the single location +where the `if (iDbType == FI_DB)` logic lives after Phase 5: + +```cpp +static unique_ptr<ISearchStrategy> makeStrategy(const StaticParams& p) +{ + if (p.iDbType == DbType::FI_DB || p.options.bCreateFragmentIndex) + return make_unique<FiStrategy>(); + if (p.iDbType == DbType::PI_DB || p.options.bCreatePeptideIndex) + return make_unique<PiStrategy>(); + return make_unique<FastaStrategy>(); +} +``` + +### `DoSearch()` after Phase 5 + +The 4,585-line `CometSearchManager::DoSearch()` body reduces to approximately: + +```cpp +bool CometSearchManager::DoSearch() +{ + if (!InitializeStaticParams()) return false; + if (!ValidateOutputFormat()) return false; + if (!ValidateScanRange()) return false; + if (!ValidatePeptideLengthRange()) return false; + + try { _tp->fillPool(g_staticParams.options.iNumThreads); } + catch (...) { /* error */ return false; } + + SearchSession session(g_staticParams); + session.bPerformDatabaseSearch = ValidateSequenceDatabaseFile(); + session.bPerformSpecLibSearch = ValidateSpecLibFile(); + + auto strategy = makeStrategy(g_staticParams); + auto writers = makeWriters(g_staticParams); // builds IResultWriter vector + Pipeline pipeline(move(strategy), move(writers)); + + return pipeline.run(session, g_pvInputFiles, *_tp); +} +``` + +### RTS path + +The RTS entry points (`InitializeSingleSpectrumSearch`, +`DoSingleSpectrumSearchMultiResults`, `FinalizeSingleSpectrumSearch`) are +**not moved into the strategy pattern** in Phase 5. They are thread-safe, +well-tested, and called from C# via `CometWrapper`. Refactoring them carries +high wrapper-compatibility risk. 
They remain in `CometSearchManager` and use +`g_staticParams` / `g_iFragmentIndex` etc. directly. This is explicitly out +of scope for Phase 5. + +### Verification + +``` +make cclean && make +python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass +# batch HeLa mzXML diff against Phase 4 baseline -- identical PSM data +# run integration test (T17/T18) against human.small.fasta +# confirm RTS path still compiles and executes via RealtimeSearch.exe smoke test +``` + +### Actual implementation notes + +The interface as built is more fine-grained than the plan above. The plan had a +single `execute(file, session, pool, tp)` per file; the actual `ISearchStrategy` +splits the per-file work into four methods so the common per-file loop (MSReader +setup, writer open/close, batch while-loop, timing) can live in `Pipeline::run()` +without duplication across three strategies: + +```cpp +virtual bool initialize(SearchSession& session, ThreadPool* tp) = 0; +virtual bool openFiles(const std::string& szDatabase, + FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb, + SearchSession& session) = 0; +virtual bool executeBatch(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session) = 0; +virtual void closeFiles(FILE* fpfasta, FILE* fpidx) = 0; +virtual void finalize() = 0; +virtual bool isIndexBased() const = 0; +``` + +`iPercentStart`/`iPercentEnd` are passed by reference so each strategy can update +them after `LoadAndPreprocessSpectra` but before calling `RunSearch`, preserving +the exact progress-reporting semantics of the original code. + +A `search/SearchUtils.h` header was added to hold utility functions extracted +from `CometSearchManager.cpp` statics (`GetInputType`, `UpdateInputFile`, +`SetMSLevelFilter`, `AllocateResultsMem`, `compareByPeptideMass`, +`compareByMangoIndex`, `compareByScanNumber`). 
These are inline functions +shared by all three strategy `.cpp` files without circular includes. + +The early-return index-build paths (`bCreateFragmentIndex`, `bCreatePeptideIndex`) +remain in `DoSearch()` as early returns before `makeStrategy()` is called, rather +than being absorbed into strategy `initialize()`. This avoids adding "are we +done?" signaling between strategy and pipeline for what is conceptually a +separate, one-shot operation. + +### Results + +- Build: clean on Linux (gcc, c++20), no new errors +- Unit tests: 17/17 pass +- HeLa FI_DB parity: `20250520_Hela_60min_06.mzXML` vs `human.canonical.target-decoy.fasta.idx`, + trypsin + phospho + oxidation, 49,747 spectra + Pre-Phase5 (commit `00e0655f`): 16,559 xcorr PSMs @ 1% FDR, 18,458 evalue PSMs @ 1% + Phase5 (working tree): 16,559 xcorr PSMs @ 1% FDR, 18,458 evalue PSMs @ 1% + Diff: **zero unique PSMs** at 1% and 5% FDR for both xcorr and evalue sorting + +--- + +## Build System + +The Linux `Makefile` currently globs `CometSearch/*.cpp`. After Phase 1 it needs +to include subdirectory sources. Update the `SRCS` variable in `CometSearch/Makefile`: + +```makefile +SRCS := $(wildcard *.cpp) \ + $(wildcard core/*.cpp) \ + $(wildcard threading/*.cpp) \ + $(wildcard output/*.cpp) \ + $(wildcard search/*.cpp) +``` + +The Windows `CometSearch.vcxproj` needs a new `<ClCompile>` entry for each new +`.cpp` added. Use `<Filter>` entries to create matching Solution Explorer folders. + +--- + +## Line-Ending Rule + +All new `.h` and `.cpp` files must use CRLF line endings (Windows `\r\n`). +Verify after creating each file: +```bash +file CometSearch/threading/SearchMemoryPool.h # must show "CRLF line terminators" +``` +If not, run `unix2dos <file>` before committing. 
+ +--- + +## Phase Summary + +| Phase | Target | Key Files Changed | Globals Retired | Risk | +|-------|--------|-------------------|-----------------|------| +| 1 | Split `CometDataInternal.h` | `core/Constants.h`, `core/Params.h`, `core/Types.h` | None (shim kept) | Low | +| 2 | `SearchMemoryPool` | `threading/SearchMemoryPool.h/.cpp`, `CometSearch.h/.cpp`, `CometSearchManager.cpp` | `g_searchMemoryPoolMutex`, `g_searchPoolCV`, `g_bCometSearchMemoryAllocated` | Low | +| 3 | `IResultWriter` | `output/IResultWriter.h`, 5 writer files, `CometSearchManager.cpp` | None yet | Medium | +| 4 | `SearchSession` | `search/SearchSession.h`, `CometSearchManager.cpp`, `CometPreprocess.cpp`, `CometSearch.cpp`, `CometPostAnalysis.cpp` | `g_pvQuery`, `g_pvQueryMS1`, `g_pvQueryMutex`, `g_bPerformDatabaseSearch`, `g_bPerformSpecLibSearch`, `g_bIdxNoFasta`, `g_bPlainPeptideIndexRead`, `g_bSpecLibRead`, `g_cometStatus` | Medium-high | +| 5 | `ISearchStrategy` + `Pipeline` | `search/ISearchStrategy.h`, `FiStrategy`, `FastaStrategy`, `PiStrategy`, `Pipeline.h/.cpp`, `SearchManager.cpp` | Search-path `if/else` chains | High | diff --git a/docs/20260612_producerConsumerQueue.md b/docs/20260612_producerConsumerQueue.md new file mode 100644 index 00000000..4a04f935 --- /dev/null +++ b/docs/20260612_producerConsumerQueue.md @@ -0,0 +1,209 @@ +# Producer/Consumer Queue for Fused Batch FI_DB Path + +## Context + +`FusedLoadAndSearchSpectra` (added in `batch_FI_optimization`) eliminated the +three-sweep DRAM anti-scaling problem by fusing preprocess -> search -> +post-analysis per spectrum in one pass. However, it still reads the entire batch +into `std::vector<Spectrum> vSpectra` before dispatching any worker, because the +original work-stealing design required the full vector to be present before +`fetch_add` indexing could begin. 
+ +This two-phase structure has a measurable RAM cost: + +- `MSToolkit::Spectrum` stores its peaks in a heap-allocated `vector<Peak_T>` + (12 bytes per peak: 8-byte `double mz` + 4-byte `float intensity`). +- A typical HeLa MS2 spectrum has ~600-800 peaks: ~8 KB per spectrum. +- A 302 MB HeLa `.raw` file contains ~40k MS2 spectra: **~320 MB** held in + `vSpectra` simultaneously before a single spectrum is processed. +- Peak RAM for the HeLa benchmark is 10.5 GB; that 320 MB is recoverable + with no algorithmic loss. + +There is no correctness reason to read ahead more than one spectrum beyond what +workers can immediately consume. A bounded producer/consumer queue lets the +read loop and the worker pool run concurrently, capping peak spectrum RAM to +`O(iNumThreads)` regardless of file size. + +## Goal + +Replace the two-phase (read-all -> process-all) structure of +`FusedLoadAndSearchSpectra` with a single-pass pipeline: + +- **Producer** (calling thread): reads spectra from the raw file one at a time + and pushes them into a bounded concurrent queue, blocking when the queue is + full. +- **Consumers** (`iNumThreads` workers): pop from the queue and call + `FusedSearchSpectrum` immediately, with no change to `FusedSearchSpectrum` + itself. + +I/O and compute overlap; peak spectrum RAM drops from ~320 MB to under 1 MB +(queue depth x spectrum size). + +## Confirmed facts the design relies on + +- `FusedSearchSpectrum(Spectrum spec, int iSlot)` takes `Spectrum` by value + (already a copy); the queue can safely `std::move` spectra into and out of + storage. No pointer aliasing issue. +- The pool slot index `iSlot` is a per-worker constant (0..iNumThreads-1). + Each consumer lambda captures its own `t` at launch time -- same as the current + `fetch_add` dispatch. The `_ppbDuplFragmentArr` lifetime is the full batch, + not per-spectrum. This is unchanged. +- `CheckExit` / `g_pvQueryMutex` are called on the producer thread inside the + read loop. 
This is unchanged; only the producer runs the loop. +- `_bDoneProcessingAllSpectra` is set by the read loop before the function + returns. The outer `CometSearchManager` batch while loop reads it after + `FusedLoadAndSearchSpectra` returns. This is unchanged. +- `g_pvQuery` is pushed under `g_pvQueryMutex` inside `FusedSearchSpectrum`. + Multiple consumer threads already do this in the current implementation; + no change needed. +- PSM output is sorted by scan number (`compareByScanNumber`) in + `CometSearchManager` after `FusedLoadAndSearchSpectra` returns. Consumer + execution order therefore does not need to match read order; only the sort + at the end matters. **PSM output remains bit-identical to the current + fused path** (same `FusedSearchSpectrum`, same post-sort). +- `tp->wait_on_threads()` already blocks until all active jobs finish; + no new synchronization primitive is needed at the outer level. + +## Design: BoundedSpectrumQueue + +A simple mutex + two condition-variable queue is sufficient. The bottleneck +is `FusedSearchSpectrum` (~1.4 ms/spectrum), not queue throughput. Lock-free +structures would add complexity with no measurable benefit. + +```cpp +struct BoundedSpectrumQueue +{ + std::queue<Spectrum> q; + std::mutex mtx; + std::condition_variable cvNotFull; + std::condition_variable cvNotEmpty; + size_t maxDepth; + bool bDone = false; + + explicit BoundedSpectrumQueue(size_t depth) : maxDepth(depth) {} + + // Producer calls this. Blocks when queue is full. + void push(Spectrum&& spec) + { + std::unique_lock<std::mutex> lk(mtx); + cvNotFull.wait(lk, [&]{ return q.size() < maxDepth || bDone; }); + if (!bDone) + { + q.push(std::move(spec)); + cvNotEmpty.notify_one(); + } + } + + // Consumer calls this. Returns false when done and queue is empty. 
+ bool pop(Spectrum& spec) + { + std::unique_lock lk(mtx); + cvNotEmpty.wait(lk, [&]{ return !q.empty() || bDone; }); + if (q.empty()) return false; + spec = std::move(q.front()); + q.pop(); + cvNotFull.notify_one(); + return true; + } + + // Producer calls after the read loop ends. + void finish() + { + std::unique_lock lk(mtx); + bDone = true; + cvNotEmpty.notify_all(); + cvNotFull.notify_all(); + } +}; +``` + +**Queue depth**: `iNumThreads * 4`. At steady state, each consumer holds one +spectrum (inside `FusedSearchSpectrum`). A depth of 4x threads means the +producer can stay up to 4 spectra/thread ahead without blocking. For 20 threads, +peak in-flight spectra = 20 (being processed) + 80 (in queue) = 100 spectra x +~8 KB = **800 KB**, down from ~320 MB. + +## Implementation changes + +### Stage 1 -- Add `BoundedSpectrumQueue` (CometPreprocess.cpp) + +Define the struct near the top of `CometPreprocess.cpp`, alongside the +`RtsScratch` definition. It is a local implementation detail and does not need +its own header. + +### Stage 2 -- Restructure `FusedLoadAndSearchSpectra` + +Remove `std::vector vSpectra` and the `std::atomic ctr` +dispatch block. Replace with: + +``` +1. Construct BoundedSpectrumQueue with depth = iNumThreads * 4. + +2. Launch iNumThreads consumer workers BEFORE the read loop: + + for (int t = 0; t < iNumSlots; ++t) + { + tp->doJob([&queue, t]() + { + Spectrum spec; + while (queue.pop(spec)) + FusedSearchSpectrum(std::move(spec), t); + }); + } + +3. Run the read loop on the calling thread (unchanged logic). + Replace: + vSpectra.push_back(mstSpectrum); + with: + queue.push(std::move(mstSpectrum)); + +4. After the read loop: call queue.finish(). + +5. tp->wait_on_threads() (unchanged). +``` + +Note: workers are launched before reading starts so that the first spectrum +pushed is consumed immediately with no dead time. If launched after, the read +loop could fill the queue and stall before any worker starts. 
+ +### Stage 3 -- Error/cancel handling + +If `g_cometStatus.IsError()` or `IsCancel()` is detected inside the read loop +(via `CheckExit`), the read loop breaks. `queue.finish()` is called +unconditionally after the loop and before `wait_on_threads`, so consumers drain +any buffered spectra and exit cleanly. This matches the current behavior where +spectra already in `vSpectra` were still processed after an early break. If +strict cancellation is desired (drop buffered spectra on error), consumers can +check `g_cometStatus` at the top of their loop and call `queue.finish()` +themselves to unblock the producer. + +## Files changed + +| File | Change | +|------|--------| +| `CometSearch/CometPreprocess.cpp` | Add `BoundedSpectrumQueue`; restructure `FusedLoadAndSearchSpectra` | +| `CometSearch/CometPreprocess.h` | No change (no new public API) | +| Everything else | No change | + +## Memory impact summary + +| Metric | Before (batch_FI_optimization) | After (this plan) | +|--------|-------------------------------|-------------------| +| Spectrum buffer RAM | ~320 MB (40k spectra) | ~800 KB (100 spectra) | +| Peak total (HeLa) | 10.5 GB | ~10.2 GB (est.) | +| Dominant cost | Fragment index (~9.5 GB) | Fragment index (~9.5 GB) | + +The fragment index dominates; this change recovers the spectrum-buffer overhead +entirely. + +## Verification + +1. **Unit tests**: `python tests/unit/run_tests.py --comet comet.exe` -- all 17 + tests must pass. +2. **PSM parity**: Run on HeLa `.raw` with both the `batch_FI_optimization` + binary (before this change) and the new binary. `diff` on the `.txt` outputs + must show only the header line (run name + timestamp), as verified for the + prior change. `tools/qvalue.py --diff` must show zero unique PSMs at 1% and + 5% FDR. +3. **Memory**: Run under `/usr/bin/time -v` and confirm `Maximum resident set + size` drops by ~300 MB relative to the prior binary on the same HeLa file. 
diff --git a/docs/20260615_multiple_rts_instances.md b/docs/20260615_multiple_rts_instances.md new file mode 100644 index 00000000..f081aa4f --- /dev/null +++ b/docs/20260615_multiple_rts_instances.md @@ -0,0 +1,134 @@ +# Multiple Concurrent RTS Instances: Design Options + +**Goal:** Allow N concurrent RTS instances in the same host process (or across processes), each running an independent set of search parameters, so that different subsets of spectra can be searched with different parameter sets simultaneously. + +--- + +## The Core Challenge + +All state that makes one search parameterization distinct from another is currently a process-wide singleton. There are two categories: + +**Must be per-instance (encode the parameter set):** +- `g_staticParams` -- the parameter root +- `g_iFragmentIndex` / `g_iFragmentIndexOffset` / `g_vFragmentPeptides` -- the index encodes enzyme cleavage, variable mods, and peptide length range; different params -> different index +- `MOD_NUMBERS` / `MOD_SEQS` / `PEPTIDE_MOD_SEQ_IDXS` -- mod permutation tables built from `variableModParameters` +- `CometSearch::_pbSearchMemoryPool` / `_ppbDuplFragmentArr` -- pool sized to param-set's thread count +- `g_AScoreOptions` / `g_AScoreInterface` -- if AScore settings differ +- `g_cometStatus` -- each instance needs independent error/cancel state +- All init flags and `singleSearchInitializationComplete` + +**Potentially shared (encode the database, not the params):** +- `g_vRawPeptides` (~300 MB) -- plain peptide sequences from the `.idx` file +- `g_pvProteinsList` (~200 MB CSR) -- protein file offsets per peptide +- `g_pvProteinNameCache` (~7 MB) -- protein name strings +- `g_pvProteinNames` -- indexed protein accessions +- `g_vSpecLib` / `g_vulSpecLibPrecursorIndex` -- if all instances use the same MS1 reference + +--- + +## Option A: Multiple processes + +Run N separate instances of the host application (or N `RealtimeSearch.exe` processes). 
Each process has its own address space and therefore its own independent copy of all globals. A C# coordinator routes spectra to the right process and aggregates results. + +**Zero C++ changes required.** Works today. + +**Pros:** Complete isolation, no lock contention between instances, simplest reasoning about state. + +**Cons:** N x full memory footprint per process. For a human target-decoy `.idx`, the fragment index alone is 3-8 GB; three instances = 9-24 GB just for the index. IPC cost for routing spectra and collecting results across process boundaries. + +**When to choose:** If memory is not constrained, or if the parameter sets are infrequently changed and process startup latency is acceptable. + +--- + +## Option B: Per-instance context struct (recommended long-term path) + +Move all process-global state into an `RtsContext` struct owned by each `CometSearchManager` instance. Multiple `CometSearchManager` objects can then coexist in the same process with fully independent state. + +```cpp +// New: CometSearch/RtsContext.h +struct RtsContext { + StaticParams params; + unsigned int* iFragmentIndex = nullptr; + uint64_t* iFragmentIndexOffset = nullptr; + vector vFragmentPeptides; + vector vRawPeptides; + ProteinsListCSR pvProteinsList; + unordered_map pvProteinNameCache; + map pvProteinNames; + bool* bIndexPrecursors = nullptr; + vector vSpecLib; + vector> vulSpecLibPrecursorIndex; + AScoreProCpp::AScoreOptions AScoreOptions; + AScoreProCpp::AScoreDllInterface* pAScoreInterface = nullptr; + vector MOD_NUMBERS; + vector MOD_SEQS; + // ... mod index arrays ... + bool* pbSearchMemoryPool = nullptr; + bool** ppbDuplFragmentArr = nullptr; + CometStatus status; + // init flags are already members of CometSearchManager +}; +``` + +`CometSearchManager` holds a `unique_ptr<RtsContext>`. Every internal function that currently reads `g_staticParams` receives a `const RtsContext&` (or `const StaticParams&`) instead. 
The `CometSearch` class static members `_pbSearchMemoryPool` / `_ppbDuplFragmentArr` become per-instance (either stored in `RtsContext` and passed in, or `CometSearch` becomes a non-static class). + +The C# side creates N `CometSearchManagerWrapper` objects -- a natural extension of what is already there. Each wrapper wraps one `CometSearchManager` which owns one `RtsContext`. Spectra are routed to the appropriate wrapper by the C# coordinator. + +**Pros:** Single process, low IPC overhead, easy result aggregation, no process-startup latency per instance. Full clean encapsulation -- no global state at all after the refactor. + +**Cons:** Memory cost is the same as multi-process (N x index size). The refactor touches ~15 `.cpp`/`.h` files everywhere `g_staticParams`, `g_iFragmentIndex`, etc. are referenced. It is mechanical but not small -- `g_staticParams` alone appears in roughly 30 call sites in `CometSearch.cpp`. + +**Scope estimate:** The most invasive change is threading `const RtsContext&` (or just `const StaticParams&` for the scoring-only functions) through the call chains in `CometSearch.cpp`, `CometPreprocess.cpp`, `CometPostAnalysis.cpp`, `CometFragmentIndex.cpp`. A staged approach works: start with `g_staticParams` (referenced everywhere), get that building cleanly, then migrate the index arrays. + +--- + +## Option C: Shared-database layer + per-param search layer + +Split `RtsContext` into two levels: + +```cpp +struct DatabaseContext { // shared via shared_ptr + vector vRawPeptides; + ProteinsListCSR pvProteinsList; + unordered_map pvProteinNameCache; + map pvProteinNames; +}; + +struct SearchContext { // per-instance + StaticParams params; + shared_ptr db; // shared + unsigned int* iFragmentIndex; + uint64_t* iFragmentIndexOffset; + vector vFragmentPeptides; + // ... pools, mod tables, AScore, status ... 
+}; +``` + +The `DatabaseContext` is loaded once from the `.idx` file (which encodes protein names and peptide sequences regardless of search params) and shared among all `SearchContext` instances that reference the same file. + +**Memory savings:** ~500 MB per extra instance on the sharable data. The fragment index still cannot be shared when mods or enzyme differ -- and at 3-8 GB that is the dominant cost. Savings are typically 10-20% for a human proteome use case with three instances. + +**Pros:** Meaningful memory reduction if N is large or the base database is very large. + +**Cons:** Added complexity (two-level ownership, `shared_ptr` threading, database-identity matching). Benefit is modest when the index itself is not shared. + +**When to choose:** If all N instances use the same `.idx` file (guaranteed same database) AND memory is tight enough that 500 MB x N matters. + +--- + +## Special case: same database, same mods, different scoring params only + +If the only differences between instances are tolerance, ion series (a/b/c/x/y/z), minimum score, or similar scoring-time parameters -- things that do not affect which peptides are in the index -- then the entire fragment index (`g_iFragmentIndex`, `g_vFragmentPeptides`, `g_vRawPeptides`) is the same for all instances and can be shared. Only `g_staticParams` truly differs. + +In this case Option C degenerates to: share everything except `StaticParams` and the memory pool. The C++ scoring functions would receive a `const StaticParams&` argument instead of reading `g_staticParams` directly, which is a much smaller change than the full Option B refactor. 
+ +--- + +## Recommendation + +| Timeframe | Choice | Reason | +|-----------|--------|--------| +| Immediately | **Option A** (multiple processes) | Zero C++ changes, works today, C# coordinator routes spectra | +| Long term | **Option B** (per-instance context) | Clean encapsulation, single process, natural extension of C# API, enables future optimizations including Option C | + +The key question that should drive which option is prioritized: **do the N param sets use the same `.idx` database file?** If yes and memory is a concern, the staging would be: Option B first, then selectively apply Option C's `shared_ptr` for the large read-only arrays as an optimization on top. diff --git a/docs/20260616_codereview1.md b/docs/20260616_codereview1.md new file mode 100644 index 00000000..cf54c974 --- /dev/null +++ b/docs/20260616_codereview1.md @@ -0,0 +1,218 @@ +# Code Review — architecture_update branch +# 2026-06-16 + +Reviewed by: Claude Sonnet 4.6 (high-effort, 7-angle finder + per-candidate verification) +Scope: `git diff master...HEAD` — 70 files, +5542 / -3127 lines + +--- + +## Summary + +This branch introduces a major architectural refactor: Strategy pattern for search +(`FastaStrategy`, `FiStrategy`, `PiStrategy`), a `Pipeline` orchestrator, a `SearchSession` +object, an `IResultWriter` interface with four concrete writer classes, a `SearchMemoryPool`, +and a `core/` split of `CometDataInternal.h` into `Constants.h`, `Params.h`, and `Types.h`. +The structural changes are sound, but the refactor introduced four confirmed bugs (two of +which corrupt search results or violate real-time latency guarantees) and two plausible bugs +on error paths. + +--- + +## Critical Issues + +### 1. 
`StorePeptideI` ignores `bDecoyPep` — decoys written to target list (FDR corruption) +**File:** `CometSearch/CometSearch.cpp` ~line 8618 +**Severity:** Critical — wrong results, silent + +The new `Query*`-based overload of `StorePeptideI` (added for the FI/PI index path in +Task 1.2) comments out the `bDecoyPep` parameter: + +```cpp +void CometSearch::StorePeptideI(Query* pQuery, ..., bool /*bDecoyPep*/, ...) { +``` + +The parameter is dead. The function body always writes to `pQuery->_pResults` and +increments `iMatchPeptideCount`, regardless of whether the peptide is a decoy. +`pQuery->_pDecoys` and `iDecoyMatchPeptideCount` are never touched by this overload. + +The callers at lines ~8591 and ~8608 correctly pass `bDecoyPep=true` for decoy hits. +The old `StorePeptide` overload (line ~5221) has the correct branch: + +```cpp +if (g_staticParams.options.iDecoySearch == 2 && bDecoyPep) + // write to _pDecoys +``` + +The new overload is missing this branch entirely. + +**Impact:** Any FI_DB or PI_DB search with `iDecoySearch=2` (separate decoy mode) silently +mixes all decoy PSMs into the target result list. FDR estimation is corrupted for every +index-path search in separate-decoy mode. + +**Fix:** Restore the `iDecoySearch==2 && bDecoyPep` branch in `StorePeptideI`, writing +to `pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex]` and comparing against +`pQuery->dLowestDecoyXcorrScore` when the decoy condition holds. + +--- + +### 2. `SearchMS1Library` uses global `g_pvQueryMutex` instead of per-query lock +**File:** `CometSearch/CometSearch.cpp` ~line 3275 +**Severity:** High — RTS latency violation; cross-path serialization + +`SearchMS1Library` (the MS1 real-time search path) guards score updates on a caller-owned +`QueryMS1*` with the process-wide `g_pvQueryMutex`: + +```cpp +ThreadMutexLock(&g_pvQueryMutex); // line ~3275 +// update pMS1Query->dBestXcorr, etc. 
+ThreadMutexUnlock(&g_pvQueryMutex); +``` + +The MS2 RTS path (`RunSearch`) correctly uses `pQuery->accessMutex` for per-query +isolation (lines ~5135, ~8554). `SearchMS1Library` should do the same. + +`g_pvQueryMutex` is also held during batch speclib loading +(`CometPreprocess.cpp:1007`). A running batch search therefore blocks every concurrent +RTS MS1 thread for the full duration of the speclib load, violating the real-time latency +guarantee. + +**Fix:** Add an `accessMutex` field to `QueryMS1` (mirroring `Query::accessMutex`) and +use it in `SearchMS1Library` for score-update critical sections. + +--- + +### 3. `MzIdentMlWriter::FinalizeOne` silently produces invalid `.mzid` on temp-file reopen failure +**File:** `CometSearch/output/MzIdentMlWriter.h` ~line 116 +**Severity:** High — silent data corruption, no error reported + +`FinalizeOne()` closes the temp file then immediately reopens it for reading: + +```cpp +fclose(fpTmp); // line 115 +fpTmp = fopen(sTmp.c_str(), "r"); // line 116 +if (fpTmp) { // line 117 + CometWriteMzIdentML::WriteMzIdentML(...); + fclose(fpTmp); +} +fclose(fpFinal); // line 129 +``` + +If the `fopen` at line 116 fails (network filesystem, external cleanup, non-atomic +close-reopen), the `if` block is skipped: `WriteMzIdentML` is never called, the +spectrum results are never appended, and the output file is closed at line 129 containing +only the XML header — no spectrum results, no closing tags. `g_cometStatus` is never +updated; `DoSearch` returns `true`. Downstream tools receive a structurally invalid file. + +**Fix:** Check the return value of the second `fopen` and call +`g_cometStatus.SetStatus(CometResult_Failed, ...)` on failure before returning. + +--- + +## Code Quality & Maintainability + +### 4. 
`Pipeline::run` — writer `open()` failure leaves already-opened writers unclosed +**File:** `CometSearch/search/Pipeline.cpp` ~line 108 +**Severity:** Medium — FILE* handle leak, truncated output files + +When a writer's `open()` fails, the inner loop breaks and Pipeline calls +`_strategy->closeFiles()` then breaks out of the file loop entirely, bypassing the +`pw->close()` block at lines ~243–247. Writers that already opened successfully (with +partially-written headers on disk) are never closed. + +**Fix:** On `open()` failure, iterate all writers that have already been successfully +opened and call their `close()` before returning `false`. + +--- + +### 5. `Pipeline::run` returns on `initialize()` failure without calling `finalize()` (memory leak) +**File:** `CometSearch/search/Pipeline.cpp` ~line 38 +**Severity:** Medium — memory leak on error path + +```cpp +if (!_strategy->initialize(session)) return false; // line 38 +// ... +_strategy->finalize(session); // line ~256, only reached on success +``` + +`finalize()` is the sole cleanup point for memory allocated by `initialize()` (thread-pool +scratch buffers, precursor arrays). If `initialize()` returns `false` midway — e.g., +`CometPreprocess::AllocateMemory` succeeds then `ReadPrecursors` fails — those allocations +are never freed. On repeated calls (C# wrapper retrying after a failed search), each +failed init accumulates leaked memory. + +**Fix:** Call `_strategy->finalize(session)` before returning `false` at line 38, or +structure the function with a `goto cleanup` / RAII guard so `finalize` always runs. + +--- + +### 6. `WithinMassTolerancePeff` seek-back loop uses wrong reference mass +**File:** `CometSearch/CometSearch.cpp` ~line 4380 +**Severity:** Medium — false negatives in PEFF searches + +After `BinarySearchMass` locates the correct position for `dCalcPepMass + dMassAddition`, +the seek-back while-loop compares against bare `dCalcPepMass` instead of +`dCalcPepMass + dMassAddition`. 
With a large positive PEFF modification (e.g., +80 Da +for phospho), the found position is 80 Da ahead of `dCalcPepMass` in the sorted index; +the seek-back stops far too early, and candidate peptides that are within tolerance of +the modified mass are never evaluated. + +**Fix:** Change the seek-back comparison operand from `dCalcPepMass` to +`dCalcPepMass + dMassAddition`, mirroring the value passed to `BinarySearchMass`. + +--- + +### 7. `SearchSession::bPlainPeptideIndexRead` / `bSpecLibRead` are dead fields +**File:** `CometSearch/search/SearchSession.h` ~line 44 +**Severity:** Low — architectural drift; misleads about ownership + +`SearchSession` declares `bPlainPeptideIndexRead` and `bSpecLibRead` as session-owned +state, but `FiStrategy::initialize` reads the global `g_bPlainPeptideIndexRead` — not +`session.bPlainPeptideIndexRead`. The session fields are never set or checked by any +code path. A reader auditing `SearchSession` to understand index state will draw the +wrong conclusion about where the authoritative value lives. + +**Fix:** Either wire `FiStrategy::initialize` to read and write `session.bPlainPeptideIndexRead` +and retire the global, or remove the dead session fields until the migration is ready. + +--- + +## Actionable Improvements + +### 8. `FiStrategy::executeBatch` is a near-copy of `FastaStrategy::executeBatch` with a dead Mango block +**File:** `CometSearch/search/FiStrategy.cpp` ~line 59 + +The non-fused `executeBatch` body is an almost-exact copy of `FastaStrategy::executeBatch`, +including a Mango sort block that can never execute in this branch (`bFused` is false only +when `bMango || bSpecLib` is true, meaning the fused path is taken instead). Any future +change to the shared preprocessing sequence must be applied in both files. + +Extract the shared preprocessing sequence into a free function in `SearchUtils.h` and call +it from both strategies. + +--- + +### 9. 
`BuildNames()` copy-pasted verbatim into all four writer classes +**Files:** `CometSearch/output/SqtWriter.h`, `TxtWriter.h`, `PepXmlWriter.h`, `MzIdentMlWriter.h` ~line 43 each + +Each concrete writer class contains an identical private static `BuildNames()` method; +only the default file extension string differs at the call site. Any fix to filename +construction logic (CRUX mode suffix, range-number format, path separator) must be applied +in four places and will inevitably diverge. + +```cpp +// Replace four copies with one free function in IResultWriter.h: +static void BuildNames(const std::string& defaultExt, + std::string& sBaseName, + std::vector<std::string>& vFileNames); +``` + +--- + +### 10. `PrintPercolatorSearchHit` takes `std::vector<std::string>` by value — per-PSM copy overhead +**File:** `CometSearch/CometWritePercolator.h` ~line 43 + +`PrintPercolatorSearchHit` accepts `vProteinTargets` and `vProteinDecoys` by value, +copying up to `iMaxDuplicateProteins` (default 20) `std::string` objects per PSM. The +vectors are assembled by the caller immediately before the call and used read-only inside +the function. Change to `const std::vector<std::string>&` to eliminate the per-PSM +allocation/copy/destruction with no other change required. diff --git a/docs/20260616_codereview2.md b/docs/20260616_codereview2.md new file mode 100644 index 00000000..10e3acb4 --- /dev/null +++ b/docs/20260616_codereview2.md @@ -0,0 +1,214 @@ +# Code Review -- architecture_update (Follow-Up) + +**Date:** 2026-06-16 +**Reviewer:** Claude Code (claude-sonnet-4-6) +**Scope:** Follow-up review of the 9 fixes applied after the initial code review (20260616_codereview.md). +Branch: `architecture_update` vs `master` (working tree included). +**Method:** 7-angle Phase 1 (A-G, up to 6 candidates each) + Phase 2 per-candidate +verification (CONFIRMED / PLAUSIBLE / REFUTED). Only CONFIRMED and PLAUSIBLE findings +are reported below. + +--- + +## 1. 
Summary + +The nine fixes from the first review are largely sound: the mutex, PEFF seek-back, +dead-field removal, BuildNames consolidation, and Percolator const-ref changes are all +correct and clean. Three confirmed defects remain in newly added code: a file-descriptor +leak in MzIdentMlWriter, a misplaced memset in AllocateResultsMem, and an int-to-short +narrowing in StorePeptideI's new decoy branch. One plausible concurrency hazard exists +in the dual slot-tracking representation carried over from the refactor. + +--- + +## 2. Critical Issues + +### [C1] MzIdentMlWriter -- mkstemp fd leaked on every OpenTmp() call (Linux) + +**File:** `CometSearch/output/MzIdentMlWriter.h` +**Lines:** ~94-101 + +On Linux, `OpenTmp()` calls `mkstemp(&sTmp[0])` (which creates and opens the temp file, +returning a live fd), uses the return value only as an error sentinel (`== -1`), then +calls `fopen(sTmp.c_str(), "w")` to open a second handle to the same path. The fd +returned by `mkstemp()` is never passed to `close()`. One fd is leaked per `OpenTmp()` +invocation -- once per mzIdentML output file per search batch. + +**Failure scenario:** With a small `spectrum_batch_size` or many concurrent mzIdentML +writers, the process exhausts its open-fd limit, causing subsequent `fopen()` calls to +return `nullptr` and triggering "cannot write to temporary mzIdentML file" errors that +abort the search. + +**Fix:** +```cpp +int fd = mkstemp(&sTmp[0]); +if (fd == -1) +{ + // error path + return false; +} +close(fd); // release the fd; fopen below opens its own handle +fp = fopen(sTmp.c_str(), "w"); +``` + +--- + +### [C2] SearchUtils.h -- iXcorrHistogram memset inside per-result slot loop + +**File:** `CometSearch/search/SearchUtils.h` +**Lines:** ~190 (inside `AllocateResultsMem`) + +`iXcorrHistogram` is a per-`Query` array (declared `int iXcorrHistogram[HISTO_SIZE]` on +the `Query` struct in `core/Types.h:593`), not a per-`Results` slot field. 
The +`memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram))` call is placed +inside the inner `for (int j = 0; j < g_staticParams.options.iNumStored; ++j)` loop, so +it zeroes the same query-level array `iNumStored` times instead of once. On iterations +j > 0, it resets the histogram, destroying any accumulation from prior j iterations. + +**Failure scenario:** Currently harmless because histogram population happens after +`AllocateResultsMem` returns (during the search phase). However, if histogram data were +ever partially populated before the j-loop completes, iteration j=1 would silently +destroy accumulations from j=0. It also wastes `iNumStored - 1` redundant memset calls +per query. + +**Fix:** Move `memset(pQuery->iXcorrHistogram, ...)` to just after +`pQuery->iDecoyMatchPeptideCount = 0`, before the for-j loop begins, so it executes +exactly once per query. + +--- + +### [C3] CometSearch.cpp -- int-to-short narrowing in StorePeptideI decoy index + +**File:** `CometSearch/CometSearch.cpp` +**Lines:** ~8724-8733 (new decoy branch in `StorePeptideI`) + +The new decoy branch recomputes the lowest-scoring decoy slot index with +`for (int i = 1; ...)` and assigns `siLowestDecoyXcorrScoreIndex = i` where the local +variable is declared `short`. This is an implicit int-to-short narrowing conversion. +The analogous loop in `StorePeptide()` (FASTA path, line ~5227) uses `short siA` +throughout, keeping the type consistent with the `short siLowestDecoyXcorrScoreIndex` +field on `Query` (declared `core/Types.h:603`). + +**Failure scenario:** Safe at current `iNumStored` values (typically <= 10). If +`iNumStored` were ever set to >= 32,768 the narrowing truncation would produce a wrong +or negative index, causing `_pDecoys[]` to be accessed out of bounds in the next +`StorePeptideI` call and silently corrupting decoy results. 
+ +**Fix:** Change the loop variable to `short` to match `StorePeptide()`: +```cpp +for (short siA = 1; siA < (short)g_staticParams.options.iNumStored; ++siA) +{ + if (pQuery->_pDecoys[siA].fXcorr < pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].fXcorr) + siLowestDecoyXcorrScoreIndex = siA; +} +pQuery->siLowestDecoyXcorrScoreIndex = siLowestDecoyXcorrScoreIndex; +``` + +--- + +## 3. Code Quality and Maintainability + +### [C4] CometSearch -- dual slot-tracking systems alias the same scratch buffers + +**File:** `CometSearch/CometSearch.cpp`, `CometSearch/threading/SearchMemoryPool.h` +**Lines:** `CometSearch.cpp:1267`, `SearchMemoryPool.cpp:80` + +The refactor introduced `SearchMemoryPool` (`s_pool`) but retained the legacy +`_pbSearchMemoryPool[]` + `g_searchMemoryPoolMutex` slot-tracking used by the FASTA +batch path (`SearchThreadProc`). The RTS path calls `s_pool.acquireSlot()` / +`releaseSlot()` (guarded by `s_pool._mutex`), while `SearchThreadProc` scans +`_pbSearchMemoryPool[]` under `g_searchMemoryPoolMutex`. Both systems alias the same +physical scratch buffers (`_ppbDuplFragmentArr[i]` = `s_pool._pool[i]`), but neither +delegates to the other -- they are genuinely independent availability-tracking arrays. + +**Failure scenario (PLAUSIBLE):** If FASTA batch search and RTS search ever ran +concurrently in the same process, slot `i` could be claimed by `SearchThreadProc` via +`_pbSearchMemoryPool[i]` and simultaneously by `AcquirePoolSlot()` via +`s_pool._inUse[i]`, handing the same scratch buffer to two threads and silently +corrupting XCorr scores. The `TODO(Phase N)` comment at `CometSearch.cpp:31` +acknowledges the singleton design is not yet multi-instance safe. + +**Recommendation:** Route `SearchThreadProc` through `s_pool.acquireSlot()` / +`releaseSlot()` and remove `_pbSearchMemoryPool`, `g_searchMemoryPoolMutex`, and +`g_searchPoolCV` once all paths use the single `s_pool` authority. 
+ +--- + +### [C5] CometSearch -- dead RunSpecLibSearch(ThreadPool*) overload + +**File:** `CometSearch/CometSearch.cpp` (~line 1000), `CometSearch/CometSearch.h` (~line 94) + +The 1-argument overload `RunSpecLibSearch(ThreadPool* /*tp*/)` is declared and defined +but has no callers in the current codebase. Its body is a commented-out debug printf +followed by `return true`. The live path is the 4-argument overload +`RunSpecLibSearch(int, int, ThreadPool*, vector&)` called from +`SearchUtils.h::RunSearchAndPostAnalysis()`. + +**Failure scenario:** If any future code resolves a call with a single `ThreadPool*` +argument to this overload -- by mistake or through a partial refactor -- all speclib +scoring is silently skipped with no error. The two overloads are visually similar and +the compiler produces no diagnostic. + +**Fix:** Remove the dead 1-argument overload from both the `.h` declaration and the +`.cpp` definition. + +--- + +### [C6] FastaStrategy -- dead if-block in initialize() + +**File:** `CometSearch/search/FastaStrategy.cpp` +**Lines:** ~27-48 + +The block conditioned on `session.bPerformDatabaseSearch && sProteinLModsListFile.length() > 0` +in `FastaStrategy::initialize()` contains only a multi-line comment explaining why +nothing is done here (the filter is loaded before `makeStrategy()` is called). There +are no executable statements inside the block. + +**Failure scenario:** No runtime defect. The risk is a future developer placing +initialization code inside this block expecting it to execute, unaware that the comment +explains the work is already complete by the time `initialize()` is called. + +**Fix:** Remove the dead block entirely, or replace it with a one-line comment at the +top of `initialize()` stating the precondition. + +--- + +## 4. 
Actionable Improvements + +### [I1] PercolatorWriter -- inline filename construction should use BuildNames + +**File:** `CometSearch/output/PercolatorWriter.h` +**Lines:** ~28-35 + +`PercolatorWriter::open()` constructs its output filename using the same +`base + range + ".pin"` pattern as `IResultWriter::BuildNames()`, but does so inline +rather than calling the shared helper. It is the only concrete writer that does not call +`BuildNames()`. Any future change to naming conventions (e.g., a new suffix format or +CRUX conditional) must be applied in two places. + +**Fix:** Call `BuildNames(ctx, ".pin", ".decoy.pin", ".target.pin", _sPath, _sDecoyPath)` +and drop the local `base`/`range` variables, matching the pattern used by all other +writers. + +--- + +### [I2] IResultWriter::BuildNames -- extTargetCrux should default to nullptr + +**File:** `CometSearch/output/IResultWriter.h` +**Lines:** ~72-86 + +The `extTargetCrux` parameter of `BuildNames()` is unconditionally `(void)`-cast and +discarded in non-CRUX builds. All four call sites must pass a dummy string literal that +is silently ignored at compile time, leaking the CRUX/non-CRUX conditional into every +call site. + +**Improvement:** Add `= nullptr` as the default for `extTargetCrux`: +```cpp +static void BuildNames(const WriterOpenCtx& ctx, + const char* ext, + const char* extDecoy, + std::string& sTarget, + std::string& sDecoy, + const char* extTargetCrux = nullptr); +``` +Non-CRUX callers can then omit the argument entirely. diff --git a/docs/20260617_codereview1.md b/docs/20260617_codereview1.md new file mode 100644 index 00000000..a1c67f78 --- /dev/null +++ b/docs/20260617_codereview1.md @@ -0,0 +1,317 @@ +# Code Review: architecture_update branch (2026-06-17) + +## Scope + +Deep review of the `architecture_update` branch versus `master` (commit `c971a2dd`). 
+The diff covers the Strategy/Pipeline refactor: `ISearchStrategy` + `Pipeline` replace +the monolithic `CometSearchManager::DoSearch` per-file loop; `SearchSession` replaces the +batch-path globals `g_pvQuery` / `g_pvQueryMS1`; `SearchMemoryPool` encapsulates the +thread scratch-array pool; and a new `output/IResultWriter` layer wraps the existing +`CometWrite*` classes. + +Review method: 8 parallel finder angles (line-by-line diff scan, removed-behavior audit, +cross-file tracer, reuse, simplification, efficiency, altitude, conventions), each +surfacing up to 6 candidates, followed by a 1-vote verification pass on the strongest +findings. + +--- + +## 1. Summary + +The refactor successfully decouples per-batch mutable state from process-wide globals and +introduces a clean strategy/pipeline separation. The CometWrapper layer is fully insulated +(all calls go through the unchanged `ICometSearchManager` vtable). However, three +correctness bugs were introduced -- two silent data-corruption paths in hand-written +`operator=` overloads, and one functional regression that drops the batch MS1 spectral- +library search path entirely. + +--- + +## 2. Critical Issues + +### 2a. Batch MS1 speclib search silently dead (functional regression) + +**File:** `CometSearch/search/SearchUtils.h:283` + +`RunSearchAndPostAnalysis` (the shared batch body called by all three strategies) invokes +`CometSearch::RunSearch` and `CometSearch::RunSpecLibSearch` but never calls +`CometSearch::RunMS1Search(ThreadPool*, ...)`. Separately, `CometPreprocess:: +PreprocessMS1SingleSpectrum(session&)` -- the only function that populates +`session.ms1Queries` -- has zero callers in any strategy or pipeline code path. + +Result: a batch run with `bPerformSpecLibSearch = true` produces no MS1 spectral-library +matches and emits no error or warning. The MS1 speclib batch path was present in +`CometSearchManager::DoSearch` on `master` and is now dead code. 
+ +**Fix:** wire `PreprocessMS1SingleSpectrum(session)` and `RunMS1Search(tp, ..., +session.ms1Queries)` into `RunSearchAndPostAnalysis` when `session.bPerformSpecLibSearch` +is true, mirroring the MS2 speclib path already present. + +**Status (2026-06-17):** Investigation confirmed this was not a regression -- the batch +MS1 speclib path (`PreprocessMS1SingleSpectrum` / `RunMS1Search(ThreadPool*,...)`) had zero +callers on `master` as well. Two partial fixes applied: (1) `Pipeline::cleanupBatch` lambda +now also deletes and clears `session.ms1Queries` so any future wiring will not leak; (2) a +TODO comment at `SearchUtils.h:287` documents the RT-range parameters required before the +batch MS1 path can be wired in. + +--- + +### 2b. VarModParams::operator= drops two fields -- protein-filter var-mod searches silently broken + +**File:** `CometSearch/core/Params.h:273` + +`VarModParams::operator=` (called via `StaticParams::operator=` line 451) assigns every +field except `sProteinLModsListFile` (std::string) and `mmapProteinModsList` +(multimap). After any `StaticParams` copy, `bVarModProteinFilter` is true but +`mmapProteinModsList` is empty, so the filter silently matches every protein and the +restriction is ignored. + +`Options::operator=` (line 98) has the same structural problem: `iSpecLibMSLevel` (int, +declared line 48) is never assigned. After copy, the speclib MS-level filter uses +whatever value was already in the destination. + +**Root cause:** All five hand-written `operator=` bodies in `Params.h` (`Options`, +`DBInfo`, `StaticMod`, `PrecalcMasses`, `VarModParams`) copy fields one by one and have +drifted from their struct declarations. The compiler-synthesised `operator=` would copy +all members correctly for free -- every member is a trivially-copyable scalar, a fixed +array of scalars, or a `std::string` / `std::vector` / `std::multimap` with correct copy +semantics. 
+ +**Fix:** Delete all five hand-written `operator=` definitions and rely on the compiler- +generated versions. If explicit copy control is needed for a specific reason, add a +static_assert or a comment naming that reason. + +**Status (2026-06-17):** Fixed. All nine hand-written `operator=` bodies in `Params.h` +replaced with `= default` (correct `const Type&` signature). The full scope was larger +than initially identified -- beyond the five listed above, `MassUtil`, `ToleranceParams`, +`IonInfo`, and `StaticParams` had the same drift bug. `StaticParams::operator=` was missing +`peffInfo`, `iDbType`, `sDecoyPrefix` (string), `bSkipToStartScan`, and `tRealTimeStart`. +Build verified clean after replacement. + +--- + +### 2c. SearchThreadProc has no RAII guard for the pool slot -- bad_alloc during index build causes 240-second deadlock + +**File:** `CometSearch/CometSearch.cpp:1253` + +```cpp +int i = AcquirePoolSlot(); +// ... +CometSearch* sqSearch = new CometSearch(); +sqSearch->DoSearch(...); +delete sqSearch; +s_pool.releaseSlot(i); // never reached if DoSearch throws +``` + +`DoSearch` contains two re-throwing `catch` blocks (lines ~3563 and ~7558) inside +`g_pvDBIndex.push_back()` failure paths, reachable when `bCreateFragmentIndex` or +`bCreatePeptideIndex` is set. If the system OOMs mid index-build, the exception propagates +past `releaseSlot`. The old `SearchThreadData::~SearchThreadData` released the slot +unconditionally; that safety net was removed in this diff. + +**Fix:** Wrap the slot in a simple RAII guard: + +```cpp +struct SlotGuard { + int slot; + ~SlotGuard() { if (slot >= 0) s_pool.releaseSlot(slot); } +}; +SlotGuard guard{i}; +``` + +**Status (2026-06-17):** Fixed. Local `SlotGuard` struct added to `SearchThreadProc` +immediately after the slot is acquired. The explicit `s_pool.releaseSlot(i)` call was +removed; the guard destructor handles release on both normal exit and exception unwind. + +--- + +## 3. 
Code Quality & Maintainability + +### 3a. FusedLoadAndSearchSpectra batch-size check fires early + +**File:** `CometSearch/CometPreprocess.cpp:3362` + +`iNumSpectraLoaded` is incremented when a spectrum is pushed onto the bounded queue +(before any consumer thread processes it). `CheckExit` fires when +`iNumSpectraLoaded >= iSpectrumBatchSize`. With a queue depth of `iNumThreads * 4`, the +read loop can stop up to `iNumThreads * 4` entries before the configured batch size is +actually searched. + +The non-fused `LoadAndPreprocessSpectra` path sets `iNumSpectraLoaded = +session.queries.size()` (post-preprocessing count), so the two paths have different +batch-size semantics. Users relying on `spectrum_batch_size` for memory control in FI_DB +mode will observe smaller-than-configured batches. + +**Status (2026-06-17):** Fixed. Removed the local `iNumSpectraLoaded` variable and its +queue-push increment from `FusedLoadAndSearchSpectra`. The `CheckExit` call (which already +holds `session.queriesMutex`) now passes `(int)session.queries.size()` directly, matching +the non-fused path semantics: the count reflects spectra that have been fully preprocessed +and stored in `session.queries`. + +### 3b. SearchThreadData::pQueries latent null deref + +**File:** `CometSearch/CometSearch.h:43` / `CometSearch/CometSearch.cpp:1269` + +Both `SearchThreadData` constructors initialise `pQueries = nullptr`. `SearchThreadProc` +dereferences it at line 1269 with no null check. All current callers correctly set +`pQueries = &queries` before dispatching, but the type provides no enforcement. A future +dispatch path that forgets the assignment will crash inside a thread with no useful +diagnostic. + +**Fix:** make `pQueries` a required constructor parameter (remove the default-null +initialiser) or add an assert before the dereference. + +**Status (2026-06-17):** Fixed. Removed the no-arg `= default` constructor (unused). 
`pQueries` is now a required second parameter of the `sDBEntry` constructor: +`SearchThreadData(const sDBEntry&, const vector<Query*>*)`. The one call site in +`RunSearch` updated to `new SearchThreadData(dbe, &queries)`, eliminating the +post-construction assignment step. + +### 3c. Pipeline::cleanupBatch skips session.ms1Queries + +**File:** `CometSearch/search/Pipeline.cpp:136` + +The `cleanupBatch` lambda deletes and clears `session.queries` but never touches +`session.ms1Queries`. Currently `session.ms1Queries` is never populated (see 2a above), +so there is no active leak. If batch MS1 search is re-wired, every batch will leak its +`QueryMS1*` objects across all batches and all input files. + +**Status (2026-06-17):** Fixed as part of 2a. `cleanupBatch` now also iterates and +deletes `session.ms1Queries` and calls `session.ms1Queries.clear()`. + +### 3d. session.params member is vestigial + +**File:** `CometSearch/search/SearchSession.h:48` + +`SearchSession` carries `const StaticParams& params` that is never read by any caller. +Every strategy, pipeline, and utility accesses `g_staticParams` directly. The member +implies an in-progress migration that has not started, misleading future readers. + +**Status (2026-06-17):** Fixed. `const StaticParams& params` member and the accompanying +comment removed from `SearchSession`. Constructor simplified to +`explicit SearchSession(CometStatus& st)`. The one construction site in +`CometSearchManager.cpp` updated accordingly. + +### 3e. Non-ASCII characters in SearchSession.h + +**File:** `CometSearch/search/SearchSession.h:20,21,47` + +Lines 20 (U+2026 HORIZONTAL ELLIPSIS) and 21, 47 (U+2014 EM DASH) are UTF-8 multi-byte +sequences. CLAUDE.md rule: "No non-ASCII characters allowed in the code or documentation." +All other new files are pure ASCII. Replace with ASCII equivalents (`...` and `--`). + +**Status (2026-06-17):** Fixed. The EM DASH on old line 47 was removed along with the +vestigial `params` member (3d). 
The HORIZONTAL ELLIPSIS on line 20 replaced with `...` +and the EM DASH on line 21 replaced with `--`. Verified with `grep -P "[^\x00-\x7F]"`: +no non-ASCII bytes remain. + +### 3f. Trailing whitespace in Params.h + +**File:** `CometSearch/core/Params.h:154,155,257` + +Line 154 has 4 trailing spaces, line 155 has 2 trailing spaces and a stray space before +the semicolon (`iFragIndexMinIonsReport ;`), and line 257 has 1 trailing space. CLAUDE.md +rule: "No trailing whitespace." + +**Status (2026-06-17):** Fixed. The stray space before the semicolon and the two lines of +trailing spaces on old lines 154-155 were eliminated when the hand-written `operator=` +bodies were replaced with `= default` (issue 2b), which removed those lines entirely. The +remaining trailing space on old line 257 (`bVarProteinCTermMod` declaration, now line 162) +was stripped directly. Verified with `grep -P "[\t ][\r]?$"`: no trailing whitespace +remains. + +--- + +## 4. Actionable Improvements + +### 4a. Delete hand-written operator= in Params.h + +Replace all five with `= default` or remove them entirely: + +```cpp +// Before (drift-prone): +Options& operator=(Options& a) { iNumPeptideOutputLines = a.iNumPeptideOutputLines; ... } + +// After: +Options& operator=(const Options&) = default; +``` + +If the non-const signature `operator=(Options& a)` was intentional (e.g., to allow +modification of the source), document why; otherwise make it `const Options&`. + +**Status (2026-06-17):** Done as part of issue 2b. All nine hand-written `operator=` +bodies (not just the five originally identified) were replaced with `= default` using the +correct `const Type&` signature. + +### 4b. Move RunSearchAndPostAnalysis out of SearchUtils.h + +**File:** `CometSearch/search/SearchUtils.h:244` + +`SearchUtils.h` is included by 5 translation units and contains 65-line non-trivial +functions marked `inline static`. Each TU gets its own copy. 
Move `RunSearchAndPostAnalysis`, +`AllocateResultsMem`, and `UpdateInputFile` into a `SearchUtils.cpp` and keep only +declarations in the header. The three small comparator helpers +(`compareByPeptideMass`, etc.) are genuinely inline-worthy and can stay. + +**Status (2026-06-17):** Done. Created `CometSearch/search/SearchUtils.cpp` containing the +definitions of `UpdateInputFile`, `SetMSLevelFilter`, `AllocateResultsMem`, and +`RunSearchAndPostAnalysis`. `GetInputType` became a `static` helper in that .cpp (not +exported). `SearchUtils.h` now contains only declarations plus the three inline comparators; +added self-contained includes (`MSReader.h`, `SearchSession.h`) so the header compiles +standalone. `search/SearchUtils` added to `SEARCH_SRC` in the Makefile and +`search\SearchUtils.cpp` added to `CometSearch.vcxproj`. + +### 4c. Factor out the shared legacy batch body in FiStrategy and FastaStrategy + +**Files:** `CometSearch/search/FiStrategy.cpp:147`, `CometSearch/search/FastaStrategy.cpp:60` + +The two "legacy three-sweep" paths (`LoadAndPreprocess` -> `AllocateResultsMem` -> +`RunSearchAndPostAnalysis`) are structurally identical except for a verbosity flag. The +difference is already encoded in the `bLogPrePostAnalysis` parameter that `RunSearchAndPostAnalysis` +accepts. Extract a shared free function: + +```cpp +bool executeBatchLegacy(MSToolkit::MSReader& mstReader, int iFirstScan, int iLastScan, + int iAnalysisType, int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session, bool bVerbose); +``` + +**Status (2026-06-17):** Done. `executeBatchLegacy` added to `SearchUtils.cpp` / +declared in `SearchUtils.h`. The `bVerbose` flag controls the three per-strategy +differences: the "Load spectra:" console log before loading, the spectra-count +`logout` after allocation, and whether to pass `bLogPrePostAnalysis=true` to +`RunSearchAndPostAnalysis`. 
All three strategy `executeBatch` bodies replaced with a +single call; this covered `PiStrategy` as well (not mentioned in the original finding +but structurally identical to the `FiStrategy` non-fused path). + +### 4d. Fix iNumSpectraLoaded semantics in FusedLoadAndSearchSpectra + +**File:** `CometSearch/CometPreprocess.cpp:3362` + +Either (a) increment `iNumSpectraLoaded` inside `FusedSearchSpectrum` after a spectrum +completes preprocessing (requires an atomic counter shared with the worker lambdas), or +(b) document that the fused-path batch size is approximate (+/- queue depth) and update +any user-facing documentation for `spectrum_batch_size` accordingly. + +**Status (2026-06-17):** Done as part of 3a (option a). The local `iNumSpectraLoaded` +variable and its queue-push increment were removed entirely. `CheckExit` now receives +`(int)session.queries.size()` directly under the already-held `queriesMutex`, which counts +only spectra that have been fully preprocessed -- the same semantics as the non-fused path. + +--- + +## Appendix: Findings Not Requiring Code Changes + +- **CometWrapper isolation confirmed**: all CometWrapper calls go through the + `ICometSearchManager` vtable; no internal signature changes propagate to the wrapper + layer. +- **s_pool singleton (TODO acknowledged)**: the file-static `SearchMemoryPool s_pool` in + `CometSearch.cpp` prevents multiple concurrent RTS instances. The TODO comment at line + 30 correctly identifies this. No concurrent RTS path currently invokes the batch pool, + so this is a known deferred item, not a regression. +- **FiStrategy::finalize() redundant iDbType check**: the `if (g_staticParams.iDbType == + DbType::FI_DB)` guard is always true when called by the pipeline (which selected + FiStrategy precisely because iDbType == FI_DB). Harmless today. 
+- **Redundant #include lines in CometSearchManager.cpp**: the five `CometWrite*.h` + includes at lines 21-25 are already pulled in transitively by the new `output/*Writer.h` + includes. Dead includes, no functional impact. diff --git a/docs/20260617_codereview2.md b/docs/20260617_codereview2.md new file mode 100644 index 00000000..9908b224 --- /dev/null +++ b/docs/20260617_codereview2.md @@ -0,0 +1,186 @@ +Code Review: architecture_update branch, uncommitted working-tree diff (2026-06-17) +===================================================================================== + +Scope +----- +Reviewed the current uncommitted changes on top of commit c971a2dd (13 modified +files + 1 new file, +58/-573 lines). This diff implements the fix pass for the +findings recorded earlier today in docs/20260617_codereview.md: replacing +hand-written `operator=` bodies in Params.h with `= default`, adding a SlotGuard +RAII wrapper in SearchThreadProc, fixing the FusedLoadAndSearchSpectra batch-size +check, extracting SearchUtils.h's non-trivial functions into a new SearchUtils.cpp, +and factoring the three strategies' batch bodies into a shared executeBatchLegacy +helper. + +Method: verified each "Status: Fixed" claim against the actual diff line-by-line, +rebuilt from clean (`make cclean && make`), ran the full unit suite, checked CRLF / +non-ASCII / trailing-whitespace compliance per CLAUDE.md, then searched for +structurally identical instances of the bug pattern that was just fixed. + +--- + +1. 
Summary +---------- +All six fixes claimed in docs/20260617_codereview.md are present in the diff and +verified correct: the `operator=` replacements are safe (every member of every +affected struct is a value type with correct default-copy semantics, no owning raw +pointers), the SlotGuard correctly releases the pool slot on exceptional exit, the +batch-size counter now reflects processed rather than queued spectra, and the +SearchUtils split / executeBatchLegacy extraction preserve behavior exactly +(FastaStrategy keeps `bVerbose=true`, FiStrategy/PiStrategy keep `bVerbose=false`, +matching their pre-diff behavior). Clean rebuild produces zero warnings; all 17 unit +tests pass. One gap was found: the SlotGuard fix addressed only one of five call +sites that share the identical acquire-slot/run/release-slot pattern, leaving the +production batch-FI hot path and the RTS single-spectrum path exposed to the same +240-second slot-leak hazard the fix was written to close. + +**Status (2026-06-17): all items closed.** The critical issue (2a) and both +actionable improvements (4a, 4b) have been fixed -- see per-item status notes below. +Rebuilt clean (`make cclean && make`, zero warnings) and re-ran the full unit suite +(17 passed, 0 failed, 0 skipped) after the fix. + +--- + +2. Critical Issues +------------------- + +### 2a. SlotGuard fix is incomplete -- four sibling call sites still leak the pool + slot on exception + +**Files:** `CometSearch/CometSearch.cpp:128, 170, 214, 266` + +The diff adds a `SlotGuard` RAII wrapper around the one call site in +`SearchThreadProc` (line ~1263) so `s_pool.releaseSlot()` fires even if `DoSearch` +throws. 
The same bare `AcquirePoolSlot() -> run -> s_pool.releaseSlot()` pattern, +with no guard, exists at four other sites that were not touched: + +- `CometSearch::RunSearch(Query*)` line 128 (RTS thread-local FI search -- the + documented concurrent RTS path in CLAUDE.md) +- same function, line 170 (RTS thread-local PI search) +- `CometSearch::RunSearch(ThreadPool*, vector&)` line 214 (single-query FI + fallback) +- `CometSearch::RunSearch(int, int, ThreadPool*, vector&)` line 266 -- inside + a per-query lambda dispatched to the thread pool; this is the production batch-FI + search hot path, executed once per query in every FI_DB batch + +`SearchFragmentIndex` (called at all four sites) builds a +`std::unordered_map`, a `std::vector>` via +`push_back`, and calls `std::sort` -- all of which can throw `std::bad_alloc` under +memory pressure, the same failure mode that motivated the original fix. +`SearchPeptideIndex` (lines 170, 244) has equivalent allocations. + +If any of these throw, the slot is never released. `SearchMemoryPool::acquireSlot()` +(threading/SearchMemoryPool.cpp:76) then blocks every subsequent caller for up to +240 seconds (the same symptom described for the issue that was just fixed) before +giving up and returning -1. For the RTS single-spectrum path this directly +contradicts the threading-model guarantee in CLAUDE.md that the RTS path stays +responsive; for the batch FI path it can stall an entire search batch. + +**Fix:** lift `SlotGuard` out of `SearchThreadProc` into a shared location (e.g. +`SearchMemoryPool.h`, since `s_pool` already lives in that translation unit) and +apply it at all five acquire/release sites, or wrap the post-acquire body of each +site in a `try { ... } catch (...) { s_pool.releaseSlot(slot); throw; }`. Since this +is the same author and same diff that recognized and fixed the pattern once, doing +it everywhere now is cheap; finding the next instance after a production stall is +not. + +**Status (2026-06-17):** Fixed. 
Added `SearchMemoryPoolSlotGuard` to +`threading/SearchMemoryPool.h` (a small RAII struct holding a `SearchMemoryPool&` +and the slot index, releasing in its destructor) and applied it at all five +acquire/release sites in `CometSearch.cpp`: the two thread-local RTS overloads +(`RunSearch(Query*)`, FI and PI branches), the single-query FI fallback +(`RunSearch(ThreadPool*, vector<Query*>&)`), the batch-FI per-query lambda +(`RunSearch(int, int, ThreadPool*, vector<Query*>&)`), and the original +`SearchThreadProc` site (whose function-local `SlotGuard` struct was removed in +favor of the shared one). All five bare `s_pool.releaseSlot(...)` calls following a +search body were removed; the guard now owns release in every case, including +exception unwind. + +--- + +3. Code Quality & Maintainability +---------------------------------- + +Nothing new beyond what docs/20260617_codereview.md already recorded and the diff +already fixed. No trailing whitespace, no non-ASCII characters, and CRLF line +endings are correct in every changed/added line (verified with `file` and +`grep -P "[^\x00-\x7F]"` / `grep -P "[\t ][\r]?$"` restricted to lines actually +touched by this diff -- the unrelated pre-existing trailing-whitespace lines found +elsewhere in CometSearch.cpp/CometPreprocess.cpp/CometSearch.h/CometSearchManager.cpp +are untouched by this diff and out of scope). + +--- + +4. Actionable Improvements +---------------------------- + +### 4a. Share one SlotGuard definition instead of risking drift + +`SlotGuard` is currently a function-local struct defined only inside +`SearchThreadProc`. Move it next to `SearchMemoryPool` (e.g.
as a nested type or a +free struct in `threading/SearchMemoryPool.h`) so the four other call sites in 2a +can reuse it directly: + +```cpp +// threading/SearchMemoryPool.h +struct SearchMemoryPoolSlotGuard +{ + SearchMemoryPool& pool; + int slot; + ~SearchMemoryPoolSlotGuard() { if (slot >= 0) pool.releaseSlot(slot); } +}; +``` + +```cpp +int iSlot = AcquirePoolSlot(); +if (iSlot < 0) { logerr(...); return false; } +SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; +SearchFragmentIndex(pQuery, _ppbDuplFragmentArr[iSlot]); +``` + +**Status (2026-06-17):** Done as part of fixing 2a -- `SearchMemoryPoolSlotGuard` was +added to `threading/SearchMemoryPool.h` exactly as proposed and is now the only +release mechanism used anywhere in `CometSearch.cpp`. + +### 4b. Batch-FI lambda swallows AcquirePoolSlot failure + +**File:** `CometSearch/CometSearch.cpp:258-266` (pre-existing, not introduced by +this diff, surfaced while tracing 2a) + +When `AcquirePoolSlot()` returns -1 inside the per-query lambda, the lambda logs and +returns, but `RunSearch`'s `bSucceeded` is never set to `false` -- the query is +silently dropped from the batch with no caller-visible failure. Not in scope for +this diff's fix pass, but worth a follow-up ticket since it compounds 2a (a slot +leaked by one query makes the next query's acquire more likely to time out and be +silently dropped too). + +**Status (2026-06-17):** Fixed. Added a `std::atomic<bool> bAllSlotsAcquired(true)` +captured by reference in the per-query lambda; on `AcquirePoolSlot() < 0` the lambda +now sets it `false` (in addition to the existing `logerr`) instead of just +returning. After `wait_on_threads()`, `RunSearch` checks the flag and, if any query +failed to acquire a slot, calls `g_cometStatus.SetStatus(CometResult_Failed, ...)` +and sets `bSucceeded = false` before returning, making the failure visible to the +caller instead of silently dropping the affected queries from the batch.
+ +--- + +Appendix: Verified, no changes needed +---------------------------------------- +- `Options`/`DBInfo`/`StaticMod`/`PrecalcMasses`/`VarModParams`/`MassUtil`/ + `ToleranceParams`/`IonInfo`/`StaticParams` `operator= = default`: every member of + every struct is a value type (POD scalar, fixed array of scalars, `std::string`, + `std::vector`, `std::multimap`, `std::chrono::time_point`) -- no owning raw + pointers anywhere in `Params.h`, so compiler-generated copy is correct and copies + every field, closing the original drift bug for good rather than just patching the + fields named in the original finding. +- `executeBatchLegacy` / `SearchUtils.cpp` extraction: behavior-preserving: verbose + flag wiring matches each strategy's pre-diff console output exactly; locking around + `CheckExit`'s new `session.queries.size()` argument is consistent with the existing + `queriesMutex` discipline at the push site (CometPreprocess.cpp:3236). +- `Pipeline::cleanupBatch` now also drains `session.ms1Queries` -- consistent with + the dead/not-yet-wired batch MS1 path noted in the prior review; no active leak + today, but correct hygiene if that path is wired in later. +- Build (pre-fix and post-fix): `make cclean && make -j20` from a clean tree -- + zero warnings both times. +- Tests (pre-fix and post-fix): `python3 tests/unit/run_tests.py --comet + ./comet.exe` -- 17 passed, 0 failed, 0 skipped both times. diff --git a/docs/20260617_codereview3.md b/docs/20260617_codereview3.md new file mode 100644 index 00000000..b524cc62 --- /dev/null +++ b/docs/20260617_codereview3.md @@ -0,0 +1,367 @@ +# Code Review: architecture_update branch (2026-06-17) -- independent pass + +## Scope + +Independent review of the `architecture_update` branch versus `master`, at branch tip +commit `0e10e71f` (74 files changed, +5,936/-3,275 lines). 
Performed without reference +to the same-day reviews in `docs/20260617_codereview.md` and `docs/20260617_codereview2.md`, +per request, as a second independent pass over the Strategy/Pipeline refactor: +`ISearchStrategy` (`FiStrategy` / `PiStrategy` / `FastaStrategy`) + `Pipeline` replacing +the monolithic `CometSearchManager::DoSearch` per-file loop, `SearchSession` replacing +the batch-path globals, `SearchMemoryPool` with RAII slot guards, and a new +`output/IResultWriter` layer wrapping the existing `CometWrite*` classes. + +Method: clean rebuild (`make cclean && make -j$(nproc)`) with a warning scan; full unit ++ integration test run (19/19 passed, including the T18 byte-identical determinism +check); manual line-by-line trace of the ~1,232 lines removed from +`CometSearchManager.cpp` against their new homes in `SearchUtils.cpp` / the strategy +classes to confirm behavior was preserved; targeted reads of `SearchMemoryPool`, +`Pipeline`, `SearchSession`, all three strategies, all five `IResultWriter` +implementations, and `core/Params.h` / `Types.h` / `Constants.h`. + +--- + +## 1. Summary + +Build is clean under `-Wall -Wextra` (zero warnings) and all 19 tests pass. The +extraction of `DoSearch`'s per-file loop into `Pipeline` + strategy classes is largely +faithful -- the diff was traced line-by-line and the removed logic reappears intact in +`SearchUtils.cpp` and the three strategy `.cpp` files, including the per-batch writer +open/write/close lifecycle and the FASTA/idx file-handle handling. The latest commit's +exception-safety fix (`SearchMemoryPoolSlotGuard` applied at all five +acquire/release sites) is correctly done. One concrete correctness regression was found: +reordering AScore initialization relative to fragment-index loading silently breaks +AScorePro phosphosite scoring for batch FI_DB searches. A few maintainability gaps in +the new abstraction are also worth hardening before this lands on `master`. 
+ +**Status (2026-06-17): all items closed, plus one additional critical bug (2b) found +during live testing after this review.** Issue 2a, 2b, all of section 3, and all of +section 4 have been fixed -- see the per-item status notes below, including two new +regression tests (`t19`, `t20`) each verified to fail against its respective pre-fix +code and pass against the fix. Rebuilt clean (`make cclean && make -j$(nproc)`, zero +warnings) and re-ran the full unit + integration suite (21 passed, 0 failed, 0 skipped) +after the final round of fixes. + +--- + +## 2. Critical Issues + +### 2a. AScorePro configured with stale variable-mod data for batch FI_DB searches + +**Files:** `CometSearch/CometSearchManager.cpp:2110-2119` (new AScore-init call site) +vs. `CometSearch/search/FiStrategy.cpp:67-83` (index load, now run afterward) + +`SetAScoreOptions(g_AScoreOptions)` is now called once, unconditionally, near the top +of `DoSearch()` -- *before* `Pipeline::run()` constructs and initializes the strategy. +For `FI_DB` (fragment-index) searches, `FiStrategy::initialize()` subsequently calls +`CometFragmentIndex::ReadPlainPeptideIndex()`, which **overwrites** +`g_staticParams.variableModParameters.varModList[].dVarModMass / szVarModChar / +dNeutralLoss` from the `.idx` file's `VariableMod:` header line +(`CometFragmentIndex.cpp:1276-1310`). `SetAScoreOptions()` reads exactly those fields +(`CometSearchManager.cpp:3225-3258`) to build the AScore differential-mod list. + +Pre-refactor, this was correctly sequenced: the diff shows the old code ran +`ReadPlainPeptideIndex()` / `CreateFragmentIndex()` *first*, then `SetAScoreOptions()` +second, inside the per-file loop guarded by a `bPerformAScoreInitialization` flag. 
The +RTS path (`InitializeSingleSpectrumSearch`, `CometSearchManager.cpp:2268-2287`) still +gets this right and even carries a comment explaining why: *"normally set at end of +InitializeStaticParams; must do here again after ReadPlainPeptideIndex for single +spectrum search."* The same re-sync was not preserved for the batch path after the +refactor. + +`PI_DB` is not affected: `CometSearch::SearchPeptideIndex` (`CometSearch.cpp:1880-1903`) +lazily re-parses the index header and re-calls `SetAScoreOptions` on first invocation, +guarded by `g_bPeptideIndexRead`, so it self-heals. `FI_DB` has no equivalent internal +correction. + +**Impact:** any batch search against a prebuilt fragment index with +`print_ascorepro_score` enabled will configure AScorePro using whatever variable-mod +values happened to already be in `g_staticParams` *before* the index header was parsed +-- commonly empty/default, since FI_DB search-time params files don't need to redeclare +variable mods (they're embedded in the index). AScore site-localization scores would +silently be computed against the wrong (or no) differential mod, with no error raised. + +**Fix:** move the `SetAScoreOptions` / `CreateAScoreDllInterface` block in `DoSearch()` +to after the strategy's `initialize()` has run, e.g.: + +```cpp +// CometSearchManager.cpp, after strategy selection +if (!pStrategy->initialize(session, tp)) { pStrategy->finalize(); return false; } +if (g_staticParams.options.iPrintAScoreProScore) +{ + SetAScoreOptions(g_AScoreOptions); + g_AScoreInterface = CreateAScoreDllInterface(); + if (!g_AScoreInterface) { std::cerr << "Failed to create AScore interface." << std::endl; return false; } +} +``` + +This also avoids creating an AScore interface for an `FI_DB` run whose index fails to +load. + +**Status (2026-06-17):** Fixed.
The AScore init/teardown block was moved out of +`CometSearchManager::DoSearch()` and into `Pipeline::run()` +(`CometSearch/search/Pipeline.cpp`), rather than patched in place, so the fix covers every strategy through +one call site instead of duplicating the re-sync logic per strategy (see Actionable +Improvement 4b, also closed by this change). `SetAScoreOptions()` / +`CreateAScoreDllInterface()` now run immediately after `_strategy->initialize(session, +&tp)` succeeds -- i.e. after `FiStrategy::initialize()` has already called +`ReadPlainPeptideIndex()` for FI_DB runs -- and `DeleteAScoreDllInterface()` now runs +right after `_strategy->finalize()` at the end of `run()`, matching the original +unconditional teardown. A failure to create the AScore interface now also calls +`_strategy->finalize()` before returning, so the strategy's allocated memory pools are +not leaked on that error path (a small improvement over the pre-fix code, which +returned without finalizing on this same error). Verified with a clean rebuild (zero +warnings) and the full 19-test unit + integration suite. + +--- + +### 2b. Batch PI_DB search crashes on the first scored candidate (`_pQueries` never assigned) + +**File:** `CometSearch/CometSearch.cpp:1862` (`SearchPeptideIndex(ThreadPool*, vector<Query*>&)`) + +**Discovered:** 2026-06-17, reported against the VS-built Windows binary running a real +peptide-index (`-j`) search via WSL interop: the process printed +`- searching "" ...` and then exited with no further output, no error message, +and no result file -- a silent crash, not a hang. + +`CometSearch::BinarySearchMass()` and the `AnalyzePeptideIndex(int iWhichQuery, ...)` +overload read the active query list through a `CometSearch` member, `_pQueries`, +rather than a parameter. `CometSearch::DoSearch()` (the FASTA path) sets +`_pQueries = &queries;` at entry for exactly this reason.
The architecture refactor +changed `BinarySearchMass()` from reading the old global `g_pvQuery` directly to +reading it through `_pQueries`, and updated `DoSearch()` accordingly, but +`SearchPeptideIndex(ThreadPool*, vector&)` -- the PI_DB batch path, called from +a freshly constructed `CometSearch* sqSearch = new CometSearch();` in +`CometSearch::RunSearch(int, int, ThreadPool*, vector&)` -- was never updated +to set `_pQueries`. It stayed `nullptr` (the class's default member initializer), and +the first call into `BinarySearchMass()` dereferenced it, segfaulting before any +output was written. + +**Reproduced locally** with a minimal fixture (T19's phospho peptide/spectrum, built as +a PI_DB index instead of FI_DB) and confirmed via `gdb` backtrace: + +``` +#0 CometSearch::BinarySearchMass(int, int, double) const +#1 CometSearch::SearchPeptideIndex(ThreadPool*, vector&) +#2 CometSearch::RunSearch(int, int, ThreadPool*, vector&) +#3 RunSearchAndPostAnalysis(int, int, ThreadPool*, SearchSession&, bool) +#4 Pipeline::run(SearchSession&, vector const&, ThreadPool&) +#5 CometSearchManager::DoSearch() +``` + +matching the reported symptom exactly: the crash happens after the `"- searching ..."` +progress print and before any batch completes. + +**Fix:** added `_pQueries = &queries;` at the top of +`SearchPeptideIndex(ThreadPool*, vector&)`, mirroring `DoSearch()`. + +**Status (2026-06-17):** Fixed and empirically validated both directions, not just +inspected. With the fix: a PI_DB search of the fixture completes and scores correctly +(`xcorr=3.4260`, `ascorepro=330.7289`, phospho correctly localized to position 7). +Then `git stash`-reverted just this one-line fix, rebuilt, and re-ran the same search: +it reproduced the identical segfault inside `BinarySearchMass`, confirming the fix is +both necessary and sufficient. Restored the fix and confirmed the full test suite +(21 tests, including the two new ones below) passes cleanly with zero build warnings. 
+ +Added two regression tests to `tests/unit/run_tests.py`: +- **t19** (already added for issue 2a) continues to cover the FI_DB AScore-ordering fix. +- **t20** (new) reuses T19's phospho fixture but builds a PI_DB (`-j`) index instead of + an FI_DB (`-i`) index, then runs the same search and asserts it exits cleanly (rc=0) + and produces the correct PSM. Verified to fail (non-zero exit from the crash) against + the pre-fix code and pass against the fix, the same way 2a's test was validated. + +--- + +## 3. Code Quality & Maintainability + +### 3a. Pipeline relies on an undocumented "close() is always safe on an unopened writer" contract + +**File:** `CometSearch/search/Pipeline.cpp:104-118` + +When a writer's `open()` fails partway through the writer list, `close(false, false)` +is called on *every* writer, including ones whose `open()` was never reached. This only +works because every concrete `IResultWriter` happens to null-check its file handle +first in `close()`. The invariant is real and currently upheld by all five writers, but +it is not stated anywhere in `IResultWriter.h`; a future writer that forgets the +null-check will crash on a partial-open failure with no compiler or test signal. + +**Fix:** add a one-line contract comment above `IResultWriter::close()` stating that +`close()` must be safe to call even if `open()` was never called or failed. + +**Status (2026-06-17):** Fixed. Added a contract note to `IResultWriter::close()` in +`CometSearch/output/IResultWriter.h` stating that implementations must be safe to call +even when `open()` was never invoked or returned false, and explaining why +(`Pipeline::run()` calls `close(false, false)` on every writer in the vector, including +ones after the one whose `open()` failed). No behavior change; all five existing +writers already satisfy the contract. + +### 3b. 
Stale "Phase 5" migration note in SearchSession.h + +**File:** `CometSearch/search/SearchSession.h:23-28` + +The header still says *"g_pvQueryMutex, g_bPlainPeptideIndexRead, and g_bSpecLibRead +remain as globals... Full removal is deferred to Phase 5."* Per +`docs/20260612_architecture_migration.md`'s own phase numbering, Phase 5 (Pipeline/Strategy) is +the work already present in this branch. The comment now reads as an open TODO with no +tracked follow-up. Either the deferral is permanent (the RTS path will never adopt +`SearchSession`) and the comment should say so plainly, or there is real follow-up work +that should be filed somewhere visible instead of living only in a header comment. + +**Status (2026-06-17):** Fixed -- closed as part of fixing Actionable Improvement 4d, +which addressed this same `SearchSession.h:23-28` comment block ("state plainly" branch +chosen there). See 4d's status note for the detail; recorded separately here only +because this finding and 4d originally described the same fix as two different +write-ups (a critique and its corresponding improvement) rather than one item. + +### 3c. `isIndexBased()` conflates two unrelated concerns + +**File:** `CometSearch/search/ISearchStrategy.h:70-72`, used throughout +`CometSearch/search/Pipeline.cpp` + +`Pipeline::run()` branches on `_strategy->isIndexBased()` both to decide whether to +print the FASTA-style "Search start:" banner / per-spectrum verbose logging *and* to +decide whether to print the index-style "searching... done" progress line. These are +really one decision ("which strategy is this") wearing the trappings of two +unrelated questions (whether reading the database needs an index, and which console +output style to use). Today the mapping happens to be 1:1, so it costs nothing, but a +fourth strategy with index-based storage and FASTA-style verbose logging (or vice +versa) would have no way to express that without a behavior change at every call site.
+Not urgent, but worth a `progressStyle()`-type accessor if a fourth strategy is ever +added. + +**Status (2026-06-17):** Addressed, narrower than originally framed. Re-checked every +call site of `isIndexBased()` (`grep` across `CometSearch/`): all nine are in +`Pipeline.cpp`, and every one of them is purely a console-output style switch (verbose +FASTA banners vs. the compact index-style progress line, including the "Reading all +spectra into memory" warning, which only ever changes what gets *printed*, not what +the strategy does). On closer inspection there isn't a second concern hiding in current +usage -- the original framing overstated the issue. Splitting the interface into two +accessors for a distinction the code doesn't actually have yet would be the kind of +premature abstraction this codebase's conventions warn against, so instead the +doc comment on `ISearchStrategy::isIndexBased()` was tightened to state explicitly that +`Pipeline::run()` is the only consumer, name the exact banners/lines it switches +between, and warn that the flag must not be used to gate actual search behavior. If a +fourth strategy ever needs index-based storage with FASTA-style verbose logging (or +vice versa), that is the trigger to revisit the split, not before. + +### 3d. Redundant `operator=` declaration left over from the Params.h cleanup + +**File:** `CometSearch/core/Params.h:98` (and similarly for the other structs touched +by the same cleanup) + +`Options& operator=(const Options&) = default;` is now redundant: with no other +user-declared special member, the compiler already generates an identical copy +assignment implicitly. Harmless, but it's leftover noise from the "replace +hand-written operator= with = default" pass -- could simply be deleted now that the +hand-written bodies are gone. + +**Status (2026-06-17):** Fixed. 
Removed all nine redundant `operator= = default` +declarations from `core/Params.h` (`Options`, `DBInfo`, `StaticMod`, `PrecalcMasses`, +`VarModParams`, `MassUtil`, `ToleranceParams`, `IonInfo`, `StaticParams`). None of these +structs declares a destructor, copy constructor, move constructor, or move assignment, +so a user-declared default constructor (only `StaticParams` has one) does not suppress +the implicit copy assignment operator either -- the compiler generates the identical +member-wise copy with the declarations removed. Verified with a clean rebuild (zero +warnings) and the full 19-test suite. + +--- + +## 4. Actionable Improvements + +### 4a. Add a regression test for AScore + FI_DB + +No test in `tests/unit/` exercises `print_ascorepro_score` against a fragment index, +which is exactly why issue 2a was not caught by CI. A minimal test that builds a tiny +FI_DB index with a variable mod, runs a search with `print_ascorepro_score` set and a +deliberately different/blank `variable_mod01` in the search-time params, and asserts +the AScore differential-mod symbol/mass reflects the `.idx` file's value (not the +params file's) would catch this entire class of ordering bug permanently and guard +against it recurring during future refactors. + +**Status (2026-06-17):** Fixed. Added `t19` to `tests/unit/run_tests.py`, with fixtures +`tests/unit/data/t19_ascore_fidb.fasta` (single 8-residue protein, one phospho-acceptor +S) and `t19_ascore_fidb.ms2` (synthetic singly-charged b/y ions for +`ACDEFGS[+79.966331]K`, precomputed from monoisotopic residue masses). The test builds +an FI_DB index with a real `variable_mod01` (phospho on S), then searches it with +`print_ascorepro_score=1` but a deliberately blank `variable_mod01` in the search-time +params -- the realistic case, since FI_DB search params don't need to redeclare mods +already baked into the index. It asserts the rank-1 PSM's `ascorepro` column is `> 0`.
+ +Verified the test actually discriminates the bug, not just incidentally passes: with +the fix in place it reports `ascorepro = 330.7289`; temporarily reverting +`CometSearchManager.cpp`/`Pipeline.cpp` to the pre-fix ordering (`git stash`, rebuild) +reproduces the exact failure mode predicted in 2a's analysis -- `ascorepro` comes back +as the untouched default sentinel `0.0`, because with the bug `g_AScoreOptions`'s +symbol never gets set to the mod's index (`CometSearch.cpp:5584-5585`'s +`iVal == g_AScoreOptions.getSymbol() - '0'` check fails), so `cHasVariableMod` is never +set to `HasVariableModType_AScorePro` and `CometPostAnalysis::CalculateAScorePro()` +returns immediately without running. Restored the fix afterward and confirmed the full +20-test suite (19 prior + t19) passes cleanly with zero build warnings. + +### 4b. Fix issue 2a at a single call site rather than inside FiStrategy + +Patching `FiStrategy::initialize()` to re-call `SetAScoreOptions()` after +`ReadPlainPeptideIndex()` would work but duplicates a process-wide concern (AScore +setup) inside a per-strategy class. Fixing the ordering at one shared call site keeps +the AScore lifecycle in one place and automatically covers any future strategy that +loads an index with embedded mod definitions. + +**Status (2026-06-17):** Done as part of fixing 2a. The shared call site chosen was +`Pipeline::run()` rather than `DoSearch()` itself, since `Pipeline::run()` is what +actually invokes `_strategy->initialize()`/`finalize()` and is already the single +caller of both -- placing the AScore lifecycle there means no strategy subclass needs +its own re-sync logic, present or future. + +### 4c. Document the writer close()-after-failed-open contract + +One sentence on `IResultWriter::close()` (issue 3a) removes the only undocumented +cross-class invariant `Pipeline::run()` currently depends on. + +**Status (2026-06-17):** Done as part of fixing 3a -- see that item's status note. + +### 4d. 
Resolve the stale Phase 5 comment + +Either state plainly in `SearchSession.h` that the RTS-path globals are permanently +out of scope for `SearchSession`, or file the remaining migration work so it is +discoverable outside of a header comment (issue 3b). + +**Status (2026-06-17):** Done -- "state plainly" branch chosen. Checked +`docs/20260612_architecture_migration.md`'s own phase plan: Phase 5 (Pipeline/Strategy) +is the last phase defined, and its own "RTS path" section already states the RTS entry +points are "explicitly out of scope for Phase 5" because they are +wrapper-compatibility-sensitive -- there is no Phase 6 deferring further removal. +Rewrote the comment block in `SearchSession.h` to say plainly that +`g_pvQueryMutex`/`g_bPlainPeptideIndexRead`/`g_bSpecLibRead` remaining as globals is +permanent, not a pending migration step: a single process can serve both RTS and batch +requests, so this once-per-process init state must stay process-global rather than move +into a per-batch-run `SearchSession`. Also re-flowed the trailing `g_cometStatus` +paragraph, which had been nested under the now-removed "Phase 4 migration note:" +sub-header, to match the rest of the comment's indentation. + +--- + +## Appendix: Findings Not Requiring Code Changes + +- **SearchMemoryPool / RAII slot guards**: `SearchMemoryPoolSlotGuard` is applied at + all five `AcquirePoolSlot()` / `releaseSlot()` sites in `CometSearch.cpp` + (`CometSearch::RunSearch(Query*)` FI and PI branches, the single-query FI fallback, + the batch-FI per-query lambda, and `SearchThreadProc`). `SearchMemoryPool::allocate()` + correctly unwinds partial allocations on `bad_alloc`. No exception-safety gaps found. +- **g_bIndexPrecursors alloc/free**: allocated with `malloc` in + `CometSearchManager.cpp:1552`, freed with `free()` in `FiStrategy::finalize()` -- + consistent, no mismatched allocator. 
+- **Output writers**: `TxtWriter`, `SqtWriter`, `PercolatorWriter`, `PepXmlWriter`, + `MzIdentMlWriter` all null-check their file handles in `close()`, including + `MzIdentMlWriter`'s more involved temp-file merge/rename lifecycle (`FinalizeOne`). + No double-close, no leaked temp files in the `bEmpty` or failed-merge paths observed. +- **RunSearchAndPostAnalysis / executeBatchLegacy**: only ever called with a + non-empty `session.queries` (guarded by the empty-check in `executeBatchLegacy` + before `RunSearchAndPostAnalysis` is invoked), so the unchecked + `session.queries.at(0)` / `.at(size()-1)` mass-range calculation inside it is safe in + every current call path. +- **Fused FI search path (`FusedLoadAndSearchSpectra` / `FusedSearchSpectrum`)**: + pushes into `session.queries` under `session.queriesMutex`, consistent with the + non-fused path's locking discipline; `Pipeline`'s post-batch stats and + empty-batch handling work correctly for both paths. +- **Build / tests**: `make cclean && make -j$(nproc)` clean, zero warnings. 17 unit + 2 + integration tests (T17 peptide-count range, T18 byte-identical determinism) all pass. diff --git a/docs/20260618_mutexserialization.md b/docs/20260618_mutexserialization.md new file mode 100644 index 00000000..41616123 --- /dev/null +++ b/docs/20260618_mutexserialization.md @@ -0,0 +1,293 @@ +# Mutex Serialization in SearchMemoryPool -- Problem and Optimization Plan + +## Context + +`SearchMemoryPool` (`CometSearch/threading/SearchMemoryPool.h/.cpp`) hands out +duplicate-fragment scratch-buffer slots to search threads. Every call into +`CometSearch::AcquirePoolSlot()` / `releaseSlot()` takes the pool's single +`std::mutex` -- first to scan/pop a free slot, then again to push it back. 
A prior +benchmarking pass (see Appendix) replaced the original O(n) linear scan of a +`bool[]` array with an O(1) free-list stack, confirming the scan itself was not +the bottleneck: total throughput across all threads stayed flat at roughly +3.8-5M ops/sec from 8 threads up to 512 threads, regardless of slot count. Flat +throughput under increasing thread count, on an operation with no inherent +ordering requirement, is the signature of a single global serialization point -- +in this case, the pool's one mutex. This document describes that problem in more +detail and lays out a measurement-gated plan for removing the serialization from +the hottest call site. + +## The problem + +`acquireSlot()`/`releaseSlot()` (`threading/SearchMemoryPool.cpp`) take the same +`std::mutex _mutex` on every call: + +```cpp +int SearchMemoryPool::acquireSlot() +{ + std::unique_lock lock(_mutex); + bool found = _cv.wait_for(lock, std::chrono::seconds(240), [this]() { return !_freeSlots.empty(); }); + if (!found) return -1; + int slot = _freeSlots.back(); + _freeSlots.pop_back(); + return slot; +} + +void SearchMemoryPool::releaseSlot(int slot) +{ + { std::lock_guard lk(_mutex); _freeSlots.push_back(slot); } + _cv.notify_one(); +} +``` + +Conceptually, "give me any one free slot" and "give this slot back" do not need a +total order across all callers -- any free slot will do, and releases don't need +to be sequenced relative to other releases. But the current design forces every +acquire and every release through one mutex, so N threads doing this concurrently +serialize to roughly the same total throughput as 1 thread doing it N times. The +benchmark in the Appendix confirms this directly: per-operation latency stayed in +the 200-310 ns range across all tested slot/thread counts (8 through 512), with +*total* throughput never scaling up with thread count the way a genuinely +parallel operation would. 
+ +### Where this is actually hot + +Not every caller of `AcquirePoolSlot()` is on a tight per-spectrum loop. Current +call sites, in descending order of call frequency: + +| Call site | File:line | Frequency | Notes | +|---|---|---|---| +| `CometSearch::RunSearch(Query* pQuery)` | `CometSearch.cpp:110` (acquire at lines 122, 164) | **Once per spectrum, per RTS call** | This is `DoSingleSpectrumSearchMultiResults`'s search path -- the RTS thread-local entry point that the project already benchmarks for per-spectrum Hz. Every concurrent RTS caller takes the global mutex twice (acquire + release) per spectrum. | +| `CometSearch::RunSearch(int,int,ThreadPool*,vector&)` FI_DB branch | `CometSearch.cpp:218` | Once per query, per batch | Only reached for the legacy (non-fused) batch path, i.e. when Mango or a spectral-library search forces `FiStrategy::executeBatch()` away from the fused path (`search/FiStrategy.cpp:129-131`). | +| `CometSearch::SearchThreadProc` | `CometSearch.cpp:1220` | Once per protein-search job dispatch | Classic FASTA three-sweep search. Per-job, not per-spectrum; each job is comparatively expensive (protein-by-protein FASTA scoring), so lock overhead is a much smaller fraction of total work here. | + +### The pattern that already avoids this problem + +The fused batch FI_DB path (`CometPreprocess::FusedLoadAndSearchSpectra`, +`CometPreprocess.cpp:3246-3278`) does **not** call `AcquirePoolSlot()` at all. 
It +launches exactly `iNumThreads` long-lived consumer jobs up front, each one closed +over a fixed slot index `t`: + +```cpp +const int iNumSlots = g_staticParams.options.iNumThreads; +BoundedSpectrumQueue queue(static_cast(iNumSlots) * 4); + +for (int t = 0; t < iNumSlots; ++t) +{ + tp->doJob([&queue, t, &session]() + { + Spectrum spec; + while (queue.pop(spec)) + FusedSearchSpectrum(std::move(spec), t, session); // pre-assigned slot, no lock + }); +} +``` + +Each worker keeps its slot for the worker's entire lifetime instead of +acquiring/releasing it per spectrum. `RunSearch(Query*, int iSlot)` +(`CometSearch.cpp:186`) exists specifically to take this pre-assigned slot, +bypassing `AcquirePoolSlot()` entirely. This is proven, already-shipping code -- +it is the model for Phase 1 below, not a new design. + +## Why this matters (and the honest caveat) + +The RTS path's entire purpose is per-spectrum throughput (Hz) under concurrent +load from multiple C# `Task` threads. Every `DoSingleSpectrumSearchMultiResults` +call pays for two lock/unlock pairs on a global mutex shared with every other +concurrent caller, even when no actual contention exists. + +**Caveat:** the Appendix benchmark measures the synchronization primitive in +isolation, with a near-zero critical-section hold time (touch one byte). Real +search work (`SearchFragmentIndex` / `SearchPeptideIndex`) holds the slot for +however long the actual XCorr/peptide-index search takes -- almost certainly +microseconds to low milliseconds, not nanoseconds. If that real hold time +dominates, the relative cost of the lock itself may be a small fraction of total +per-spectrum latency, and this entire effort would not show up in real Hz +numbers. **This needs to be measured in situ before committing to Phase 2.** +Phase 0 below exists specifically to answer that question first. 
+ +## Proposed plan + +### Phase 0 -- Measure in situ before optimizing further + +Use the existing `RTS_TIMING` build flag (see the `comet-build` skill; +`CometSearch.vcxproj` Release config, or `RTS_TIMING_OFF`/`RTS_TIMING` +preprocessor define) to instrument real per-spectrum timing inside +`DoSingleSpectrumSearchMultiResults`, and drive it with a synthetic +high-concurrency load (many concurrent RTS calls, thread count well above +`iNumThreads` to force the pool into contention). Compare: + +- Wall time spent inside `AcquirePoolSlot()`/`releaseSlot()` vs. total per-spectrum + wall time. +- futex wait counts / `perf lock` contention stats under sustained concurrent load, + if available on the target platform. + +**Only proceed to Phase 1/2 if this shows a non-negligible fraction of +per-spectrum latency** (a reasonable bar: >5-10%), or direct evidence of lock +contention at realistic concurrent RTS thread counts. If the real hold time of +the search work dominates, stop here -- the isolated microbenchmark result does +not by itself justify the added code complexity. + +### Phase 1 -- Extend the existing pre-assigned-slot pattern to RTS (low risk) + +The fused batch path can pre-assign slots because it owns a fixed-size worker +pool it creates itself. RTS callers arrive on whatever thread the .NET `Task` +scheduler happens to run them on, so there is no equivalent fixed "worker index" +threaded through `CometWrapper` today. + +Proposed mechanism: **thread-local lazy slot pinning.** On the first call into +`RunSearch(Query*)` from a given OS thread, claim a slot once via the existing +(mutex-protected) `acquireSlot()` and cache it in a `thread_local int`. Every +subsequent call from that same OS thread reuses the cached slot directly -- +no further lock operations for the rest of that thread's lifetime. 
If the pool +is already fully claimed by other threads when a new thread needs a permanent +slot, fall back to today's per-call dynamic acquire/release for that thread +(graceful degradation, not a hard failure). + +- **Implementation surface:** a thin wrapper at the two `AcquirePoolSlot()` call + sites inside `RunSearch(Query* pQuery)` (`CometSearch.cpp:122,164`). No change + to `SearchMemoryPool` itself. +- **Risk:** low. Reuses the existing tested mutex/free-list code for the + one-time claim and the overflow fallback; adds only a `thread_local` cache. +- **Open question to confirm, not assume:** this only pays off if the number of + distinct OS threads that ever call into RTS search stays bounded near + `iNumThreads`. `RealtimeSearch.cs`'s `Parallel.ForEach` over the scan queue + does not currently pin a fixed degree of parallelism matching `iNumThreads` -- + .NET's `ThreadPool` can grow under sustained load. Confirm actual concurrent + thread counts in production-like load before relying on this assumption; if + unbounded, either cap `Parallel.ForEach`'s `MaxDegreeOfParallelism` to + `iNumThreads` on the C# side, or size the pool to comfortably exceed observed + peak concurrency. + +### Phase 2 -- Lock-free fast path (only if Phase 0 justifies it) + +If Phase 0 shows the mutex matters and Phase 1's thread-affinity assumption +doesn't hold for some caller, replace the mutex+condition_variable+vector design +with a lock-free atomic bitmask: + +- `std::atomic<uint64_t>` for pools up to 64 slots (two words for up to 128; + `iNumThreads` realistically never exceeds this range). +- `acquireSlot()`: CAS loop -- find the lowest set bit (a free slot), atomically + clear it. O(1) when uncontended; lock-free rather than strictly wait-free, + because the CAS can retry under contention. +- `releaseSlot()`: atomic fetch-or to set the bit back. O(1), wait-free. +- Keep the existing mutex+condition_variable only as the rare fallback path for + "pool fully exhausted, must block" -- its only remaining job, instead of being + on every call.
+ +This is more general than Phase 1 (no assumption about caller thread identity or +lifetime) but carries more implementation and review risk: lock-free bitmask +code is straightforward to write but easy to get subtly wrong (memory ordering, +the exhausted-pool fallback path), and is harder to verify by inspection than a +`thread_local` cache. Pursue only if Phase 1 doesn't fully close the gap Phase 0 +identified. + +### Phase 3 -- Re-benchmark and re-measure after each phase + +- Re-run the standalone `SearchMemoryPool` benchmark (Appendix) after each phase + to confirm the synchronization primitive itself improved. +- Re-run the Phase 0 in-situ `RTS_TIMING` measurement after each phase to confirm + the improvement is visible in real per-spectrum Hz numbers, not just the + isolated microbenchmark. A microbenchmark win that doesn't move real Hz numbers + is not worth the added code complexity or review risk -- don't merge Phase 2 + on the strength of the isolated benchmark alone. + +## Other shared mutexes considered and ruled out of scope (for now) + +`docs/GlobalVariables.md` lists several other process-wide mutexes: +`g_pvDBIndexMutex`, `g_preprocessMemoryPoolMutex`, `g_ms1AlignerMutex`, +`g_pvQueryMutex`. None of these sit on the per-spectrum hot path the way +`SearchMemoryPool`'s mutex does -- they guard one-time initialization work (DB +index reads, spectral-library loading) or comparatively low-frequency updates +(MS1 RT alignment history, once per MS1 RTS call on a lower-volume path). Revisit +only if a Phase-0-style measurement on one of those specific paths shows an +actual problem; don't speculatively rewrite them without evidence -- that was +exactly the mistake this document is trying to avoid by leading with Phase 0. 
+ +## Appendix: benchmark methodology + +Standalone harness, compiled directly against the real +`threading/SearchMemoryPool.cpp` and `Threading.cpp` (no need to link the rest of +Comet's dependency tree -- `logout`/`logerr` are macros over `cout`/`cerr`, and +`CometStatus` is fully defined inline in its header): + +```cpp +// bench_pool.cpp +#include "threading/SearchMemoryPool.h" +#include "CometStatus.h" +#include <atomic> +#include <chrono> +#include <cstdio> +#include <cstdlib> +#include <thread> +#include <vector> + +CometStatus g_cometStatus; // extern required by SearchMemoryPool.cpp's bad_alloc path + +int main(int argc, char** argv) +{ + int nSlots = argc > 1 ? atoi(argv[1]) : 8; + int nThreads = argc > 2 ? atoi(argv[2]) : 32; + long nIters = argc > 3 ? atol(argv[3]) : 200000; + + SearchMemoryPool pool; + if (!pool.allocate(nSlots, 16)) { fprintf(stderr, "allocate failed\n"); return 1; } + + std::atomic<long> totalOps{0}; + std::vector<std::thread> threads; + auto tStart = std::chrono::steady_clock::now(); + + for (int t = 0; t < nThreads; ++t) + { + threads.emplace_back([&pool, nIters, &totalOps]() { + for (long i = 0; i < nIters; ++i) + { + int slot = pool.acquireSlot(); + if (slot < 0) continue; + SearchMemoryPoolSlotGuard guard{pool, slot}; + volatile bool* p = pool.duplFragmentArr(slot); + p[0] = !p[0]; // simulate minimal real work + totalOps.fetch_add(1, std::memory_order_relaxed); + } + }); + } + for (auto& th : threads) th.join(); + + double sec = std::chrono::duration<double>(std::chrono::steady_clock::now() - tStart).count(); + long ops = totalOps.load(); + printf("slots=%d threads=%d total_ops=%ld time=%.4fs ops/sec=%.0f avg_latency_ns=%.1f\n", + nSlots, nThreads, ops, sec, ops / sec, (sec * 1e9) / ops); + + pool.deallocate(); + return 0; +} +``` + +Compile from `CometSearch/`: + +```bash +g++ -O3 -std=c++20 -fpermissive -I.
-I../MSToolkit/include \ + -I../MSToolkit/extern/expat-2.2.9/lib -I../MSToolkit/extern/zlib-1.2.11 \ + -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE \ + bench_pool.cpp threading/SearchMemoryPool.cpp Threading.cpp -lpthread -o bench_pool +./bench_pool +``` + +Results from the prior benchmarking pass (O(n) linear-scan implementation vs. the +O(1) free-list that replaced it -- both still mutex-bound, which is the point of +this document): + +| Slots/Threads | O(n) scan ops/sec | O(1) free-list ops/sec | Delta | +|---|---|---|---| +| 8/8 | 3.81M | 4.00M | +5% | +| 8/32 | 4.28M | 4.05M | -5% (noise) | +| 16/64 | 3.91M | 3.87M | -1% (noise) | +| 32/128 | 3.89M | 4.11M | +6% | +| 256/256 | 3.81M | 5.05M | +33% | +| 512/512 | 3.26M | 4.39M | +35% | + +At realistic pool sizes (`iNumThreads`, typically <= 64), throughput is flat +across both implementations within noise -- confirming the mutex, not the scan, +sets the ceiling. The free-list version only pulls ahead once slot counts grow +well past any realistic `iNumThreads` value, which is informative for +understanding *why* the scan wasn't the bottleneck but does not by itself +indicate a production-relevant win. Phase 0 of this plan is how to find out +whether removing the mutex itself would be. diff --git a/docs/DataStructures.md b/docs/DataStructures.md index 682f30ef..011a332f 100644 --- a/docs/DataStructures.md +++ b/docs/DataStructures.md @@ -1,6 +1,12 @@ # Core Data Structures -Key types used throughout `CometSearch/`. All are defined in `CometDataInternal.h` unless noted. Types from `CometData.h` (the public API header) are marked accordingly. +Key types used throughout `CometSearch/`. Struct definitions were reorganized in Phase 3-4 of the architecture migration: +- `core/Types.h` -- per-spectrum, index, and runtime structs (`Results`, `Query`, `QueryMS1`, `DBIndex`, `PlainPeptideIndexStruct`, `FragmentPeptidesStruct`, `ProteinsListCSR`, etc.) 
+- `core/Params.h` -- `StaticParams` and all its nested sub-structs +- `core/Constants.h` -- compile-time constants (`MAX_PEPTIDE_LEN`, `VMODS`, `HISTO_SIZE`, etc.) +- `CometData.h` -- public API types that cross the library boundary into `CometWrapper` and `RealtimeSearch` + +`CometDataInternal.h` `#include`s all three `core/` headers; existing code that includes `CometDataInternal.h` continues to see everything. --- @@ -9,7 +15,7 @@ Key types used throughout `CometSearch/`. All are defined in `CometDataInternal. The central per-spectrum data object. One `Query` is allocated for each spectrum/charge combination in a batch. ```cpp -struct Query // CometDataInternal.h:861 +struct Query // core/Types.h ``` **Scoring state:** @@ -17,23 +23,31 @@ struct Query // CometDataInternal.h:861 | Field | Purpose | |-------|---------| | `iXcorrHistogram[HISTO_SIZE]` | Histogram of XCorr scores for E-value estimation (152 bins). | -| `iHistogramCount` | Number of entries in the histogram. | +| `uiHistogramCount` | Number of entries in the histogram. | | `fPar[4]` | Fitted LMA regression parameters from `LinearRegression()`. | | `siMaxXcorr` | Bin index of the histogram maximum. | +| `iMinXcorrHisto` | Minimum xcorr bin used in histogram; adjusts E-value floor for sparse spectra. | | `dLowestXcorrScore` / `dLowestDecoyXcorrScore` | Current minimum stored XCorr; gates whether a new hit is kept. | +| `siLowestXcorrScoreIndex` / `siLowestDecoyXcorrScoreIndex` | Index of the current lowest-scoring result slot. | +| `fLowestSpecLibScore` | Current minimum stored speclib score for the MS2 speclib path. | | `iMatchPeptideCount` / `iDecoyMatchPeptideCount` | Number of results actually stored. | -| `_uliNumMatchedPeptides` | Total peptides scored (including those below cutoff). | +| `_uliNumMatchedPeptides` / `_uliNumMatchedDecoyPeptides` | Total peptides scored (including those below cutoff). | +| `dMangoIndex` | Decimal scan-number encoding for Mango TMT-precursor searches. 
| **Spectrum data (set by CometPreprocess):** | Field | Purpose | |-------|---------| -| `pfFastXcorrData[]` | Preprocessed intensity array for XCorr calculation. | -| `pfFastXcorrDataNL[]` | Same with NH3/H2O neutral loss contributions. | -| `pfSpScoreData[]` / `ppfSparseSpScoreData[][]` | Binned intensity for SP scoring. Sparse representation saves memory for large bin arrays. | -| `iFastXcorrDataSize` / `iSpScoreData` | Array sizes for the above. | +| `ppfSparseSpScoreData[][]` | Sparse 2D binned intensity array for SP scoring. | +| `ppfSparseFastXcorrData[][]` | Sparse 2D preprocessed intensity array for XCorr calculation. | +| `ppfSparseFastXcorrDataNL[][]` | Same with NH3/H2O neutral loss contributions. | +| `iSpScoreData` / `iFastXcorrDataSize` | Outer dimension of the respective sparse arrays. | +| `bSparseFromPool` | `true` when the sparse child arrays belong to the RTS thread-local `RtsScratch` pool; the destructor must **not** `delete[]` them in this case. | +| `vfRawFragmentPeakMass` | Raw fragment peak masses for fragment index search (intensity not needed at scoring stage). | +| `vRawFragmentPeakMassIntensity` | Raw peaks as `AScoreProCpp::Centroid` pairs; populated when AScorePro is enabled. | | `_pepMassInfo` | Experimental mass and tolerance window (see `PepMassInfo`). | -| `_spectrumInfoInternal` | Scan number, charge state, RT, array size (see `SpectrumInfoInternal`). | +| `_spectrumInfoInternal` | Scan number, charge state, RT, array size, nativeID (see `SpectrumInfoInternal`). | +| `tSearchStart` | Per-query search start time; used to enforce `iMaxIndexRunTime` timeout. | **Results:** @@ -41,9 +55,10 @@ struct Query // CometDataInternal.h:861 |-------|---------| | `_pResults` | Heap-allocated `Results[iNumStored]` array for target hits. | | `_pDecoys` | Same for decoy hits (separate decoy mode only; `iDecoySearch == 2`). | +| `_pSpecLibResults` | MS2 spectral library results (`SpecLibResults[iNumStored]`). 
| | `accessMutex` | Per-query mutex; guards `_pResults` updates in concurrent search threads. | -**Lifecycle:** allocated in `CometPreprocess`, freed in `Query::~Query()`. In batch mode, all `Query*` objects live in `g_pvQuery`. In the RTS thread-local path, each call owns its own heap `Query*` and frees it at the end of the call. +**Lifecycle:** Allocated in `CometPreprocess`, freed in `Query::~Query()`. In batch mode, all `Query*` objects live in `SearchSession.queries`. In the RTS thread-local path, each call owns its own heap `Query*` and frees it at the end of the call. --- @@ -52,21 +67,34 @@ struct Query // CometDataInternal.h:861 Holds one peptide hit. Each `Query` owns an array of `Results[iNumStored]`. ```cpp -struct Results // CometDataInternal.h:194 +struct Results // core/Types.h ``` | Field | Type | Purpose | |-------|------|---------| | `fXcorr` | `float` | Cross-correlation score. | | `fScoreSp` | `float` | Preliminary SP score. | +| `fDeltaCn` | `float` | Delta-Cn (score difference to next-best hit). | +| `fLastDeltaCn` | `float` | Delta-Cn to the last stored hit. | +| `fAScorePro` | `float` | AScorePro phosphosite localization score. | | `dExpect` | `double` | E-value from LMA-fitted histogram. | | `dPepMass` | `double` | Calculated peptide MH+ mass. | -| `iRankSp` / `iMatchedIons` / `iTotalIons` | `int` | SP rank and ion match counts. | +| `usiRankXcorr` | `unsigned short` | Xcorr rank. | +| `usiRankSp` | `unsigned short` | SP rank. | +| `usiMatchedIons` | `unsigned short` | Number of matched fragment ions. | +| `usiTotalIons` | `unsigned short` | Total theoretical fragment ions. | +| `usiLenPeptide` | `unsigned short` | Peptide length. | +| `lProteinFilePosition` | `comet_fileoffset_t` | File offset into the FASTA for the matched protein; for index searches, an entry index into `g_pvProteinsList`. | +| `lWhichProtein` | `long` | Which entry in `g_pvProteinsList[]` contains the matched proteins. 
| | `szPeptide[MAX_PEPTIDE_LEN]` | `char[]` | Peptide sequence (no flanking AAs). | -| `szPrevNextAA[2]` | `char[]` | `[0]` = preceding AA, `[1]` = following AA. | +| `cPrevAA` / `cNextAA` | `char` | Preceding and following amino acid. | +| `bClippedM` | `bool` | `true` if this is a new N-terminal peptide due to a clipped methionine. | +| `cHasVariableMod` | `char` | `HasVariableModType` enum: 0 = none, 1 = variable mod, 2 = AScorePro mod. | | `piVarModSites[MAX_PEPTIDE_LEN_P2]` | `int[]` | Per-position variable mod encoding. Values 1-9 map to `varModList[0-8]`. Values >= `COMPOUNDMODS_OFFSET` (100) encode compound mods. Indices `iLenPeptide` and `iLenPeptide+1` hold N/C-terminal mod codes. | | `pdVarModSites[MAX_PEPTIDE_LEN_P2]` | `double[]` | Mass delta at each modified position. | -| `lProteinFilePosition` | `comet_fileoffset_t` | File offset into the FASTA for the matched protein. | +| `pszMod[MAX_PEPTIDE_LEN][MAX_PEFFMOD_LEN]` | `char[][]` | PEFF modification strings, one per position. | +| `sPeffOrigResidues` | `string` | Original residues for PEFF variants. | +| `sAScoreProSiteScores` | `string` | Comma-separated per-site AScorePro scores. | | `pWhichProtein` | `vector` | All proteins sharing this peptide (sorted by file offset). | | `pWhichDecoyProtein` | `vector` | Decoy proteins (concatenated search mode). | @@ -77,11 +105,11 @@ struct Results // CometDataInternal.h:194 The global parameter aggregate. Fully populated before any search thread starts; treated as read-only during search. 
```cpp -struct StaticParams // CometDataInternal.h:602 +struct StaticParams // core/Params.h extern StaticParams g_staticParams; ``` -Contains nested sub-structs (all defined in `CometDataInternal.h`): +Contains nested sub-structs (all defined in `core/Params.h`): | Sub-struct | Type | Key contents | |------------|------|-------------| @@ -102,8 +130,8 @@ Contains nested sub-structs (all defined in `CometDataInternal.h`): ## VarMods / VarModParams ```cpp -struct VarMods // CometData.h:218 (one entry per mod slot) -struct VarModParams // CometDataInternal.h:472 (all mod config) +struct VarMods // CometData.h (one entry per mod slot) +struct VarModParams // core/Params.h (all mod config) ``` `VarModParams` contains: @@ -115,7 +143,7 @@ struct VarModParams // CometDataInternal.h:472 (all mod config) | `bVarModSearch` | Set to `true` if any mod has a non-zero mass; gates the `WithVariableMods` code path. | | `iMaxVarModPerPeptide` | Total modified residues allowed per peptide across all mods. | | `iMaxPermutations` | Cap on permutation count in `WithVariableMods`. | -| `vdCompoundMasses` | `vector` of masses from the compound mods file (compoundmods branch). | +| `vdCompoundMasses` | `vector` of masses from the compound mods file. | | `iNumCompoundMasses` | `size_t` size of `vdCompoundMasses`. | Each `VarMods` entry: @@ -134,35 +162,145 @@ Each `VarMods` entry: ## DBIndex -One entry in the peptide index (`g_pvDBIndex`), sorted by mass for binary-search lookup. +One entry in the peptide index (`g_pvDBIndex`), used during index generation and FASTA search. Sorted by peptide sequence and mass for deduplication. ```cpp -struct DBIndex // CometDataInternal.h:377 +struct DBIndex // core/Types.h +``` + +| Field | Type | Purpose | +|-------|------|---------| +| `sPeptide[MAX_PEPTIDE_LEN]` | `char[]` | Peptide amino acid sequence (null-terminated). | +| `cPrevAA` / `cNextAA` | `char` | Flanking residues (for enzyme termini check). 
| +| `pcVarModSites` | `vector` | Variable mod encoding per position. Empty = unmodified; otherwise `[iLen+2]` chars using the same 0-9 scheme as `piVarModSites`. | +| `dPepMass` | `double` | MH+ mass; used as sort key within equal sequences. | +| `siVarModProteinFilter` | `unsigned short` | Bitwise filter derived from the protein filter file; `0` when not filtering. Initialized to `0`. | +| `lIndexProteinFilePosition` | `comet_fileoffset_t` | Index into `g_pvProteinsList` mapping to the list of protein file offsets. | + +`DBIndex` provides `operator==` (sequence + mass + mod-sites) and `operator<` (sequence -> mass -> mod-sites -> protein position). + +--- + +## PlainPeptideIndexStruct + +Compact fixed-size tuple stored in the plain peptide index (`.idx` file) and loaded into `g_vRawPeptides` at runtime. Same core fields as `DBIndex` but without the `vector` mod-site field (only unmodified peptides are stored here; modifications are layered on in `g_vFragmentPeptides`). + +```cpp +struct PlainPeptideIndexStruct // core/Types.h ``` | Field | Purpose | |-------|---------| -| `szPeptide[MAX_PEPTIDE_LEN]` | Peptide amino acid sequence. | -| `szPrevNextAA[2]` | Flanking residues (for enzyme termini check in index search). | -| `pcVarModSites[MAX_PEPTIDE_LEN_P2]` | Compact mod-site encoding (0-9; same scheme as `piVarModSites`). | -| `dPepMass` | MH+ mass; the sort key. | -| `lIndexProteinFilePosition` | Index into `g_pvProteinsList` mapping to a list of protein file offsets. | +| `szPeptide[MAX_PEPTIDE_LEN]` | Peptide sequence (null-terminated). | +| `cPrevAA` / `cNextAA` | Flanking residues. | +| `dPepMass` | Unmodified MH+ mass. | +| `siVarModProteinFilter` | Protein filter bitfield. | +| `lIndexProteinFilePosition` | Row index into `g_pvProteinsList`. | --- -## PepMassInfo / SpectrumInfoInternal +## FragmentPeptidesStruct -Small structs embedded in each `Query`. +One entry in the fragment index peptide list (`g_vFragmentPeptides`). 
Represents one (peptide, mod-state) combination. Sorted by mass so that RunSearch can binary-search for mass-matching candidates. + +```cpp +struct FragmentPeptidesStruct // core/Types.h +``` + +| Field | Purpose | +|-------|---------| +| `iWhichPeptide` | Index into `g_vRawPeptides`; provides sequence and protein info. | +| `modNumIdx` | Index into `MOD_NUMBERS`; 0 = unmodified. | +| `dPepMass` | Modified MH+ mass (= unmodified mass + sum of applied mod masses). | +| `cNtermMod` / `cCtermMod` | N/C-terminal variable mod codes (index into `varModList`). | + +--- + +## ProteinsListCSR + +CSR (Compressed Sparse Row)-style storage for the per-peptide protein list. Replaces `vector>` to eliminate the ~190 M individual heap allocations (one per inner vector) that caused a multi-minute free-time tail when building large MHC `.idx` files. ```cpp -struct PepMassInfo // CometDataInternal.h:219 +class ProteinsListCSR // core/Types.h +extern ProteinsListCSR g_pvProteinsList; ``` -Stores the experimental MH+ mass (`dExpPepMass`) and the +/- tolerance window (`dPeptideMassToleranceMinus` / `dPeptideMassTolerancePlus`) pre-computed for fast range checks. + +The external interface mirrors `vector>`: `size()`, `empty()`, `clear()`, `reserve()`, `push_back(vector&&)`, `append_flat()`, `operator[](i)`, `at(i)`, range-for. `operator[](i)` returns a lightweight `Row` proxy (`ptr` + `n`) with `size()`, `operator[]`, `begin()`/`end()`. Only two internal heap allocations regardless of how many rows are stored (`m_flat`: all protein file offsets concatenated; `m_off`: `[N+1]` uint64 CSR offsets). + +--- + +## SearchSession + +Owns all mutable state for one batch search run. Created once at the top of `CometSearchManager::DoSearch()` and passed by reference through `Pipeline` to `ISearchStrategy` implementations. 
```cpp -struct SpectrumInfoInternal // CometDataInternal.h:228 +struct SearchSession // search/SearchSession.h ``` -Stores scan number, charge state, retention time, array size, and the nativeID string from mzML files. + +| Field | Purpose | +|-------|---------| +| `queries` | `vector` -- per-batch MS2 query accumulator (replaces global `g_pvQuery` for the batch path). Protected by `queriesMutex`. | +| `ms1Queries` | `vector` -- per-batch MS1 query accumulator (replaces global `g_pvQueryMS1`). | +| `queriesMutex` | `std::mutex` -- guards `queries` and `ms1Queries` during parallel spectrum loading. | +| `bPerformDatabaseSearch` | Replaces the former global `g_bPerformDatabaseSearch`. | +| `bPerformSpecLibSearch` | Replaces the former global `g_bPerformSpecLibSearch`. | +| `bIdxNoFasta` | Replaces the former global `g_bIdxNoFasta`. | +| `statusRef` | `CometStatus&` -- a **reference** to the process-wide singleton `g_cometStatus`, not a per-run copy. Pipeline and strategy code use `session.statusRef` so they are not coupled to the global name, but both spellings touch the same object. | + +`SearchSession` has no `params` member -- code reads `g_staticParams` directly throughout; an earlier draft carried a `const StaticParams& params` field but it was unused and removed. There is also no `bPlainPeptideIndexRead` / `bSpecLibRead` member: `g_bPlainPeptideIndexRead`, `g_bSpecLibRead`, and `g_pvQueryMutex` remain plain globals rather than `SearchSession` fields, specifically because the RTS path (which never constructs a `SearchSession`) also reads/writes them -- see the header comment in `search/SearchSession.h` and the `g_pvQueryMutex` entry in `docs/GlobalVariables.md`. + +`SearchSession` is non-copyable. The RTS paths (`DoSingleSpectrumSearchMultiResults`, `DoMS1SearchMultiResults`) do **not** use `SearchSession`; they use per-call `Query*`/`QueryMS1*` objects directly. + +--- + +## Pipeline and ISearchStrategy + +Added in Phase 5. 
`DoSearch()` instantiates a `Pipeline` + one concrete `ISearchStrategy` and calls `pipeline.run()`. + +```cpp +class ISearchStrategy // search/ISearchStrategy.h +class Pipeline // search/Pipeline.h +``` + +**ISearchStrategy** interface methods: + +| Method | Called | Purpose | +|--------|--------|---------| +| `initialize(session, tp)` | Once before file loop | Allocate pools, load/build index, pre-read precursors (FI_DB), read var-mod filter file (FASTA). | +| `openFiles(szDB, fpfasta, fpidx, fpdb, session)` | Once per file | Open DB file handles; set `session.bIdxNoFasta`. | +| `executeBatch(mstReader, firstScan, lastScan, analysisType, iPercentStart, iPercentEnd, tp, session)` | Once per batch | Preprocess + search + post-analysis for one spectrum batch; fills `session.queries`. | +| `closeFiles(fpfasta, fpidx)` | Once per file | Close file handles. | +| `finalize()` | Once after all files | Free memory pools and index arrays. | +| `isIndexBased()` | Any time | `true` for `FiStrategy`/`PiStrategy`. `Pipeline::run()` is the only consumer, and uses it solely to choose between the compact index-style progress line and the verbose FASTA-style per-file banners -- it carries no other semantics and must not be used to gate search behavior. | + +**Concrete strategies:** + +| Class | File | DB type | Notes | +|-------|------|---------|-------| +| `FiStrategy` | `search/FiStrategy.cpp` | `FI_DB` | Fused load+search path when `bPerformDatabaseSearch && !bMango && !bPerformSpecLibSearch`; legacy three-sweep otherwise. | +| `FastaStrategy` | `search/FastaStrategy.cpp` | `FASTA_DB` | Classic three-sweep (load -> allocate -> RunSearch -> PostAnalysis). | +| `PiStrategy` | `search/PiStrategy.cpp` | `PI_DB` | Three-sweep like FASTA but against the plain peptide index; no Mango block. 
| + +**AScore lifecycle:** `Pipeline::run()` -- not `DoSearch()` -- owns `SetAScoreOptions()` / `CreateAScoreDllInterface()` / `DeleteAScoreDllInterface()` for the batch path, called immediately after `_strategy->initialize()` succeeds and immediately after `_strategy->finalize()` runs. This ordering matters: for `FI_DB`, `FiStrategy::initialize()` calls `ReadPlainPeptideIndex()`, which overwrites `g_staticParams.variableModParameters.varModList[]` from the `.idx` file's `VariableMod:` header -- `SetAScoreOptions()` must run after that overwrite, not before, or it configures AScore from stale/default mod values. (The RTS path's `InitializeSingleSpectrumSearch()` has its own, separate, already-correctly-ordered AScore setup and is not affected by this.) + +**`_pQueries` discipline (PI_DB):** `CometSearch::BinarySearchMass()` and the `AnalyzePeptideIndex(int iWhichQuery, ...)` overload read the query list through the `CometSearch` member `_pQueries` rather than a parameter -- mirroring `CometSearch::DoSearch()` (the FASTA path), which sets `_pQueries = &queries` at entry. `CometSearch::SearchPeptideIndex(ThreadPool*, vector&)` (the PI_DB batch path, called from a freshly constructed `CometSearch` instance in `RunSearch()`) must do the same at its own entry; omitting it leaves `_pQueries` `nullptr` and crashes on the first call into `BinarySearchMass()`. Any new code path that calls into these two functions needs the same assignment first. + +**IResultWriter** (`output/IResultWriter.h`) is the parallel output abstraction. Each format (`TxtWriter`, `PepXmlWriter`, `SqtWriter`, `PercolatorWriter`, `MzIdentMlWriter`) implements `open()`, `write()`, `close()`. `Pipeline` holds a `vector>` and calls them around the batch loop. `close()` must be safe to call even if `open()` was never invoked or returned false: when one writer's `open()` fails, `Pipeline::run()` calls `close(false, false)` on every writer in the vector, including ones after the failed one. 
+ +--- + +## PepMassInfo / SpectrumInfoInternal + +Small structs embedded in each `Query`. + +```cpp +struct PepMassInfo // core/Types.h +struct SpectrumInfoInternal // core/Types.h +``` + +`PepMassInfo` stores the experimental MH+ mass (`dExpPepMass`) and the +/- tolerance window (`dPeptideMassToleranceMinus` / `dPeptideMassTolerancePlus`) pre-computed for fast range checks. + +`SpectrumInfoInternal` stores scan number, charge state, retention time, array size, Mango encoding, and the nativeID string from mzML files. --- @@ -171,7 +309,7 @@ Stores scan number, charge state, retention time, array size, and the nativeID s Passed through the FASTA search loop; holds data for a single protein from the database. ```cpp -typedef struct sDBEntry // CometDataInternal.h:348 +typedef struct sDBEntry // core/Types.h ``` | Field | Purpose | @@ -187,11 +325,11 @@ typedef struct sDBEntry // CometDataInternal.h:348 ## MassRange ```cpp -struct MassRange // CometDataInternal.h:243 +struct MassRange // CometDataInternal.h extern MassRange g_massRange; ``` -Computed once per spectrum batch from the lowest and highest precursor masses in `g_pvQuery`. Search threads read `dMinMass` / `dMaxMass` for early-exit decisions in `SearchForPeptides`. `iMaxFragmentCharge` caps the fragment ion charge loop. +Computed once per spectrum batch from the lowest and highest precursor masses in `SearchSession.queries`. Search threads read `dMinMass` / `dMaxMass` for early-exit decisions in `SearchForPeptides`. `iMaxFragmentCharge` caps the fragment ion charge loop. --- diff --git a/docs/GlobalVariables.md b/docs/GlobalVariables.md index 7a2184d8..e3793802 100644 --- a/docs/GlobalVariables.md +++ b/docs/GlobalVariables.md @@ -1,6 +1,6 @@ # Global Variables Reference -All globals are defined in `CometSearch/CometSearchManager.cpp` (unless noted) and declared `extern` in `CometSearch/CometDataInternal.h`. 
+All globals are defined in `CometSearch/CometSearchManager.cpp` (unless noted) and declared `extern` in `CometSearch/CometDataInternal.h` or `CometSearch/core/Types.h`. --- @@ -16,13 +16,13 @@ All globals are defined in `CometSearch/CometSearchManager.cpp` (unless noted) a ## Spectrum batch containers -Used only in the batch search path (`DoSearch` -> `RunSearch`). The RTS paths do not touch these. +Used only in the batch search path (`DoSearch` -> `Pipeline` -> strategies). The RTS paths do not touch these. Batch-path query lists and per-run flags were moved from bare globals into `SearchSession` (defined in `search/SearchSession.h`) as part of the Phase 4-5 architecture migration. -| Variable | Type | Thread-safe? | Notes | -|----------|------|:------------:|-------| -| `g_pvQuery` | `vector` | Batch path only | One `Query*` per spectrum/charge combination for the current batch. Populated by `CometPreprocess`, consumed by `CometSearch` and `CometPostAnalysis`. Not safe for concurrent writes without `g_pvQueryMutex`. | -| `g_pvQueryMS1` | `vector` | Batch path only | Analogous to `g_pvQuery` for MS1 spectral library batch searches. | -| `g_pvQueryMutex` | `Mutex` | -- | Protects `g_pvQuery` insertions during batch preprocessing. | +| Variable | Type / Location | Thread-safe? | Notes | +|----------|----------------|:------------:|-------| +| `SearchSession::queries` | `vector` | Guarded by `queriesMutex` | One `Query*` per spectrum/charge combination for the current batch. Populated by `CometPreprocess`, consumed by `CometSearch` and `CometPostAnalysis`. Replaces the former global `g_pvQuery`. | +| `SearchSession::ms1Queries` | `vector` | Guarded by `queriesMutex` | Analogous to `queries` for MS1 spectral library batch searches. Replaces the former global `g_pvQueryMS1`. | +| `SearchSession::queriesMutex` | `std::mutex` | -- | Protects `queries` / `ms1Queries` insertions during batch preprocessing. Replaces the former `g_pvQueryMutex`. 
| | `g_pvInputFiles` | `vector` | Read-only after init | List of input files to search; set before `DoSearch()` begins. | --- @@ -31,15 +31,16 @@ Used only in the batch search path (`DoSearch` -> `RunSearch`). The RTS paths do Populated during index build / load; treated as read-only during all searches. Safe for concurrent reads from RTS threads. +The fragment index uses a **CSR (Compressed Sparse Row)** layout. For a given fragment mass bin `b`, the entries in `g_iFragmentIndex` are at positions `g_iFragmentIndexOffset[b]` through `g_iFragmentIndexOffset[b+1] - 1` inclusive (i.e. the half-open interval `[g_iFragmentIndexOffset[b], g_iFragmentIndexOffset[b+1])`), and the values stored there are indices into `g_vFragmentPeptides`. + | Variable | Type | Notes | |----------|------|-------| -| `g_iFragmentIndex` | `unsigned int**` | 2D array: `[BIN(fragment mass)][entry index]`. Each row lists which entries in `g_vFragmentPeptides` contain that fragment mass bin. | -| `g_iCountFragmentIndex` | `unsigned int*` | `[BIN(fragment mass)]` -- count of entries in each row of `g_iFragmentIndex`. | +| `g_iFragmentIndex` | `unsigned int*` | Flat CSR data array. Each element is an index into `g_vFragmentPeptides`. Entries for bin `b` span `[g_iFragmentIndexOffset[b], g_iFragmentIndexOffset[b+1])`. | +| `g_iFragmentIndexOffset` | `uint64_t*` | CSR offset array; length = (max bin + 1) + 1. Must be 64-bit -- the total entry count can exceed UINT_MAX for large databases with many variable mods. | | `g_vFragmentPeptides` | `vector` | Mass-sorted list of all (peptide, mod-state) combinations. Each entry references a row in `g_vRawPeptides` via `iWhichPeptide`. | | `g_vRawPeptides` | `vector` | List of unique unmodified peptide sequences with protein file-position pointers. | | `g_bIndexPrecursors` | `bool*` | Boolean bitmap over precursor mass bins; marks which precursor masses are present in the current input file(s). | | `g_bPeptideIndexRead` | `std::atomic` | Set to `true` once the peptide index has been fully loaded.
Checked with `acquire` ordering before RTS searches begin. | -| `g_bPlainPeptideIndexRead` | `bool` | Set to `true` if the plain peptide index was read and a fragment index was generated from it. | --- @@ -49,9 +50,6 @@ Populated during index build / load; treated as read-only during all searches. S |----------|------|-------| | `g_vSpecLib` | `vector` | In-memory spectral library entries. Each entry holds peaks, charge, RT, and a unit-vector representation for dot-product scoring. | | `g_vulSpecLibPrecursorIndex` | `vector>` | Mass index into `g_vSpecLib`; maps precursor mass bins to library entry indices for fast lookup. | -| `g_bSpecLibRead` | `bool` | Set to `true` once the spectral library is fully loaded. | -| `g_bPerformSpecLibSearch` | `bool` | `true` if MS1 speclib search is active for this run. | -| `g_bPerformDatabaseSearch` | `bool` | `true` if FASTA/index database search is active for this run. | | `RetentionMatchHistory` | `std::deque` | Rolling window of (query RT, reference RT) pairs used by the MS1 RT aligner. Protected by `g_ms1AlignerMutex`. | --- @@ -60,9 +58,10 @@ Populated during index build / load; treated as read-only during all searches. S | Variable | Type | Notes | |----------|------|-------| -| `g_pvDBIndex` | `vector` | Peptide index entries (mass-sorted). Each entry holds peptide sequence, mass, var-mod encoding, and a protein file-position pointer. | -| `g_pvProteinNames` | `map` | Maps protein file-position to accession string and ordinal. | -| `g_pvProteinsList` | `vector>` | Maps index positions to lists of protein file offsets (for multi-protein peptides). | +| `g_pvDBIndex` | `vector` | Peptide index entries used during index build. Each entry holds peptide sequence, mass, var-mod encoding, and a protein file-position pointer. | +| `g_pvProteinNames` | `map` | Maps protein file-position to accession string and ordinal. Used for FASTA searches and legacy index paths. 
| +| `g_pvProteinsList` | `ProteinsListCSR` | Maps peptide index positions to lists of protein file offsets (for multi-protein peptides). `ProteinsListCSR` is a CSR-layout replacement for `vector>`; exposes the same `operator[]`/`size()`/range-for interface but uses only two heap allocations total. | +| `g_pvProteinNameCache` | `unordered_map` | Protein name lookup cache for index-based searches. Populated at index load time from the protein name blocks in the `.idx` file. Maps protein file-position offsets to accession strings. ~7 MB for a human target-decoy database. Allows O(1) protein name resolution during RTS without file I/O. | | `g_pvDIAWindows` | `vector` | Flat list of DIA isolation window edges (start, end, start, end, ...). Empty if not doing DIA. | --- @@ -87,6 +86,7 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). | `MOD_SEQ_MOD_NUM_START` / `MOD_SEQ_MOD_NUM_CNT` | `int*` -- index into `MOD_NUMBERS` per modifiable sequence. | | `PEPTIDE_MOD_SEQ_IDXS` | `int*` -- maps peptides to their modifiable sequence index. | | `MOD_NUM` | `int` -- total number of distinct modification combinations. | +| `g_vvvPepGenShort` / `g_vvvPepGenLong` | Per-thread peptide generation scratch buffers; populated during index build and reused across peptides to avoid repeated allocation. | --- @@ -94,13 +94,16 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). | Variable | Type | Notes | |----------|------|-------| -| `g_pvQueryMutex` | `Mutex` | Protects `g_pvQuery` insertions during batch preprocessing. | | `g_pvDBIndexMutex` | `Mutex` | Protects database index reads where concurrent access is possible. | | `g_preprocessMemoryPoolMutex` | `Mutex` | Protects the shared preprocessing memory pool. | -| `g_searchMemoryPoolMutex` | `Mutex` | Protects the shared search memory pool. | +| `g_pvQueryMutex` | `Mutex` | Protects `g_vSpecLib` load/access (`CometSpecLib.cpp`, `CometPreprocess.cpp`). 
Name is a holdover from before the architecture migration, when it also guarded the now-removed `g_pvQuery` global; it was repurposed rather than renamed. Remains a global (not a `SearchSession` member) because it is also used by the RTS path -- see `search/SearchSession.h`'s header comment. | | `g_ms1AlignerMutex` | `Mutex` | Protects `RetentionMatchHistory` updates in `DoMS1SearchMultiResults`. | -| `g_vSpecLibMutex` | `Mutex` | Protects speclib access where needed. | -| `g_dbIndexMutex` | `Mutex` | Protects DB index access where needed. | + +**Note:** `g_searchMemoryPoolMutex` and the paired `g_searchPoolCV` condition variable were removed during the architecture migration; the search memory pool's locking is now encapsulated inside the `SearchMemoryPool` class (see below) instead of living as bare globals. + +### SearchMemoryPool (`threading/SearchMemoryPool.h`) + +Not a global variable, but the direct replacement for the old `_pbSearchMemoryPool` static array, `g_searchMemoryPoolMutex`, and `g_searchPoolCV` trio, so it is documented here for anyone updating this table. `CometSearch.cpp` holds a single file-static instance, `s_pool`, owning its own `std::mutex` and `std::condition_variable`. `CometSearch::AllocateMemory(N)` calls `s_pool.allocate(N, g_staticParams.iArraySizeGlobal)`; `AcquirePoolSlot()` / `releaseSlot()` forward to `s_pool.acquireSlot()` / `s_pool.releaseSlot()`. Every acquire site wraps the returned slot in a `SearchMemoryPoolSlotGuard` (RAII; releases on scope exit, including exception unwind) so a throw out of a search body cannot leak a slot and stall the next `acquireSlot()` caller for up to 240 s. --- @@ -119,7 +122,6 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). |----------|-------| | `g_bCometPreprocessMemoryAllocated` | `true` when `CometPreprocess::AllocateMemory()` has been called. | | `g_bCometSearchMemoryAllocated` | `true` when `CometSearch::AllocateMemory()` has been called. 
| -| `g_bIdxNoFasta` | `true` when searching a `.idx` file without the corresponding `.fasta` present. | --- @@ -154,14 +156,14 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). ``` Safe to read from any concurrent RTS thread (after init): - g_staticParams, g_iFragmentIndex, g_iCountFragmentIndex, + g_staticParams, g_iFragmentIndex, g_iFragmentIndexOffset, g_vFragmentPeptides, g_vRawPeptides, g_pvProteinNames, g_pvProteinsList, - g_vSpecLib, g_vulSpecLibPrecursorIndex, g_pvDIAWindows, + g_pvProteinNameCache, g_vSpecLib, g_vulSpecLibPrecursorIndex, g_pvDIAWindows, g_AScoreOptions, g_AScoreInterface, MOD_NUMBERS, MOD_SEQS, g_massRange.iMaxFragmentCharge (after batch setup) Written per batch (batch path only -- not touched by RTS): - g_pvQuery, g_pvQueryMS1, + SearchSession::queries, SearchSession::ms1Queries, g_massRange.dMinMass / dMaxMass / bNarrowMassRange, g_staticParams.databaseInfo.uliTotAACount diff --git a/docs/RealTimeSearch.md b/docs/RealTimeSearch.md index 30283240..a277e170 100644 --- a/docs/RealTimeSearch.md +++ b/docs/RealTimeSearch.md @@ -2,7 +2,7 @@ Comet supports two search modes: -- **Batch search**: `DoSearch()` -- reads a file, processes spectra in configurable batches, writes result files. +- **Batch search**: `DoSearch()` -- reads a file, processes spectra in configurable batches, writes result files. `DoSearch()` is orchestrated by a `Pipeline` that owns one concrete `ISearchStrategy` (`FiStrategy`, `FastaStrategy`, or `PiStrategy`) and a set of `IResultWriter` implementations. All mutable batch-run state (query lists, per-run flags) lives in a `SearchSession` struct passed by reference through the pipeline. - **Real-time search (RTS)**: called per-spectrum by an external C# application; returns results synchronously within the same call. Designed for concurrent calls from multiple threads. This document covers the RTS path. 
The design history and task-by-task implementation record are in `docs/20260227_RTS_THREAD_PLAN.md` (MS2) and `docs/20260228_MS1_THREAD_PLAN.md` (MS1). @@ -62,8 +62,9 @@ slow path: mutex-guarded check + initialization -> ValidateSequenceDatabaseFile() validates FASTA / index; sets bCreateFragmentIndex=true if .idx is absent but FASTA exists -> CometPreprocess::AllocateMemory() preprocessing thread buffers - -> CometSearch::AllocateMemory() search thread pool (_pbSearchMemoryPool, - _ppbDuplFragmentArr) used by AcquirePoolSlot() + -> CometSearch::AllocateMemory() search thread pool (s_pool, a SearchMemoryPool + instance; aliased into _ppbDuplFragmentArr) + used by AcquirePoolSlot() -> tp->fillPool() -> if iDbType == FI_DB: if bCreateFragmentIndex: @@ -72,11 +73,12 @@ slow path: mutex-guarded check + initialization internally before returning CometSearch::AllocateMemory() re-allocate search pool freed by DoSearch() above ReadPlainPeptideIndex() loads g_vRawPeptides from the .idx file - CreateFragmentIndex(tp) builds g_iFragmentIndex in memory (CSR posting lists) + CreateFragmentIndex(tp) builds g_iFragmentIndex / g_iFragmentIndexOffset + in memory (CSR posting lists) -> singleSearchInitializationComplete.store(true, release) ``` -The `release` store ensures all threads that subsequently load the flag with `acquire` see a fully initialized `g_iFragmentIndex` and all other globals. +The `release` store ensures all threads that subsequently load the flag with `acquire` see a fully initialized `g_iFragmentIndex`, `g_iFragmentIndexOffset`, `g_pvProteinNameCache`, and all other globals. **Note on the index-build path:** When the `.idx` file is absent, `CreateFragmentIndex()` calls `DoSearch()` with `m_bRTSIndexBuild=true`. `DoSearch()` writes the `.idx` file, calls `CometSearch::DeallocateMemory()` to free the large FASTA-parse memory, then returns early (skipping the spec-lib and batch-search logic that follows in `DoSearch()`). 
`InitializeSingleSpectrumSearch()` then re-allocates the search pool before proceeding to load the index. @@ -113,7 +115,7 @@ DoSingleSpectrumSearchMultiResults(topN, charge, mz, masses, intensities, nPeaks +- CometPreprocess::PreprocessSingleSpectrumThreadLocal(charge, mz, masses, intensities) | -> allocates caller-owned Query* on the heap | -> fills it with binned spectrum data - | -> does NOT touch g_pvQuery + | -> does NOT touch SearchSession::queries | -> returns nullptr on failure (caller checks and returns false) | +- pdTmpSpectrum = new double[iArraySize] <- per-call allocation @@ -121,7 +123,8 @@ DoSingleSpectrumSearchMultiResults(topN, charge, mz, masses, intensities, nPeaks +- CometSearch::RunSearch(pQuery, tRealTimeStart) | -> allocates per-call bool* pbDuplFragment[] | -> SearchFragmentIndex(pQuery, pbDuplFragment, tRealTimeStart) - | reads g_iFragmentIndex / g_vFragmentPeptides (READ-ONLY) [x] + | reads g_iFragmentIndex / g_iFragmentIndexOffset (READ-ONLY) [x] + | reads g_vFragmentPeptides (READ-ONLY) [x] | XcorrScoreI(pQuery, ...) 
-- updates only pQuery->_pResults | CheckMassMatch(pQuery, dMass) -- reads only pQuery->_pepMassInfo | timeout checked against local tRealTimeStart @@ -133,6 +136,7 @@ DoSingleSpectrumSearchMultiResults(topN, charge, mz, masses, intensities, nPeaks +- CometPostAnalysis::CalculateAScorePro(pQuery, g_AScoreInterface) | +- sort _pResults by XCorr, extract top topN hits into output vectors + | protein names resolved via g_pvProteinNameCache.find(offset) [READ-ONLY, O(1)] [x] | +- cleanup_results: delete pQuery (destructor frees sparse arrays, _pResults[], accessMutex) @@ -149,7 +153,7 @@ DoMS1SearchMultiResults(dMaxMS1RTDiff, charge, mz, masses, intensities, nPeaks, | +- CometPreprocess::PreprocessMS1SingleSpectrumThreadLocal(charge, mz, masses, intensities) | -> allocates caller-owned QueryMS1* on the heap - | -> does NOT touch g_pvQueryMS1 + | -> does NOT touch SearchSession::ms1Queries | +- CometSpecLib::RunMS1Search(pQueryMS1, ...) | reads g_vSpecLib / g_vulSpecLibPrecursorIndex (READ-ONLY) [x] @@ -171,11 +175,12 @@ DoMS1SearchMultiResults(dMaxMS1RTDiff, charge, mz, masses, intensities, nPeaks, | State | RTS path | Notes | |-------|:--------:|-------| | `g_staticParams` | Read-only [x] | Set once at init; never written during search. | -| `g_iFragmentIndex` / `g_vFragmentPeptides` / `g_vRawPeptides` | Read-only [x] | Loaded at init; never modified. | +| `g_iFragmentIndex` / `g_iFragmentIndexOffset` | Read-only [x] | CSR index loaded at init; never modified. | +| `g_vFragmentPeptides` / `g_vRawPeptides` | Read-only [x] | Loaded at init; never modified. | | `g_vSpecLib` / `g_vulSpecLibPrecursorIndex` | Read-only [x] | Loaded at init. | -| `g_pvProteinNames` / `g_pvProteinsList` | Read-only [x] | Loaded at init. | +| `g_pvProteinNames` / `g_pvProteinsList` / `g_pvProteinNameCache` | Read-only [x] | Loaded at init. | | `g_AScoreOptions` / `g_AScoreInterface` | Read-only [x] | Pointer set at init; each call uses its own data. 
| -| `g_pvQuery` / `g_pvQueryMS1` | Not touched [x] | RTS path uses per-call `Query*` / `QueryMS1*`. | +| `SearchSession::queries` / `SearchSession::ms1Queries` | Not touched [x] | `SearchSession` is batch-path only. RTS path uses per-call `Query*` / `QueryMS1*`. | | `g_massRange` | Not written [x] | Mass limits derived from per-call `Query*._pepMassInfo`. | | `tRealTimeStart` | Per-call local [x] | Each call has its own `chrono::time_point`. | | `Query*` / `QueryMS1*` | Per-call heap [x] | Each call allocates and owns its object; freed at end. | @@ -254,7 +259,8 @@ The timeout clock is a `chrono::time_point tRealTimeStart` local to each call, p **Shared pools (allocated once at init, reused across calls):** - `CometPreprocess::AllocateMemory(N)` -- per-thread preprocessing buffers for the batch path. The RTS thread-local path bypasses this pool and allocates directly. -- `CometSearch::AllocateMemory(N)` -- allocates `_pbSearchMemoryPool[N]` and `_ppbDuplFragmentArr[N][]`, used by `AcquirePoolSlot()` to hand each concurrent call a dedicated duplicate-fragment scratch buffer. Must be valid before any call reaches `RunSearch(Query*, ...)`. If the index-build path was taken during init, this pool is freed inside `DoSearch()` and re-allocated by `InitializeSingleSpectrumSearch()` before proceeding. +- `CometSearch::AllocateMemory(N)` -- calls `s_pool.allocate(N, g_staticParams.iArraySizeGlobal)` (`s_pool` is a file-static `SearchMemoryPool` instance in `CometSearch.cpp`; see `threading/SearchMemoryPool.h`) and aliases each slot's scratch buffer into `_ppbDuplFragmentArr[N][]`. `AcquirePoolSlot()` / `releaseSlot()` forward to `s_pool.acquireSlot()` / `s_pool.releaseSlot()`. Every acquire site wraps the slot in a `SearchMemoryPoolSlotGuard` so the slot is released on scope exit even if the search body throws. Must be valid before any call reaches `RunSearch(Query*, ...)`. 
If the index-build path was taken during init, this pool is freed inside `DoSearch()` and re-allocated by `InitializeSingleSpectrumSearch()` before proceeding. +- **Known limitation:** `s_pool` is a single process-wide instance, so it does not support multiple concurrent `ICometSearchManager` instances performing RTS searches against different fragment indexes in the same process -- see the `TODO` comment at the top of `CometSearch.cpp` and `docs/20260615_multiple_rts_instances.md`. --- @@ -266,7 +272,7 @@ For a new **search or per-spectrum call** that routes through `ICometSearchManag 2. Implement in `CometSearchManager.cpp` using the thread-local pattern: - Use `PreprocessSingleSpectrumThreadLocal()` (not `PreprocessSingleSpectrum()`). - Call `CometSearch::RunSearch(pQuery, tRealTimeStart)` (not `RunSearch(ThreadPool*)`). - - Never write `g_pvQuery`, `g_massRange`, or `g_staticParams` from within the call. + - Never write `SearchSession` fields, `g_massRange`, or `g_staticParams` from within the call. 3. Add a managed wrapper method in `CometWrapper/CometWrapper.cpp` with `pin_ptr` for array parameters. 4. If new return types are needed, add wrapper structs to `CometDataWrapper.h` (and mirror in `CometData.h`). 5. Call from `RealtimeSearch/SearchMS1MS2.cs`. 
diff --git a/tests/regression/run_regression.py b/tests/regression/run_regression.py index 2e8d119f..f9bc92a0 100644 --- a/tests/regression/run_regression.py +++ b/tests/regression/run_regression.py @@ -7,20 +7,34 @@ fi -- fragment ion index search (index built fresh by each binary) pi -- peptide index search (index built fresh by each binary) -Comparison metrics per mode: +Each mode is run under one or more decoy variants (each backed by its own +params file, decoy_search baked in): + nodecoy -- decoy_search = 0 (comet_phospho.params) + internaldecoy1 -- decoy_search = 1, internal decoy concatenated + (comet_phospho_internaldecoy1.params) + internaldecoy2 -- decoy_search = 2, internal decoy separate + (comet_phospho_internaldecoy2.params) +internaldecoy1/internaldecoy2 only run against fasta and pi -- FI does not +support Comet's internal (on-the-fly) decoy generation, so that combination +is skipped automatically. + +Comparison metrics per mode/variant: - Wall-clock search time (seconds); index build time reported separately - PSM count (number of lines in .txt output above xcorr threshold) - PSM overlap: fraction of scans where both binaries agree on the top peptide + - For internaldecoy2 (decoy_search=2 writes a separate .decoy.txt): + the same PSM count/overlap comparison is also run on that decoy-only file. Usage: # 1. Fetch baseline binary first: python setup_baselines.py - # 2. Run all three modes against default baseline tag(s): + # 2. Run all modes x all decoy variants against default baseline tag(s): python run_regression.py - # 3. Restrict modes or tags: + # 3. Restrict modes, decoy variants, or tags: python run_regression.py --modes fasta fi + python run_regression.py --decoy-variants nodecoy internaldecoy2 python run_regression.py --tags v2026.01.1 # 4. 
Point at non-default binaries or data: @@ -28,8 +42,9 @@ python run_regression.py --data ../../data Output: - results/<timestamp>_<tag>/report.txt human-readable summary - results/<timestamp>_<tag>/report.json machine-readable metrics + results/<timestamp>_<tag>/report.txt human-readable summary (all variants x modes) + results/<timestamp>_<tag>/report.json machine-readable metrics + results/<timestamp>_<tag>/<variant>/<mode>/... raw per-run Comet output (baseline.txt, current.txt, etc.) """ import argparse @@ -67,6 +82,22 @@ DEFAULT_TAGS = ["v2026.01.1"] MODES = ["fasta", "fi", "pi"] +# Decoy variants: filename (relative to the effective data dir) for each variant's +# params file. Each is identical to comet_phospho.params except decoy_search. +DECOY_VARIANT_FILENAMES = { + "nodecoy": "comet_phospho.params", + "internaldecoy1": "comet_phospho_internaldecoy1.params", + "internaldecoy2": "comet_phospho_internaldecoy2.params", +} +# Modes each variant is valid for. FI does not support Comet's internal +# (on-the-fly) decoy generation, so internaldecoy1/internaldecoy2 are fasta/pi only. +DECOY_VARIANT_MODES = { + "nodecoy": {"fasta", "fi", "pi"}, + "internaldecoy1": {"fasta", "pi"}, + "internaldecoy2": {"fasta", "pi"}, +} +DEFAULT_DECOY_VARIANTS = list(DECOY_VARIANT_FILENAMES.keys()) + XCORR_THRESHOLD = 2.5 # minimum xcorr to count a PSM @@ -161,18 +192,27 @@ def build_index(binary: Path, params_path: Path, mode: str, work_dir: Path) -> f def run_search(binary: Path, params_path: Path, mzxml: Path, - work_dir: Path) -> tuple[float, Path]: - """Run a search; return (elapsed_seconds, path_to_.txt_output).""" - # Comet writes output next to the mzXML; remove stale file first. - out_txt = mzxml.with_suffix(".txt") + work_dir: Path) -> tuple[float, Path, Path]: + """ + Run a search; return (elapsed_seconds, path_to_.txt_output, path_to_decoy_txt_output).
+ + The decoy path is only populated by Comet when decoy_search=2 (separate + target/decoy output, see TxtWriter::open() / IResultWriter::BuildNames()); + for decoy_search=0/1 it simply won't exist and callers should check before use. + """ + # Comet writes output next to the mzXML; remove stale files first. + out_txt = mzxml.with_suffix(".txt") + out_decoy = mzxml.parent / (mzxml.stem + ".decoy.txt") if out_txt.exists(): out_txt.unlink() + if out_decoy.exists(): + out_decoy.unlink() elapsed, _ = run_comet( binary, [f"-P{comet_path(params_path)}", comet_path(mzxml)], work_dir, ) - return elapsed, out_txt + return elapsed, out_txt, out_decoy # --------------------------------------------------------------------------- @@ -289,7 +329,7 @@ def run_mode(mode: str, current_bin: Path, baseline_bin: Path, for label, binary in [("baseline", baseline_bin), ("current", current_bin)]: print(f" [fasta] running {label} ...") try: - t, txt_src = run_search(binary, params_path, MZXML_FILE, run_dir) + t, txt_src, decoy_src = run_search(binary, params_path, MZXML_FILE, run_dir) metrics[f"search_time_{label}_s"] = round(t, 2) except RuntimeError as e: print(f" ERROR: {e}", file=sys.stderr) @@ -298,6 +338,8 @@ def run_mode(mode: str, current_bin: Path, baseline_bin: Path, dest = run_dir / f"{label}.txt" if txt_src.exists(): shutil.copy(txt_src, dest) + if decoy_src.exists(): + shutil.copy(decoy_src, run_dir / f"{label}.decoy.txt") else: # ---- FI / PI: build index per binary in its own subdirectory ---- @@ -343,7 +385,7 @@ def run_mode(mode: str, current_bin: Path, baseline_bin: Path, print(f" [{mode}] running {label} search ...") try: - t, txt_src = run_search(binary, search_params_path, MZXML_FILE, sub) + t, txt_src, decoy_src = run_search(binary, search_params_path, MZXML_FILE, sub) metrics[f"search_time_{label}_s"] = round(t, 2) except RuntimeError as e: print(f" ERROR in search: {e}", file=sys.stderr) @@ -353,12 +395,24 @@ def run_mode(mode: str, current_bin: Path, baseline_bin: 
Path, dest = run_dir / f"{label}.txt" if txt_src.exists(): shutil.copy(txt_src, dest) + if decoy_src.exists(): + shutil.copy(decoy_src, run_dir / f"{label}.decoy.txt") # ---- Compare ---- print(f" [{mode}] comparing results ...") base_psms = parse_txt(run_dir / "baseline.txt") curr_psms = parse_txt(run_dir / "current.txt") metrics.update(compare_results(base_psms, curr_psms)) + + # decoy_search=2 writes a separate .decoy.txt; compare that too + # if either binary produced one (decoy_search=0/1 never will). + base_decoy_path = run_dir / "baseline.decoy.txt" + curr_decoy_path = run_dir / "current.decoy.txt" + if base_decoy_path.exists() or curr_decoy_path.exists(): + base_decoy_psms = parse_txt(base_decoy_path) + curr_decoy_psms = parse_txt(curr_decoy_path) + metrics["decoy_file"] = compare_results(base_decoy_psms, curr_decoy_psms) + return metrics @@ -383,25 +437,39 @@ def print_report(all_metrics: list[dict], current_bin: Path, baseline_tag: str): print(f" current : {current_bin}") print(f" xcorr threshold for PSM count: >= {XCORR_THRESHOLD}") print(sep) + def print_comparison(prefix: str, c: dict): + print(f" {prefix}PSMs >= {XCORR_THRESHOLD} (baseline) : {fmt(c.get('base_psm_count'))}") + print(f" {prefix}PSMs >= {XCORR_THRESHOLD} (current) : {fmt(c.get('curr_psm_count'))}") + af = c.get("agree_frac") + if af is not None: + pct = af * 100 + print(f" {prefix}top-peptide agreement : {c['agree_top_peptide']:>8} / " + f"{c['common_scans']} common scans ({pct:.2f}%)") + print(f" {prefix}only in baseline : {fmt(c.get('only_in_baseline'))}") + print(f" {prefix}only in current : {fmt(c.get('only_in_current'))}") + else: + print(f" {prefix}top-peptide agreement : {'N/A':>8}") + for m in all_metrics: - mode = m["mode"] - print(f"\nMode: {mode.upper()}") + mode = m["mode"] + variant = m.get("decoy_variant", "nodecoy") + print(f"\nVariant: {variant} Mode: {mode.upper()}") + + if m.get("skipped"): + print(f" SKIPPED: {m.get('skip_reason', 'not applicable')}") + continue + if 
m.get("index_build_time_baseline_s", "absent") != "absent": print(f" index build (baseline) : {fmt(m['index_build_time_baseline_s'], decimals=1)} s") print(f" index build (current) : {fmt(m.get('index_build_time_current_s'), decimals=1)} s") print(f" search time (baseline) : {fmt(m.get('search_time_baseline_s'), decimals=1)} s") print(f" search time (current) : {fmt(m.get('search_time_current_s'), decimals=1)} s") - print(f" PSMs >= {XCORR_THRESHOLD} (baseline) : {fmt(m.get('base_psm_count'))}") - print(f" PSMs >= {XCORR_THRESHOLD} (current) : {fmt(m.get('curr_psm_count'))}") - af = m.get("agree_frac") - if af is not None: - pct = af * 100 - print(f" top-peptide agreement : {m['agree_top_peptide']:>8} / " - f"{m['common_scans']} common scans ({pct:.2f}%)") - print(f" only in baseline : {fmt(m.get('only_in_baseline'))}") - print(f" only in current : {fmt(m.get('only_in_current'))}") - else: - print(f" top-peptide agreement : {'N/A':>8}") + print_comparison("", m) + + decoy_file = m.get("decoy_file") + if decoy_file is not None: + print(f" -- separate decoy file (decoy_search=2) --") + print_comparison("decoy ", decoy_file) print(sep) @@ -418,6 +486,11 @@ def main(): help=f"baseline release tags (default: {DEFAULT_TAGS})") parser.add_argument("--modes", nargs="+", default=MODES, choices=MODES, help=f"search modes to run (default: all)") + parser.add_argument("--decoy-variants", nargs="+", default=DEFAULT_DECOY_VARIANTS, + choices=DEFAULT_DECOY_VARIANTS, + help=f"decoy_search configurations to test (default: all). 
" + f"internaldecoy1/internaldecoy2 are skipped for the fi mode -- " + f"FI does not support Comet's internal decoy generation.") parser.add_argument("--data", type=Path, default=DATA_DIR, help=f"directory with FASTA, mzXML, and params (default: {DATA_DIR})") args = parser.parse_args() @@ -429,7 +502,13 @@ def main(): MZXML_FILE = args.data / MZXML_FILE.name PARAMS_FILE = args.data / PARAMS_FILE.name - for req, label in [(FASTA_FILE, "FASTA"), (MZXML_FILE, "mzXML"), (PARAMS_FILE, "params")]: + decoy_variant_paths = { + v: (args.data / DECOY_VARIANT_FILENAMES[v]) for v in args.decoy_variants + } + + required = [(FASTA_FILE, "FASTA"), (MZXML_FILE, "mzXML")] + required += [(p, f"{v} params") for v, p in decoy_variant_paths.items()] + for req, label in required: if not req.exists(): print(f"ERROR: {label} file not found: {req}", file=sys.stderr) sys.exit(1) @@ -438,9 +517,9 @@ def main(): print(f"ERROR: current binary not found: {args.current}", file=sys.stderr) sys.exit(1) - base_params = load_params(PARAMS_FILE) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - had_error = False + decoy_variant_params = {v: load_params(p) for v, p in decoy_variant_paths.items()} + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + had_error = False for tag in args.tags: baseline_bin = BASELINES_DIR / tag / ("Comet.exe" if IS_WINDOWS else "comet") @@ -456,15 +535,25 @@ def main(): run_root = RESULTS_DIR / f"{timestamp}_{tag}" tag_metrics = [] - for mode in args.modes: - try: - m = run_mode(mode, args.current, baseline_bin, - base_params, run_root / mode) - except Exception as e: - print(f" [{mode}] FAILED: {e}", file=sys.stderr) - m = {"mode": mode, "error": str(e)} - had_error = True - tag_metrics.append(m) + for variant in args.decoy_variants: + for mode in args.modes: + if mode not in DECOY_VARIANT_MODES[variant]: + print(f" [{variant}/{mode}] SKIPPED: FI does not support " + f"Comet's internal decoy generation") + tag_metrics.append({ + "mode": mode, "decoy_variant": 
variant, "skipped": True, + "skip_reason": "FI does not support Comet's internal decoy generation", + }) + continue + try: + m = run_mode(mode, args.current, baseline_bin, + decoy_variant_params[variant], run_root / variant / mode) + except Exception as e: + print(f" [{variant}/{mode}] FAILED: {e}", file=sys.stderr) + m = {"mode": mode, "error": str(e)} + had_error = True + m["decoy_variant"] = variant + tag_metrics.append(m) print_report(tag_metrics, args.current, tag) diff --git a/tests/regression/test_raw_vs_mzxml.py b/tests/regression/test_raw_vs_mzxml.py new file mode 100644 index 00000000..6c33cea6 --- /dev/null +++ b/tests/regression/test_raw_vs_mzxml.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +""" +Windows .raw file support test -- compares the same Windows Comet binary +searching the identical Hela run via .mzXML vs .raw, across all 5 output +formats (txt, sqt, pep.xml, mzid, pin). + +Only the Windows release reads .raw files directly (Thermo vendor library); +this test is SKIPPED (exit 0, not a failure) when given a non-Windows binary +or when the .raw fixture is absent, since both are expected/documented +conditions rather than test failures. + +Goal: confirm (a) .raw file reading works correctly -- the .mzXML and .raw +searches should agree "near exactly" (same underlying spectra, two different +encodings, so tiny floating-point/centroiding differences are tolerated but +not large disagreements) -- and (b) every enabled output format is valid and +non-empty for both input formats, not just the default .txt. + +Usage: + python test_raw_vs_mzxml.py + python test_raw_vs_mzxml.py --comet ../../x64/Release/Comet.exe + python test_raw_vs_mzxml.py --data ../../data +""" + +import argparse +import re +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.resolve())) +import run_regression as rr # reuse run_comet/parse_txt/compare_results/patch_params/etc. 
+ +REGRESSION_DIR = Path(__file__).parent.resolve() +REPO_ROOT = REGRESSION_DIR.parent.parent +DATA_DIR = REPO_ROOT / "data" +DEFAULT_COMET_WIN = REPO_ROOT / "x64" / "Release" / "Comet.exe" + +FASTA_FILE = DATA_DIR / "human.small.fasta" +MZXML_FILE = DATA_DIR / "20250520_Hela_60min_06.mzXML" +RAW_FILE = DATA_DIR / "20250520_Hela_60min_06.raw" +PARAMS_FILE = DATA_DIR / "comet_phospho.params" + +XCORR_THRESHOLD = 2.5 # same bar used by run_regression.py's .txt PSM comparison +MIN_AGREE_FRAC = 0.99 # "near exact" bar -- not byte-exact, since .raw and .mzXML + # are two different encodings of the same underlying spectra +MAX_COUNT_DRIFT = 0.01 # 1% tolerance on record counts for the non-txt formats (these + # are spectrum-processed counts, not scoring-threshold-sensitive, + # so they should track each other tightly) +MAX_PSM_COUNT_DRIFT = 0.05 # 5% tolerance on the xcorr>=threshold PSM count itself -- + # looser than MAX_COUNT_DRIFT because this count is sensitive to + # borderline scores flipping across the threshold from the tiny + # numeric differences between vendor-raw and converted-mzXML peaks + +# format label -> (params flag to enable it, output file extension) +OUTPUT_FORMATS = { + "txt": ("output_txtfile", ".txt"), + "sqt": ("output_sqtfile", ".sqt"), + "pepxml": ("output_pepxmlfile", ".pep.xml"), + "mzidentml": ("output_mzidentmlfile", ".mzid"), + "percolator": ("output_percolatorfile", ".pin"), +} + + +# --------------------------------------------------------------------------- +# Windows-binary / path helpers (binary-driven, not host-OS-driven -- this +# script is meant to invoke a Windows .exe from any host, e.g. 
via WSL interop) +# --------------------------------------------------------------------------- + +def is_windows_binary(path: Path) -> bool: + try: + with open(path, "rb") as f: + return f.read(2) == b"MZ" + except Exception: + return False + + +def to_win_path(p: Path) -> str: + s = str(p) + if s.startswith("/mnt/"): + parts = s[5:].split("/", 1) + drive = parts[0].upper() + ":" + rest = parts[1].replace("/", "\\") if len(parts) > 1 else "" + return drive + "\\" + rest + return s + + +# --------------------------------------------------------------------------- +# Lightweight per-format record counters (just enough to confirm "valid and +# not blank", plus a count to compare between the mzXML and .raw runs). +# Full peptide-level comparison is only done for .txt, via run_regression's +# already-proven parse_txt()/compare_results(). +# --------------------------------------------------------------------------- + +def count_sqt_spectra(path: Path) -> int: + if not path.exists(): + return 0 + n = 0 + with open(path, encoding="utf-8", errors="replace") as fh: + for line in fh: + if line.startswith("S\t"): + n += 1 + return n + + +def count_pepxml_spectra(path: Path) -> int: + if not path.exists(): + return 0 + n = 0 + with open(path, encoding="utf-8", errors="replace") as fh: + for line in fh: + n += line.count(" int: + if not path.exists(): + return 0 + n = 0 + with open(path, encoding="utf-8", errors="replace") as fh: + for line in fh: + n += len(re.findall(r" bool: + if a == b: + return True + denom = max(a, b, 1) + return abs(a - b) / denom <= tol + + +# --------------------------------------------------------------------------- +# Search execution +# --------------------------------------------------------------------------- + +def run_one_search(comet: Path, params_path: Path, input_file: Path, work_dir: Path): + """Run comet against input_file; return elapsed seconds.""" + elapsed, _ = rr.run_comet( + comet, + [f"-P{to_win_path(params_path)}", 
to_win_path(input_file)], + work_dir, + ) + return elapsed + + +def collect_outputs(input_file: Path, dest_dir: Path, label: str) -> dict: + """ + Move (not copy) every produced output file from next to input_file into + dest_dir/