From 4337ee8dfdea3a828a2cb420fc7109c175c5a90d Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Fri, 12 Jun 2026 19:43:11 -0700 Subject: [PATCH 01/15] implement architecture migration phases 1-3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 – Split CometDataInternal.h monolith (1554 lines) into three focused headers: core/Constants.h – all #define constants and DbType enum core/Params.h – parameter structs (StaticParams, Options, etc.) core/Types.h – runtime types (Query, Results, index structs, externs) CometDataInternal.h becomes a pure forwarding shim; no .cpp files changed. Phase 2 – Extract SearchMemoryPool class (threading/SearchMemoryPool.h/.cpp) Owns the duplicate-fragment scratch arrays previously managed via raw bool* statics in CometSearch. Uses its own mutex+CV, keeping the FASTA_DB SearchThreadProc availability array (_pbSearchMemoryPool) as a separate allocation. FI/PI paths call acquireSlot()/releaseSlot() directly. Phase 3 – IResultWriter interface + 5 adapter wrappers (output/) Replaces ~316-line if (bOutputXxx) dispatch chain in DoSearch() with a factory that builds a vector of IResultWriter-derived writers and calls open(), write(), close() polymorphically. Each writer owns its file handle(s); close(bSucceeded, bEmpty) handles format footers, fclose, and remove-on-empty-search. SQT writer is inserted last to preserve the existing invariant that WriteSqt runs after all other formats. All 17 unit tests pass. 
Co-Authored-By: Claude Sonnet 4.6 --- CometSearch/CometDataInternal.h | 1540 +------------------- CometSearch/CometSearch.cpp | 77 +- CometSearch/CometSearchManager.cpp | 510 +------ CometSearch/Makefile | 9 +- CometSearch/core/Constants.h | 109 ++ CometSearch/core/Params.h | 664 +++++++++ CometSearch/core/Types.h | 843 +++++++++++ CometSearch/output/IResultWriter.h | 62 + CometSearch/output/MzIdentMlWriter.h | 154 ++ CometSearch/output/PepXmlWriter.h | 105 ++ CometSearch/output/PercolatorWriter.h | 64 + CometSearch/output/SqtWriter.h | 104 ++ CometSearch/output/TxtWriter.h | 105 ++ CometSearch/threading/SearchMemoryPool.cpp | 90 ++ CometSearch/threading/SearchMemoryPool.h | 64 + docs/20260612_architecture_migration.md | 891 +++++++++++ tests/unit/data/t12_minlen.fasta.idx | Bin 1141 -> 1145 bytes tests/unit/data/t14_boundary.fasta.idx | Bin 1374 -> 1378 bytes tests/unit/data/t15_IL_long.fasta.idx | Bin 1461 -> 1465 bytes tests/unit/data/t15_IL_short.fasta.idx | Bin 1450 -> 1454 bytes tests/unit/data/t16_crosspath.fasta.idx | Bin 2644 -> 2648 bytes tests/unit/data/t1_basic.fasta.idx | Bin 1403 -> 1407 bytes tests/unit/data/t2_repeat.fasta.idx | Bin 1295 -> 1299 bytes tests/unit/data/t3_shared.fasta.idx | Bin 1708 -> 1712 bytes tests/unit/data/t4_IL.fasta.idx | Bin 3595 -> 3599 bytes tests/unit/data/t5_enzyme.fasta.idx | Bin 1251 -> 1255 bytes tests/unit/data/t6_flanking.fasta.idx | Bin 1453 -> 1457 bytes tests/unit/data/t7_mass.fasta.idx | Bin 1442 -> 1446 bytes 28 files changed, 3363 insertions(+), 2028 deletions(-) create mode 100644 CometSearch/core/Constants.h create mode 100644 CometSearch/core/Params.h create mode 100644 CometSearch/core/Types.h create mode 100644 CometSearch/output/IResultWriter.h create mode 100644 CometSearch/output/MzIdentMlWriter.h create mode 100644 CometSearch/output/PepXmlWriter.h create mode 100644 CometSearch/output/PercolatorWriter.h create mode 100644 CometSearch/output/SqtWriter.h create mode 100644 CometSearch/output/TxtWriter.h 
create mode 100644 CometSearch/threading/SearchMemoryPool.cpp create mode 100644 CometSearch/threading/SearchMemoryPool.h create mode 100644 docs/20260612_architecture_migration.md diff --git a/CometSearch/CometDataInternal.h b/CometSearch/CometDataInternal.h index 61d354ee..4eaa0fc9 100644 --- a/CometSearch/CometDataInternal.h +++ b/CometSearch/CometDataInternal.h @@ -12,1543 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Compatibility shim: existing .cpp files continue to include this header +// unchanged. All content has moved to the three focused headers below. +// New code should include the specific header it needs instead of this one. #ifndef _COMETDATAINTERNAL_H_ #define _COMETDATAINTERNAL_H_ -#include -#include -#include -#include "CometData.h" -#include "Threading.h" -#include "AScoreOptions.h" -#include "AScoreCentroid.h" -#include "AScoreAPI.h" -#include "AScoreFactory.h" -#include "AScoreDllInterface.h" - - -class CometSearchManager; - -#define PROTON_MASS 1.00727646688 -#define C13_DIFF 1.00335483 - -#define FLOAT_ZERO 1e-6 // 0.000001 - -#define MIN_PEPTIDE_LEN 1 // min # of AA for a petpide -#define MAX_PEPTIDE_LEN 51 // max # of AA for a peptide; one more than actual # to account for terminating char -#define MAX_PEPTIDE_LEN_P2 53 // max # of AA for a peptide plus 2 for N/C-term - -#define FRAGINDEX_MIN_IONS_SCORE 3 // min # of matched ions for peptide to register for E-value xcorr histogram -#define FRAGINDEX_MIN_IONS_REPORT 3 // min # of matched ions for peptide to be reported -#define FRAGINDEX_MIN_MASS 200.0 // minimum fragment ion mass used to generate fragment index -#define FRAGINDEX_MAX_MASS 2000.0 // maximum fragment ion mass used to generate fragment index -#define FRAGINDEX_MAX_BATCHSIZE 1000 // maximum number of spectra loaded when querying fragment index -#define FRAGINDEX_MAX_NUMPEAKS 150 // number of spectrum peaks used to query fragment index -#define 
FRAGINDEX_MAX_NUMSCORED 100 // for each fragment index spectrum query, score up to this many peptides -#define FRAGINDEX_MAX_COMBINATIONS 2000 -#define FRAGINDEX_MAX_MODS_PER_MOD 5 -#define FRAGINDEX_KEEP_ALL_PEPTIDES 1 // 1 = consider up to FRAGINDEX_MAX_COMBINATIONS of peptides; 0 = ignore all mods for peptide that exceed FRAGINDEX_MAX_COMBINATIONS - -#define MS1_MIN_MASS 0.0 // only parse up to this mass in MS1 scans for MS1 library searches -#define MS1_MAX_MASS 3000.0 // only parse up to this mass in MS1 scans for MS1 library searches -#define MS1_RT_HISTORY_SIZE 250 // size of MS1 RT history kept for recent history linear regression -#define MS1_RT_OUTLIER_THRESHOLD 2.0 // # stdev outlier threshold for MS1 RT history - -#define MAX_PEFFMOD_LEN 16 -#define SIZE_MASS 128 // ascii value size -#define SIZE_NATIVEID 256 // max length of nativeID string -#define NUM_SP_IONS 1000 // num ions for preliminary scoring -#define NUM_ION_SERIES 7 // a,b,c,x,y,z,z1 -#define EXPECT_DECOY_SIZE 3000 // number of decoy entries in CometDecoys.h - -#define WIDTH_REFERENCE 256 // length of the protein accession field to store -#define MAX_PROTEINS 50 // maximum number of proteins to return for each query; for index search only right now - -#define HISTO_SIZE 152 // some number greater than 150 - -#define NO_PEFF_VARIANT -127 - -#define ASCORE_CUTOFF_TO_ACCEPT 13.0 // minimum AScore value to accept localization - -#define FRAGINDEX_VMODS 5 // only parse first five variable mods for fragment ion index searches - // if this is ever larger than 16, need to extend range of siVarModProteinFilter - -#define VMODS 15 // also "VMODS+1" is 4th dimension of uiBinnedIonMasses to cover unmodified ions (0), mod NL (1-15) -#define COMPOUNDMODS_OFFSET 100 // piVarModSites values >= 100 encode compound mods; index = value - 100 -#define VMOD_1_INDEX 0 -#define VMOD_2_INDEX 1 -#define VMOD_3_INDEX 2 -#define VMOD_4_INDEX 3 -#define VMOD_5_INDEX 4 -#define VMOD_6_INDEX 5 -#define VMOD_7_INDEX 6 
-#define VMOD_8_INDEX 7 -#define VMOD_9_INDEX 8 -#define VMOD_10_INDEX 9 -#define VMOD_11_INDEX 10 -#define VMOD_12_INDEX 11 -#define VMOD_13_INDEX 12 -#define VMOD_14_INDEX 13 -#define VMOD_15_INDEX 14 - -#define ENZYME_SINGLE_TERMINI 1 -#define ENZYME_DOUBLE_TERMINI 2 -#define ENZYME_N_TERMINI 8 -#define ENZYME_C_TERMINI 9 - -#define ION_SERIES_A 0 -#define ION_SERIES_B 1 -#define ION_SERIES_C 2 -#define ION_SERIES_X 3 -#define ION_SERIES_Y 4 -#define ION_SERIES_Z 5 -#define ION_SERIES_Z1 6 //z+1 - -#ifdef CRUX -#define XCORR_CUTOFF -999.0 -#else -#define XCORR_CUTOFF 1E-8 // some near-zero cutoff -#endif - -#define SPECLIB_CUTOFF -999.9 - -struct Options -{ - int iNumPeptideOutputLines; - int iWhichReadingFrame; - int iEnzymeTermini; - int iNumStored; // # of search results to store for xcorr analysis - int iMaxDuplicateProteins; // maximum number of duplicate proteins to report or store in idx file - int iSpectrumBatchSize; // # of spectra to search at a time within the scan range - int iStartCharge; - int iEndCharge; - int iMaxFragmentCharge; - int iMinPrecursorCharge; - int iMaxPrecursorCharge; - int iMSLevel; // filter query scans in raw/mzML/mzXML input by ms level (aka MS2, MS3) - int iSpecLibMSLevel; // filter speclib scans in raw/mzML/mzXML input by ms level (aka MS2, MS3) - int iMinPeaks; - int iRemovePrecursor; // 0=no, 1=yes, 2=ETD precursors, 3=phosphate neutral loss - int iDecoySearch; // 0=no, 1=concatenated search, 2=separate decoy search - int iNumThreads; // 0=poll CPU else set # threads to spawn - int iNumFragmentThreads; // # threads used for fragment indexing - bool bResolveFullPaths; // 0=do not resolve full paths; 1=resolve paths (default) - bool bOutputSqtStream; - bool bOutputSqtFile; - bool bOutputTxtFile; - bool bOutputPepXMLFile; - int iOutputMzIdentMLFile; - bool bOutputPercolatorFile; - bool bClipNtermMet; // 0=leave protein sequences alone; 1=also consider w/o N-term methionine - bool bClipNtermAA; // 0=leave peptide sequences 
as-is; 1=clip N-term amino acid from every peptide - bool bMango; // 0=normal; 1=Mango x-link ms2 input - bool bScaleFragmentNL; // 0=no; 1=scale fragment NL for each modified residue contained in fragment - bool bCreateFragmentIndex; // 0=normal search; 1=create fragment ion index plain peptide file - bool bCreatePeptideIndex; // 0=normal search; 1=create peptide index file; only one of bCreateFragmentIndex and bCreatePeptideIndex can be 1 - bool bFastPlainPeptideIdx; // 0=legacy RunSearch path; 1=use PepGenTuple per-thread buffers (avoids heap alloc) - bool bVerboseOutput; - bool bExplicitDeltaCn; // if set to 1, do not use sequence similarity logic - bool bPrintExpectScore; - bool bExportAdditionalScoresPepXML; // if 1, also report lnrSp, lnExpect, IonFrac, lnNumSP to pepXML output - bool bCorrectMass; // use selectionMZ instead of monoMZ if monoMZ is outside selection window - bool bTreatSameIL; - int iPrintAScoreProScore; // 0=no, otherwise specify variable_modXX number e.g. 1 for variable_mod01 - int iMaxIndexRunTime; // max run time of index search in milliseconds - int iFragIndexMinIonsScore; // minimum matched fragment index ions for scoring - int iFragIndexMinIonsReport; // minimum matched fragment index ions for reporting - int iFragIndexNumSpectrumPeaks; // # of peaks from spectrum to use for querying fragment index - int iFragIndexSkipReadPrecursors; // if true, skips reading precursors step - int iOverrideCharge; - long lMaxIterations; // max # of modification permutations for each iStart position - double dMinIntensity; // intensity cutoff for each peak - double dMinPercentageIntensity; // intensity cutoff for each peak as % of base peak - double dRemovePrecursorTol; - double dPeptideMassLow; // MH+ mass - double dPeptideMassHigh; // MH+ mass - double dMinimumXcorr; // set the minimum xcorr to report (default is 1e-8) - double dFragIndexMaxMass; // fragment index maximum fragment mass - double dFragIndexMinMass; // fragment index minimum fragment 
mass - double dMS1MinMass; // low mass cutoff in MS1 query/library spectra - double dMS1MaxMass; // high mass cutoff in MS1 query/library spectra - IntRange scanRange; - IntRange peptideLengthRange; - DoubleRange clearMzRange; - char szActivationMethod[24]; // mzXML only - string sPinProteinDelimiter; // PIN file protein delimiter; default tab - - Options& operator=(Options& a) - { - iNumPeptideOutputLines = a.iNumPeptideOutputLines; - iWhichReadingFrame = a.iWhichReadingFrame; - iEnzymeTermini = a.iEnzymeTermini; - iNumStored = a.iNumStored; - iMaxDuplicateProteins = a.iMaxDuplicateProteins; - iSpectrumBatchSize = a.iSpectrumBatchSize; - iStartCharge = a.iStartCharge; - iEndCharge = a.iEndCharge; - iMaxFragmentCharge = a.iMaxFragmentCharge; - iMinPrecursorCharge = a.iMinPrecursorCharge; - iMaxPrecursorCharge = a.iMaxPrecursorCharge ; - iMSLevel = a.iMSLevel; - iMinPeaks = a.iMinPeaks; - iRemovePrecursor = a.iRemovePrecursor; - iDecoySearch = a.iDecoySearch; - iNumThreads = a.iNumThreads; - bResolveFullPaths = a.bResolveFullPaths; - bOutputSqtStream = a.bOutputSqtStream; - bOutputSqtFile = a.bOutputSqtFile; - bOutputTxtFile = a.bOutputTxtFile; - bOutputPepXMLFile = a.bOutputPepXMLFile; - iOutputMzIdentMLFile = a.iOutputMzIdentMLFile; - bOutputPercolatorFile = a.bOutputPercolatorFile; - bClipNtermMet = a.bClipNtermMet; - bClipNtermAA = a.bClipNtermAA; - bMango = a.bMango; - bScaleFragmentNL = a.bScaleFragmentNL; - bCreatePeptideIndex = a.bCreatePeptideIndex; - bCreateFragmentIndex = a.bCreateFragmentIndex; - bFastPlainPeptideIdx = a.bFastPlainPeptideIdx; - bVerboseOutput = a.bVerboseOutput; - bExplicitDeltaCn = a.bExplicitDeltaCn; - bPrintExpectScore = a.bPrintExpectScore; - iPrintAScoreProScore = a.iPrintAScoreProScore; - bExportAdditionalScoresPepXML = a.bExportAdditionalScoresPepXML; - iOverrideCharge = a.iOverrideCharge; - bCorrectMass = a.bCorrectMass; - bTreatSameIL = a.bTreatSameIL; - iMaxIndexRunTime = a.iMaxIndexRunTime; - lMaxIterations = a.lMaxIterations; 
- dMinIntensity = a.dMinIntensity; - dMinPercentageIntensity = a.dMinPercentageIntensity; - dRemovePrecursorTol = a.dRemovePrecursorTol; - dPeptideMassLow = a.dPeptideMassLow; - dPeptideMassHigh = a.dPeptideMassHigh; - dMinimumXcorr = a.dMinimumXcorr; - scanRange = a.scanRange; - peptideLengthRange = a.peptideLengthRange; - clearMzRange = a.clearMzRange; - strcpy(szActivationMethod, a.szActivationMethod); - sPinProteinDelimiter = a.sPinProteinDelimiter; - - dFragIndexMinMass = a.dFragIndexMinMass; - dFragIndexMaxMass = a.dFragIndexMaxMass; - iFragIndexMinIonsScore = a.iFragIndexMinIonsScore; - iFragIndexMinIonsReport = a.iFragIndexMinIonsReport ; - iFragIndexNumSpectrumPeaks = a.iFragIndexNumSpectrumPeaks; - iFragIndexSkipReadPrecursors = a.iFragIndexSkipReadPrecursors; - - dMS1MinMass = a.dMS1MinMass; - dMS1MaxMass = a.dMS1MaxMass; - - return *this; - } -}; - -struct Results -{ - double dPepMass; - double dExpect; - float fScoreSp; - float fXcorr; - float fDeltaCn; - float fLastDeltaCn; - float fAScorePro; // AScorePro score - unsigned short usiRankXcorr; - unsigned short usiLenPeptide; - unsigned short usiRankSp; - unsigned short usiMatchedIons; - unsigned short usiTotalIons; - comet_fileoffset_t lProteinFilePosition; // for indexdb, this is the entry in g_pvProteinsList - long lWhichProtein; // which entry in g_pvProteinsList[] contains the matched proteins - int piVarModSites[MAX_PEPTIDE_LEN_P2]; // store variable mods encoding, +2 to accomodate N/C-term - double pdVarModSites[MAX_PEPTIDE_LEN_P2]; // store variable mods mass diffs, +2 to accomodate N/C-term - char pszMod[MAX_PEPTIDE_LEN][MAX_PEFFMOD_LEN]; // store PEFF mod string - char szPeptide[MAX_PEPTIDE_LEN]; - char cPrevAA; // stores prev flanking AA - char cNextAA; // stores following flanking AA - bool bClippedM; // true if new N-term protein due to clipped methionine - char cHasVariableMod; // HasVariableModType enum: 0 = no variable mod, 1 = has variable mod, 2 = has AScorePro mod - string 
sPeffOrigResidues; // original residue(s) of a PEFF variant - string sAScoreProSiteScores; // AScorePro site scores as comma-separated string - int iPeffOrigResiduePosition; // position of PEFF variant substitution; -1 = n-term, iLenPeptide = c-term; -9=unused - int iPeffNewResidueCount; // more than 0 new residues is a substitution (if iPeffOrigResidueCount=1) or insertion (if iPeffOrigResidueCount>1) - vector pWhichProtein; // file positions of matched protein entries - vector pWhichDecoyProtein; // keep separate decoy list (used for separate decoy matches and combined results) -}; - -struct SpecLibResults // MS2 spec lib -{ - unsigned int iWhichSpecLib; // the matched spectral library entry - float fSpecLibScore; - float fXcorr; // use xcorr for now - float fCn; // speclib score - float fRTtime; // retention time in seconds of the matched entry -}; - -struct SpecLibResultsMS1 // MS1 spec lib -{ - unsigned int iWhichSpecLib; // the matched spectral library entry - float fDotProduct; // unit vector dot product aka cosine similarity - float fRTime; // retention time in seconds of the matched entry -}; - -struct PepMassInfo -{ - double dCalcPepMass; - double dExpPepMass; // protonated MH+ experimental mass - double dPeptideMassToleranceLow; // mass tolerance low in amu from experimental mass - double dPeptideMassToleranceHigh; // mass tolerance high in amu from experimental mass - double dPeptideMassToleranceMinus; // low end of mass tolerance range including isotope offsets - double dPeptideMassTolerancePlus; // high end of mass tolerance range including isotope offsets -}; - -struct SpectrumInfoInternal -{ - int iArraySize; // m/z versus intensity array - int iHighestIon; - int iScanNumber; - unsigned short usiChargeState; - unsigned short usiMaxFragCharge; - double dTotalIntensity; - float fRTime; - char szMango[32]; // Mango encoding - char szNativeID[SIZE_NATIVEID]; // nativeID string from mzML -}; - -// The minimum and maximum mass range of all peptides to 
consider -// i.e. lowestPepMass - tolerance to highestPepMass + tolerance -struct MassRange -{ - double dMinMass; - double dMaxMass; - unsigned short usiMaxFragmentCharge; // global maximum fragment charge - bool bNarrowMassRange; // used to determine how to parse peptides in SearchForPeptides - unsigned int uiMaxFragmentArrayIndex; // BIN(dFragIndexMaxMass); used as fragment array index -}; - -extern MassRange g_massRange; - -// PreprocessStruct stores information used in preprocessing -// each spectrum. Information not kept around otherwise -struct PreprocessStruct -{ - int iHighestIon; - double dHighestIntensity; -}; - -struct OBOStruct // stores info read from OBO file -{ - double dMassDiffAvg; // this is looked up from strMod string from OBO - double dMassDiffMono; - string strMod; // mod string, PSI-MOD, Unimod or custom - - bool operator<(const OBOStruct& a) const - { - return (strMod < a.strMod); - } -}; - -struct ProteinEntryStruct -{ - comet_fileoffset_t lWhichProtein; // file pointer to protein - int iStartResidue; // start residue position in protein (1-based) - char cPrevAA; - char cNextAA; - - bool operator<(const ProteinEntryStruct& a) const - { - return (lWhichProtein < a.lWhichProtein); - } -}; - -struct PeffModStruct // stores info read from PEFF header -{ - double dMassDiffAvg; // this is looked up from strMod string from OBO - double dMassDiffMono; - int iPosition; // position of modification - char szMod[MAX_PEFFMOD_LEN]; - - bool operator<(const PeffModStruct& a) const - { - return (iPosition < a.iPosition); - } -}; - -struct PeffVariantSimpleStruct // stores info read from PEFF header -{ - int iPosition; // position of variant - char cResidue; // new variant - - bool operator<(const PeffVariantSimpleStruct& a) const - { - return (iPosition < a.iPosition); - } -}; - -struct PeffVariantComplexStruct // stores info read from PEFF header -{ - int iPositionA; // start position of variant - int iPositionB; // end position of variant - string 
sResidues; // if !empty(), insertion replacing aa from pos A to B; - // if empty(), deletion of aa from pos A to B - - bool operator<(const PeffVariantComplexStruct& a) const - { - return (iPositionA < a.iPositionA); - } -}; - -struct PeffProcessedStruct -{ - int iBeginResidue; - int iEndResidue; -}; - -struct PeffPositionStruct // collate PEFF mods by position in sequence -{ - int iPosition; // position within the sequence - vector vectorWhichPeff; // which specific peff entry from PeffModStruct - vector vectorMassDiffAvg; - vector vectorMassDiffMono; -}; - -struct PeffSearchStruct // variant info passed to SearchForPeptides -{ - int iPosition; - bool bBeginCleavage; - bool bEndCleavage; - char cOrigResidue; -}; - -//-->MH -typedef struct sDBEntry -{ - string strName; // might be able to delete this here - string strSeq; - comet_fileoffset_t lProteinFilePosition; - vector vectorPeffMod; - vector vectorPeffVariantSimple; - vector vectorPeffVariantComplex; - vector vectorPeffProcessed; -} sDBEntry; - -struct DBInfo -{ - char szDatabase[SIZE_FILE]; - char szFileName[SIZE_FILE]; - int iTotalNumProteins; - unsigned long int uliTotAACount; - - DBInfo& operator=(DBInfo& a) - { - strcpy(szDatabase, a.szDatabase); - strcpy(szFileName, a.szFileName); - iTotalNumProteins = a.iTotalNumProteins; - uliTotAACount = a.uliTotAACount; - - return *this; - } -}; - -struct DBIndex -{ - vector pcVarModSites; // empty = unmodified; else [iLen+2] encoding var mods - comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList - double dPepMass; // MH+ pep mass - unsigned short siVarModProteinFilter; // bitwise representation of mmapProtein - char cPrevAA; - char cNextAA; - char sPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated - - bool operator==(const DBIndex& rhs) const - { - if (strcmp(sPeptide, rhs.sPeptide) != 0) - return false; - - if (fabs(dPepMass - rhs.dPepMass) > FLOAT_ZERO) - return false; - - int iLen = (int)strlen(sPeptide) + 2; - for 
(int i = 0; i < iLen; ++i) - { - char l = pcVarModSites.empty() ? 0 : pcVarModSites[i]; - char r = rhs.pcVarModSites.empty() ? 0 : rhs.pcVarModSites[i]; - if (l != r) - return false; - } - - return true; - } - - bool operator<(const DBIndex& rhs) const - { - int cmp = strcmp(sPeptide, rhs.sPeptide); - if (cmp != 0) - return cmp < 0; - - if (fabs(dPepMass - rhs.dPepMass) > FLOAT_ZERO) - return dPepMass < rhs.dPepMass; - - int iLen = (int)strlen(sPeptide) + 2; - for (int i = 0; i < iLen; ++i) - { - char l = pcVarModSites.empty() ? 0 : pcVarModSites[i]; - char r = rhs.pcVarModSites.empty() ? 0 : rhs.pcVarModSites[i]; - if (l != r) - return l < r; - } - - // FINAL tie-breaker: lowest protein index first in order - // to grab flanking residues from the first protein - return lIndexProteinFilePosition < rhs.lIndexProteinFilePosition; - } -}; - -// Compact fixed-size tuple used during plain-peptide index generation. -// Replaces heap-heavy DBIndex entries during the per-thread collection phase. -struct PepGenTuple -{ - char sPeptide[MAX_PEPTIDE_LEN]; // original AA letters (or L->I canonical), null-terminated - double dPepMass; // MH+ mass - comet_fileoffset_t lProteinFileOffset;// FASTA byte offset of the source protein - uint16_t siVarModProteinFilter; - char cPrevAA; - char cNextAA; -}; - -// --------------------------------------------------------------------------- -// 5-bit amino acid encoding for per-length short-peptide key packing. -// AAs are mapped in ASCII sort order (A=1, C=2, ..., Y=20) so that sorting -// packed uint64 keys is equivalent to lexicographic sort of sequences within -// a given peptide length. 
-// --------------------------------------------------------------------------- -static constexpr uint8_t kAA5bit[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0-15 - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 16-31 - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 32-47 - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 48-63 - 0, // 64 '@' - 1, // 65 'A' - 0, // 66 'B' - 2, // 67 'C' - 3, // 68 'D' - 4, // 69 'E' - 5, // 70 'F' - 6, // 71 'G' - 7, // 72 'H' - 8, // 73 'I' (canonical for I/L when bTreatSameIL) - 0, // 74 'J' - 9, // 75 'K' - 10, // 76 'L' (remapped to 8 when bTreatSameIL) - 11, // 77 'M' - 12, // 78 'N' - 0, // 79 'O' - 13, // 80 'P' - 14, // 81 'Q' - 15, // 82 'R' - 16, // 83 'S' - 17, // 84 'T' - 0, // 85 'U' - 18, // 86 'V' - 19, // 87 'W' - 0, // 88 'X' - 20, // 89 'Y' - 0, // 90 'Z' - // 91-255: all zeros - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0 -}; - -// Reverse map: 5-bit code -> amino acid character. -// Code 8 always decodes to 'I' (canonical; L maps to code 8 when bTreatSameIL). -static constexpr char k5bitAA[32] = { - '\0','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R', - 'S', 'T','V','W','Y','\0','\0','\0','\0','\0','\0','\0','\0','\0','\0','\0' -}; - -// Pack up to 12 amino acids into a uint64 key (5 bits each, 60 bits total). -// When bTreatSameIL is true, L encodes identically to I. 
-inline uint64_t PackPeptide(const char* seq, int iLen, bool bTreatSameIL) -{ - uint64_t key = 0; - for (int i = 0; i < iLen; ++i) - { - char c = seq[i]; - if (bTreatSameIL && c == 'L') c = 'I'; - key |= ((uint64_t)kAA5bit[(unsigned char)c] << (55 - i * 5)); - } - return key; -} - -// Decode a packed key back to a null-terminated sequence of iLen characters. -inline void UnpackPeptide(uint64_t key, int iLen, char* seq) -{ - for (int i = 0; i < iLen; ++i) - seq[i] = k5bitAA[(key >> (55 - i * 5)) & 0x1F]; - seq[iLen] = '\0'; -} - -// Compact per-thread tuple for short peptides (len <= 12) during index generation. -// 32 bytes on 64-bit (8-byte alignment); uILMask occupies 2 of the 4 trailing pad bytes. -struct PepGenTupleShort -{ - uint64_t uPackedPep; // canonical 5-bit-encoded sequence (L treated as I when bTreatSameIL) - double dPepMass; - comet_fileoffset_t lProteinFileOffset; - uint16_t siVarModProteinFilter; - char cPrevAA; - char cNextAA; - uint16_t uILMask; // bitmask: bit k = 1 means position k was 'L' in FASTA original -}; - -// This is used for fragment indexing; plain peptides are stored in index -// file and read in to this data struct. Same as DBIndex w/o pcVarModSites[] -struct PlainPeptideIndexStruct -{ - comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList - double dPepMass; // MH+ pep mass, unmodified mass; modified mass in FragmentPeptidesStruct - unsigned short siVarModProteinFilter; // bitwise representation of mmapProtein - char cPrevAA; - char cNextAA; - char szPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated - - bool operator==(const PlainPeptideIndexStruct &rhs) const - { - return strcmp(szPeptide, rhs.szPeptide) == 0; - } -}; - -struct FragmentPeptidesStruct -{ - size_t iWhichPeptide; // reference to raw peptide (sequence, proteins, etc.) 
in PlainPeptideIndexStruct - int modNumIdx; - double dPepMass; // peptide mass (modified or unmodified) after permuting mods - char cNtermMod; - char cCtermMod; - - bool operator<(const FragmentPeptidesStruct& a) const - { - return dPepMass < a.dPepMass; - } -}; - -struct SpecLibInfo // why a struct for just a string??? -{ - string strSpecLibFile; -}; - -struct SpecLibStruct -{ - string strName; // any string associated with speclib entry - unsigned int iLibEntry; // a reference number associated with speclib entry - unsigned int iNumPeaks; - int iSpecLibCharge; // precursor charge; not relevant for MS1 speclib - double dSpecLibMW; // if a peptide, store neutral mass - float fRTime; - float fScaleMinInten; // min intensity of data prior to encoding to pccSparseFastXcorrData; 0.0 for unit vector - float fScaleMaxInten; // max intensity of data prior to encoding to ppcSparseFastXcorrData - vector> vSpecLibPeaks; - float* pfUnitVector; - unsigned int uiArraySizeMS1; -}; - -// for MS1 alignment -struct RetentionMatch -{ - double dQueryTime; - double dReferenceTime; - int iSpectrumIndex; - - RetentionMatch(double dQueryTime, double dReferenceTime, int iSpectrumIndex); -}; -extern std::deque RetentionMatchHistory; - -extern unsigned int* g_iFragmentIndex; // CSR flat data: all posting lists concatenated [g_iFragmentIndexOffset[bin]..g_iFragmentIndexOffset[bin+1]) -extern uint64_t* g_iFragmentIndexOffset; // CSR offsets [uiMaxFragmentArrayIndex+1]: cumulative entry counts, can exceed UINT_MAX for large non-enzymatic searches -extern vector g_vFragmentPeptides; -extern vector g_vRawPeptides; -extern bool* g_bIndexPrecursors; // allocate an array of BIN(max_precursor, protonated) and use a bool to indicate if that precursor is present in input file(s) -extern vector g_vSpecLib; -extern vector> g_vulSpecLibPrecursorIndex; // this will be an vector of vectors - -struct IndexProteinStruct // for indexed database -{ - char szProt[WIDTH_REFERENCE]; - comet_fileoffset_t 
lProteinFilePosition; - int iWhichProtein; -}; - -struct PEFFInfo -{ - char szPeffOBO[SIZE_FILE]; - int iPeffSearch; // 0=no, 1=PSI-MOD, 2=Unimod, 3=PSI-MOD only, 4=Unimod only, 5=variants only -}; - -struct StaticMod -{ - double dAddCterminusPeptide; - double dAddNterminusPeptide; - double dAddCterminusProtein; - double dAddNterminusProtein; - double pdStaticMods[SIZE_MASS]; - - StaticMod& operator=(StaticMod& a) - { - dAddCterminusPeptide = a.dAddCterminusPeptide; - dAddNterminusPeptide = a.dAddNterminusPeptide; - dAddCterminusProtein = a.dAddCterminusProtein; - dAddNterminusProtein = a.dAddNterminusProtein; - - for (int i = 0; i < SIZE_MASS; ++i) - { - pdStaticMods[i] = a.pdStaticMods[i]; - } - - return *this; - } -}; - -struct PrecalcMasses -{ - double dNtermProton; // dAddNterminusPeptide + PROTON_MASS - double dCtermOH2Proton; // dAddCterminusPeptide + dOH2fragment + PROTON_MASS - double dOH2ProtonCtermNterm; // dOH2parent + PROTON_MASS + dAddCterminusPeptide + dAddNterminusPeptide - int iMinus17; // BIN'd value of mass(NH3) - int iMinus18; // BIN'd value of mass(H2O) - - PrecalcMasses& operator=(PrecalcMasses& a) - { - dNtermProton = a.dNtermProton; - dCtermOH2Proton = a.dCtermOH2Proton; - dOH2ProtonCtermNterm = a.dOH2ProtonCtermNterm; - iMinus17 = a.iMinus17; - iMinus18 = a.iMinus18; - - return *this; - } -}; - -struct VarModParams -{ - bool bVarModSearch; // set to true if variable mods are specified - bool bVarTermModSearch; // set to true if any n-term/c-term variable mods are specified - bool bVarProteinNTermMod; // set to true if a protein n-term variable mod specified - bool bVarProteinCTermMod; // set to true if a protein c-term variable mod specified - bool bBinaryModSearch; // set to true if any of the variable mods are of binary mod variety - bool bUseFragmentNeutralLoss; // set to true if any custom NL is set; applied only to 1+ and 2+ fragments - bool bRareVarModPresent; // set to true if any of iRequireThisMod == -1 - bool bVarModProteinFilter; 
// set to trueif protein mods list is applied - int iRequireVarMod; // 0=no; else use bits to determine which varmods are required - int iMaxVarModPerPeptide; - int iMaxPermutations; - VarMods varModList[VMODS]; - char cModCode[VMODS]; // mod characters - string sProteinLModsListFile; // file containing list of proteins to restrict application of varmods to - multimap mmapProteinModsList; // vector read from sProteinModsListFile if present - string sCompoundModsFile; // path to compound mods mass file; empty = disabled - vector vdCompoundMasses; // sorted, deduplicated list of masses read from sCompoundModsFile - unsigned int uiNumCompoundMasses; // vdCompoundMasses.size(); 0 when feature is disabled - - VarModParams& operator=(VarModParams& a) - { - bVarModSearch = a.bVarModSearch; - bVarTermModSearch = a.bVarTermModSearch; - bVarProteinNTermMod = a.bVarProteinNTermMod; - bVarProteinCTermMod = a.bVarProteinCTermMod; - bBinaryModSearch = a.bBinaryModSearch; - bUseFragmentNeutralLoss = a.bUseFragmentNeutralLoss; - bRareVarModPresent = a.bRareVarModPresent; - bVarModProteinFilter = a.bVarModProteinFilter; - iRequireVarMod = a.iRequireVarMod; - iMaxVarModPerPeptide = a.iMaxVarModPerPeptide; - iMaxPermutations = a.iMaxPermutations; - - for (int i = 0; i < VMODS; ++i) - { - varModList[i] = a.varModList[i]; - cModCode[i] = a.cModCode[i]; - } - - sCompoundModsFile = a.sCompoundModsFile; - vdCompoundMasses = a.vdCompoundMasses; - uiNumCompoundMasses = a.uiNumCompoundMasses; - - return *this; - } -}; - -struct MassUtil -{ - int bMonoMassesParent; - int bMonoMassesFragment; - double dCO; - double dNH3; - double dNH2; - double dH2O; - double dCOminusH2; - double dOH2fragment; - double dOH2parent; - double pdAAMassParent[SIZE_MASS]; - double pdAAMassFragment[SIZE_MASS]; - double pdAAMassUser[SIZE_MASS]; // user defined default amino acid masses - - MassUtil& operator=(MassUtil& a) - { - bMonoMassesParent = a.bMonoMassesParent; - bMonoMassesFragment = a.bMonoMassesFragment; - 
dCO = a.dCO; - dNH3 = a.dNH3; - dNH2 = a.dNH2; - dH2O = a.dH2O; - dCOminusH2 = a.dCOminusH2; - dOH2fragment = a.dOH2fragment; - dOH2parent = a.dOH2parent; - - for (int i = 0; i < SIZE_MASS; ++i) - { - pdAAMassParent[i] = a.pdAAMassParent[i]; - pdAAMassFragment[i] = a.pdAAMassFragment[i]; - pdAAMassUser[i] = a.pdAAMassUser[i]; - } - - return *this; - } -}; - -struct ToleranceParams -{ - int iMassToleranceUnits; // 0=amu, 1=mmu, else ppm (2) - int iMassToleranceType; // 0=MH+ (default), 1=precursor m/z; only valid if iMassToleranceUnits > 0 - int iIsotopeError; - double dInputToleranceMinus; // raw tolerance value from param file, lower bound; gets converted to dPeptideMassToleranceMinus - double dInputTolerancePlus; // raw tolerance value from param file, upper bound; gets converted to dPeptideMassTolerancePlus - double dFragmentBinSize; - double dFragmentBinStartOffset; - double dMS1BinSize; - double dMS1BinStartOffset; - - ToleranceParams& operator=(ToleranceParams& a) - { - iMassToleranceUnits = a.iMassToleranceUnits; - iMassToleranceType = a.iMassToleranceType; - iIsotopeError = a.iIsotopeError; - dInputToleranceMinus = a.dInputToleranceMinus; - dInputTolerancePlus = a.dInputTolerancePlus; - dFragmentBinSize = a.dFragmentBinSize; - dFragmentBinStartOffset = a.dFragmentBinStartOffset; - dMS1BinSize = a.dMS1BinSize; - dMS1BinStartOffset = a.dMS1BinStartOffset; - - return *this; - } -}; - -struct IonInfo -{ - int iNumIonSeriesUsed; - int piSelectedIonSeries[NUM_ION_SERIES]; - bool bUseWaterAmmoniaLoss; // ammonia, water loss - int iTheoreticalFragmentIons; - int iIonVal[NUM_ION_SERIES]; - - IonInfo& operator=(IonInfo& a) - { - iNumIonSeriesUsed = a.iNumIonSeriesUsed; - bUseWaterAmmoniaLoss = a.bUseWaterAmmoniaLoss; - iTheoreticalFragmentIons = a.iTheoreticalFragmentIons; - - for (int i = 0; i < NUM_ION_SERIES; ++i) - { - piSelectedIonSeries[i] = a.piSelectedIonSeries[i]; - iIonVal[i] = a.iIonVal[i]; - } - - return *this; - } -}; - -// Identifies which type of 
database is being searched. -// Defined before StaticParams so iDbType can use DbType. -enum class DbType -{ - FASTA_DB = 0, // normal FASTA sequence database - FI_DB = 1, // fragment ion index (.idx) - PI_DB = 2 // peptide index (.idx) -}; - -// static user params, won't change per thread - can make global! -struct StaticParams -{ - string sHostName; - char szMod[512]; // used for sqt output - char szDecoyPrefix[256]; // used for prefix to indicate decoys - string sDecoyPrefix; // escaped version of szDecoyPrefix for output within XML files - char szOutputSuffix[256]; // used for suffix to append to output file base names - char szTxtFileExt[256]; // text file extension; default "txt" - int iElapseTime; - char szDate[32]; - Options options; - DBInfo databaseInfo; - SpecLibInfo speclibInfo; - PEFFInfo peffInfo; - InputFileInfo inputFile; - int bPrintDuplReferences; - VarModParams variableModParameters; - ToleranceParams tolerances; - StaticMod staticModifications; - PrecalcMasses precalcMasses; - EnzymeInfo enzymeInformation; - MassUtil massUtility; - double dInverseBinWidth; // this is used in BIN() many times so use inverse binWidth to do multiply vs. 
divide - int iArraySizeGlobal; // (int)((g_staticParams.options.dPeptideMassHigh + plus_tol_in_daltons + buffer) * g_staticParams.dInverseBinWidth) - // for MS1 library search, use dMS1MaxMass instead of dPeptideMassHigh - double dOneMinusBinOffset; // this is used in BIN() many times so calculate once - IonInfo ionInformation; - int iXcorrProcessingOffset; - DbType iDbType; // FASTA_DB = normal fasta; FI_DB = fragment ion indexed; PI_DB = peptide index - vector vectorMassOffsets; - vector precursorNLIons; - int iPrecursorNLSize; - int iOldModsEncoding; - bool bSkipToStartScan; - std::chrono::high_resolution_clock::time_point tRealTimeStart; // track run time of real-time index search - - StaticParams() - { - RestoreDefaults(); - } - - StaticParams& operator=(StaticParams& a) - { - sHostName = a.sHostName; - strcpy(szMod, a.szMod); - strcpy(szDecoyPrefix, a.szDecoyPrefix); - strcpy(szOutputSuffix, a.szOutputSuffix); - strcpy(szTxtFileExt, a.szTxtFileExt); - vectorMassOffsets = a.vectorMassOffsets; - precursorNLIons= a.precursorNLIons; - iPrecursorNLSize = a.iPrecursorNLSize; - iOldModsEncoding = a.iOldModsEncoding; - iElapseTime = a.iElapseTime; - strcpy(szDate, a.szDate); - options = a.options; - databaseInfo = a.databaseInfo; - speclibInfo = a.speclibInfo; - inputFile = a.inputFile; - bPrintDuplReferences = a.bPrintDuplReferences; - variableModParameters = a.variableModParameters; - tolerances = a.tolerances; - staticModifications = a.staticModifications; - precalcMasses = a.precalcMasses; - enzymeInformation = a.enzymeInformation; - massUtility = a.massUtility; - dInverseBinWidth = a.dInverseBinWidth; - iArraySizeGlobal = a.iArraySizeGlobal; - dOneMinusBinOffset = a.dOneMinusBinOffset; - iXcorrProcessingOffset = a.iXcorrProcessingOffset; - ionInformation = a.ionInformation; - return *this; - } - - void RestoreDefaults() - { - int i; - - inputFile.iInputType = InputType_MS2; - - szMod[0] = '\0'; - - iXcorrProcessingOffset = 75; - iDbType = DbType::FASTA_DB; - - 
databaseInfo.szDatabase[0] = '\0'; - speclibInfo.strSpecLibFile.clear(); - - strcpy(szDecoyPrefix, "DECOY_"); - strcpy(szTxtFileExt, "txt"); - szOutputSuffix[0] = '\0'; - - peffInfo.szPeffOBO[0] = '\0'; - peffInfo.iPeffSearch = 0; - - variableModParameters.sCompoundModsFile = ""; - variableModParameters.vdCompoundMasses.clear(); - variableModParameters.uiNumCompoundMasses = 0; - - iPrecursorNLSize = 0; - - for (i = 0; i < SIZE_MASS; ++i) - { - massUtility.pdAAMassParent[i] = 999999.; - massUtility.pdAAMassFragment[i] = 999999.; - massUtility.pdAAMassUser[i] = 0.0; - staticModifications.pdStaticMods[i] = 0.0; - } - - massUtility.bMonoMassesFragment = 1; - massUtility.bMonoMassesParent = 1; - -#ifdef CRUX - staticModifications.pdStaticMods[(int)'C'] = 57.021464; -#endif - - - enzymeInformation.iAllowedMissedCleavage = 2; - - for (i = 0; i < VMODS; ++i) - { - variableModParameters.varModList[i].iMaxNumVarModAAPerMod = 3; - variableModParameters.varModList[i].iMinNumVarModAAPerMod = 0; - variableModParameters.varModList[i].iBinaryMod = 0; - variableModParameters.varModList[i].iRequireThisMod = 0; - variableModParameters.varModList[i].iVarModTermDistance = -1; // distance from N or C-term distance - variableModParameters.varModList[i].iWhichTerm = 0; // specify N (0) or C-term (1) - variableModParameters.varModList[i].dVarModMass = 0.0; - variableModParameters.varModList[i].dNeutralLoss = 0.0; - variableModParameters.varModList[i].dNeutralLoss2 = 0.0; - strcpy(variableModParameters.varModList[i].szVarModChar, "X"); - -#ifdef CRUX - if (i==0) - { - variableModParameters.varModList[i].dVarModMass = 15.9949; - strcpy(variableModParameters.varModList[i].szVarModChar, "M"); - } -#endif - } - - variableModParameters.cModCode[0] = '*'; - variableModParameters.cModCode[1] = '#'; - variableModParameters.cModCode[2] = '@'; - variableModParameters.cModCode[3] = '^'; - variableModParameters.cModCode[4] = '~'; - variableModParameters.cModCode[5] = '$'; - 
variableModParameters.cModCode[6] = '%'; - variableModParameters.cModCode[7] = '!'; - variableModParameters.cModCode[8] = '+'; - for (int i = 9; i < VMODS; ++i) - { - int iAscii = 88 + i; //start with lower case 'a' ASCII 97 - if (iAscii <= 125) // thru '}' which is ASCII 125 - variableModParameters.cModCode[i] = (char)(iAscii); - else - variableModParameters.cModCode[i] = '_'; - } - - variableModParameters.iMaxVarModPerPeptide = 5; - variableModParameters.iMaxPermutations = MAX_PERMUTATIONS; - variableModParameters.bUseFragmentNeutralLoss = false; - variableModParameters.iRequireVarMod = 0; - - ionInformation.bUseWaterAmmoniaLoss = false; - ionInformation.iTheoreticalFragmentIons = 1; // 0 = flanking peaks; 1 = no flanking peaks - ionInformation.iIonVal[ION_SERIES_A] = 0; - ionInformation.iIonVal[ION_SERIES_B] = 1; - ionInformation.iIonVal[ION_SERIES_C] = 0; - ionInformation.iIonVal[ION_SERIES_X] = 0; - ionInformation.iIonVal[ION_SERIES_Y] = 1; - ionInformation.iIonVal[ION_SERIES_Z] = 0; - ionInformation.iIonVal[ION_SERIES_Z1] = 0; - - options.iNumPeptideOutputLines = 5; - options.iWhichReadingFrame = 0; - options.iEnzymeTermini = 2; - options.iNumStored = 100; // default # of search results to store for xcorr analysis. 
- options.iMaxDuplicateProteins = 20; // maximum number of duplicate proteins to report or store in idx file - - options.bExplicitDeltaCn = false; - options.bPrintExpectScore = true; - options.iPrintAScoreProScore = 0; - options.bExportAdditionalScoresPepXML = false; - options.bCorrectMass = false; - options.bTreatSameIL = true; - options.iOverrideCharge = 0; - options.iMaxIndexRunTime = 0; // index run time limit in milliseconds; 0=no time limit - options.iRemovePrecursor = 0; - options.dRemovePrecursorTol = 1.5; - - options.bOutputSqtStream = false; - options.bOutputSqtFile = false; - options.bOutputTxtFile = false; - options.bOutputPepXMLFile = true; - options.iOutputMzIdentMLFile = false; - options.bOutputPercolatorFile = false; - - options.bResolveFullPaths = true; - - options.bMango = false; - options.bScaleFragmentNL = false; - options.bCreatePeptideIndex = false; - options.bCreateFragmentIndex = false; - options.bFastPlainPeptideIdx = false; - options.bVerboseOutput = false; - options.iDecoySearch = 0; - options.iNumThreads = 4; - options.iNumFragmentThreads = 4; - options.bClipNtermMet = false; - options.bClipNtermAA = false; - - options.lMaxIterations = 0; - - // These parameters affect mzXML/RAMP spectra only. - options.scanRange.iStart = 0; - options.scanRange.iEnd = 0; - options.iSpectrumBatchSize = 0; - options.iMinPeaks = 10; - options.iStartCharge = 0; - options.iEndCharge = 0; - options.iMaxFragmentCharge = 3; - options.iMinPrecursorCharge = 1; - options.iMaxPrecursorCharge = 6; - options.iMSLevel = 2; - options.dMinIntensity = 0.0; - options.dMinPercentageIntensity = 0.0; - options.dPeptideMassLow = 600.0; - options.dPeptideMassHigh = 5000.0; - options.dMinimumXcorr = XCORR_CUTOFF; - options.dFragIndexMaxMass = FRAGINDEX_MAX_MASS; - options.dFragIndexMinMass = FRAGINDEX_MIN_MASS; - strcpy(options.szActivationMethod, "ALL"); - // End of mzXML specific parameters. 
- - options.sPinProteinDelimiter = '\t'; - - options.dFragIndexMinMass = FRAGINDEX_MIN_MASS; - options.dFragIndexMaxMass = FRAGINDEX_MAX_MASS; - options.iFragIndexMinIonsScore = FRAGINDEX_MIN_IONS_SCORE; - options.iFragIndexMinIonsReport = FRAGINDEX_MIN_IONS_REPORT; - options.iFragIndexNumSpectrumPeaks = FRAGINDEX_MAX_NUMPEAKS; - options.iFragIndexSkipReadPrecursors = 1; // skip reading precursors by default - - options.dMS1MinMass = MS1_MIN_MASS; - options.dMS1MaxMass = MS1_MAX_MASS; - - options.clearMzRange.dStart = 0.0; - options.clearMzRange.dEnd = 0.0; - - options.peptideLengthRange.iStart = MIN_PEPTIDE_LEN; - options.peptideLengthRange.iEnd = MAX_PEPTIDE_LEN - 1; // -1 as MAX_PEPTIDE_LEN number includes terminating char - - staticModifications.dAddCterminusPeptide = 0.0; - staticModifications.dAddNterminusPeptide = 0.0; - staticModifications.dAddCterminusProtein = 0.0; - staticModifications.dAddNterminusProtein = 0.0; - - tolerances.iMassToleranceUnits = 0; - tolerances.iMassToleranceType = 0; - tolerances.iIsotopeError = 0; - tolerances.dInputToleranceMinus = -3.0; // peptide_mass_tolerance minus - tolerances.dInputTolerancePlus = 3.0; // peptide_mass_tolerance plus - tolerances.dFragmentBinSize = 1.0005; - tolerances.dFragmentBinStartOffset = 0.4; - tolerances.dMS1BinSize = 1.0005; - - bSkipToStartScan = true; - } -}; - -extern StaticParams g_staticParams; - -extern vector g_pvDBIndex; // used in both peptide index and fragment ion index; latter to store plain peptides -// Per-length, per-thread generation buffers. Outer index = (iLen - iMinLen) for short, -// (iLen - 13) for long. Inner index = thread slot. -extern vector>> g_vvvPepGenShort; // lengths <= 12 -extern vector>> g_vvvPepGenLong; // lengths > 12 -extern map g_pvProteinNames; // indexed database protein names and file positions - -// Flat CSR-style storage for the per-peptide protein list. 
-// Replaces vector> to eliminate the ~190M -// individual heap allocations (one per inner vector) that caused a -// ~6-minute free-time tail when building an MHC .idx file. -// External interface mirrors vector> so -// existing call sites need no changes. -class ProteinsListCSR -{ -public: - // Read-only proxy for a single row (one peptide's protein offsets). - struct Row - { - const comet_fileoffset_t* ptr; - size_t n; - - size_t size() const { return n; } - bool empty() const { return n == 0; } - - const comet_fileoffset_t& operator[](size_t j) const { return ptr[j]; } - comet_fileoffset_t at(size_t j) const { return ptr[j]; } - - const comet_fileoffset_t* begin() const { return ptr; } - const comet_fileoffset_t* end() const { return ptr + n; } - }; - - // Size / state - size_t size() const { return m_off.empty() ? 0 : m_off.size() - 1; } - bool empty() const { return size() == 0; } - - // Modifiers - void clear() - { - vector().swap(m_flat); - vector().swap(m_off); - } - - void reserve(size_t n) { m_off.reserve(n + 1); } - - void push_back(const vector& v) - { - if (m_off.empty()) m_off.push_back(0); - m_flat.insert(m_flat.end(), v.begin(), v.end()); - m_off.push_back(m_flat.size()); - } - - void push_back(vector&& v) - { - if (m_off.empty()) m_off.push_back(0); - m_flat.insert(m_flat.end(), v.begin(), v.end()); - m_off.push_back(m_flat.size()); - vector().swap(v); // release source buffer immediately - } - - // Batch-append from pre-built flat storage. - // flat: all protein file offsets for this block, concatenated in row order - // cnt: number of offsets per row (max value bounded by iMaxDuplicateProteins) - // Bulk-copies both arrays into m_flat/m_off with two insert() calls, then - // releases the source buffers. Replaces N individual push_back(vector&&) - // calls, each of which required one heap free() -- this reduces N free()s - // to 2 (one for flat, one for cnt) regardless of how many rows are in the block. 
- void append_flat(vector& flat, vector& cnt) - { - if (flat.empty()) - return; - if (m_off.empty()) - m_off.push_back(0); - m_flat.insert(m_flat.end(), flat.begin(), flat.end()); - for (uint32_t n : cnt) - m_off.push_back(m_off.back() + n); - vector().swap(flat); - vector().swap(cnt); - } - - // Element access - Row operator[](size_t i) const - { - return {m_flat.data() + m_off[i], - static_cast(m_off[i + 1] - m_off[i])}; - } - - Row at(size_t i) const { return (*this)[i]; } - - // Range-based for -- yields Row values - struct Iterator - { - const ProteinsListCSR* self; - size_t i; - - Row operator*() const { return (*self)[i]; } - Iterator& operator++() { ++i; return *this; } - bool operator!=(const Iterator& o) const { return i != o.i; } - }; - - Iterator begin() const { return {this, 0}; } - Iterator end() const { return {this, size()}; } - -private: - vector m_flat; // all protein offsets concatenated - vector m_off; // [N+1] CSR offsets; row i spans [m_off[i], m_off[i+1]) -}; - -extern ProteinsListCSR g_pvProteinsList; -extern unordered_map g_pvProteinNameCache; // file offset -> protein name string; populated at index load - -extern std::condition_variable g_searchPoolCV; // notified when a pool slot is released - -extern AScoreProCpp::AScoreOptions g_AScoreOptions; // AScore options -extern AScoreProCpp::AScoreDllInterface* g_AScoreInterface; - -struct ModificationNumber -{ -// int modificationNumber; - int modStringLen; // FIX: need to confirm if not needed (MOD_SEQS.at(modSeqIdx)).size(); - char* modifications; -}; - -extern vector MOD_NUMBERS; -extern vector MOD_SEQS; // Unique modifiable sequences. -extern int* MOD_SEQ_MOD_NUM_START; // Start index in the MOD_NUMBERS vector for a modifiable sequence; -1 if no modification numbers were generated -extern int* MOD_SEQ_MOD_NUM_CNT; // Total modifications numbers for a modifiable sequence. 
- -// Index into the MOD_SEQS vector -// -1 for peptides that have no modifiable amino acids -// -2 for peptides with no modifiable amino acids but contain n/c-term mods -extern int* PEPTIDE_MOD_SEQ_IDXS; - -extern int MOD_NUM; -extern bool g_bPlainPeptideIndexRead; // set to true if plain peptide index file is read (and fragment index generated) - // poor choice of name for the fragment index .idx given peptide index is back -extern std::atomic g_bPeptideIndexRead; // set to true if peptide index file is read -extern bool g_bSpecLibRead; // set to true if spectral library file is read - -extern bool g_bPerformSpecLibSearch; // set to true if doing spectral library search -extern bool g_bPerformDatabaseSearch; // set to true if doing database search - -extern bool g_bCometPreprocessMemoryAllocated; // set to true when memory has been allocated -extern bool g_bCometSearchMemoryAllocated; // set to true when memory has been allocated - -extern bool g_bIdxNoFasta; // set to true when .idx file being search but corresponding .fasta not present - // used in mzid output to skip sequence retrieval - -// Query stores information for peptide scoring and results -// This struct is allocated for each spectrum/charge combination -struct Query -{ - int iXcorrHistogram[HISTO_SIZE]; - unsigned int uiHistogramCount; // # of entries in histogram - float fPar[4]; // parameters of LMA regression - - int iMatchPeptideCount; // # of peptides that get stored (i.e. are greater than lowest score) - int iDecoyMatchPeptideCount; // # of decoy peptides that get stored (i.e. 
are greater than lowest score) - - short siMaxXcorr; // index of maximum correlation score in iXcorrHistogram - - short siLowestXcorrScoreIndex; - short siLowestDecoyXcorrScoreIndex; - - double dLowestXcorrScore; - double dLowestDecoyXcorrScore; - - float fLowestSpecLibScore; - - int iMinXcorrHisto; // min xcorr score for xcorr histogram to address good E-values for poor/sparse spectra - - double dMangoIndex; // scan number decimal precursor value i.e. 2401.001 for scan 2401, first precursor/z pair - - unsigned long int _uliNumMatchedPeptides; // # of peptides that get scored - unsigned long int _uliNumMatchedDecoyPeptides; - - // When true, sparse child arrays (float[SPARSE_MATRIX_SIZE]) belong to the - // thread-local RtsScratch pool and must NOT be delete[]'d by the destructor. - // Set only by PreprocessSingleSpectrumThreadLocal via PreprocessSingleSpectrumCore. - bool bSparseFromPool; - - // Sparse matrix representation of data - int iSpScoreData; //size of sparse matrix - int iFastXcorrDataSize; - float **ppfSparseSpScoreData; - float **ppfSparseFastXcorrData; - float **ppfSparseFastXcorrDataNL; // ppfSparseFastXcorrData with NH3, H2O contributions - - // Store raw peaks for AScorePro - - // List of ms/ms masses for fragment index search; intensity not important at this stage - vector vfRawFragmentPeakMass; - // Consider replacing vfRawFragmentPeakMass with a vector> to store - // both mass and intensity if AScorePro is used - vector vRawFragmentPeakMassIntensity; - - - PepMassInfo _pepMassInfo; - SpectrumInfoInternal _spectrumInfoInternal; - Results* _pResults; - Results* _pDecoys; - SpecLibResults* _pSpecLibResults; - - std::chrono::high_resolution_clock::time_point tSearchStart; // per-query search start time for iMaxIndexRunTime timeout - - Mutex accessMutex; - - Query() - { - memset(iXcorrHistogram, 0, sizeof(iXcorrHistogram)); - - iMatchPeptideCount = 0; - iDecoyMatchPeptideCount = 0; - uiHistogramCount = 0; - iMinXcorrHisto = 0; - - fPar[0]=0.0; - 
fPar[1]=0.0; - fPar[2]=0.0; - fPar[3]=0.0; - - siMaxXcorr = 0; // index of maximum correlation score in iXcorrHistogram - siLowestXcorrScoreIndex = 0; - siLowestDecoyXcorrScoreIndex = 0; - - dLowestXcorrScore = XCORR_CUTOFF; - dLowestDecoyXcorrScore = XCORR_CUTOFF; - - fLowestSpecLibScore = SPECLIB_CUTOFF; - - dMangoIndex = 0.0; - - _uliNumMatchedPeptides = 0; - _uliNumMatchedDecoyPeptides = 0; - - bSparseFromPool = false; - - ppfSparseSpScoreData = NULL; - ppfSparseFastXcorrData = NULL; - ppfSparseFastXcorrDataNL = NULL; // ppfSparseFastXcorrData with NH3, H2O contributions - - vfRawFragmentPeakMass.clear(); - vRawFragmentPeakMassIntensity.clear(); - - _pepMassInfo.dCalcPepMass = 0.0; - _pepMassInfo.dExpPepMass = 0.0; - _pepMassInfo.dPeptideMassToleranceLow = 0.0; - _pepMassInfo.dPeptideMassToleranceHigh = 0.0; - _pepMassInfo.dPeptideMassToleranceMinus = 0.0; - _pepMassInfo.dPeptideMassTolerancePlus = 0.0; - - _spectrumInfoInternal.dTotalIntensity = 0.0; - _spectrumInfoInternal.iArraySize = 0; - _spectrumInfoInternal.iHighestIon = 0; - _spectrumInfoInternal.iScanNumber = 0; - _spectrumInfoInternal.dTotalIntensity = 0.0; - - _pResults = NULL; - _pDecoys = NULL; - _pSpecLibResults = NULL; - - Threading::InitMutex(&accessMutex); - } - - ~Query() - { - int i; - if (!bSparseFromPool) - { - for (i = 0; i < iSpScoreData; ++i) - { - if (ppfSparseSpScoreData[i] != NULL) - delete[] ppfSparseSpScoreData[i]; - } - } - delete[] ppfSparseSpScoreData; - ppfSparseSpScoreData = NULL; - - if (g_staticParams.ionInformation.bUseWaterAmmoniaLoss - && (g_staticParams.ionInformation.iIonVal[ION_SERIES_A] - || g_staticParams.ionInformation.iIonVal[ION_SERIES_B] - || g_staticParams.ionInformation.iIonVal[ION_SERIES_Y])) - { - if (!bSparseFromPool) - { - for (i = 0; i < iFastXcorrDataSize; ++i) - { - if (ppfSparseFastXcorrData[i] != NULL) - delete[] ppfSparseFastXcorrData[i]; - if (ppfSparseFastXcorrDataNL[i]!=NULL) - delete[] ppfSparseFastXcorrDataNL[i]; - } - } - delete[] 
ppfSparseFastXcorrDataNL; - ppfSparseFastXcorrDataNL = NULL; - } - else - { - if (!bSparseFromPool) - { - for (i = 0; i < iFastXcorrDataSize; ++i) - { - if (ppfSparseFastXcorrData[i] != NULL) - delete[] ppfSparseFastXcorrData[i]; - } - } - } - delete[] ppfSparseFastXcorrData; - ppfSparseFastXcorrData = NULL; - - if (_pResults != NULL) - { - _pResults->pWhichProtein.clear(); - if (g_staticParams.options.iDecoySearch == 1) - _pResults->pWhichDecoyProtein.clear(); - delete[] _pResults; - _pResults = NULL; - } - - if (g_staticParams.options.iDecoySearch == 2 && _pDecoys != NULL) - { - _pDecoys->pWhichDecoyProtein.clear(); - delete[] _pDecoys; - _pDecoys = NULL; - } - - Threading::DestroyMutex(accessMutex); - } -}; - -struct QueryMS1 -{ - // short siLowestSpecLibIndex; - // float fLowestXcorr; - unsigned int uiMatchMS1Count; // # of peptides that get stored (i.e. are greater than lowest score) - unsigned int iArraySizeMS1; // dimension of pcFastXcorrData - - // Standard array representation of data - // Library spectra are fast xcorr manipulated so non need to do so with query MS1 - float* pfFastXcorrData; - - SpecLibResultsMS1 _pSpecLibResultsMS1; - - Mutex accessMutex; - - QueryMS1() - { - // siLowestSpecLibIndex = 0; - // fLowestXcorr = SPECLIB_CUTOFF; - uiMatchMS1Count = 0; - pfFastXcorrData = NULL; - _pSpecLibResultsMS1.fDotProduct = 0.0; - _pSpecLibResultsMS1.fRTime = 0.0; - - Threading::InitMutex(&accessMutex); - } - - ~QueryMS1() - { - //FIX delete _pSepcLibResults - - Threading::DestroyMutex(accessMutex); - } -}; - -extern vector g_pvQuery; -extern vector g_pvQueryMS1; -extern vector g_pvInputFiles; -extern Mutex g_pvQueryMutex; -extern Mutex g_pvDBIndexMutex; -extern Mutex g_preprocessMemoryPoolMutex; -extern Mutex g_searchMemoryPoolMutex; -extern Mutex g_dbIndexMutex; -extern Mutex g_vSpecLibMutex; - -struct IonSeriesStruct // defines which fragment ion series are considered -{ - int bPreviousMatch[8]; -}; - - -struct MatchedIonsStruct // for 
SingleSpectrumSearch -{ - double dMass; - double dInten; - - bool operator<(const MatchedIonsStruct& a) const - { - return dInten > a.dInten; - } -}; +#include "core/Constants.h" +#include "core/Params.h" +#include "core/Types.h" #endif // _COMETDATAINTERNAL_H_ diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index 9e5a9170..ae6254da 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -15,6 +15,7 @@ #include "Common.h" #include "CometSearch.h" #include "CometFragmentIndexReader.h" +#include "threading/SearchMemoryPool.h" #include @@ -23,6 +24,12 @@ bool* CometSearch::_pbSearchMemoryPool = nullptr; bool** CometSearch::_ppbDuplFragmentArr = nullptr; +// Module-local pool instance. Owns the duplicate-fragment scratch arrays that +// the legacy _ppbDuplFragmentArr static above aliases; _pbSearchMemoryPool +// remains a separate slot-availability array for the FASTA_DB path. +// AllocateMemory sets up both; AcquirePoolSlot/releaseSlot use s_pool. +static SearchMemoryPool s_pool; + extern comet_fileoffset_t clSizeCometFileOffset; @@ -44,31 +51,34 @@ CometSearch::~CometSearch() bool CometSearch::AllocateMemory(int maxNumThreads) { - if (g_bCometSearchMemoryAllocated) // already allocated + if (g_bCometSearchMemoryAllocated) return true; + if (!s_pool.allocate(maxNumThreads, g_staticParams.iArraySizeGlobal)) + return false; + + // _pbSearchMemoryPool is the slot-availability array used by SearchThreadProc + // (FASTA_DB batch path). Allocate it separately; it is distinct from the + // scratch arrays owned by s_pool.
try { _pbSearchMemoryPool = new bool[maxNumThreads](); - _ppbDuplFragmentArr = new bool* [maxNumThreads]; - + _ppbDuplFragmentArr = new bool*[maxNumThreads]; for (int i = 0; i < maxNumThreads; ++i) - _ppbDuplFragmentArr[i] = new bool[g_staticParams.iArraySizeGlobal](); - - g_bCometSearchMemoryAllocated = true; - - return true; + _ppbDuplFragmentArr[i] = s_pool.duplFragmentArr(i); } catch (const std::bad_alloc& ba) { - string strErrorMsg = " Error - memory allocation failed. bad_alloc: " + std::string(ba.what()) + ".\n"; + string strErrorMsg = " Error - AllocateMemory alias arrays failed. bad_alloc: " + std::string(ba.what()) + ".\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg); - + s_pool.deallocate(); g_bCometSearchMemoryAllocated = false; - return false; } + + g_bCometSearchMemoryAllocated = true; + return true; } @@ -77,17 +87,16 @@ bool CometSearch::DeallocateMemory(int maxNumThreads) if (!g_bCometSearchMemoryAllocated) return true; - delete [] _pbSearchMemoryPool; + s_pool.deallocate(); - for (int i = 0; i < maxNumThreads; ++i) - { - delete [] _ppbDuplFragmentArr[i]; - } - - delete [] _ppbDuplFragmentArr; + delete[] _pbSearchMemoryPool; + // _ppbDuplFragmentArr holds pointers into s_pool's scratch arrays; those + // are already freed by s_pool.deallocate(). Only free the alias array itself. + delete[] _ppbDuplFragmentArr; + _pbSearchMemoryPool = nullptr; + _ppbDuplFragmentArr = nullptr; g_bCometSearchMemoryAllocated = false; - return true; } @@ -96,23 +105,7 @@ bool CometSearch::DeallocateMemory(int maxNumThreads) // Returns the slot index (0..iNumThreads-1), or -1 on timeout. 
int CometSearch::AcquirePoolSlot() { - int i = -1; - std::unique_lock lock(g_searchMemoryPoolMutex); - - bool found = g_searchPoolCV.wait_for(lock, std::chrono::seconds(240), [&i]() { - for (int j = 0; j < g_staticParams.options.iNumThreads; ++j) - { - if (_pbSearchMemoryPool[j] == false) - { - _pbSearchMemoryPool[j] = true; - i = j; - return true; - } - } - return false; - }); - - return found ? i : -1; + return s_pool.acquireSlot(); } @@ -136,8 +129,7 @@ bool CometSearch::RunSearch(Query* pQuery) return false; } SearchFragmentIndex(pQuery, _ppbDuplFragmentArr[iSlot]); - { std::lock_guard lk(g_searchMemoryPoolMutex); _pbSearchMemoryPool[iSlot] = false; } - g_searchPoolCV.notify_one(); + s_pool.releaseSlot(iSlot); } else if (g_staticParams.iDbType == DbType::PI_DB) // peptide index { @@ -179,8 +171,7 @@ bool CometSearch::RunSearch(Query* pQuery) return false; } SearchPeptideIndex(pQuery, _ppbDuplFragmentArr[iSlot]); - { std::lock_guard lk(g_searchMemoryPoolMutex); _pbSearchMemoryPool[iSlot] = false; } - g_searchPoolCV.notify_one(); + s_pool.releaseSlot(iSlot); } else { @@ -224,8 +215,7 @@ bool CometSearch::RunSearch(ThreadPool *tp) return false; } SearchFragmentIndex(g_pvQuery.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); - { std::lock_guard lk(g_searchMemoryPoolMutex); _pbSearchMemoryPool[iSlot] = false; } - g_searchPoolCV.notify_one(); + s_pool.releaseSlot(iSlot); } else if (g_staticParams.iDbType == DbType::PI_DB) // peptide index { @@ -276,8 +266,7 @@ bool CometSearch::RunSearch(int iPercentStart, return; } SearchFragmentIndex(g_pvQuery.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); - { std::lock_guard lk(g_searchMemoryPoolMutex); _pbSearchMemoryPool[iSlot] = false; } - g_searchPoolCV.notify_one(); + s_pool.releaseSlot(iSlot); }); } diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 61d4c606..e84cb6e4 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -23,6 +23,12 @@ #include 
"CometWritePepXML.h" #include "CometWriteMzIdentML.h" #include "CometWritePercolator.h" +#include "output/IResultWriter.h" +#include "output/SqtWriter.h" +#include "output/TxtWriter.h" +#include "output/PepXmlWriter.h" +#include "output/MzIdentMlWriter.h" +#include "output/PercolatorWriter.h" #include "CometDataInternal.h" #include "CometSearchManager.h" #include "CometStatus.h" @@ -2418,320 +2424,55 @@ bool CometSearchManager::DoSearch() // 4=scan range, // 5=entire file - // For SQT & pepXML output file, check if they can be written to before doing anything else. - FILE *fpout_sqt=NULL; - FILE *fpoutd_sqt=NULL; - FILE *fpout_pepxml=NULL; - FILE *fpoutd_pepxml=NULL; - FILE *fpout_mzidentml=NULL; - FILE *fpoutd_mzidentml=NULL; - FILE *fpout_mzidentmltmp=NULL; - FILE *fpoutd_mzidentmltmp=NULL; - FILE *fpout_percolator=NULL; - FILE *fpout_txt=NULL; - FILE *fpoutd_txt=NULL; - - std::string sOutputSQT; - std::string sOutputDecoySQT; - std::string sOutputPepXML; - std::string sOutputDecoyPepXML; - std::string sOutputMzIdentML; - std::string sOutputDecoyMzIdentML; - std::string sOutputMzIdentMLtmp; // temporary file used to hold mzIdentML output before finalizing - std::string sOutputDecoyMzIdentMLtmp; // temporary file used to hold decoy mzIdentML output before finalizing - std::string sOutputPercolator; - std::string sOutputTxt; - std::string sOutputDecoyTxt; - - if (g_staticParams.options.bOutputSqtFile) + // Phase 3: writer factory -- builds vector from options. + // Each writer owns its file handle(s); open() opens + writes format header, + // write() outputs one batch, close() writes footer + fcloses. 
+ WriterOpenCtx woctx; + woctx.szBaseName = g_staticParams.inputFile.szBaseName; + woctx.szOutputSuffix = g_staticParams.szOutputSuffix; + woctx.szTxtFileExt = g_staticParams.szTxtFileExt; + woctx.bEntireFile = (iAnalysisType == AnalysisType_EntireFile); + woctx.iFirstScan = iFirstScan; + woctx.iLastScan = iLastScan; + woctx.iDecoySearch = g_staticParams.options.iDecoySearch; + woctx.pMgr = this; + + std::vector> vWriters; + + // PepXML, mzIdentML, Percolator, Txt first; SQT last (WriteSqt modifies szMod). + if (bSucceeded && g_staticParams.options.bOutputPepXMLFile) { - if (iAnalysisType == AnalysisType_EntireFile) - { - sOutputSQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".sqt"; - -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - { - sOutputSQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".target.sqt"; - } -#endif - } - else - { - sOutputSQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".sqt"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputSQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." 
+ std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".target.sqt"; -#endif - } - - if ((fpout_sqt = fopen(sOutputSQT.c_str(), "w")) == NULL) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputSQT + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - CometWriteSqt::PrintSqtHeader(fpout_sqt, *this); - - if (bSucceeded && (g_staticParams.options.iDecoySearch == 2)) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputDecoySQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".decoy.sqt"; - else - sOutputDecoySQT = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".decoy.sqt"; - - if ((fpoutd_sqt = fopen(sOutputDecoySQT.c_str(), "w")) == NULL) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoySQT + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - CometWriteSqt::PrintSqtHeader(fpoutd_sqt, *this); - } + auto pw = std::make_unique(); + if (!pw->open(woctx)) bSucceeded = false; + else vWriters.push_back(std::move(pw)); } - if (bSucceeded && g_staticParams.options.bOutputTxtFile) + if (bSucceeded && g_staticParams.options.iOutputMzIdentMLFile) { - if (iAnalysisType == AnalysisType_EntireFile) - { - sOutputTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + "." + g_staticParams.szTxtFileExt; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".target." + g_staticParams.szTxtFileExt; -#endif - } - else - { - sOutputTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + "." 
+ g_staticParams.szTxtFileExt; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".target." + g_staticParams.szTxtFileExt; -#endif - } - - if ((fpout_txt = fopen(sOutputTxt.c_str(), "w")) == NULL) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputTxt + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - CometWriteTxt::PrintTxtHeader(fpout_txt); - fflush(fpout_txt); - - if (bSucceeded && (g_staticParams.options.iDecoySearch == 2)) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputDecoyTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".decoy." + g_staticParams.szTxtFileExt; - else - sOutputDecoyTxt = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".decoy." 
+ g_staticParams.szTxtFileExt; - - fpoutd_txt = fopen(sOutputDecoyTxt.c_str(), "w"); - if (!fpoutd_txt) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoyTxt + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - CometWriteTxt::PrintTxtHeader(fpoutd_txt); - } + auto pw = std::make_unique(this); + if (!pw->open(woctx)) bSucceeded = false; + else vWriters.push_back(std::move(pw)); } - if (bSucceeded && g_staticParams.options.bOutputPepXMLFile) + if (bSucceeded && g_staticParams.options.bOutputPercolatorFile) { - if (iAnalysisType == AnalysisType_EntireFile) - { - sOutputPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".pep.xml"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".target.pep.xml"; -#endif - } - else - { - sOutputPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".pep.xml"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." 
+ std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".target.pep.xml"; -#endif - } - - fpout_pepxml = fopen(sOutputPepXML.c_str(), "w"); - if (!fpout_pepxml) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputPepXML + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - if (bSucceeded) - bSucceeded = CometWritePepXML::WritePepXMLHeader(fpout_pepxml, *this); - - if (bSucceeded && (g_staticParams.options.iDecoySearch == 2)) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputDecoyPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".decoy.pep.xml"; - else - sOutputDecoyPepXML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".decoy.pep.xml"; - - fpoutd_pepxml = fopen(sOutputDecoyPepXML.c_str(), "w"); - if (!fpoutd_pepxml) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoyPepXML + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - if (bSucceeded) - bSucceeded = CometWritePepXML::WritePepXMLHeader(fpoutd_pepxml, *this); - } + auto pw = std::make_unique(); + if (!pw->open(woctx)) bSucceeded = false; + else vWriters.push_back(std::move(pw)); } - if (bSucceeded && g_staticParams.options.iOutputMzIdentMLFile) + if (bSucceeded && g_staticParams.options.bOutputTxtFile) { - if (iAnalysisType == AnalysisType_EntireFile) - { - sOutputMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".mzid"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".target.mzid"; -#endif - } - else - { - sOutputMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + 
g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".mzid"; -#ifdef CRUX - if (g_staticParams.options.iDecoySearch == 2) - sOutputMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." + std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".target.mzid"; -#endif - } - - fpout_mzidentml = fopen(sOutputMzIdentML.c_str(), "w"); - if (!fpout_mzidentml) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputMzIdentML + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - sOutputMzIdentMLtmp = sOutputMzIdentML + ".XXXXXX"; -#ifdef _WIN32 - errno_t err = _mktemp_s(&sOutputMzIdentMLtmp[0], sOutputMzIdentMLtmp.size() + 1); - if (err != 0) - { - string strErrorMsg = " Error - cannot create temporary file \"" + sOutputMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } -#else - int iRet = mkstemp(&sOutputMzIdentMLtmp[0]); - if (iRet == -1) - { - string strErrorMsg = " Error - cannot create temporary file \"" + sOutputMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } -#endif - - fpout_mzidentmltmp = fopen(sOutputMzIdentMLtmp.c_str(), "w"); - if (!fpout_mzidentmltmp) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - if (bSucceeded && (g_staticParams.options.iDecoySearch == 2)) - { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputDecoyMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".decoy.mzid"; - else - sOutputDecoyMzIdentML = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." 
+ std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".decoy.mzid"; - - fpoutd_mzidentml = fopen(sOutputDecoyMzIdentML.c_str(), "w"); - if (!fpoutd_mzidentml) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoyMzIdentML + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - sOutputDecoyMzIdentMLtmp = sOutputDecoyMzIdentML + ".XXXXXX"; -#ifdef _WIN32 - errno_t err = _mktemp_s(&sOutputDecoyMzIdentMLtmp[0], sOutputDecoyMzIdentMLtmp.size() + 1); - if (err != 0) - { - string strErrorMsg = " Error - cannot create temporary file \"" + sOutputDecoyMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } -#else - int iRet = mkstemp(&sOutputDecoyMzIdentMLtmp[0]); - if (iRet == -1) - { - string strErrorMsg = " Error - cannot create temporary file \"" + sOutputDecoyMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } -#endif - fpoutd_mzidentmltmp = fopen(sOutputDecoyMzIdentMLtmp.c_str(), "w"); - if (!fpoutd_mzidentmltmp) - { - string strErrorMsg = " Error - cannot write to decoy file \"" + sOutputDecoyMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - } + auto pw = std::make_unique(); + if (!pw->open(woctx)) bSucceeded = false; + else vWriters.push_back(std::move(pw)); } - if (bSucceeded && g_staticParams.options.bOutputPercolatorFile) + if (bSucceeded && (g_staticParams.options.bOutputSqtFile || g_staticParams.options.bOutputSqtStream)) { - if (iAnalysisType == AnalysisType_EntireFile) - sOutputPercolator = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + ".pin"; - else - sOutputPercolator = std::string(g_staticParams.inputFile.szBaseName) + g_staticParams.szOutputSuffix + - "." 
+ std::to_string(iFirstScan) + "-" + std::to_string(iLastScan) + ".pin"; - - fpout_percolator = fopen(sOutputPercolator.c_str(), "w"); - if (!fpout_percolator) - { - string strErrorMsg = " Error - cannot write to file \"" + sOutputPercolator + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - if (bSucceeded) - CometWritePercolator::WritePercolatorHeader(fpout_percolator); + auto pw = std::make_unique(); + if (!pw->open(woctx)) bSucceeded = false; + else vWriters.push_back(std::move(pw)); } int iTotalSpectraSearched = 0; @@ -3015,30 +2756,23 @@ bool CometSearchManager::DoSearch() fflush(stdout); } - if (g_staticParams.options.bOutputPepXMLFile) - CometWritePepXML::WritePepXML(fpout_pepxml, fpoutd_pepxml, fpdb, iTotalSpectraSearched - (int)g_pvQuery.size()); - - // For mzid output, dump psms as tab-delimited text first then collate results to - // mzid file at very end due to requirements of this format. - if (g_staticParams.options.iOutputMzIdentMLFile) - CometWriteMzIdentML::WriteMzIdentMLTmp(fpout_mzidentmltmp, fpoutd_mzidentmltmp, iBatchNum); - - if (g_staticParams.options.bOutputPercolatorFile) - { - bSucceeded = CometWritePercolator::WritePercolator(fpout_percolator, fpdb); - if (!bSucceeded) - goto cleanup_results; - } - - if (g_staticParams.options.bOutputTxtFile) + // Phase 3: per-batch write via polymorphic writer loop. + // Insertion order guarantees SQT writes last (destroys szMod). 
{ - CometWriteTxt::WriteTxt(fpout_txt, fpoutd_txt, fpdb); + WriterWriteCtx wwctx; + wwctx.fpdb = fpdb; + wwctx.iScanOffset = iTotalSpectraSearched - (int)g_pvQuery.size(); + wwctx.iBatchNum = iBatchNum; + for (auto& pw : vWriters) + { + if (!pw->write(wwctx)) + { + bSucceeded = false; + goto cleanup_results; + } + } } - // Write SQT last as I destroy the g_staticParams.szMod string during that process - if (g_staticParams.options.bOutputSqtStream || g_staticParams.options.bOutputSqtFile) - CometWriteSqt::WriteSqt(fpout_sqt, fpoutd_sqt, fpdb); - cleanup_results: // Deleting each Query object in the vector calls its destructor, which @@ -3057,50 +2791,6 @@ bool CometSearchManager::DoSearch() if (iTotalSpectraSearched == 0) logout(" Warning - no spectra searched.\n"); - if (NULL != fpout_pepxml) - CometWritePepXML::WritePepXMLEndTags(fpout_pepxml); - - if (NULL != fpoutd_pepxml) - CometWritePepXML::WritePepXMLEndTags(fpoutd_pepxml); - - if (NULL != fpout_mzidentml) - { - fclose(fpout_mzidentmltmp); // close for writing and re-open for reading - - if ((fpout_mzidentmltmp = fopen(sOutputMzIdentMLtmp.c_str(), "r")) == NULL) - { - string strErrorMsg = " Error - cannot read temporary file \"" + sOutputMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - // now read tmp file and write mzIdentML - CometWriteMzIdentML::WriteMzIdentML(fpout_mzidentml, fpdb, sOutputMzIdentMLtmp.c_str(), *this); - - fclose(fpout_mzidentmltmp); - remove(sOutputMzIdentMLtmp.c_str()); - } - - if (NULL != fpoutd_mzidentml) - { - fclose(fpoutd_mzidentmltmp); // close for writing and re-open for reading - - if ((fpoutd_mzidentmltmp = fopen(sOutputDecoyMzIdentMLtmp.c_str(), "r")) == NULL) - { - string strErrorMsg = " Error - cannot read temporary file \"" + sOutputDecoyMzIdentMLtmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - bSucceeded = false; - } - - // now read tmp 
file and write mzIdentML - CometWriteMzIdentML::WriteMzIdentML(fpoutd_mzidentml, fpdb, sOutputDecoyMzIdentMLtmp.c_str(), *this); - - fclose(fpoutd_mzidentmltmp); - remove(sOutputDecoyMzIdentMLtmp.c_str()); - } - if (!g_staticParams.options.bOutputSqtStream) { const auto duration = chrono::duration_cast(chrono::steady_clock::now() - tBeginTime); @@ -3152,90 +2842,12 @@ bool CometSearchManager::DoSearch() // Deallocate search memory CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads); - if (NULL != fpout_pepxml) - { - fclose(fpout_pepxml); - fpout_pepxml = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputPepXML.c_str()); - } - - if (NULL != fpoutd_pepxml) - { - fclose(fpoutd_pepxml); - fpoutd_pepxml = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputDecoyPepXML.c_str()); - } - - if (NULL != fpout_mzidentml) - { - fclose(fpout_mzidentml); - fpout_mzidentml = NULL; - if (iTotalSpectraSearched == 0) - { - remove(sOutputMzIdentML.c_str()); - remove(sOutputMzIdentMLtmp.c_str()); - } - } - - if (NULL != fpoutd_mzidentml) - { - fclose(fpoutd_mzidentml); - fpoutd_mzidentml = NULL; - if (iTotalSpectraSearched == 0) - { - remove(sOutputDecoyMzIdentML.c_str()); - remove(sOutputDecoyMzIdentMLtmp.c_str()); - } - } - - if (NULL != fpout_percolator) - { - fclose(fpout_percolator); - fpout_percolator = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputPercolator.c_str()); - } - - if (NULL != fpout_sqt) - { - fclose(fpout_sqt); - fpout_sqt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputSQT.c_str()); - } - - if (NULL != fpoutd_sqt) - { - fclose(fpoutd_sqt); - fpoutd_sqt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputDecoySQT.c_str()); - } - - if (NULL != fpoutd_sqt) - { - fclose(fpoutd_sqt); - fpoutd_sqt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputDecoySQT.c_str()); - } - - if (NULL != fpout_txt) - { - fclose(fpout_txt); - fpout_txt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputTxt.c_str()); - } - - if 
(NULL != fpoutd_txt) + // Phase 3: finalize, fclose, and optionally remove files on empty search. { - fclose(fpoutd_txt); - fpoutd_txt = NULL; - if (iTotalSpectraSearched == 0) - remove(sOutputDecoyTxt.c_str()); + bool bEmpty = (iTotalSpectraSearched == 0); + for (auto& pw : vWriters) + pw->close(bSucceeded, bEmpty); + vWriters.clear(); } if (iTotalSpectraSearched == 0) diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 8ec0a970..c5a065a3 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -27,7 +27,10 @@ COMETSEARCH_SRC = Threading CometInterfaces CometSearch CometPreprocess CometPos CometWriteSqt CometWritePepXML CometWriteMzIdentML CometWritePercolator CometWriteTxt CometSearchManager \ CombinatoricsUtils CometModificationsPermuter CometFragmentIndex CometPeptideIndex CometSpecLib CometAlignment -COMETSEARCH_OBJ = $(addprefix $(OBJDIR)/, $(addsuffix .o, $(COMETSEARCH_SRC))) +THREADING_SRC = threading/SearchMemoryPool + +COMETSEARCH_OBJ = $(addprefix $(OBJDIR)/, $(addsuffix .o, $(COMETSEARCH_SRC))) \ + $(addprefix $(OBJDIR)/, $(addsuffix .o, $(THREADING_SRC))) all: libcometsearch.a @@ -57,5 +60,9 @@ $(OBJDIR)/CometPreprocess.o: CometPreprocess.cpp Common.h CometData.h CometDataI $(OBJDIR)/CometMassSpecUtils.o: CometMassSpecUtils.cpp Common.h CometData.h CometDataInternal.h CometSearch.h CometSearchManager.h CometMassSpecUtils.h CometInterfaces.h BS_thread_pool.hpp | $(OBJDIR) ${CXX} ${CXXFLAGS} ${DEPFLAGS} CometMassSpecUtils.cpp -c -o $@ +$(OBJDIR)/threading/%.o: threading/%.cpp threading/%.h | $(OBJDIR) + @mkdir -p $(OBJDIR)/threading + ${CXX} ${CXXFLAGS} -I. 
$< -c -o $@ + clean: rm -rf $(OBJDIR) *.a diff --git a/CometSearch/core/Constants.h b/CometSearch/core/Constants.h new file mode 100644 index 00000000..368a8cef --- /dev/null +++ b/CometSearch/core/Constants.h @@ -0,0 +1,109 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _COMETCONSTANTS_H_ +#define _COMETCONSTANTS_H_ + +#define PROTON_MASS 1.00727646688 +#define C13_DIFF 1.00335483 + +#define FLOAT_ZERO 1e-6 // 0.000001 + +#define MIN_PEPTIDE_LEN 1 // min # of AA for a petpide +#define MAX_PEPTIDE_LEN 51 // max # of AA for a peptide; one more than actual # to account for terminating char +#define MAX_PEPTIDE_LEN_P2 53 // max # of AA for a peptide plus 2 for N/C-term + +#define FRAGINDEX_MIN_IONS_SCORE 3 // min # of matched ions for peptide to register for E-value xcorr histogram +#define FRAGINDEX_MIN_IONS_REPORT 3 // min # of matched ions for peptide to be reported +#define FRAGINDEX_MIN_MASS 200.0 // minimum fragment ion mass used to generate fragment index +#define FRAGINDEX_MAX_MASS 2000.0 // maximum fragment ion mass used to generate fragment index +#define FRAGINDEX_MAX_BATCHSIZE 1000 // maximum number of spectra loaded when querying fragment index +#define FRAGINDEX_MAX_NUMPEAKS 150 // number of spectrum peaks used to query fragment index +#define FRAGINDEX_MAX_NUMSCORED 100 // for each fragment index spectrum query, score up to this many peptides +#define FRAGINDEX_MAX_COMBINATIONS 2000 +#define 
FRAGINDEX_MAX_MODS_PER_MOD 5 +#define FRAGINDEX_KEEP_ALL_PEPTIDES 1 // 1 = consider up to FRAGINDEX_MAX_COMBINATIONS of peptides; 0 = ignore all mods for peptide that exceed FRAGINDEX_MAX_COMBINATIONS + +#define MS1_MIN_MASS 0.0 // only parse up to this mass in MS1 scans for MS1 library searches +#define MS1_MAX_MASS 3000.0 // only parse up to this mass in MS1 scans for MS1 library searches +#define MS1_RT_HISTORY_SIZE 250 // size of MS1 RT history kept for recent history linear regression +#define MS1_RT_OUTLIER_THRESHOLD 2.0 // # stdev outlier threshold for MS1 RT history + +#define MAX_PEFFMOD_LEN 16 +#define SIZE_MASS 128 // ascii value size +#define SIZE_NATIVEID 256 // max length of nativeID string +#define NUM_SP_IONS 1000 // num ions for preliminary scoring +#define NUM_ION_SERIES 7 // a,b,c,x,y,z,z1 +#define EXPECT_DECOY_SIZE 3000 // number of decoy entries in CometDecoys.h + +#define WIDTH_REFERENCE 256 // length of the protein accession field to store +#define MAX_PROTEINS 50 // maximum number of proteins to return for each query; for index search only right now + +#define HISTO_SIZE 152 // some number greater than 150 + +#define NO_PEFF_VARIANT -127 + +#define ASCORE_CUTOFF_TO_ACCEPT 13.0 // minimum AScore value to accept localization + +#define FRAGINDEX_VMODS 5 // only parse first five variable mods for fragment ion index searches + // if this is ever larger than 16, need to extend range of siVarModProteinFilter + +#define VMODS 15 // also "VMODS+1" is 4th dimension of uiBinnedIonMasses to cover unmodified ions (0), mod NL (1-15) +#define COMPOUNDMODS_OFFSET 100 // piVarModSites values >= 100 encode compound mods; index = value - 100 +#define VMOD_1_INDEX 0 +#define VMOD_2_INDEX 1 +#define VMOD_3_INDEX 2 +#define VMOD_4_INDEX 3 +#define VMOD_5_INDEX 4 +#define VMOD_6_INDEX 5 +#define VMOD_7_INDEX 6 +#define VMOD_8_INDEX 7 +#define VMOD_9_INDEX 8 +#define VMOD_10_INDEX 9 +#define VMOD_11_INDEX 10 +#define VMOD_12_INDEX 11 +#define VMOD_13_INDEX 12 
+#define VMOD_14_INDEX 13 +#define VMOD_15_INDEX 14 + +#define ENZYME_SINGLE_TERMINI 1 +#define ENZYME_DOUBLE_TERMINI 2 +#define ENZYME_N_TERMINI 8 +#define ENZYME_C_TERMINI 9 + +#define ION_SERIES_A 0 +#define ION_SERIES_B 1 +#define ION_SERIES_C 2 +#define ION_SERIES_X 3 +#define ION_SERIES_Y 4 +#define ION_SERIES_Z 5 +#define ION_SERIES_Z1 6 //z+1 + +#ifdef CRUX +#define XCORR_CUTOFF -999.0 +#else +#define XCORR_CUTOFF 1E-8 // some near-zero cutoff +#endif + +#define SPECLIB_CUTOFF -999.9 + +// Identifies which type of database is being searched. +enum class DbType +{ + FASTA_DB = 0, // normal FASTA sequence database + FI_DB = 1, // fragment ion index (.idx) + PI_DB = 2 // peptide index (.idx) +}; + +#endif // _COMETCONSTANTS_H_ diff --git a/CometSearch/core/Params.h b/CometSearch/core/Params.h new file mode 100644 index 00000000..178a9943 --- /dev/null +++ b/CometSearch/core/Params.h @@ -0,0 +1,664 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Parameter structs: Options, DBInfo, StaticParams, and their sub-structs. 
+// Depends on: core/Constants.h, CometData.h + +#ifndef _COMETPARAMS_H_ +#define _COMETPARAMS_H_ + +#include +#include +#include +#include +#include "core/Constants.h" +#include "CometData.h" + +using std::string; +using std::vector; +using std::multimap; + +class CometSearchManager; + +struct Options +{ + int iNumPeptideOutputLines; + int iWhichReadingFrame; + int iEnzymeTermini; + int iNumStored; // # of search results to store for xcorr analysis + int iMaxDuplicateProteins; // maximum number of duplicate proteins to report or store in idx file + int iSpectrumBatchSize; // # of spectra to search at a time within the scan range + int iStartCharge; + int iEndCharge; + int iMaxFragmentCharge; + int iMinPrecursorCharge; + int iMaxPrecursorCharge; + int iMSLevel; // filter query scans in raw/mzML/mzXML input by ms level (aka MS2, MS3) + int iSpecLibMSLevel; // filter speclib scans in raw/mzML/mzXML input by ms level (aka MS2, MS3) + int iMinPeaks; + int iRemovePrecursor; // 0=no, 1=yes, 2=ETD precursors, 3=phosphate neutral loss + int iDecoySearch; // 0=no, 1=concatenated search, 2=separate decoy search + int iNumThreads; // 0=poll CPU else set # threads to spawn + int iNumFragmentThreads; // # threads used for fragment indexing + bool bResolveFullPaths; // 0=do not resolve full paths; 1=resolve paths (default) + bool bOutputSqtStream; + bool bOutputSqtFile; + bool bOutputTxtFile; + bool bOutputPepXMLFile; + int iOutputMzIdentMLFile; + bool bOutputPercolatorFile; + bool bClipNtermMet; // 0=leave protein sequences alone; 1=also consider w/o N-term methionine + bool bClipNtermAA; // 0=leave peptide sequences as-is; 1=clip N-term amino acid from every peptide + bool bMango; // 0=normal; 1=Mango x-link ms2 input + bool bScaleFragmentNL; // 0=no; 1=scale fragment NL for each modified residue contained in fragment + bool bCreateFragmentIndex; // 0=normal search; 1=create fragment ion index plain peptide file + bool bCreatePeptideIndex; // 0=normal search; 1=create peptide 
index file; only one of bCreateFragmentIndex and bCreatePeptideIndex can be 1 + bool bFastPlainPeptideIdx; // 0=legacy RunSearch path; 1=use PepGenTuple per-thread buffers (avoids heap alloc) + bool bVerboseOutput; + bool bExplicitDeltaCn; // if set to 1, do not use sequence similarity logic + bool bPrintExpectScore; + bool bExportAdditionalScoresPepXML; // if 1, also report lnrSp, lnExpect, IonFrac, lnNumSP to pepXML output + bool bCorrectMass; // use selectionMZ instead of monoMZ if monoMZ is outside selection window + bool bTreatSameIL; + int iPrintAScoreProScore; // 0=no, otherwise specify variable_modXX number e.g. 1 for variable_mod01 + int iMaxIndexRunTime; // max run time of index search in milliseconds + int iFragIndexMinIonsScore; // minimum matched fragment index ions for scoring + int iFragIndexMinIonsReport; // minimum matched fragment index ions for reporting + int iFragIndexNumSpectrumPeaks; // # of peaks from spectrum to use for querying fragment index + int iFragIndexSkipReadPrecursors; // if true, skips reading precursors step + int iOverrideCharge; + long lMaxIterations; // max # of modification permutations for each iStart position + double dMinIntensity; // intensity cutoff for each peak + double dMinPercentageIntensity; // intensity cutoff for each peak as % of base peak + double dRemovePrecursorTol; + double dPeptideMassLow; // MH+ mass + double dPeptideMassHigh; // MH+ mass + double dMinimumXcorr; // set the minimum xcorr to report (default is 1e-8) + double dFragIndexMaxMass; // fragment index maximum fragment mass + double dFragIndexMinMass; // fragment index minimum fragment mass + double dMS1MinMass; // low mass cutoff in MS1 query/library spectra + double dMS1MaxMass; // high mass cutoff in MS1 query/library spectra + IntRange scanRange; + IntRange peptideLengthRange; + DoubleRange clearMzRange; + char szActivationMethod[24]; // mzXML only + string sPinProteinDelimiter; // PIN file protein delimiter; default tab + + Options& 
operator=(Options& a) + { + iNumPeptideOutputLines = a.iNumPeptideOutputLines; + iWhichReadingFrame = a.iWhichReadingFrame; + iEnzymeTermini = a.iEnzymeTermini; + iNumStored = a.iNumStored; + iMaxDuplicateProteins = a.iMaxDuplicateProteins; + iSpectrumBatchSize = a.iSpectrumBatchSize; + iStartCharge = a.iStartCharge; + iEndCharge = a.iEndCharge; + iMaxFragmentCharge = a.iMaxFragmentCharge; + iMinPrecursorCharge = a.iMinPrecursorCharge; + iMaxPrecursorCharge = a.iMaxPrecursorCharge ; + iMSLevel = a.iMSLevel; + iMinPeaks = a.iMinPeaks; + iRemovePrecursor = a.iRemovePrecursor; + iDecoySearch = a.iDecoySearch; + iNumThreads = a.iNumThreads; + bResolveFullPaths = a.bResolveFullPaths; + bOutputSqtStream = a.bOutputSqtStream; + bOutputSqtFile = a.bOutputSqtFile; + bOutputTxtFile = a.bOutputTxtFile; + bOutputPepXMLFile = a.bOutputPepXMLFile; + iOutputMzIdentMLFile = a.iOutputMzIdentMLFile; + bOutputPercolatorFile = a.bOutputPercolatorFile; + bClipNtermMet = a.bClipNtermMet; + bClipNtermAA = a.bClipNtermAA; + bMango = a.bMango; + bScaleFragmentNL = a.bScaleFragmentNL; + bCreatePeptideIndex = a.bCreatePeptideIndex; + bCreateFragmentIndex = a.bCreateFragmentIndex; + bFastPlainPeptideIdx = a.bFastPlainPeptideIdx; + bVerboseOutput = a.bVerboseOutput; + bExplicitDeltaCn = a.bExplicitDeltaCn; + bPrintExpectScore = a.bPrintExpectScore; + iPrintAScoreProScore = a.iPrintAScoreProScore; + bExportAdditionalScoresPepXML = a.bExportAdditionalScoresPepXML; + iOverrideCharge = a.iOverrideCharge; + bCorrectMass = a.bCorrectMass; + bTreatSameIL = a.bTreatSameIL; + iMaxIndexRunTime = a.iMaxIndexRunTime; + lMaxIterations = a.lMaxIterations; + dMinIntensity = a.dMinIntensity; + dMinPercentageIntensity = a.dMinPercentageIntensity; + dRemovePrecursorTol = a.dRemovePrecursorTol; + dPeptideMassLow = a.dPeptideMassLow; + dPeptideMassHigh = a.dPeptideMassHigh; + dMinimumXcorr = a.dMinimumXcorr; + scanRange = a.scanRange; + peptideLengthRange = a.peptideLengthRange; + clearMzRange = a.clearMzRange; + 
strcpy(szActivationMethod, a.szActivationMethod); + sPinProteinDelimiter = a.sPinProteinDelimiter; + + dFragIndexMinMass = a.dFragIndexMinMass; + dFragIndexMaxMass = a.dFragIndexMaxMass; + iFragIndexMinIonsScore = a.iFragIndexMinIonsScore; + iFragIndexMinIonsReport = a.iFragIndexMinIonsReport ; + iFragIndexNumSpectrumPeaks = a.iFragIndexNumSpectrumPeaks; + iFragIndexSkipReadPrecursors = a.iFragIndexSkipReadPrecursors; + + dMS1MinMass = a.dMS1MinMass; + dMS1MaxMass = a.dMS1MaxMass; + + return *this; + } +}; + +// The minimum and maximum mass range of all peptides to consider +// i.e. lowestPepMass - tolerance to highestPepMass + tolerance +struct MassRange +{ + double dMinMass; + double dMaxMass; + unsigned short usiMaxFragmentCharge; // global maximum fragment charge + bool bNarrowMassRange; // used to determine how to parse peptides in SearchForPeptides + unsigned int uiMaxFragmentArrayIndex; // BIN(dFragIndexMaxMass); used as fragment array index +}; + +extern MassRange g_massRange; + +struct DBInfo +{ + char szDatabase[SIZE_FILE]; + char szFileName[SIZE_FILE]; + int iTotalNumProteins; + unsigned long int uliTotAACount; + + DBInfo& operator=(DBInfo& a) + { + strcpy(szDatabase, a.szDatabase); + strcpy(szFileName, a.szFileName); + iTotalNumProteins = a.iTotalNumProteins; + uliTotAACount = a.uliTotAACount; + + return *this; + } +}; + +struct SpecLibInfo // why a struct for just a string??? 
+{ + string strSpecLibFile; +}; + +struct PEFFInfo +{ + char szPeffOBO[SIZE_FILE]; + int iPeffSearch; // 0=no, 1=PSI-MOD, 2=Unimod, 3=PSI-MOD only, 4=Unimod only, 5=variants only +}; + +struct StaticMod +{ + double dAddCterminusPeptide; + double dAddNterminusPeptide; + double dAddCterminusProtein; + double dAddNterminusProtein; + double pdStaticMods[SIZE_MASS]; + + StaticMod& operator=(StaticMod& a) + { + dAddCterminusPeptide = a.dAddCterminusPeptide; + dAddNterminusPeptide = a.dAddNterminusPeptide; + dAddCterminusProtein = a.dAddCterminusProtein; + dAddNterminusProtein = a.dAddNterminusProtein; + + for (int i = 0; i < SIZE_MASS; ++i) + { + pdStaticMods[i] = a.pdStaticMods[i]; + } + + return *this; + } +}; + +struct PrecalcMasses +{ + double dNtermProton; // dAddNterminusPeptide + PROTON_MASS + double dCtermOH2Proton; // dAddCterminusPeptide + dOH2fragment + PROTON_MASS + double dOH2ProtonCtermNterm; // dOH2parent + PROTON_MASS + dAddCterminusPeptide + dAddNterminusPeptide + int iMinus17; // BIN'd value of mass(NH3) + int iMinus18; // BIN'd value of mass(H2O) + + PrecalcMasses& operator=(PrecalcMasses& a) + { + dNtermProton = a.dNtermProton; + dCtermOH2Proton = a.dCtermOH2Proton; + dOH2ProtonCtermNterm = a.dOH2ProtonCtermNterm; + iMinus17 = a.iMinus17; + iMinus18 = a.iMinus18; + + return *this; + } +}; + +struct VarModParams +{ + bool bVarModSearch; // set to true if variable mods are specified + bool bVarTermModSearch; // set to true if any n-term/c-term variable mods are specified + bool bVarProteinNTermMod; // set to true if a protein n-term variable mod specified + bool bVarProteinCTermMod; // set to true if a protein c-term variable mod specified + bool bBinaryModSearch; // set to true if any of the variable mods are of binary mod variety + bool bUseFragmentNeutralLoss; // set to true if any custom NL is set; applied only to 1+ and 2+ fragments + bool bRareVarModPresent; // set to true if any of iRequireThisMod == -1 + bool bVarModProteinFilter; // set to 
trueif protein mods list is applied + int iRequireVarMod; // 0=no; else use bits to determine which varmods are required + int iMaxVarModPerPeptide; + int iMaxPermutations; + VarMods varModList[VMODS]; + char cModCode[VMODS]; // mod characters + string sProteinLModsListFile; // file containing list of proteins to restrict application of varmods to + multimap mmapProteinModsList; // vector read from sProteinModsListFile if present + string sCompoundModsFile; // path to compound mods mass file; empty = disabled + vector vdCompoundMasses; // sorted, deduplicated list of masses read from sCompoundModsFile + unsigned int uiNumCompoundMasses; // vdCompoundMasses.size(); 0 when feature is disabled + + VarModParams& operator=(VarModParams& a) + { + bVarModSearch = a.bVarModSearch; + bVarTermModSearch = a.bVarTermModSearch; + bVarProteinNTermMod = a.bVarProteinNTermMod; + bVarProteinCTermMod = a.bVarProteinCTermMod; + bBinaryModSearch = a.bBinaryModSearch; + bUseFragmentNeutralLoss = a.bUseFragmentNeutralLoss; + bRareVarModPresent = a.bRareVarModPresent; + bVarModProteinFilter = a.bVarModProteinFilter; + iRequireVarMod = a.iRequireVarMod; + iMaxVarModPerPeptide = a.iMaxVarModPerPeptide; + iMaxPermutations = a.iMaxPermutations; + + for (int i = 0; i < VMODS; ++i) + { + varModList[i] = a.varModList[i]; + cModCode[i] = a.cModCode[i]; + } + + sCompoundModsFile = a.sCompoundModsFile; + vdCompoundMasses = a.vdCompoundMasses; + uiNumCompoundMasses = a.uiNumCompoundMasses; + + return *this; + } +}; + +struct MassUtil +{ + int bMonoMassesParent; + int bMonoMassesFragment; + double dCO; + double dNH3; + double dNH2; + double dH2O; + double dCOminusH2; + double dOH2fragment; + double dOH2parent; + double pdAAMassParent[SIZE_MASS]; + double pdAAMassFragment[SIZE_MASS]; + double pdAAMassUser[SIZE_MASS]; // user defined default amino acid masses + + MassUtil& operator=(MassUtil& a) + { + bMonoMassesParent = a.bMonoMassesParent; + bMonoMassesFragment = a.bMonoMassesFragment; + dCO = 
a.dCO; + dNH3 = a.dNH3; + dNH2 = a.dNH2; + dH2O = a.dH2O; + dCOminusH2 = a.dCOminusH2; + dOH2fragment = a.dOH2fragment; + dOH2parent = a.dOH2parent; + + for (int i = 0; i < SIZE_MASS; ++i) + { + pdAAMassParent[i] = a.pdAAMassParent[i]; + pdAAMassFragment[i] = a.pdAAMassFragment[i]; + pdAAMassUser[i] = a.pdAAMassUser[i]; + } + + return *this; + } +}; + +struct ToleranceParams +{ + int iMassToleranceUnits; // 0=amu, 1=mmu, else ppm (2) + int iMassToleranceType; // 0=MH+ (default), 1=precursor m/z; only valid if iMassToleranceUnits > 0 + int iIsotopeError; + double dInputToleranceMinus; // raw tolerance value from param file, lower bound; gets converted to dPeptideMassToleranceMinus + double dInputTolerancePlus; // raw tolerance value from param file, upper bound; gets converted to dPeptideMassTolerancePlus + double dFragmentBinSize; + double dFragmentBinStartOffset; + double dMS1BinSize; + double dMS1BinStartOffset; + + ToleranceParams& operator=(ToleranceParams& a) + { + iMassToleranceUnits = a.iMassToleranceUnits; + iMassToleranceType = a.iMassToleranceType; + iIsotopeError = a.iIsotopeError; + dInputToleranceMinus = a.dInputToleranceMinus; + dInputTolerancePlus = a.dInputTolerancePlus; + dFragmentBinSize = a.dFragmentBinSize; + dFragmentBinStartOffset = a.dFragmentBinStartOffset; + dMS1BinSize = a.dMS1BinSize; + dMS1BinStartOffset = a.dMS1BinStartOffset; + + return *this; + } +}; + +struct IonInfo +{ + int iNumIonSeriesUsed; + int piSelectedIonSeries[NUM_ION_SERIES]; + bool bUseWaterAmmoniaLoss; // ammonia, water loss + int iTheoreticalFragmentIons; + int iIonVal[NUM_ION_SERIES]; + + IonInfo& operator=(IonInfo& a) + { + iNumIonSeriesUsed = a.iNumIonSeriesUsed; + bUseWaterAmmoniaLoss = a.bUseWaterAmmoniaLoss; + iTheoreticalFragmentIons = a.iTheoreticalFragmentIons; + + for (int i = 0; i < NUM_ION_SERIES; ++i) + { + piSelectedIonSeries[i] = a.piSelectedIonSeries[i]; + iIonVal[i] = a.iIonVal[i]; + } + + return *this; + } +}; + +// static user params, won't change per 
thread - can make global! +struct StaticParams +{ + string sHostName; + char szMod[512]; // used for sqt output + char szDecoyPrefix[256]; // used for prefix to indicate decoys + string sDecoyPrefix; // escaped version of szDecoyPrefix for output within XML files + char szOutputSuffix[256]; // used for suffix to append to output file base names + char szTxtFileExt[256]; // text file extension; default "txt" + int iElapseTime; + char szDate[32]; + Options options; + DBInfo databaseInfo; + SpecLibInfo speclibInfo; + PEFFInfo peffInfo; + InputFileInfo inputFile; + int bPrintDuplReferences; + VarModParams variableModParameters; + ToleranceParams tolerances; + StaticMod staticModifications; + PrecalcMasses precalcMasses; + EnzymeInfo enzymeInformation; + MassUtil massUtility; + double dInverseBinWidth; // this is used in BIN() many times so use inverse binWidth to do multiply vs. divide + int iArraySizeGlobal; // (int)((g_staticParams.options.dPeptideMassHigh + plus_tol_in_daltons + buffer) * g_staticParams.dInverseBinWidth) + // for MS1 library search, use dMS1MaxMass instead of dPeptideMassHigh + double dOneMinusBinOffset; // this is used in BIN() many times so calculate once + IonInfo ionInformation; + int iXcorrProcessingOffset; + DbType iDbType; // FASTA_DB = normal fasta; FI_DB = fragment ion indexed; PI_DB = peptide index + vector vectorMassOffsets; + vector precursorNLIons; + int iPrecursorNLSize; + int iOldModsEncoding; + bool bSkipToStartScan; + std::chrono::high_resolution_clock::time_point tRealTimeStart; // track run time of real-time index search + + StaticParams() + { + RestoreDefaults(); + } + + StaticParams& operator=(StaticParams& a) + { + sHostName = a.sHostName; + strcpy(szMod, a.szMod); + strcpy(szDecoyPrefix, a.szDecoyPrefix); + strcpy(szOutputSuffix, a.szOutputSuffix); + strcpy(szTxtFileExt, a.szTxtFileExt); + vectorMassOffsets = a.vectorMassOffsets; + precursorNLIons= a.precursorNLIons; + iPrecursorNLSize = a.iPrecursorNLSize; + iOldModsEncoding 
= a.iOldModsEncoding; + iElapseTime = a.iElapseTime; + strcpy(szDate, a.szDate); + options = a.options; + databaseInfo = a.databaseInfo; + speclibInfo = a.speclibInfo; + inputFile = a.inputFile; + bPrintDuplReferences = a.bPrintDuplReferences; + variableModParameters = a.variableModParameters; + tolerances = a.tolerances; + staticModifications = a.staticModifications; + precalcMasses = a.precalcMasses; + enzymeInformation = a.enzymeInformation; + massUtility = a.massUtility; + dInverseBinWidth = a.dInverseBinWidth; + iArraySizeGlobal = a.iArraySizeGlobal; + dOneMinusBinOffset = a.dOneMinusBinOffset; + iXcorrProcessingOffset = a.iXcorrProcessingOffset; + ionInformation = a.ionInformation; + return *this; + } + + void RestoreDefaults() + { + int i; + + inputFile.iInputType = InputType_MS2; + + szMod[0] = '\0'; + + iXcorrProcessingOffset = 75; + iDbType = DbType::FASTA_DB; + + databaseInfo.szDatabase[0] = '\0'; + speclibInfo.strSpecLibFile.clear(); + + strcpy(szDecoyPrefix, "DECOY_"); + strcpy(szTxtFileExt, "txt"); + szOutputSuffix[0] = '\0'; + + peffInfo.szPeffOBO[0] = '\0'; + peffInfo.iPeffSearch = 0; + + variableModParameters.sCompoundModsFile = ""; + variableModParameters.vdCompoundMasses.clear(); + variableModParameters.uiNumCompoundMasses = 0; + + iPrecursorNLSize = 0; + + for (i = 0; i < SIZE_MASS; ++i) + { + massUtility.pdAAMassParent[i] = 999999.; + massUtility.pdAAMassFragment[i] = 999999.; + massUtility.pdAAMassUser[i] = 0.0; + staticModifications.pdStaticMods[i] = 0.0; + } + + massUtility.bMonoMassesFragment = 1; + massUtility.bMonoMassesParent = 1; + +#ifdef CRUX + staticModifications.pdStaticMods[(int)'C'] = 57.021464; +#endif + + + enzymeInformation.iAllowedMissedCleavage = 2; + + for (i = 0; i < VMODS; ++i) + { + variableModParameters.varModList[i].iMaxNumVarModAAPerMod = 3; + variableModParameters.varModList[i].iMinNumVarModAAPerMod = 0; + variableModParameters.varModList[i].iBinaryMod = 0; + variableModParameters.varModList[i].iRequireThisMod = 0; + 
variableModParameters.varModList[i].iVarModTermDistance = -1; // distance from N or C-term distance + variableModParameters.varModList[i].iWhichTerm = 0; // specify N (0) or C-term (1) + variableModParameters.varModList[i].dVarModMass = 0.0; + variableModParameters.varModList[i].dNeutralLoss = 0.0; + variableModParameters.varModList[i].dNeutralLoss2 = 0.0; + strcpy(variableModParameters.varModList[i].szVarModChar, "X"); + +#ifdef CRUX + if (i==0) + { + variableModParameters.varModList[i].dVarModMass = 15.9949; + strcpy(variableModParameters.varModList[i].szVarModChar, "M"); + } +#endif + } + + variableModParameters.cModCode[0] = '*'; + variableModParameters.cModCode[1] = '#'; + variableModParameters.cModCode[2] = '@'; + variableModParameters.cModCode[3] = '^'; + variableModParameters.cModCode[4] = '~'; + variableModParameters.cModCode[5] = '$'; + variableModParameters.cModCode[6] = '%'; + variableModParameters.cModCode[7] = '!'; + variableModParameters.cModCode[8] = '+'; + for (int i = 9; i < VMODS; ++i) + { + int iAscii = 88 + i; //start with lower case 'a' ASCII 97 + if (iAscii <= 125) // thru '}' which is ASCII 125 + variableModParameters.cModCode[i] = (char)(iAscii); + else + variableModParameters.cModCode[i] = '_'; + } + + variableModParameters.iMaxVarModPerPeptide = 5; + variableModParameters.iMaxPermutations = MAX_PERMUTATIONS; + variableModParameters.bUseFragmentNeutralLoss = false; + variableModParameters.iRequireVarMod = 0; + + ionInformation.bUseWaterAmmoniaLoss = false; + ionInformation.iTheoreticalFragmentIons = 1; // 0 = flanking peaks; 1 = no flanking peaks + ionInformation.iIonVal[ION_SERIES_A] = 0; + ionInformation.iIonVal[ION_SERIES_B] = 1; + ionInformation.iIonVal[ION_SERIES_C] = 0; + ionInformation.iIonVal[ION_SERIES_X] = 0; + ionInformation.iIonVal[ION_SERIES_Y] = 1; + ionInformation.iIonVal[ION_SERIES_Z] = 0; + ionInformation.iIonVal[ION_SERIES_Z1] = 0; + + options.iNumPeptideOutputLines = 5; + options.iWhichReadingFrame = 0; + 
options.iEnzymeTermini = 2; + options.iNumStored = 100; // default # of search results to store for xcorr analysis. + options.iMaxDuplicateProteins = 20; // maximum number of duplicate proteins to report or store in idx file + + options.bExplicitDeltaCn = false; + options.bPrintExpectScore = true; + options.iPrintAScoreProScore = 0; + options.bExportAdditionalScoresPepXML = false; + options.bCorrectMass = false; + options.bTreatSameIL = true; + options.iOverrideCharge = 0; + options.iMaxIndexRunTime = 0; // index run time limit in milliseconds; 0=no time limit + options.iRemovePrecursor = 0; + options.dRemovePrecursorTol = 1.5; + + options.bOutputSqtStream = false; + options.bOutputSqtFile = false; + options.bOutputTxtFile = false; + options.bOutputPepXMLFile = true; + options.iOutputMzIdentMLFile = false; + options.bOutputPercolatorFile = false; + + options.bResolveFullPaths = true; + + options.bMango = false; + options.bScaleFragmentNL = false; + options.bCreatePeptideIndex = false; + options.bCreateFragmentIndex = false; + options.bFastPlainPeptideIdx = false; + options.bVerboseOutput = false; + options.iDecoySearch = 0; + options.iNumThreads = 4; + options.iNumFragmentThreads = 4; + options.bClipNtermMet = false; + options.bClipNtermAA = false; + + options.lMaxIterations = 0; + + // These parameters affect mzXML/RAMP spectra only. 
+ options.scanRange.iStart = 0; + options.scanRange.iEnd = 0; + options.iSpectrumBatchSize = 0; + options.iMinPeaks = 10; + options.iStartCharge = 0; + options.iEndCharge = 0; + options.iMaxFragmentCharge = 3; + options.iMinPrecursorCharge = 1; + options.iMaxPrecursorCharge = 6; + options.iMSLevel = 2; + options.dMinIntensity = 0.0; + options.dMinPercentageIntensity = 0.0; + options.dPeptideMassLow = 600.0; + options.dPeptideMassHigh = 5000.0; + options.dMinimumXcorr = XCORR_CUTOFF; + options.dFragIndexMaxMass = FRAGINDEX_MAX_MASS; + options.dFragIndexMinMass = FRAGINDEX_MIN_MASS; + strcpy(options.szActivationMethod, "ALL"); + // End of mzXML specific parameters. + + options.sPinProteinDelimiter = '\t'; + + options.dFragIndexMinMass = FRAGINDEX_MIN_MASS; + options.dFragIndexMaxMass = FRAGINDEX_MAX_MASS; + options.iFragIndexMinIonsScore = FRAGINDEX_MIN_IONS_SCORE; + options.iFragIndexMinIonsReport = FRAGINDEX_MIN_IONS_REPORT; + options.iFragIndexNumSpectrumPeaks = FRAGINDEX_MAX_NUMPEAKS; + options.iFragIndexSkipReadPrecursors = 1; // skip reading precursors by default + + options.dMS1MinMass = MS1_MIN_MASS; + options.dMS1MaxMass = MS1_MAX_MASS; + + options.clearMzRange.dStart = 0.0; + options.clearMzRange.dEnd = 0.0; + + options.peptideLengthRange.iStart = MIN_PEPTIDE_LEN; + options.peptideLengthRange.iEnd = MAX_PEPTIDE_LEN - 1; // -1 as MAX_PEPTIDE_LEN number includes terminating char + + staticModifications.dAddCterminusPeptide = 0.0; + staticModifications.dAddNterminusPeptide = 0.0; + staticModifications.dAddCterminusProtein = 0.0; + staticModifications.dAddNterminusProtein = 0.0; + + tolerances.iMassToleranceUnits = 0; + tolerances.iMassToleranceType = 0; + tolerances.iIsotopeError = 0; + tolerances.dInputToleranceMinus = -3.0; // peptide_mass_tolerance minus + tolerances.dInputTolerancePlus = 3.0; // peptide_mass_tolerance plus + tolerances.dFragmentBinSize = 1.0005; + tolerances.dFragmentBinStartOffset = 0.4; + tolerances.dMS1BinSize = 1.0005; + + 
bSkipToStartScan = true; + } +}; + +extern StaticParams g_staticParams; + +#endif // _COMETPARAMS_H_ diff --git a/CometSearch/core/Types.h b/CometSearch/core/Types.h new file mode 100644 index 00000000..afbe707f --- /dev/null +++ b/CometSearch/core/Types.h @@ -0,0 +1,843 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-spectrum and index runtime data structs: Results, Query, QueryMS1, etc. +// Depends on: core/Constants.h, core/Params.h, CometData.h, Threading.h, AScore headers + +#ifndef _COMETTYPES_H_ +#define _COMETTYPES_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "core/Constants.h" +#include "core/Params.h" +#include "Threading.h" +#include "AScoreOptions.h" +#include "AScoreCentroid.h" +#include "AScoreAPI.h" +#include "AScoreFactory.h" +#include "AScoreDllInterface.h" + +using std::string; +using std::vector; +using std::map; + +class CometSearchManager; + +struct Results +{ + double dPepMass; + double dExpect; + float fScoreSp; + float fXcorr; + float fDeltaCn; + float fLastDeltaCn; + float fAScorePro; // AScorePro score + unsigned short usiRankXcorr; + unsigned short usiLenPeptide; + unsigned short usiRankSp; + unsigned short usiMatchedIons; + unsigned short usiTotalIons; + comet_fileoffset_t lProteinFilePosition; // for indexdb, this is the entry in g_pvProteinsList + long lWhichProtein; // which entry in g_pvProteinsList[] contains the matched proteins + 
int piVarModSites[MAX_PEPTIDE_LEN_P2]; // store variable mods encoding, +2 to accomodate N/C-term + double pdVarModSites[MAX_PEPTIDE_LEN_P2]; // store variable mods mass diffs, +2 to accomodate N/C-term + char pszMod[MAX_PEPTIDE_LEN][MAX_PEFFMOD_LEN]; // store PEFF mod string + char szPeptide[MAX_PEPTIDE_LEN]; + char cPrevAA; // stores prev flanking AA + char cNextAA; // stores following flanking AA + bool bClippedM; // true if new N-term protein due to clipped methionine + char cHasVariableMod; // HasVariableModType enum: 0 = no variable mod, 1 = has variable mod, 2 = has AScorePro mod + string sPeffOrigResidues; // original residue(s) of a PEFF variant + string sAScoreProSiteScores; // AScorePro site scores as comma-separated string + int iPeffOrigResiduePosition; // position of PEFF variant substitution; -1 = n-term, iLenPeptide = c-term; -9=unused + int iPeffNewResidueCount; // more than 0 new residues is a substitution (if iPeffOrigResidueCount=1) or insertion (if iPeffOrigResidueCount>1) + vector pWhichProtein; // file positions of matched protein entries + vector pWhichDecoyProtein; // keep separate decoy list (used for separate decoy matches and combined results) +}; + +struct SpecLibResults // MS2 spec lib +{ + unsigned int iWhichSpecLib; // the matched spectral library entry + float fSpecLibScore; + float fXcorr; // use xcorr for now + float fCn; // speclib score + float fRTtime; // retention time in seconds of the matched entry +}; + +struct SpecLibResultsMS1 // MS1 spec lib +{ + unsigned int iWhichSpecLib; // the matched spectral library entry + float fDotProduct; // unit vector dot product aka cosine similarity + float fRTime; // retention time in seconds of the matched entry +}; + +struct PepMassInfo +{ + double dCalcPepMass; + double dExpPepMass; // protonated MH+ experimental mass + double dPeptideMassToleranceLow; // mass tolerance low in amu from experimental mass + double dPeptideMassToleranceHigh; // mass tolerance high in amu from experimental 
mass + double dPeptideMassToleranceMinus; // low end of mass tolerance range including isotope offsets + double dPeptideMassTolerancePlus; // high end of mass tolerance range including isotope offsets +}; + +struct SpectrumInfoInternal +{ + int iArraySize; // m/z versus intensity array + int iHighestIon; + int iScanNumber; + unsigned short usiChargeState; + unsigned short usiMaxFragCharge; + double dTotalIntensity; + float fRTime; + char szMango[32]; // Mango encoding + char szNativeID[SIZE_NATIVEID]; // nativeID string from mzML +}; + +// PreprocessStruct stores information used in preprocessing +// each spectrum. Information not kept around otherwise +struct PreprocessStruct +{ + int iHighestIon; + double dHighestIntensity; +}; + +struct OBOStruct // stores info read from OBO file +{ + double dMassDiffAvg; // this is looked up from strMod string from OBO + double dMassDiffMono; + string strMod; // mod string, PSI-MOD, Unimod or custom + + bool operator<(const OBOStruct& a) const + { + return (strMod < a.strMod); + } +}; + +struct ProteinEntryStruct +{ + comet_fileoffset_t lWhichProtein; // file pointer to protein + int iStartResidue; // start residue position in protein (1-based) + char cPrevAA; + char cNextAA; + + bool operator<(const ProteinEntryStruct& a) const + { + return (lWhichProtein < a.lWhichProtein); + } +}; + +struct PeffModStruct // stores info read from PEFF header +{ + double dMassDiffAvg; // this is looked up from strMod string from OBO + double dMassDiffMono; + int iPosition; // position of modification + char szMod[MAX_PEFFMOD_LEN]; + + bool operator<(const PeffModStruct& a) const + { + return (iPosition < a.iPosition); + } +}; + +struct PeffVariantSimpleStruct // stores info read from PEFF header +{ + int iPosition; // position of variant + char cResidue; // new variant + + bool operator<(const PeffVariantSimpleStruct& a) const + { + return (iPosition < a.iPosition); + } +}; + +struct PeffVariantComplexStruct // stores info read from PEFF 
header +{ + int iPositionA; // start position of variant + int iPositionB; // end position of variant + string sResidues; // if !empty(), insertion replacing aa from pos A to B; + // if empty(), deletion of aa from pos A to B + + bool operator<(const PeffVariantComplexStruct& a) const + { + return (iPositionA < a.iPositionA); + } +}; + +struct PeffProcessedStruct +{ + int iBeginResidue; + int iEndResidue; +}; + +struct PeffPositionStruct // collate PEFF mods by position in sequence +{ + int iPosition; // position within the sequence + vector vectorWhichPeff; // which specific peff entry from PeffModStruct + vector vectorMassDiffAvg; + vector vectorMassDiffMono; +}; + +struct PeffSearchStruct // variant info passed to SearchForPeptides +{ + int iPosition; + bool bBeginCleavage; + bool bEndCleavage; + char cOrigResidue; +}; + +//-->MH +typedef struct sDBEntry +{ + string strName; // might be able to delete this here + string strSeq; + comet_fileoffset_t lProteinFilePosition; + vector vectorPeffMod; + vector vectorPeffVariantSimple; + vector vectorPeffVariantComplex; + vector vectorPeffProcessed; +} sDBEntry; + +struct DBIndex +{ + vector pcVarModSites; // empty = unmodified; else [iLen+2] encoding var mods + comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList + double dPepMass; // MH+ pep mass + unsigned short siVarModProteinFilter; // bitwise representation of mmapProtein + char cPrevAA; + char cNextAA; + char sPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated + + bool operator==(const DBIndex& rhs) const + { + if (strcmp(sPeptide, rhs.sPeptide) != 0) + return false; + + if (fabs(dPepMass - rhs.dPepMass) > FLOAT_ZERO) + return false; + + int iLen = (int)strlen(sPeptide) + 2; + for (int i = 0; i < iLen; ++i) + { + char l = pcVarModSites.empty() ? 0 : pcVarModSites[i]; + char r = rhs.pcVarModSites.empty() ? 
0 : rhs.pcVarModSites[i]; + if (l != r) + return false; + } + + return true; + } + + bool operator<(const DBIndex& rhs) const + { + int cmp = strcmp(sPeptide, rhs.sPeptide); + if (cmp != 0) + return cmp < 0; + + if (fabs(dPepMass - rhs.dPepMass) > FLOAT_ZERO) + return dPepMass < rhs.dPepMass; + + int iLen = (int)strlen(sPeptide) + 2; + for (int i = 0; i < iLen; ++i) + { + char l = pcVarModSites.empty() ? 0 : pcVarModSites[i]; + char r = rhs.pcVarModSites.empty() ? 0 : rhs.pcVarModSites[i]; + if (l != r) + return l < r; + } + + // FINAL tie-breaker: lowest protein index first in order + // to grab flanking residues from the first protein + return lIndexProteinFilePosition < rhs.lIndexProteinFilePosition; + } +}; + +// Compact fixed-size tuple used during plain-peptide index generation. +// Replaces heap-heavy DBIndex entries during the per-thread collection phase. +struct PepGenTuple +{ + char sPeptide[MAX_PEPTIDE_LEN]; // original AA letters (or L->I canonical), null-terminated + double dPepMass; // MH+ mass + comet_fileoffset_t lProteinFileOffset;// FASTA byte offset of the source protein + uint16_t siVarModProteinFilter; + char cPrevAA; + char cNextAA; +}; + +// --------------------------------------------------------------------------- +// 5-bit amino acid encoding for per-length short-peptide key packing. +// AAs are mapped in ASCII sort order (A=1, C=2, ..., Y=20) so that sorting +// packed uint64 keys is equivalent to lexicographic sort of sequences within +// a given peptide length. 
+// ---------------------------------------------------------------------------
+static constexpr uint8_t kAA5bit[256] = {   // indexed by (unsigned char) residue letter; 0 = letter has no 5-bit code
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0-15
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 16-31
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 32-47
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 48-63
+   0, // 64 '@'
+   1, // 65 'A'
+   0, // 66 'B'
+   2, // 67 'C'
+   3, // 68 'D'
+   4, // 69 'E'
+   5, // 70 'F'
+   6, // 71 'G'
+   7, // 72 'H'
+   8, // 73 'I' (canonical for I/L when bTreatSameIL)
+   0, // 74 'J'
+   9, // 75 'K'
+   10, // 76 'L' (table value; PackPeptide substitutes 'I', i.e. code 8, when bTreatSameIL)
+   11, // 77 'M'
+   12, // 78 'N'
+   0, // 79 'O'
+   13, // 80 'P'
+   14, // 81 'Q'
+   15, // 82 'R'
+   16, // 83 'S'
+   17, // 84 'T'
+   0, // 85 'U'
+   18, // 86 'V'
+   19, // 87 'W'
+   0, // 88 'X'
+   20, // 89 'Y'
+   0, // 90 'Z'
+   // 91-255: all zeros (so lowercase letters are not encoded either)
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0
+};
+
+// Reverse map: 5-bit code -> amino acid character; code 0 and the unused codes 21-31 decode to '\0'.
+// Code 8 always decodes to 'I' (canonical; L maps to code 8 when bTreatSameIL), so this table alone cannot restore an 'L' packed as code 8.
+static constexpr char k5bitAA[32] = {   // exact inverse of the non-zero entries of kAA5bit
+   '\0','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R',
+   'S', 'T','V','W','Y','\0','\0','\0','\0','\0','\0','\0','\0','\0','\0','\0'
+};
+
+// Pack up to 12 amino acids into a uint64 key (5 bits each, 60 bits total); residue i fills bits [55-5i, 59-5i], so residue 0 is most significant.
+// When bTreatSameIL is true, L encodes identically to I. A key holds at most 12 residues: the shift 55 - 5*i reaches 0 at i = 11.
+inline uint64_t PackPeptide(const char* seq, int iLen, bool bTreatSameIL) // reads seq[0..iLen-1]; returns the packed key (0 when iLen <= 0)
+{
+   uint64_t key = 0;
+   for (int i = 0; i < iLen && i < 12; ++i) // a key holds 12 residues; a 13th would need a negative shift count (undefined behavior)
+   {
+      char c = seq[i];
+      if (bTreatSameIL && c == 'L') c = 'I';
+      key |= ((uint64_t)kAA5bit[(unsigned char)c] << (55 - i * 5)); // residue i fills bits [55-5i, 59-5i]
+   }
+   return key;
+}
+
+// Decode a packed key back to a null-terminated sequence of iLen characters; at most 12 residues are decoded and seq must have room for them plus the terminator.
+inline void UnpackPeptide(uint64_t key, int iLen, char* seq)
+{
+   for (int i = 0; i < iLen && i < 12; ++i) // same 12-residue bound as PackPeptide
+      seq[i] = k5bitAA[(key >> (55 - i * 5)) & 0x1F];
+   seq[iLen < 0 ? 0 : (iLen > 12 ? 12 : iLen)] = '\0'; // terminate right after the last decoded residue; never index outside [0, 12]
+}
+
+// Compact per-thread tuple for short peptides (len <= 12) during index generation.
+// 32 bytes on 64-bit (8-byte alignment); uILMask occupies 2 of the 4 trailing pad bytes. (Assumes comet_fileoffset_t is 8 bytes -- TODO confirm.)
+struct PepGenTupleShort
+{
+   uint64_t uPackedPep; // canonical 5-bit-encoded sequence (L treated as I when bTreatSameIL)
+   double dPepMass;
+   comet_fileoffset_t lProteinFileOffset;
+   uint16_t siVarModProteinFilter;
+   char cPrevAA;
+   char cNextAA;
+   uint16_t uILMask; // bitmask: bit k = 1 means position k was 'L' in FASTA original
+};
+
+// This is used for fragment indexing; plain peptides are stored in index
+// file and read in to this data struct. Same as DBIndex w/o pcVarModSites[]
+struct PlainPeptideIndexStruct
+{
+   comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList
+   double dPepMass; // MH+ pep mass, unmodified mass; modified mass in FragmentPeptidesStruct
+   unsigned short siVarModProteinFilter; // bitwise representation of mmapProtein
+   char cPrevAA;
+   char cNextAA;
+   char szPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated
+
+   bool operator==(const PlainPeptideIndexStruct &rhs) const // equality is by sequence text only; mass, flanks and protein position are ignored
+   {
+      return strcmp(szPeptide, rhs.szPeptide) == 0;
+   }
+};
+
+struct FragmentPeptidesStruct
+{
+   size_t iWhichPeptide; // reference to raw peptide (sequence, proteins, etc.) in PlainPeptideIndexStruct
+   int modNumIdx;
+   double dPepMass; // peptide mass (modified or unmodified) after permuting mods
+   char cNtermMod;
+   char cCtermMod;
+
+   bool operator<(const FragmentPeptidesStruct& a) const // orders by mass only
+   {
+      return dPepMass < a.dPepMass;
+   }
+};
+
+struct SpecLibStruct
+{
+   string strName; // any string associated with speclib entry
+   unsigned int iLibEntry; // a reference number associated with speclib entry
+   unsigned int iNumPeaks;
+   int iSpecLibCharge; // precursor charge; not relevant for MS1 speclib
+   double dSpecLibMW; // if a peptide, store neutral mass
+   float fRTime;
+   float fScaleMinInten; // min intensity of data prior to encoding to pccSparseFastXcorrData; 0.0 for unit vector
+   float fScaleMaxInten; // max intensity of data prior to encoding to ppcSparseFastXcorrData
+   vector> vSpecLibPeaks;
+   float* pfUnitVector;
+   unsigned int uiArraySizeMS1;
+};
+
+// for MS1 alignment
+struct RetentionMatch
+{
+   double dQueryTime;
+   double dReferenceTime;
+   int iSpectrumIndex;
+
+   RetentionMatch(double dQueryTime, double dReferenceTime, int iSpectrumIndex); // declared only; defined outside this header
+};
+extern std::deque RetentionMatchHistory;
+
+extern unsigned int* g_iFragmentIndex; // CSR flat data: all posting lists concatenated [g_iFragmentIndexOffset[bin]..g_iFragmentIndexOffset[bin+1])
+extern uint64_t* g_iFragmentIndexOffset; // CSR offsets [uiMaxFragmentArrayIndex+1]: cumulative entry counts, can exceed UINT_MAX for large non-enzymatic searches
+extern vector g_vFragmentPeptides;
+extern vector g_vRawPeptides;
+extern bool* g_bIndexPrecursors; // allocate an array of BIN(max_precursor, protonated) and use a bool to indicate if that precursor is present in input file(s)
+extern vector g_vSpecLib;
+extern vector> g_vulSpecLibPrecursorIndex; // this will be a vector of vectors
+
+struct IndexProteinStruct // for indexed database
+{
+   char szProt[WIDTH_REFERENCE];
+   comet_fileoffset_t lProteinFilePosition;
+   int iWhichProtein;
+};
+
+// Flat CSR-style storage for the per-peptide
protein list.
+// Replaces a vector of vectors (one inner vector per peptide) to eliminate the ~190M
+// individual heap allocations (one per inner vector) that caused a
+// ~6-minute free-time tail when building an MHC .idx file.
+// External interface mirrors the vector of vectors it replaces so
+// existing call sites need no changes. Row i is m_flat[m_off[i] .. m_off[i+1]).
+class ProteinsListCSR
+{
+public:
+   // Read-only proxy for a single row (one peptide's protein offsets); points into m_flat and does not own it.
+   struct Row
+   {
+      const comet_fileoffset_t* ptr;
+      size_t n;
+
+      size_t size() const { return n; }
+      bool empty() const { return n == 0; }
+
+      const comet_fileoffset_t& operator[](size_t j) const { return ptr[j]; }
+      comet_fileoffset_t at(size_t j) const { return ptr[j]; } // NOTE(review): unchecked and returns by value, unlike std::vector::at
+
+      const comet_fileoffset_t* begin() const { return ptr; }
+      const comet_fileoffset_t* end() const { return ptr + n; }
+   };
+
+   // Size / state: m_off holds one more entry than there are rows once the first row has been added
+   size_t size() const { return m_off.empty() ? 0 : m_off.size() - 1; }
+   bool empty() const { return size() == 0; }
+
+   // Modifiers
+   void clear()
+   {
+      vector().swap(m_flat); // swap with a temporary so the capacity is released, not just the size
+      vector().swap(m_off);
+   }
+
+   void reserve(size_t n) { m_off.reserve(n + 1); } // reserves row offsets only; m_flat is not reserved
+
+   void push_back(const vector& v)
+   {
+      if (m_off.empty()) m_off.push_back(0); // lazily seed the leading 0 offset
+      m_flat.insert(m_flat.end(), v.begin(), v.end());
+      m_off.push_back(m_flat.size());
+   }
+
+   void push_back(vector&& v)
+   {
+      if (m_off.empty()) m_off.push_back(0);
+      m_flat.insert(m_flat.end(), v.begin(), v.end()); // elements are copied; v's buffer is not adopted
+      m_off.push_back(m_flat.size());
+      vector().swap(v); // release source buffer immediately
+   }
+
+   // Batch-append from pre-built flat storage; replaces N push_back(vector&&) calls (one heap free() each) with 2 frees per block.
+   // flat: all protein file offsets for this block, concatenated in row order
+   // cnt: number of offsets per row (max value bounded by iMaxDuplicateProteins)
+   // Copies flat into m_flat with one insert() and extends m_off with the running
+   // sum of cnt (one push_back per row), then swaps both arguments with empty
+   // vectors to release their buffers. NOTE(review): when flat is empty nothing is
+   // appended and cnt is left untouched; sum(cnt) is not checked against flat.size().
+   void append_flat(vector& flat, vector& cnt)
+   {
+      if (flat.empty())
+         return;
+      if (m_off.empty())
+         m_off.push_back(0);
+      m_flat.insert(m_flat.end(), flat.begin(), flat.end());
+      for (uint32_t n : cnt)
+         m_off.push_back(m_off.back() + n);
+      vector().swap(flat);
+      vector().swap(cnt);
+   }
+
+   // Element access (no bounds checks; i must be < size())
+   Row operator[](size_t i) const
+   {
+      return {m_flat.data() + m_off[i],
+              static_cast(m_off[i + 1] - m_off[i])};
+   }
+
+   Row at(size_t i) const { return (*this)[i]; } // NOTE(review): forwards to operator[]; does not throw on a bad index
+
+   // Range-based for -- yields Row values (by value, built on each dereference)
+   struct Iterator
+   {
+      const ProteinsListCSR* self;
+      size_t i;
+
+      Row operator*() const { return (*self)[i]; }
+      Iterator& operator++() { ++i; return *this; }
+      bool operator!=(const Iterator& o) const { return i != o.i; } // compares the index only, not the container
+   };
+
+   Iterator begin() const { return {this, 0}; }
+   Iterator end() const { return {this, size()}; }
+
+private:
+   vector m_flat; // all protein offsets concatenated
+   vector m_off; // [N+1] CSR offsets; row i spans [m_off[i], m_off[i+1])
+};
+
+extern ProteinsListCSR g_pvProteinsList;
+extern std::unordered_map g_pvProteinNameCache; // file offset -> protein name string; populated at index load
+
+extern std::condition_variable g_searchPoolCV; // notified when a pool slot is released
+
+extern AScoreProCpp::AScoreOptions g_AScoreOptions; // AScore options
+extern AScoreProCpp::AScoreDllInterface* g_AScoreInterface;
+
+struct ModificationNumber
+{
+// int modificationNumber;
+   int modStringLen; // FIX: need to confirm if not needed (MOD_SEQS.at(modSeqIdx)).size();
+   char* modifications; // raw pointer; ownership is not visible in this header
+};
+
+extern vector MOD_NUMBERS;
+extern vector MOD_SEQS; // Unique modifiable sequences.
+extern int* MOD_SEQ_MOD_NUM_START; // Start index in the MOD_NUMBERS vector for a modifiable sequence; -1 if no modification numbers were generated
+extern int* MOD_SEQ_MOD_NUM_CNT; // Count of modification numbers for a modifiable sequence.
+
+// Index into the MOD_SEQS vector
+// -1 for peptides that have no modifiable amino acids
+// -2 for peptides with no modifiable amino acids but contain n/c-term mods
+extern int* PEPTIDE_MOD_SEQ_IDXS;
+
+extern int MOD_NUM;
+extern bool g_bPlainPeptideIndexRead;                  // set to true if plain peptide index file is read (and fragment index generated)
+                                                       // poor choice of name for the fragment index .idx given peptide index is back
+extern std::atomic g_bPeptideIndexRead;                // set to true if peptide index file is read
+extern bool g_bSpecLibRead;                            // set to true if spectral library file is read
+
+extern bool g_bPerformSpecLibSearch;                   // set to true if doing spectral library search
+extern bool g_bPerformDatabaseSearch;                  // set to true if doing database search
+
+extern bool g_bCometPreprocessMemoryAllocated;         // set to true when memory has been allocated
+extern bool g_bCometSearchMemoryAllocated;             // set to true when memory has been allocated
+
+extern bool g_bIdxNoFasta;                             // set to true when .idx file being search but corresponding .fasta not present
+                                                       // used in mzid output to skip sequence retrieval
+
+// Query stores information for peptide scoring and results
+// This struct is allocated for each spectrum/charge combination
+//
+// Ownership as implemented by the destructor below:
+//   - the outer ppfSparse* pointer arrays are always delete[]'d;
+//   - their row arrays are delete[]'d only when bSparseFromPool is false;
+//   - _pResults is delete[]'d when non-NULL; _pDecoys only when iDecoySearch == 2;
+//   - _pSpecLibResults is never released here.
+//     NOTE(review): presumably owned elsewhere -- confirm, otherwise this leaks.
+struct Query
+{
+   int iXcorrHistogram[HISTO_SIZE];
+   unsigned int uiHistogramCount;   // # of entries in histogram
+   float fPar[4];                   // parameters of LMA regression
+
+   int iMatchPeptideCount;          // # of peptides that get stored (i.e. are greater than lowest score)
+   int iDecoyMatchPeptideCount;     // # of decoy peptides that get stored (i.e. are greater than lowest score)
+
+   short siMaxXcorr;                // index of maximum correlation score in iXcorrHistogram
+
+   short siLowestXcorrScoreIndex;
+   short siLowestDecoyXcorrScoreIndex;
+
+   double dLowestXcorrScore;
+   double dLowestDecoyXcorrScore;
+
+   float fLowestSpecLibScore;
+
+   int iMinXcorrHisto;              // min xcorr score for xcorr histogram to address good E-values for poor/sparse spectra
+
+   double dMangoIndex;              // scan number decimal precursor value i.e. 2401.001 for scan 2401, first precursor/z pair
+
+   unsigned long int _uliNumMatchedPeptides;   // # of peptides that get scored
+   unsigned long int _uliNumMatchedDecoyPeptides;
+
+   // When true, sparse child arrays (float[SPARSE_MATRIX_SIZE]) belong to the
+   // thread-local RtsScratch pool and must NOT be delete[]'d by the destructor.
+   // Set only by PreprocessSingleSpectrumThreadLocal via PreprocessSingleSpectrumCore.
+   bool bSparseFromPool;
+
+   // Sparse matrix representation of data
+   int iSpScoreData;                // size of sparse matrix; row count of ppfSparseSpScoreData
+   int iFastXcorrDataSize;          // row count of ppfSparseFastXcorrData / ppfSparseFastXcorrDataNL
+   float **ppfSparseSpScoreData;
+   float **ppfSparseFastXcorrData;
+   float **ppfSparseFastXcorrDataNL;   // ppfSparseFastXcorrData with NH3, H2O contributions
+
+   // Store raw peaks for AScorePro
+
+   // List of ms/ms masses for fragment index search; intensity not important at this stage
+   vector vfRawFragmentPeakMass;
+   // Consider replacing vfRawFragmentPeakMass with a vector> to store
+   // both mass and intensity if AScorePro is used
+   vector vRawFragmentPeakMassIntensity;
+
+
+   PepMassInfo _pepMassInfo;
+   SpectrumInfoInternal _spectrumInfoInternal;
+   Results* _pResults;
+   Results* _pDecoys;
+   SpecLibResults* _pSpecLibResults;
+
+   std::chrono::high_resolution_clock::time_point tSearchStart;   // per-query search start time for iMaxIndexRunTime timeout
+
+   Mutex accessMutex;
+
+   // Puts every scalar and pointer member into a defined "nothing allocated"
+   // state so that the destructor is safe on a Query that was never filled in.
+   Query()
+   {
+      memset(iXcorrHistogram, 0, sizeof(iXcorrHistogram));
+
+      iMatchPeptideCount = 0;
+      iDecoyMatchPeptideCount = 0;
+      uiHistogramCount = 0;
+      iMinXcorrHisto = 0;
+
+      fPar[0]=0.0;
+      fPar[1]=0.0;
+      fPar[2]=0.0;
+      fPar[3]=0.0;
+
+      siMaxXcorr = 0;                     // index of maximum correlation score in iXcorrHistogram
+      siLowestXcorrScoreIndex = 0;
+      siLowestDecoyXcorrScoreIndex = 0;
+
+      dLowestXcorrScore = XCORR_CUTOFF;
+      dLowestDecoyXcorrScore = XCORR_CUTOFF;
+
+      fLowestSpecLibScore = SPECLIB_CUTOFF;
+
+      dMangoIndex = 0.0;
+
+      _uliNumMatchedPeptides = 0;
+      _uliNumMatchedDecoyPeptides = 0;
+
+      bSparseFromPool = false;
+
+      // The destructor uses these two counts as loop bounds, so they must be
+      // defined even when no sparse matrix was ever attached.
+      iSpScoreData = 0;
+      iFastXcorrDataSize = 0;
+
+      ppfSparseSpScoreData = NULL;
+      ppfSparseFastXcorrData = NULL;
+      ppfSparseFastXcorrDataNL = NULL;    // ppfSparseFastXcorrData with NH3, H2O contributions
+
+      vfRawFragmentPeakMass.clear();
+      vRawFragmentPeakMassIntensity.clear();
+
+      _pepMassInfo.dCalcPepMass = 0.0;
+      _pepMassInfo.dExpPepMass = 0.0;
+      _pepMassInfo.dPeptideMassToleranceLow = 0.0;
+      _pepMassInfo.dPeptideMassToleranceHigh = 0.0;
+      _pepMassInfo.dPeptideMassToleranceMinus = 0.0;
+      _pepMassInfo.dPeptideMassTolerancePlus = 0.0;
+
+      _spectrumInfoInternal.dTotalIntensity = 0.0;
+      _spectrumInfoInternal.iArraySize = 0;
+      _spectrumInfoInternal.iHighestIon = 0;
+      _spectrumInfoInternal.iScanNumber = 0;
+
+      _pResults = NULL;
+      _pDecoys = NULL;
+      _pSpecLibResults = NULL;
+
+      Threading::InitMutex(&accessMutex);
+   }
+
+   // Releases the sparse matrices and result arrays, then destroys accessMutex.
+   // Every row loop is guarded by a NULL check on the outer array so that a
+   // Query whose matrices were never allocated is destroyed without touching
+   // memory it does not own.
+   ~Query()
+   {
+      int i;
+      if (!bSparseFromPool && ppfSparseSpScoreData != NULL)
+      {
+         for (i = 0; i < iSpScoreData; ++i)
+         {
+            if (ppfSparseSpScoreData[i] != NULL)
+               delete[] ppfSparseSpScoreData[i];
+         }
+      }
+      delete[] ppfSparseSpScoreData;
+      ppfSparseSpScoreData = NULL;
+
+      if (g_staticParams.ionInformation.bUseWaterAmmoniaLoss
+            && (g_staticParams.ionInformation.iIonVal[ION_SERIES_A]
+               || g_staticParams.ionInformation.iIonVal[ION_SERIES_B]
+               || g_staticParams.ionInformation.iIonVal[ION_SERIES_Y]))
+      {
+         if (!bSparseFromPool)
+         {
+            for (i = 0; i < iFastXcorrDataSize; ++i)
+            {
+               if (ppfSparseFastXcorrData != NULL && ppfSparseFastXcorrData[i] != NULL)
+                  delete[] ppfSparseFastXcorrData[i];
+               if (ppfSparseFastXcorrDataNL != NULL && ppfSparseFastXcorrDataNL[i] != NULL)
+                  delete[] ppfSparseFastXcorrDataNL[i];
+            }
+         }
+         delete[] ppfSparseFastXcorrDataNL;
+         ppfSparseFastXcorrDataNL = NULL;
+      }
+      else
+      {
+         if (!bSparseFromPool && ppfSparseFastXcorrData != NULL)
+         {
+            for (i = 0; i < iFastXcorrDataSize; ++i)
+            {
+               if (ppfSparseFastXcorrData[i] != NULL)
+                  delete[] ppfSparseFastXcorrData[i];
+            }
+         }
+      }
+      delete[] ppfSparseFastXcorrData;
+      ppfSparseFastXcorrData = NULL;
+
+      if (_pResults != NULL)
+      {
+         _pResults->pWhichProtein.clear();
+         if (g_staticParams.options.iDecoySearch == 1)
+            _pResults->pWhichDecoyProtein.clear();
+         delete[] _pResults;
+         _pResults = NULL;
+      }
+
+      if (g_staticParams.options.iDecoySearch == 2 && _pDecoys != NULL)
+      {
+         _pDecoys->pWhichDecoyProtein.clear();
+         delete[] _pDecoys;
+         _pDecoys = NULL;
+      }
+
+      Threading::DestroyMutex(accessMutex);
+   }
+};
+
+// Per-spectrum state for MS1 spectral-library matching.
+struct QueryMS1
+{
+   // short siLowestSpecLibIndex;
+   // float fLowestXcorr;
+   unsigned int uiMatchMS1Count;    // # of peptides that get stored (i.e. are greater than lowest score)
+   unsigned int iArraySizeMS1;      // dimension of pfFastXcorrData
+
+   // Standard array representation of data
+   // Library spectra are fast xcorr manipulated so no need to do so with query MS1
+   float* pfFastXcorrData;          // not released by ~QueryMS1(); NOTE(review): confirm who frees it
+
+   SpecLibResultsMS1 _pSpecLibResultsMS1;
+
+   Mutex accessMutex;
+
+   QueryMS1()
+   {
+      // siLowestSpecLibIndex = 0;
+      // fLowestXcorr = SPECLIB_CUTOFF;
+      uiMatchMS1Count = 0;
+      iArraySizeMS1 = 0;            // keep the dimension consistent with the NULL array below
+      pfFastXcorrData = NULL;
+      _pSpecLibResultsMS1.fDotProduct = 0.0;
+      _pSpecLibResultsMS1.fRTime = 0.0;
+
+      Threading::InitMutex(&accessMutex);
+   }
+
+   ~QueryMS1()
+   {
+      //FIX delete _pSepcLibResults
+
+      Threading::DestroyMutex(accessMutex);
+   }
+};
+
+extern vector g_pvQuery;
+extern vector g_pvQueryMS1;
+extern vector g_pvInputFiles;
+extern Mutex g_pvQueryMutex;
+extern Mutex g_pvDBIndexMutex;
+extern Mutex g_preprocessMemoryPoolMutex;
+extern Mutex g_searchMemoryPoolMutex;
+extern Mutex g_dbIndexMutex;
+extern Mutex g_vSpecLibMutex;
+
+extern vector g_pvDBIndex;   // used in both peptide index and fragment ion index; latter to store plain peptides
+// Per-length, per-thread generation buffers.
Outer index = (iLen - iMinLen) for short, +// (iLen - 13) for long. Inner index = thread slot. +extern vector>> g_vvvPepGenShort; // lengths <= 12 +extern vector>> g_vvvPepGenLong; // lengths > 12 +extern std::map g_pvProteinNames; // indexed database protein names and file positions + +struct IonSeriesStruct // defines which fragment ion series are considered +{ + int bPreviousMatch[8]; +}; + + +struct MatchedIonsStruct // for SingleSpectrumSearch +{ + double dMass; + double dInten; + + bool operator<(const MatchedIonsStruct& a) const + { + return dInten > a.dInten; + } +}; + +#endif // _COMETTYPES_H_ diff --git a/CometSearch/output/IResultWriter.h b/CometSearch/output/IResultWriter.h new file mode 100644 index 00000000..76a235a7 --- /dev/null +++ b/CometSearch/output/IResultWriter.h @@ -0,0 +1,62 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _IRESULTWRITER_H_ +#define _IRESULTWRITER_H_ + +#include +#include + +class CometSearchManager; + +// Parameters passed to each writer's open() method. 
+struct WriterOpenCtx +{ + const char* szBaseName; + const char* szOutputSuffix; + const char* szTxtFileExt; // TxtWriter only + bool bEntireFile; // true => no scan-range suffix on output name + int iFirstScan; + int iLastScan; + int iDecoySearch; // 0=off, 1=concat, 2=separate + CometSearchManager* pMgr; // for format headers that need ICometSearchManager +}; + +// Parameters passed to each writer's write() method (per-batch). +struct WriterWriteCtx +{ + FILE* fpdb; + int iScanOffset; // iTotalSpectraSearched - g_pvQuery.size(); pepXML only + int iBatchNum; // mzIdentML only +}; + +class IResultWriter +{ +public: + virtual ~IResultWriter() = default; + + // Open output file(s) and write format header. + // Returns false on error. + virtual bool open(const WriterOpenCtx& ctx) = 0; + + // Write all results in g_pvQuery for one batch. + // Returns false on error. + virtual bool write(const WriterWriteCtx& ctx) = 0; + + // Write format footer (if any), close file(s), and optionally remove + // them (bEmpty = iTotalSpectraSearched == 0). + virtual void close(bool bSucceeded, bool bEmpty) = 0; +}; + +#endif // _IRESULTWRITER_H_ diff --git a/CometSearch/output/MzIdentMlWriter.h b/CometSearch/output/MzIdentMlWriter.h new file mode 100644 index 00000000..43125a55 --- /dev/null +++ b/CometSearch/output/MzIdentMlWriter.h @@ -0,0 +1,154 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef _MZIDENTMLWRITER_H_
+#define _MZIDENTMLWRITER_H_
+
+#include "output/IResultWriter.h"
+#include "CometWriteMzIdentML.h"
+#include "CometStatus.h"
+#include "Common.h"
+
+#ifndef _WIN32
+#include <unistd.h>   // ::close() for the descriptor returned by mkstemp()
+#endif
+
+// IResultWriter adapter for mzIdentML output.
+//
+// open()  creates the final .mzid file(s) plus one temporary file per output;
+// write() hands each batch to CometWriteMzIdentML::WriteMzIdentMLTmp, which
+//         receives the temporary file handles;
+// close() passes each temporary file to CometWriteMzIdentML::WriteMzIdentML
+//         together with the final file handle, then closes and removes files.
+class MzIdentMlWriter : public IResultWriter
+{
+public:
+   explicit MzIdentMlWriter(CometSearchManager* pMgr) : _pMgr(pMgr) {}
+
+   // Opens the target output and its temp file, and the decoy pair when
+   // ctx.iDecoySearch == 2.  Returns false (after recording the error in
+   // g_cometStatus) as soon as any file cannot be created; files opened
+   // before the failure stay open until close() is called.
+   bool open(const WriterOpenCtx& ctx) override
+   {
+      BuildNames(ctx, ".mzid", ".decoy.mzid", ".target.mzid", _sTarget, _sDecoy);
+
+      _fpout = fopen(_sTarget.c_str(), "w");
+      if (!_fpout)
+      {
+         std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n";
+         g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg);
+         return false;
+      }
+      if (!OpenTmp(_sTarget, _sTgtTmp, _fpoutTmp)) return false;
+
+      if (ctx.iDecoySearch == 2)
+      {
+         _fpoutd = fopen(_sDecoy.c_str(), "w");
+         if (!_fpoutd)
+         {
+            std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n";
+            g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg);
+            return false;
+         }
+         if (!OpenTmp(_sDecoy, _sDecTmp, _fpoutdTmp)) return false;
+      }
+      return true;
+   }
+
+   // Writes one batch to the temporary file(s).  Always returns true.
+   bool write(const WriterWriteCtx& ctx) override
+   {
+      _fpdb = ctx.fpdb; // remember for close()
+      CometWriteMzIdentML::WriteMzIdentMLTmp(_fpoutTmp, _fpoutdTmp, ctx.iBatchNum);
+      return true;
+   }
+
+   // Finalizes target then decoy output.  When bEmpty is set, every file this
+   // writer named (final and temporary) is removed afterwards.
+   void close(bool bSucceeded, bool bEmpty) override
+   {
+      FinalizeOne(_fpout, _fpoutTmp, _sTgtTmp, bSucceeded, bEmpty);
+      FinalizeOne(_fpoutd, _fpoutdTmp, _sDecTmp, bSucceeded, bEmpty);
+      if (bEmpty)
+      {
+         if (!_sTarget.empty()) remove(_sTarget.c_str());
+         if (!_sDecoy.empty()) remove(_sDecoy.c_str());
+         if (!_sTgtTmp.empty()) remove(_sTgtTmp.c_str());
+         if (!_sDecTmp.empty()) remove(_sDecTmp.c_str());
+      }
+   }
+
+private:
+   CometSearchManager* _pMgr = nullptr;
+   FILE* _fpout = nullptr;
+   FILE* _fpoutd = nullptr;
+   FILE* _fpoutTmp = nullptr;
+   FILE* _fpoutdTmp = nullptr;
+   FILE* _fpdb = nullptr;
+   std::string _sTarget, _sDecoy, _sTgtTmp, _sDecTmp;
+
+   // Creates a uniquely named temporary file "<sBase>.XXXXXX" and opens it for
+   // writing.  On success sTmp holds the generated name and fp the stream.
+   // Returns false after recording the error in g_cometStatus.
+   //
+   // On POSIX the descriptor returned by mkstemp() is wrapped with fdopen()
+   // instead of reopening the file by name: discarding it would leak one
+   // descriptor per temporary file.
+   bool OpenTmp(const std::string& sBase, std::string& sTmp, FILE*& fp)
+   {
+      sTmp = sBase + ".XXXXXX";
+      bool bCreated = false;
+#ifdef _WIN32
+      // _mktemp_s only generates a name; the file is created by fopen().
+      bCreated = (_mktemp_s(&sTmp[0], sTmp.size() + 1) == 0);
+      if (bCreated)
+         fp = fopen(sTmp.c_str(), "w");
+#else
+      const int fd = mkstemp(&sTmp[0]);
+      bCreated = (fd != -1);
+      if (bCreated)
+      {
+         fp = fdopen(fd, "w");
+         if (!fp)
+            ::close(fd);   // '::' -- plain close() would name this class's close(bool,bool)
+      }
+#endif
+      if (!bCreated)
+      {
+         std::string msg = " Error - cannot create temporary file \"" + sTmp + "\".\n";
+         g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg);
+         return false;
+      }
+      if (!fp)
+      {
+         std::string msg = " Error - cannot write to temporary file \"" + sTmp + "\".\n";
+         g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg);
+         return false;
+      }
+      return true;
+   }
+
+   // Closes one final/temporary pair.  On success the temporary file is
+   // reopened for reading and handed (by name) to WriteMzIdentML before being
+   // removed; on failure it is just closed and removed.  Removal of the temp
+   // file is skipped when bEmpty is set because close() removes it afterwards.
+   void FinalizeOne(FILE*& fpFinal, FILE*& fpTmp, const std::string& sTmp,
+                    bool bSucceeded, bool bEmpty)
+   {
+      if (!fpFinal) return;
+      if (bSucceeded && fpTmp)
+      {
+         fclose(fpTmp);
+         fpTmp = fopen(sTmp.c_str(), "r");
+         if (fpTmp)
+         {
+            CometWriteMzIdentML::WriteMzIdentML(fpFinal, _fpdb, sTmp, *_pMgr);
+            fclose(fpTmp); fpTmp = nullptr;
+            if (!bEmpty) remove(sTmp.c_str());
+         }
+      }
+      else if (fpTmp)
+      {
+         fclose(fpTmp); fpTmp = nullptr;
+         if (!bEmpty) remove(sTmp.c_str());
+      }
+      fclose(fpFinal); fpFinal = nullptr;
+   }
+
+   // Builds "<base><suffix>[.<first>-<last>]<ext>" for the target file and,
+   // when ctx.iDecoySearch == 2, the decoy file.  Under CRUX the target of a
+   // separate-decoy search uses extTargetCrux instead of ext.
+   static void BuildNames(const WriterOpenCtx& ctx,
+                          const char* ext,
+                          const char* extDecoy,
+                          const char* extTargetCrux,
+                          std::string& sTarget,
+                          std::string& sDecoy)
+   {
+      std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix;
+      std::string range;
+      if (!ctx.bEntireFile)
+         range = "." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan);
+#ifdef CRUX
+      if (ctx.iDecoySearch == 2)
+      { sTarget = base + range + extTargetCrux; sDecoy = base + range + extDecoy; }
+      else
+         sTarget = base + range + ext;
+#else
+      (void)extTargetCrux;
+      sTarget = base + range + ext;
+      if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy;
+#endif
+   }
+};
+
+#endif // _MZIDENTMLWRITER_H_
diff --git a/CometSearch/output/PepXmlWriter.h b/CometSearch/output/PepXmlWriter.h
new file mode 100644
index 00000000..3270c04a
--- /dev/null
+++ b/CometSearch/output/PepXmlWriter.h
@@ -0,0 +1,105 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#ifndef _PEPXMLWRITER_H_ +#define _PEPXMLWRITER_H_ + +#include "output/IResultWriter.h" +#include "CometWritePepXML.h" +#include "CometStatus.h" +#include "Common.h" + +class PepXmlWriter : public IResultWriter +{ +public: + bool open(const WriterOpenCtx& ctx) override + { + BuildNames(ctx, ".pep.xml", ".decoy.pep.xml", ".target.pep.xml", _sTarget, _sDecoy); + + if ((_fpout = fopen(_sTarget.c_str(), "w")) == NULL) + { + std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + if (!CometWritePepXML::WritePepXMLHeader(_fpout, *ctx.pMgr)) + return false; + + if (ctx.iDecoySearch == 2) + { + if ((_fpoutd = fopen(_sDecoy.c_str(), "w")) == NULL) + { + std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + if (!CometWritePepXML::WritePepXMLHeader(_fpoutd, *ctx.pMgr)) + return false; + } + return true; + } + + bool write(const WriterWriteCtx& ctx) override + { + CometWritePepXML::WritePepXML(_fpout, _fpoutd, ctx.fpdb, ctx.iScanOffset); + return true; + } + + void close(bool bSucceeded, bool bEmpty) override + { + if (_fpout) + { + if (bSucceeded) CometWritePepXML::WritePepXMLEndTags(_fpout); + fclose(_fpout); _fpout = nullptr; + if (bEmpty) remove(_sTarget.c_str()); + } + if (_fpoutd) + { + if (bSucceeded) CometWritePepXML::WritePepXMLEndTags(_fpoutd); + fclose(_fpoutd); _fpoutd = nullptr; + if (bEmpty && !_sDecoy.empty()) remove(_sDecoy.c_str()); + } + } + +private: + FILE* _fpout = nullptr; + FILE* _fpoutd = nullptr; + std::string _sTarget; + std::string _sDecoy; + + static void BuildNames(const WriterOpenCtx& ctx, + const char* ext, + const char* extDecoy, + const char* extTargetCrux, + std::string& sTarget, + std::string& sDecoy) + { + std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; + std::string range; + if (!ctx.bEntireFile) + 
range = "." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); +#ifdef CRUX + if (ctx.iDecoySearch == 2) + { sTarget = base + range + extTargetCrux; sDecoy = base + range + extDecoy; } + else + sTarget = base + range + ext; +#else + (void)extTargetCrux; + sTarget = base + range + ext; + if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; +#endif + } +}; + +#endif // _PEPXMLWRITER_H_ diff --git a/CometSearch/output/PercolatorWriter.h b/CometSearch/output/PercolatorWriter.h new file mode 100644 index 00000000..d6528a39 --- /dev/null +++ b/CometSearch/output/PercolatorWriter.h @@ -0,0 +1,64 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _PERCOLATORWRITER_H_ +#define _PERCOLATORWRITER_H_ + +#include "output/IResultWriter.h" +#include "CometWritePercolator.h" +#include "CometStatus.h" +#include "Common.h" + +class PercolatorWriter : public IResultWriter +{ +public: + bool open(const WriterOpenCtx& ctx) override + { + std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; + std::string range; + if (!ctx.bEntireFile) + range = "." 
+ std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); + _sPath = base + range + ".pin"; + + _fpout = fopen(_sPath.c_str(), "w"); + if (!_fpout) + { + std::string msg = " Error - cannot write to file \"" + _sPath + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + CometWritePercolator::WritePercolatorHeader(_fpout); + return true; + } + + bool write(const WriterWriteCtx& ctx) override + { + return CometWritePercolator::WritePercolator(_fpout, ctx.fpdb); + } + + void close(bool /*bSucceeded*/, bool bEmpty) override + { + if (_fpout) + { + fclose(_fpout); _fpout = nullptr; + if (bEmpty) remove(_sPath.c_str()); + } + } + +private: + FILE* _fpout = nullptr; + std::string _sPath; +}; + +#endif // _PERCOLATORWRITER_H_ diff --git a/CometSearch/output/SqtWriter.h b/CometSearch/output/SqtWriter.h new file mode 100644 index 00000000..34341843 --- /dev/null +++ b/CometSearch/output/SqtWriter.h @@ -0,0 +1,104 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef _SQTWRITER_H_ +#define _SQTWRITER_H_ + +#include "output/IResultWriter.h" +#include "CometWriteSqt.h" +#include "CometStatus.h" +#include "Common.h" + +class SqtWriter : public IResultWriter +{ +public: + bool open(const WriterOpenCtx& ctx) override + { + if (g_staticParams.options.bOutputSqtFile) + { + BuildNames(ctx, ".sqt", ".decoy.sqt", ".target.sqt", _sTarget, _sDecoy); + + if ((_fpout = fopen(_sTarget.c_str(), "w")) == NULL) + { + std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + CometWriteSqt::PrintSqtHeader(_fpout, *ctx.pMgr); + + if (ctx.iDecoySearch == 2) + { + if ((_fpoutd = fopen(_sDecoy.c_str(), "w")) == NULL) + { + std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + CometWriteSqt::PrintSqtHeader(_fpoutd, *ctx.pMgr); + } + } + return true; + } + + bool write(const WriterWriteCtx& ctx) override + { + CometWriteSqt::WriteSqt(_fpout, _fpoutd, ctx.fpdb); + return true; + } + + void close(bool /*bSucceeded*/, bool bEmpty) override + { + if (_fpout) + { + fclose(_fpout); _fpout = nullptr; + if (bEmpty) remove(_sTarget.c_str()); + } + if (_fpoutd) + { + fclose(_fpoutd); _fpoutd = nullptr; + if (bEmpty && !_sDecoy.empty()) remove(_sDecoy.c_str()); + } + } + +private: + FILE* _fpout = nullptr; + FILE* _fpoutd = nullptr; + std::string _sTarget; + std::string _sDecoy; + + static void BuildNames(const WriterOpenCtx& ctx, + const char* ext, + const char* extDecoy, + const char* extTargetCrux, + std::string& sTarget, + std::string& sDecoy) + { + std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; + std::string range; + if (!ctx.bEntireFile) + range = "." 
+ std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); +#ifdef CRUX + if (ctx.iDecoySearch == 2) + { sTarget = base + range + extTargetCrux; sDecoy = base + range + extDecoy; } + else + sTarget = base + range + ext; +#else + (void)extTargetCrux; + sTarget = base + range + ext; + if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; +#endif + } +}; + +#endif // _SQTWRITER_H_ diff --git a/CometSearch/output/TxtWriter.h b/CometSearch/output/TxtWriter.h new file mode 100644 index 00000000..c9b5f77a --- /dev/null +++ b/CometSearch/output/TxtWriter.h @@ -0,0 +1,105 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef _TXTWRITER_H_ +#define _TXTWRITER_H_ + +#include "output/IResultWriter.h" +#include "CometWriteTxt.h" +#include "CometStatus.h" +#include "Common.h" + +class TxtWriter : public IResultWriter +{ +public: + bool open(const WriterOpenCtx& ctx) override + { + std::string ext = std::string(".") + ctx.szTxtFileExt; + std::string extDecoy = std::string(".decoy.") + ctx.szTxtFileExt; + std::string extTarget = std::string(".target.") + ctx.szTxtFileExt; + BuildNames(ctx, ext.c_str(), extDecoy.c_str(), extTarget.c_str(), _sTarget, _sDecoy); + + if ((_fpout = fopen(_sTarget.c_str(), "w")) == NULL) + { + std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + CometWriteTxt::PrintTxtHeader(_fpout); + fflush(_fpout); + + if (ctx.iDecoySearch == 2) + { + if ((_fpoutd = fopen(_sDecoy.c_str(), "w")) == NULL) + { + std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + return false; + } + CometWriteTxt::PrintTxtHeader(_fpoutd); + } + return true; + } + + bool write(const WriterWriteCtx& ctx) override + { + CometWriteTxt::WriteTxt(_fpout, _fpoutd, ctx.fpdb); + return true; + } + + void close(bool /*bSucceeded*/, bool bEmpty) override + { + if (_fpout) + { + fclose(_fpout); _fpout = nullptr; + if (bEmpty) remove(_sTarget.c_str()); + } + if (_fpoutd) + { + fclose(_fpoutd); _fpoutd = nullptr; + if (bEmpty && !_sDecoy.empty()) remove(_sDecoy.c_str()); + } + } + +private: + FILE* _fpout = nullptr; + FILE* _fpoutd = nullptr; + std::string _sTarget; + std::string _sDecoy; + + static void BuildNames(const WriterOpenCtx& ctx, + const char* ext, + const char* extDecoy, + const char* extTargetCrux, + std::string& sTarget, + std::string& sDecoy) + { + std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; + std::string range; + if (!ctx.bEntireFile) + range = "." 
+ std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); +#ifdef CRUX + if (ctx.iDecoySearch == 2) + { sTarget = base + range + extTargetCrux; sDecoy = base + range + extDecoy; } + else + sTarget = base + range + ext; +#else + (void)extTargetCrux; + sTarget = base + range + ext; + if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; +#endif + } +}; + +#endif // _TXTWRITER_H_ diff --git a/CometSearch/threading/SearchMemoryPool.cpp b/CometSearch/threading/SearchMemoryPool.cpp new file mode 100644 index 00000000..63b3103d --- /dev/null +++ b/CometSearch/threading/SearchMemoryPool.cpp @@ -0,0 +1,90 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "threading/SearchMemoryPool.h" +#include "Common.h" +#include "CometStatus.h" +#include + + +bool SearchMemoryPool::allocate(int nSlots, int iArraySize) +{ + if (_allocated) + return true; + + try + { + _inUse = new bool[nSlots](); + _pool = new bool*[nSlots]; + for (int i = 0; i < nSlots; ++i) + _pool[i] = new bool[iArraySize](); + _nSlots = nSlots; + _allocated = true; + return true; + } + catch (const std::bad_alloc& ba) + { + std::string strErrorMsg = " Error - SearchMemoryPool::allocate failed. 
bad_alloc: " + std::string(ba.what()) + ".\n";
+      g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+      logerr(strErrorMsg);
+      _allocated = false;
+      return false;
+   }
+}
+
+
+// Frees the nSlots scratch buffers, the buffer table and the in-use flags,
+// and returns the pool to its unallocated state.
+// _nSlots is reset together with the pointers: acquireSlot() scans
+// _inUse[0.._nSlots), so a stale count would make it read through the
+// now-null _inUse, and slotCount() would keep reporting freed slots.
+void SearchMemoryPool::_deallocate(int nSlots)
+{
+   delete[] _inUse;
+   if (_pool != nullptr)
+   {
+      for (int i = 0; i < nSlots; ++i)
+         delete[] _pool[i];
+   }
+   delete[] _pool;
+   _inUse = nullptr;
+   _pool = nullptr;
+   _nSlots = 0;
+   _allocated = false;
+}
+
+
+// Frees all scratch arrays; does nothing when the pool is not allocated.
+void SearchMemoryPool::deallocate()
+{
+   if (_allocated)
+      _deallocate(_nSlots);
+}
+
+
+// Claims the lowest-numbered free slot, waiting up to 240 seconds on the
+// pool's condition variable for one to be released.
+// Returns the slot index, or -1 if no slot became free within the timeout
+// (which is also the outcome when the pool currently has zero slots).
+int SearchMemoryPool::acquireSlot()
+{
+   int i = -1;
+   std::unique_lock lock(_mutex);
+   // The predicate runs with _mutex held, so marking the slot in-use here is
+   // atomic with respect to other acquireSlot()/releaseSlot() callers.
+   bool found = _cv.wait_for(lock, std::chrono::seconds(240), [&i, this]() {
+      for (int j = 0; j < _nSlots; ++j)
+      {
+         if (!_inUse[j])
+         {
+            _inUse[j] = true;
+            i = j;
+            return true;
+         }
+      }
+      return false;
+   });
+   return found ? i : -1;
+}
+
+
+// Marks the slot free and wakes one thread blocked in acquireSlot().
+// An index outside [0, _nSlots) is ignored: that covers the -1 returned by a
+// timed-out acquireSlot() and any slot number that outlived deallocate(),
+// either of which would otherwise write outside the _inUse array.
+void SearchMemoryPool::releaseSlot(int slot)
+{
+   {
+      std::lock_guard lk(_mutex);
+      if (slot < 0 || slot >= _nSlots)
+         return;
+      _inUse[slot] = false;
+   }
+   _cv.notify_one();
+}
diff --git a/CometSearch/threading/SearchMemoryPool.h b/CometSearch/threading/SearchMemoryPool.h
new file mode 100644
index 00000000..4a69b22b
--- /dev/null
+++ b/CometSearch/threading/SearchMemoryPool.h
@@ -0,0 +1,64 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Owns the per-thread duplicate-fragment scratch arrays used during FI/PI search.
+// Extracted from CometSearch static members (_pbSearchMemoryPool, +// _ppbDuplFragmentArr, AllocateMemory, DeallocateMemory, AcquirePoolSlot) +// and the paired globals g_searchMemoryPoolMutex, g_searchPoolCV, +// g_bCometSearchMemoryAllocated. + +#ifndef _SEARCHMEMORYPOOL_H_ +#define _SEARCHMEMORYPOOL_H_ + +#include +#include + +class SearchMemoryPool +{ +public: + SearchMemoryPool() = default; + ~SearchMemoryPool() { if (_allocated) _deallocate(_nSlots); } + + // Allocates nSlots scratch arrays each of iArraySize bools. + bool allocate(int nSlots, int iArraySize); + + // Frees all scratch arrays. + void deallocate(); + + // Blocks up to 240 s until a slot is free. + // Returns slot index [0, nSlots) or -1 on timeout. + int acquireSlot(); + + // Releases the slot and wakes one waiting acquireSlot() caller. + void releaseSlot(int slot); + + // Returns the duplicate-fragment scratch array for a claimed slot. + bool* duplFragmentArr(int slot) const { return _pool[slot]; } + + bool isAllocated() const { return _allocated; } + int slotCount() const { return _nSlots; } + +private: + void _deallocate(int nSlots); + + int _nSlots = 0; + bool* _inUse = nullptr; // [_nSlots]: true = slot claimed by a thread + bool** _pool = nullptr; // [_nSlots][iArraySize]: scratch buffers + bool _allocated = false; + + std::mutex _mutex; + std::condition_variable _cv; +}; + +#endif // _SEARCHMEMORYPOOL_H_ diff --git a/docs/20260612_architecture_migration.md b/docs/20260612_architecture_migration.md new file mode 100644 index 00000000..ff139abe --- /dev/null +++ b/docs/20260612_architecture_migration.md @@ -0,0 +1,891 @@ +# Architecture Migration Plan + +**Date**: 2026-06-12 +**Scope**: `CometSearch/` library only +**Goal**: Separate concerns, reduce coupling, increase modularity. +Behavior is unchanged at every step; each phase is independently compilable and testable. 
+ +--- + +## Background + +The codebase has six structural pathologies that this plan addresses in order of +increasing invasiveness: + +1. `CometDataInternal.h` (1,554 lines) is a monolith — constants, parameter structs, + result structs, index structs, and scoring structs all in one file. A one-line + change rebuilds every translation unit. + +2. Pool slot management (`_pbSearchMemoryPool`, `_ppbDuplFragmentArr`, + `AcquirePoolSlot`) is buried in `CometSearch` static members and bleeds into + `CometSearchManager` and `SearchThreadData` with no clear ownership. + +3. Five result writers (`CometWriteTxt`, `CometWriteSqt`, `CometWritePepXML`, + `CometWriteMzIdentML`, `CometWritePercolator`) are called via sequential `if` + chains in `DoSearch()` and access `g_pvQuery` / `g_staticParams` directly. + There is no shared interface. + +4. Twenty-eight mutable globals act as the implicit API between all modules: + `g_staticParams`, `g_pvQuery`, `g_pvQueryMS1`, `g_cometStatus`, + `g_searchMemoryPoolMutex`, `g_searchPoolCV`, `g_bPlainPeptideIndexRead`, etc. + Any file can write any global at any time. + +5. `CometSearchManager::DoSearch()` (~1,100 lines) mixes parameter validation, + index loading, per-file loop logic, file handle management, search dispatch, + result writing, and progress reporting. + +6. Search-path selection (`if iDbType == FI_DB ... else if PI_DB ...`) appears in + `DoSearch()`, `RunSearch()`, `LoadAndPreprocessSpectra()`, + `FusedLoadAndSearchSpectra()`, and `InitializeSingleSpectrumSearch()`. Adding + a new index type requires edits in eight or more places. 
+ +--- + +## Target Folder Structure + +``` +CometSearch/ +├── core/ +│ ├── Constants.h # All compile-time constants (split from CometDataInternal.h) +│ ├── Types.h # Results, Query, PepMassInfo, scoring data structs +│ └── Params.h # StaticParams and all sub-structs +│ +├── params/ +│ ├── ParamLoader.h/.cpp # File/map -> StaticParams (from CometSearchManager lines 625-1862) +│ └── ParamValidator.h/.cpp # ValidateOutputFormat, ValidateScanRange, etc. +│ +├── index/ +│ ├── ISearchIndex.h # Abstract interface: Load(), GetType(), IsLoaded() +│ ├── fragment/ +│ │ ├── FragmentIndex.h/.cpp # Runtime state + query +│ │ └── FragmentIndexBuilder.h/.cpp # WriteFIPlainPeptideIndex +│ ├── peptide/ +│ │ ├── PeptideIndex.h/.cpp +│ │ └── PeptideIndexBuilder.h/.cpp +│ └── speclib/ +│ ├── SpecLib.h/.cpp +│ └── Alignment.h/.cpp +│ +├── spectrum/ +│ ├── ISpectrumSource.h # Interface: next(Spectrum&)->bool, scanCount(), seekTo() +│ ├── MSReaderSource.h/.cpp # MSReader-backed implementation +│ ├── Preprocessor.h/.cpp # Binning, xcorr prep -- pure computation, no I/O +│ └── BoundedQueue.h # BoundedSpectrumQueue (moved from CometPreprocess.cpp) +│ +├── scoring/ +│ ├── XcorrScorer.h/.cpp # SearchFragmentIndex, XcorrScore +│ ├── SpScorer.h/.cpp # CalculateSP +│ └── EValueScorer.h/.cpp # CalculateEValue, CalculateDeltaCn +│ +├── search/ +│ ├── SearchSession.h # Owns mutable run state (replaces g_pvQuery etc.) 
+│ ├── ISearchStrategy.h # Pure virtual: initialize / execute / finalize +│ ├── FastaStrategy.h/.cpp # FASTA_DB path +│ ├── FiStrategy.h/.cpp # FI_DB batch + RTS paths +│ ├── PiStrategy.h/.cpp # PI_DB path +│ └── Pipeline.h/.cpp # Selects strategy, drives per-file loop +│ +├── output/ +│ ├── IResultWriter.h # Pure virtual: write(results, params) +│ ├── TxtWriter.h/.cpp +│ ├── SqtWriter.h/.cpp +│ ├── PepXmlWriter.h/.cpp +│ ├── MzIdentMlWriter.h/.cpp +│ └── PercolatorWriter.h/.cpp +│ +├── threading/ +│ ├── ThreadPool.h # Unchanged +│ └── SearchMemoryPool.h/.cpp # Extracted from CometSearch statics +│ +└── SearchManager.h/.cpp # Thin ICometSearchManager impl -- delegates to Pipeline +``` + +--- + +## Phase 1 — Split `CometDataInternal.h` + +**Effort**: ~1 day **Risk**: Low (mechanical split, no logic changes) + +### Problem + +`CometDataInternal.h` is included by every `.cpp` in the library. It contains: +- Physical/algorithmic constants (`#define` macros, lines 33–114) +- Fourteen parameter sub-structs (`Options`, `ToleranceParams`, `IonInfo`, + `MassUtil`, `VarModParams`, `StaticMod`, `PrecalcMasses`, `DBInfo`, + `SpecLibInfo`, `PEFFInfo`, `EnzymeInfo`, `MassRange`, lines 116–980) +- `StaticParams` aggregate (lines 890–1172) +- Result/query structs (`Results`, `Query`, `QueryMS1`, `PepMassInfo`, + `SpectrumInfoInternal`, `PreprocessStruct`, `SpecLibResults`, lines 248–1490) +- Index-related structs (`PlainPeptideIndexStruct`, `FragmentPeptidesStruct`, + `DBIndex`, `PepGenTuple`, `PepGenTupleShort`, `IndexProteinStruct`, + `ProteinsListCSR`, lines 454–1277) +- PEFF structs (`PeffModStruct`, `PeffVariantSimpleStruct`, + `PeffVariantComplexStruct`, `PeffPositionStruct`, `PeffSearchStruct`, + lines 340–424) +- Scoring/output structs (`MatchedIonsStruct`, `IonSeriesStruct`, + `ModificationNumber`, lines 1278–1555) +- `DbType` enum (line 882) + +### Action + +Create `CometSearch/core/` and split into three headers: + +**`core/Constants.h`** — all `#define` constants 
replaced with `constexpr`: + +``` +Source lines in CometDataInternal.h: 33-114 +Contents: + PROTON_MASS, C13_DIFF, FLOAT_ZERO + MIN/MAX_PEPTIDE_LEN, MAX_PEPTIDE_LEN_P2 + FRAGINDEX_* (8 constants) + MS1_* (4 constants) + MAX_PEFFMOD_LEN, SIZE_MASS, SIZE_NATIVEID + NUM_SP_IONS, NUM_ION_SERIES, VMODS, HISTO_SIZE + WIDTH_REFERENCE, MAX_PROTEINS, EXPECT_DECOY_SIZE + NO_PEFF_VARIANT, ASCORE_CUTOFF_TO_ACCEPT, FRAGINDEX_VMODS + COMPOUNDMODS_OFFSET, VMOD_*_INDEX (15 constants) + ENZYME_* (4 constants) + ION_SERIES_* (7 constants) + XCORR_CUTOFF, SPECLIB_CUTOFF + DbType enum (move from line 882) +Change: #define -> constexpr int/double. DbType moves here from line 882. +``` + +**`core/Params.h`** — all parameter structs that StaticParams aggregates: + +``` +Source lines in CometDataInternal.h: 116-246 (Options) + 828-854 (ToleranceParams) + 856-878 (IonInfo) + 721-789 (PrecalcMasses) + 790-826 (MassUtil) + 697-720 (StaticMod) + 741-789 (VarModParams) + 436-453 (DBInfo) + 645-649 (SpecLibInfo) + 691-696 (PEFFInfo) + 890-1172 (StaticParams) + 321-333 (MassRange) +Also includes: EnzymeInfo (from CometData.h -- leave in place, just #include it) +Depends on: core/Constants.h, CometData.h +``` + +**`core/Types.h`** — runtime data structs (per-spectrum, per-query): + +``` +Source lines in CometDataInternal.h: + 248-278 Results + 280-295 SpecLibResults, SpecLibResultsMS1 + 296-320 PepMassInfo, SpectrumInfoInternal + 334-339 PreprocessStruct + 340-424 PEFF structs (5 structs) + 425-435 sDBEntry + 454-602 DBIndex, PepGenTuple, PepGenTupleShort + 510-602 PepGenTuple / PepGenTupleShort + 603-644 PlainPeptideIndexStruct, FragmentPeptidesStruct + 650-689 SpecLibStruct, RetentionMatch + 684-690 IndexProteinStruct + 1175-1277 ProteinsListCSR + 1278-1310 ModificationNumber + 1312-1491 Query + 1492-1536 QueryMS1 + 1537-1555 IonSeriesStruct, MatchedIonsStruct + 352-364 ProteinEntryStruct + 365-424 Peff structs +Depends on: core/Constants.h, core/Params.h, CometData.h, Threading.h, AScore 
headers +``` + +### Transition + +Keep `CometDataInternal.h` as a compatibility shim that just includes the three +new headers. This means zero changes to existing `.cpp` files in Phase 1: + +```cpp +// CometDataInternal.h after Phase 1 -- pure forwarding +#pragma once +#include "core/Constants.h" +#include "core/Params.h" +#include "core/Types.h" +``` + +In Phase 2+, files that only need one of the three headers update their own +`#include` to the specific header. `CometDataInternal.h` can be retired once +no `.cpp` includes it directly. + +### Verification + +``` +make cclean && make # must compile clean +python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass +``` + +--- + +## Phase 2 — Extract `SearchMemoryPool` + +**Effort**: ~1 day **Risk**: Low (self-contained, well-tested at runtime) + +### Problem + +The duplicate-fragment scratch arrays and the pool-slot semaphore are spread +across three locations with no single owner: + +| Location | What it does | +|----------|-------------| +| `CometSearch.cpp` lines 23-24 | Defines `_pbSearchMemoryPool`, `_ppbDuplFragmentArr` as class statics | +| `CometSearch.cpp` lines 45-116 | `AllocateMemory()`, `DeallocateMemory()`, `AcquirePoolSlot()` | +| `CometSearch.h` lines 50-59 | `SearchThreadData::~SearchThreadData()` releases the slot directly | +| `CometSearch.cpp` lines 139-140, 182-183, 227-228, 272+ | Inline slot release at each `RunSearch` call site | +| `CometSearchManager.cpp` lines 2741-2748 | Calls `AllocateMemory` / `DeallocateMemory` | +| `CometSearchManager.cpp` line 60 | Defines `g_searchMemoryPoolMutex` | +| `CometSearchManager.cpp` line 67 | Defines `g_searchPoolCV` | +| `CometSearchManager.cpp` line 94 | Defines `g_bCometSearchMemoryAllocated` | + +### New File: `threading/SearchMemoryPool.h` + +```cpp +#pragma once +#include +#include + +// Owns the per-thread duplicate-fragment scratch arrays used during search. 
+// Replaces CometSearch::_pbSearchMemoryPool, _ppbDuplFragmentArr, +// AllocateMemory(), DeallocateMemory(), AcquirePoolSlot() and the paired globals +// g_searchMemoryPoolMutex, g_searchPoolCV, g_bCometSearchMemoryAllocated. +class SearchMemoryPool +{ +public: + SearchMemoryPool() = default; + ~SearchMemoryPool() { if (_allocated) deallocate(_nSlots); } + + // Allocates nSlots scratch arrays each of size iArraySize bools. + // Corresponds to CometSearch::AllocateMemory(nThreads). + bool allocate(int nSlots, int iArraySize); + + // Frees all scratch arrays. + // Corresponds to CometSearch::DeallocateMemory(nThreads). + void deallocate(int nSlots); + + // Blocks up to 240 s until a slot is free. Returns index in [0, nSlots) + // or -1 on timeout. Corresponds to CometSearch::AcquirePoolSlot(). + int acquireSlot(); + + // Returns the slot and signals one waiting acquireSlot() caller. + // Corresponds to the inline release blocks in CometSearch::RunSearch. + void releaseSlot(int slot); + + // Direct access to the scratch array for a claimed slot. 
+ bool* duplFragmentArr(int slot) const { return _pool[slot]; } + + int slotCount() const { return _nSlots; } + +private: + int _nSlots = 0; + bool* _inUse = nullptr; // was _pbSearchMemoryPool + bool** _pool = nullptr; // was _ppbDuplFragmentArr + bool _allocated = false; + + std::mutex _mutex; + std::condition_variable _cv; +}; +``` + +### New File: `threading/SearchMemoryPool.cpp` + +The implementations are direct ports of the existing functions: + +``` +allocate() <- CometSearch::AllocateMemory() lines 45-72 + reads: g_staticParams.iArraySizeGlobal (pass as parameter instead) + writes: g_bCometSearchMemoryAllocated (becomes _allocated member) + +deallocate() <- CometSearch::DeallocateMemory() lines 75-92 + reads: g_bCometSearchMemoryAllocated (becomes _allocated member) + +acquireSlot() <- CometSearch::AcquirePoolSlot() lines 97-116 + reads: g_staticParams.options.iNumThreads (becomes _nSlots) + uses: g_searchMemoryPoolMutex -> _mutex + g_searchPoolCV -> _cv + _pbSearchMemoryPool -> _inUse + +releaseSlot() <- inline blocks at CometSearch.cpp lines 139, 182, 227, 272+ + uses: g_searchMemoryPoolMutex -> _mutex + g_searchPoolCV -> _cv + _pbSearchMemoryPool -> _inUse +``` + +### `SearchThreadData` update + +`CometSearch.h` `SearchThreadData::~SearchThreadData()` (lines 50-59) currently +releases the slot directly into globals. Update it to hold a `SearchMemoryPool*` +and call `releaseSlot()`: + +```cpp +struct SearchThreadData +{ + sDBEntry dbEntry; + int iPoolSlot = -1; + SearchMemoryPool* pPool = nullptr; + ThreadPool* tp = nullptr; + + ~SearchThreadData() + { + if (pPool && iPoolSlot >= 0) + { + pPool->releaseSlot(iPoolSlot); + iPoolSlot = -1; + } + dbEntry.vectorPeffMod.clear(); + dbEntry.vectorPeffVariantSimple.clear(); + } +}; +``` + +### Call-site changes + +Every call site that calls `CometSearch::AllocateMemory()`, +`CometSearch::DeallocateMemory()`, or `CometSearch::AcquirePoolSlot()` is +updated to use the `SearchMemoryPool` object. 
The object is constructed in +`CometSearchManager::DoSearch()` and passed by reference to all functions that +need it: + +``` +CometSearchManager.cpp line 2741: CometPreprocess::AllocateMemory() -- unchanged +CometSearchManager.cpp line 2746: CometSearch::AllocateMemory() -> pool.allocate(n, arraySize) +CometSearchManager.cpp ~line 2332: CometSearch::DeallocateMemory() -> pool.deallocate(n) +CometSearch.cpp line 132: AcquirePoolSlot() -> pool.acquireSlot() +CometSearch.cpp line 139: inline release -> pool.releaseSlot(iSlot) +CometSearch.cpp line 175: (PI_DB path) same pattern +CometSearch.cpp lines 220, 227, 272+: (batch path) same pattern +CometPreprocess.cpp FusedLoadAndSearchSpectra: pool.duplFragmentArr(t) replaces + _ppbDuplFragmentArr[t] +``` + +Globals retired after this phase: `g_searchMemoryPoolMutex`, `g_searchPoolCV`, +`g_bCometSearchMemoryAllocated`. + +### Verification + +``` +make cclean && make +python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass +# run HeLa mzXML batch search and confirm PSM count matches pre-change baseline +``` + +--- + +## Phase 3 — Extract `IResultWriter` + +**Effort**: ~2 days **Risk**: Medium (touches writer internals) + +### Problem + +Five writer classes are dispatched from `DoSearch()` via 300+ lines of sequential +`if (bOutputXxx)` blocks (lines 2446–2900 in `CometSearchManager.cpp`). Each +writer reads `g_pvQuery` and `g_staticParams` directly. There is no shared +interface, so the dispatch cannot be driven polymorphically. + +### New File: `output/IResultWriter.h` + +```cpp +#pragma once +#include "core/Types.h" +#include "core/Params.h" +#include + +// Abstract result serializer. One concrete implementation per output format. +// Replaces the sequential if (bOutputTxtFile) / if (bOutputPepXMLFile) / ... +// dispatch in CometSearchManager::DoSearch(). +class IResultWriter +{ +public: + virtual ~IResultWriter() = default; + + // Open output file(s) and write format header. 
+ // baseName: g_staticParams.inputFile.szBaseName + szOutputSuffix + // Called once per input file, before any spectra are searched. + virtual bool open(const std::string& baseName, const StaticParams& params) = 0; + + // Write all results for one batch of spectra. + // results is sorted by scan number (compareByScanNumber already applied). + // Called once per spectrum batch within a file. + virtual void write(const std::vector& results, + const StaticParams& params) = 0; + + // Flush and close output file(s). Write format footer if needed (e.g. pepXML). + // Called once per input file, after all batches are complete. + virtual void close(const StaticParams& params) = 0; +}; +``` + +### Writer refactoring + +Each existing writer becomes a concrete `IResultWriter`. The key behavioral +change is: instead of reading `g_pvQuery` directly, receive `results` as a +parameter. `g_staticParams` access is replaced by the `params` parameter. + +**`CometWriteTxt` -> `output/TxtWriter`** + +``` +Current: void CometWriteTxt::PrintResults(int iWhichQuery, bool bDecoy, + FILE* fpout, FILE* fpoutd, int iPrintTargetDecoy) + reads g_pvQuery.at(iWhichQuery) and g_staticParams directly + +After: write() iterates over the results vector instead of g_pvQuery. + The file handles (fpout, fpoutd) become private members opened in open(). + g_staticParams references become params parameter. 
+ +open() <- file open + PrintTxtHeader() call (lines 2500-2550 in SearchManager) +write() <- current PrintResults() loop body, receiving vector +close() <- fclose(fpout); fclose(fpoutd); +``` + +**`CometWriteSqt` -> `output/SqtWriter`** + +``` +open() <- file open + PrintSqtHeader() call (lines 2446-2498 in SearchManager) +write() <- existing PrintResults() but receiving vector +close() <- fclose(fpout); fclose(fpoutd); +``` + +**`CometWritePepXML` -> `output/PepXmlWriter`** + +``` +open() <- file open + WritePepXMLHeader() (lines 2553-2627 in SearchManager) +write() <- existing PrintPepXMLResults() receiving vector +close() <- WritePepXMLFooter() + fclose +Note: pepXML has a two-pass pattern (tmp file + finalize). The tmp-file logic + (currently lines 2659-2724) moves into close(). +``` + +**`CometWriteMzIdentML` -> `output/MzIdentMlWriter`** + +``` +open() <- file open + header (lines 2628-2724 in SearchManager) +write() <- existing per-scan output +close() <- footer + tmp file merge + fclose +``` + +**`CometWritePercolator` -> `output/PercolatorWriter`** + +``` +open() <- file open + WritePercolatorHeader() (lines 2724-2734 in SearchManager) +write() <- existing PrintPercolatorResults() receiving vector +close() <- fclose +``` + +### `DoSearch()` dispatch replacement + +The 300-line dispatch block in `DoSearch()` (lines 2446-2900) becomes a factory +that builds a `vector>` once per input file: + +```cpp +// In DoSearch() -- replaces lines 2446-2734 +vector> writers; +if (g_staticParams.options.bOutputTxtFile) + writers.push_back(make_unique()); +if (g_staticParams.options.bOutputSqtFile || g_staticParams.options.bOutputSqtStream) + writers.push_back(make_unique()); +if (g_staticParams.options.bOutputPepXMLFile) + writers.push_back(make_unique()); +if (g_staticParams.options.iOutputMzIdentMLFile) + writers.push_back(make_unique()); +if (g_staticParams.options.bOutputPercolatorFile) + writers.push_back(make_unique()); + +// open all writers before first search 
batch +for (auto& w : writers) + if (!w->open(baseName, g_staticParams)) { /* handle error */ } + +// after each batch sort+write: +for (auto& w : writers) + w->write(g_pvQuery, g_staticParams); + +// after all batches: +for (auto& w : writers) + w->close(g_staticParams); +``` + +Note: `g_pvQuery` and `g_staticParams` are still globals at this phase. That +coupling is eliminated in Phase 4. Phase 3 only introduces the interface and +moves file-handle lifetime into the writer objects. + +### Verification + +``` +make cclean && make +python3 tests/unit/run_tests.py --comet comet.exe +# Run HeLa mzXML; diff txt output against pre-Phase-3 baseline -- must be identical +# (header line timestamp will differ; all PSM data must match exactly) +``` + +--- + +## Phase 4 — Introduce `SearchSession` + +**Effort**: ~3 days **Risk**: Medium-high (many call sites) + +### Problem + +The mutable state for one search run is scattered across 28 globals. Any code +can modify any of them without any indication of ownership or lifetime. + +### New File: `search/SearchSession.h` + +```cpp +#pragma once +#include "core/Params.h" +#include "core/Types.h" +#include "CometStatus.h" +#include +#include + +// Owns mutable state for one search run. +// Created at the start of DoSearch() / InitializeSingleSpectrumSearch(). +// Passed by reference to all pipeline functions that write results. +// Read-only index state (g_iFragmentIndex, g_vFragmentPeptides, g_vSpecLib, +// g_vRawPeptides, g_pvProteinsList, g_pvProteinNameCache) is NOT moved here -- +// those are large, initialized once, and shared read-only across all searches. +// They remain as const globals. (See note on pragmatic globals below.) +struct SearchSession +{ + // Run parameters -- set once before searching, then read-only. + // The params reference outlives the session (owned by CometSearchManager). + const StaticParams& params; + + // Per-batch result accumulator. 
+ // Guarded by queriesMutex in the batch path; not accessed concurrently in RTS. + std::vector queries; + std::vector ms1Queries; + std::mutex queriesMutex; + + // Run-time flags (currently globals) + bool bPerformDatabaseSearch = false; + bool bPerformSpecLibSearch = false; + bool bIdxNoFasta = false; + bool bPlainPeptideIndexRead = false; + bool bSpecLibRead = false; + + // Error / cancel state for this run. + // Replaces g_cometStatus for per-run isolation. + CometStatus status; + + explicit SearchSession(const StaticParams& p) : params(p) {} + SearchSession(const SearchSession&) = delete; + SearchSession& operator=(const SearchSession&) = delete; +}; +``` + +### Globals replaced by SearchSession + +``` +Global (CometSearchManager.cpp) -> SearchSession member +------------------------------------------------------- +g_pvQuery -> session.queries +g_pvQueryMS1 -> session.ms1Queries +g_pvQueryMutex -> session.queriesMutex +g_bPerformDatabaseSearch -> session.bPerformDatabaseSearch +g_bPerformSpecLibSearch -> session.bPerformSpecLibSearch +g_bIdxNoFasta -> session.bIdxNoFasta +g_bPlainPeptideIndexRead -> session.bPlainPeptideIndexRead +g_bSpecLibRead -> session.bSpecLibRead +g_cometStatus -> session.status +``` + +### Globals intentionally NOT moved (pragmatic globals) + +The following globals remain as globals. They are large, allocated once, +read-only after initialization, and shared by concurrent threads. 
Moving them +into a session object would require reference or pointer threading through +hundreds of scoring call sites with no correctness benefit: + +``` +g_staticParams -- read-only after DoSearch() init; stays global, read via session.params in session-aware code +g_iFragmentIndex -- read-only after index load; stays global +g_iFragmentIndexOffset -- same +g_vFragmentPeptides -- same +g_vRawPeptides -- same +g_pvProteinsList -- same +g_pvProteinNameCache -- same +g_vSpecLib -- same +g_pvDBIndex -- read-only after FASTA scan; stays global +g_vvvPepGenShort/.Long -- same +g_massRange -- derived from params; can be computed on demand +g_pvProteinNames -- read-only after load; stays global +g_pvInputFiles -- owned by CometSearchManager; stays +g_sCometVersion -- constant after init; stays +g_AScoreOptions -- constant after init; stays +g_AScoreInterface -- constant after init; stays +g_bPeptideIndexRead -- atomic, read-only after set; stays +RetentionMatchHistory -- deque used by alignment; keep as module-local in Alignment +``` + +### Migration strategy + +Introduce `SearchSession` alongside the existing globals. In Phase 4, both exist +in parallel. Each function signature that currently reads a global gets a +`SearchSession&` parameter added. The value is then read from `session.member` +instead of from the global directly. Once all reads/writes go through the session, +the global definition is removed. + +Recommended order within Phase 4 (lowest risk first): + +``` +Step 4a: Add session to DoSearch() and the per-file loop. Pass to writer open()/write()/close(). +Step 4b: Thread session into CometPreprocess::LoadAndPreprocessSpectra() and + FusedLoadAndSearchSpectra(). Remove g_pvQuery push under mutex; use + session.queries.push_back() under session.queriesMutex. +Step 4c: Thread session into CometSearch::RunSearch() overloads. RunSearch(Query*) + and RunSearch(int, int, ThreadPool*) no longer read g_pvQuery directly. +Step 4d: Thread session into CometPostAnalysis. 
PostAnalysisThreadProc currently + iterates g_pvQuery; replace with session.queries. +Step 4e: Remove global definitions for the nine replaced globals. Compiler errors + will identify any remaining direct accesses. +``` + +### Verification + +After each step 4a-4e: +``` +make cclean && make +python3 tests/unit/run_tests.py --comet comet.exe +# batch HeLa mzXML diff against Phase 3 baseline +``` + +--- + +## Phase 5 — Extract `ISearchStrategy` and `Pipeline` + +**Effort**: ~1 week **Risk**: High (most invasive refactor) + +### Problem + +`DoSearch()` selects the search path via cascading `if (iDbType == FI_DB)` chains +that appear in at minimum these locations: + +``` +CometSearchManager.cpp ~line 2252: bCreatePeptideIndex path +CometSearchManager.cpp ~line 2324: bCreateFragmentIndex path +CometSearchManager.cpp ~line 2352: FI_DB precursor pre-read +CometSearchManager.cpp ~line 2808: FI_DB index load +CometSearchManager.cpp ~line 2900: FASTA_DB vs FI/PI_DB file opens +CometSearch.cpp line 122: RunSearch(Query*) dispatch +CometSearch.cpp line 206: RunSearch(ThreadPool*) dispatch +CometPreprocess.cpp: LoadAndPreprocessSpectra vs FusedLoadAndSearchSpectra +CometSearchManager.cpp ~line 3283: InitializeSingleSpectrumSearch dispatch +``` + +### New File: `search/ISearchStrategy.h` + +```cpp +#pragma once +#include "SearchSession.h" +#include "threading/SearchMemoryPool.h" +#include "ThreadPool.h" + +struct InputFileInfo; + +// One implementation per database type: FastaStrategy, FiStrategy, PiStrategy. +// Pipeline selects the correct one at startup and holds it for the run. +class ISearchStrategy +{ +public: + virtual ~ISearchStrategy() = default; + + // Called once before the first input file. + // Responsible for index loading / building (e.g. ReadPlainPeptideIndex, + // CreateFragmentIndex, WriteFIPlainPeptideIndex, WritePeptideIndex). + // Returns false on error. 
+ virtual bool initialize(SearchSession& session, ThreadPool& pool) = 0; + + // Called once per input file. Opens the spectrum source, reads/searches + // all batches, appends fully scored Query* objects to session.queries. + // Returns false on error or cancel. + virtual bool execute(const InputFileInfo& file, + SearchSession& session, + SearchMemoryPool& pool, + ThreadPool& tp) = 0; + + // Called once after all files. Cleanup (index dealloc, etc.). + virtual void finalize(SearchSession& session, ThreadPool& pool) = 0; +}; +``` + +### Strategy implementations + +**`search/FiStrategy.h/.cpp`** — FI_DB batch path + +``` +initialize(): + If bCreateFragmentIndex: call WriteFIPlainPeptideIndex(tp) then return. + Else: pre-read precursors (if !iFragIndexSkipReadPrecursors), + call ReadPlainPeptideIndex() + CreateFragmentIndex(tp). + Source: DoSearch() lines 2324-2414. + +execute(): + Opens MSReader, calls FusedLoadAndSearchSpectra() in batch loop. + Source: DoSearch() lines 2808-3220 (FI_DB branch). + +finalize(): + CometSearch::DeallocateMemory(), CometPreprocess::DeallocateMemory(). +``` + +**`search/FastaStrategy.h/.cpp`** — FASTA_DB path + +``` +initialize(): + ReadProteinVarModFilterFile() if configured. + CometSearch::AllocateMemory(). + Source: DoSearch() lines 2252-2277. + +execute(): + Opens MSReader and FASTA file handle. + Runs LoadAndPreprocessSpectra() + RunSearch() in batch loop. + Source: DoSearch() lines 2800-3220 (FASTA_DB branch). + +finalize(): + DeallocateMemory(). +``` + +**`search/PiStrategy.h/.cpp`** — PI_DB path + +``` +initialize(): + If bCreatePeptideIndex: call WritePeptideIndex(tp) then return. + Else: load peptide index. + Source: DoSearch() lines 2245-2252. + +execute(): + Same loop structure as FiStrategy but calls SearchPeptideIndex(). + +finalize(): + DeallocateMemory(). 
+``` + +### New File: `search/Pipeline.h/.cpp` + +``` +// Pipeline.h +class Pipeline +{ +public: + Pipeline(unique_ptr<ISearchStrategy> strategy, + vector<unique_ptr<IResultWriter>> writers); + + // Drives the full batch search for all files. + // Replaces the main body of CometSearchManager::DoSearch(). + bool run(SearchSession& session, + const vector<InputFileInfo*>& files, + ThreadPool& pool); + +private: + void flushAndWrite(SearchSession& session); + unique_ptr<ISearchStrategy> _strategy; + vector<unique_ptr<IResultWriter>> _writers; +}; +``` + +### Strategy factory + +A free function in `SearchManager.cpp` selects the right strategy based on +`g_staticParams.iDbType` and the index-build flags. This is the single location +where the `if (iDbType == FI_DB)` logic lives after Phase 5: + +```cpp +static unique_ptr<ISearchStrategy> makeStrategy(const StaticParams& p) +{ + if (p.iDbType == DbType::FI_DB || p.options.bCreateFragmentIndex) + return make_unique<FiStrategy>(); + if (p.iDbType == DbType::PI_DB || p.options.bCreatePeptideIndex) + return make_unique<PiStrategy>(); + return make_unique<FastaStrategy>(); +} +``` + +### `DoSearch()` after Phase 5 + +The ~1,100-line `CometSearchManager::DoSearch()` body reduces to approximately: + +```cpp +bool CometSearchManager::DoSearch() +{ + if (!InitializeStaticParams()) return false; + if (!ValidateOutputFormat()) return false; + if (!ValidateScanRange()) return false; + if (!ValidatePeptideLengthRange()) return false; + + try { _tp->fillPool(g_staticParams.options.iNumThreads); } + catch (...) 
{ /* error */ return false; } + + SearchSession session(g_staticParams); + session.bPerformDatabaseSearch = ValidateSequenceDatabaseFile(); + session.bPerformSpecLibSearch = ValidateSpecLibFile(); + + auto strategy = makeStrategy(g_staticParams); + auto writers = makeWriters(g_staticParams); // builds IResultWriter vector + Pipeline pipeline(move(strategy), move(writers)); + + return pipeline.run(session, g_pvInputFiles, *_tp); +} +``` + +### RTS path + +The RTS entry points (`InitializeSingleSpectrumSearch`, +`DoSingleSpectrumSearchMultiResults`, `FinalizeSingleSpectrumSearch`) are +**not moved into the strategy pattern** in Phase 5. They are thread-safe, +well-tested, and called from C# via `CometWrapper`. Refactoring them carries +high wrapper-compatibility risk. They remain in `CometSearchManager` and use +`g_staticParams` / `g_iFragmentIndex` etc. directly. This is explicitly out +of scope for Phase 5. + +### Verification + +``` +make cclean && make +python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass +# batch HeLa mzXML diff against Phase 4 baseline -- identical PSM data +# run integration test (T17/T18) against human.small.fasta +# confirm RTS path still compiles and executes via RealtimeSearch.exe smoke test +``` + +--- + +## Build System + +The Linux `Makefile` currently globs `CometSearch/*.cpp`. After Phase 1 it needs +to include subdirectory sources. Update the `SRCS` variable in `CometSearch/Makefile`: + +```makefile +SRCS := $(wildcard *.cpp) \ + $(wildcard core/*.cpp) \ + $(wildcard threading/*.cpp) \ + $(wildcard output/*.cpp) \ + $(wildcard search/*.cpp) +``` + +The Windows `CometSearch.vcxproj` needs a new `` entry for each new +`.cpp` added. Use `` entries to create matching Solution Explorer folders. + +--- + +## Line-Ending Rule + +All new `.h` and `.cpp` files must use CRLF line endings (Windows `\r\n`). 
+Verify after creating each file: +```bash +file CometSearch/threading/SearchMemoryPool.h # must show "CRLF line terminators" +``` +If not, run `unix2dos ` before committing. + +--- + +## Phase Summary + +| Phase | Target | Key Files Changed | Globals Retired | Risk | +|-------|--------|-------------------|-----------------|------| +| 1 | Split `CometDataInternal.h` | `core/Constants.h`, `core/Params.h`, `core/Types.h` | None (shim kept) | Low | +| 2 | `SearchMemoryPool` | `threading/SearchMemoryPool.h/.cpp`, `CometSearch.h/.cpp`, `CometSearchManager.cpp` | `g_searchMemoryPoolMutex`, `g_searchPoolCV`, `g_bCometSearchMemoryAllocated` | Low | +| 3 | `IResultWriter` | `output/IResultWriter.h`, 5 writer files, `CometSearchManager.cpp` | None yet | Medium | +| 4 | `SearchSession` | `search/SearchSession.h`, `CometSearchManager.cpp`, `CometPreprocess.cpp`, `CometSearch.cpp`, `CometPostAnalysis.cpp` | `g_pvQuery`, `g_pvQueryMS1`, `g_pvQueryMutex`, `g_bPerformDatabaseSearch`, `g_bPerformSpecLibSearch`, `g_bIdxNoFasta`, `g_bPlainPeptideIndexRead`, `g_bSpecLibRead`, `g_cometStatus` | Medium-high | +| 5 | `ISearchStrategy` + `Pipeline` | `search/ISearchStrategy.h`, `FiStrategy`, `FastaStrategy`, `PiStrategy`, `Pipeline.h/.cpp`, `SearchManager.cpp` | Search-path `if/else` chains | High | diff --git a/tests/unit/data/t12_minlen.fasta.idx b/tests/unit/data/t12_minlen.fasta.idx index 67be58a00c2f365085e637b7f492f3ba1e1fbda3..e28a941d13aa956b732e6787096b822a6cc25707 100644 GIT binary patch delta 83 zcmey$@sndhpr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 Yvct@btdp%+lqFbM7{EXmO1na70H9|TmH+?% delta 79 zcmey#@s(pjpsKS~On81#c8qg=Zfc2cZenpsYEeu{YH>+%Ole+bNlZ#&Nn*^#s>95T UjFatHlqDEh7{EXiN;^Vn094}@1poj5 diff --git a/tests/unit/data/t14_boundary.fasta.idx b/tests/unit/data/t14_boundary.fasta.idx index 7f2d44c7e0fc5c5340cc5f29dcb82e4918ce3a6f..f9a47d445c274627b9523855a9014487440bc41b 100644 GIT binary patch delta 147 
zcmcb|^@wXipr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 ivKSUe4rT^0V1!T%B=aW=vl>fqut2mQgVJ)W5Iz8nzZKa4 delta 143 zcmaFFb&qR8psKS~On81#c8qg=Zfc2cZenpsYEeu{YH>+%Ole+bNlZ#&Nn*^#su&hW e7G?%8V1!T%B=aYWvl>gVut2mQg3?l~5Iz7MLlrmx diff --git a/tests/unit/data/t15_IL_long.fasta.idx b/tests/unit/data/t15_IL_long.fasta.idx index 36d620e5f73aeea6fec51fb3c783d09bf1fc11d9..821a5271da4fc1b36f718247ff44bedbf97e82b8 100644 GIT binary patch delta 99 zcmdnWy_0)Fpr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 gviB^4?92>czzCrj*e92+%Ole+bNlZ#&Nn*^#s`o5{ c%*+g6zzCrjm?u}Us!K4lLb#z&x)w?U0C@iue*gdg diff --git a/tests/unit/data/t15_IL_short.fasta.idx b/tests/unit/data/t15_IL_short.fasta.idx index f48c7258f4f98cffb2d5baf8f9bc6f48d6627558..61aa377493547d7b9bbbff5a74aedc70edb3f3a0 100644 GIT binary patch delta 99 zcmZ3*y^ecApr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 gvL`HpY|IQ`zzCrj*d}MQs!Oo3Lb!fVx(rGK0KR+`&j0`b delta 95 zcmZ3-y^4E6psKS~On81#c8qg=Zfc2cZenpsYEeu{YH>+%Ole+bNlZ#&Nn*^#swXUh cOw0^mzzCrjm?r15s!K4jLbzT~x(G@G0BkT7IsgCw diff --git a/tests/unit/data/t16_crosspath.fasta.idx b/tests/unit/data/t16_crosspath.fasta.idx index 6950c13011527525cbceee3fb96400eebef80c5a..877f4d29075f57be76c601d8fd14fabca36c9810 100644 GIT binary patch delta 593 zcmca2azkW7pr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 zvMly`PG$x$;ADZ&Oc08JPU7_p17tDO-g*WGI{O&a!$AA#71VHxp)t%L3S>;a!+%Ole+bNlZ#&Nn*^#sx0<; zR%Qk;U}b^OOc08JPU7_p17tDO-g*WGI{O&a!$AA#71VHxp)t%L3S>-vz?mSy$_f#> Kzz(H3Av6HGh8jr# diff --git a/tests/unit/data/t1_basic.fasta.idx b/tests/unit/data/t1_basic.fasta.idx index 1eab972e4f14e0dc9f3eb199e30047434ddd8c61..2a11729a1646074e250c08c447adf4b7aab3a9d8 100644 GIT binary patch delta 168 zcmey(^`C1(pr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 mvQ(BJW@ZL3V1!T%l=CNlWEPw3%&H*4%mOj+%Ole+bNlZ#&Nn*^#s#KPs f|4a;EzzCrjDCbW$WHpoc&kQl|Gz)|_W`)oIQpp^w diff --git a/tests/unit/data/t2_repeat.fasta.idx b/tests/unit/data/t2_repeat.fasta.idx index 
5ed9977996a02352d3c0dba580a22a137684f976..c7eca97159a7f2f04f1a122c8eff348c2aebd43e 100644 GIT binary patch delta 136 zcmeC@n#?sJP*XoQuS7ptKRmxETi-c9H?>4JH?g=RwMf4twYa2MzceqiL_a05BvF53 lnIel3GcyAiFhVE>!u-h}nZ+jGXOWO#W`XGGhSHazGysY>7nJ}2 delta 128 zcmbQt)z38{P}SKgCOp3=JH|ObH?>4JH?g=RwJ4?}wYa1>rZg|JBqk-XBr#@Vl_HDL ee%Pvl>-ieraB2iGE6ANuvJ7 pvNYCu7G?%8U}1sKOc08JPU1`qER$o|%q3V@A@*K?(iv+%Ole+bNlZ#&Nn*^#sx;Pm i24)5bV1ZCfP%fRsf!0k diff --git a/tests/unit/data/t4_IL.fasta.idx b/tests/unit/data/t4_IL.fasta.idx index b8fd624f8133d64a3e660ecbaa9c964b1ba6de57..b98ed902381df153b3c66058795ec3e09e56f785 100644 GIT binary patch delta 749 zcmeB{>6e)hsHvZuSE8S+AD&;7t?!(ln_8lqn^;_uTBKi+T3k}BUz(R$qMwpjlBmD2 zOqHvik(mJu7$Fn`BMTazw(|811M7f#hJkee(Eb5+0LcD6V!gsOoGL6P{m`9pjvzn_8lqn^;_uS`<@~T3k{bQ<|4q5|ffxk{GkGN|mer zFB1b8FhVGXzszWS+RE2646Fm{83xt?K>G*O0U-Ma(gEV&)P|V`CL8h|llaR5mSyPX Kg3!ji5E=j)ZbD=L diff --git a/tests/unit/data/t5_enzyme.fasta.idx b/tests/unit/data/t5_enzyme.fasta.idx index 4685e925cbe22e8887c333a9f49d44f43e2377c6..cf238090c8539a0b256c9b6954e28bf2cff2a083 100644 GIT binary patch delta 115 zcmaFN`J8h?pr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 iGENr7e@qNuzzCrjF!+-fuxLyCV}?i+%Ole+bNlZ#&Nn*^#Doz%q iUrY>OzzCrjF!_@|FpEvz%_1W4iy5LOp9MlMhtdGzzZ&iU diff --git a/tests/unit/data/t6_flanking.fasta.idx b/tests/unit/data/t6_flanking.fasta.idx index 3766070a00332a2b6a0e2fabf9bf5f9077398c6d..f0a8b8f20295123c35cdbeb505679cb3acbdd231 100644 GIT binary patch delta 179 zcmZ3>y^(uDpr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 ivSyY+%Ole+bNlZ#&Nn*^#s%Dl% eMrH;uV1!T%ROe5QW3`rGWPup}5=tksLihl49u?F8 diff --git a/tests/unit/data/t7_mass.fasta.idx b/tests/unit/data/t7_mass.fasta.idx index 1ffd45924ef7360343465d9f365f8f6917a58b93..19b53a4ee5354f89b45c95093c7594e53e09d309 100644 GIT binary patch delta 180 zcmZ3)y^MQ8pr(FqUWtCPet3RSw!U+IZfc2cZenpsYLR|PYH>-ieraB2iGE6ANuvJ7 ivI>?YMrH;uV1!T%)aOqQV6~E9WPzCf2ueq>Lihmc;uaqO delta 176 
zcmZ3+y@-24psKS~On81#c8qg=Zfc2cZenpsYEeu{YH>+%Ole+bNlZ#&Nn*^#stT5* fzf25ZzzCrjsL!7q!fGY)ml Date: Fri, 12 Jun 2026 23:48:23 -0700 Subject: [PATCH 02/15] implement architecture migration phase 4: SearchSession replaces mutable batch globals Introduces SearchSession struct (search/SearchSession.h) that owns all mutable per-run state for the batch path: queries, ms1Queries, and queriesMutex. Removes g_pvQuery and g_pvQueryMS1 global vectors from CometSearchManager.cpp and all extern declarations from core/Types.h. Key changes: - CometSearchManager: creates SearchSession per input-file iteration; wires queries/ms1Queries through all pipeline calls (LoadAndPreprocess, FusedLoadAndSearch, RunSearch, RunSpecLibSearch, PostAnalysis); sets wwctx.pQueries and woctx.bIdxNoFasta on writer contexts - CometPreprocess: threads SearchSession& through LoadAndPreprocessSpectra, FusedLoadAndSearchSpectra, FusedSearchSpectrum, PreprocessSpectrum, and PreprocessThreadData; uses session.queriesMutex (std::mutex) instead of g_pvQueryMutex for query-vector pushes - CometSearch: sets _pQueries member in SearchThreadProc; updates RunSpecLibSearch, RunMS1Search, SearchMS1Library, SearchPeptideIndex, BinarySearchMass, and all member functions to use _pQueries or explicit queries params instead of g_pvQuery/g_pvQueryMS1 - All writer classes (Txt, Sqt, PepXML, MzIdentML, Percolator) and CometMassSpecUtils/CometPostAnalysis accept const vector& instead of reading g_pvQuery - CometFragmentIndex and CometPeptideIndex pass emptyQueries to index- creation RunSearch calls (bCreateFragmentIndex path skips query access) - All 17 unit tests pass Co-Authored-By: Claude Sonnet 4.6 --- CometSearch/CometFragmentIndex.cpp | 3 +- CometSearch/CometMassSpecUtils.cpp | 11 ++- CometSearch/CometMassSpecUtils.h | 3 +- CometSearch/CometPeptideIndex.cpp | 3 +- CometSearch/CometPostAnalysis.cpp | 10 +- CometSearch/CometPostAnalysis.h | 7 +- CometSearch/CometPreprocess.cpp | 97 ++++++++++---------- 
CometSearch/CometPreprocess.h | 23 +++-- CometSearch/CometSearch.cpp | 126 ++++++++++++++------------ CometSearch/CometSearch.h | 24 +++-- CometSearch/CometSearchManager.cpp | 75 ++++++++------- CometSearch/CometWriteMzIdentML.cpp | 34 ++++--- CometSearch/CometWriteMzIdentML.h | 12 ++- CometSearch/CometWritePepXML.cpp | 29 +++--- CometSearch/CometWritePepXML.h | 9 +- CometSearch/CometWritePercolator.cpp | 28 +++--- CometSearch/CometWritePercolator.h | 9 +- CometSearch/CometWriteSqt.cpp | 27 +++--- CometSearch/CometWriteSqt.h | 9 +- CometSearch/CometWriteTxt.cpp | 39 ++++---- CometSearch/CometWriteTxt.h | 9 +- CometSearch/core/Types.h | 11 +-- CometSearch/output/IResultWriter.h | 6 +- CometSearch/output/MzIdentMlWriter.h | 18 ++-- CometSearch/output/PepXmlWriter.h | 2 +- CometSearch/output/PercolatorWriter.h | 2 +- CometSearch/output/SqtWriter.h | 2 +- CometSearch/output/TxtWriter.h | 2 +- CometSearch/search/SearchSession.h | 69 ++++++++++++++ 29 files changed, 413 insertions(+), 286 deletions(-) create mode 100644 CometSearch/search/SearchSession.h diff --git a/CometSearch/CometFragmentIndex.cpp b/CometSearch/CometFragmentIndex.cpp index db2671a7..52770972 100644 --- a/CometSearch/CometFragmentIndex.cpp +++ b/CometSearch/CometFragmentIndex.cpp @@ -602,7 +602,8 @@ bool CometFragmentIndex::GeneratePlainPeptideIndex(ThreadPool* tp, vector emptyQueries; + bool bSucceeded = CometSearch::RunSearch(0, 0, tp, emptyQueries); g_staticParams.options.bCreateFragmentIndex = false; g_staticParams.options.bFastPlainPeptideIdx = false; diff --git a/CometSearch/CometMassSpecUtils.cpp b/CometSearch/CometMassSpecUtils.cpp index 7857a9d4..a06b5f38 100644 --- a/CometSearch/CometMassSpecUtils.cpp +++ b/CometSearch/CometMassSpecUtils.cpp @@ -190,7 +190,8 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, bool bReturnFullProteinString, // 0 = return accession only, 1 = return full description line unsigned int *uiNumTotProteins, // matched protein count vector& vProteinTargets, 
// the target protein names - vector& vProteinDecoys) // the decoy protein names if applicable + vector& vProteinDecoys, // the decoy protein names if applicable + const vector& queries) { char szProteinName[WIDTH_REFERENCE]; @@ -209,9 +210,9 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, Results* pOutput; if (iPrintTargetDecoy != 2) - pOutput = g_pvQuery.at(iWhichQuery)->_pResults; + pOutput = queries.at(iWhichQuery)->_pResults; else - pOutput = g_pvQuery.at(iWhichQuery)->_pDecoys; + pOutput = queries.at(iWhichQuery)->_pDecoys; int iPrintDuplicateProteinCt = 0; // track # proteins, exit when at iMaxDuplicateProteins @@ -284,9 +285,9 @@ void CometMassSpecUtils::GetProteinNameString(FILE *fpdb, Results* pOutput; if (iPrintTargetDecoy != 2) - pOutput = g_pvQuery.at(iWhichQuery)->_pResults; + pOutput = queries.at(iWhichQuery)->_pResults; else - pOutput = g_pvQuery.at(iWhichQuery)->_pDecoys; + pOutput = queries.at(iWhichQuery)->_pDecoys; int iPrintDuplicateProteinCt = 0; // track # proteins, exit when at iMaxDuplicateProteins diff --git a/CometSearch/CometMassSpecUtils.h b/CometSearch/CometMassSpecUtils.h index b75700ff..04b141b7 100644 --- a/CometSearch/CometMassSpecUtils.h +++ b/CometSearch/CometMassSpecUtils.h @@ -61,7 +61,8 @@ class CometMassSpecUtils bool bReturnFullProteinString, // 0 = return accession only, 1 = return full description line unsigned int *iNumTotProteins, // matched protein count vector& vProteinTargets, // the target protein names - vector& vProteinDecoys); // the decoy protein names if applicable + vector& vProteinDecoys, // the decoy protein names if applicable + const vector& queries); static string GetField(std::string *s, unsigned int n, diff --git a/CometSearch/CometPeptideIndex.cpp b/CometSearch/CometPeptideIndex.cpp index 4dc547b4..5dedda2e 100644 --- a/CometSearch/CometPeptideIndex.cpp +++ b/CometSearch/CometPeptideIndex.cpp @@ -235,7 +235,8 @@ bool CometPeptideIndex::WritePeptideIndex(ThreadPool* tp) if (bSucceeded) { - 
bSucceeded = CometSearch::RunSearch(0, 0, tp); + vector emptyQueries; + bSucceeded = CometSearch::RunSearch(0, 0, tp, emptyQueries); } if (!bSucceeded) diff --git a/CometSearch/CometPostAnalysis.cpp b/CometSearch/CometPostAnalysis.cpp index f3e60334..00a899d7 100644 --- a/CometSearch/CometPostAnalysis.cpp +++ b/CometSearch/CometPostAnalysis.cpp @@ -176,18 +176,18 @@ CometPostAnalysis::~CometPostAnalysis() } -bool CometPostAnalysis::PostAnalysis(ThreadPool* tp) +bool CometPostAnalysis::PostAnalysis(ThreadPool* tp, const vector& queries) { bool bSucceeded = true; //Reuse existing ThreadPool ThreadPool *pPostAnalysisThreadPool = tp; - for (int i=0; i<(int)g_pvQuery.size(); ++i) + for (int i=0; i<(int)queries.size(); ++i) { - if (g_pvQuery.at(i)->iMatchPeptideCount > 0 || g_pvQuery.at(i)->iDecoyMatchPeptideCount > 0) + if (queries.at(i)->iMatchPeptideCount > 0 || queries.at(i)->iDecoyMatchPeptideCount > 0) { - PostAnalysisThreadData* pThreadData = new PostAnalysisThreadData(i); + PostAnalysisThreadData* pThreadData = new PostAnalysisThreadData(i, &queries); pPostAnalysisThreadPool->doJob(std::bind(PostAnalysisThreadProc, pThreadData, pPostAnalysisThreadPool)); @@ -223,7 +223,7 @@ void CometPostAnalysis::PostAnalysisThreadProc(PostAnalysisThreadData *pThreadDa (void)tp; // suppress unused parameter warning int iQueryIndex = pThreadData->iQueryIndex; - Query* pQuery = g_pvQuery.at(iQueryIndex); + Query* pQuery = pThreadData->pQueries->at(iQueryIndex); AnalyzeSP(pQuery); diff --git a/CometSearch/CometPostAnalysis.h b/CometSearch/CometPostAnalysis.h index aa001ced..50b926a1 100644 --- a/CometSearch/CometPostAnalysis.h +++ b/CometSearch/CometPostAnalysis.h @@ -24,15 +24,18 @@ struct PostAnalysisThreadData { int iQueryIndex; + const vector* pQueries; PostAnalysisThreadData() { iQueryIndex = -1; + pQueries = nullptr; } - PostAnalysisThreadData(int iQueryIndex_in) + PostAnalysisThreadData(int iQueryIndex_in, const vector* pQueries_in) { iQueryIndex = iQueryIndex_in; + pQueries 
= pQueries_in; } }; @@ -41,7 +44,7 @@ class CometPostAnalysis public: CometPostAnalysis(); ~CometPostAnalysis(); - static bool PostAnalysis(ThreadPool* tp); + static bool PostAnalysis(ThreadPool* tp, const vector& queries); static void PostAnalysisThreadProc(PostAnalysisThreadData* pThreadData, ThreadPool* tp); // Query*-based overloads, the only versions now diff --git a/CometSearch/CometPreprocess.cpp b/CometSearch/CometPreprocess.cpp index 867e2388..8d29a184 100644 --- a/CometSearch/CometPreprocess.cpp +++ b/CometSearch/CometPreprocess.cpp @@ -646,7 +646,8 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, int iFirstScan, int iLastScan, int iAnalysisType, - ThreadPool* tp) + ThreadPool* tp, + SearchSession& session) { int iFileLastScan = -1; // The actual last scan in the file. int iScanNumber = 0; @@ -761,15 +762,16 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, if (CheckActivationMethodFilter(mstSpectrum.getActivationMethod())) { - // add this hack when 1 thread is specified otherwise g_pvQuery.size() returns 0 + // add this hack when 1 thread is specified otherwise session.queries.size() returns 0 if (g_staticParams.options.iNumThreads == 1) pPreprocessThreadPool->wait_on_threads(); - Threading::LockMutex(g_pvQueryMutex); - // this needed because processing can add multiple spectra at a time - iNumSpectraLoaded = (int)g_pvQuery.size(); - iNumSpectraLoaded++; - Threading::UnlockMutex(g_pvQueryMutex); + { + std::lock_guard lk(session.queriesMutex); + // this needed because processing can add multiple spectra at a time + iNumSpectraLoaded = (int)session.queries.size(); + iNumSpectraLoaded++; + } pPreprocessThreadPool->wait_for_available_thread(); @@ -778,6 +780,7 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, //run filter here. 
PreprocessThreadData *pPreprocessThreadData = new PreprocessThreadData(mstSpectrum, iAnalysisType, iFileLastScan); + pPreprocessThreadData->pSession = &session; pPreprocessThreadPool->doJob(std::bind(PreprocessThreadProc, pPreprocessThreadData, pPreprocessThreadPool)); } @@ -804,22 +807,18 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, } } - Threading::LockMutex(g_pvQueryMutex); - - if (CheckExit(iAnalysisType, - iScanNumber, - iTotalScans, - iLastScan, - mstReader.getLastScan(), - iNumSpectraLoaded, - 0)) - { - Threading::UnlockMutex(g_pvQueryMutex); - break; - } - else { - Threading::UnlockMutex(g_pvQueryMutex); + std::lock_guard lk(session.queriesMutex); + if (CheckExit(iAnalysisType, + iScanNumber, + iTotalScans, + iLastScan, + mstReader.getLastScan(), + iNumSpectraLoaded, + 0)) + { + break; + } } } @@ -882,7 +881,8 @@ void CometPreprocess::PreprocessThreadProc(PreprocessThreadData *pPreprocessThre ppdTmpCorrelationDataArr[i], ppfFastXcorrData[i], ppfFastXcorrDataNL[i], - ppfSpScoreData[i]); + ppfSpScoreData[i], + pPreprocessThreadData->pSession); delete pPreprocessThreadData; pPreprocessThreadData = NULL; @@ -1866,23 +1866,22 @@ double* CometPreprocess::GetRtsRawDataBuffer() } -// Original public entry point: builds Query* via Core, then pushes into g_pvQuery. -// Preserves backward compatibility with existing callers. +// Original public entry point: builds Query* via Core, then pushes into session.queries. 
bool CometPreprocess::PreprocessSingleSpectrum(int iPrecursorCharge, double dMZ, double *pdMass, double *pdInten, int iNumPeaks, - double *pdTmpSpectrum) + double *pdTmpSpectrum, + SearchSession& session) { Query* pScoring = PreprocessSingleSpectrumCore(iPrecursorCharge, dMZ, pdMass, pdInten, iNumPeaks, pdTmpSpectrum); if (pScoring == nullptr) return false; - Threading::LockMutex(g_pvQueryMutex); - g_pvQuery.push_back(pScoring); - Threading::UnlockMutex(g_pvQueryMutex); + std::lock_guard lk(session.queriesMutex); + session.queries.push_back(pScoring); return true; } @@ -2023,7 +2022,8 @@ bool CometPreprocess::PreprocessSpectrum(Spectrum &spec, double *pdTmpCorrelationData, float *pfFastXcorrData, float *pfFastXcorrDataNL, - float *pfSpScoreData) + float *pfSpScoreData, + SearchSession* pSession) { int iScanNumber = spec.getScanNumber(); int iSpectrumCharge = 0; @@ -2236,9 +2236,8 @@ bool CometPreprocess::PreprocessSpectrum(Spectrum &spec, return false; } - Threading::LockMutex(g_pvQueryMutex); - g_pvQuery.push_back(pScoring); - Threading::UnlockMutex(g_pvQueryMutex); + std::lock_guard lk(pSession->queriesMutex); + pSession->queries.push_back(pScoring); } } } @@ -2804,7 +2803,8 @@ bool CometPreprocess::IsValidInputType(int inputType) bool CometPreprocess::PreprocessMS1SingleSpectrum(double* pdMass, double* pdInten, - int iNumPeaks) + int iNumPeaks, + SearchSession& session) { QueryMS1* pScoringMS1 = new QueryMS1(); @@ -2866,7 +2866,8 @@ bool CometPreprocess::PreprocessMS1SingleSpectrum(double* pdMass, pScoringMS1->iArraySizeMS1 = iArraySizeMS1; - g_pvQueryMS1.push_back(pScoringMS1); + std::lock_guard lk(session.queriesMutex); + session.ms1Queries.push_back(pScoringMS1); return true; } @@ -2949,7 +2950,7 @@ QueryMS1* CometPreprocess::PreprocessMS1SingleSpectrumThreadLocal(double* pdMass // Fused FI_DB batch worker: preprocess + RunSearch + post-analysis for one spectrum. // Uses per-thread g_rtsScratch scratch buffers (no shared batch pool contention). 
// iSlot is this worker thread's pre-assigned _ppbDuplFragmentArr index. -void CometPreprocess::FusedSearchSpectrum(Spectrum spec, int iSlot) +void CometPreprocess::FusedSearchSpectrum(Spectrum spec, int iSlot, SearchSession& session) { int iScanNumber = spec.getScanNumber(); int iSpectrumCharge = 0; @@ -3231,9 +3232,8 @@ void CometPreprocess::FusedSearchSpectrum(Spectrum spec, int iSlot) pScoring->vfRawFragmentPeakMass.clear(); pScoring->vfRawFragmentPeakMass.shrink_to_fit(); - Threading::LockMutex(g_pvQueryMutex); - g_pvQuery.push_back(pScoring); - Threading::UnlockMutex(g_pvQueryMutex); + std::lock_guard lk(session.queriesMutex); + session.queries.push_back(pScoring); } } } @@ -3247,7 +3247,8 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, int iFirstScan, int iLastScan, int iAnalysisType, - ThreadPool* tp) + ThreadPool* tp, + SearchSession& session) { int iFileLastScan = -1; int iScanNumber = 0; @@ -3269,11 +3270,11 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, for (int t = 0; t < iNumSlots; ++t) { - tp->doJob([&queue, t]() + tp->doJob([&queue, t, &session]() { Spectrum spec; while (queue.pop(spec)) - FusedSearchSpectrum(std::move(spec), t); + FusedSearchSpectrum(std::move(spec), t, session); }); } @@ -3380,14 +3381,14 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, } } - Threading::LockMutex(g_pvQueryMutex); - if (CheckExit(iAnalysisType, iScanNumber, iTotalScans, iLastScan, - mstReader.getLastScan(), iNumSpectraLoaded, 0)) { - Threading::UnlockMutex(g_pvQueryMutex); - break; + std::lock_guard lk(session.queriesMutex); + if (CheckExit(iAnalysisType, iScanNumber, iTotalScans, iLastScan, + mstReader.getLastScan(), iNumSpectraLoaded, 0)) + { + break; + } } - Threading::UnlockMutex(g_pvQueryMutex); } Threading::DestroyMutex(_maxChargeMutex); diff --git a/CometSearch/CometPreprocess.h b/CometSearch/CometPreprocess.h index 67b664ea..c605cdf2 100644 --- a/CometSearch/CometPreprocess.h +++ 
b/CometSearch/CometPreprocess.h @@ -17,6 +17,7 @@ #define _COMETPREPROCESS_H_ #include "ThreadPool.h" +#include "search/SearchSession.h" struct PreprocessThreadData { @@ -24,16 +25,17 @@ struct PreprocessThreadData int iAnalysisType; int iFileLastScan; bool *pbMemoryPool; //MH: Manages active memory pool + SearchSession* pSession; PreprocessThreadData() - : mstSpectrum(), iAnalysisType(0), iFileLastScan(0), pbMemoryPool(nullptr) + : mstSpectrum(), iAnalysisType(0), iFileLastScan(0), pbMemoryPool(nullptr), pSession(nullptr) { } PreprocessThreadData(Spectrum& spec_in, int iAnalysisType_in, int iFileLastScan_in) - : mstSpectrum(spec_in), iAnalysisType(iAnalysisType_in), iFileLastScan(iFileLastScan_in), pbMemoryPool(nullptr) + : mstSpectrum(spec_in), iAnalysisType(iAnalysisType_in), iFileLastScan(iFileLastScan_in), pbMemoryPool(nullptr), pSession(nullptr) { } @@ -69,7 +71,8 @@ class CometPreprocess int iFirstScan, int iLastScan, int iAnalysisType, - ThreadPool* tp); + ThreadPool* tp, + SearchSession& session); static void PreprocessThreadProc(PreprocessThreadData *pPreprocessThreadData, ThreadPool* tp); static void PreprocessThreadProcMS1(PreprocessThreadData* pPreprocessThreadDataMS1, @@ -84,7 +87,8 @@ class CometPreprocess double *pdMass, double *pdInten, int iNumPeaks, - double *pdTmpSpectrum); + double *pdTmpSpectrum, + SearchSession& session); // Thread-local version: returns Query* without touching g_pvQuery. // Caller owns the returned Query* and must delete it when done. @@ -97,7 +101,8 @@ class CometPreprocess static bool PreprocessMS1SingleSpectrum(double* pdMass, double* pdInten, - int iNumPeaks); + int iNumPeaks, + SearchSession& session); // Thread-local version: returns QueryMS1* without touching g_pvQueryMS1. // Caller owns the returned QueryMS1* and must delete it when done. 
static QueryMS1* PreprocessMS1SingleSpectrumThreadLocal(double* pdMass, @@ -109,7 +114,7 @@ class CometPreprocess // Fused FI_DB batch path: preprocess + search + post-analysis for one spectrum // in a single pass using thread-local scratch buffers. iSlot is this worker's // pre-assigned _ppbDuplFragmentArr index. - static void FusedSearchSpectrum(Spectrum spec, int iSlot); + static void FusedSearchSpectrum(Spectrum spec, int iSlot, SearchSession& session); // Fused FI_DB batch path: stream spectra through a bounded producer/consumer // queue into FusedSearchSpectrum workers. Replaces LoadAndPreprocessSpectra + @@ -118,7 +123,8 @@ class CometPreprocess int iFirstScan, int iLastScan, int iAnalysisType, - ThreadPool* tp); + ThreadPool* tp, + SearchSession& session); // Returns the thread-local raw-data buffer used by PreprocessSingleSpectrumThreadLocal. // The buffer is sized to g_staticParams.iArraySizeGlobal and its content after a @@ -149,7 +155,8 @@ class CometPreprocess double *pdTmpCorrelationData, float *pfFastXcorrData, float *pfFastXcorrDataNL, - float *pfSpScoreData); + float *pfSpScoreData, + SearchSession* pSession); static bool AdjustMassTol(struct Query *pScoring); static bool CheckActivationMethodFilter(MSActivation act); static bool Preprocess(struct Query *pScoring, diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index ae6254da..a5145036 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -194,7 +194,7 @@ bool CometSearch::RunSearch(Query* pQuery, int iSlot) // called by DoSingleSpectrumSearchMultiResults -bool CometSearch::RunSearch(ThreadPool *tp) +bool CometSearch::RunSearch(ThreadPool *tp, vector& queries) { CometSearch sqSearch; size_t iWhichQuery = 0; @@ -214,12 +214,12 @@ bool CometSearch::RunSearch(ThreadPool *tp) logerr(" Error - could not acquire memory pool slot for single-query FI search.\n"); return false; } - SearchFragmentIndex(g_pvQuery.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); + 
SearchFragmentIndex(queries.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); s_pool.releaseSlot(iSlot); } else if (g_staticParams.iDbType == DbType::PI_DB) // peptide index { - sqSearch.SearchPeptideIndex(tp); + sqSearch.SearchPeptideIndex(tp, queries); } else { @@ -235,7 +235,8 @@ bool CometSearch::RunSearch(ThreadPool *tp) bool CometSearch::RunSearch(int iPercentStart, int iPercentEnd, - ThreadPool* tp) + ThreadPool* tp, + vector& queries) { bool bSucceeded = true; @@ -254,18 +255,18 @@ bool CometSearch::RunSearch(int iPercentStart, ThreadPool* pSearchThreadPool = tp; - size_t iEnd = g_pvQuery.size(); + size_t iEnd = queries.size(); for (size_t iWhichQuery = 0; iWhichQuery < iEnd; ++iWhichQuery) { - pSearchThreadPool->doJob([iWhichQuery]() { + pSearchThreadPool->doJob([iWhichQuery, &queries]() { int iSlot = AcquirePoolSlot(); if (iSlot < 0) { logerr(" Error - could not acquire memory pool slot for batch FI search thread.\n"); return; } - SearchFragmentIndex(g_pvQuery.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); + SearchFragmentIndex(queries.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); s_pool.releaseSlot(iSlot); }); } @@ -287,7 +288,7 @@ bool CometSearch::RunSearch(int iPercentStart, else if (g_staticParams.iDbType == DbType::PI_DB) { CometSearch* sqSearch = new CometSearch(); - sqSearch->SearchPeptideIndex(tp); + sqSearch->SearchPeptideIndex(tp, queries); delete sqSearch; return bSucceeded; } @@ -925,6 +926,7 @@ bool CometSearch::RunSearch(int iPercentStart, // Now search sequence entry; add threading here so that // each protein sequence is passed to a separate thread. 
SearchThreadData *pSearchThreadData = new SearchThreadData(dbe); + pSearchThreadData->pQueries = &queries; pSearchThreadPool->doJob(std::bind(SearchThreadProc, pSearchThreadData, pSearchThreadPool)); @@ -1002,14 +1004,15 @@ bool CometSearch::RunSpecLibSearch(ThreadPool* tp) bool CometSearch::RunSpecLibSearch(int iPercentStart, int iPercentEnd, - ThreadPool* tp) + ThreadPool* tp, + vector& queries) { // to fill g_vulSpecLibPrecursorIndex, set // binmin = BINPREC(expmass - tol) // binmax = BINPREC(expmass + tol) // then for (i=binmin; i<=binmax; ++i) {g_vulSpecLibPrecursorIndex[i].push_back(entry)} - for (vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) + for (vector::iterator it = queries.begin(); it != queries.end(); ++it) { int iBinExpMass = BINPREC((*it)->_pepMassInfo.dExpPepMass); @@ -1036,16 +1039,18 @@ bool CometSearch::RunMS1Search(ThreadPool* tp, double dRT, double dMaxMS1RTDiff, const double dMaxSpecLibRT, - const double dMaxQueryRT) + const double dMaxQueryRT, + vector& ms1Queries) { ThreadPool* pRunMS1SearchThreadPool = tp; - for (size_t iWhichMS1Query = 0; iWhichMS1Query < g_pvQueryMS1.size(); ++iWhichMS1Query) + for (size_t iWhichMS1Query = 0; iWhichMS1Query < ms1Queries.size(); ++iWhichMS1Query) { + QueryMS1* pMS1Query = ms1Queries.at(iWhichMS1Query); // for each query, thread the search by segmenting the library for (int iWhichThread = 0; iWhichThread < g_staticParams.options.iNumThreads; ++iWhichThread) { - pRunMS1SearchThreadPool->doJob(std::bind(SearchMS1Library, iWhichMS1Query, iWhichThread, dRT, + pRunMS1SearchThreadPool->doJob(std::bind(SearchMS1Library, pMS1Query, iWhichThread, dRT, dMaxMS1RTDiff, dMaxSpecLibRT, dMaxQueryRT, pRunMS1SearchThreadPool)); } } @@ -1287,6 +1292,7 @@ void CometSearch::SearchThreadProc(SearchThreadData *pSearchThreadData, // stack in debug builds when combined with the deep DoSearch call chain. 
CometSearch* sqSearch = new CometSearch(); sqSearch->_iSlot = i; + sqSearch->_pQueries = pSearchThreadData->pQueries; sqSearch->DoSearch(pSearchThreadData->dbEntry, _ppbDuplFragmentArr[i]); delete sqSearch; @@ -1866,7 +1872,7 @@ void CometSearch::SearchFragmentIndex(Query* pQuery, } -bool CometSearch::SearchPeptideIndex(ThreadPool* tp) +bool CometSearch::SearchPeptideIndex(ThreadPool* tp, vector& queries) { comet_fileoffset_t lEndOfStruct; FILE* fp; @@ -2030,9 +2036,9 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp) if (sDBI.dPepMass > g_massRange.dMaxMass) break; - int iWhichQuery = BinarySearchMass(0, (int)g_pvQuery.size(), sDBI.dPepMass); + int iWhichQuery = BinarySearchMass(0, (int)queries.size(), sDBI.dPepMass); - while (iWhichQuery > 0 && g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= sDBI.dPepMass) + while (iWhichQuery > 0 && queries.at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= sDBI.dPepMass) iWhichQuery--; // Do the search @@ -2757,9 +2763,9 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, } // Compare calculated fragment ions against all matching query spectra. - while (iWhichQuery < (int)g_pvQuery.size()) + while (iWhichQuery < (int)_pQueries->size()) { - if (sDBI.dPepMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + if (sDBI.dPepMass < _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) { // If calculated mass is smaller than low mass range. 
break; @@ -2899,7 +2905,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -2970,7 +2976,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3122,7 +3128,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3193,7 +3199,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ctNL++) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double 
dNLMass = (sDBI.dPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3255,7 +3261,7 @@ void CometSearch::AnalyzePeptideIndex(int iWhichQuery, } -void CometSearch::SearchMS1Library(size_t iWhichMS1Query, +void CometSearch::SearchMS1Library(QueryMS1* pMS1Query, const int iWhichThread, const double dRT, const double dMaxMS1RTDiff, @@ -3265,12 +3271,12 @@ void CometSearch::SearchMS1Library(size_t iWhichMS1Query, { unsigned int iStart = BINPREC(g_staticParams.options.dMS1MinMass); - // Given iWhichMS1Query, this search will run through a subset of the library entries + // Given pMS1Query, this search will run through a subset of the library entries for (size_t iWhichMS1LibEntry = iWhichThread; iWhichMS1LibEntry < g_vSpecLib.size(); iWhichMS1LibEntry += g_staticParams.options.iNumThreads) { double dScore = 0.0; - unsigned int uiArrayLimit = g_pvQueryMS1.at(iWhichMS1Query)->iArraySizeMS1; + unsigned int uiArrayLimit = pMS1Query->iArraySizeMS1; if (uiArrayLimit > g_vSpecLib.at(iWhichMS1LibEntry).uiArraySizeMS1) uiArrayLimit = g_vSpecLib.at(iWhichMS1LibEntry).uiArraySizeMS1; @@ -3278,18 +3284,18 @@ void CometSearch::SearchMS1Library(size_t iWhichMS1Query, { for (unsigned int i = iStart; i < uiArrayLimit; ++i) { - dScore += g_pvQueryMS1.at(iWhichMS1Query)->pfFastXcorrData[i] * g_vSpecLib.at(iWhichMS1LibEntry).pfUnitVector[i]; + dScore += pMS1Query->pfFastXcorrData[i] * g_vSpecLib.at(iWhichMS1LibEntry).pfUnitVector[i]; } - if (dScore > g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.fDotProduct) + if (dScore > pMS1Query->_pSpecLibResultsMS1.fDotProduct) { Threading::LockMutex(g_pvQueryMutex); - if (dScore > g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.fDotProduct) + if (dScore > pMS1Query->_pSpecLibResultsMS1.fDotProduct) { - g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.fDotProduct = (float)dScore; + pMS1Query->_pSpecLibResultsMS1.fDotProduct = (float)dScore; // scale back to 
reference RT - g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.fRTime = (float)(g_vSpecLib.at(iWhichMS1LibEntry).fRTime * dMaxSpecLibRT / dMaxQueryRT); - g_pvQueryMS1.at(iWhichMS1Query)->_pSpecLibResultsMS1.iWhichSpecLib = g_vSpecLib.at(iWhichMS1LibEntry).iLibEntry; + pMS1Query->_pSpecLibResultsMS1.fRTime = (float)(g_vSpecLib.at(iWhichMS1LibEntry).fRTime * dMaxSpecLibRT / dMaxQueryRT); + pMS1Query->_pSpecLibResultsMS1.iWhichSpecLib = g_vSpecLib.at(iWhichMS1LibEntry).iLibEntry; } Threading::UnlockMutex(g_pvQueryMutex); } @@ -3747,9 +3753,9 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, bool bFirstTimeThroughLoopForPeptide = true; // Compare calculated fragment ions against all matching query spectra. - while (iWhichQuery < (int)g_pvQuery.size()) + while (iWhichQuery < (int)_pQueries->size()) { - if (dCalcPepMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + if (dCalcPepMass < _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) { // If calculated mass is smaller than low mass range. 
break; @@ -3816,7 +3822,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3856,7 +3862,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -3968,7 +3974,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -4009,7 +4015,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { 
double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -4322,11 +4328,11 @@ int CometSearch::WithinMassTolerance(double dCalcPepMass, // proper enzyme termini, check if within mass tolerance of any given entry. // Do a binary search on list of input queries to find matching mass. - int iPos = BinarySearchMass(0, (int)g_pvQuery.size(), dCalcPepMass); + int iPos = BinarySearchMass(0, (int)_pQueries->size(), dCalcPepMass); // Seek back to first peptide entry that matches mass tolerance in case binary // search doesn't hit the first entry. - while (iPos > 0 && g_pvQuery.at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) + while (iPos > 0 && _pQueries->at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) iPos--; if (iPos != -1) @@ -4384,11 +4390,11 @@ bool CometSearch::WithinMassTolerancePeff(double dCalcPepMass, // of any entry. If so, simply return true here and will repeat the PEFF permutations later. // Do a binary search on list of input queries to find matching mass. - int iPos = BinarySearchMass(0, (int)g_pvQuery.size(), dCalcPepMass + dMassAddition); + int iPos = BinarySearchMass(0, (int)_pQueries->size(), dCalcPepMass + dMassAddition); // Seek back to first peptide entry that matches mass tolerance in case binary // search doesn't hit the first entry. 
- while (iPos > 0 && g_pvQuery.at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) + while (iPos > 0 && _pQueries->at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) iPos--; if (iPos != -1) @@ -4593,18 +4599,18 @@ int CometSearch::BinarySearchMass(int start, double dCalcPepMass) const { auto it = std::lower_bound( - g_pvQuery.begin() + start, - g_pvQuery.begin() + end, + _pQueries->begin() + start, + _pQueries->begin() + end, dCalcPepMass, [](const Query* query, double mass) { return query->_pepMassInfo.dPeptideMassTolerancePlus < mass; }); - if (it != g_pvQuery.begin() + end + if (it != _pQueries->begin() + end && (*it)->_pepMassInfo.dPeptideMassToleranceMinus <= dCalcPepMass && dCalcPepMass <= (*it)->_pepMassInfo.dPeptideMassTolerancePlus) { - return static_cast(std::distance(g_pvQuery.begin(), it)); + return static_cast(std::distance(_pQueries->begin(), it)); } return -1; @@ -4657,7 +4663,7 @@ size_t CometSearch::BinarySearchIndexMass(size_t start, bool CometSearch::CheckMassMatch(size_t iWhichQuery, double dCalcPepMass) { - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = _pQueries->at(iWhichQuery); int iMassOffsetsSize = (int)g_staticParams.vectorMassOffsets.size(); @@ -5044,7 +5050,7 @@ void CometSearch::XcorrScore(char* szProteinSeq, int iWhichIonSeries; bool bUseWaterAmmoniaNLPeaks = false; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = _pQueries->at(iWhichQuery); float** ppSparseFastXcorrData; // use this if bSparseMatrix @@ -5124,7 +5130,7 @@ void CometSearch::XcorrScore(char* szProteinSeq, ppSparseFastXcorrData = pQuery->ppfSparseFastXcorrData; for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (int ctZ = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctZ >= 1; --ctZ) + for (int ctZ = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctZ >= 1; --ctZ) { bin = *(*(*p_uiBinnedPrecursorNL + ctNL) + ctZ); @@ -5221,7 +5227,7 @@ void 
CometSearch::StorePeptide(size_t iWhichQuery, int i; int iLenPeptide; int iLenPeptide2; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = _pQueries->at(iWhichQuery); if (dXcorr < g_staticParams.options.dMinimumXcorr) return; @@ -5670,7 +5676,7 @@ int CometSearch::CheckDuplicate(int iWhichQuery, int iLenPeptide = iEndPos - iStartPos + 1; int iLenProteinMinus1 = (int)strlen(szProteinSeq) - 1; int bIsDuplicate = 0; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = _pQueries->at(iWhichQuery); if (g_staticParams.options.iDecoySearch == 2 && bDecoyPep) { @@ -7515,11 +7521,11 @@ bool CometSearch::MergeVarMods(char* szProteinSeq, // Need to check if mass is ok // Do a binary search on list of input queries to find matching mass. - iWhichQuery = BinarySearchMass(0, (int)g_pvQuery.size(), dTmpCalcPepMass); + iWhichQuery = BinarySearchMass(0, (int)_pQueries->size(), dTmpCalcPepMass); // Seek back to first peptide entry that matches mass tolerance in case binary // search doesn't hit the first entry. 
- while (iWhichQuery > 0 && g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) + while (iWhichQuery > 0 && _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) iWhichQuery--; // Only if this PEFF mod (plus possible variable mods) is within mass tolerance, continue @@ -7644,9 +7650,9 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, // Compare calculated fragment ions against all matching query spectra - while (iWhichQuery < (int)g_pvQuery.size()) + while (iWhichQuery < (int)_pQueries->size()) { - if (dCalcPepMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + if (dCalcPepMass < _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) { // if calculated mass is smaller than low mass range, it // means we reached candidate peptides that are too big @@ -7850,7 +7856,7 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, // initialize precursorNL for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -7933,7 +7939,7 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; @@ -8186,7 +8192,7 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, 
// initialize precursorNL for decoy for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -8258,7 +8264,7 @@ bool CometSearch::CalcVarModIons(char* szProteinSeq, // Precursor NL peaks added here for (int ctNL = 0; ctNL < g_staticParams.iPrecursorNLSize; ++ctNL) { - for (ctCharge = g_pvQuery.at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) + for (ctCharge = _pQueries->at(iWhichQuery)->_spectrumInfoInternal.usiChargeState; ctCharge >= 1; ctCharge--) { double dNLMass = (dCalcPepMass - PROTON_MASS - g_staticParams.precursorNLIons[ctNL] + ctCharge * PROTON_MASS) / ctCharge; int iVal = BIN(dNLMass); @@ -8871,9 +8877,9 @@ void CometSearch::CompoundModSearch(char *szProteinSeq, bool bFirstTime = true; - while (iWhichQuery < (int)g_pvQuery.size()) + while (iWhichQuery < (int)_pQueries->size()) { - if (dModMass < g_pvQuery.at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) + if (dModMass < _pQueries->at(iWhichQuery)->_pepMassInfo.dPeptideMassToleranceMinus) break; if (CheckMassMatch(iWhichQuery, dModMass)) diff --git a/CometSearch/CometSearch.h b/CometSearch/CometSearch.h index 8301cac7..03c9105f 100644 --- a/CometSearch/CometSearch.h +++ b/CometSearch/CometSearch.h @@ -41,10 +41,11 @@ struct SearchThreadData sDBEntry dbEntry; bool* pbSearchMemoryPool; ThreadPool* tp; + const vector* pQueries; // batch query list; set before dispatch SearchThreadData() = default; SearchThreadData(const sDBEntry& dbEntry_in) - : dbEntry(dbEntry_in), pbSearchMemoryPool(nullptr), tp(nullptr) { + : dbEntry(dbEntry_in), pbSearchMemoryPool(nullptr), tp(nullptr), 
pQueries(nullptr) { } ~SearchThreadData() @@ -73,8 +74,10 @@ class CometSearch static bool RunSearch(int iPercentStart, int iPercentEnd, - ThreadPool* tp); - static bool RunSearch(ThreadPool* tp); + ThreadPool* tp, + vector& queries); + static bool RunSearch(ThreadPool* tp, + vector& queries); // Task 1.3: Thread-local overload: searches a caller-owned Query* without // touching g_pvQuery. Allocates its own pbDuplFragment scratch buffer. @@ -86,13 +89,15 @@ class CometSearch static bool RunSpecLibSearch(int iPercentStart, int iPercentEnd, - ThreadPool* tp); + ThreadPool* tp, + vector& queries); static bool RunSpecLibSearch(ThreadPool* tp); static bool RunMS1Search(ThreadPool* tp, double dRT, double dMaxMS1RTDiff, const double dMaxSpecLibRT, - const double dMaxQueryRT); + const double dMaxQueryRT, + vector& ms1Queries); // Thread-local overload: searches a caller-owned QueryMS1* against read-only g_vSpecLib. // No global mutable state accessed. static bool RunMS1Search(QueryMS1* pQueryMS1, @@ -120,13 +125,13 @@ class CometSearch int BinarySearchMass(int start, int end, double dCalcPepMass) const; - static bool CheckMassMatch(size_t iWhichQuery, - double dCalcPepMass); + bool CheckMassMatch(size_t iWhichQuery, + double dCalcPepMass); // Task 1.2: Thread-local overload accepting Query* directly. 
static bool CheckMassMatch(Query* pQuery, double dCalcPepMass); - bool SearchPeptideIndex(ThreadPool* tp); + bool SearchPeptideIndex(ThreadPool* tp, vector& queries); struct ProteinInfo { @@ -311,7 +316,7 @@ class CometSearch bool TranslateNA2AA(int* frame, int iDirection, char* sDNASequence); - static void SearchMS1Library(size_t iWhichMS1Query, + static void SearchMS1Library(QueryMS1* pMS1Query, const int iWhichThread, const double dRT, const double dMaxMS1RTDiff, @@ -382,6 +387,7 @@ class CometSearch static bool **_ppbDuplFragmentArr; // Number of arrays equals number of threads int _iSlot = -1; // pool slot index; set by SearchThreadProc before DoSearch + const vector* _pQueries = nullptr; // batch query list; set before FASTA/PI search std::unordered_set _seenShort; // per-protein dedup for len <= 12 (bFastPlainPeptideIdx) std::unordered_set _seenLong; // per-protein dedup for len > 12 (bFastPlainPeptideIdx) }; diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index e84cb6e4..13695d26 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -38,6 +38,7 @@ #include "CometAlignment.h" #include "AScoreOptions.h" #include "AScoreFactory.h" +#include "search/SearchSession.h" #include #include @@ -46,14 +47,6 @@ extern comet_fileoffset_t clSizeCometFileOffset; -std::vector g_pvQuery; - -// g_pvQueryMS1: BATCH PATH ONLY - used by RunMS1Search(ThreadPool*,...) and -// PreprocessMS1SingleSpectrum(). The single-spectrum MS1 search path -// (DoMS1SearchMultiResults) uses thread-local QueryMS1* objects and never -// reads or writes this vector. Do not access from concurrent search threads. -std::vector g_pvQueryMS1; - std::vector g_pvInputFiles; StaticParams g_staticParams; vector g_pvDBIndex; @@ -258,10 +251,10 @@ static void SetMSLevelFilter(MSReader &mstReader) mstReader.setFilter(msLevel); } -// Allocate memory for the _pResults struct for each g_pvQuery entry. 
-static bool AllocateResultsMem() +// Allocate memory for the _pResults struct for each query entry. +static bool AllocateResultsMem(std::vector& queries) { - for (std::vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) + for (std::vector::iterator it = queries.begin(); it != queries.end(); ++it) { Query* pQuery = *it; @@ -339,11 +332,11 @@ static bool AllocateResultsMem() return true; } -// Allocate memory for the _pSpecLibResults struct for each g_pvQueryMS1 entry. +// Allocate memory for the _pSpecLibResults struct for each session.queriesMS1 entry. static bool AllocateResultsMemMS1() { /* - for (std::vector::iterator it = g_pvQueryMS1.begin(); it != g_pvQueryMS1.end(); ++it) + for (std::vector::iterator it = session.queriesMS1.begin(); it != session.queriesMS1.end(); ++it) { QueryMS1* pQueryMS1 = *it; @@ -603,7 +596,7 @@ CometSearchManager::CometSearchManager() : CometSearchManager::~CometSearchManager() { - // Destroy the mutex we used to protect g_pvQuery. + // Destroy the mutex we used to protect the query lists (g_pvQueryMutex). Threading::DestroyMutex(g_pvQueryMutex); // Destroy the mutex we used to protect g_pvDBIndex. @@ -2402,6 +2395,8 @@ bool CometSearchManager::DoSearch() if (!bSucceeded) break; + SearchSession session(g_staticParams); + time_t tStartTime; time(&tStartTime); strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tStartTime)); @@ -2435,6 +2430,7 @@ bool CometSearchManager::DoSearch() woctx.iFirstScan = iFirstScan; woctx.iLastScan = iLastScan; woctx.iDecoySearch = g_staticParams.options.iDecoySearch; + woctx.bIdxNoFasta = g_bIdxNoFasta; woctx.pMgr = this; std::vector> vWriters; @@ -2620,7 +2616,7 @@ bool CometSearchManager::DoSearch() // or we will create a memory leak!
g_cometStatus.SetStatusMsg(string("Running fused FI_DB search...")); - bSucceeded = CometPreprocess::FusedLoadAndSearchSpectra(mstReader, iFirstScan, iLastScan, iAnalysisType, tp); + bSucceeded = CometPreprocess::FusedLoadAndSearchSpectra(mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); if (!bSucceeded) goto cleanup_results; @@ -2628,10 +2624,10 @@ bool CometSearchManager::DoSearch() iPercentStart = iPercentEnd; iPercentEnd = mstReader.getPercent(); - if (g_pvQuery.empty()) + if (session.queries.empty()) continue; - iTotalSpectraSearched += (int)g_pvQuery.size(); + iTotalSpectraSearched += (int)session.queries.size(); } else { @@ -2649,7 +2645,7 @@ bool CometSearchManager::DoSearch() // spectra, we MUST "goto cleanup_results" before exiting the loop, // or we will create a memory leak! - bSucceeded = CometPreprocess::LoadAndPreprocessSpectra(mstReader, iFirstScan, iLastScan, iAnalysisType, tp); + bSucceeded = CometPreprocess::LoadAndPreprocessSpectra(mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); if (!bSucceeded) goto cleanup_results; @@ -2657,18 +2653,18 @@ bool CometSearchManager::DoSearch() iPercentStart = iPercentEnd; iPercentEnd = mstReader.getPercent(); - if (g_pvQuery.empty()) + if (session.queries.empty()) continue; //FIX make sure continue instead of break makes sense else // possible no spectrum in batch passes filters; do not want to break in that case; - iTotalSpectraSearched += (int)g_pvQuery.size(); + iTotalSpectraSearched += (int)session.queries.size(); - bSucceeded = AllocateResultsMem(); + bSucceeded = AllocateResultsMem(session.queries); if (!bSucceeded) goto cleanup_results; { // need strStatusMsg in it's own scope due to goto statement above - string strStatusMsg = " " + std::to_string(g_pvQuery.size()) + string("\n"); + string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) { logout(strStatusMsg); 
@@ -2683,9 +2679,9 @@ bool CometSearchManager::DoSearch() // sort back to original spectrum order in MS2 scan in order to associate pairs // based on sequential order of precursors for each scan - std::sort(g_pvQuery.begin(), g_pvQuery.end(), compareByMangoIndex); + std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex); - for (std::vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) + for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) { if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) { @@ -2699,11 +2695,11 @@ bool CometSearchManager::DoSearch() } } - // Sort g_pvQuery vector by dExpPepMass. - std::sort(g_pvQuery.begin(), g_pvQuery.end(), compareByPeptideMass); + // Sort session.queries vector by dExpPepMass. + std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); - g_massRange.dMinMass = g_pvQuery.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; - g_massRange.dMaxMass = g_pvQuery.at(g_pvQuery.size()-1)->_pepMassInfo.dPeptideMassTolerancePlus; + g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; + g_massRange.dMaxMass = session.queries.at(session.queries.size()-1)->_pepMassInfo.dPeptideMassTolerancePlus; if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) g_massRange.bNarrowMassRange = true; @@ -2718,9 +2714,9 @@ bool CometSearchManager::DoSearch() // Now that spectra are loaded to memory and sorted, do search. 
if (g_bPerformDatabaseSearch) - bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp); + bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); if (g_bPerformSpecLibSearch) - bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp); + bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); if (!bSucceeded) goto cleanup_results; @@ -2740,15 +2736,15 @@ bool CometSearchManager::DoSearch() g_cometStatus.SetStatusMsg(string("Performing post-search analysis ...")); // Sort each entry by xcorr, calculate E-values, etc. - bSucceeded = CometPostAnalysis::PostAnalysis(tp); + bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); } if (!bSucceeded) goto cleanup_results; } - // Sort g_pvQuery vector by scan (shared by both paths). - std::sort(g_pvQuery.begin(), g_pvQuery.end(), compareByScanNumber); + // Sort session.queries vector by scan (shared by both paths). + std::sort(session.queries.begin(), session.queries.end(), compareByScanNumber); if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) { @@ -2761,8 +2757,9 @@ bool CometSearchManager::DoSearch() { WriterWriteCtx wwctx; wwctx.fpdb = fpdb; - wwctx.iScanOffset = iTotalSpectraSearched - (int)g_pvQuery.size(); + wwctx.iScanOffset = iTotalSpectraSearched - (int)session.queries.size(); wwctx.iBatchNum = iBatchNum; + wwctx.pQueries = &session.queries; for (auto& pw : vWriters) { if (!pw->write(wwctx)) @@ -2777,10 +2774,10 @@ bool CometSearchManager::DoSearch() // Deleting each Query object in the vector calls its destructor, which // frees the spectral memory (see definition for Query in CometDataInternal.h). 
- for (auto it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) + for (auto it = session.queries.begin(); it != session.queries.end(); ++it) delete (*it); - g_pvQuery.clear(); + session.queries.clear(); if (!bSucceeded) break; @@ -3186,7 +3183,7 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, // the binned sqrt-intensity spectrum needed for fragment-ion matching below. double* pdTmpSpectrum = CometPreprocess::GetRtsRawDataBuffer(); - // Step 1: Preprocess into a thread-local Query* (does NOT touch g_pvQuery) + // Step 1: Preprocess into a thread-local Query* (does NOT touch session.queries) #ifdef RTS_TIMING tTimingMark = hrc::now(); #endif @@ -3227,7 +3224,7 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, // Step 3: Run the fragment index search on the thread-local Query* // This uses the new RunSearch(Query*) overload that allocates its own - // pbDuplFragment and never touches g_pvQuery or _ppbDuplFragmentArr. + // pbDuplFragment and never touches session.queries or _ppbDuplFragmentArr. 
#ifdef RTS_TIMING tTimingMark = hrc::now(); #endif @@ -3270,7 +3267,7 @@ bool CometSearchManager::DoSingleSpectrumSearchMultiResults(const int topN, if (takeSearchResultsN > iSize) takeSearchResultsN = iSize; - // Step 4: Post-analysis using Query* overloads (no g_pvQuery access) + // Step 4: Post-analysis using Query* overloads (no session.queries access) if (pQuery->iMatchPeptideCount > 0) { if (g_staticParams.options.iMaxIndexRunTime > 0) diff --git a/CometSearch/CometWriteMzIdentML.cpp b/CometSearch/CometWriteMzIdentML.cpp index a24bace2..aa558fc3 100644 --- a/CometSearch/CometWriteMzIdentML.cpp +++ b/CometSearch/CometWriteMzIdentML.cpp @@ -40,22 +40,23 @@ CometWriteMzIdentML::~CometWriteMzIdentML() void CometWriteMzIdentML::WriteMzIdentMLTmp(FILE *fpout, FILE *fpoutd, - int iBatchNum) + int iBatchNum, + const vector& queries) { int i; // Print temporary results in tab-delimited file if (g_staticParams.options.iDecoySearch == 2) { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintTmpPSM(i, 1, iBatchNum, fpout); - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintTmpPSM(i, 2, iBatchNum, fpoutd); + for (i=0; i<(int)queries.size(); ++i) + PrintTmpPSM(i, 1, iBatchNum, fpout, queries); + for (i=0; i<(int)queries.size(); ++i) + PrintTmpPSM(i, 2, iBatchNum, fpoutd, queries); } else { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintTmpPSM(i, 0, iBatchNum, fpout); + for (i=0; i<(int)queries.size(); ++i) + PrintTmpPSM(i, 0, iBatchNum, fpout, queries); } } @@ -63,12 +64,13 @@ void CometWriteMzIdentML::WriteMzIdentMLTmp(FILE *fpout, void CometWriteMzIdentML::WriteMzIdentML(FILE *fpout, FILE *fpdb, string sTmpFile, - CometSearchManager &searchMgr) + CometSearchManager &searchMgr, + bool bIdxNoFasta) { WriteMzIdentMLHeader(fpout); // now loop through sTmpFile file, wr - ParseTmpFile(fpout, fpdb, sTmpFile, searchMgr); + ParseTmpFile(fpout, fpdb, sTmpFile, searchMgr, bIdxNoFasta); fprintf(fpout, "\n"); } @@ -112,7 +114,8 @@ bool CometWriteMzIdentML::WriteMzIdentMLHeader(FILE 
*fpout) bool CometWriteMzIdentML::ParseTmpFile(FILE *fpout, FILE *fpdb, string sTmpFile, - CometSearchManager &searchMgr) + CometSearchManager &searchMgr, + bool bIdxNoFasta) { std::vector vMzidTmp; // vector to store entire tmp output std::vector vProteinTargets; // store vector of target protein file offsets @@ -314,7 +317,7 @@ bool CometWriteMzIdentML::ParseTmpFile(FILE *fpout, CometMassSpecUtils::EscapeString(strProteinName); fprintf(fpout, " 0) @@ -1373,12 +1376,13 @@ void CometWriteMzIdentML::WriteSpectrumIdentificationList(FILE* fpout, void CometWriteMzIdentML::PrintTmpPSM(int iWhichQuery, int iPrintTargetDecoy, int iBatchNum, - FILE *fpout) + FILE *fpout, + const vector& queries) { - if ((iPrintTargetDecoy != 2 && g_pvQuery.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) - || (iPrintTargetDecoy == 2 && g_pvQuery.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) + if ((iPrintTargetDecoy != 2 && queries.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) + || (iPrintTargetDecoy == 2 && queries.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) { - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); Results *pOutput; int iNumPrintLines; diff --git a/CometSearch/CometWriteMzIdentML.h b/CometSearch/CometWriteMzIdentML.h index 3c3ffa46..75d42e91 100644 --- a/CometSearch/CometWriteMzIdentML.h +++ b/CometSearch/CometWriteMzIdentML.h @@ -53,12 +53,14 @@ class CometWriteMzIdentML static void WriteMzIdentMLTmp(FILE *fpout, FILE *fpoutd, - int iBatchNum); + int iBatchNum, + const vector& queries); static void WriteMzIdentML(FILE *fpout, FILE *fpdb, string sTmpFile, - CometSearchManager &searchMgr); + CometSearchManager &searchMgr, + bool bIdxNoFasta); private: @@ -67,7 +69,8 @@ class CometWriteMzIdentML static void PrintTmpPSM(int iWhichQuery, int iPrintTargetDecoy, int iBatchNum, - FILE *fpOut); + FILE *fpOut, + const vector& 
queries); static void WriteMods(FILE *fpout, CometSearchManager &searchMgr); @@ -103,7 +106,8 @@ class CometWriteMzIdentML static bool ParseTmpFile(FILE *fpout, FILE *fpdb, string ssTmpFile, - CometSearchManager &searchMgr); + CometSearchManager &searchMgr, + bool bIdxNoFasta); }; #endif diff --git a/CometSearch/CometWritePepXML.cpp b/CometSearch/CometWritePepXML.cpp index c109c26c..2f089951 100644 --- a/CometSearch/CometWritePepXML.cpp +++ b/CometSearch/CometWritePepXML.cpp @@ -37,22 +37,23 @@ CometWritePepXML::~CometWritePepXML() void CometWritePepXML::WritePepXML(FILE *fpout, FILE *fpoutd, FILE *fpdb, - int iNumSpectraSearched) + int iNumSpectraSearched, + const vector& queries) { int i; // Print out the separate decoy hits. if (g_staticParams.options.iDecoySearch == 2) { - for (i = 0; i < (int)g_pvQuery.size(); ++i) - PrintResults(i, 1, fpout, fpdb, iNumSpectraSearched); - for (i = 0; i < (int)g_pvQuery.size(); ++i) - PrintResults(i, 2, fpoutd, fpdb, iNumSpectraSearched); + for (i = 0; i < (int)queries.size(); ++i) + PrintResults(i, 1, fpout, fpdb, iNumSpectraSearched, queries); + for (i = 0; i < (int)queries.size(); ++i) + PrintResults(i, 2, fpoutd, fpdb, iNumSpectraSearched, queries); } else { - for (i = 0; i < (int)g_pvQuery.size(); ++i) - PrintResults(i, 0, fpout, fpdb, iNumSpectraSearched); + for (i = 0; i < (int)queries.size(); ++i) + PrintResults(i, 0, fpout, fpdb, iNumSpectraSearched, queries); } } @@ -416,14 +417,15 @@ void CometWritePepXML::PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpout, FILE *fpdb, - int iNumSpectraSearched) + int iNumSpectraSearched, + const vector& queries) { int i, iNumPrintLines, iMinLength; char *pStr; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); // look for either \ or / separator so valid for Windows or Linux if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '\\')) == NULL @@ -500,7 +502,7 @@ void CometWritePepXML::PrintResults(int iWhichQuery, for (int 
iWhichResult=0; iWhichResult g_staticParams.options.dMinimumXcorr) - PrintPepXMLSearchHit(iWhichQuery, iWhichResult, iPrintTargetDecoy, pOutput, fpout, fpdb); + PrintPepXMLSearchHit(iWhichQuery, iWhichResult, iPrintTargetDecoy, pOutput, fpout, fpdb, queries); } fprintf(fpout, " \n"); @@ -513,14 +515,15 @@ void CometWritePepXML::PrintPepXMLSearchHit(int iWhichQuery, int iPrintTargetDecoy, Results *pOutput, FILE *fpout, - FILE *fpdb) + FILE *fpdb, + const vector& queries) { int i; int iNTT; int iNMC; unsigned int uiNumTotProteins = 0; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); CalcNTTNMC(pOutput, iWhichResult, &iNTT, &iNMC); @@ -529,7 +532,7 @@ void CometWritePepXML::PrintPepXMLSearchHit(int iWhichQuery, std::vector::iterator it; bool bReturnFulProteinString = false; - CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys); + CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys, queries); fprintf(fpout, " & queries); static void WritePepXMLEndTags(FILE *fpout); @@ -47,14 +48,16 @@ class CometWritePepXML int iPrintTargetDecoy, FILE *fpOut, FILE *fpdb, - int iNumSpectraSearched); + int iNumSpectraSearched, + const vector& queries); static void PrintPepXMLSearchHit(int iWhichQuery, int iWhichResult, int iPrintTargetDecoy, Results *pOutput, FILE *fpOut, - FILE *fpdb); + FILE *fpdb, + const vector& queries); static void GetVal(char *szElement, char *szAttribute, diff --git a/CometSearch/CometWritePercolator.cpp b/CometSearch/CometWritePercolator.cpp index 55940804..3e2bdac0 100644 --- a/CometSearch/CometWritePercolator.cpp +++ b/CometSearch/CometWritePercolator.cpp @@ -32,22 +32,23 @@ CometWritePercolator::~CometWritePercolator() bool CometWritePercolator::WritePercolator(FILE *fpout, - FILE *fpdb) + 
FILE *fpdb, + const vector& queries) { int i; int iLenDecoyPrefix = (int)strlen(g_staticParams.szDecoyPrefix); // Print results. - for (i=0; i<(int)g_pvQuery.size(); ++i) + for (i=0; i<(int)queries.size(); ++i) { - if (g_pvQuery.at(i)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) + if (queries.at(i)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) { - PrintResults(i, fpout, fpdb, 0, iLenDecoyPrefix); // print search hit (could be decoy if g_staticParams.options.iDecoySearch=1) + PrintResults(i, fpout, fpdb, 0, iLenDecoyPrefix, queries); } - if (g_staticParams.options.iDecoySearch == 2 && g_pvQuery.at(i)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr) + if (g_staticParams.options.iDecoySearch == 2 && queries.at(i)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr) { - PrintResults(i, fpout, fpdb, 2, iLenDecoyPrefix); // print decoy hit + PrintResults(i, fpout, fpdb, 2, iLenDecoyPrefix, queries); } } @@ -89,11 +90,12 @@ bool CometWritePercolator::PrintResults(int iWhichQuery, FILE *fpout, FILE *fpdb, int iPrintTargetDecoy, - int iLenDecoyPrefix) + int iLenDecoyPrefix, + const vector& queries) { int iNumPrintLines; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); Results *pOutput; @@ -127,7 +129,7 @@ bool CometWritePercolator::PrintResults(int iWhichQuery, unsigned int uiNumTotProteins = 0; // unused in pin bool bReturnFulProteinString = false; - CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys); + CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys, queries); if (g_staticParams.options.iDecoySearch) // using Comet's internal decoys { @@ -164,7 +166,7 @@ bool CometWritePercolator::PrintResults(int iWhichQuery, fprintf(fpout, "%0.6f\t", 
pQuery->_pepMassInfo.dExpPepMass); //ExpMass fprintf(fpout, "%0.6f\t", pOutput[iWhichResult].dPepMass); //CalcMass - PrintPercolatorSearchHit(iWhichQuery, iWhichResult, iPrintTargetDecoy, pOutput, fpout, vProteinTargets, vProteinDecoys); + PrintPercolatorSearchHit(iWhichQuery, iWhichResult, iPrintTargetDecoy, pOutput, fpout, vProteinTargets, vProteinDecoys, queries); } return true; @@ -176,15 +178,15 @@ void CometWritePercolator::PrintPercolatorSearchHit(int iWhichQuery, int iPrintTargetDecoy, Results *pOutput, FILE *fpout, - vector vProteinTargets, - vector vProteinDecoys) + vector vProteinDecoys, + const vector& queries) { int iNterm; int iCterm; int iNMC; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); CalcNTTNMC(pOutput, iWhichResult, &iNterm, &iCterm, &iNMC); diff --git a/CometSearch/CometWritePercolator.h b/CometSearch/CometWritePercolator.h index 035efc31..b9ad1f85 100644 --- a/CometSearch/CometWritePercolator.h +++ b/CometSearch/CometWritePercolator.h @@ -24,7 +24,8 @@ class CometWritePercolator ~CometWritePercolator(); static void WritePercolatorHeader(FILE *fpout); static bool WritePercolator(FILE *fpout, - FILE *fpdb); + FILE *fpdb, + const vector& queries); private: @@ -32,14 +33,16 @@ class CometWritePercolator FILE *fpOut, FILE *fpdb, int iPrintTargetDecoy, - int iLenDecoyPrefix); + int iLenDecoyPrefix, + const vector& queries); static void PrintPercolatorSearchHit(int iWhichQuery, int iWhichResult, int iPrintTargetDecoy, Results *pOutput, FILE *fpOut, vector vProteinTargets, - vector vProteinDecoys); + vector vProteinDecoys, + const vector& queries); static void CalcNTTNMC(Results *pOutput, int iWhichQuery, int *iNterm, diff --git a/CometSearch/CometWriteSqt.cpp b/CometSearch/CometWriteSqt.cpp index db117740..b7858fec 100644 --- a/CometSearch/CometWriteSqt.cpp +++ b/CometSearch/CometWriteSqt.cpp @@ -31,22 +31,23 @@ CometWriteSqt::~CometWriteSqt() void CometWriteSqt::WriteSqt(FILE *fpout, FILE *fpoutd, - FILE 
*fpdb) + FILE *fpdb, + const vector& queries) { int i; // Print out the separate decoy hits. if (g_staticParams.options.iDecoySearch == 2) { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 1, fpout, fpdb); - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 2, fpoutd, fpdb); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 1, fpout, fpdb, queries); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 2, fpoutd, fpdb, queries); } else { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 0, fpout, fpdb); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 0, fpout, fpdb, queries); } } @@ -164,13 +165,14 @@ void CometWriteSqt::PrintSqtHeader(FILE *fpout, void CometWriteSqt::PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpout, - FILE *fpdb) + FILE *fpdb, + const vector& queries) { int i, iNumPrintLines; std::ostringstream oss; - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); Results *pOutput; @@ -212,7 +214,7 @@ void CometWriteSqt::PrintResults(int iWhichQuery, for (i=0; i g_staticParams.options.dMinimumXcorr) - PrintSqtLine(iWhichQuery, i, pOutput, fpout, fpdb, iPrintTargetDecoy); + PrintSqtLine(iWhichQuery, i, pOutput, fpout, fpdb, iPrintTargetDecoy, queries); } } @@ -222,7 +224,8 @@ void CometWriteSqt::PrintSqtLine(int iWhichQuery, Results *pOutput, FILE *fpout, FILE *fpdb, - int iPrintTargetDecoy) + int iPrintTargetDecoy, + const vector& queries) { int i; std::ostringstream oss; @@ -325,7 +328,7 @@ void CometWriteSqt::PrintSqtLine(int iWhichQuery, bool bReturnFulProteinString = false; CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, - bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys); + bReturnFulProteinString, &uiNumTotProteins, vProteinTargets, vProteinDecoys, queries); if (iPrintTargetDecoy != 2) // if not decoy only, print target proteins { diff --git a/CometSearch/CometWriteSqt.h 
b/CometSearch/CometWriteSqt.h index e02aa3ba..9e4482db 100644 --- a/CometSearch/CometWriteSqt.h +++ b/CometSearch/CometWriteSqt.h @@ -25,7 +25,8 @@ class CometWriteSqt static void WriteSqt(FILE *fpout, FILE *fpoutd, - FILE *fpdb); + FILE *fpdb, + const vector& queries); static void PrintSqtHeader(FILE *fpout, CometSearchManager &searchMgr); @@ -34,13 +35,15 @@ class CometWriteSqt static void PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpOut, - FILE *fpdb); + FILE *fpdb, + const vector& queries); static void PrintSqtLine(int iWhichQuery, int iWhichResult, Results *pOutput, FILE *fpOut, FILE *fpdb, - int iPrintTargetDecoy); + int iPrintTargetDecoy, + const vector& queries); }; #endif diff --git a/CometSearch/CometWriteTxt.cpp b/CometSearch/CometWriteTxt.cpp index 24e48e14..b9b95a74 100644 --- a/CometSearch/CometWriteTxt.cpp +++ b/CometSearch/CometWriteTxt.cpp @@ -31,22 +31,23 @@ CometWriteTxt::~CometWriteTxt() void CometWriteTxt::WriteTxt(FILE *fpout, FILE *fpoutd, - FILE *fpdb) + FILE *fpdb, + const vector& queries) { int i; // Print out the separate decoy hits. 
if (g_staticParams.options.iDecoySearch == 2) { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 1, fpout, fpdb); - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 2, fpoutd, fpdb); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 1, fpout, fpdb, queries); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 2, fpoutd, fpdb, queries); } else { - for (i=0; i<(int)g_pvQuery.size(); ++i) - PrintResults(i, 0, fpout, fpdb); + for (i=0; i<(int)queries.size(); ++i) + PrintResults(i, 0, fpout, fpdb, queries); } } @@ -115,13 +116,14 @@ void CometWriteTxt::PrintTxtHeader(FILE *fpout) void CometWriteTxt::PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpout, - FILE *fpdb) //fpdb is file pointer for either FASTA or .idx file + FILE *fpdb, + const vector& queries) //fpdb is file pointer for either FASTA or .idx file { #ifdef CRUX - if ((iPrintTargetDecoy != 2 && g_pvQuery.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) - || (iPrintTargetDecoy == 2 && g_pvQuery.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) + if ((iPrintTargetDecoy != 2 && queries.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) + || (iPrintTargetDecoy == 2 && queries.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) { - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); int charge = pQuery->_spectrumInfoInternal.usiChargeState; double spectrum_neutral_mass = pQuery->_pepMassInfo.dExpPepMass - PROTON_MASS; @@ -211,7 +213,7 @@ void CometWriteTxt::PrintResults(int iWhichQuery, unsigned int uiNumTotProteins = 0; // print protein list - PrintProteins(fpout, fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, &uiNumTotProteins); + PrintProteins(fpout, fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, &uiNumTotProteins, queries); // Cleavage type fprintf(fpout, "\t%c%c\t", pOutput[iWhichResult].cPrevAA, 
pOutput[iWhichResult].cNextAA); @@ -227,10 +229,10 @@ void CometWriteTxt::PrintResults(int iWhichQuery, } #else - if ((iPrintTargetDecoy != 2 && g_pvQuery.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) - || (iPrintTargetDecoy == 2 && g_pvQuery.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) + if ((iPrintTargetDecoy != 2 && queries.at(iWhichQuery)->_pResults[0].fXcorr > g_staticParams.options.dMinimumXcorr) + || (iPrintTargetDecoy == 2 && queries.at(iWhichQuery)->_pDecoys[0].fXcorr > g_staticParams.options.dMinimumXcorr)) { - Query* pQuery = g_pvQuery.at(iWhichQuery); + Query* pQuery = queries.at(iWhichQuery); Results *pOutput; int iNumPrintLines; @@ -377,7 +379,7 @@ void CometWriteTxt::PrintResults(int iWhichQuery, unsigned int uiNumTotProteins = 0; // print protein list - PrintProteins(fpout, fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, &uiNumTotProteins); + PrintProteins(fpout, fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, &uiNumTotProteins, queries); fprintf(fpout, "\t%u\t", uiNumTotProteins); @@ -409,7 +411,8 @@ void CometWriteTxt::PrintProteins(FILE *fpout, int iWhichQuery, int iWhichResult, int iPrintTargetDecoy, - unsigned int *uiNumTotProteins) + unsigned int *uiNumTotProteins, + const vector& queries) { std::vector vProteinTargets; // store vector of target protein names std::vector vProteinDecoys; // store vector of decoy protein names @@ -417,7 +420,7 @@ void CometWriteTxt::PrintProteins(FILE *fpout, bool bReturnFulProteinString = false; - CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, uiNumTotProteins, vProteinTargets, vProteinDecoys); + CometMassSpecUtils::GetProteinNameString(fpdb, iWhichQuery, iWhichResult, iPrintTargetDecoy, bReturnFulProteinString, uiNumTotProteins, vProteinTargets, vProteinDecoys, queries); bool bPrintComma = false; diff --git a/CometSearch/CometWriteTxt.h b/CometSearch/CometWriteTxt.h 
index db048fd4..f921b760 100644 --- a/CometSearch/CometWriteTxt.h +++ b/CometSearch/CometWriteTxt.h @@ -24,7 +24,8 @@ class CometWriteTxt ~CometWriteTxt(); static void WriteTxt(FILE *fpout, FILE *fpoutd, - FILE *fpdb); + FILE *fpdb, + const vector& queries); static void PrintTxtHeader(FILE *fpout); static void PrintModifications(FILE *fpout, @@ -35,13 +36,15 @@ class CometWriteTxt int iWhichQuery, int iWhichResult, int iPrintTargetDecoy, - unsigned int *uiNumTotProteins); + unsigned int *uiNumTotProteins, + const vector& queries); private: static void PrintResults(int iWhichQuery, int iPrintTargetDecoy, FILE *fpOut, - FILE *fpdb); + FILE *fpdb, + const vector& queries); }; #endif diff --git a/CometSearch/core/Types.h b/CometSearch/core/Types.h index afbe707f..035f3e13 100644 --- a/CometSearch/core/Types.h +++ b/CometSearch/core/Types.h @@ -580,15 +580,12 @@ extern bool g_bPlainPeptideIndexRead; // set to true if plain peptide index fi extern std::atomic g_bPeptideIndexRead; // set to true if peptide index file is read extern bool g_bSpecLibRead; // set to true if spectral library file is read -extern bool g_bPerformSpecLibSearch; // set to true if doing spectral library search -extern bool g_bPerformDatabaseSearch; // set to true if doing database search +// g_bPerformSpecLibSearch, g_bPerformDatabaseSearch, g_bIdxNoFasta moved to SearchSession +// (Phase 4: batch path only -- see search/SearchSession.h) extern bool g_bCometPreprocessMemoryAllocated; // set to true when memory has been allocated extern bool g_bCometSearchMemoryAllocated; // set to true when memory has been allocated -extern bool g_bIdxNoFasta; // set to true when .idx file being search but corresponding .fasta not present - // used in mzid output to skip sequence retrieval - // Query stores information for peptide scoring and results // This struct is allocated for each spectrum/charge combination struct Query @@ -806,8 +803,8 @@ struct QueryMS1 } }; -extern vector g_pvQuery; -extern vector 
g_pvQueryMS1; +// g_pvQuery and g_pvQueryMS1 moved to SearchSession.queries / SearchSession.ms1Queries +// (Phase 4: batch path only -- see search/SearchSession.h) extern vector g_pvInputFiles; extern Mutex g_pvQueryMutex; extern Mutex g_pvDBIndexMutex; diff --git a/CometSearch/output/IResultWriter.h b/CometSearch/output/IResultWriter.h index 76a235a7..34d6fdc6 100644 --- a/CometSearch/output/IResultWriter.h +++ b/CometSearch/output/IResultWriter.h @@ -17,8 +17,10 @@ #include #include +#include class CometSearchManager; +struct Query; // Parameters passed to each writer's open() method. struct WriterOpenCtx @@ -30,6 +32,7 @@ struct WriterOpenCtx int iFirstScan; int iLastScan; int iDecoySearch; // 0=off, 1=concat, 2=separate + bool bIdxNoFasta; // .idx DB with no companion .fasta (mzIdentML) CometSearchManager* pMgr; // for format headers that need ICometSearchManager }; @@ -37,8 +40,9 @@ struct WriterOpenCtx struct WriterWriteCtx { FILE* fpdb; - int iScanOffset; // iTotalSpectraSearched - g_pvQuery.size(); pepXML only + int iScanOffset; // iTotalSpectraSearched - queries.size(); pepXML only int iBatchNum; // mzIdentML only + const std::vector* pQueries; // batch query results for this write call }; class IResultWriter diff --git a/CometSearch/output/MzIdentMlWriter.h b/CometSearch/output/MzIdentMlWriter.h index 43125a55..d8f95251 100644 --- a/CometSearch/output/MzIdentMlWriter.h +++ b/CometSearch/output/MzIdentMlWriter.h @@ -27,6 +27,7 @@ class MzIdentMlWriter : public IResultWriter bool open(const WriterOpenCtx& ctx) override { + _bIdxNoFasta = ctx.bIdxNoFasta; BuildNames(ctx, ".mzid", ".decoy.mzid", ".target.mzid", _sTarget, _sDecoy); _fpout = fopen(_sTarget.c_str(), "w"); @@ -55,7 +56,7 @@ class MzIdentMlWriter : public IResultWriter bool write(const WriterWriteCtx& ctx) override { _fpdb = ctx.fpdb; // remember for close() - CometWriteMzIdentML::WriteMzIdentMLTmp(_fpoutTmp, _fpoutdTmp, ctx.iBatchNum); + CometWriteMzIdentML::WriteMzIdentMLTmp(_fpoutTmp, 
_fpoutdTmp, ctx.iBatchNum, *ctx.pQueries); return true; } @@ -73,12 +74,13 @@ class MzIdentMlWriter : public IResultWriter } private: - CometSearchManager* _pMgr = nullptr; - FILE* _fpout = nullptr; - FILE* _fpoutd = nullptr; - FILE* _fpoutTmp = nullptr; - FILE* _fpoutdTmp = nullptr; - FILE* _fpdb = nullptr; + CometSearchManager* _pMgr = nullptr; + FILE* _fpout = nullptr; + FILE* _fpoutd = nullptr; + FILE* _fpoutTmp = nullptr; + FILE* _fpoutdTmp = nullptr; + FILE* _fpdb = nullptr; + bool _bIdxNoFasta = false; std::string _sTarget, _sDecoy, _sTgtTmp, _sDecTmp; bool OpenTmp(const std::string& sBase, std::string& sTmp, FILE*& fp) @@ -114,7 +116,7 @@ class MzIdentMlWriter : public IResultWriter fpTmp = fopen(sTmp.c_str(), "r"); if (fpTmp) { - CometWriteMzIdentML::WriteMzIdentML(fpFinal, _fpdb, sTmp, *_pMgr); + CometWriteMzIdentML::WriteMzIdentML(fpFinal, _fpdb, sTmp, *_pMgr, _bIdxNoFasta); fclose(fpTmp); fpTmp = nullptr; if (!bEmpty) remove(sTmp.c_str()); } diff --git a/CometSearch/output/PepXmlWriter.h b/CometSearch/output/PepXmlWriter.h index 3270c04a..ea93336c 100644 --- a/CometSearch/output/PepXmlWriter.h +++ b/CometSearch/output/PepXmlWriter.h @@ -52,7 +52,7 @@ class PepXmlWriter : public IResultWriter bool write(const WriterWriteCtx& ctx) override { - CometWritePepXML::WritePepXML(_fpout, _fpoutd, ctx.fpdb, ctx.iScanOffset); + CometWritePepXML::WritePepXML(_fpout, _fpoutd, ctx.fpdb, ctx.iScanOffset, *ctx.pQueries); return true; } diff --git a/CometSearch/output/PercolatorWriter.h b/CometSearch/output/PercolatorWriter.h index d6528a39..1f6e3db7 100644 --- a/CometSearch/output/PercolatorWriter.h +++ b/CometSearch/output/PercolatorWriter.h @@ -44,7 +44,7 @@ class PercolatorWriter : public IResultWriter bool write(const WriterWriteCtx& ctx) override { - return CometWritePercolator::WritePercolator(_fpout, ctx.fpdb); + return CometWritePercolator::WritePercolator(_fpout, ctx.fpdb, *ctx.pQueries); } void close(bool /*bSucceeded*/, bool bEmpty) override diff --git 
a/CometSearch/output/SqtWriter.h b/CometSearch/output/SqtWriter.h index 34341843..5c6b4f09 100644 --- a/CometSearch/output/SqtWriter.h +++ b/CometSearch/output/SqtWriter.h @@ -53,7 +53,7 @@ class SqtWriter : public IResultWriter bool write(const WriterWriteCtx& ctx) override { - CometWriteSqt::WriteSqt(_fpout, _fpoutd, ctx.fpdb); + CometWriteSqt::WriteSqt(_fpout, _fpoutd, ctx.fpdb, *ctx.pQueries); return true; } diff --git a/CometSearch/output/TxtWriter.h b/CometSearch/output/TxtWriter.h index c9b5f77a..1f2e4d94 100644 --- a/CometSearch/output/TxtWriter.h +++ b/CometSearch/output/TxtWriter.h @@ -54,7 +54,7 @@ class TxtWriter : public IResultWriter bool write(const WriterWriteCtx& ctx) override { - CometWriteTxt::WriteTxt(_fpout, _fpoutd, ctx.fpdb); + CometWriteTxt::WriteTxt(_fpout, _fpoutd, ctx.fpdb, *ctx.pQueries); return true; } diff --git a/CometSearch/search/SearchSession.h b/CometSearch/search/SearchSession.h new file mode 100644 index 00000000..d3227698 --- /dev/null +++ b/CometSearch/search/SearchSession.h @@ -0,0 +1,69 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Owns all mutable state for one batch search run. +// Created at the top of CometSearchManager::DoSearch() per input-file iteration. +// Passed by reference to pipeline functions that read or write per-run state. 
+// +// Read-only index globals (g_iFragmentIndex, g_vFragmentPeptides, g_vRawPeptides, +// g_vSpecLib, g_pvProteinsList, g_pvProteinNameCache, g_pvDBIndex, …) are NOT moved +// here — they are large, initialised once, and shared read-only across all threads. +// +// Phase 4 migration note: +// g_pvQueryMutex, g_bPlainPeptideIndexRead, g_bSpecLibRead, and g_cometStatus +// remain as globals because they are also accessed from the RTS path +// (InitializeSingleSpectrumSearch / DoSingleSpectrumSearchMultiResults), which +// does not use SearchSession. Full removal is deferred to Phase 5. + +#ifndef _SEARCHSESSION_H_ +#define _SEARCHSESSION_H_ + +#include "core/Params.h" +#include "core/Types.h" +#include "CometStatus.h" +#include +#include + +struct SearchSession +{ + // Run parameters — set once before the file loop, then read-only. + const StaticParams& params; + + // Per-batch MS2 result accumulator. + // Guarded by queriesMutex in the batch path. + std::vector queries; + + // Per-batch MS1 result accumulator (batch path only). + std::vector ms1Queries; + + // Mutex protecting queries and ms1Queries during parallel spectrum loading. + std::mutex queriesMutex; + + // Run-time flags (replace the five batch-path-only globals). + bool bPerformDatabaseSearch = false; + bool bPerformSpecLibSearch = false; + bool bIdxNoFasta = false; + bool bPlainPeptideIndexRead = false; + bool bSpecLibRead = false; + + // Error / cancel state for this run. + // g_cometStatus remains as a global for the RTS path (Phase 5 will unify). 
+ CometStatus status; + + explicit SearchSession(const StaticParams& p) : params(p) {} + SearchSession(const SearchSession&) = delete; + SearchSession& operator=(const SearchSession&) = delete; +}; + +#endif // _SEARCHSESSION_H_ \ No newline at end of file From e25dcf163c1da38efb87aabbad33189ae7410116 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Sat, 13 Jun 2026 08:23:26 -0700 Subject: [PATCH 03/15] implement architecture migration phase 5: ISearchStrategy + Pipeline replace DoSearch per-file loop Extract FiStrategy, FastaStrategy, and PiStrategy as concrete ISearchStrategy implementations; Pipeline drives the per-file loop and writer lifecycle. DoSearch() reduces from ~700 lines to ~50: index-build early returns, strategy selection, writer factory, and pipeline.run(). SearchUtils.h adds inline utilities (GetInputType, UpdateInputFile, SetMSLevelFilter, AllocateResultsMem, comparators) shared across strategies without circular includes. Verified: 17/17 unit tests pass; HeLa FI_DB batch parity zero PSM diff at 1% and 5% FDR vs pre-Phase5 baseline (49,747 spectra, phospho + oxidation). 
Co-Authored-By: Claude Sonnet 4.6 --- CometSearch/CometSearchManager.cpp | 776 ++---------------------- CometSearch/Makefile | 9 +- CometSearch/search/FastaStrategy.cpp | 196 ++++++ CometSearch/search/FastaStrategy.h | 39 ++ CometSearch/search/FiStrategy.cpp | 248 ++++++++ CometSearch/search/FiStrategy.h | 41 ++ CometSearch/search/ISearchStrategy.h | 73 +++ CometSearch/search/PiStrategy.cpp | 150 +++++ CometSearch/search/PiStrategy.h | 39 ++ CometSearch/search/Pipeline.cpp | 269 ++++++++ CometSearch/search/Pipeline.h | 47 ++ CometSearch/search/SearchUtils.h | 234 +++++++ docs/20260612_architecture_migration.md | 74 +++ 13 files changed, 1460 insertions(+), 735 deletions(-) create mode 100644 CometSearch/search/FastaStrategy.cpp create mode 100644 CometSearch/search/FastaStrategy.h create mode 100644 CometSearch/search/FiStrategy.cpp create mode 100644 CometSearch/search/FiStrategy.h create mode 100644 CometSearch/search/ISearchStrategy.h create mode 100644 CometSearch/search/PiStrategy.cpp create mode 100644 CometSearch/search/PiStrategy.h create mode 100644 CometSearch/search/Pipeline.cpp create mode 100644 CometSearch/search/Pipeline.h create mode 100644 CometSearch/search/SearchUtils.h diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 13695d26..7a8296c6 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -39,6 +39,12 @@ #include "AScoreOptions.h" #include "AScoreFactory.h" #include "search/SearchSession.h" +#include "search/SearchUtils.h" +#include "search/ISearchStrategy.h" +#include "search/FiStrategy.h" +#include "search/FastaStrategy.h" +#include "search/PiStrategy.h" +#include "search/Pipeline.h" #include #include @@ -145,193 +151,6 @@ static std::string GetHostName() return {}; } -static InputType GetInputType(const char *pszFileName) -{ - int iLen = (int)strlen(pszFileName); - - if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 6, ".mzXML") - || !STRCMP_IGNORE_CASE(pszFileName + 
iLen - 5, ".mzML") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 9, ".mzXML.gz") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 8, ".mzML.gz")) - - { - return InputType_MZXML; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".raw")) - { - return InputType_RAW; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".ms2") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 5, ".cms2")) - { - return InputType_MS2; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".mgf")) - { - return InputType_MGF; - } - - return InputType_UNKNOWN; -} - -static bool UpdateInputFile(InputFileInfo *pFileInfo) -{ - bool bUpdateBaseName = false; - char szTmpBaseName[SIZE_FILE]; - - // Make sure not set on command line OR more than 1 input file - // Need to do this check here before g_staticParams.inputFile is set to *pFileInfo - if (g_staticParams.inputFile.szBaseName[0] =='\0' || g_pvInputFiles.size()>1) - bUpdateBaseName = true; - else - strcpy(szTmpBaseName, g_staticParams.inputFile.szBaseName); - - g_staticParams.inputFile = *pFileInfo; - - g_staticParams.inputFile.iInputType = GetInputType(g_staticParams.inputFile.szFileName); - - if (InputType_UNKNOWN == g_staticParams.inputFile.iInputType) - { - return false; - } - - // per request, perform quick check to validate file still exists - // to avoid creating stub output files in these cases. 
- FILE *fp; - if ( (fp=fopen(g_staticParams.inputFile.szFileName, "r"))==NULL) - { - string strErrorMsg = " Error - cannot read input file \"" + string(g_staticParams.inputFile.szFileName) + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - else - { - fclose(fp); - } - -#ifndef CRUX - if (bUpdateBaseName) // set individual basename from input file - { - char *pStr; - int iLen = (int)strlen(g_staticParams.inputFile.szFileName); - - strcpy(g_staticParams.inputFile.szBaseName, g_staticParams.inputFile.szFileName); - - if ( (pStr = strrchr(g_staticParams.inputFile.szBaseName, '.'))) - *pStr = '\0'; - - if (!STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 9, ".mzXML.gz") - || !STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 8, ".mzML.gz")) - { - if ( (pStr = strrchr(g_staticParams.inputFile.szBaseName, '.'))) - *pStr = '\0'; - } - } - else - { - strcpy(g_staticParams.inputFile.szBaseName, szTmpBaseName); // set basename from command line - } -#endif - - return true; -} - -static void SetMSLevelFilter(MSReader &mstReader) -{ - vector msLevel; - - if (g_staticParams.options.iMSLevel == 3) - msLevel.push_back(MS3); - else if (g_staticParams.options.iMSLevel == 2) - msLevel.push_back(MS2); - else if (g_staticParams.options.iMSLevel == 1) - msLevel.push_back(MS1); - - mstReader.setFilter(msLevel); -} - -// Allocate memory for the _pResults struct for each query entry. -static bool AllocateResultsMem(std::vector& queries) -{ - for (std::vector::iterator it = queries.begin(); it != queries.end(); ++it) - { - Query* pQuery = *it; - - try - { - pQuery->_pResults = new Results[g_staticParams.options.iNumStored]; - } - catch (std::bad_alloc& ba) - { - string strErrorMsg = " Error - new(_pResults[]). 
bad_alloc: \"" + std::string(ba.what()) + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - - if (g_staticParams.options.iDecoySearch==2) - { - try - { - pQuery->_pDecoys = new Results[g_staticParams.options.iNumStored]; - } - catch (std::bad_alloc& ba) - { - string strErrorMsg = " Error - new(_pDecoys[]). bad_alloc: " + std::string(ba.what()) + "\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - } - - pQuery->iMatchPeptideCount = 0; - pQuery->iDecoyMatchPeptideCount = 0; - - for (int j=0; j_pResults[j].dPepMass = 0.0; - pQuery->_pResults[j].dExpect = 999; - pQuery->_pResults[j].fScoreSp = 0.0; - pQuery->_pResults[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; - pQuery->_pResults[j].fAScorePro = 0.0; - pQuery->_pResults[j].usiLenPeptide = 0; - pQuery->_pResults[j].usiRankSp = 0; - pQuery->_pResults[j].usiMatchedIons = 0; - pQuery->_pResults[j].usiTotalIons = 0; - pQuery->_pResults[j].szPeptide[0] = '\0'; - pQuery->_pResults[j].sAScoreProSiteScores.clear(); - pQuery->_pResults[j].pWhichProtein.clear(); - pQuery->_pResults[j].sPeffOrigResidues.clear(); - pQuery->_pResults[j].iPeffOrigResiduePosition = -9; - memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram)); - - if (g_staticParams.options.iDecoySearch) - pQuery->_pResults[j].pWhichDecoyProtein.clear(); - - if (g_staticParams.options.iDecoySearch==2) - { - pQuery->_pDecoys[j].dPepMass = 0.0; - pQuery->_pDecoys[j].dExpect = 999; - pQuery->_pDecoys[j].fScoreSp = 0.0; - pQuery->_pDecoys[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; - pQuery->_pDecoys[j].fAScorePro = 0.0; - pQuery->_pDecoys[j].usiLenPeptide = 0; - pQuery->_pDecoys[j].usiRankSp = 0; - pQuery->_pDecoys[j].usiMatchedIons = 0; - pQuery->_pDecoys[j].usiTotalIons = 0; - pQuery->_pDecoys[j].szPeptide[0] = '\0'; - pQuery->_pDecoys[j].sAScoreProSiteScores.clear(); - 
pQuery->_pDecoys[j].pWhichProtein.clear(); - pQuery->_pDecoys[j].sPeffOrigResidues.clear(); - pQuery->_pDecoys[j].iPeffOrigResiduePosition = -9; - } - } - } - - return true; -} - // Allocate memory for the _pSpecLibResults struct for each session.queriesMS1 entry. static bool AllocateResultsMemMS1() { @@ -363,24 +182,6 @@ static bool AllocateResultsMemMS1() return true; } -static bool compareByPeptideMass(Query const* a, Query const* b) -{ - return (a->_pepMassInfo.dExpPepMass < b->_pepMassInfo.dExpPepMass); -} - -static bool compareByMangoIndex(Query const* a, Query const* b) -{ - return (a->dMangoIndex < b->dMangoIndex); -} - -static bool compareByScanNumber(Query const* a, Query const* b) -{ - // sort by charge state if same scan number - if (a->_spectrumInfoInternal.iScanNumber == b->_spectrumInfoInternal.iScanNumber) - return (a->_spectrumInfoInternal.usiChargeState < b->_spectrumInfoInternal.usiChargeState); - return (a->_spectrumInfoInternal.iScanNumber < b->_spectrumInfoInternal.iScanNumber); -} - static bool ValidateOutputFormat() { if (!g_staticParams.options.bOutputSqtStream @@ -2181,8 +1982,6 @@ bool CometSearchManager::DoSearch() ThreadPool *tp = _tp; - auto tGlobalStartTime = chrono::steady_clock::now(); - if (!InitializeStaticParams()) return false; @@ -2346,546 +2145,55 @@ bool CometSearchManager::DoSearch() return bSucceeded; // index written; caller (InitializeSingleSpectrumSearch) will load it } - bool bBlankSearchFile = false; - - if (g_bPerformDatabaseSearch && g_staticParams.iDbType == DbType::FI_DB) + // AScore initialization (once for entire DoSearch run) + if (g_staticParams.options.iPrintAScoreProScore) { - if (!g_staticParams.options.iFragIndexSkipReadPrecursors) + SetAScoreOptions(g_AScoreOptions); + g_AScoreInterface = CreateAScoreDllInterface(); + if (!g_AScoreInterface) { - // read precursors before creating fragment index - auto tTime1 = chrono::steady_clock::now(); - if (!g_staticParams.options.bOutputSqtStream) - { - cout << " - 
read precursors ... "; - fflush(stdout); - } - - for (int i = 0; i < (int)g_pvInputFiles.size(); ++i) - { - bSucceeded = UpdateInputFile(g_pvInputFiles.at(i)); - if (!bSucceeded) - break; - - // For file access using MSToolkit. - MSReader mstReader; - - // We want to read only MS2/MS3 scans. - SetMSLevelFilter(mstReader); - - CometPreprocess::Reset(); - - bSucceeded = CometPreprocess::ReadPrecursors(mstReader); - } - - if (!g_staticParams.options.bOutputSqtStream) - cout << CometMassSpecUtils::ElapsedTime(tTime1) << endl; + std::cerr << "Failed to create AScore interface." << std::endl; + return false; } } if (g_bPerformSpecLibSearch) - { CometSpecLib::LoadSpecLib(g_staticParams.speclibInfo.strSpecLibFile); - } - - bool bPerformAScoreInitialization = true; - - for (int i = 0; i < (int)g_pvInputFiles.size(); ++i) - { - bSucceeded = UpdateInputFile(g_pvInputFiles.at(i)); - if (!bSucceeded) - break; - - SearchSession session(g_staticParams); - - time_t tStartTime; - time(&tStartTime); - strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tStartTime)); - - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - strOut = " Search start: " + string(g_staticParams.szDate) + "\n"; - strOut += " - Input file: " + string(g_staticParams.inputFile.szFileName) + "\n"; - logout(strOut); - fflush(stdout); - } - - int iFirstScan = g_staticParams.inputFile.iFirstScan; // First scan to search specified by user. - int iLastScan = g_staticParams.inputFile.iLastScan; // Last scan to search specified by user. - int iPercentStart = 0; // percentage within input file for start scan of batch - int iPercentEnd = 0; // percentage within input file for end scan of batch - int iAnalysisType = g_staticParams.inputFile.iAnalysisType; // 1=dta (retired), - // 2=specific scan, - // 3=specific scan + charge, - // 4=scan range, - // 5=entire file - - // Phase 3: writer factory -- builds vector from options. 
- // Each writer owns its file handle(s); open() opens + writes format header, - // write() outputs one batch, close() writes footer + fcloses. - WriterOpenCtx woctx; - woctx.szBaseName = g_staticParams.inputFile.szBaseName; - woctx.szOutputSuffix = g_staticParams.szOutputSuffix; - woctx.szTxtFileExt = g_staticParams.szTxtFileExt; - woctx.bEntireFile = (iAnalysisType == AnalysisType_EntireFile); - woctx.iFirstScan = iFirstScan; - woctx.iLastScan = iLastScan; - woctx.iDecoySearch = g_staticParams.options.iDecoySearch; - woctx.bIdxNoFasta = g_bIdxNoFasta; - woctx.pMgr = this; - - std::vector> vWriters; - - // PepXML, mzIdentML, Percolator, Txt first; SQT last (WriteSqt modifies szMod). - if (bSucceeded && g_staticParams.options.bOutputPepXMLFile) - { - auto pw = std::make_unique(); - if (!pw->open(woctx)) bSucceeded = false; - else vWriters.push_back(std::move(pw)); - } - - if (bSucceeded && g_staticParams.options.iOutputMzIdentMLFile) - { - auto pw = std::make_unique(this); - if (!pw->open(woctx)) bSucceeded = false; - else vWriters.push_back(std::move(pw)); - } - - if (bSucceeded && g_staticParams.options.bOutputPercolatorFile) - { - auto pw = std::make_unique(); - if (!pw->open(woctx)) bSucceeded = false; - else vWriters.push_back(std::move(pw)); - } - - if (bSucceeded && g_staticParams.options.bOutputTxtFile) - { - auto pw = std::make_unique(); - if (!pw->open(woctx)) bSucceeded = false; - else vWriters.push_back(std::move(pw)); - } - - if (bSucceeded && (g_staticParams.options.bOutputSqtFile || g_staticParams.options.bOutputSqtStream)) - { - auto pw = std::make_unique(); - if (!pw->open(woctx)) bSucceeded = false; - else vWriters.push_back(std::move(pw)); - } - - int iTotalSpectraSearched = 0; - if (bSucceeded) - { - //MH: Allocate memory shared by threads during spectral processing. 
- bSucceeded = CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads); - if (!bSucceeded) - break; - - // Allocate memory shared by threads during search - bSucceeded = CometSearch::AllocateMemory(g_staticParams.options.iNumThreads); - if (!bSucceeded) - break; - - // For file access using MSToolkit. - MSReader mstReader; - - // We want to read only MS2/MS3 scans. - SetMSLevelFilter(mstReader); - - // We need to reset some of the static variables in-between input files - CometPreprocess::Reset(); - - FILE* fpfasta = NULL; // pointer to FASTA file; if .idx search, FASTA is used to retrieve sequences (mzid output) - FILE* fpidx = NULL; // pointer to .idx file if used - - if (g_bPerformDatabaseSearch) - { - string sTmpDB = g_staticParams.databaseInfo.szDatabase; - - if (g_staticParams.iDbType != DbType::FASTA_DB) - { - // .idx db so first open .idx file - if ((fpidx = fopen(sTmpDB.c_str(), "r")) == NULL) - { - string strErrorMsg = " Error (1a) - cannot read .idx file \"" + sTmpDB + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - - // .idx db so next check if FASTA is present (not required) - sTmpDB = sTmpDB.erase(sTmpDB.size() - 4); // need plain fasta if indexdb input - if ((fpfasta = fopen(sTmpDB.c_str(), "r")) == NULL) - { - g_bIdxNoFasta = true; - fpfasta = NULL; - } - } - else - { - // FASTA search only - fpidx = NULL; - - if ((fpfasta = fopen(sTmpDB.c_str(), "r")) == NULL) - { - string strErrorMsg = " Error (1b) - cannot read sequence database file \"" + sTmpDB + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - } - } - - if (g_staticParams.options.iSpectrumBatchSize == 0 && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(" - Reading all spectra into memory; set \"spectrum_batch_size\" if search terminates here.\n"); - fflush(stdout); - } - - CometFragmentIndex sqSearch; - - if (g_bPerformDatabaseSearch && 
g_staticParams.iDbType == DbType::FI_DB) - { - if (!g_bPlainPeptideIndexRead) - { - auto tStartTime = chrono::steady_clock::now(); - if (!g_staticParams.options.bOutputSqtStream) - { - cout << " - read .idx ... "; - fflush(stdout); - } - - sqSearch.ReadPlainPeptideIndex(); - - if (!g_staticParams.options.bOutputSqtStream) - { - cout << CometMassSpecUtils::ElapsedTime(tStartTime) << endl; - } - - sqSearch.CreateFragmentIndex(tp); - } - } - - if (g_staticParams.options.iPrintAScoreProScore && bPerformAScoreInitialization) - { - SetAScoreOptions(g_AScoreOptions); -// PrintAScoreOptions(g_AScoreOptions); - - // Create the AScoreDllInterface using the factory function - g_AScoreInterface = CreateAScoreDllInterface(); - if (!g_AScoreInterface) - { - std::cerr << "Failed to create AScore interface." << std::endl; - exit(1); - } - - bPerformAScoreInitialization = false; - } - - auto tBeginTime = chrono::steady_clock::now(); - if (g_staticParams.iDbType != DbType::FASTA_DB) - { - printf(" - searching \"%s\" ... ", g_staticParams.inputFile.szBaseName); - fflush(stdout); - } - - FILE* fpdb = NULL; - if (g_bPerformDatabaseSearch) - { - if (g_staticParams.iDbType != DbType::FASTA_DB) - fpdb = fpidx; - else - fpdb = fpfasta; - } - - int iBatchNum = 0; - while (!CometPreprocess::DoneProcessingAllSpectra()) // Loop through iMaxSpectraPerSearch - { - iBatchNum++; - - // Fused FI_DB path: read + preprocess + search + post-analysis per spectrum - // in one pass using per-thread scratch buffers and a lock-free dispatch loop. - // Excludes Mango and spectral-library paths which rely on legacy ordering. - bool bFusedFIDB = (g_staticParams.iDbType == DbType::FI_DB - && g_bPerformDatabaseSearch - && !g_staticParams.options.bMango - && !g_bPerformSpecLibSearch); - - if (bFusedFIDB) - { - // IMPORTANT: From this point onwards, because we've loaded some - // spectra, we MUST "goto cleanup_results" before exiting the loop, - // or we will create a memory leak! 
- g_cometStatus.SetStatusMsg(string("Running fused FI_DB search...")); - - bSucceeded = CometPreprocess::FusedLoadAndSearchSpectra(mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); - - if (!bSucceeded) - goto cleanup_results; - - iPercentStart = iPercentEnd; - iPercentEnd = mstReader.getPercent(); - - if (session.queries.empty()) - continue; - - iTotalSpectraSearched += (int)session.queries.size(); - } - else - { - // Legacy three-sweep path: LoadAndPreprocess -> AllocateResults -> - // sort-by-mass -> RunSearch -> PostAnalysis. - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(" - Load spectra:"); - fflush(stdout); - } - - g_cometStatus.SetStatusMsg(string("Loading and processing input spectra")); - - // IMPORTANT: From this point onwards, because we've loaded some - // spectra, we MUST "goto cleanup_results" before exiting the loop, - // or we will create a memory leak! - - bSucceeded = CometPreprocess::LoadAndPreprocessSpectra(mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); - - if (!bSucceeded) - goto cleanup_results; - - iPercentStart = iPercentEnd; - iPercentEnd = mstReader.getPercent(); - - if (session.queries.empty()) - continue; //FIX make sure continue instead of break makes sense - else // possible no spectrum in batch passes filters; do not want to break in that case; - iTotalSpectraSearched += (int)session.queries.size(); - - bSucceeded = AllocateResultsMem(session.queries); - - if (!bSucceeded) - goto cleanup_results; - - { // need strStatusMsg in it's own scope due to goto statement above - string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(strStatusMsg); - } - g_cometStatus.SetStatusMsg(strStatusMsg); - } - - if (g_staticParams.options.bMango) - { - int iCurrentScanNumber = 0; // used to track multiple Mango precursors from same 
scan number - int iMangoIndex=0; - - // sort back to original spectrum order in MS2 scan in order to associate pairs - // based on sequential order of precursors for each scan - std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex); - - for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) - { - if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) - { - iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; - iMangoIndex = 0; - } - else - iMangoIndex++; - - sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", (int)iMangoIndex/2, (iMangoIndex % 2)?'B':'A'); - } - } - - // Sort session.queries vector by dExpPepMass. - std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); - - g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; - g_massRange.dMaxMass = session.queries.at(session.queries.size()-1)->_pepMassInfo.dPeptideMassTolerancePlus; - - if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) - g_massRange.bNarrowMassRange = true; - else - g_massRange.bNarrowMassRange = false; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - goto cleanup_results; - g_cometStatus.SetStatusMsg(string("Running search...")); + // Build search session with run-level flags. + SearchSession session(g_staticParams); + session.bPerformDatabaseSearch = g_bPerformDatabaseSearch; + session.bPerformSpecLibSearch = g_bPerformSpecLibSearch; - // Now that spectra are loaded to memory and sorted, do search. 
- if (g_bPerformDatabaseSearch) - bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); - if (g_bPerformSpecLibSearch) - bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); - - if (!bSucceeded) - goto cleanup_results; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - goto cleanup_results; - - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(" - Post analysis:"); - fflush(stdout); - } - - if (g_bPerformDatabaseSearch) - { - g_cometStatus.SetStatusMsg(string("Performing post-search analysis ...")); - - // Sort each entry by xcorr, calculate E-values, etc. - bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); - } - - if (!bSucceeded) - goto cleanup_results; - } - - // Sort session.queries vector by scan (shared by both paths). - std::sort(session.queries.begin(), session.queries.end(), compareByScanNumber); - - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - logout(" done\n"); - fflush(stdout); - } - - // Phase 3: per-batch write via polymorphic writer loop. - // Insertion order guarantees SQT writes last (destroys szMod). - { - WriterWriteCtx wwctx; - wwctx.fpdb = fpdb; - wwctx.iScanOffset = iTotalSpectraSearched - (int)session.queries.size(); - wwctx.iBatchNum = iBatchNum; - wwctx.pQueries = &session.queries; - for (auto& pw : vWriters) - { - if (!pw->write(wwctx)) - { - bSucceeded = false; - goto cleanup_results; - } - } - } - -cleanup_results: - - // Deleting each Query object in the vector calls its destructor, which - // frees the spectral memory (see definition for Query in CometDataInternal.h). 
- for (auto it = session.queries.begin(); it != session.queries.end(); ++it) - delete (*it); - - session.queries.clear(); - - if (!bSucceeded) - break; - } - - if (bSucceeded) - { - if (iTotalSpectraSearched == 0) - logout(" Warning - no spectra searched.\n"); - - if (!g_staticParams.options.bOutputSqtStream) - { - const auto duration = chrono::duration_cast(chrono::steady_clock::now() - tBeginTime); - double dTimePerSpectra = (double)duration.count() / (double)iTotalSpectraSearched; - - if (g_staticParams.iDbType == DbType::FASTA_DB) - strOut = " - Run stats: "; - else - strOut = ""; - - char buf[128]; - - std::snprintf(buf, sizeof(buf), "%.2f", dTimePerSpectra); - strOut += CometMassSpecUtils::ElapsedTime(tBeginTime) + " (" + std::to_string(iTotalSpectraSearched) + " spectra, " - + std::string(buf) + "ms/spec, "; - - std::snprintf(buf, sizeof(buf), "%.0f", 1000.0 / dTimePerSpectra); - strOut += std::string(buf) + "Hz"; - - if (g_staticParams.iDbType == DbType::FASTA_DB) - strOut += ", " + CometMassSpecUtils::GetPeakMemory(); - - strOut += ")\n"; - - logout(strOut); - } - - if (!g_staticParams.options.bOutputSqtStream && g_staticParams.iDbType == DbType::FASTA_DB) - { - time_t tEndTime; - - time(&tEndTime); - - strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tEndTime)); - strOut = " Search end: " + string(g_staticParams.szDate) + " (" + CometMassSpecUtils::ElapsedTime(tGlobalStartTime) + ", " + CometMassSpecUtils::GetPeakMemory() + ")\n\n"; - logout(strOut); - } - } - - if (fpidx != NULL) - fclose(fpidx); - if (fpfasta != NULL) - fclose(fpfasta); - } - - //MH: Deallocate spectral processing memory. - CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads); - - // Deallocate search memory - CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads); - - // Phase 3: finalize, fclose, and optionally remove files on empty search. 
- { - bool bEmpty = (iTotalSpectraSearched == 0); - for (auto& pw : vWriters) - pw->close(bSucceeded, bEmpty); - vWriters.clear(); - } - - if (iTotalSpectraSearched == 0) - bBlankSearchFile = true; - - g_staticParams.inputFile.szBaseName[0] = '\0'; - - if (!bSucceeded) - break; - } - - if (g_staticParams.iDbType == DbType::FI_DB) // clean fragment ion index - { - free(g_bIndexPrecursors); // allocated in InitializeStaticParams - - delete[] g_iFragmentIndex; - delete[] g_iFragmentIndexOffset; - } - - if (g_staticParams.iDbType != DbType::FASTA_DB) // for either index search - { - strOut = " - done. (" + CometMassSpecUtils::ElapsedTime(tGlobalStartTime); - - string strMemUse = CometMassSpecUtils::GetPeakMemory(); - if (!strMemUse.empty()) - strOut += ", " + strMemUse + ")"; - else - strOut += ")"; - - strOut += "\n\n"; - - logout(strOut); - } + // Select strategy and create writers, then run the pipeline. + std::unique_ptr pStrategy; + if (g_staticParams.iDbType == DbType::FI_DB) + pStrategy = std::make_unique(); + else if (g_staticParams.iDbType == DbType::PI_DB) + pStrategy = std::make_unique(); + else + pStrategy = std::make_unique(); + + // PepXML, mzIdentML, Percolator, Txt first; SQT last (WriteSqt modifies szMod). 
+ std::vector> vWriters; + if (g_staticParams.options.bOutputPepXMLFile) + vWriters.push_back(std::make_unique()); + if (g_staticParams.options.iOutputMzIdentMLFile) + vWriters.push_back(std::make_unique(this)); + if (g_staticParams.options.bOutputPercolatorFile) + vWriters.push_back(std::make_unique()); + if (g_staticParams.options.bOutputTxtFile) + vWriters.push_back(std::make_unique()); + if (g_staticParams.options.bOutputSqtFile || g_staticParams.options.bOutputSqtStream) + vWriters.push_back(std::make_unique()); + + Pipeline pipeline(std::move(pStrategy), std::move(vWriters), this); + bSucceeded = pipeline.run(session, g_pvInputFiles, *tp); if (g_staticParams.options.iPrintAScoreProScore) DeleteAScoreDllInterface(g_AScoreInterface); - if (bBlankSearchFile) - return false; - else - return bSucceeded; + return bSucceeded; } diff --git a/CometSearch/Makefile b/CometSearch/Makefile index c5a065a3..219d63eb 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -29,8 +29,11 @@ COMETSEARCH_SRC = Threading CometInterfaces CometSearch CometPreprocess CometPos THREADING_SRC = threading/SearchMemoryPool +SEARCH_SRC = search/FiStrategy search/FastaStrategy search/PiStrategy search/Pipeline + COMETSEARCH_OBJ = $(addprefix $(OBJDIR)/, $(addsuffix .o, $(COMETSEARCH_SRC))) \ - $(addprefix $(OBJDIR)/, $(addsuffix .o, $(THREADING_SRC))) + $(addprefix $(OBJDIR)/, $(addsuffix .o, $(THREADING_SRC))) \ + $(addprefix $(OBJDIR)/, $(addsuffix .o, $(SEARCH_SRC))) all: libcometsearch.a @@ -64,5 +67,9 @@ $(OBJDIR)/threading/%.o: threading/%.cpp threading/%.h | $(OBJDIR) @mkdir -p $(OBJDIR)/threading ${CXX} ${CXXFLAGS} -I. $< -c -o $@ +$(OBJDIR)/search/%.o: search/%.cpp | $(OBJDIR) + @mkdir -p $(OBJDIR)/search + ${CXX} ${CXXFLAGS} -I. 
$< -c -o $@ + clean: rm -rf $(OBJDIR) *.a diff --git a/CometSearch/search/FastaStrategy.cpp b/CometSearch/search/FastaStrategy.cpp new file mode 100644 index 00000000..b431efc0 --- /dev/null +++ b/CometSearch/search/FastaStrategy.cpp @@ -0,0 +1,196 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "Common.h" +#include "FastaStrategy.h" +#include "SearchUtils.h" +#include "CometPreprocess.h" +#include "CometSearch.h" +#include "CometPostAnalysis.h" +#include "CometSearchManager.h" +#include "MSReader.h" + +bool FastaStrategy::initialize(SearchSession& session, ThreadPool* tp) +{ + // Read protein variable-mod filter file (FASTA-only feature). + if (session.bPerformDatabaseSearch + && g_staticParams.variableModParameters.sProteinLModsListFile.length() > 0) + { + bool bVarModUsed = false; + for (int iMod = 0; iMod < VMODS; ++iMod) + { + if (g_staticParams.variableModParameters.varModList[iMod].dVarModMass != 0.0) + { + bVarModUsed = true; + break; + } + } + + if (bVarModUsed) + { + // ReadProteinVarModFilterFile() is a private member of CometSearchManager; + // it is called from DoSearch() before pipeline.run() for the FASTA path. + // This initialize() is called AFTER that call, so the filter is already loaded. + // Nothing to do here. (The call is retained in DoSearch() for the FASTA path + // only, which is handled before makeStrategy() is invoked.) 
+ } + } + + if (!CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads)) + return false; + + if (!CometSearch::AllocateMemory(g_staticParams.options.iNumThreads)) + return false; + + return true; +} + +bool FastaStrategy::openFiles(const std::string& szDatabase, + FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb, + SearchSession& session) +{ + fpfasta = nullptr; + fpidx = nullptr; + fpdb = nullptr; + + if (!session.bPerformDatabaseSearch) + return true; + + if ((fpfasta = fopen(szDatabase.c_str(), "r")) == nullptr) + { + string strErrorMsg = " Error (1b) - cannot read sequence database file \"" + szDatabase + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(strErrorMsg); + return false; + } + + fpdb = fpfasta; + (void)session; // session.bIdxNoFasta stays false for FASTA searches + + return true; +} + +bool FastaStrategy::executeBatch(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session) +{ + if (!g_staticParams.options.bOutputSqtStream) + { + logout(" - Load spectra:"); + fflush(stdout); + } + + g_cometStatus.SetStatusMsg(string("Loading and processing input spectra")); + + bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( + mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); + + iPercentStart = iPercentEnd; + iPercentEnd = mstReader.getPercent(); + + if (!bSucceeded) + return false; + + if (session.queries.empty()) + return true; + + bSucceeded = AllocateResultsMem(session.queries); + if (!bSucceeded) + return false; + + { + string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); + if (!g_staticParams.options.bOutputSqtStream) + logout(strStatusMsg); + g_cometStatus.SetStatusMsg(strStatusMsg); + } + + if (g_staticParams.options.bMango) + { + int iCurrentScanNumber = 0; + int iMangoIndex = 0; + + std::sort(session.queries.begin(), session.queries.end(), 
compareByMangoIndex); + + for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) + { + if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) + { + iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; + iMangoIndex = 0; + } + else + { + iMangoIndex++; + } + sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", + (int)iMangoIndex / 2, (iMangoIndex % 2) ? 'B' : 'A'); + } + } + + std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); + + g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; + g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus; + + if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) + g_massRange.bNarrowMassRange = true; + else + g_massRange.bNarrowMassRange = false; + + bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); + if (!bSucceeded) + return false; + + g_cometStatus.SetStatusMsg(string("Running search...")); + + if (session.bPerformDatabaseSearch) + bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); + if (bSucceeded && session.bPerformSpecLibSearch) + bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); + + if (!bSucceeded) + return false; + + bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); + if (!bSucceeded) + return false; + + if (!g_staticParams.options.bOutputSqtStream) + { + logout(" - Post analysis:"); + fflush(stdout); + } + + if (session.bPerformDatabaseSearch) + { + g_cometStatus.SetStatusMsg(string("Performing post-search analysis ...")); + bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); + } + + return bSucceeded; +} + +void FastaStrategy::closeFiles(FILE* fpfasta, FILE* fpidx) +{ + (void)fpidx; // always nullptr for FASTA searches + if (fpfasta != nullptr) fclose(fpfasta); +} + +void FastaStrategy::finalize() 
+{ + CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads); + CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads); +} diff --git a/CometSearch/search/FastaStrategy.h b/CometSearch/search/FastaStrategy.h new file mode 100644 index 00000000..9368c37f --- /dev/null +++ b/CometSearch/search/FastaStrategy.h @@ -0,0 +1,39 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "ISearchStrategy.h" + +// Search strategy for FASTA_DB (classic three-sweep) batch searches. +// +// initialize(): allocates search and preprocess memory pools; it does not +// read the protein variable-mod filter file itself. +// executeBatch(): LoadAndPreprocessSpectra -> RunSearch -> PostAnalysis. +// finalize(): frees memory pools.
+class FastaStrategy : public ISearchStrategy +{ +public: + bool initialize(SearchSession& session, ThreadPool* tp) override; + bool openFiles(const std::string& szDatabase, + FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb, + SearchSession& session) override; + bool executeBatch(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session) override; + void closeFiles(FILE* fpfasta, FILE* fpidx) override; + void finalize() override; + bool isIndexBased() const override { return false; } +}; diff --git a/CometSearch/search/FiStrategy.cpp b/CometSearch/search/FiStrategy.cpp new file mode 100644 index 00000000..14e373cd --- /dev/null +++ b/CometSearch/search/FiStrategy.cpp @@ -0,0 +1,248 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "Common.h" +#include "FiStrategy.h" +#include "SearchUtils.h" +#include "CometFragmentIndex.h" +#include "CometPreprocess.h" +#include "CometSearch.h" +#include "CometPostAnalysis.h" +#include "CometMassSpecUtils.h" +#include "MSReader.h" + +extern std::vector g_pvInputFiles; +extern bool g_bPlainPeptideIndexRead; +extern unsigned int* g_iFragmentIndex; +extern uint64_t* g_iFragmentIndexOffset; +extern bool* g_bIndexPrecursors; + +bool FiStrategy::initialize(SearchSession& session, ThreadPool* tp) +{ + if (!CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads)) + return false; + + if (!CometSearch::AllocateMemory(g_staticParams.options.iNumThreads)) + return false; + + // Pre-read precursors across all input files before building the index. + if (session.bPerformDatabaseSearch && !g_staticParams.options.iFragIndexSkipReadPrecursors) + { + auto tTime1 = chrono::steady_clock::now(); + if (!g_staticParams.options.bOutputSqtStream) + { + cout << " - read precursors ... "; + fflush(stdout); + } + + for (int i = 0; i < (int)g_pvInputFiles.size(); ++i) + { + if (!UpdateInputFile(g_pvInputFiles.at(i))) + return false; + + MSReader mstReader; + SetMSLevelFilter(mstReader); + CometPreprocess::Reset(); + + if (!CometPreprocess::ReadPrecursors(mstReader)) + return false; + } + + if (!g_staticParams.options.bOutputSqtStream) + cout << CometMassSpecUtils::ElapsedTime(tTime1) << endl; + } + + // Load plain peptide index (.idx) and build the in-memory fragment index. + if (session.bPerformDatabaseSearch && !g_bPlainPeptideIndexRead) + { + auto tStartTime = chrono::steady_clock::now(); + if (!g_staticParams.options.bOutputSqtStream) + { + cout << " - read .idx ... 
"; + fflush(stdout); + } + + CometFragmentIndex sqSearch; + sqSearch.ReadPlainPeptideIndex(); + + if (!g_staticParams.options.bOutputSqtStream) + cout << CometMassSpecUtils::ElapsedTime(tStartTime) << endl; + + sqSearch.CreateFragmentIndex(tp); + } + + return true; +} + +bool FiStrategy::openFiles(const std::string& szDatabase, + FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb, + SearchSession& session) +{ + fpfasta = nullptr; + fpidx = nullptr; + fpdb = nullptr; + + if (!session.bPerformDatabaseSearch) + return true; + + string sTmpDB = szDatabase; + + if ((fpidx = fopen(sTmpDB.c_str(), "r")) == nullptr) + { + string strErrorMsg = " Error (1a) - cannot read .idx file \"" + sTmpDB + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(strErrorMsg); + return false; + } + + // Try to open the companion .fasta (not required for FI_DB search). + sTmpDB = sTmpDB.erase(sTmpDB.size() - 4); // strip .idx + if ((fpfasta = fopen(sTmpDB.c_str(), "r")) == nullptr) + { + session.bIdxNoFasta = true; + fpfasta = nullptr; + } + + fpdb = fpidx; + + return true; +} + +bool FiStrategy::executeBatch(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session) +{ + // Fused path: per-spectrum read+preprocess+search+post-analysis in one pass. + // Disabled for Mango or speclib runs (those require the legacy ordering). 
+ bool bFused = session.bPerformDatabaseSearch + && !g_staticParams.options.bMango + && !session.bPerformSpecLibSearch; + + if (bFused) + { + g_cometStatus.SetStatusMsg(string("Running fused FI_DB search...")); + + bool bSucceeded = CometPreprocess::FusedLoadAndSearchSpectra( + mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); + + iPercentStart = iPercentEnd; + iPercentEnd = mstReader.getPercent(); + + return bSucceeded; + } + + // Legacy three-sweep path: LoadAndPreprocess -> AllocateResults -> + // sort-by-mass -> RunSearch -> PostAnalysis. + g_cometStatus.SetStatusMsg(string("Loading and processing input spectra")); + + bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( + mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); + + iPercentStart = iPercentEnd; + iPercentEnd = mstReader.getPercent(); + + if (!bSucceeded) + return false; + + if (session.queries.empty()) + return true; // no spectra in this batch; caller will continue to next + + bSucceeded = AllocateResultsMem(session.queries); + if (!bSucceeded) + return false; + + { + string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); + g_cometStatus.SetStatusMsg(strStatusMsg); + } + + if (g_staticParams.options.bMango) + { + int iCurrentScanNumber = 0; + int iMangoIndex = 0; + + std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex); + + for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) + { + if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) + { + iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; + iMangoIndex = 0; + } + else + { + iMangoIndex++; + } + sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", + (int)iMangoIndex / 2, (iMangoIndex % 2) ? 
'B' : 'A'); + } + } + + std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); + + g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; + g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus; + + if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) + g_massRange.bNarrowMassRange = true; + else + g_massRange.bNarrowMassRange = false; + + bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); + if (!bSucceeded) + return false; + + g_cometStatus.SetStatusMsg(string("Running search...")); + + if (session.bPerformDatabaseSearch) + bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); + if (bSucceeded && session.bPerformSpecLibSearch) + bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); + + if (!bSucceeded) + return false; + + bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); + if (!bSucceeded) + return false; + + if (session.bPerformDatabaseSearch) + { + g_cometStatus.SetStatusMsg(string("Performing post-search analysis ...")); + bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); + } + + return bSucceeded; +} + +void FiStrategy::closeFiles(FILE* fpfasta, FILE* fpidx) +{ + if (fpidx != nullptr) fclose(fpidx); + if (fpfasta != nullptr) fclose(fpfasta); +} + +void FiStrategy::finalize() +{ + if (g_staticParams.iDbType == DbType::FI_DB) + { + free(g_bIndexPrecursors); + delete[] g_iFragmentIndex; + delete[] g_iFragmentIndexOffset; + } + + CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads); + CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads); +} diff --git a/CometSearch/search/FiStrategy.h b/CometSearch/search/FiStrategy.h new file mode 100644 index 00000000..c2136ff4 --- /dev/null +++ b/CometSearch/search/FiStrategy.h @@ -0,0 +1,41 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under 
the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "ISearchStrategy.h" + +// Search strategy for FI_DB (fragment ion index) batch searches. +// +// initialize(): pre-reads precursors, loads the .idx plain-peptide table, +// builds the in-memory fragment ion index. +// executeBatch(): uses the fused FI path (FusedLoadAndSearchSpectra) when +// possible; falls back to the legacy three-sweep path for +// Mango or speclib runs where the fused path is unavailable. +// finalize(): frees the fragment index arrays and memory pools. 
+class FiStrategy : public ISearchStrategy +{ +public: + bool initialize(SearchSession& session, ThreadPool* tp) override; + bool openFiles(const std::string& szDatabase, + FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb, + SearchSession& session) override; + bool executeBatch(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session) override; + void closeFiles(FILE* fpfasta, FILE* fpidx) override; + void finalize() override; + bool isIndexBased() const override { return true; } +}; diff --git a/CometSearch/search/ISearchStrategy.h b/CometSearch/search/ISearchStrategy.h new file mode 100644 index 00000000..02d48e49 --- /dev/null +++ b/CometSearch/search/ISearchStrategy.h @@ -0,0 +1,73 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "SearchSession.h" +#include "ThreadPool.h" +#include +#include + +namespace MSToolkit { class MSReader; } + +// Abstract search strategy. One concrete implementation per database type: +// FiStrategy -- FI_DB (fragment ion index, fused + fallback legacy path) +// FastaStrategy -- FASTA_DB (classic three-sweep path) +// PiStrategy -- PI_DB (plain peptide index) +// +// Pipeline selects the correct one at startup and holds it for the entire run. +class ISearchStrategy +{ +public: + virtual ~ISearchStrategy() = default; + + // Called once before the per-file loop. 
+ // Allocates search/preprocess memory pools; FI_DB also pre-reads precursors + // and loads/builds the fragment index (FASTA_DB and PI_DB only allocate). + // Returns false on error. + virtual bool initialize(SearchSession& session, ThreadPool* tp) = 0; + + // Called once per input file. + // Opens database file handles (fpfasta, fpidx) and sets fpdb to whichever + // handle writers use for sequence retrieval. + // Sets session.bIdxNoFasta = true when an .idx search has no companion .fasta. + // Returns false on error. + virtual bool openFiles(const std::string& szDatabase, + FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb, + SearchSession& session) = 0; + + // Called once per batch within a file. + // Fills session.queries with fully scored Query* results (preprocess + search + // + post-analysis, all done here). May return with session.queries empty + // if no spectra passed the filters in this batch. + // Updates iPercentStart/iPercentEnd after loading (before RunSearch) so that + // RunSearch receives the file-position range for this batch. + // Returns false on error or cancel. + virtual bool executeBatch(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session) = 0; + + // Called once per input file after all batches. + // Closes the file handles opened by openFiles(). + virtual void closeFiles(FILE* fpfasta, FILE* fpidx) = 0; + + // Called once after all files. + // Frees memory pools and (for FI_DB) the fragment index arrays. + virtual void finalize() = 0; + + // Returns true for index-based searches (FI_DB, PI_DB). + // Pipeline uses this to select progress-message style.
+   virtual bool isIndexBased() const = 0;
+};
diff --git a/CometSearch/search/PiStrategy.cpp b/CometSearch/search/PiStrategy.cpp
new file mode 100644
index 00000000..fd55a592
--- /dev/null
+++ b/CometSearch/search/PiStrategy.cpp
@@ -0,0 +1,179 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Common.h"
+#include "PiStrategy.h"
+#include "SearchUtils.h"
+#include "CometPreprocess.h"
+#include "CometSearch.h"
+#include "CometPostAnalysis.h"
+#include "MSReader.h"
+
+// Allocates the per-run preprocess and search memory pools, both sized by
+// g_staticParams.options.iNumThreads. Returns false as soon as either
+// allocation fails.
+// NOTE(review): Pipeline::run() returns without calling finalize() when this
+// fails, so a failure of the second allocation leaves the first pool
+// allocated -- confirm whether it should be released here.
+bool PiStrategy::initialize(SearchSession& session, ThreadPool* tp)
+{
+   (void)session;
+   (void)tp;
+
+   // The peptide index is loaded lazily on first access inside
+   // CometSearch::RunSearch -> SearchPeptideIndex. No explicit
+   // ReadPeptideIndex() call is needed here.
+
+   if (!CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads))
+      return false;
+
+   if (!CometSearch::AllocateMemory(g_staticParams.options.iNumThreads))
+      return false;
+
+   return true;
+}
+
+// Opens the .idx database and, when available, its companion FASTA (the
+// database name minus its last four characters).
+//   fpidx   - receives the .idx handle
+//   fpdb    - receives the same handle as fpidx
+//   fpfasta - receives the FASTA handle, or nullptr if it cannot be opened,
+//             in which case session.bIdxNoFasta is set
+// Leaves all three handles nullptr and returns true when no database search
+// is requested. Returns false, after recording the error in g_cometStatus,
+// when the .idx file cannot be opened.
+bool PiStrategy::openFiles(const std::string& szDatabase,
+                           FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                           SearchSession& session)
+{
+   fpfasta = nullptr;
+   fpidx = nullptr;
+   fpdb = nullptr;
+
+   if (!session.bPerformDatabaseSearch)
+      return true;
+
+   string sTmpDB = szDatabase;
+
+   if ((fpidx = fopen(sTmpDB.c_str(), "r")) == nullptr)
+   {
+      string strErrorMsg = " Error (1a) - cannot read .idx file \"" + sTmpDB + "\".\n";
+      g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+      logerr(strErrorMsg);
+      return false;
+   }
+
+   // Try to open the companion .fasta (not required for PI_DB search).
+   // Strip the 4-character ".idx" extension only when the name is longer than
+   // the extension: erase(size() - 4) on a shorter name wraps around and
+   // throws std::out_of_range.
+   if (sTmpDB.size() > 4)
+   {
+      sTmpDB.erase(sTmpDB.size() - 4);
+      fpfasta = fopen(sTmpDB.c_str(), "r");
+   }
+
+   if (fpfasta == nullptr)
+      session.bIdxNoFasta = true;
+
+   fpdb = fpidx;
+
+   return true;
+}
+
+// Runs one batch: loads and preprocesses spectra into session.queries,
+// allocates result storage, sorts the queries by experimental peptide mass,
+// derives g_massRange from the first and last query, runs the database
+// and/or spectral-library search and, for database searches, post-analysis.
+// iPercentStart/iPercentEnd are advanced from mstReader.getPercent() right
+// after loading. Returns true for an empty batch; false on any failure,
+// error status or cancellation.
+bool PiStrategy::executeBatch(MSToolkit::MSReader& mstReader,
+                              int iFirstScan, int iLastScan, int iAnalysisType,
+                              int& iPercentStart, int& iPercentEnd,
+                              ThreadPool* tp, SearchSession& session)
+{
+   g_cometStatus.SetStatusMsg(string("Loading and processing input spectra"));
+
+   bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra(
+      mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session);
+
+   iPercentStart = iPercentEnd;
+   iPercentEnd = mstReader.getPercent();
+
+   if (!bSucceeded)
+      return false;
+
+   if (session.queries.empty())
+      return true;
+
+   bSucceeded = AllocateResultsMem(session.queries);
+   if (!bSucceeded)
+      return false;
+
+   {
+      string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n");
+      g_cometStatus.SetStatusMsg(strStatusMsg);
+   }
+
+   std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass);
+
+   g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus;
+   g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus;
+
+   if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass)
+      g_massRange.bNarrowMassRange = true;
+   else
+      g_massRange.bNarrowMassRange = false;
+
+   bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel();
+   if (!bSucceeded)
+      return false;
+
+   g_cometStatus.SetStatusMsg(string("Running search..."));
+
+   if (session.bPerformDatabaseSearch)
+      bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries);
+   if (bSucceeded && session.bPerformSpecLibSearch)
+      bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries);
+
+   if (!bSucceeded)
+      return false;
+
+   bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel();
+   if (!bSucceeded)
+      return false;
+
+   if (session.bPerformDatabaseSearch)
+   {
+      g_cometStatus.SetStatusMsg(string("Performing post-search analysis ..."));
+      bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries);
+   }
+
+   return bSucceeded;
+}
+
+// Closes the .idx and FASTA handles opened by openFiles(); nullptr is skipped.
+void PiStrategy::closeFiles(FILE* fpfasta, FILE* fpidx)
+{
+   if (fpidx != nullptr) fclose(fpidx);
+   if (fpfasta != nullptr) fclose(fpfasta);
+}
+
+// Releases the memory pools allocated by initialize().
+void PiStrategy::finalize()
+{
+   CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads);
+   CometSearch::DeallocateMemory(g_staticParams.options.iNumThreads);
+}
diff --git a/CometSearch/search/PiStrategy.h b/CometSearch/search/PiStrategy.h
new file mode 100644
index 00000000..5634b693
--- /dev/null
+++ b/CometSearch/search/PiStrategy.h
@@ -0,0 +1,39 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "ISearchStrategy.h"
+
+// Search strategy for PI_DB (plain peptide index) batch searches.
+//
+// initialize(): allocates search and preprocess memory pools; the peptide
+// index itself is loaded lazily by SearchPeptideIndex on first use.
+// executeBatch(): LoadAndPreprocessSpectra -> RunSearch (PI path) -> PostAnalysis.
+// finalize(): frees memory pools.
+class PiStrategy : public ISearchStrategy
+{
+public:
+   // Allocates the preprocess and search memory pools; false on failure.
+   bool initialize(SearchSession& session, ThreadPool* tp) override;
+   // Opens the .idx file named by szDatabase (returned in fpidx and fpdb)
+   // and, when it exists, the companion FASTA (returned in fpfasta).
+   bool openFiles(const std::string& szDatabase,
+                  FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb,
+                  SearchSession& session) override;
+   // Loads, searches and post-analyzes one batch of spectra; updates the
+   // two progress percentages passed by reference.
+   bool executeBatch(MSToolkit::MSReader& mstReader,
+                     int iFirstScan, int iLastScan, int iAnalysisType,
+                     int& iPercentStart, int& iPercentEnd,
+                     ThreadPool* tp, SearchSession& session) override;
+   // Closes the handles produced by openFiles(); nullptr handles are skipped.
+   void closeFiles(FILE* fpfasta, FILE* fpidx) override;
+   // Frees the memory pools allocated by initialize().
+   void finalize() override;
+   // Always true: this strategy searches a pre-built index.
+   bool isIndexBased() const override { return true; }
+};
diff --git a/CometSearch/search/Pipeline.cpp b/CometSearch/search/Pipeline.cpp
new file mode 100644
index 00000000..1dfdd239
--- /dev/null
+++ b/CometSearch/search/Pipeline.cpp
@@ -0,0 +1,269 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Common.h"
+#include "Pipeline.h"
+#include "SearchUtils.h"
+#include "CometPreprocess.h"
+#include "CometMassSpecUtils.h"
+#include "MSReader.h"
+
+// Takes ownership of the search strategy and the result writers; pMgr is
+// stored as-is and handed to every writer through WriterOpenCtx.
+Pipeline::Pipeline(std::unique_ptr<ISearchStrategy> strategy,
+                   std::vector<std::unique_ptr<IResultWriter>> writers,
+                   CometSearchManager* pMgr)
+   : _strategy(std::move(strategy))
+   , _writers(std::move(writers))
+   , _pMgr(pMgr)
+{
+}
+
+// Drives the whole search: strategy initialize, then for every input file
+// open database + writers, loop over spectrum batches (search, sort by scan
+// number, write), report timing, close everything; finally strategy finalize.
+// Stops at the first failing file. Returns false on any failure and also
+// when no spectra were searched across all files.
+bool Pipeline::run(SearchSession& session,
+                   const std::vector<InputFileInfo*>& files,
+                   ThreadPool& tp)
+{
+   auto tGlobalStart = chrono::steady_clock::now();
+
+   if (!_strategy->initialize(session, &tp))
+      return false;
+
+   bool bSucceeded = true;
+   int iTotalAllFiles = 0;  // spectra searched across all files (for blank-file check)
+
+   for (auto pFile : files)
+   {
+      if (!UpdateInputFile(pFile))
+      {
+         bSucceeded = false;
+         break;
+      }
+
+      int iFirstScan = g_staticParams.inputFile.iFirstScan;
+      int iLastScan = g_staticParams.inputFile.iLastScan;
+      int iPercentStart = 0;
+      int iPercentEnd = 0;
+      int iAnalysisType = g_staticParams.inputFile.iAnalysisType;
+
+      // Print search-start banner for FASTA searches.
+      if (!g_staticParams.options.bOutputSqtStream && !_strategy->isIndexBased())
+      {
+         time_t tStartTime;
+         time(&tStartTime);
+         strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tStartTime));
+
+         string strOut = " Search start: " + string(g_staticParams.szDate) + "\n";
+         strOut += " - Input file: " + string(g_staticParams.inputFile.szFileName) + "\n";
+         logout(strOut);
+         fflush(stdout);
+      }
+
+      // Open database file handles (strategy-specific: .idx or .fasta).
+      FILE* fpfasta = nullptr;
+      FILE* fpidx = nullptr;
+      FILE* fpdb = nullptr;
+
+      if (!_strategy->openFiles(g_staticParams.databaseInfo.szDatabase,
+                                fpfasta, fpidx, fpdb, session))
+      {
+         bSucceeded = false;
+         break;
+      }
+
+      if (g_staticParams.options.iSpectrumBatchSize == 0 && !_strategy->isIndexBased())
+      {
+         logout(" - Reading all spectra into memory; set \"spectrum_batch_size\" if search terminates here.\n");
+         fflush(stdout);
+      }
+
+      // Open writers (after openFiles so session.bIdxNoFasta is correctly set).
+      WriterOpenCtx woctx;
+      woctx.szBaseName = g_staticParams.inputFile.szBaseName;
+      woctx.szOutputSuffix = g_staticParams.szOutputSuffix;
+      woctx.szTxtFileExt = g_staticParams.szTxtFileExt;
+      woctx.bEntireFile = (iAnalysisType == AnalysisType_EntireFile);
+      woctx.iFirstScan = iFirstScan;
+      woctx.iLastScan = iLastScan;
+      woctx.iDecoySearch = g_staticParams.options.iDecoySearch;
+      woctx.bIdxNoFasta = session.bIdxNoFasta;
+      woctx.pMgr = _pMgr;
+
+      // Count successful opens so that a failure part-way through the list
+      // can close exactly the writers that already hold open output files.
+      size_t tWritersOpened = 0;
+      for (auto& pw : _writers)
+      {
+         if (!pw->open(woctx))
+         {
+            bSucceeded = false;
+            break;
+         }
+         ++tWritersOpened;
+      }
+
+      if (!bSucceeded)
+      {
+         _strategy->closeFiles(fpfasta, fpidx);
+
+         // Nothing was searched for this file: close as failed and empty,
+         // the same arguments the normal path uses after a first-batch failure.
+         for (size_t i = 0; i < tWritersOpened; ++i)
+            _writers[i]->close(false, true);
+
+         break;
+      }
+
+      // MSReader setup.
+      MSReader mstReader;
+      SetMSLevelFilter(mstReader);
+      CometPreprocess::Reset();
+
+      // Print "searching..." message for index-based searches.
+      auto tBeginTime = chrono::steady_clock::now();
+      if (_strategy->isIndexBased())
+      {
+         printf(" - searching \"%s\" ... ", g_staticParams.inputFile.szBaseName);
+         fflush(stdout);
+      }
+
+      int iTotalSpectraSearched = 0;
+      int iBatchNum = 0;
+
+      while (!CometPreprocess::DoneProcessingAllSpectra())
+      {
+         iBatchNum++;
+
+         bSucceeded = _strategy->executeBatch(mstReader,
+                                              iFirstScan, iLastScan, iAnalysisType,
+                                              iPercentStart, iPercentEnd,
+                                              &tp, session);
+
+         // An empty successful batch has nothing to write or free.
+         if (bSucceeded && !session.queries.empty())
+         {
+            iTotalSpectraSearched += (int)session.queries.size();
+
+            // Sort by scan number (shared by all paths; SQT writes last, which modifies szMod).
+            std::sort(session.queries.begin(), session.queries.end(), compareByScanNumber);
+
+            if (!g_staticParams.options.bOutputSqtStream && !_strategy->isIndexBased())
+            {
+               logout(" done\n");
+               fflush(stdout);
+            }
+
+            // Per-batch write; stop at the first writer that fails.
+            WriterWriteCtx wwctx;
+            wwctx.fpdb = fpdb;
+            wwctx.iScanOffset = iTotalSpectraSearched - (int)session.queries.size();
+            wwctx.iBatchNum = iBatchNum;
+            wwctx.pQueries = &session.queries;
+
+            for (auto& pw : _writers)
+            {
+               if (!pw->write(wwctx))
+               {
+                  bSucceeded = false;
+                  break;
+               }
+            }
+         }
+
+         // Free this batch's queries on success and on failure alike.
+         for (auto it = session.queries.begin(); it != session.queries.end(); ++it)
+            delete (*it);
+         session.queries.clear();
+
+         if (!bSucceeded)
+            break;
+      }
+
+      // Per-file timing and run-stats message.
+      if (bSucceeded)
+      {
+         if (iTotalSpectraSearched == 0)
+            logout(" Warning - no spectra searched.\n");
+
+         if (!g_staticParams.options.bOutputSqtStream)
+         {
+            const auto duration = chrono::duration_cast<chrono::milliseconds>(
+               chrono::steady_clock::now() - tBeginTime);
+            double dTimePerSpectra = (iTotalSpectraSearched > 0)
+               ? (double)duration.count() / (double)iTotalSpectraSearched
+               : 0.0;
+
+            string strOut;
+            char buf[128];
+
+            if (!_strategy->isIndexBased())
+               strOut = " - Run stats: ";
+            else
+               strOut = "";
+
+            std::snprintf(buf, sizeof(buf), "%.2f", dTimePerSpectra);
+            strOut += CometMassSpecUtils::ElapsedTime(tBeginTime)
+               + " (" + std::to_string(iTotalSpectraSearched) + " spectra, "
+               + std::string(buf) + "ms/spec, ";
+
+            std::snprintf(buf, sizeof(buf), "%.0f", (dTimePerSpectra > 0.0) ? 1000.0 / dTimePerSpectra : 0.0);
+            strOut += std::string(buf) + "Hz";
+
+            if (!_strategy->isIndexBased())
+               strOut += ", " + CometMassSpecUtils::GetPeakMemory();
+
+            strOut += ")\n";
+            logout(strOut);
+         }
+
+         if (!g_staticParams.options.bOutputSqtStream && !_strategy->isIndexBased())
+         {
+            time_t tEndTime;
+            time(&tEndTime);
+            strftime(g_staticParams.szDate, 26, "%Y/%m/%d, %I:%M:%S %p", localtime(&tEndTime));
+            string strOut = " Search end: " + string(g_staticParams.szDate)
+               + " (" + CometMassSpecUtils::ElapsedTime(tGlobalStart)
+               + ", " + CometMassSpecUtils::GetPeakMemory() + ")\n\n";
+            logout(strOut);
+         }
+      }
+
+      _strategy->closeFiles(fpfasta, fpidx);
+
+      // Finalize and close writers.
+      {
+         bool bEmpty = (iTotalSpectraSearched == 0);
+         for (auto& pw : _writers)
+            pw->close(bSucceeded, bEmpty);
+      }
+
+      iTotalAllFiles += iTotalSpectraSearched;
+      g_staticParams.inputFile.szBaseName[0] = '\0';
+
+      if (!bSucceeded)
+         break;
+   }
+
+   _strategy->finalize();
+
+   // Print overall "done" banner for index-based searches.
+   if (_strategy->isIndexBased())
+   {
+      string strOut = " - done. (" + CometMassSpecUtils::ElapsedTime(tGlobalStart);
+      string strMemUse = CometMassSpecUtils::GetPeakMemory();
+      if (!strMemUse.empty())
+         strOut += ", " + strMemUse + ")";
+      else
+         strOut += ")";
+      strOut += "\n\n";
+      logout(strOut);
+   }
+
+   // Return false if no spectra were searched across all files (blank-file sentinel).
+   if (iTotalAllFiles == 0)
+      return false;
+
+   return bSucceeded;
+}
diff --git a/CometSearch/search/Pipeline.h b/CometSearch/search/Pipeline.h
new file mode 100644
index 00000000..38cc258e
--- /dev/null
+++ b/CometSearch/search/Pipeline.h
@@ -0,0 +1,47 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "ISearchStrategy.h"
+#include "../output/IResultWriter.h"
+#include "../CometSearchManager.h"
+#include <memory>
+#include <vector>
+
+// Pipeline drives the batch search for all input files.
+// It owns the strategy (which provides the per-batch search implementation)
+// and the result writers (which serialize results to disk).
+//
+// Typical call sequence from CometSearchManager::DoSearch():
+// Pipeline pipeline(std::move(strategy), std::move(writers), pMgr);
+// pipeline.run(session, g_pvInputFiles, *tp);
+class Pipeline
+{
+public:
+   Pipeline(std::unique_ptr<ISearchStrategy> strategy,
+            std::vector<std::unique_ptr<IResultWriter>> writers,
+            CometSearchManager* pMgr);
+
+   // Drives initialize -> per-file loop (open, batch-loop, close) -> finalize.
+   // Returns false if any file fails or no spectra are found across all files.
+   bool run(SearchSession& session,
+            const std::vector<InputFileInfo*>& files,
+            ThreadPool& tp);
+
+private:
+   std::unique_ptr<ISearchStrategy> _strategy;
+   std::vector<std::unique_ptr<IResultWriter>> _writers;
+   CometSearchManager* _pMgr;
+};
diff --git a/CometSearch/search/SearchUtils.h b/CometSearch/search/SearchUtils.h
new file mode 100644
index 00000000..318b2625
--- /dev/null
+++ b/CometSearch/search/SearchUtils.h
@@ -0,0 +1,234 @@
+// Copyright 2023 Jimmy Eng
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "Common.h"
+#include "CometDataInternal.h"
+
+// Shared inline utilities used by Pipeline and strategy classes.
+// All functions operate on globals (g_staticParams, g_cometStatus, etc.)
+// which are declared in CometDataInternal.h / Common.h.
+
+// -----------------------------------------------------------------------
+// HasSuffixIgnoreCase: true when pszFileName (of length iLen) ends with
+// pszSuffix, compared case-insensitively. The length check keeps the
+// comparison pointer inside the string when the name is shorter than the
+// suffix.
+// -----------------------------------------------------------------------
+inline static bool HasSuffixIgnoreCase(const char* pszFileName, int iLen, const char* pszSuffix)
+{
+   const int iSuffixLen = (int)strlen(pszSuffix);
+
+   return iLen >= iSuffixLen
+      && !STRCMP_IGNORE_CASE(pszFileName + iLen - iSuffixLen, pszSuffix);
+}
+
+// -----------------------------------------------------------------------
+// Input type detection (file extension -> InputType enum)
+// Returns InputType_UNKNOWN when no known extension matches, including
+// names that are shorter than every known extension.
+// -----------------------------------------------------------------------
+inline static InputType GetInputType(const char* pszFileName)
+{
+   const int iLen = (int)strlen(pszFileName);
+
+   if (HasSuffixIgnoreCase(pszFileName, iLen, ".mzXML")
+         || HasSuffixIgnoreCase(pszFileName, iLen, ".mzML")
+         || HasSuffixIgnoreCase(pszFileName, iLen, ".mzXML.gz")
+         || HasSuffixIgnoreCase(pszFileName, iLen, ".mzML.gz"))
+   {
+      return InputType_MZXML;
+   }
+
+   if (HasSuffixIgnoreCase(pszFileName, iLen, ".raw"))
+      return InputType_RAW;
+
+   if (HasSuffixIgnoreCase(pszFileName, iLen, ".ms2")
+         || HasSuffixIgnoreCase(pszFileName, iLen, ".cms2"))
+   {
+      return InputType_MS2;
+   }
+
+   if (HasSuffixIgnoreCase(pszFileName, iLen, ".mgf"))
+      return InputType_MGF;
+
+   return InputType_UNKNOWN;
+}
+
+// -----------------------------------------------------------------------
+// UpdateInputFile: sets g_staticParams.inputFile from pFileInfo.
+// Returns false on unknown type or if file cannot be opened.
+// -----------------------------------------------------------------------
+inline static bool UpdateInputFile(InputFileInfo* pFileInfo)
+{
+   bool bUpdateBaseName = false;
+   char szTmpBaseName[SIZE_FILE];
+
+   // Keep a caller-supplied base name only for a single-file run; it is
+   // saved here because the struct copy below overwrites it.
+   if (g_staticParams.inputFile.szBaseName[0] == '\0' || g_pvInputFiles.size() > 1)
+      bUpdateBaseName = true;
+   else
+      strcpy(szTmpBaseName, g_staticParams.inputFile.szBaseName);
+
+   g_staticParams.inputFile = *pFileInfo;
+   g_staticParams.inputFile.iInputType = GetInputType(g_staticParams.inputFile.szFileName);
+
+   if (InputType_UNKNOWN == g_staticParams.inputFile.iInputType)
+      return false;
+
+   // Probe that the file can be opened for reading; it is closed right away.
+   FILE* fp;
+   if ((fp = fopen(g_staticParams.inputFile.szFileName, "r")) == NULL)
+   {
+      string strErrorMsg = " Error - cannot read input file \"" + string(g_staticParams.inputFile.szFileName) + "\".\n";
+      g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+      logerr(strErrorMsg);
+      return false;
+   }
+   else
+   {
+      fclose(fp);
+   }
+
+#ifndef CRUX
+   if (bUpdateBaseName)
+   {
+      char* pStr;
+      int iLen = (int)strlen(g_staticParams.inputFile.szFileName);
+
+      // Base name = file name with its final extension removed.
+      strcpy(g_staticParams.inputFile.szBaseName, g_staticParams.inputFile.szFileName);
+
+      if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '.')))
+         *pStr = '\0';
+
+      // Compressed inputs carry a second extension to remove. The iLen checks
+      // keep the suffix pointer inside szFileName for names shorter than the
+      // suffix (e.g. "a.ms2").
+      if ((iLen >= 9 && !STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 9, ".mzXML.gz"))
+            || (iLen >= 8 && !STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 8, ".mzML.gz")))
+      {
+         if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '.')))
+            *pStr = '\0';
+      }
+   }
+   else
+   {
+      strcpy(g_staticParams.inputFile.szBaseName, szTmpBaseName);
+   }
+#endif
+
+   return true;
+}
+
+// -----------------------------------------------------------------------
+// SetMSLevelFilter: configure MSReader to read the right MS level.
+// -----------------------------------------------------------------------
+inline static void SetMSLevelFilter(MSReader& mstReader)
+{
+   vector<MSSpectrumType> msLevel;
+
+   // Any iMSLevel other than 1, 2 or 3 leaves the filter list empty.
+   if (g_staticParams.options.iMSLevel == 3)
+      msLevel.push_back(MS3);
+   else if (g_staticParams.options.iMSLevel == 2)
+      msLevel.push_back(MS2);
+   else if (g_staticParams.options.iMSLevel == 1)
+      msLevel.push_back(MS1);
+
+   mstReader.setFilter(msLevel);
+}
+
+// -----------------------------------------------------------------------
+// ResetResultsEntry: puts the scoring fields of one Results entry into
+// their pre-search state (xcorr starts at the configured minimum).
+// -----------------------------------------------------------------------
+inline static void ResetResultsEntry(Results& entry)
+{
+   entry.dPepMass = 0.0;
+   entry.dExpect = 999;
+   entry.fScoreSp = 0.0;
+   entry.fXcorr = (float)g_staticParams.options.dMinimumXcorr;
+   entry.fAScorePro = 0.0;
+   entry.usiLenPeptide = 0;
+   entry.usiRankSp = 0;
+   entry.usiMatchedIons = 0;
+   entry.usiTotalIons = 0;
+   entry.szPeptide[0] = '\0';
+   entry.sAScoreProSiteScores.clear();
+   entry.pWhichProtein.clear();
+   entry.sPeffOrigResidues.clear();
+   entry.iPeffOrigResiduePosition = -9;
+}
+
+// -----------------------------------------------------------------------
+// AllocateResultsMem: allocate _pResults (and optionally _pDecoys) for
+// every Query* in the batch, and zero-initialize scoring fields.
+// _pDecoys is allocated only when iDecoySearch == 2. Returns false, after
+// recording the error in g_cometStatus, when an allocation fails; queries
+// processed before the failure keep their allocations.
+// -----------------------------------------------------------------------
+inline static bool AllocateResultsMem(std::vector<Query*>& queries)
+{
+   const int iNumStored = g_staticParams.options.iNumStored;
+   const bool bSeparateDecoys = (g_staticParams.options.iDecoySearch == 2);
+
+   for (Query* pQuery : queries)
+   {
+      try
+      {
+         pQuery->_pResults = new Results[iNumStored];
+      }
+      catch (const std::bad_alloc& ba)
+      {
+         string strErrorMsg = " Error - new(_pResults[]). bad_alloc: \"" + std::string(ba.what()) + "\".\n";
+         g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+         logerr(strErrorMsg);
+         return false;
+      }
+
+      if (bSeparateDecoys)
+      {
+         try
+         {
+            pQuery->_pDecoys = new Results[iNumStored];
+         }
+         catch (const std::bad_alloc& ba)
+         {
+            string strErrorMsg = " Error - new(_pDecoys[]). bad_alloc: " + std::string(ba.what()) + "\n";
+            g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg);
+            logerr(strErrorMsg);
+            return false;
+         }
+      }
+
+      pQuery->iMatchPeptideCount = 0;
+      pQuery->iDecoyMatchPeptideCount = 0;
+
+      // The histogram belongs to the query, not to a result slot, so it is
+      // cleared once per query instead of once per stored result.
+      memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram));
+
+      for (int j = 0; j < iNumStored; ++j)
+      {
+         ResetResultsEntry(pQuery->_pResults[j]);
+
+         if (g_staticParams.options.iDecoySearch)
+            pQuery->_pResults[j].pWhichDecoyProtein.clear();
+
+         if (bSeparateDecoys)
+            ResetResultsEntry(pQuery->_pDecoys[j]);
+      }
+   }
+
+   return true;
+}
+
+// -----------------------------------------------------------------------
+// Query sort comparators
+// 
-----------------------------------------------------------------------
+// Orders queries by ascending experimental peptide mass.
+inline static bool compareByPeptideMass(Query const* a, Query const* b)
+{
+   const bool bLighter = a->_pepMassInfo.dExpPepMass < b->_pepMassInfo.dExpPepMass;
+   return bLighter;
+}
+
+// Orders queries by ascending dMangoIndex.
+inline static bool compareByMangoIndex(Query const* a, Query const* b)
+{
+   const bool bLowerIndex = a->dMangoIndex < b->dMangoIndex;
+   return bLowerIndex;
+}
+
+// Orders queries by ascending scan number; equal scans by ascending charge state.
+inline static bool compareByScanNumber(Query const* a, Query const* b)
+{
+   if (a->_spectrumInfoInternal.iScanNumber != b->_spectrumInfoInternal.iScanNumber)
+      return (a->_spectrumInfoInternal.iScanNumber < b->_spectrumInfoInternal.iScanNumber);
+
+   return (a->_spectrumInfoInternal.usiChargeState < b->_spectrumInfoInternal.usiChargeState);
+}
diff --git a/docs/20260612_architecture_migration.md b/docs/20260612_architecture_migration.md index ff139abe..df81cae5 100644 --- a/docs/20260612_architecture_migration.md +++ b/docs/20260612_architecture_migration.md @@ -7,6 +7,22 @@ Behavior is unchanged at every step; each phase is independently compilable and --- +## Status (as of 2026-06-13) + +| Phase | Description | State | Commit | +|-------|-------------|-------|--------| +| 1 | Split `CometDataInternal.h` | **Complete** | `4337ee8d` | +| 2 | Extract `SearchMemoryPool` | **Complete** | `4337ee8d` | +| 3 | Extract `IResultWriter` | **Complete** | `4337ee8d` | +| 4 | Introduce `SearchSession` | **Complete** | `00e0655f` | +| 5 | Extract `ISearchStrategy` + `Pipeline` | **Complete** | uncommitted | +| 6+ | Further decomposition (index/, spectrum/, scoring/) | Planned | — | + +All phases verified: 17/17 unit tests pass; HeLa FI_DB batch parity confirmed at +each phase boundary (zero PSM diff at 1 % and 5 % FDR, xcorr and e-value). 
+ +--- + ## Background The codebase has six structural pathologies that this plan addresses in order of @@ -104,6 +120,8 @@ CometSearch/ ## Phase 1 — Split `CometDataInternal.h` +**Status**: Complete — committed `4337ee8d` + **Effort**: ~1 day **Risk**: Low (mechanical split, no logic changes) ### Problem @@ -223,6 +241,8 @@ python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass ## Phase 2 — Extract `SearchMemoryPool` +**Status**: Complete — committed `4337ee8d` + **Effort**: ~1 day **Risk**: Low (self-contained, well-tested at runtime) ### Problem @@ -376,6 +396,8 @@ python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass ## Phase 3 — Extract `IResultWriter` +**Status**: Complete — committed `4337ee8d` + **Effort**: ~2 days **Risk**: Medium (touches writer internals) ### Problem @@ -523,6 +545,8 @@ python3 tests/unit/run_tests.py --comet comet.exe ## Phase 4 — Introduce `SearchSession` +**Status**: Complete — committed `00e0655f` + **Effort**: ~3 days **Risk**: Medium-high (many call sites) ### Problem @@ -656,6 +680,8 @@ python3 tests/unit/run_tests.py --comet comet.exe ## Phase 5 — Extract `ISearchStrategy` and `Pipeline` +**Status**: Complete — uncommitted (working tree on `batch_FI_optimization`) + **Effort**: ~1 week **Risk**: High (most invasive refactor) ### Problem @@ -849,6 +875,54 @@ python3 tests/unit/run_tests.py --comet comet.exe # all 17 must pass # confirm RTS path still compiles and executes via RealtimeSearch.exe smoke test ``` +### Actual implementation notes + +The interface as built is more fine-grained than the plan above. 
The plan had a +single `execute(file, session, pool, tp)` per file; the actual `ISearchStrategy` +splits the per-file work into four methods so the common per-file loop (MSReader +setup, writer open/close, batch while-loop, timing) can live in `Pipeline::run()` +without duplication across three strategies: + +```cpp +virtual bool initialize(SearchSession& session, ThreadPool* tp) = 0; +virtual bool openFiles(const std::string& szDatabase, + FILE*& fpfasta, FILE*& fpidx, FILE*& fpdb, + SearchSession& session) = 0; +virtual bool executeBatch(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session) = 0; +virtual void closeFiles(FILE* fpfasta, FILE* fpidx) = 0; +virtual void finalize() = 0; +virtual bool isIndexBased() const = 0; +``` + +`iPercentStart`/`iPercentEnd` are passed by reference so each strategy can update +them after `LoadAndPreprocessSpectra` but before calling `RunSearch`, preserving +the exact progress-reporting semantics of the original code. + +A `search/SearchUtils.h` header was added to hold utility functions extracted +from `CometSearchManager.cpp` statics (`GetInputType`, `UpdateInputFile`, +`SetMSLevelFilter`, `AllocateResultsMem`, `compareByPeptideMass`, +`compareByMangoIndex`, `compareByScanNumber`). These are inline functions +shared by all three strategy `.cpp` files without circular includes. + +The early-return index-build paths (`bCreateFragmentIndex`, `bCreatePeptideIndex`) +remain in `DoSearch()` as early returns before `makeStrategy()` is called, rather +than being absorbed into strategy `initialize()`. This avoids adding "are we +done?" signaling between strategy and pipeline for what is conceptually a +separate, one-shot operation. 
+ +### Results + +- Build: clean on Linux (gcc, c++20), no new errors +- Unit tests: 17/17 pass +- HeLa FI_DB parity: `20250520_Hela_60min_06.mzXML` vs `human.canonical.target-decoy.fasta.idx`, + trypsin + phospho + oxidation, 49,747 spectra + Pre-Phase5 (commit `00e0655f`): 16,559 xcorr PSMs @ 1% FDR, 18,458 evalue PSMs @ 1% + Phase5 (working tree): 16,559 xcorr PSMs @ 1% FDR, 18,458 evalue PSMs @ 1% + Diff: **zero unique PSMs** at 1% and 5% FDR for both xcorr and evalue sorting + --- ## Build System From c5e076c3b0a5c34c6b2c119ccbc5163048ff29f0 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Sat, 13 Jun 2026 08:38:45 -0700 Subject: [PATCH 04/15] fix Windows build: update CometSearch.vcxproj and zconf.h for subdirectory sources CometSearch.vcxproj was missing all subdirectory sources added in Phases 2-5: - ClCompile: threading/SearchMemoryPool.cpp, search/Fasta/Fi/PiStrategy.cpp, search/Pipeline.cpp - ClInclude: core/, output/, threading/, search/ headers - AdditionalIncludeDirectories: add $(ProjectDir) so subdirectory .cpp files can find root-level headers (Common.h, CometDataInternal.h, etc.) MSToolkit/include/zconf.h was generated by ./configure on Linux, which hardcodes "#if 1 define Z_HAVE_UNISTD_H" unconditionally, triggering inclusion of <unistd.h> on Windows (C1083). Add !defined(_WIN32) guard so MSVC skips the POSIX include. Removed from .gitignore so this fix is retained. Verified: full Comet.sln Release x64 build succeeds (Comet.exe + CometWrapper.dll + RealtimeSearch.exe); HeLa FI_DB search produces 16,578 xcorr PSMs @ 1% FDR and 18,458 evalue PSMs @ 1% FDR (evalue count identical to Linux build; xcorr 19-PSM delta is floating-point margin noise at the cutoff boundary). 
Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 3 +- CometSearch/CometSearch.vcxproj | 26 +- MSToolkit/include/zconf.h | 534 ++++++++++++++++++++++++++++++++ 3 files changed, 560 insertions(+), 3 deletions(-) create mode 100644 MSToolkit/include/zconf.h diff --git a/.gitignore b/.gitignore index 4a36fb52..211df614 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,10 @@ MSToolkit/extern/expat-2.2.9/ # ignore dynamically generated files MSToolkit/include/expat.h MSToolkit/include/expat_external.h -MSToolkit/include/zconf.h MSToolkit/include/zlib.h MSToolkit/*.mri +# zconf.h is tracked: it needs a !defined(_WIN32) guard added to the +# configure-generated Z_HAVE_UNISTD_H block (see commit message for details) .DS_Store .idea diff --git a/CometSearch/CometSearch.vcxproj b/CometSearch/CometSearch.vcxproj index a2068b26..e0844c2b 100644 --- a/CometSearch/CometSearch.vcxproj +++ b/CometSearch/CometSearch.vcxproj @@ -47,7 +47,7 @@ Level3 Disabled - ..\MSToolkit\include;..\MSToolkit\include\extern;..\AScorePro\include + $(ProjectDir);..\MSToolkit\include;..\MSToolkit\include\extern;..\AScorePro\include Fast WIN32;WIN64;_WIN64;_MBCS;_CRT_SECURE_NO_DEPRECATE;_NOSQLITE;NOMINMAX;_HAS_STD_BYTE=0;RTS_TIMING_OFF;%(PreprocessorDefinitions) ProgramDatabase @@ -64,7 +64,7 @@ MaxSpeed false true - ..\MSToolkit\include;..\MSToolkit\include\extern;..\AScorePro\include + $(ProjectDir);..\MSToolkit\include;..\MSToolkit\include\extern;..\AScorePro\include AnySuitable Speed true @@ -116,6 +116,23 @@ + + + + + + + + + + + + + + + + + @@ -136,6 +153,11 @@ + + + + + diff --git a/MSToolkit/include/zconf.h b/MSToolkit/include/zconf.h new file mode 100644 index 00000000..c9702960 --- /dev/null +++ b/MSToolkit/include/zconf.h @@ -0,0 +1,534 @@ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#ifndef ZCONF_H +#define ZCONF_H 
+ +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. + * Even better than compiling with -DZ_PREFIX would be to use configure to set + * this permanently in zconf.h using "./configure --zprefix". + */ +#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */ +# define Z_PREFIX_SET + +/* all linked symbols and init macros */ +# define _dist_code z__dist_code +# define _length_code z__length_code +# define _tr_align z__tr_align +# define _tr_flush_bits z__tr_flush_bits +# define _tr_flush_block z__tr_flush_block +# define _tr_init z__tr_init +# define _tr_stored_block z__tr_stored_block +# define _tr_tally z__tr_tally +# define adler32 z_adler32 +# define adler32_combine z_adler32_combine +# define adler32_combine64 z_adler32_combine64 +# define adler32_z z_adler32_z +# ifndef Z_SOLO +# define compress z_compress +# define compress2 z_compress2 +# define compressBound z_compressBound +# endif +# define crc32 z_crc32 +# define crc32_combine z_crc32_combine +# define crc32_combine64 z_crc32_combine64 +# define crc32_z z_crc32_z +# define deflate z_deflate +# define deflateBound z_deflateBound +# define deflateCopy z_deflateCopy +# define deflateEnd z_deflateEnd +# define deflateGetDictionary z_deflateGetDictionary +# define deflateInit z_deflateInit +# define deflateInit2 z_deflateInit2 +# define deflateInit2_ z_deflateInit2_ +# define deflateInit_ z_deflateInit_ +# define deflateParams z_deflateParams +# define deflatePending z_deflatePending +# define deflatePrime z_deflatePrime +# define deflateReset z_deflateReset +# define deflateResetKeep z_deflateResetKeep +# define deflateSetDictionary z_deflateSetDictionary +# define deflateSetHeader z_deflateSetHeader +# define deflateTune z_deflateTune +# define deflate_copyright z_deflate_copyright +# define get_crc_table z_get_crc_table +# ifndef Z_SOLO +# define gz_error z_gz_error +# define gz_intmax z_gz_intmax 
+# define gz_strwinerror z_gz_strwinerror +# define gzbuffer z_gzbuffer +# define gzclearerr z_gzclearerr +# define gzclose z_gzclose +# define gzclose_r z_gzclose_r +# define gzclose_w z_gzclose_w +# define gzdirect z_gzdirect +# define gzdopen z_gzdopen +# define gzeof z_gzeof +# define gzerror z_gzerror +# define gzflush z_gzflush +# define gzfread z_gzfread +# define gzfwrite z_gzfwrite +# define gzgetc z_gzgetc +# define gzgetc_ z_gzgetc_ +# define gzgets z_gzgets +# define gzoffset z_gzoffset +# define gzoffset64 z_gzoffset64 +# define gzopen z_gzopen +# define gzopen64 z_gzopen64 +# ifdef _WIN32 +# define gzopen_w z_gzopen_w +# endif +# define gzprintf z_gzprintf +# define gzputc z_gzputc +# define gzputs z_gzputs +# define gzread z_gzread +# define gzrewind z_gzrewind +# define gzseek z_gzseek +# define gzseek64 z_gzseek64 +# define gzsetparams z_gzsetparams +# define gztell z_gztell +# define gztell64 z_gztell64 +# define gzungetc z_gzungetc +# define gzvprintf z_gzvprintf +# define gzwrite z_gzwrite +# endif +# define inflate z_inflate +# define inflateBack z_inflateBack +# define inflateBackEnd z_inflateBackEnd +# define inflateBackInit z_inflateBackInit +# define inflateBackInit_ z_inflateBackInit_ +# define inflateCodesUsed z_inflateCodesUsed +# define inflateCopy z_inflateCopy +# define inflateEnd z_inflateEnd +# define inflateGetDictionary z_inflateGetDictionary +# define inflateGetHeader z_inflateGetHeader +# define inflateInit z_inflateInit +# define inflateInit2 z_inflateInit2 +# define inflateInit2_ z_inflateInit2_ +# define inflateInit_ z_inflateInit_ +# define inflateMark z_inflateMark +# define inflatePrime z_inflatePrime +# define inflateReset z_inflateReset +# define inflateReset2 z_inflateReset2 +# define inflateResetKeep z_inflateResetKeep +# define inflateSetDictionary z_inflateSetDictionary +# define inflateSync z_inflateSync +# define inflateSyncPoint z_inflateSyncPoint +# define inflateUndermine z_inflateUndermine +# define 
inflateValidate z_inflateValidate +# define inflate_copyright z_inflate_copyright +# define inflate_fast z_inflate_fast +# define inflate_table z_inflate_table +# ifndef Z_SOLO +# define uncompress z_uncompress +# define uncompress2 z_uncompress2 +# endif +# define zError z_zError +# ifndef Z_SOLO +# define zcalloc z_zcalloc +# define zcfree z_zcfree +# endif +# define zlibCompileFlags z_zlibCompileFlags +# define zlibVersion z_zlibVersion + +/* all zlib typedefs in zlib.h and zconf.h */ +# define Byte z_Byte +# define Bytef z_Bytef +# define alloc_func z_alloc_func +# define charf z_charf +# define free_func z_free_func +# ifndef Z_SOLO +# define gzFile z_gzFile +# endif +# define gz_header z_gz_header +# define gz_headerp z_gz_headerp +# define in_func z_in_func +# define intf z_intf +# define out_func z_out_func +# define uInt z_uInt +# define uIntf z_uIntf +# define uLong z_uLong +# define uLongf z_uLongf +# define voidp z_voidp +# define voidpc z_voidpc +# define voidpf z_voidpf + +/* all zlib structs in zlib.h and zconf.h */ +# define gz_header_s z_gz_header_s +# define internal_state z_internal_state + +#endif + +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif +#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) +# define OS2 +#endif +#if defined(_WINDOWS) && !defined(WINDOWS) +# define WINDOWS +#endif +#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) +# ifndef WIN32 +# define WIN32 +# endif +#endif +#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) +# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) +# ifndef SYS16BIT +# define SYS16BIT +# endif +# endif +#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). 
+ */ +#ifdef SYS16BIT +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#ifdef __STDC_VERSION__ +# ifndef STDC +# define STDC +# endif +# if __STDC_VERSION__ >= 199901L +# ifndef STDC99 +# define STDC99 +# endif +# endif +#endif +#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) +# define STDC +#endif +#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) +# define STDC +#endif +#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) +# define STDC +#endif +#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) +# define STDC +#endif + +#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */ +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const /* note: need a more gentle solution here */ +# endif +#endif + +#if defined(ZLIB_CONST) && !defined(z_const) +# define z_const const +#else +# define z_const +#endif + +#ifdef Z_SOLO + typedef unsigned long z_size_t; +#else +# define z_longlong long long +# if defined(NO_SIZE_T) + typedef unsigned NO_SIZE_T z_size_t; +# elif defined(STDC) +# include + typedef size_t z_size_t; +# else + typedef unsigned long z_size_t; +# endif +# undef z_longlong +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. 
For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus about 7 kilobytes + for small objects. +*/ + + /* Type declarations */ + +#ifndef OF /* function prototypes */ +# ifdef STDC +# define OF(args) args +# else +# define OF(args) () +# endif +#endif + +#ifndef Z_ARG /* function prototypes for stdarg */ +# if defined(STDC) || defined(Z_HAVE_STDARG_H) +# define Z_ARG(args) args +# else +# define Z_ARG(args) () +# endif +#endif + +/* The following definitions for FAR are needed only for MSDOS mixed + * model programming (small or medium model with some far allocations). + * This was tested only with MSC; for other MSDOS compilers you may have + * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, + * just define FAR to be empty. + */ +#ifdef SYS16BIT +# if defined(M_I86SM) || defined(M_I86MM) + /* MSC small or medium model */ +# define SMALL_MEDIUM +# ifdef _MSC_VER +# define FAR _far +# else +# define FAR far +# endif +# endif +# if (defined(__SMALL__) || defined(__MEDIUM__)) + /* Turbo C small or medium model */ +# define SMALL_MEDIUM +# ifdef __BORLANDC__ +# define FAR _far +# else +# define FAR far +# endif +# endif +#endif + +#if defined(WINDOWS) || defined(WIN32) + /* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. + */ +# ifdef ZLIB_DLL +# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) +# ifdef ZLIB_INTERNAL +# define ZEXTERN extern __declspec(dllexport) +# else +# define ZEXTERN extern __declspec(dllimport) +# endif +# endif +# endif /* ZLIB_DLL */ + /* If building or using zlib with the WINAPI/WINAPIV calling convention, + * define ZLIB_WINAPI. 
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. + */ +# ifdef ZLIB_WINAPI +# ifdef FAR +# undef FAR +# endif +# include + /* No need for _export, use ZLIB.DEF instead. */ + /* For complete Windows compatibility, use WINAPI, not __stdcall. */ +# define ZEXPORT WINAPI +# ifdef WIN32 +# define ZEXPORTVA WINAPIV +# else +# define ZEXPORTVA FAR CDECL +# endif +# endif +#endif + +#if defined (__BEOS__) +# ifdef ZLIB_DLL +# ifdef ZLIB_INTERNAL +# define ZEXPORT __declspec(dllexport) +# define ZEXPORTVA __declspec(dllexport) +# else +# define ZEXPORT __declspec(dllimport) +# define ZEXPORTVA __declspec(dllimport) +# endif +# endif +#endif + +#ifndef ZEXTERN +# define ZEXTERN extern +#endif +#ifndef ZEXPORT +# define ZEXPORT +#endif +#ifndef ZEXPORTVA +# define ZEXPORTVA +#endif + +#ifndef FAR +# define FAR +#endif + +#if !defined(__MACTYPES__) +typedef unsigned char Byte; /* 8 bits */ +#endif +typedef unsigned int uInt; /* 16 bits or more */ +typedef unsigned long uLong; /* 32 bits or more */ + +#ifdef SMALL_MEDIUM + /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ +# define Bytef Byte FAR +#else + typedef Byte FAR Bytef; +#endif +typedef char FAR charf; +typedef int FAR intf; +typedef uInt FAR uIntf; +typedef uLong FAR uLongf; + +#ifdef STDC + typedef void const *voidpc; + typedef void FAR *voidpf; + typedef void *voidp; +#else + typedef Byte const *voidpc; + typedef Byte FAR *voidpf; + typedef Byte *voidp; +#endif + +#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC) +# include +# if (UINT_MAX == 0xffffffffUL) +# define Z_U4 unsigned +# elif (ULONG_MAX == 0xffffffffUL) +# define Z_U4 unsigned long +# elif (USHRT_MAX == 0xffffffffUL) +# define Z_U4 unsigned short +# endif +#endif + +#ifdef Z_U4 + typedef Z_U4 z_crc_t; +#else + typedef unsigned long z_crc_t; +#endif + +#if 1 && !defined(_WIN32) /* was set to #if 1 by ./configure */ +# define Z_HAVE_UNISTD_H +#endif + +#if 1 /* was set to #if 1 by ./configure */ +# define 
Z_HAVE_STDARG_H +#endif + +#ifdef STDC +# ifndef Z_SOLO +# include /* for off_t */ +# endif +#endif + +#if defined(STDC) || defined(Z_HAVE_STDARG_H) +# ifndef Z_SOLO +# include /* for va_list */ +# endif +#endif + +#ifdef _WIN32 +# ifndef Z_SOLO +# include /* for wchar_t */ +# endif +#endif + +/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and + * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even + * though the former does not conform to the LFS document), but considering + * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as + * equivalently requesting no 64-bit operations + */ +#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1 +# undef _LARGEFILE64_SOURCE +#endif + +#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H) +# define Z_HAVE_UNISTD_H +#endif +#ifndef Z_SOLO +# if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE) +# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# ifdef VMS +# include /* for off_t */ +# endif +# ifndef z_off_t +# define z_off_t off_t +# endif +# endif +#endif + +#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0 +# define Z_LFS64 +#endif + +#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64) +# define Z_LARGE64 +#endif + +#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64) +# define Z_WANT64 +#endif + +#if !defined(SEEK_SET) && !defined(Z_SOLO) +# define SEEK_SET 0 /* Seek from beginning of file. */ +# define SEEK_CUR 1 /* Seek from current position. 
*/ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif + +#ifndef z_off_t +# define z_off_t long +#endif + +#if !defined(_WIN32) && defined(Z_LARGE64) +# define z_off64_t off64_t +#else +# if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO) +# define z_off64_t __int64 +# else +# define z_off64_t z_off_t +# endif +#endif + +/* MVS linker does not support external names larger than 8 bytes */ +#if defined(__MVS__) + #pragma map(deflateInit_,"DEIN") + #pragma map(deflateInit2_,"DEIN2") + #pragma map(deflateEnd,"DEEND") + #pragma map(deflateBound,"DEBND") + #pragma map(inflateInit_,"ININ") + #pragma map(inflateInit2_,"ININ2") + #pragma map(inflateEnd,"INEND") + #pragma map(inflateSync,"INSY") + #pragma map(inflateSetDictionary,"INSEDI") + #pragma map(compressBound,"CMBND") + #pragma map(inflate_table,"INTABL") + #pragma map(inflate_fast,"INFA") + #pragma map(inflate_copyright,"INCOPY") +#endif + +#endif /* ZCONF_H */ From e35417131cb68956f575129e27d5cc919efceb08 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Sat, 13 Jun 2026 08:49:39 -0700 Subject: [PATCH 05/15] revert: untrack MSToolkit/include/zconf.h, restore it to .gitignore zconf.h is generated by ./configure and should not be tracked. The project requires a clean between Linux and Windows builds when switching platforms. 
Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 3 +- MSToolkit/include/zconf.h | 534 -------------------------------------- 2 files changed, 1 insertion(+), 536 deletions(-) delete mode 100644 MSToolkit/include/zconf.h diff --git a/.gitignore b/.gitignore index 211df614..4a36fb52 100644 --- a/.gitignore +++ b/.gitignore @@ -8,10 +8,9 @@ MSToolkit/extern/expat-2.2.9/ # ignore dynamically generated files MSToolkit/include/expat.h MSToolkit/include/expat_external.h +MSToolkit/include/zconf.h MSToolkit/include/zlib.h MSToolkit/*.mri -# zconf.h is tracked: it needs a !defined(_WIN32) guard added to the -# configure-generated Z_HAVE_UNISTD_H block (see commit message for details) .DS_Store .idea diff --git a/MSToolkit/include/zconf.h b/MSToolkit/include/zconf.h deleted file mode 100644 index c9702960..00000000 --- a/MSToolkit/include/zconf.h +++ /dev/null @@ -1,534 +0,0 @@ -/* zconf.h -- configuration of the zlib compression library - * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -/* @(#) $Id$ */ - -#ifndef ZCONF_H -#define ZCONF_H - -/* - * If you *really* need a unique prefix for all types and library functions, - * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. - * Even better than compiling with -DZ_PREFIX would be to use configure to set - * this permanently in zconf.h using "./configure --zprefix". 
- */ -#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */ -# define Z_PREFIX_SET - -/* all linked symbols and init macros */ -# define _dist_code z__dist_code -# define _length_code z__length_code -# define _tr_align z__tr_align -# define _tr_flush_bits z__tr_flush_bits -# define _tr_flush_block z__tr_flush_block -# define _tr_init z__tr_init -# define _tr_stored_block z__tr_stored_block -# define _tr_tally z__tr_tally -# define adler32 z_adler32 -# define adler32_combine z_adler32_combine -# define adler32_combine64 z_adler32_combine64 -# define adler32_z z_adler32_z -# ifndef Z_SOLO -# define compress z_compress -# define compress2 z_compress2 -# define compressBound z_compressBound -# endif -# define crc32 z_crc32 -# define crc32_combine z_crc32_combine -# define crc32_combine64 z_crc32_combine64 -# define crc32_z z_crc32_z -# define deflate z_deflate -# define deflateBound z_deflateBound -# define deflateCopy z_deflateCopy -# define deflateEnd z_deflateEnd -# define deflateGetDictionary z_deflateGetDictionary -# define deflateInit z_deflateInit -# define deflateInit2 z_deflateInit2 -# define deflateInit2_ z_deflateInit2_ -# define deflateInit_ z_deflateInit_ -# define deflateParams z_deflateParams -# define deflatePending z_deflatePending -# define deflatePrime z_deflatePrime -# define deflateReset z_deflateReset -# define deflateResetKeep z_deflateResetKeep -# define deflateSetDictionary z_deflateSetDictionary -# define deflateSetHeader z_deflateSetHeader -# define deflateTune z_deflateTune -# define deflate_copyright z_deflate_copyright -# define get_crc_table z_get_crc_table -# ifndef Z_SOLO -# define gz_error z_gz_error -# define gz_intmax z_gz_intmax -# define gz_strwinerror z_gz_strwinerror -# define gzbuffer z_gzbuffer -# define gzclearerr z_gzclearerr -# define gzclose z_gzclose -# define gzclose_r z_gzclose_r -# define gzclose_w z_gzclose_w -# define gzdirect z_gzdirect -# define gzdopen z_gzdopen -# define gzeof z_gzeof -# define gzerror 
z_gzerror -# define gzflush z_gzflush -# define gzfread z_gzfread -# define gzfwrite z_gzfwrite -# define gzgetc z_gzgetc -# define gzgetc_ z_gzgetc_ -# define gzgets z_gzgets -# define gzoffset z_gzoffset -# define gzoffset64 z_gzoffset64 -# define gzopen z_gzopen -# define gzopen64 z_gzopen64 -# ifdef _WIN32 -# define gzopen_w z_gzopen_w -# endif -# define gzprintf z_gzprintf -# define gzputc z_gzputc -# define gzputs z_gzputs -# define gzread z_gzread -# define gzrewind z_gzrewind -# define gzseek z_gzseek -# define gzseek64 z_gzseek64 -# define gzsetparams z_gzsetparams -# define gztell z_gztell -# define gztell64 z_gztell64 -# define gzungetc z_gzungetc -# define gzvprintf z_gzvprintf -# define gzwrite z_gzwrite -# endif -# define inflate z_inflate -# define inflateBack z_inflateBack -# define inflateBackEnd z_inflateBackEnd -# define inflateBackInit z_inflateBackInit -# define inflateBackInit_ z_inflateBackInit_ -# define inflateCodesUsed z_inflateCodesUsed -# define inflateCopy z_inflateCopy -# define inflateEnd z_inflateEnd -# define inflateGetDictionary z_inflateGetDictionary -# define inflateGetHeader z_inflateGetHeader -# define inflateInit z_inflateInit -# define inflateInit2 z_inflateInit2 -# define inflateInit2_ z_inflateInit2_ -# define inflateInit_ z_inflateInit_ -# define inflateMark z_inflateMark -# define inflatePrime z_inflatePrime -# define inflateReset z_inflateReset -# define inflateReset2 z_inflateReset2 -# define inflateResetKeep z_inflateResetKeep -# define inflateSetDictionary z_inflateSetDictionary -# define inflateSync z_inflateSync -# define inflateSyncPoint z_inflateSyncPoint -# define inflateUndermine z_inflateUndermine -# define inflateValidate z_inflateValidate -# define inflate_copyright z_inflate_copyright -# define inflate_fast z_inflate_fast -# define inflate_table z_inflate_table -# ifndef Z_SOLO -# define uncompress z_uncompress -# define uncompress2 z_uncompress2 -# endif -# define zError z_zError -# ifndef Z_SOLO -# define 
zcalloc z_zcalloc -# define zcfree z_zcfree -# endif -# define zlibCompileFlags z_zlibCompileFlags -# define zlibVersion z_zlibVersion - -/* all zlib typedefs in zlib.h and zconf.h */ -# define Byte z_Byte -# define Bytef z_Bytef -# define alloc_func z_alloc_func -# define charf z_charf -# define free_func z_free_func -# ifndef Z_SOLO -# define gzFile z_gzFile -# endif -# define gz_header z_gz_header -# define gz_headerp z_gz_headerp -# define in_func z_in_func -# define intf z_intf -# define out_func z_out_func -# define uInt z_uInt -# define uIntf z_uIntf -# define uLong z_uLong -# define uLongf z_uLongf -# define voidp z_voidp -# define voidpc z_voidpc -# define voidpf z_voidpf - -/* all zlib structs in zlib.h and zconf.h */ -# define gz_header_s z_gz_header_s -# define internal_state z_internal_state - -#endif - -#if defined(__MSDOS__) && !defined(MSDOS) -# define MSDOS -#endif -#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) -# define OS2 -#endif -#if defined(_WINDOWS) && !defined(WINDOWS) -# define WINDOWS -#endif -#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) -# ifndef WIN32 -# define WIN32 -# endif -#endif -#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) -# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) -# ifndef SYS16BIT -# define SYS16BIT -# endif -# endif -#endif - -/* - * Compile with -DMAXSEG_64K if the alloc function cannot allocate more - * than 64k bytes at a time (needed on systems with 16-bit int). 
- */ -#ifdef SYS16BIT -# define MAXSEG_64K -#endif -#ifdef MSDOS -# define UNALIGNED_OK -#endif - -#ifdef __STDC_VERSION__ -# ifndef STDC -# define STDC -# endif -# if __STDC_VERSION__ >= 199901L -# ifndef STDC99 -# define STDC99 -# endif -# endif -#endif -#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) -# define STDC -#endif -#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) -# define STDC -#endif -#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) -# define STDC -#endif -#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) -# define STDC -#endif - -#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */ -# define STDC -#endif - -#ifndef STDC -# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ -# define const /* note: need a more gentle solution here */ -# endif -#endif - -#if defined(ZLIB_CONST) && !defined(z_const) -# define z_const const -#else -# define z_const -#endif - -#ifdef Z_SOLO - typedef unsigned long z_size_t; -#else -# define z_longlong long long -# if defined(NO_SIZE_T) - typedef unsigned NO_SIZE_T z_size_t; -# elif defined(STDC) -# include - typedef size_t z_size_t; -# else - typedef unsigned long z_size_t; -# endif -# undef z_longlong -#endif - -/* Maximum value for memLevel in deflateInit2 */ -#ifndef MAX_MEM_LEVEL -# ifdef MAXSEG_64K -# define MAX_MEM_LEVEL 8 -# else -# define MAX_MEM_LEVEL 9 -# endif -#endif - -/* Maximum value for windowBits in deflateInit2 and inflateInit2. - * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files - * created by gzip. (Files created by minigzip can still be extracted by - * gzip.) - */ -#ifndef MAX_WBITS -# define MAX_WBITS 15 /* 32K LZ77 window */ -#endif - -/* The memory requirements for deflate are (in bytes): - (1 << (windowBits+2)) + (1 << (memLevel+9)) - that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) - plus a few kilobytes for small objects. 
For example, if you want to reduce - the default memory requirements from 256K to 128K, compile with - make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" - Of course this will generally degrade compression (there's no free lunch). - - The memory requirements for inflate are (in bytes) 1 << windowBits - that is, 32K for windowBits=15 (default value) plus about 7 kilobytes - for small objects. -*/ - - /* Type declarations */ - -#ifndef OF /* function prototypes */ -# ifdef STDC -# define OF(args) args -# else -# define OF(args) () -# endif -#endif - -#ifndef Z_ARG /* function prototypes for stdarg */ -# if defined(STDC) || defined(Z_HAVE_STDARG_H) -# define Z_ARG(args) args -# else -# define Z_ARG(args) () -# endif -#endif - -/* The following definitions for FAR are needed only for MSDOS mixed - * model programming (small or medium model with some far allocations). - * This was tested only with MSC; for other MSDOS compilers you may have - * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, - * just define FAR to be empty. - */ -#ifdef SYS16BIT -# if defined(M_I86SM) || defined(M_I86MM) - /* MSC small or medium model */ -# define SMALL_MEDIUM -# ifdef _MSC_VER -# define FAR _far -# else -# define FAR far -# endif -# endif -# if (defined(__SMALL__) || defined(__MEDIUM__)) - /* Turbo C small or medium model */ -# define SMALL_MEDIUM -# ifdef __BORLANDC__ -# define FAR _far -# else -# define FAR far -# endif -# endif -#endif - -#if defined(WINDOWS) || defined(WIN32) - /* If building or using zlib as a DLL, define ZLIB_DLL. - * This is not mandatory, but it offers a little performance increase. - */ -# ifdef ZLIB_DLL -# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) -# ifdef ZLIB_INTERNAL -# define ZEXTERN extern __declspec(dllexport) -# else -# define ZEXTERN extern __declspec(dllimport) -# endif -# endif -# endif /* ZLIB_DLL */ - /* If building or using zlib with the WINAPI/WINAPIV calling convention, - * define ZLIB_WINAPI. 
- * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. - */ -# ifdef ZLIB_WINAPI -# ifdef FAR -# undef FAR -# endif -# include - /* No need for _export, use ZLIB.DEF instead. */ - /* For complete Windows compatibility, use WINAPI, not __stdcall. */ -# define ZEXPORT WINAPI -# ifdef WIN32 -# define ZEXPORTVA WINAPIV -# else -# define ZEXPORTVA FAR CDECL -# endif -# endif -#endif - -#if defined (__BEOS__) -# ifdef ZLIB_DLL -# ifdef ZLIB_INTERNAL -# define ZEXPORT __declspec(dllexport) -# define ZEXPORTVA __declspec(dllexport) -# else -# define ZEXPORT __declspec(dllimport) -# define ZEXPORTVA __declspec(dllimport) -# endif -# endif -#endif - -#ifndef ZEXTERN -# define ZEXTERN extern -#endif -#ifndef ZEXPORT -# define ZEXPORT -#endif -#ifndef ZEXPORTVA -# define ZEXPORTVA -#endif - -#ifndef FAR -# define FAR -#endif - -#if !defined(__MACTYPES__) -typedef unsigned char Byte; /* 8 bits */ -#endif -typedef unsigned int uInt; /* 16 bits or more */ -typedef unsigned long uLong; /* 32 bits or more */ - -#ifdef SMALL_MEDIUM - /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ -# define Bytef Byte FAR -#else - typedef Byte FAR Bytef; -#endif -typedef char FAR charf; -typedef int FAR intf; -typedef uInt FAR uIntf; -typedef uLong FAR uLongf; - -#ifdef STDC - typedef void const *voidpc; - typedef void FAR *voidpf; - typedef void *voidp; -#else - typedef Byte const *voidpc; - typedef Byte FAR *voidpf; - typedef Byte *voidp; -#endif - -#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC) -# include -# if (UINT_MAX == 0xffffffffUL) -# define Z_U4 unsigned -# elif (ULONG_MAX == 0xffffffffUL) -# define Z_U4 unsigned long -# elif (USHRT_MAX == 0xffffffffUL) -# define Z_U4 unsigned short -# endif -#endif - -#ifdef Z_U4 - typedef Z_U4 z_crc_t; -#else - typedef unsigned long z_crc_t; -#endif - -#if 1 && !defined(_WIN32) /* was set to #if 1 by ./configure */ -# define Z_HAVE_UNISTD_H -#endif - -#if 1 /* was set to #if 1 by ./configure */ -# define 
Z_HAVE_STDARG_H -#endif - -#ifdef STDC -# ifndef Z_SOLO -# include /* for off_t */ -# endif -#endif - -#if defined(STDC) || defined(Z_HAVE_STDARG_H) -# ifndef Z_SOLO -# include /* for va_list */ -# endif -#endif - -#ifdef _WIN32 -# ifndef Z_SOLO -# include /* for wchar_t */ -# endif -#endif - -/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and - * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even - * though the former does not conform to the LFS document), but considering - * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as - * equivalently requesting no 64-bit operations - */ -#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1 -# undef _LARGEFILE64_SOURCE -#endif - -#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H) -# define Z_HAVE_UNISTD_H -#endif -#ifndef Z_SOLO -# if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE) -# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ -# ifdef VMS -# include /* for off_t */ -# endif -# ifndef z_off_t -# define z_off_t off_t -# endif -# endif -#endif - -#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0 -# define Z_LFS64 -#endif - -#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64) -# define Z_LARGE64 -#endif - -#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64) -# define Z_WANT64 -#endif - -#if !defined(SEEK_SET) && !defined(Z_SOLO) -# define SEEK_SET 0 /* Seek from beginning of file. */ -# define SEEK_CUR 1 /* Seek from current position. 
*/ -# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ -#endif - -#ifndef z_off_t -# define z_off_t long -#endif - -#if !defined(_WIN32) && defined(Z_LARGE64) -# define z_off64_t off64_t -#else -# if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO) -# define z_off64_t __int64 -# else -# define z_off64_t z_off_t -# endif -#endif - -/* MVS linker does not support external names larger than 8 bytes */ -#if defined(__MVS__) - #pragma map(deflateInit_,"DEIN") - #pragma map(deflateInit2_,"DEIN2") - #pragma map(deflateEnd,"DEEND") - #pragma map(deflateBound,"DEBND") - #pragma map(inflateInit_,"ININ") - #pragma map(inflateInit2_,"ININ2") - #pragma map(inflateEnd,"INEND") - #pragma map(inflateSync,"INSY") - #pragma map(inflateSetDictionary,"INSEDI") - #pragma map(compressBound,"CMBND") - #pragma map(inflate_table,"INTABL") - #pragma map(inflate_fast,"INFA") - #pragma map(inflate_copyright,"INCOPY") -#endif - -#endif /* ZCONF_H */ From 6cddd1240fe366abc203833932998ebf26dffe38 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Mon, 15 Jun 2026 10:28:15 -0700 Subject: [PATCH 06/15] fix: eliminate all GCC warnings from Comet source files Add -Wno-unknown-pragmas, -Wno-char-subscripts, and -Wno-unused-result to both Makefiles. The last flag is needed because GCC 13 does not honor (void) casts for warn_unused_result functions such as fread/fgets. 
Source fixes across CometSearch and Comet.cpp: - CometInterfaces.h: [[maybe_unused]] on static _tp member - Unused ThreadPool* and other parameters anonymized with /*name*/ in Threading, CometSearch (x8), CometPreprocess (x2), CometFragmentIndex, CometSpecLib (x2), FastaStrategy - CometSearch.cpp SearchPeptideIndex: removed size_t tTmp declaration, changed all 7 fread assignments to (void)fread; removed dead pSearchThreadPool variable - CometSearchManager.cpp: removed AllocateResultsMemMS1() whose body was entirely commented out - CometFragmentIndex.cpp: removed 8 auto tClear timing variables whose printf lines were already commented out; cast 3 bulk fread calls - CometPeptideIndex.cpp: removed tTmpRead, changed 5 assignments to (void)fread - CometSpecLib.cpp: removed bDoneProcessingAllSpectra and its 5 assignments (each immediately followed by break); anonymized two unused string parameters - core/Types.h: added = 0 initializer to DBIndex::siVarModProteinFilter to fix -Wmaybe-uninitialized on local DBIndex sDBTmp Build now produces zero warnings from Comet-owned source files. Third-party warnings in MSToolkit/expat/zlib are unaffected. 
Co-Authored-By: Claude Sonnet 4.6 --- Comet.cpp | 2 +- CometSearch/CometFragmentIndex.cpp | 32 +++------------------- CometSearch/CometInterfaces.h | 2 +- CometSearch/CometPeptideIndex.cpp | 19 ++++++------- CometSearch/CometPreprocess.cpp | 4 +-- CometSearch/CometSearch.cpp | 41 +++++++++++++--------------- CometSearch/CometSearchManager.cpp | 31 --------------------- CometSearch/CometSpecLib.cpp | 12 ++------ CometSearch/Makefile | 4 +-- CometSearch/Threading.cpp | 2 +- CometSearch/core/Types.h | 2 +- CometSearch/search/FastaStrategy.cpp | 2 +- Makefile | 4 +-- 13 files changed, 46 insertions(+), 111 deletions(-) diff --git a/Comet.cpp b/Comet.cpp index 8740a2a9..c3b5e894 100644 --- a/Comet.cpp +++ b/Comet.cpp @@ -692,7 +692,7 @@ void LoadParameters(char* pszParamsFile, enzymeInformation.szSampleEnzymeBreakAA, enzymeInformation.szSampleEnzymeNoBreakAA); } - fgets(szParamBuf, SIZE_BUF, fp); + (void)fgets(szParamBuf, SIZE_BUF, fp); } fclose(fp); diff --git a/CometSearch/CometFragmentIndex.cpp b/CometSearch/CometFragmentIndex.cpp index 52770972..1f81adc4 100644 --- a/CometSearch/CometFragmentIndex.cpp +++ b/CometSearch/CometFragmentIndex.cpp @@ -244,7 +244,7 @@ void CometFragmentIndex::GenerateFragmentIndex(ThreadPool *tp) void CometFragmentIndex::AddFragmentsThreadProc(bool bCountOnly, - ThreadPool *tp) + ThreadPool* /*tp*/) { size_t iWhichFragmentPeptide = 0; // unused here for counting only @@ -1103,64 +1103,40 @@ bool CometFragmentIndex::WriteFIPlainPeptideIndex(ThreadPool *tp) // Destruction is O(n) for pcVarModSites in g_pvDBIndex but trivial for // g_vRawPeptides; order no longer matters. 
{ - auto tClear = chrono::steady_clock::now(); vector().swap(g_vRawPeptides); -// printf(" - freed g_vRawPeptides: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); g_pvDBIndex.clear(); // DBIndex::sPeptide strings freed after g_vRawPeptides // to keep the allocator bins warm for the string frees above -// printf(" - freed g_pvDBIndex: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); g_pvProteinsList.clear(); // CSR flat layout: 2 free() calls instead of ~190M -// printf(" - freed g_pvProteinsList: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); g_pvProteinNames.clear(); -// printf(" - freed g_pvProteinNames: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); vector().swap(MOD_SEQS); -// printf(" - freed MOD_SEQS: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); vector().swap(g_vFragmentPeptides); -// printf(" - freed g_vFragmentPeptides: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); delete[] PEPTIDE_MOD_SEQ_IDXS; PEPTIDE_MOD_SEQ_IDXS = nullptr; -// printf(" - freed PEPTIDE_MOD_SEQ_IDXS: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - tClear).count()); } { - auto tClear = chrono::steady_clock::now(); delete[] MOD_SEQ_MOD_NUM_START; MOD_SEQ_MOD_NUM_START = nullptr; delete[] MOD_SEQ_MOD_NUM_CNT; MOD_SEQ_MOD_NUM_CNT = nullptr; -// printf(" - freed MOD_SEQ_MOD_NUM_START/CNT: %4lld ms\n", -// (long long)chrono::duration_cast(chrono::steady_clock::now() - 
tClear).count()); } fflush(stdout); @@ -1413,7 +1389,7 @@ bool CometFragmentIndex::ReadPlainPeptideIndex(void) { size_t pepSectionSize = (size_t)(clProteinsFilePos - clPeptidesFilePos) - sizeof(size_t); vector pepBuf(pepSectionSize); - fread(pepBuf.data(), 1, pepSectionSize, fp); + (void)fread(pepBuf.data(), 1, pepSectionSize, fp); const char* p = pepBuf.data(); struct PlainPeptideIndexStruct sTmp; @@ -1438,7 +1414,7 @@ bool CometFragmentIndex::ReadPlainPeptideIndex(void) { size_t protSectionSize = (size_t)(clPermutationsFilePos - clProteinsFilePos); vector protBuf(protSectionSize); - fread(protBuf.data(), 1, protSectionSize, fp); + (void)fread(protBuf.data(), 1, protSectionSize, fp); const char* p = protBuf.data(); size_t tSize; @@ -1506,7 +1482,7 @@ bool CometFragmentIndex::ReadPlainPeptideIndex(void) comet_fileoffset_t varDataStart = comet_ftell(fp); size_t varDataSize = (size_t)(clFooterPos - varDataStart); vector varBuf(varDataSize); - fread(varBuf.data(), 1, varDataSize, fp); + (void)fread(varBuf.data(), 1, varDataSize, fp); const char* p = varBuf.data(); int iTmp; diff --git a/CometSearch/CometInterfaces.h b/CometSearch/CometInterfaces.h index 8bcf095d..8347c779 100644 --- a/CometSearch/CometInterfaces.h +++ b/CometSearch/CometInterfaces.h @@ -85,7 +85,7 @@ namespace CometInterfaces ICometSearchManager *GetCometSearchManager(); void ReleaseCometSearchManager(); - static ThreadPool* _tp; + [[maybe_unused]] static ThreadPool* _tp; } #endif // _COMETINTERFACES_H_ diff --git a/CometSearch/CometPeptideIndex.cpp b/CometSearch/CometPeptideIndex.cpp index 5dedda2e..9ac446f0 100644 --- a/CometSearch/CometPeptideIndex.cpp +++ b/CometSearch/CometPeptideIndex.cpp @@ -88,25 +88,24 @@ bool CometPeptideIndex::ReadPeptideIndex(void) comet_fileoffset_t lEndOfPeptides; comet_fileoffset_t clProteinsFilePos; - size_t tTmpRead; - tTmpRead = fread(&lEndOfPeptides, clSizeCometFileOffset, 1, fp); - tTmpRead = fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); + 
(void)fread(&lEndOfPeptides, clSizeCometFileOffset, 1, fp); + (void)fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); // --- Read the mass index and peptide count from lEndOfPeptides position --- comet_fseek(fp, lEndOfPeptides, SEEK_SET); int iMinMass, iMaxMass; uint64_t tNumPeptides; - tTmpRead = fread(&iMinMass, sizeof(int), 1, fp); - tTmpRead = fread(&iMaxMass, sizeof(int), 1, fp); - tTmpRead = fread(&tNumPeptides, sizeof(uint64_t), 1, fp); + (void)fread(&iMinMass, sizeof(int), 1, fp); + (void)fread(&iMaxMass, sizeof(int), 1, fp); + (void)fread(&tNumPeptides, sizeof(uint64_t), 1, fp); int iMaxPeptideMass10 = iMaxMass * 10; // Read the mass index array: lIndex[0..iMaxPeptideMass10-1] // Each entry is a file offset to the first peptide at that 0.1 Da mass bin comet_fileoffset_t* lIndex = new comet_fileoffset_t[iMaxPeptideMass10]; - tTmpRead = fread(lIndex, clSizeCometFileOffset, iMaxPeptideMass10, fp); + (void)fread(lIndex, clSizeCometFileOffset, iMaxPeptideMass10, fp); // --- Read protein names --- // Protein names are stored between end-of-header and clProteinsFilePos @@ -125,7 +124,7 @@ bool CometPeptideIndex::ReadPeptideIndex(void) comet_fseek(fp, clProteinsFilePos, SEEK_SET); size_t tNumProteinEntries; - tTmpRead = fread(&tNumProteinEntries, clSizeCometFileOffset, 1, fp); + (void)fread(&tNumProteinEntries, clSizeCometFileOffset, 1, fp); g_pvProteinsList.clear(); g_pvProteinsList.reserve(tNumProteinEntries); @@ -133,11 +132,11 @@ bool CometPeptideIndex::ReadPeptideIndex(void) for (size_t i = 0; i < tNumProteinEntries; ++i) { size_t tNumProteins; - tTmpRead = fread(&tNumProteins, clSizeCometFileOffset, 1, fp); + (void)fread(&tNumProteins, clSizeCometFileOffset, 1, fp); vector vTmp(tNumProteins); for (size_t j = 0; j < tNumProteins; ++j) - tTmpRead = fread(&vTmp[j], clSizeCometFileOffset, 1, fp); + (void)fread(&vTmp[j], clSizeCometFileOffset, 1, fp); g_pvProteinsList.push_back(std::move(vTmp)); } diff --git a/CometSearch/CometPreprocess.cpp 
b/CometSearch/CometPreprocess.cpp index 8d29a184..23ca5197 100644 --- a/CometSearch/CometPreprocess.cpp +++ b/CometSearch/CometPreprocess.cpp @@ -835,7 +835,7 @@ bool CometPreprocess::LoadAndPreprocessSpectra(MSReader &mstReader, void CometPreprocess::PreprocessThreadProc(PreprocessThreadData *pPreprocessThreadData, - ThreadPool* tp) + ThreadPool* /*tp*/) { // This returns false if it fails, but the errors are already logged // so no need to check the return value here. @@ -890,7 +890,7 @@ void CometPreprocess::PreprocessThreadProc(PreprocessThreadData *pPreprocessThre void CometPreprocess::PreprocessThreadProcMS1(PreprocessThreadData* pPreprocessThreadDataMS1, - ThreadPool* tp, + ThreadPool* /*tp*/, const double dMaxQueryRT, const double dMaxSpecLibRT) { diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index a5145036..142f86e7 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -82,7 +82,7 @@ bool CometSearch::AllocateMemory(int maxNumThreads) } -bool CometSearch::DeallocateMemory(int maxNumThreads) +bool CometSearch::DeallocateMemory(int /*maxNumThreads*/) { if (!g_bCometSearchMemoryAllocated) return true; @@ -994,7 +994,7 @@ bool CometSearch::RunSearch(int iPercentStart, } -bool CometSearch::RunSpecLibSearch(ThreadPool* tp) +bool CometSearch::RunSpecLibSearch(ThreadPool* /*tp*/) { printf("OK in RunSpecLib\n"); @@ -1002,9 +1002,9 @@ bool CometSearch::RunSpecLibSearch(ThreadPool* tp) } -bool CometSearch::RunSpecLibSearch(int iPercentStart, - int iPercentEnd, - ThreadPool* tp, +bool CometSearch::RunSpecLibSearch(int /*iPercentStart*/, + int /*iPercentEnd*/, + ThreadPool* /*tp*/, vector& queries) { // to fill g_vulSpecLibPrecursorIndex, set @@ -1064,7 +1064,7 @@ bool CometSearch::RunMS1Search(ThreadPool* tp, // the read-only g_vSpecLib entries within the RT window. Populates the output // scores vector with up to topN best matches. Zero shared mutable state. 
bool CometSearch::RunMS1Search(QueryMS1* pQueryMS1, - const int topN, + const int /*topN*/, double dRT, double dMaxMS1RTDiff, const double dMaxSpecLibRT, @@ -1255,7 +1255,7 @@ bool CometSearch::MapOBO(string strMod, void CometSearch::SearchThreadProc(SearchThreadData *pSearchThreadData, - ThreadPool* tp) + ThreadPool* /*tp*/) { int i = -1; @@ -1872,11 +1872,10 @@ void CometSearch::SearchFragmentIndex(Query* pQuery, } -bool CometSearch::SearchPeptideIndex(ThreadPool* tp, vector& queries) +bool CometSearch::SearchPeptideIndex(ThreadPool* /*tp*/, vector& queries) { comet_fileoffset_t lEndOfStruct; FILE* fp; - size_t tTmp; CometPostAnalysis cpa; @@ -1924,15 +1923,15 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp, vector& queries) comet_fileoffset_t clProteinsFilePos; comet_fseek(fp, -clSizeCometFileOffset * 2, SEEK_END); - tTmp = fread(&lEndOfStruct, clSizeCometFileOffset, 1, fp); - tTmp = fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); + (void)fread(&lEndOfStruct, clSizeCometFileOffset, 1, fp); + (void)fread(&clProteinsFilePos, clSizeCometFileOffset, 1, fp); if (!g_bPeptideIndexRead) { // now read in: vector> g_pvProteinsList comet_fseek(fp, clProteinsFilePos, SEEK_SET); size_t tSize; - tTmp = fread(&tSize, clSizeCometFileOffset, 1, fp); + (void)fread(&tSize, clSizeCometFileOffset, 1, fp); vector vTmp; g_pvProteinsList.clear(); @@ -1940,12 +1939,12 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp, vector& queries) for (size_t it = 0; it < tSize; ++it) { size_t tNumProteinOffsets; - tTmp = fread(&tNumProteinOffsets, clSizeCometFileOffset, 1, fp); + (void)fread(&tNumProteinOffsets, clSizeCometFileOffset, 1, fp); vTmp.clear(); for (size_t it2 = 0; it2 < tNumProteinOffsets; ++it2) { - tTmp = fread(&clTmp, clSizeCometFileOffset, 1, fp); + (void)fread(&clTmp, clSizeCometFileOffset, 1, fp); vTmp.push_back(clTmp); } g_pvProteinsList.push_back(vTmp); @@ -1964,9 +1963,9 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp, vector& queries) // seek to 
index comet_fseek(fp, lEndOfStruct, SEEK_SET); - tTmp = fread(&iMinMass, sizeof(int), 1, fp); - tTmp = fread(&iMaxMass, sizeof(int), 1, fp); - tTmp = fread(&tNumPeptides, sizeof(uint64_t), 1, fp); + (void)fread(&iMinMass, sizeof(int), 1, fp); + (void)fread(&iMaxMass, sizeof(int), 1, fp); + (void)fread(&tNumPeptides, sizeof(uint64_t), 1, fp); // sanity checks if (iMinMass < 0 || iMinMass > 20000 || iMaxMass < 0 || iMaxMass > 20000) @@ -1983,7 +1982,7 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp, vector& queries) for (int i = 0; i < iMaxPeptideMass10; ++i) lReadIndex[i] = -1; - tTmp = fread(lReadIndex, sizeof(comet_fileoffset_t), iMaxPeptideMass10, fp); + (void)fread(lReadIndex, sizeof(comet_fileoffset_t), iMaxPeptideMass10, fp); int iStart = (int)(g_massRange.dMinMass - 0.5); // smallest mass/index start int iEnd = (int)(g_massRange.dMaxMass + 0.5); // largest mass/index end @@ -2029,8 +2028,6 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* tp, vector& queries) // compatibility with standard search in StorePeptide dbe.lProteinFilePosition = sDBI.lIndexProteinFilePosition; - ThreadPool* pSearchThreadPool = tp; - while ((int)(sDBI.dPepMass * 10) <= iEnd10) { if (sDBI.dPepMass > g_massRange.dMaxMass) @@ -3267,7 +3264,7 @@ void CometSearch::SearchMS1Library(QueryMS1* pMS1Query, const double dMaxMS1RTDiff, const double dMaxSpecLibRT, const double dMaxQueryRT, - ThreadPool* tp) + ThreadPool* /*tp*/) { unsigned int iStart = BINPREC(g_staticParams.options.dMS1MinMass); @@ -8639,7 +8636,7 @@ void CometSearch::StorePeptideI(Query* pQuery, char* szProteinSeq, double dCalcPepMass, double dXcorr, - bool bDecoyPep, + bool /*bDecoyPep*/, int* piVarModSites, struct sDBEntry* dbe) { diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 7a8296c6..4ddd92d7 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -151,37 +151,6 @@ static std::string GetHostName() return {}; } -// Allocate memory for 
the _pSpecLibResults struct for each session.queriesMS1 entry. -static bool AllocateResultsMemMS1() -{ -/* - for (std::vector::iterator it = session.queriesMS1.begin(); it != session.queriesMS1.end(); ++it) - { - QueryMS1* pQueryMS1 = *it; - - try - { - pQueryMS1->_pSpecLibResultsMS1 = new SpecLibResultsMS1[g_staticParams.options.iNumStored]; - } - catch (std::bad_alloc& ba) - { - string strErrorMsg = " Error - new(_pSpecLibResults[]). bad_alloc: " + std::string(ba.what()) + "\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - for (int j=0; j_pSpecLibResultsMS1[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; - pQueryMS1->_pSpecLibResultsMS1[j].fCn = 0; - pQueryMS1->_pSpecLibResultsMS1[j].fRTime = 0; - } - - } -*/ - return true; -} - static bool ValidateOutputFormat() { if (!g_staticParams.options.bOutputSqtStream diff --git a/CometSearch/CometSpecLib.cpp b/CometSearch/CometSpecLib.cpp index b6b337c3..c40ead94 100644 --- a/CometSearch/CometSpecLib.cpp +++ b/CometSearch/CometSpecLib.cpp @@ -110,7 +110,7 @@ bool CometSpecLib::LoadSpecLib(string strSpecLibFile) } -bool CometSpecLib::ReadSpecLibSqlite(string strSpecLibFile) +bool CometSpecLib::ReadSpecLibSqlite(string /*strSpecLibFile*/) { printf(" Error - sqlite/.db files as spectral libraries are not supported yet.\n"); @@ -192,7 +192,7 @@ bool CometSpecLib::ReadSpecLibSqlite(string strSpecLibFile) } -bool CometSpecLib::ReadSpecLibRaw(string strSpecLibFile) +bool CometSpecLib::ReadSpecLibRaw(string /*strSpecLibFile*/) { printf(" Error - raw files as spectral libraries are not supported yet.\n"); exit(1); @@ -556,7 +556,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, ThreadPool* pLoadSpecThreadPool = tp; bool bFirstScan = true; - bool bDoneProcessingAllSpectra = false; printf(" - loading MS1 scan (%d, mass range %0.1lf - %0.1lf): ", iFileLastScan, g_staticParams.options.dMS1MinMass, g_staticParams.options.dMS1MaxMass); @@ -602,7 +601,6 @@ bool 
CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, if ((iFileLastScan != -1) && (iFileLastScan < iFirstScan)) { - bDoneProcessingAllSpectra = true; break; } @@ -621,7 +619,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, if (iScanNumber > iFileLastScan) { - bDoneProcessingAllSpectra = true; break; } @@ -629,7 +626,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, { if (iScanNumber > iFileLastScan) { - bDoneProcessingAllSpectra = true; break; } @@ -648,7 +644,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, } else if (CometPreprocess::IsValidInputType(iSpecLibInputType)) { - bDoneProcessingAllSpectra = true; break; } else @@ -657,7 +652,6 @@ bool CometSpecLib::LoadSpecLibMS1Raw(ThreadPool* tp, if (iTmpCount > iFileLastScan) { - bDoneProcessingAllSpectra = true; break; } } @@ -729,7 +723,7 @@ double CometSpecLib::ScoreSpecLib(Query *pQuery, // SpecLib entries that are matched to that "bin". This allows a mass query to walk through // and score against all entries in the vector. void CometSpecLib::SetSpecLibPrecursorIndex(double dNeutralMass, - int iSpecLibCharge, + int /*iSpecLibCharge*/, size_t iWhichSpecLib) { double dProtonatedMass = dNeutralMass + PROTON_MASS; diff --git a/CometSearch/Makefile b/CometSearch/Makefile index 219d63eb..a61ce6f3 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -14,9 +14,9 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-write-strings -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 -I../$(ASCOREPRO)/include + override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-write-strings -Wno-unknown-pragmas -Wno-char-subscripts -Wno-unused-result -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. 
-I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 -I../$(ASCOREPRO)/include else - override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-write-strings -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 -I../$(ASCOREPRO)/include + override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-write-strings -Wno-unknown-pragmas -Wno-char-subscripts -Wno-unused-result -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DGCC -D_NOSQLITE -D__int64=off64_t -I. -I$(MSTPATH)/include -I$(MSTPATH)/src/expat-2.2.9/lib -I$(MSTPATH)/src/zlib-1.2.11 -I../$(ASCOREPRO)/include endif # dependency generation (gcc/clang) diff --git a/CometSearch/Threading.cpp b/CometSearch/Threading.cpp index 8bfafcb9..e1d6381c 100644 --- a/CometSearch/Threading.cpp +++ b/CometSearch/Threading.cpp @@ -52,7 +52,7 @@ void Threading::UnlockMutex(Mutex& mutex) mutex.unlock(); } -void Threading::DestroyMutex(Mutex& mutex) +void Threading::DestroyMutex(Mutex& /*mutex*/) { // std::mutex destructor handles cleanup automatically // Ensure mutex is unlocked before destruction diff --git a/CometSearch/core/Types.h b/CometSearch/core/Types.h index 035f3e13..6c550fa4 100644 --- a/CometSearch/core/Types.h +++ b/CometSearch/core/Types.h @@ -221,7 +221,7 @@ struct DBIndex vector pcVarModSites; // empty = unmodified; else [iLen+2] encoding var mods comet_fileoffset_t lIndexProteinFilePosition; // points to entry in g_pvProteinsList double dPepMass; // MH+ pep mass - unsigned short siVarModProteinFilter; // bitwise representation of mmapProtein + unsigned short siVarModProteinFilter = 0; // bitwise representation of mmapProtein char cPrevAA; char cNextAA; char sPeptide[MAX_PEPTIDE_LEN]; // peptide sequence, null-terminated diff --git a/CometSearch/search/FastaStrategy.cpp b/CometSearch/search/FastaStrategy.cpp index b431efc0..de4522b0 100644 --- 
a/CometSearch/search/FastaStrategy.cpp +++ b/CometSearch/search/FastaStrategy.cpp @@ -21,7 +21,7 @@ #include "CometSearchManager.h" #include "MSReader.h" -bool FastaStrategy::initialize(SearchSession& session, ThreadPool* tp) +bool FastaStrategy::initialize(SearchSession& session, ThreadPool* /*tp*/) { // Read protein variable-mod filter file (FASTA-only feature). if (session.bPerformDatabaseSearch diff --git a/Makefile b/Makefile index fd7e0319..38928698 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,9 @@ COMETSEARCH = CometSearch UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-char-subscripts -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/extern/expat-2.2.9/lib -I$(MSTOOLKIT)/extern/zlib-1.2.11 -I$(COMETSEARCH) -I$(ASCOREPRO)/include + override CXXFLAGS += -O3 -std=c++20 -fpermissive -Wall -Wextra -Wno-char-subscripts -Wno-unused-result -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/extern/expat-2.2.9/lib -I$(MSTOOLKIT)/extern/zlib-1.2.11 -I$(COMETSEARCH) -I$(ASCOREPRO)/include else - override CXXFLAGS += -O3 -static -std=c++20 -fpermissive -Wall -Wextra -Wno-char-subscripts -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/extern/expat-2.2.9/lib -I$(MSTOOLKIT)/extern/zlib-1.2.11 -I$(COMETSEARCH) -I$(ASCOREPRO)/include + override CXXFLAGS += -O3 -static -std=c++20 -fpermissive -Wall -Wextra -Wno-char-subscripts -Wno-unused-result -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE -I$(MSTOOLKIT)/include -I$(MSTOOLKIT)/extern/expat-2.2.9/lib -I$(MSTOOLKIT)/extern/zlib-1.2.11 -I$(COMETSEARCH) -I$(ASCOREPRO)/include endif EXECNAME = comet.exe From 539c7cf21040ea54e7392b7d55b86a476b55c012 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Mon, 15 Jun 2026 11:50:02 -0700 Subject: [PATCH 07/15] update docs --- 
docs/DataStructures.md | 205 +++++++++++++++++++++++++++++++++------- docs/GlobalVariables.md | 40 ++++---- docs/RealTimeSearch.md | 24 +++-- 3 files changed, 203 insertions(+), 66 deletions(-) diff --git a/docs/DataStructures.md b/docs/DataStructures.md index 682f30ef..9bb602d1 100644 --- a/docs/DataStructures.md +++ b/docs/DataStructures.md @@ -1,6 +1,12 @@ # Core Data Structures -Key types used throughout `CometSearch/`. All are defined in `CometDataInternal.h` unless noted. Types from `CometData.h` (the public API header) are marked accordingly. +Key types used throughout `CometSearch/`. Struct definitions were reorganized in Phase 1 of the architecture migration: +- `core/Types.h` -- per-spectrum, index, and runtime structs (`Results`, `Query`, `QueryMS1`, `DBIndex`, `PlainPeptideIndexStruct`, `FragmentPeptidesStruct`, `ProteinsListCSR`, etc.) +- `core/Params.h` -- `StaticParams` and all its nested sub-structs +- `core/Constants.h` -- compile-time constants (`MAX_PEPTIDE_LEN`, `VMODS`, `HISTO_SIZE`, etc.) +- `CometData.h` -- public API types that cross the library boundary into `CometWrapper` and `RealtimeSearch` + +`CometDataInternal.h` `#include`s all three `core/` headers; existing code that includes `CometDataInternal.h` continues to see everything. --- @@ -9,7 +15,7 @@ Key types used throughout `CometSearch/`. All are defined in `CometDataInternal. The central per-spectrum data object. One `Query` is allocated for each spectrum/charge combination in a batch. ```cpp -struct Query // CometDataInternal.h:861 +struct Query // core/Types.h ``` **Scoring state:** @@ -17,23 +23,31 @@ struct Query // CometDataInternal.h:861 | Field | Purpose | |-------|---------| | `iXcorrHistogram[HISTO_SIZE]` | Histogram of XCorr scores for E-value estimation (152 bins). | -| `iHistogramCount` | Number of entries in the histogram. | +| `uiHistogramCount` | Number of entries in the histogram. | | `fPar[4]` | Fitted LMA regression parameters from `LinearRegression()`.
| | `siMaxXcorr` | Bin index of the histogram maximum. | +| `iMinXcorrHisto` | Minimum xcorr bin used in histogram; adjusts E-value floor for sparse spectra. | | `dLowestXcorrScore` / `dLowestDecoyXcorrScore` | Current minimum stored XCorr; gates whether a new hit is kept. | +| `siLowestXcorrScoreIndex` / `siLowestDecoyXcorrScoreIndex` | Index of the current lowest-scoring result slot. | +| `fLowestSpecLibScore` | Current minimum stored speclib score for the MS2 speclib path. | | `iMatchPeptideCount` / `iDecoyMatchPeptideCount` | Number of results actually stored. | -| `_uliNumMatchedPeptides` | Total peptides scored (including those below cutoff). | +| `_uliNumMatchedPeptides` / `_uliNumMatchedDecoyPeptides` | Total peptides scored (including those below cutoff). | +| `dMangoIndex` | Decimal scan-number encoding for Mango TMT-precursor searches. | **Spectrum data (set by CometPreprocess):** | Field | Purpose | |-------|---------| -| `pfFastXcorrData[]` | Preprocessed intensity array for XCorr calculation. | -| `pfFastXcorrDataNL[]` | Same with NH3/H2O neutral loss contributions. | -| `pfSpScoreData[]` / `ppfSparseSpScoreData[][]` | Binned intensity for SP scoring. Sparse representation saves memory for large bin arrays. | -| `iFastXcorrDataSize` / `iSpScoreData` | Array sizes for the above. | +| `ppfSparseSpScoreData[][]` | Sparse 2D binned intensity array for SP scoring. | +| `ppfSparseFastXcorrData[][]` | Sparse 2D preprocessed intensity array for XCorr calculation. | +| `ppfSparseFastXcorrDataNL[][]` | Same with NH3/H2O neutral loss contributions. | +| `iSpScoreData` / `iFastXcorrDataSize` | Outer dimension of the respective sparse arrays. | +| `bSparseFromPool` | `true` when the sparse child arrays belong to the RTS thread-local `RtsScratch` pool; the destructor must **not** `delete[]` them in this case. | +| `vfRawFragmentPeakMass` | Raw fragment peak masses for fragment index search (intensity not needed at scoring stage). 
| +| `vRawFragmentPeakMassIntensity` | Raw peaks as `AScoreProCpp::Centroid` pairs; populated when AScorePro is enabled. | | `_pepMassInfo` | Experimental mass and tolerance window (see `PepMassInfo`). | -| `_spectrumInfoInternal` | Scan number, charge state, RT, array size (see `SpectrumInfoInternal`). | +| `_spectrumInfoInternal` | Scan number, charge state, RT, array size, nativeID (see `SpectrumInfoInternal`). | +| `tSearchStart` | Per-query search start time; used to enforce `iMaxIndexRunTime` timeout. | **Results:** @@ -41,9 +55,10 @@ struct Query // CometDataInternal.h:861 |-------|---------| | `_pResults` | Heap-allocated `Results[iNumStored]` array for target hits. | | `_pDecoys` | Same for decoy hits (separate decoy mode only; `iDecoySearch == 2`). | +| `_pSpecLibResults` | MS2 spectral library results (`SpecLibResults[iNumStored]`). | | `accessMutex` | Per-query mutex; guards `_pResults` updates in concurrent search threads. | -**Lifecycle:** allocated in `CometPreprocess`, freed in `Query::~Query()`. In batch mode, all `Query*` objects live in `g_pvQuery`. In the RTS thread-local path, each call owns its own heap `Query*` and frees it at the end of the call. +**Lifecycle:** Allocated in `CometPreprocess`, freed in `Query::~Query()`. In batch mode, all `Query*` objects live in `SearchSession.queries`. In the RTS thread-local path, each call owns its own heap `Query*` and frees it at the end of the call. --- @@ -52,21 +67,34 @@ struct Query // CometDataInternal.h:861 Holds one peptide hit. Each `Query` owns an array of `Results[iNumStored]`. ```cpp -struct Results // CometDataInternal.h:194 +struct Results // core/Types.h ``` | Field | Type | Purpose | |-------|------|---------| | `fXcorr` | `float` | Cross-correlation score. | | `fScoreSp` | `float` | Preliminary SP score. | +| `fDeltaCn` | `float` | Delta-Cn (score difference to next-best hit). | +| `fLastDeltaCn` | `float` | Delta-Cn to the last stored hit. 
| +| `fAScorePro` | `float` | AScorePro phosphosite localization score. | | `dExpect` | `double` | E-value from LMA-fitted histogram. | | `dPepMass` | `double` | Calculated peptide MH+ mass. | -| `iRankSp` / `iMatchedIons` / `iTotalIons` | `int` | SP rank and ion match counts. | +| `usiRankXcorr` | `unsigned short` | Xcorr rank. | +| `usiRankSp` | `unsigned short` | SP rank. | +| `usiMatchedIons` | `unsigned short` | Number of matched fragment ions. | +| `usiTotalIons` | `unsigned short` | Total theoretical fragment ions. | +| `usiLenPeptide` | `unsigned short` | Peptide length. | +| `lProteinFilePosition` | `comet_fileoffset_t` | File offset into the FASTA for the matched protein; for index searches, an entry index into `g_pvProteinsList`. | +| `lWhichProtein` | `long` | Which entry in `g_pvProteinsList[]` contains the matched proteins. | | `szPeptide[MAX_PEPTIDE_LEN]` | `char[]` | Peptide sequence (no flanking AAs). | -| `szPrevNextAA[2]` | `char[]` | `[0]` = preceding AA, `[1]` = following AA. | +| `cPrevAA` / `cNextAA` | `char` | Preceding and following amino acid. | +| `bClippedM` | `bool` | `true` if this is a new N-terminal peptide due to a clipped methionine. | +| `cHasVariableMod` | `char` | `HasVariableModType` enum: 0 = none, 1 = variable mod, 2 = AScorePro mod. | | `piVarModSites[MAX_PEPTIDE_LEN_P2]` | `int[]` | Per-position variable mod encoding. Values 1-9 map to `varModList[0-8]`. Values >= `COMPOUNDMODS_OFFSET` (100) encode compound mods. Indices `iLenPeptide` and `iLenPeptide+1` hold N/C-terminal mod codes. | | `pdVarModSites[MAX_PEPTIDE_LEN_P2]` | `double[]` | Mass delta at each modified position. | -| `lProteinFilePosition` | `comet_fileoffset_t` | File offset into the FASTA for the matched protein. | +| `pszMod[MAX_PEPTIDE_LEN][MAX_PEFFMOD_LEN]` | `char[][]` | PEFF modification strings, one per position. | +| `sPeffOrigResidues` | `string` | Original residues for PEFF variants. 
| +| `sAScoreProSiteScores` | `string` | Comma-separated per-site AScorePro scores. | | `pWhichProtein` | `vector` | All proteins sharing this peptide (sorted by file offset). | | `pWhichDecoyProtein` | `vector` | Decoy proteins (concatenated search mode). | @@ -77,11 +105,11 @@ struct Results // CometDataInternal.h:194 The global parameter aggregate. Fully populated before any search thread starts; treated as read-only during search. ```cpp -struct StaticParams // CometDataInternal.h:602 +struct StaticParams // core/Params.h extern StaticParams g_staticParams; ``` -Contains nested sub-structs (all defined in `CometDataInternal.h`): +Contains nested sub-structs (all defined in `core/Params.h`): | Sub-struct | Type | Key contents | |------------|------|-------------| @@ -102,8 +130,8 @@ Contains nested sub-structs (all defined in `CometDataInternal.h`): ## VarMods / VarModParams ```cpp -struct VarMods // CometData.h:218 (one entry per mod slot) -struct VarModParams // CometDataInternal.h:472 (all mod config) +struct VarMods // CometData.h (one entry per mod slot) +struct VarModParams // core/Params.h (all mod config) ``` `VarModParams` contains: @@ -115,7 +143,7 @@ struct VarModParams // CometDataInternal.h:472 (all mod config) | `bVarModSearch` | Set to `true` if any mod has a non-zero mass; gates the `WithVariableMods` code path. | | `iMaxVarModPerPeptide` | Total modified residues allowed per peptide across all mods. | | `iMaxPermutations` | Cap on permutation count in `WithVariableMods`. | -| `vdCompoundMasses` | `vector` of masses from the compound mods file (compoundmods branch). | +| `vdCompoundMasses` | `vector` of masses from the compound mods file. | | `iNumCompoundMasses` | `size_t` size of `vdCompoundMasses`. | Each `VarMods` entry: @@ -134,35 +162,142 @@ Each `VarMods` entry: ## DBIndex -One entry in the peptide index (`g_pvDBIndex`), sorted by mass for binary-search lookup. 
+One entry in the peptide index (`g_pvDBIndex`), used during index generation and FASTA search. Sorted by peptide sequence and mass for deduplication. + +```cpp +struct DBIndex // core/Types.h +``` + +| Field | Type | Purpose | +|-------|------|---------| +| `sPeptide[MAX_PEPTIDE_LEN]` | `char[]` | Peptide amino acid sequence (null-terminated). | +| `cPrevAA` / `cNextAA` | `char` | Flanking residues (for enzyme termini check). | +| `pcVarModSites` | `vector<char>` | Variable mod encoding per position. Empty = unmodified; otherwise `[iLen+2]` chars using the same 0-9 scheme as `piVarModSites`. | +| `dPepMass` | `double` | MH+ mass; used as sort key within equal sequences. | +| `siVarModProteinFilter` | `unsigned short` | Bitwise filter derived from the protein filter file; `0` when not filtering. Initialized to `0`. | +| `lIndexProteinFilePosition` | `comet_fileoffset_t` | Index into `g_pvProteinsList` mapping to the list of protein file offsets. | + +`DBIndex` provides `operator==` (sequence + mass + mod-sites) and `operator<` (sequence -> mass -> mod-sites -> protein position). + +--- + +## PlainPeptideIndexStruct + +Compact fixed-size tuple stored in the plain peptide index (`.idx` file) and loaded into `g_vRawPeptides` at runtime. Same core fields as `DBIndex` but without the `vector<char>` mod-site field (only unmodified peptides are stored here; modifications are layered on in `g_vFragmentPeptides`). ```cpp -struct DBIndex // CometDataInternal.h:377 +struct PlainPeptideIndexStruct // core/Types.h ``` | Field | Purpose | |-------|---------| -| `szPeptide[MAX_PEPTIDE_LEN]` | Peptide amino acid sequence. | -| `szPrevNextAA[2]` | Flanking residues (for enzyme termini check in index search). | -| `pcVarModSites[MAX_PEPTIDE_LEN_P2]` | Compact mod-site encoding (0-9; same scheme as `piVarModSites`). | -| `dPepMass` | MH+ mass; the sort key. | -| `lIndexProteinFilePosition` | Index into `g_pvProteinsList` mapping to a list of protein file offsets.
| +| `szPeptide[MAX_PEPTIDE_LEN]` | Peptide sequence (null-terminated). | +| `cPrevAA` / `cNextAA` | Flanking residues. | +| `dPepMass` | Unmodified MH+ mass. | +| `siVarModProteinFilter` | Protein filter bitfield. | +| `lIndexProteinFilePosition` | Row index into `g_pvProteinsList`. | --- -## PepMassInfo / SpectrumInfoInternal +## FragmentPeptidesStruct -Small structs embedded in each `Query`. +One entry in the fragment index peptide list (`g_vFragmentPeptides`). Represents one (peptide, mod-state) combination. Sorted by mass so that RunSearch can binary-search for mass-matching candidates. ```cpp -struct PepMassInfo // CometDataInternal.h:219 +struct FragmentPeptidesStruct // core/Types.h ``` -Stores the experimental MH+ mass (`dExpPepMass`) and the +/- tolerance window (`dPeptideMassToleranceMinus` / `dPeptideMassTolerancePlus`) pre-computed for fast range checks. + +| Field | Purpose | +|-------|---------| +| `iWhichPeptide` | Index into `g_vRawPeptides`; provides sequence and protein info. | +| `modNumIdx` | Index into `MOD_NUMBERS`; 0 = unmodified. | +| `dPepMass` | Modified MH+ mass (= unmodified mass + sum of applied mod masses). | +| `cNtermMod` / `cCtermMod` | N/C-terminal variable mod codes (index into `varModList`). | + +--- + +## ProteinsListCSR + +CSR (Compressed Sparse Row)-style storage for the per-peptide protein list. Replaces `vector<vector<comet_fileoffset_t>>` to eliminate the ~190 M individual heap allocations (one per inner vector) that caused a multi-minute free-time tail when building large MHC `.idx` files. ```cpp -struct SpectrumInfoInternal // CometDataInternal.h:228 +class ProteinsListCSR // core/Types.h +extern ProteinsListCSR g_pvProteinsList; ``` -Stores scan number, charge state, retention time, array size, and the nativeID string from mzML files. + +The external interface mirrors `vector<vector<comet_fileoffset_t>>`: `size()`, `empty()`, `clear()`, `reserve()`, `push_back(vector<comet_fileoffset_t>&&)`, `append_flat()`, `operator[](i)`, `at(i)`, range-for.
`operator[](i)` returns a lightweight `Row` proxy (`ptr` + `n`) with `size()`, `operator[]`, `begin()`/`end()`. Only two internal heap allocations regardless of how many rows are stored (`m_flat`: all protein file offsets concatenated; `m_off`: `[N+1]` uint64 CSR offsets). + +--- + +## SearchSession + +Owns all mutable state for one batch search run. Created once at the top of `CometSearchManager::DoSearch()` and passed by reference through `Pipeline` to `ISearchStrategy` implementations. + +```cpp +struct SearchSession // search/SearchSession.h +``` + +| Field | Purpose | +|-------|---------| +| `params` | `const StaticParams&` -- read-only reference to `g_staticParams`. | +| `queries` | `vector<Query*>` -- per-batch MS2 query accumulator (replaces global `g_pvQuery` for the batch path). Protected by `queriesMutex`. | +| `ms1Queries` | `vector<QueryMS1*>` -- per-batch MS1 query accumulator (replaces global `g_pvQueryMS1`). | +| `queriesMutex` | `std::mutex` -- guards `queries` and `ms1Queries` during parallel spectrum loading. | +| `bPerformDatabaseSearch` | Replaces the former global `g_bPerformDatabaseSearch`. | +| `bPerformSpecLibSearch` | Replaces the former global `g_bPerformSpecLibSearch`. | +| `bIdxNoFasta` | Replaces the former global `g_bIdxNoFasta`. | +| `bPlainPeptideIndexRead` | Local copy of index-read state for this run. | +| `bSpecLibRead` | Local copy of speclib-read state for this run. | +| `status` | Per-run `CometStatus`; `g_cometStatus` remains as a global for the RTS path. | + +`SearchSession` is non-copyable. The RTS paths (`DoSingleSpectrumSearchMultiResults`, `DoMS1SearchMultiResults`) do **not** use `SearchSession`; they use per-call `Query*`/`QueryMS1*` objects directly. + +--- + +## Pipeline and ISearchStrategy + +Added in Phase 5. `DoSearch()` instantiates a `Pipeline` + one concrete `ISearchStrategy` and calls `pipeline.run()`.
+ +```cpp +class ISearchStrategy // search/ISearchStrategy.h +class Pipeline // search/Pipeline.h +``` + +**ISearchStrategy** interface methods: + +| Method | Called | Purpose | +|--------|--------|---------| +| `initialize(session, tp)` | Once before file loop | Allocate pools, load/build index, pre-read precursors (FI_DB), read var-mod filter file (FASTA). | +| `openFiles(szDB, fpfasta, fpidx, fpdb, session)` | Once per file | Open DB file handles; set `session.bIdxNoFasta`. | +| `executeBatch(mstReader, firstScan, lastScan, analysisType, iPercentStart, iPercentEnd, tp, session)` | Once per batch | Preprocess + search + post-analysis for one spectrum batch; fills `session.queries`. | +| `closeFiles(fpfasta, fpidx)` | Once per file | Close file handles. | +| `finalize()` | Once after all files | Free memory pools and index arrays. | +| `isIndexBased()` | Any time | `true` for `FiStrategy`/`PiStrategy`; selects progress-message style in `Pipeline`. | + +**Concrete strategies:** + +| Class | File | DB type | Notes | +|-------|------|---------|-------| +| `FiStrategy` | `search/FiStrategy.cpp` | `FI_DB` | Fused load+search path when `bPerformDatabaseSearch && !bMango && !bPerformSpecLibSearch`; legacy three-sweep otherwise. | +| `FastaStrategy` | `search/FastaStrategy.cpp` | `FASTA_DB` | Classic three-sweep (load -> allocate -> RunSearch -> PostAnalysis). | +| `PiStrategy` | `search/PiStrategy.cpp` | `PI_DB` | Three-sweep like FASTA but against the plain peptide index; no Mango block. | + +**IResultWriter** (`output/IResultWriter.h`) is the parallel output abstraction. Each format (`TxtWriter`, `PepXmlWriter`, `SqtWriter`, `PercolatorWriter`, `MzIdentMlWriter`) implements `open()`, `write()`, `close()`. `Pipeline` holds a `vector<unique_ptr<IResultWriter>>` and calls them around the batch loop. + +--- + +## PepMassInfo / SpectrumInfoInternal + +Small structs embedded in each `Query`.
+ +```cpp +struct PepMassInfo // core/Types.h +struct SpectrumInfoInternal // core/Types.h +``` + +`PepMassInfo` stores the experimental MH+ mass (`dExpPepMass`) and the +/- tolerance window (`dPeptideMassToleranceMinus` / `dPeptideMassTolerancePlus`) pre-computed for fast range checks. + +`SpectrumInfoInternal` stores scan number, charge state, retention time, array size, Mango encoding, and the nativeID string from mzML files. --- @@ -171,7 +306,7 @@ Stores scan number, charge state, retention time, array size, and the nativeID s Passed through the FASTA search loop; holds data for a single protein from the database. ```cpp -typedef struct sDBEntry // CometDataInternal.h:348 +typedef struct sDBEntry // core/Types.h ``` | Field | Purpose | @@ -187,11 +322,11 @@ typedef struct sDBEntry // CometDataInternal.h:348 ## MassRange ```cpp -struct MassRange // CometDataInternal.h:243 +struct MassRange // CometDataInternal.h extern MassRange g_massRange; ``` -Computed once per spectrum batch from the lowest and highest precursor masses in `g_pvQuery`. Search threads read `dMinMass` / `dMaxMass` for early-exit decisions in `SearchForPeptides`. `iMaxFragmentCharge` caps the fragment ion charge loop. +Computed once per spectrum batch from the lowest and highest precursor masses in `SearchSession.queries`. Search threads read `dMinMass` / `dMaxMass` for early-exit decisions in `SearchForPeptides`. `iMaxFragmentCharge` caps the fragment ion charge loop. --- diff --git a/docs/GlobalVariables.md b/docs/GlobalVariables.md index 7a2184d8..59bc5ea0 100644 --- a/docs/GlobalVariables.md +++ b/docs/GlobalVariables.md @@ -1,6 +1,6 @@ # Global Variables Reference -All globals are defined in `CometSearch/CometSearchManager.cpp` (unless noted) and declared `extern` in `CometSearch/CometDataInternal.h`. +All globals are defined in `CometSearch/CometSearchManager.cpp` (unless noted) and declared `extern` in `CometSearch/CometDataInternal.h` or `CometSearch/core/Types.h`. 
--- @@ -16,13 +16,13 @@ All globals are defined in `CometSearch/CometSearchManager.cpp` (unless noted) a ## Spectrum batch containers -Used only in the batch search path (`DoSearch` -> `RunSearch`). The RTS paths do not touch these. +Used only in the batch search path (`DoSearch` -> `Pipeline` -> strategies). The RTS paths do not touch these. Batch-path query lists and per-run flags were moved from bare globals into `SearchSession` (defined in `search/SearchSession.h`) as part of the Phase 4-5 architecture migration. -| Variable | Type | Thread-safe? | Notes | -|----------|------|:------------:|-------| -| `g_pvQuery` | `vector` | Batch path only | One `Query*` per spectrum/charge combination for the current batch. Populated by `CometPreprocess`, consumed by `CometSearch` and `CometPostAnalysis`. Not safe for concurrent writes without `g_pvQueryMutex`. | -| `g_pvQueryMS1` | `vector` | Batch path only | Analogous to `g_pvQuery` for MS1 spectral library batch searches. | -| `g_pvQueryMutex` | `Mutex` | -- | Protects `g_pvQuery` insertions during batch preprocessing. | +| Variable | Type / Location | Thread-safe? | Notes | +|----------|----------------|:------------:|-------| +| `SearchSession::queries` | `vector` | Guarded by `queriesMutex` | One `Query*` per spectrum/charge combination for the current batch. Populated by `CometPreprocess`, consumed by `CometSearch` and `CometPostAnalysis`. Replaces the former global `g_pvQuery`. | +| `SearchSession::ms1Queries` | `vector` | Guarded by `queriesMutex` | Analogous to `queries` for MS1 spectral library batch searches. Replaces the former global `g_pvQueryMS1`. | +| `SearchSession::queriesMutex` | `std::mutex` | -- | Protects `queries` / `ms1Queries` insertions during batch preprocessing. Replaces the former `g_pvQueryMutex`. | | `g_pvInputFiles` | `vector` | Read-only after init | List of input files to search; set before `DoSearch()` begins. 
| --- @@ -31,15 +31,16 @@ Used only in the batch search path (`DoSearch` -> `RunSearch`). The RTS paths do Populated during index build / load; treated as read-only during all searches. Safe for concurrent reads from RTS threads. +The fragment index uses a **CSR (Compressed Sparse Row)** layout. For a given fragment mass bin `b`, the entries in `g_iFragmentIndex` occupy the half-open interval `[g_iFragmentIndexOffset[b], g_iFragmentIndexOffset[b+1])`, and the values stored there are indices into `g_vFragmentPeptides`. + | Variable | Type | Notes | |----------|------|-------| -| `g_iFragmentIndex` | `unsigned int**` | 2D array: `[BIN(fragment mass)][entry index]`. Each row lists which entries in `g_vFragmentPeptides` contain that fragment mass bin. | -| `g_iCountFragmentIndex` | `unsigned int*` | `[BIN(fragment mass)]` -- count of entries in each row of `g_iFragmentIndex`. | +| `g_iFragmentIndex` | `unsigned int*` | Flat CSR data array. Each element is an index into `g_vFragmentPeptides`. Entries for bin `b` span `[g_iFragmentIndexOffset[b], g_iFragmentIndexOffset[b+1])`. | +| `g_iFragmentIndexOffset` | `uint64_t*` | CSR offset array; length = (max bin + 1) + 1. Must be 64-bit -- the total entry count can exceed UINT_MAX for large databases with many variable mods. | | `g_vFragmentPeptides` | `vector` | Mass-sorted list of all (peptide, mod-state) combinations. Each entry references a row in `g_vRawPeptides` via `iWhichPeptide`. | | `g_vRawPeptides` | `vector` | List of unique unmodified peptide sequences with protein file-position pointers. | | `g_bIndexPrecursors` | `bool*` | Boolean bitmap over precursor mass bins; marks which precursor masses are present in the current input file(s). | | `g_bPeptideIndexRead` | `std::atomic` | Set to `true` once the peptide index has been fully loaded. Checked with `acquire` ordering before RTS searches begin. 
| -| `g_bPlainPeptideIndexRead` | `bool` | Set to `true` if the plain peptide index was read and a fragment index was generated from it. | --- @@ -49,9 +50,6 @@ Populated during index build / load; treated as read-only during all searches. S |----------|------|-------| | `g_vSpecLib` | `vector` | In-memory spectral library entries. Each entry holds peaks, charge, RT, and a unit-vector representation for dot-product scoring. | | `g_vulSpecLibPrecursorIndex` | `vector>` | Mass index into `g_vSpecLib`; maps precursor mass bins to library entry indices for fast lookup. | -| `g_bSpecLibRead` | `bool` | Set to `true` once the spectral library is fully loaded. | -| `g_bPerformSpecLibSearch` | `bool` | `true` if MS1 speclib search is active for this run. | -| `g_bPerformDatabaseSearch` | `bool` | `true` if FASTA/index database search is active for this run. | | `RetentionMatchHistory` | `std::deque` | Rolling window of (query RT, reference RT) pairs used by the MS1 RT aligner. Protected by `g_ms1AlignerMutex`. | --- @@ -60,9 +58,10 @@ Populated during index build / load; treated as read-only during all searches. S | Variable | Type | Notes | |----------|------|-------| -| `g_pvDBIndex` | `vector` | Peptide index entries (mass-sorted). Each entry holds peptide sequence, mass, var-mod encoding, and a protein file-position pointer. | -| `g_pvProteinNames` | `map` | Maps protein file-position to accession string and ordinal. | -| `g_pvProteinsList` | `vector>` | Maps index positions to lists of protein file offsets (for multi-protein peptides). | +| `g_pvDBIndex` | `vector` | Peptide index entries used during index build. Each entry holds peptide sequence, mass, var-mod encoding, and a protein file-position pointer. | +| `g_pvProteinNames` | `map` | Maps protein file-position to accession string and ordinal. Used for FASTA searches and legacy index paths. 
| +| `g_pvProteinsList` | `ProteinsListCSR` | Maps peptide index positions to lists of protein file offsets (for multi-protein peptides). `ProteinsListCSR` is a CSR-layout replacement for `vector>`; exposes the same `operator[]`/`size()`/range-for interface but uses only two heap allocations total. | +| `g_pvProteinNameCache` | `unordered_map` | Protein name lookup cache for index-based searches. Populated at index load time from the protein name blocks in the `.idx` file. Maps protein file-position offsets to accession strings. ~7 MB for a human target-decoy database. Allows O(1) protein name resolution during RTS without file I/O. | | `g_pvDIAWindows` | `vector` | Flat list of DIA isolation window edges (start, end, start, end, ...). Empty if not doing DIA. | --- @@ -87,6 +86,7 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). | `MOD_SEQ_MOD_NUM_START` / `MOD_SEQ_MOD_NUM_CNT` | `int*` -- index into `MOD_NUMBERS` per modifiable sequence. | | `PEPTIDE_MOD_SEQ_IDXS` | `int*` -- maps peptides to their modifiable sequence index. | | `MOD_NUM` | `int` -- total number of distinct modification combinations. | +| `g_vvvPepGenShort` / `g_vvvPepGenLong` | Per-thread peptide generation scratch buffers; populated during index build and reused across peptides to avoid repeated allocation. | --- @@ -94,7 +94,6 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). | Variable | Type | Notes | |----------|------|-------| -| `g_pvQueryMutex` | `Mutex` | Protects `g_pvQuery` insertions during batch preprocessing. | | `g_pvDBIndexMutex` | `Mutex` | Protects database index reads where concurrent access is possible. | | `g_preprocessMemoryPoolMutex` | `Mutex` | Protects the shared preprocessing memory pool. | | `g_searchMemoryPoolMutex` | `Mutex` | Protects the shared search memory pool. | @@ -119,7 +118,6 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). 
|----------|-------| | `g_bCometPreprocessMemoryAllocated` | `true` when `CometPreprocess::AllocateMemory()` has been called. | | `g_bCometSearchMemoryAllocated` | `true` when `CometSearch::AllocateMemory()` has been called. | -| `g_bIdxNoFasta` | `true` when searching a `.idx` file without the corresponding `.fasta` present. | --- @@ -154,14 +152,14 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). ``` Safe to read from any concurrent RTS thread (after init): - g_staticParams, g_iFragmentIndex, g_iCountFragmentIndex, + g_staticParams, g_iFragmentIndex, g_iFragmentIndexOffset, g_vFragmentPeptides, g_vRawPeptides, g_pvProteinNames, g_pvProteinsList, - g_vSpecLib, g_vulSpecLibPrecursorIndex, g_pvDIAWindows, + g_pvProteinNameCache, g_vSpecLib, g_vulSpecLibPrecursorIndex, g_pvDIAWindows, g_AScoreOptions, g_AScoreInterface, MOD_NUMBERS, MOD_SEQS, g_massRange.iMaxFragmentCharge (after batch setup) Written per batch (batch path only -- not touched by RTS): - g_pvQuery, g_pvQueryMS1, + SearchSession::queries, SearchSession::ms1Queries, g_massRange.dMinMass / dMaxMass / bNarrowMassRange, g_staticParams.databaseInfo.uliTotAACount diff --git a/docs/RealTimeSearch.md b/docs/RealTimeSearch.md index 30283240..e778c452 100644 --- a/docs/RealTimeSearch.md +++ b/docs/RealTimeSearch.md @@ -2,7 +2,7 @@ Comet supports two search modes: -- **Batch search**: `DoSearch()` -- reads a file, processes spectra in configurable batches, writes result files. +- **Batch search**: `DoSearch()` -- reads a file, processes spectra in configurable batches, writes result files. `DoSearch()` is orchestrated by a `Pipeline` that owns one concrete `ISearchStrategy` (`FiStrategy`, `FastaStrategy`, or `PiStrategy`) and a set of `IResultWriter` implementations. All mutable batch-run state (query lists, per-run flags) lives in a `SearchSession` struct passed by reference through the pipeline. 
- **Real-time search (RTS)**: called per-spectrum by an external C# application; returns results synchronously within the same call. Designed for concurrent calls from multiple threads. This document covers the RTS path. The design history and task-by-task implementation record are in `docs/20260227_RTS_THREAD_PLAN.md` (MS2) and `docs/20260228_MS1_THREAD_PLAN.md` (MS1). @@ -72,11 +72,12 @@ slow path: mutex-guarded check + initialization internally before returning CometSearch::AllocateMemory() re-allocate search pool freed by DoSearch() above ReadPlainPeptideIndex() loads g_vRawPeptides from the .idx file - CreateFragmentIndex(tp) builds g_iFragmentIndex in memory (CSR posting lists) + CreateFragmentIndex(tp) builds g_iFragmentIndex / g_iFragmentIndexOffset + in memory (CSR posting lists) -> singleSearchInitializationComplete.store(true, release) ``` -The `release` store ensures all threads that subsequently load the flag with `acquire` see a fully initialized `g_iFragmentIndex` and all other globals. +The `release` store ensures all threads that subsequently load the flag with `acquire` see a fully initialized `g_iFragmentIndex`, `g_iFragmentIndexOffset`, `g_pvProteinNameCache`, and all other globals. **Note on the index-build path:** When the `.idx` file is absent, `CreateFragmentIndex()` calls `DoSearch()` with `m_bRTSIndexBuild=true`. `DoSearch()` writes the `.idx` file, calls `CometSearch::DeallocateMemory()` to free the large FASTA-parse memory, then returns early (skipping the spec-lib and batch-search logic that follows in `DoSearch()`). `InitializeSingleSpectrumSearch()` then re-allocates the search pool before proceeding to load the index. 
@@ -113,7 +114,7 @@ DoSingleSpectrumSearchMultiResults(topN, charge, mz, masses, intensities, nPeaks +- CometPreprocess::PreprocessSingleSpectrumThreadLocal(charge, mz, masses, intensities) | -> allocates caller-owned Query* on the heap | -> fills it with binned spectrum data - | -> does NOT touch g_pvQuery + | -> does NOT touch SearchSession::queries | -> returns nullptr on failure (caller checks and returns false) | +- pdTmpSpectrum = new double[iArraySize] <- per-call allocation @@ -121,7 +122,8 @@ DoSingleSpectrumSearchMultiResults(topN, charge, mz, masses, intensities, nPeaks +- CometSearch::RunSearch(pQuery, tRealTimeStart) | -> allocates per-call bool* pbDuplFragment[] | -> SearchFragmentIndex(pQuery, pbDuplFragment, tRealTimeStart) - | reads g_iFragmentIndex / g_vFragmentPeptides (READ-ONLY) [x] + | reads g_iFragmentIndex / g_iFragmentIndexOffset (READ-ONLY) [x] + | reads g_vFragmentPeptides (READ-ONLY) [x] | XcorrScoreI(pQuery, ...) -- updates only pQuery->_pResults | CheckMassMatch(pQuery, dMass) -- reads only pQuery->_pepMassInfo | timeout checked against local tRealTimeStart @@ -133,6 +135,7 @@ DoSingleSpectrumSearchMultiResults(topN, charge, mz, masses, intensities, nPeaks +- CometPostAnalysis::CalculateAScorePro(pQuery, g_AScoreInterface) | +- sort _pResults by XCorr, extract top topN hits into output vectors + | protein names resolved via g_pvProteinNameCache.find(offset) [READ-ONLY, O(1)] [x] | +- cleanup_results: delete pQuery (destructor frees sparse arrays, _pResults[], accessMutex) @@ -149,7 +152,7 @@ DoMS1SearchMultiResults(dMaxMS1RTDiff, charge, mz, masses, intensities, nPeaks, | +- CometPreprocess::PreprocessMS1SingleSpectrumThreadLocal(charge, mz, masses, intensities) | -> allocates caller-owned QueryMS1* on the heap - | -> does NOT touch g_pvQueryMS1 + | -> does NOT touch SearchSession::ms1Queries | +- CometSpecLib::RunMS1Search(pQueryMS1, ...) 
| reads g_vSpecLib / g_vulSpecLibPrecursorIndex (READ-ONLY) [x] @@ -171,11 +174,12 @@ DoMS1SearchMultiResults(dMaxMS1RTDiff, charge, mz, masses, intensities, nPeaks, | State | RTS path | Notes | |-------|:--------:|-------| | `g_staticParams` | Read-only [x] | Set once at init; never written during search. | -| `g_iFragmentIndex` / `g_vFragmentPeptides` / `g_vRawPeptides` | Read-only [x] | Loaded at init; never modified. | +| `g_iFragmentIndex` / `g_iFragmentIndexOffset` | Read-only [x] | CSR index loaded at init; never modified. | +| `g_vFragmentPeptides` / `g_vRawPeptides` | Read-only [x] | Loaded at init; never modified. | | `g_vSpecLib` / `g_vulSpecLibPrecursorIndex` | Read-only [x] | Loaded at init. | -| `g_pvProteinNames` / `g_pvProteinsList` | Read-only [x] | Loaded at init. | +| `g_pvProteinNames` / `g_pvProteinsList` / `g_pvProteinNameCache` | Read-only [x] | Loaded at init. | | `g_AScoreOptions` / `g_AScoreInterface` | Read-only [x] | Pointer set at init; each call uses its own data. | -| `g_pvQuery` / `g_pvQueryMS1` | Not touched [x] | RTS path uses per-call `Query*` / `QueryMS1*`. | +| `SearchSession::queries` / `SearchSession::ms1Queries` | Not touched [x] | `SearchSession` is batch-path only. RTS path uses per-call `Query*` / `QueryMS1*`. | | `g_massRange` | Not written [x] | Mass limits derived from per-call `Query*._pepMassInfo`. | | `tRealTimeStart` | Per-call local [x] | Each call has its own `chrono::time_point`. | | `Query*` / `QueryMS1*` | Per-call heap [x] | Each call allocates and owns its object; freed at end. | @@ -266,7 +270,7 @@ For a new **search or per-spectrum call** that routes through `ICometSearchManag 2. Implement in `CometSearchManager.cpp` using the thread-local pattern: - Use `PreprocessSingleSpectrumThreadLocal()` (not `PreprocessSingleSpectrum()`). - Call `CometSearch::RunSearch(pQuery, tRealTimeStart)` (not `RunSearch(ThreadPool*)`). - - Never write `g_pvQuery`, `g_massRange`, or `g_staticParams` from within the call. 
+ - Never write `SearchSession` fields, `g_massRange`, or `g_staticParams` from within the call. 3. Add a managed wrapper method in `CometWrapper/CometWrapper.cpp` with `pin_ptr` for array parameters. 4. If new return types are needed, add wrapper structs to `CometDataWrapper.h` (and mirror in `CometData.h`). 5. Call from `RealtimeSearch/SearchMS1MS2.cs`. From 29b7cef373e8a2b92a850b9829ccc22c8114dd1f Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Tue, 16 Jun 2026 09:48:09 -0700 Subject: [PATCH 08/15] update .gitignore --- .gitignore | 8 +- MSToolkit/.gitignore | 8 +- docs/20260615_multiple_rts_instances.md | 134 ++++++++++++++++++++++++ 3 files changed, 142 insertions(+), 8 deletions(-) create mode 100644 docs/20260615_multiple_rts_instances.md diff --git a/.gitignore b/.gitignore index 4a36fb52..2a64db69 100644 --- a/.gitignore +++ b/.gitignore @@ -5,12 +5,6 @@ MSToolkit/extern/expat-2.2.9/ # user-specific Claude Code settings .claude/settings.local.json -# ignore dynamically generated files -MSToolkit/include/expat.h -MSToolkit/include/expat_external.h -MSToolkit/include/zconf.h -MSToolkit/include/zlib.h -MSToolkit/*.mri .DS_Store .idea @@ -157,4 +151,4 @@ ipch/ *.msp # Artifact of CodeQL -_codeql_detected_source_root \ No newline at end of file +_codeql_detected_source_root diff --git a/MSToolkit/.gitignore b/MSToolkit/.gitignore index fa724387..a38003a7 100644 --- a/MSToolkit/.gitignore +++ b/MSToolkit/.gitignore @@ -1,4 +1,10 @@ +include/expat.h include/expat_config.h +include/expat_external.h +include/zconf.h +include/zlib.h +include/zutil.h build/ .vs/ -*.json \ No newline at end of file +*.mri +*.json diff --git a/docs/20260615_multiple_rts_instances.md b/docs/20260615_multiple_rts_instances.md new file mode 100644 index 00000000..f081aa4f --- /dev/null +++ b/docs/20260615_multiple_rts_instances.md @@ -0,0 +1,134 @@ +# Multiple Concurrent RTS Instances: Design Options + +**Goal:** Allow N concurrent RTS instances in the same host process (or across 
processes), each running an independent set of search parameters, so that different subsets of spectra can be searched with different parameter sets simultaneously. + +--- + +## The Core Challenge + +All state that makes one search parameterization distinct from another is currently a process-wide singleton. There are two categories: + +**Must be per-instance (encode the parameter set):** +- `g_staticParams` -- the parameter root +- `g_iFragmentIndex` / `g_iFragmentIndexOffset` / `g_vFragmentPeptides` -- the index encodes enzyme cleavage, variable mods, and peptide length range; different params -> different index +- `MOD_NUMBERS` / `MOD_SEQS` / `PEPTIDE_MOD_SEQ_IDXS` -- mod permutation tables built from `variableModParameters` +- `CometSearch::_pbSearchMemoryPool` / `_ppbDuplFragmentArr` -- pool sized to param-set's thread count +- `g_AScoreOptions` / `g_AScoreInterface` -- if AScore settings differ +- `g_cometStatus` -- each instance needs independent error/cancel state +- All init flags and `singleSearchInitializationComplete` + +**Potentially shared (encode the database, not the params):** +- `g_vRawPeptides` (~300 MB) -- plain peptide sequences from the `.idx` file +- `g_pvProteinsList` (~200 MB CSR) -- protein file offsets per peptide +- `g_pvProteinNameCache` (~7 MB) -- protein name strings +- `g_pvProteinNames` -- indexed protein accessions +- `g_vSpecLib` / `g_vulSpecLibPrecursorIndex` -- if all instances use the same MS1 reference + +--- + +## Option A: Multiple processes + +Run N separate instances of the host application (or N `RealtimeSearch.exe` processes). Each process has its own address space and therefore its own independent copy of all globals. A C# coordinator routes spectra to the right process and aggregates results. + +**Zero C++ changes required.** Works today. + +**Pros:** Complete isolation, no lock contention between instances, simplest reasoning about state. + +**Cons:** N x full memory footprint per process. 
For a human target-decoy `.idx`, the fragment index alone is 3-8 GB; three instances = 9-24 GB just for the index. IPC cost for routing spectra and collecting results across process boundaries. + +**When to choose:** If memory is not constrained, or if the parameter sets are infrequently changed and process startup latency is acceptable. + +--- + +## Option B: Per-instance context struct (recommended long-term path) + +Move all process-global state into an `RtsContext` struct owned by each `CometSearchManager` instance. Multiple `CometSearchManager` objects can then coexist in the same process with fully independent state. + +```cpp +// New: CometSearch/RtsContext.h +struct RtsContext { + StaticParams params; + unsigned int* iFragmentIndex = nullptr; + uint64_t* iFragmentIndexOffset = nullptr; + vector vFragmentPeptides; + vector vRawPeptides; + ProteinsListCSR pvProteinsList; + unordered_map pvProteinNameCache; + map pvProteinNames; + bool* bIndexPrecursors = nullptr; + vector vSpecLib; + vector> vulSpecLibPrecursorIndex; + AScoreProCpp::AScoreOptions AScoreOptions; + AScoreProCpp::AScoreDllInterface* pAScoreInterface = nullptr; + vector MOD_NUMBERS; + vector MOD_SEQS; + // ... mod index arrays ... + bool* pbSearchMemoryPool = nullptr; + bool** ppbDuplFragmentArr = nullptr; + CometStatus status; + // init flags are already members of CometSearchManager +}; +``` + +`CometSearchManager` holds a `unique_ptr<RtsContext>`. Every internal function that currently reads `g_staticParams` receives a `const RtsContext&` (or `const StaticParams&`) instead. The `CometSearch` class static members `_pbSearchMemoryPool` / `_ppbDuplFragmentArr` become per-instance (either stored in `RtsContext` and passed in, or `CometSearch` becomes a non-static class). + +The C# side creates N `CometSearchManagerWrapper` objects -- a natural extension of what is already there. Each wrapper wraps one `CometSearchManager` which owns one `RtsContext`. 
Spectra are routed to the appropriate wrapper by the C# coordinator. + +**Pros:** Single process, low IPC overhead, easy result aggregation, no process-startup latency per instance. Full clean encapsulation -- no global state at all after the refactor. + +**Cons:** Memory cost is the same as multi-process (N x index size). The refactor touches ~15 `.cpp`/`.h` files everywhere `g_staticParams`, `g_iFragmentIndex`, etc. are referenced. It is mechanical but not small -- `g_staticParams` alone appears in roughly 30 call sites in `CometSearch.cpp`. + +**Scope estimate:** The most invasive change is threading `const RtsContext&` (or just `const StaticParams&` for the scoring-only functions) through the call chains in `CometSearch.cpp`, `CometPreprocess.cpp`, `CometPostAnalysis.cpp`, `CometFragmentIndex.cpp`. A staged approach works: start with `g_staticParams` (referenced everywhere), get that building cleanly, then migrate the index arrays. + +--- + +## Option C: Shared-database layer + per-param search layer + +Split `RtsContext` into two levels: + +```cpp +struct DatabaseContext { // shared via shared_ptr + vector vRawPeptides; + ProteinsListCSR pvProteinsList; + unordered_map pvProteinNameCache; + map pvProteinNames; +}; + +struct SearchContext { // per-instance + StaticParams params; + shared_ptr db; // shared + unsigned int* iFragmentIndex; + uint64_t* iFragmentIndexOffset; + vector vFragmentPeptides; + // ... pools, mod tables, AScore, status ... +}; +``` + +The `DatabaseContext` is loaded once from the `.idx` file (which encodes protein names and peptide sequences regardless of search params) and shared among all `SearchContext` instances that reference the same file. + +**Memory savings:** ~500 MB per extra instance on the sharable data. The fragment index still cannot be shared when mods or enzyme differ -- and at 3-8 GB that is the dominant cost. Savings are typically 10-20% for a human proteome use case with three instances. 
+ +**Pros:** Meaningful memory reduction if N is large or the base database is very large. + +**Cons:** Added complexity (two-level ownership, `shared_ptr` threading, database-identity matching). Benefit is modest when the index itself is not shared. + +**When to choose:** If all N instances use the same `.idx` file (guaranteed same database) AND memory is tight enough that 500 MB x N matters. + +--- + +## Special case: same database, same mods, different scoring params only + +If the only differences between instances are tolerance, ion series (a/b/c/x/y/z), minimum score, or similar scoring-time parameters -- things that do not affect which peptides are in the index -- then the entire fragment index (`g_iFragmentIndex`, `g_vFragmentPeptides`, `g_vRawPeptides`) is the same for all instances and can be shared. Only `g_staticParams` truly differs. + +In this case Option C degenerates to: share everything except `StaticParams` and the memory pool. The C++ scoring functions would receive a `const StaticParams&` argument instead of reading `g_staticParams` directly, which is a much smaller change than the full Option B refactor. + +--- + +## Recommendation + +| Timeframe | Choice | Reason | +|-----------|--------|--------| +| Immediately | **Option A** (multiple processes) | Zero C++ changes, works today, C# coordinator routes spectra | +| Long term | **Option B** (per-instance context) | Clean encapsulation, single process, natural extension of C# API, enables future optimizations including Option C | + +The key question that should drive which option is prioritized: **do the N param sets use the same `.idx` database file?** If yes and memory is a concern, the staging would be: Option B first, then selectively apply Option C's `shared_ptr` for the large read-only arrays as an optimization on top. 
From c971a2dd98f51578653b7fdc330c4a8861e17b7a Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Tue, 16 Jun 2026 16:15:42 -0700 Subject: [PATCH 09/15] refactor: Strategy/Pipeline architecture + bug fixes Major restructuring of the search engine into a Strategy/Pipeline pattern: - Extract core types into CometSearch/core/ (Types.h, Params.h, Constants.h) to break the monolithic CometDataInternal.h header - Add ISearchStrategy interface with FastaStrategy, FiStrategy, and PiStrategy concrete implementations, each owning file I/O and lifecycle for its db type - Add Pipeline orchestrator: drives the strategy init -> per-file loop -> writer loop -> finalize sequence, with correct cleanup on init or open failure - Add SearchSession to carry per-run mutable state (query list, status ref, search-mode flags) through the call chain without globals - Add IResultWriter base class with BuildNames() helper; add concrete writers (MzIdentMlWriter, PepXmlWriter, SqtWriter, TxtWriter, PercolatorWriter) that own their own file handles and lifecycle - Add SearchMemoryPool to own the per-thread duplicate-fragment scratch arrays; route all paths (FASTA batch and RTS) through a single s_pool authority, removing the legacy _pbSearchMemoryPool / g_searchMemoryPoolMutex / g_searchPoolCV split-tracking system - Extract RunSearchAndPostAnalysis() helper in SearchUtils.h; refactor all three strategy executeBatch() methods to use it, eliminating ~110 lines of duplication - Move BuildNames() extTargetCrux parameter to last position with nullptr default so non-CRUX callers do not need to supply a CRUX-specific argument Bug fixes included in this branch: - StorePeptideI: honour iDecoySearch==2 by storing scored decoy peptides in _pDecoys[] with correct threshold tracking (was silently discarding them) - SearchMS1Library: use pMS1Query->accessMutex instead of the global g_pvQueryMutex, restoring per-query thread safety in the RTS MS1 path - MzIdentMlWriter::OpenTmp: close() the fd returned by 
mkstemp() before calling fopen(), preventing one fd leak per mzIdentML batch on Linux - MzIdentMlWriter::FinalizeOne: report error when the temp file cannot be reopened for merge (was silently dropping the error) - AllocateResultsMem: move iXcorrHistogram memset before the per-result-slot loop so it executes once per query rather than iNumStored times - StorePeptideI: use short loop variable for decoy index recomputation to match StorePeptide() and avoid implicit int-to-short narrowing - WithinMassTolerancePeff: use dCalcPepMass + dMassAddition as the seek-back reference mass (was using dCalcPepMass alone, missing the PEFF delta) - Pipeline::run: call finalize() on strategy-init failure; call close() on all writers when a writer open() fails mid-list - SearchSession: remove dead fields bPlainPeptideIndexRead and bSpecLibRead (globals are authoritative; these were never wired up) - FastaStrategy::initialize: remove empty if-block left over from earlier draft - CometWritePercolator: pass protein name vectors by const reference to avoid copying on every search-hit output call Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 19 ++ CometSearch/CometSearch.cpp | 327 ++++++++++++--------- CometSearch/CometSearch.h | 12 +- CometSearch/CometSearchManager.cpp | 10 +- CometSearch/CometWritePercolator.cpp | 6 +- CometSearch/CometWritePercolator.h | 4 +- CometSearch/core/Types.h | 3 - CometSearch/output/IResultWriter.h | 27 ++ CometSearch/output/MzIdentMlWriter.h | 67 ++--- CometSearch/output/PepXmlWriter.h | 28 +- CometSearch/output/PercolatorWriter.h | 9 +- CometSearch/output/SqtWriter.h | 28 +- CometSearch/output/TxtWriter.h | 28 +- CometSearch/search/FastaStrategy.cpp | 97 +----- CometSearch/search/FiStrategy.cpp | 67 +---- CometSearch/search/PiStrategy.cpp | 42 +-- CometSearch/search/Pipeline.cpp | 23 +- CometSearch/search/SearchSession.h | 28 +- CometSearch/search/SearchUtils.h | 76 ++++- CometSearch/threading/SearchMemoryPool.cpp | 12 +- 
CometSearch/threading/SearchMemoryPool.h | 3 +- 21 files changed, 424 insertions(+), 492 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index c7cf3bcf..6c5e0ae1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -217,3 +217,22 @@ Rules for Claude Code: A `.gitattributes` file at the repo root enforces CRLF for all tracked source files at the git level, providing a second safety net. + + +## Development Workflows + +### Code Review Protocol (Copilot Mode) +When requested to perform a code review, always execute the following multi-step workflow before writing your feedback: +1. **Tooling Check:** Run the project's respective testing commands to gather concrete diagnostic data. +2. **Analysis:** Review the uncommitted files, staged changes, or the specified branch diff. +3. **Report Generation:** Structure the review using the exact template below. + +## Code Review Template +Provide feedback using this exact format: +1. **Summary:** A 1-2 sentence overview of the changes. +2. **Critical Issues:** Bugs, security vulnerabilities, or breaking changes. Provide the file path, exact line numbers, and the core issue. +3. **Code Quality & Maintainability:** Poor practices, anti-patterns, or missing tests. +4. **Actionable Improvements:** Specific refactoring suggestions accompanied by concise code snippets. + +*Constraint:* Keep critiques technical, objective, and ranked by severity. Avoid generic praise. + diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index 142f86e7..1c1c3694 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -21,13 +21,15 @@ #define BINARYSEARCHCUTOFF 20 // do linear search through FI if # entries is this or less -bool* CometSearch::_pbSearchMemoryPool = nullptr; bool** CometSearch::_ppbDuplFragmentArr = nullptr; // Module-local pool instance. Owns the same scratch arrays as the // legacy _pbSearchMemoryPool/_ppbDuplFragmentArr statics above. 
// Both representations are kept in sync during the transition: // AllocateMemory populates both; AcquirePoolSlot/releaseSlot use s_pool. +// TODO(Phase N): s_pool is a file-static singleton. Move it into a +// per-instance context (RtsContext / CometSearchManager member) before +// multiple concurrent RTS instances are viable. static SearchMemoryPool s_pool; extern comet_fileoffset_t clSizeCometFileOffset; @@ -57,12 +59,8 @@ bool CometSearch::AllocateMemory(int maxNumThreads) if (!s_pool.allocate(maxNumThreads, g_staticParams.iArraySizeGlobal)) return false; - // _pbSearchMemoryPool is the slot-availability array used by SearchThreadProc - // (FASTA_DB batch path). Allocate it separately; it is distinct from the - // scratch arrays owned by s_pool. try { - _pbSearchMemoryPool = new bool[maxNumThreads](); _ppbDuplFragmentArr = new bool*[maxNumThreads]; for (int i = 0; i < maxNumThreads; ++i) _ppbDuplFragmentArr[i] = s_pool.duplFragmentArr(i); @@ -89,11 +87,9 @@ bool CometSearch::DeallocateMemory(int /*maxNumThreads*/) s_pool.deallocate(); - delete[] _pbSearchMemoryPool; // _ppbDuplFragmentArr holds pointers into s_pool's scratch arrays; those // are already freed by s_pool.deallocate(). Only free the alias array itself. delete[] _ppbDuplFragmentArr; - _pbSearchMemoryPool = nullptr; _ppbDuplFragmentArr = nullptr; g_bCometSearchMemoryAllocated = false; @@ -996,7 +992,7 @@ bool CometSearch::RunSearch(int iPercentStart, bool CometSearch::RunSpecLibSearch(ThreadPool* /*tp*/) { - printf("OK in RunSpecLib\n"); + //printf("OK in RunSpecLib\n"); return true; } @@ -1257,44 +1253,22 @@ bool CometSearch::MapOBO(string strMod, void CometSearch::SearchThreadProc(SearchThreadData *pSearchThreadData, ThreadPool* /*tp*/) { - int i = -1; + int i = AcquirePoolSlot(); - // Grab available array from shared memory pool. 
- { - std::unique_lock lock(g_searchMemoryPoolMutex); - bool found = g_searchPoolCV.wait_for(lock, std::chrono::seconds(240), [&i]() { - for (int j = 0; j < g_staticParams.options.iNumThreads; ++j) - { - if (_pbSearchMemoryPool[j] == false) - { - _pbSearchMemoryPool[j] = true; - i = j; - return true; - } - } - return false; - }); - if (!found) - i = g_staticParams.options.iNumThreads; // sentinel: timeout - } - - if (i < 0 || i == g_staticParams.options.iNumThreads) + if (i < 0) { logerr(" Error - could not find available memory pool for MS2 search thread.\n"); return; } - // Give memory manager access to the thread. - pSearchThreadData->pbSearchMemoryPool = &_pbSearchMemoryPool[i]; - // Heap-allocate to avoid thread stack overflow: CometSearch has ~295 KB of // member arrays (_uiBinnedIonMasses, etc.) that would exhaust the 1 MB thread // stack in debug builds when combined with the deep DoSearch call chain. CometSearch* sqSearch = new CometSearch(); sqSearch->_iSlot = i; - sqSearch->_pQueries = pSearchThreadData->pQueries; - sqSearch->DoSearch(pSearchThreadData->dbEntry, _ppbDuplFragmentArr[i]); + sqSearch->DoSearch(pSearchThreadData->dbEntry, _ppbDuplFragmentArr[i], *pSearchThreadData->pQueries); delete sqSearch; + s_pool.releaseSlot(i); delete pSearchThreadData; pSearchThreadData = NULL; @@ -1302,8 +1276,11 @@ void CometSearch::SearchThreadProc(SearchThreadData *pSearchThreadData, bool CometSearch::DoSearch(sDBEntry dbe, - bool *pbDuplFragment) + bool *pbDuplFragment, + const vector& queries) { + _pQueries = &queries; + if (g_staticParams.options.bFastPlainPeptideIdx) { _seenShort.clear(); @@ -2064,25 +2041,6 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* /*tp*/, vector& queries } } -/* - for (vector::iterator it = g_pvQuery.begin(); it != g_pvQuery.end(); ++it) - { - int iNumMatchedPeptides = (*it)->iMatchPeptideCount; - if (iNumMatchedPeptides > g_staticParams.options.iNumStored) - iNumMatchedPeptides = g_staticParams.options.iNumStored; - - for (int x = 
0; x < iNumMatchedPeptides; x++) - { - printf("OK %d scan %d, pep %s, xcorr %f, mass %f, matchcount %d\n", x, - (*it)->_spectrumInfoInternal.iScanNumber, - (*it)->_pResults[x].szPeptide, - (*it)->_pResults[x].fXcorr, - (*it)->_pResults[x].dPepMass, - (*it)->iMatchPeptideCount; fflush(stdout); - } - } -*/ - delete[] lReadIndex; std::fclose(fp); return true; @@ -3286,7 +3244,7 @@ void CometSearch::SearchMS1Library(QueryMS1* pMS1Query, if (dScore > pMS1Query->_pSpecLibResultsMS1.fDotProduct) { - Threading::LockMutex(g_pvQueryMutex); + Threading::LockMutex(pMS1Query->accessMutex); if (dScore > pMS1Query->_pSpecLibResultsMS1.fDotProduct) { pMS1Query->_pSpecLibResultsMS1.fDotProduct = (float)dScore; @@ -3294,7 +3252,7 @@ void CometSearch::SearchMS1Library(QueryMS1* pMS1Query, pMS1Query->_pSpecLibResultsMS1.fRTime = (float)(g_vSpecLib.at(iWhichMS1LibEntry).fRTime * dMaxSpecLibRT / dMaxQueryRT); pMS1Query->_pSpecLibResultsMS1.iWhichSpecLib = g_vSpecLib.at(iWhichMS1LibEntry).iLibEntry; } - Threading::UnlockMutex(g_pvQueryMutex); + Threading::UnlockMutex(pMS1Query->accessMutex); } } else if (g_vSpecLib.at(iWhichMS1LibEntry).fRTime > dRT + dMaxMS1RTDiff) @@ -4391,7 +4349,7 @@ bool CometSearch::WithinMassTolerancePeff(double dCalcPepMass, // Seek back to first peptide entry that matches mass tolerance in case binary // search doesn't hit the first entry. 
- while (iPos > 0 && _pQueries->at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass) + while (iPos > 0 && _pQueries->at(iPos)->_pepMassInfo.dPeptideMassTolerancePlus >= dCalcPepMass + dMassAddition) iPos--; if (iPos != -1) @@ -8636,121 +8594,224 @@ void CometSearch::StorePeptideI(Query* pQuery, char* szProteinSeq, double dCalcPepMass, double dXcorr, - bool /*bDecoyPep*/, + bool bDecoyPep, int* piVarModSites, struct sDBEntry* dbe) { int iLenPeptide = iEndPos - iStartPos + 1; int iLenProteinMinus1 = (int)strlen(szProteinSeq) - 1; - short siLowestXcorrScoreIndex = pQuery->siLowestXcorrScoreIndex; + int iSizepiVarModSites = sizeof(int) * MAX_PEPTIDE_LEN_P2; + int iSizepdVarModSites = sizeof(double) * MAX_PEPTIDE_LEN_P2; - pQuery->iMatchPeptideCount++; - pQuery->_pResults[siLowestXcorrScoreIndex].usiLenPeptide = iLenPeptide; + if (g_staticParams.options.iDecoySearch == 2 && bDecoyPep) + { + short siLowestDecoyXcorrScoreIndex = pQuery->siLowestDecoyXcorrScoreIndex; - memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide, szProteinSeq + iStartPos, iLenPeptide * sizeof(char)); - pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide[iLenPeptide] = '\0'; - pQuery->_pResults[siLowestXcorrScoreIndex].dPepMass = dCalcPepMass; + pQuery->iDecoyMatchPeptideCount++; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].usiLenPeptide = iLenPeptide; - if (pQuery->_spectrumInfoInternal.usiChargeState > 2) - { - pQuery->_pResults[siLowestXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) - * pQuery->_spectrumInfoInternal.usiMaxFragCharge - * g_staticParams.ionInformation.iNumIonSeriesUsed; - } - else - { - pQuery->_pResults[siLowestXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) - * g_staticParams.ionInformation.iNumIonSeriesUsed; - } + memcpy(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].szPeptide, szProteinSeq + iStartPos, iLenPeptide * sizeof(char)); + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].szPeptide[iLenPeptide] = '\0'; + 
pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].dPepMass = dCalcPepMass; - pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr = (float)dXcorr; - pQuery->_pResults[siLowestXcorrScoreIndex].bClippedM = false; + if (pQuery->_spectrumInfoInternal.usiChargeState > 2) + { + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) + * pQuery->_spectrumInfoInternal.usiMaxFragCharge + * g_staticParams.ionInformation.iNumIonSeriesUsed; + } + else + { + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) + * g_staticParams.ionInformation.iNumIonSeriesUsed; + } - if (iStartPos == 0) - pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = '-'; - else - pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = szProteinSeq[iStartPos - 1]; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].fXcorr = (float)dXcorr; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].bClippedM = false; - if (iEndPos == iLenProteinMinus1) - pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = '-'; - else - pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = szProteinSeq[iEndPos + 1]; + if (iStartPos == 0) + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cPrevAA = '-'; + else + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cPrevAA = szProteinSeq[iStartPos - 1]; - pQuery->_pResults[siLowestXcorrScoreIndex].iPeffOrigResiduePosition = NO_PEFF_VARIANT; - pQuery->_pResults[siLowestXcorrScoreIndex].sPeffOrigResidues.clear(); - pQuery->_pResults[siLowestXcorrScoreIndex].iPeffNewResidueCount = 0; + if (iEndPos == iLenProteinMinus1) + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cNextAA = '-'; + else + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cNextAA = szProteinSeq[iEndPos + 1]; - pQuery->_pResults[siLowestXcorrScoreIndex].pWhichProtein.clear(); - pQuery->_pResults[siLowestXcorrScoreIndex].pWhichDecoyProtein.clear(); - pQuery->_pResults[siLowestXcorrScoreIndex].lProteinFilePosition = dbe->lProteinFilePosition; + 
pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].iPeffOrigResiduePosition = NO_PEFF_VARIANT; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].sPeffOrigResidues.clear(); + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].iPeffNewResidueCount = 0; - pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_None; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pWhichProtein.clear(); + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pWhichDecoyProtein.clear(); + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].lProteinFilePosition = dbe->lProteinFilePosition; - int iSizepiVarModSites = sizeof(int) * MAX_PEPTIDE_LEN_P2; - int iSizepdVarModSites = sizeof(double) * MAX_PEPTIDE_LEN_P2; + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cHasVariableMod = HasVariableModType_None; - if (g_staticParams.variableModParameters.bVarModSearch) - { - if (!iFoundVariableMod) + if (g_staticParams.variableModParameters.bVarModSearch) { - memset(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); - memset(pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + if (!iFoundVariableMod) + { + memset(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); + memset(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + } + else + { + memcpy(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].piVarModSites, piVarModSites, iSizepiVarModSites); + + int iVal; + for (int i = 0; i < iLenPeptide + 2; ++i) + { + iVal = pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].piVarModSites[i]; + + if (iVal > 0) + { + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[iVal - 1].dVarModMass; + + if (g_staticParams.options.iPrintAScoreProScore == -1 + || (g_staticParams.options.iPrintAScoreProScore > 0 && iVal == g_AScoreOptions.getSymbol() - '0')) + { + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cHasVariableMod = 
HasVariableModType_AScorePro; + } + else if (pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cHasVariableMod == HasVariableModType_None) + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].cHasVariableMod = HasVariableModType_True; + } + else + pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pdVarModSites[i] = 0.0; + } + } } else { - memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, piVarModSites, iSizepiVarModSites); + memset(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); + memset(pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + } - for (int i = 0; i < iLenPeptide + 2; ++i) + // Get new lowest decoy score. + pQuery->dLowestDecoyXcorrScore = pQuery->_pDecoys[0].fXcorr; + siLowestDecoyXcorrScoreIndex = 0; + + for (short siA = (short)(g_staticParams.options.iNumStored - 1); siA > 0; --siA) + { + if (pQuery->_pDecoys[siA].fXcorr < pQuery->dLowestDecoyXcorrScore || pQuery->_pDecoys[siA].usiLenPeptide == 0) { - if (piVarModSites[i] > 0) - pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[piVarModSites[i] - 1].dVarModMass; - else - pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = 0.0; + pQuery->dLowestDecoyXcorrScore = pQuery->_pDecoys[siA].fXcorr; + siLowestDecoyXcorrScoreIndex = siA; } + } + + pQuery->siLowestDecoyXcorrScoreIndex = siLowestDecoyXcorrScoreIndex; + } + else + { + short siLowestXcorrScoreIndex = pQuery->siLowestXcorrScoreIndex; + + pQuery->iMatchPeptideCount++; + pQuery->_pResults[siLowestXcorrScoreIndex].usiLenPeptide = iLenPeptide; + + memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide, szProteinSeq + iStartPos, iLenPeptide * sizeof(char)); + pQuery->_pResults[siLowestXcorrScoreIndex].szPeptide[iLenPeptide] = '\0'; + pQuery->_pResults[siLowestXcorrScoreIndex].dPepMass = dCalcPepMass; + + if (pQuery->_spectrumInfoInternal.usiChargeState > 2) + { + 
pQuery->_pResults[siLowestXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) + * pQuery->_spectrumInfoInternal.usiMaxFragCharge + * g_staticParams.ionInformation.iNumIonSeriesUsed; + } + else + { + pQuery->_pResults[siLowestXcorrScoreIndex].usiTotalIons = (iLenPeptide - 1) + * g_staticParams.ionInformation.iNumIonSeriesUsed; + } + + pQuery->_pResults[siLowestXcorrScoreIndex].fXcorr = (float)dXcorr; + pQuery->_pResults[siLowestXcorrScoreIndex].bClippedM = false; + + if (iStartPos == 0) + pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = '-'; + else + pQuery->_pResults[siLowestXcorrScoreIndex].cPrevAA = szProteinSeq[iStartPos - 1]; + + if (iEndPos == iLenProteinMinus1) + pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = '-'; + else + pQuery->_pResults[siLowestXcorrScoreIndex].cNextAA = szProteinSeq[iEndPos + 1]; + + pQuery->_pResults[siLowestXcorrScoreIndex].iPeffOrigResiduePosition = NO_PEFF_VARIANT; + pQuery->_pResults[siLowestXcorrScoreIndex].sPeffOrigResidues.clear(); + pQuery->_pResults[siLowestXcorrScoreIndex].iPeffNewResidueCount = 0; + + pQuery->_pResults[siLowestXcorrScoreIndex].pWhichProtein.clear(); + pQuery->_pResults[siLowestXcorrScoreIndex].pWhichDecoyProtein.clear(); + pQuery->_pResults[siLowestXcorrScoreIndex].lProteinFilePosition = dbe->lProteinFilePosition; - int iVal; - for (int i = 0; i < iLenPeptide + 2; ++i) + pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_None; + + if (g_staticParams.variableModParameters.bVarModSearch) + { + if (!iFoundVariableMod) + { + memset(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); + memset(pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + } + else { - iVal = pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites[i]; + memcpy(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, piVarModSites, iSizepiVarModSites); - if (iVal > 0) + for (int i = 0; i < iLenPeptide + 2; ++i) { - 
pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[iVal - 1].dVarModMass; + if (piVarModSites[i] > 0) + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[piVarModSites[i] - 1].dVarModMass; + else + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = 0.0; + } - if (g_staticParams.options.iPrintAScoreProScore == -1 - || (g_staticParams.options.iPrintAScoreProScore > 0 && iVal == g_AScoreOptions.getSymbol() - '0')) + int iVal; + for (int i = 0; i < iLenPeptide + 2; ++i) + { + iVal = pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites[i]; + + if (iVal > 0) { - pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_AScorePro; + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = g_staticParams.variableModParameters.varModList[iVal - 1].dVarModMass; + + if (g_staticParams.options.iPrintAScoreProScore == -1 + || (g_staticParams.options.iPrintAScoreProScore > 0 && iVal == g_AScoreOptions.getSymbol() - '0')) + { + pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_AScorePro; + } + else if (pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod == HasVariableModType_None) + pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_True; } - else if (pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod == HasVariableModType_None) - pQuery->_pResults[siLowestXcorrScoreIndex].cHasVariableMod = HasVariableModType_True; + else + pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = 0.0; } - else - pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites[i] = 0.0; } } - } - else - { - memset(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, 0, iSizepiVarModSites); - memset(pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); - } + else + { + memset(pQuery->_pResults[siLowestXcorrScoreIndex].piVarModSites, 0, 
iSizepiVarModSites); + memset(pQuery->_pResults[siLowestXcorrScoreIndex].pdVarModSites, 0, iSizepdVarModSites); + } - // Get new lowest score. - pQuery->dLowestXcorrScore = pQuery->_pResults[0].fXcorr; - siLowestXcorrScoreIndex = 0; + // Get new lowest score. + pQuery->dLowestXcorrScore = pQuery->_pResults[0].fXcorr; + siLowestXcorrScoreIndex = 0; - for (int i = g_staticParams.options.iNumStored - 1; i > 0; --i) - { - if (pQuery->_pResults[i].fXcorr < pQuery->dLowestXcorrScore || pQuery->_pResults[i].usiLenPeptide == 0) + for (int i = g_staticParams.options.iNumStored - 1; i > 0; --i) { - pQuery->dLowestXcorrScore = pQuery->_pResults[i].fXcorr; - siLowestXcorrScoreIndex = i; + if (pQuery->_pResults[i].fXcorr < pQuery->dLowestXcorrScore || pQuery->_pResults[i].usiLenPeptide == 0) + { + pQuery->dLowestXcorrScore = pQuery->_pResults[i].fXcorr; + siLowestXcorrScoreIndex = i; + } } - } - pQuery->siLowestXcorrScoreIndex = siLowestXcorrScoreIndex; + pQuery->siLowestXcorrScoreIndex = siLowestXcorrScoreIndex; + } } diff --git a/CometSearch/CometSearch.h b/CometSearch/CometSearch.h index 03c9105f..d812b9b8 100644 --- a/CometSearch/CometSearch.h +++ b/CometSearch/CometSearch.h @@ -39,23 +39,16 @@ struct SearchThreadData { sDBEntry dbEntry; - bool* pbSearchMemoryPool; ThreadPool* tp; const vector* pQueries; // batch query list; set before dispatch SearchThreadData() = default; SearchThreadData(const sDBEntry& dbEntry_in) - : dbEntry(dbEntry_in), pbSearchMemoryPool(nullptr), tp(nullptr), pQueries(nullptr) { + : dbEntry(dbEntry_in), tp(nullptr), pQueries(nullptr) { } ~SearchThreadData() { - if (pbSearchMemoryPool) - { - { std::lock_guard lk(g_searchMemoryPoolMutex); *pbSearchMemoryPool = false; } - g_searchPoolCV.notify_one(); - pbSearchMemoryPool = nullptr; - } dbEntry.vectorPeffMod.clear(); dbEntry.vectorPeffVariantSimple.clear(); } @@ -111,7 +104,7 @@ class CometSearch static void SearchThreadProc(SearchThreadData* pSearchThreadData, ThreadPool* tp); - bool DoSearch(sDBEntry 
dbe, bool* pbDuplFragment); + bool DoSearch(sDBEntry dbe, bool* pbDuplFragment, const vector& queries); // Performance: Mark as const where possible bool CheckEnzymeTermini(const char* szProteinSeq, @@ -383,7 +376,6 @@ class CometSearch static int AcquirePoolSlot(); // Spin-wait for a free slot; returns index or -1 on timeout - static bool *_pbSearchMemoryPool; // Pool of memory to be shared by search threads static bool **_ppbDuplFragmentArr; // Number of arrays equals number of threads int _iSlot = -1; // pool slot index; set by SearchThreadProc before DoSearch diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 4ddd92d7..3e16fe7d 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -62,14 +62,12 @@ MassRange g_massRange; Mutex g_pvQueryMutex; Mutex g_pvDBIndexMutex; Mutex g_preprocessMemoryPoolMutex; -Mutex g_searchMemoryPoolMutex; Mutex g_ms1AlignerMutex; CometStatus g_cometStatus; string g_sCometVersion; map g_pvProteinNames; // for either db index unordered_map g_pvProteinNameCache; // populated at index load; eliminates per-spectrum fopen in RTS path -std::condition_variable g_searchPoolCV; // signaled when a pool slot is released AScoreProCpp::AScoreOptions g_AScoreOptions; // AScore options // Thread-safety note - g_AScoreInterface is shared across PostAnalysis threads. 
@@ -347,9 +345,6 @@ CometSearchManager::CometSearchManager() : // Initialize the mutex we'll use to protect the preprocess memory pool Threading::InitMutex(&g_preprocessMemoryPoolMutex); - // Initialize the mutex we'll use to protect the search memory pool - Threading::InitMutex(&g_searchMemoryPoolMutex); - // Initialize the mutex we'll use to protect the MS1 RT aligner Threading::InitMutex(&g_ms1AlignerMutex); @@ -375,9 +370,6 @@ CometSearchManager::~CometSearchManager() // Destroy the mutex we used to protect the preprocess memory pool Threading::DestroyMutex(g_preprocessMemoryPoolMutex); - // Destroy the mutex we used to protect the search memory pool - Threading::DestroyMutex(g_searchMemoryPoolMutex); - // Destroy the mutex we used to protect the MS1 RT aligner Threading::DestroyMutex(g_ms1AlignerMutex); @@ -2130,7 +2122,7 @@ bool CometSearchManager::DoSearch() CometSpecLib::LoadSpecLib(g_staticParams.speclibInfo.strSpecLibFile); // Build search session with run-level flags. - SearchSession session(g_staticParams); + SearchSession session(g_staticParams, g_cometStatus); session.bPerformDatabaseSearch = g_bPerformDatabaseSearch; session.bPerformSpecLibSearch = g_bPerformSpecLibSearch; diff --git a/CometSearch/CometWritePercolator.cpp b/CometSearch/CometWritePercolator.cpp index 3e2bdac0..ddf9d78b 100644 --- a/CometSearch/CometWritePercolator.cpp +++ b/CometSearch/CometWritePercolator.cpp @@ -178,8 +178,8 @@ void CometWritePercolator::PrintPercolatorSearchHit(int iWhichQuery, int iPrintTargetDecoy, Results *pOutput, FILE *fpout, - vector vProteinTargets, - vector vProteinDecoys, + const vector& vProteinTargets, + const vector& vProteinDecoys, const vector& queries) { int iNterm; @@ -275,7 +275,7 @@ void CometWritePercolator::PrintPercolatorSearchHit(int iWhichQuery, else fprintf(fpout, "%c.%s.%c\t", pOutput[iWhichResult].cPrevAA, pOutput[iWhichResult].szPeptide, pOutput[iWhichResult].cNextAA); - std::vector::iterator it; + std::vector::const_iterator it; bool 
bPrintTab = false; if (iPrintTargetDecoy != 2) // if not decoy only, print target proteins diff --git a/CometSearch/CometWritePercolator.h b/CometSearch/CometWritePercolator.h index b9ad1f85..59b8c639 100644 --- a/CometSearch/CometWritePercolator.h +++ b/CometSearch/CometWritePercolator.h @@ -40,8 +40,8 @@ class CometWritePercolator int iPrintTargetDecoy, Results *pOutput, FILE *fpOut, - vector vProteinTargets, - vector vProteinDecoys, + const vector& vProteinTargets, + const vector& vProteinDecoys, const vector& queries); static void CalcNTTNMC(Results *pOutput, int iWhichQuery, diff --git a/CometSearch/core/Types.h b/CometSearch/core/Types.h index 6c550fa4..2f5fec6d 100644 --- a/CometSearch/core/Types.h +++ b/CometSearch/core/Types.h @@ -552,8 +552,6 @@ class ProteinsListCSR extern ProteinsListCSR g_pvProteinsList; extern std::unordered_map g_pvProteinNameCache; // file offset -> protein name string; populated at index load -extern std::condition_variable g_searchPoolCV; // notified when a pool slot is released - extern AScoreProCpp::AScoreOptions g_AScoreOptions; // AScore options extern AScoreProCpp::AScoreDllInterface* g_AScoreInterface; @@ -809,7 +807,6 @@ extern vector g_pvInputFiles; extern Mutex g_pvQueryMutex; extern Mutex g_pvDBIndexMutex; extern Mutex g_preprocessMemoryPoolMutex; -extern Mutex g_searchMemoryPoolMutex; extern Mutex g_dbIndexMutex; extern Mutex g_vSpecLibMutex; diff --git a/CometSearch/output/IResultWriter.h b/CometSearch/output/IResultWriter.h index 34d6fdc6..9625fdb6 100644 --- a/CometSearch/output/IResultWriter.h +++ b/CometSearch/output/IResultWriter.h @@ -20,6 +20,7 @@ #include class CometSearchManager; +class CometStatus; struct Query; // Parameters passed to each writer's open() method. 
@@ -34,6 +35,7 @@ struct WriterOpenCtx int iDecoySearch; // 0=off, 1=concat, 2=separate bool bIdxNoFasta; // .idx DB with no companion .fasta (mzIdentML) CometSearchManager* pMgr; // for format headers that need ICometSearchManager + CometStatus* pStatus = nullptr; // session error/cancel state (always set by Pipeline) }; // Parameters passed to each writer's write() method (per-batch). @@ -61,6 +63,31 @@ class IResultWriter // Write format footer (if any), close file(s), and optionally remove // them (bEmpty = iTotalSpectraSearched == 0). virtual void close(bool bSucceeded, bool bEmpty) = 0; + +protected: + // Shared output-filename builder used by all format writers. + static void BuildNames(const WriterOpenCtx& ctx, + const char* ext, + const char* extDecoy, + std::string& sTarget, + std::string& sDecoy, + const char* extTargetCrux = nullptr) + { + std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; + std::string range; + if (!ctx.bEntireFile) + range = "." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); +#ifdef CRUX + if (ctx.iDecoySearch == 2) + { sTarget = base + range + (extTargetCrux ? 
extTargetCrux : ext); sDecoy = base + range + extDecoy; } + else + sTarget = base + range + ext; +#else + (void)extTargetCrux; + sTarget = base + range + ext; + if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; +#endif + } }; #endif // _IRESULTWRITER_H_ diff --git a/CometSearch/output/MzIdentMlWriter.h b/CometSearch/output/MzIdentMlWriter.h index d8f95251..c4a48a51 100644 --- a/CometSearch/output/MzIdentMlWriter.h +++ b/CometSearch/output/MzIdentMlWriter.h @@ -28,16 +28,17 @@ class MzIdentMlWriter : public IResultWriter bool open(const WriterOpenCtx& ctx) override { _bIdxNoFasta = ctx.bIdxNoFasta; - BuildNames(ctx, ".mzid", ".decoy.mzid", ".target.mzid", _sTarget, _sDecoy); + _pStatus = ctx.pStatus; + BuildNames(ctx, ".mzid", ".decoy.mzid", _sTarget, _sDecoy, ".target.mzid"); _fpout = fopen(_sTarget.c_str(), "w"); if (!_fpout) { std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } - if (!OpenTmp(_sTarget, _sTgtTmp, _fpoutTmp)) return false; + if (!OpenTmp(_sTarget, _sTgtTmp, _fpoutTmp, ctx.pStatus)) return false; if (ctx.iDecoySearch == 2) { @@ -45,10 +46,10 @@ class MzIdentMlWriter : public IResultWriter if (!_fpoutd) { std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } - if (!OpenTmp(_sDecoy, _sDecTmp, _fpoutdTmp)) return false; + if (!OpenTmp(_sDecoy, _sDecTmp, _fpoutdTmp, ctx.pStatus)) return false; } return true; } @@ -74,33 +75,40 @@ class MzIdentMlWriter : public IResultWriter } private: - CometSearchManager* _pMgr = nullptr; - FILE* _fpout = nullptr; - FILE* _fpoutd = nullptr; - FILE* _fpoutTmp = nullptr; - FILE* _fpoutdTmp = nullptr; - FILE* _fpdb = nullptr; + CometSearchManager* _pMgr = nullptr; + CometStatus* 
_pStatus = nullptr; + FILE* _fpout = nullptr; + FILE* _fpoutd = nullptr; + FILE* _fpoutTmp = nullptr; + FILE* _fpoutdTmp = nullptr; + FILE* _fpdb = nullptr; bool _bIdxNoFasta = false; std::string _sTarget, _sDecoy, _sTgtTmp, _sDecTmp; - bool OpenTmp(const std::string& sBase, std::string& sTmp, FILE*& fp) + bool OpenTmp(const std::string& sBase, std::string& sTmp, FILE*& fp, CometStatus* pStatus) { sTmp = sBase + ".XXXXXX"; + bool bTmpOk; #ifdef _WIN32 - if (_mktemp_s(&sTmp[0], sTmp.size() + 1) != 0) + bTmpOk = (_mktemp_s(&sTmp[0], sTmp.size() + 1) == 0); #else - if (mkstemp(&sTmp[0]) == -1) + { + int fd = mkstemp(&sTmp[0]); + if (fd != -1) ::close(fd); // release kernel fd; fopen below opens its own handle + bTmpOk = (fd != -1); + } #endif + if (!bTmpOk) { std::string msg = " Error - cannot create temporary file \"" + sTmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } fp = fopen(sTmp.c_str(), "w"); if (!fp) { std::string msg = " Error - cannot write to temporary file \"" + sTmp + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } return true; @@ -120,6 +128,11 @@ class MzIdentMlWriter : public IResultWriter fclose(fpTmp); fpTmp = nullptr; if (!bEmpty) remove(sTmp.c_str()); } + else + { + std::string msg = " Error - cannot reopen temporary mzIdentML file \"" + sTmp + "\" for merge.\n"; + _pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); + } } else if (fpTmp) { @@ -129,28 +142,6 @@ class MzIdentMlWriter : public IResultWriter fclose(fpFinal); fpFinal = nullptr; } - static void BuildNames(const WriterOpenCtx& ctx, - const char* ext, - const char* extDecoy, - const char* extTargetCrux, - std::string& sTarget, - std::string& sDecoy) - { - std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; - std::string range; - if (!ctx.bEntireFile) - range = 
"." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); -#ifdef CRUX - if (ctx.iDecoySearch == 2) - { sTarget = base + range + extTargetCrux; sDecoy = base + range + extDecoy; } - else - sTarget = base + range + ext; -#else - (void)extTargetCrux; - sTarget = base + range + ext; - if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; -#endif - } }; #endif // _MZIDENTMLWRITER_H_ diff --git a/CometSearch/output/PepXmlWriter.h b/CometSearch/output/PepXmlWriter.h index ea93336c..8130010c 100644 --- a/CometSearch/output/PepXmlWriter.h +++ b/CometSearch/output/PepXmlWriter.h @@ -25,12 +25,12 @@ class PepXmlWriter : public IResultWriter public: bool open(const WriterOpenCtx& ctx) override { - BuildNames(ctx, ".pep.xml", ".decoy.pep.xml", ".target.pep.xml", _sTarget, _sDecoy); + BuildNames(ctx, ".pep.xml", ".decoy.pep.xml", _sTarget, _sDecoy, ".target.pep.xml"); if ((_fpout = fopen(_sTarget.c_str(), "w")) == NULL) { std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } if (!CometWritePepXML::WritePepXMLHeader(_fpout, *ctx.pMgr)) @@ -41,7 +41,7 @@ class PepXmlWriter : public IResultWriter if ((_fpoutd = fopen(_sDecoy.c_str(), "w")) == NULL) { std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } if (!CometWritePepXML::WritePepXMLHeader(_fpoutd, *ctx.pMgr)) @@ -78,28 +78,6 @@ class PepXmlWriter : public IResultWriter std::string _sTarget; std::string _sDecoy; - static void BuildNames(const WriterOpenCtx& ctx, - const char* ext, - const char* extDecoy, - const char* extTargetCrux, - std::string& sTarget, - std::string& sDecoy) - { - std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; - std::string 
range; - if (!ctx.bEntireFile) - range = "." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); -#ifdef CRUX - if (ctx.iDecoySearch == 2) - { sTarget = base + range + extTargetCrux; sDecoy = base + range + extDecoy; } - else - sTarget = base + range + ext; -#else - (void)extTargetCrux; - sTarget = base + range + ext; - if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; -#endif - } }; #endif // _PEPXMLWRITER_H_ diff --git a/CometSearch/output/PercolatorWriter.h b/CometSearch/output/PercolatorWriter.h index 1f6e3db7..8626ee82 100644 --- a/CometSearch/output/PercolatorWriter.h +++ b/CometSearch/output/PercolatorWriter.h @@ -25,17 +25,14 @@ class PercolatorWriter : public IResultWriter public: bool open(const WriterOpenCtx& ctx) override { - std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; - std::string range; - if (!ctx.bEntireFile) - range = "." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); - _sPath = base + range + ".pin"; + std::string sUnused; + BuildNames(ctx, ".pin", ".pin", _sPath, sUnused, ".pin"); _fpout = fopen(_sPath.c_str(), "w"); if (!_fpout) { std::string msg = " Error - cannot write to file \"" + _sPath + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } CometWritePercolator::WritePercolatorHeader(_fpout); diff --git a/CometSearch/output/SqtWriter.h b/CometSearch/output/SqtWriter.h index 5c6b4f09..bc2e8263 100644 --- a/CometSearch/output/SqtWriter.h +++ b/CometSearch/output/SqtWriter.h @@ -27,12 +27,12 @@ class SqtWriter : public IResultWriter { if (g_staticParams.options.bOutputSqtFile) { - BuildNames(ctx, ".sqt", ".decoy.sqt", ".target.sqt", _sTarget, _sDecoy); + BuildNames(ctx, ".sqt", ".decoy.sqt", _sTarget, _sDecoy, ".target.sqt"); if ((_fpout = fopen(_sTarget.c_str(), "w")) == NULL) { std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; - 
g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } CometWriteSqt::PrintSqtHeader(_fpout, *ctx.pMgr); @@ -42,7 +42,7 @@ class SqtWriter : public IResultWriter if ((_fpoutd = fopen(_sDecoy.c_str(), "w")) == NULL) { std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } CometWriteSqt::PrintSqtHeader(_fpoutd, *ctx.pMgr); @@ -77,28 +77,6 @@ class SqtWriter : public IResultWriter std::string _sTarget; std::string _sDecoy; - static void BuildNames(const WriterOpenCtx& ctx, - const char* ext, - const char* extDecoy, - const char* extTargetCrux, - std::string& sTarget, - std::string& sDecoy) - { - std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; - std::string range; - if (!ctx.bEntireFile) - range = "." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); -#ifdef CRUX - if (ctx.iDecoySearch == 2) - { sTarget = base + range + extTargetCrux; sDecoy = base + range + extDecoy; } - else - sTarget = base + range + ext; -#else - (void)extTargetCrux; - sTarget = base + range + ext; - if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; -#endif - } }; #endif // _SQTWRITER_H_ diff --git a/CometSearch/output/TxtWriter.h b/CometSearch/output/TxtWriter.h index 1f2e4d94..1d5f1014 100644 --- a/CometSearch/output/TxtWriter.h +++ b/CometSearch/output/TxtWriter.h @@ -28,12 +28,12 @@ class TxtWriter : public IResultWriter std::string ext = std::string(".") + ctx.szTxtFileExt; std::string extDecoy = std::string(".decoy.") + ctx.szTxtFileExt; std::string extTarget = std::string(".target.") + ctx.szTxtFileExt; - BuildNames(ctx, ext.c_str(), extDecoy.c_str(), extTarget.c_str(), _sTarget, _sDecoy); + BuildNames(ctx, ext.c_str(), extDecoy.c_str(), _sTarget, _sDecoy, extTarget.c_str()); if 
((_fpout = fopen(_sTarget.c_str(), "w")) == NULL) { std::string msg = " Error - cannot write to file \"" + _sTarget + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } CometWriteTxt::PrintTxtHeader(_fpout); @@ -44,7 +44,7 @@ class TxtWriter : public IResultWriter if ((_fpoutd = fopen(_sDecoy.c_str(), "w")) == NULL) { std::string msg = " Error - cannot write to decoy file \"" + _sDecoy + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, msg); logerr(msg); + ctx.pStatus->SetStatus(CometResult_Failed, msg); logerr(msg); return false; } CometWriteTxt::PrintTxtHeader(_fpoutd); @@ -78,28 +78,6 @@ class TxtWriter : public IResultWriter std::string _sTarget; std::string _sDecoy; - static void BuildNames(const WriterOpenCtx& ctx, - const char* ext, - const char* extDecoy, - const char* extTargetCrux, - std::string& sTarget, - std::string& sDecoy) - { - std::string base = std::string(ctx.szBaseName) + ctx.szOutputSuffix; - std::string range; - if (!ctx.bEntireFile) - range = "." + std::to_string(ctx.iFirstScan) + "-" + std::to_string(ctx.iLastScan); -#ifdef CRUX - if (ctx.iDecoySearch == 2) - { sTarget = base + range + extTargetCrux; sDecoy = base + range + extDecoy; } - else - sTarget = base + range + ext; -#else - (void)extTargetCrux; - sTarget = base + range + ext; - if (ctx.iDecoySearch == 2) sDecoy = base + range + extDecoy; -#endif - } }; #endif // _TXTWRITER_H_ diff --git a/CometSearch/search/FastaStrategy.cpp b/CometSearch/search/FastaStrategy.cpp index de4522b0..e417a823 100644 --- a/CometSearch/search/FastaStrategy.cpp +++ b/CometSearch/search/FastaStrategy.cpp @@ -21,32 +21,8 @@ #include "CometSearchManager.h" #include "MSReader.h" -bool FastaStrategy::initialize(SearchSession& session, ThreadPool* /*tp*/) +bool FastaStrategy::initialize(SearchSession& /*session*/, ThreadPool* /*tp*/) { - // Read protein variable-mod filter file (FASTA-only feature). 
- if (session.bPerformDatabaseSearch - && g_staticParams.variableModParameters.sProteinLModsListFile.length() > 0) - { - bool bVarModUsed = false; - for (int iMod = 0; iMod < VMODS; ++iMod) - { - if (g_staticParams.variableModParameters.varModList[iMod].dVarModMass != 0.0) - { - bVarModUsed = true; - break; - } - } - - if (bVarModUsed) - { - // ReadProteinVarModFilterFile() is a private member of CometSearchManager; - // it is called from DoSearch() before pipeline.run() for the FASTA path. - // This initialize() is called AFTER that call, so the filter is already loaded. - // Nothing to do here. (The call is retained in DoSearch() for the FASTA path - // only, which is handled before makeStrategy() is invoked.) - } - } - if (!CometPreprocess::AllocateMemory(g_staticParams.options.iNumThreads)) return false; @@ -70,7 +46,7 @@ bool FastaStrategy::openFiles(const std::string& szDatabase, if ((fpfasta = fopen(szDatabase.c_str(), "r")) == nullptr) { string strErrorMsg = " Error (1b) - cannot read sequence database file \"" + szDatabase + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + session.statusRef.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg); return false; } @@ -92,7 +68,7 @@ bool FastaStrategy::executeBatch(MSToolkit::MSReader& mstReader, fflush(stdout); } - g_cometStatus.SetStatusMsg(string("Loading and processing input spectra")); + session.statusRef.SetStatusMsg(string("Loading and processing input spectra")); bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); @@ -114,73 +90,10 @@ bool FastaStrategy::executeBatch(MSToolkit::MSReader& mstReader, string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); if (!g_staticParams.options.bOutputSqtStream) logout(strStatusMsg); - g_cometStatus.SetStatusMsg(strStatusMsg); - } - - if (g_staticParams.options.bMango) - { - int iCurrentScanNumber = 0; - int iMangoIndex = 0; - - 
std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex); - - for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) - { - if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) - { - iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; - iMangoIndex = 0; - } - else - { - iMangoIndex++; - } - sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", - (int)iMangoIndex / 2, (iMangoIndex % 2) ? 'B' : 'A'); - } - } - - std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); - - g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; - g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus; - - if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) - g_massRange.bNarrowMassRange = true; - else - g_massRange.bNarrowMassRange = false; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - return false; - - g_cometStatus.SetStatusMsg(string("Running search...")); - - if (session.bPerformDatabaseSearch) - bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); - if (bSucceeded && session.bPerformSpecLibSearch) - bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); - - if (!bSucceeded) - return false; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - return false; - - if (!g_staticParams.options.bOutputSqtStream) - { - logout(" - Post analysis:"); - fflush(stdout); - } - - if (session.bPerformDatabaseSearch) - { - g_cometStatus.SetStatusMsg(string("Performing post-search analysis ...")); - bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); + session.statusRef.SetStatusMsg(strStatusMsg); } - return bSucceeded; + return RunSearchAndPostAnalysis(iPercentStart, iPercentEnd, tp, session, true); } void 
FastaStrategy::closeFiles(FILE* fpfasta, FILE* fpidx) diff --git a/CometSearch/search/FiStrategy.cpp b/CometSearch/search/FiStrategy.cpp index 14e373cd..c646ccdd 100644 --- a/CometSearch/search/FiStrategy.cpp +++ b/CometSearch/search/FiStrategy.cpp @@ -101,7 +101,7 @@ bool FiStrategy::openFiles(const std::string& szDatabase, if ((fpidx = fopen(sTmpDB.c_str(), "r")) == nullptr) { string strErrorMsg = " Error (1a) - cannot read .idx file \"" + sTmpDB + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + session.statusRef.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg); return false; } @@ -132,7 +132,7 @@ bool FiStrategy::executeBatch(MSToolkit::MSReader& mstReader, if (bFused) { - g_cometStatus.SetStatusMsg(string("Running fused FI_DB search...")); + session.statusRef.SetStatusMsg(string("Running fused FI_DB search...")); bool bSucceeded = CometPreprocess::FusedLoadAndSearchSpectra( mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); @@ -145,7 +145,7 @@ bool FiStrategy::executeBatch(MSToolkit::MSReader& mstReader, // Legacy three-sweep path: LoadAndPreprocess -> AllocateResults -> // sort-by-mass -> RunSearch -> PostAnalysis. 
- g_cometStatus.SetStatusMsg(string("Loading and processing input spectra")); + session.statusRef.SetStatusMsg(string("Loading and processing input spectra")); bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); @@ -165,67 +165,10 @@ bool FiStrategy::executeBatch(MSToolkit::MSReader& mstReader, { string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); - g_cometStatus.SetStatusMsg(strStatusMsg); + session.statusRef.SetStatusMsg(strStatusMsg); } - if (g_staticParams.options.bMango) - { - int iCurrentScanNumber = 0; - int iMangoIndex = 0; - - std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex); - - for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) - { - if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) - { - iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; - iMangoIndex = 0; - } - else - { - iMangoIndex++; - } - sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", - (int)iMangoIndex / 2, (iMangoIndex % 2) ? 
'B' : 'A'); - } - } - - std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); - - g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; - g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus; - - if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) - g_massRange.bNarrowMassRange = true; - else - g_massRange.bNarrowMassRange = false; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - return false; - - g_cometStatus.SetStatusMsg(string("Running search...")); - - if (session.bPerformDatabaseSearch) - bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); - if (bSucceeded && session.bPerformSpecLibSearch) - bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); - - if (!bSucceeded) - return false; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - return false; - - if (session.bPerformDatabaseSearch) - { - g_cometStatus.SetStatusMsg(string("Performing post-search analysis ...")); - bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); - } - - return bSucceeded; + return RunSearchAndPostAnalysis(iPercentStart, iPercentEnd, tp, session); } void FiStrategy::closeFiles(FILE* fpfasta, FILE* fpidx) diff --git a/CometSearch/search/PiStrategy.cpp b/CometSearch/search/PiStrategy.cpp index fd55a592..e75af1f4 100644 --- a/CometSearch/search/PiStrategy.cpp +++ b/CometSearch/search/PiStrategy.cpp @@ -54,7 +54,7 @@ bool PiStrategy::openFiles(const std::string& szDatabase, if ((fpidx = fopen(sTmpDB.c_str(), "r")) == nullptr) { string strErrorMsg = " Error (1a) - cannot read .idx file \"" + sTmpDB + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + session.statusRef.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg); return false; } @@ -77,7 +77,7 @@ bool 
PiStrategy::executeBatch(MSToolkit::MSReader& mstReader, int& iPercentStart, int& iPercentEnd, ThreadPool* tp, SearchSession& session) { - g_cometStatus.SetStatusMsg(string("Loading and processing input spectra")); + session.statusRef.SetStatusMsg(string("Loading and processing input spectra")); bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); @@ -97,44 +97,10 @@ bool PiStrategy::executeBatch(MSToolkit::MSReader& mstReader, { string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); - g_cometStatus.SetStatusMsg(strStatusMsg); + session.statusRef.SetStatusMsg(strStatusMsg); } - std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); - - g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; - g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus; - - if (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass) - g_massRange.bNarrowMassRange = true; - else - g_massRange.bNarrowMassRange = false; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - return false; - - g_cometStatus.SetStatusMsg(string("Running search...")); - - if (session.bPerformDatabaseSearch) - bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); - if (bSucceeded && session.bPerformSpecLibSearch) - bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); - - if (!bSucceeded) - return false; - - bSucceeded = !g_cometStatus.IsError() && !g_cometStatus.IsCancel(); - if (!bSucceeded) - return false; - - if (session.bPerformDatabaseSearch) - { - g_cometStatus.SetStatusMsg(string("Performing post-search analysis ...")); - bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); - } - - return bSucceeded; + return RunSearchAndPostAnalysis(iPercentStart, iPercentEnd, tp, 
session); } void PiStrategy::closeFiles(FILE* fpfasta, FILE* fpidx) diff --git a/CometSearch/search/Pipeline.cpp b/CometSearch/search/Pipeline.cpp index 1dfdd239..2da9a96b 100644 --- a/CometSearch/search/Pipeline.cpp +++ b/CometSearch/search/Pipeline.cpp @@ -35,7 +35,10 @@ bool Pipeline::run(SearchSession& session, auto tGlobalStart = chrono::steady_clock::now(); if (!_strategy->initialize(session, &tp)) + { + _strategy->finalize(); return false; + } bool bSucceeded = true; int iTotalAllFiles = 0; // spectra searched across all files (for blank-file check) @@ -96,6 +99,7 @@ bool Pipeline::run(SearchSession& session, woctx.iDecoySearch = g_staticParams.options.iDecoySearch; woctx.bIdxNoFasta = session.bIdxNoFasta; woctx.pMgr = _pMgr; + woctx.pStatus = &session.statusRef; for (auto& pw : _writers) { @@ -108,6 +112,7 @@ bool Pipeline::run(SearchSession& session, if (!bSucceeded) { + for (auto& pw : _writers) pw->close(false, false); _strategy->closeFiles(fpfasta, fpidx); break; } @@ -128,6 +133,12 @@ bool Pipeline::run(SearchSession& session, int iTotalSpectraSearched = 0; int iBatchNum = 0; + auto cleanupBatch = [&]() + { + for (auto* q : session.queries) delete q; + session.queries.clear(); + }; + while (!CometPreprocess::DoneProcessingAllSpectra()) { iBatchNum++; @@ -138,7 +149,10 @@ bool Pipeline::run(SearchSession& session, &tp, session); if (!bSucceeded) - goto cleanup_results; + { + cleanupBatch(); + break; + } if (session.queries.empty()) continue; @@ -167,15 +181,12 @@ bool Pipeline::run(SearchSession& session, if (!pw->write(wwctx)) { bSucceeded = false; - goto cleanup_results; + break; } } } -cleanup_results: - for (auto it = session.queries.begin(); it != session.queries.end(); ++it) - delete (*it); - session.queries.clear(); + cleanupBatch(); if (!bSucceeded) break; diff --git a/CometSearch/search/SearchSession.h b/CometSearch/search/SearchSession.h index d3227698..8184fa51 100644 --- a/CometSearch/search/SearchSession.h +++ 
b/CometSearch/search/SearchSession.h @@ -21,10 +21,17 @@ // here — they are large, initialised once, and shared read-only across all threads. // // Phase 4 migration note: -// g_pvQueryMutex, g_bPlainPeptideIndexRead, g_bSpecLibRead, and g_cometStatus -// remain as globals because they are also accessed from the RTS path -// (InitializeSingleSpectrumSearch / DoSingleSpectrumSearchMultiResults), which -// does not use SearchSession. Full removal is deferred to Phase 5. +// g_pvQueryMutex, g_bPlainPeptideIndexRead, and g_bSpecLibRead remain as globals +// because they are also accessed from the RTS path (InitializeSingleSpectrumSearch / +// DoSingleSpectrumSearchMultiResults), which does not use SearchSession. +// SearchSession does not shadow these globals; all code reads the globals directly. +// Full removal is deferred to Phase 5. +// +// g_cometStatus is exposed here as statusRef: a reference to the process-wide +// singleton. Pipeline and strategy code use session.statusRef so they are not +// coupled to the global name; deep core files (CometSearch.cpp, CometPreprocess.cpp, +// etc.) still reference g_cometStatus directly because they have no SearchSession +// in scope. Both spellings touch the same object. #ifndef _SEARCHSESSION_H_ #define _SEARCHSESSION_H_ @@ -50,20 +57,17 @@ struct SearchSession // Mutex protecting queries and ms1Queries during parallel spectrum loading. std::mutex queriesMutex; - // Run-time flags (replace the five batch-path-only globals). + // Run-time flags (replace the batch-path-only globals). bool bPerformDatabaseSearch = false; bool bPerformSpecLibSearch = false; bool bIdxNoFasta = false; - bool bPlainPeptideIndexRead = false; - bool bSpecLibRead = false; - // Error / cancel state for this run. - // g_cometStatus remains as a global for the RTS path (Phase 5 will unify). - CometStatus status; + // Reference to the process-wide status singleton (g_cometStatus). 
+ CometStatus& statusRef; - explicit SearchSession(const StaticParams& p) : params(p) {} + explicit SearchSession(const StaticParams& p, CometStatus& st) : params(p), statusRef(st) {} SearchSession(const SearchSession&) = delete; SearchSession& operator=(const SearchSession&) = delete; }; -#endif // _SEARCHSESSION_H_ \ No newline at end of file +#endif // _SEARCHSESSION_H_ diff --git a/CometSearch/search/SearchUtils.h b/CometSearch/search/SearchUtils.h index 318b2625..371a5214 100644 --- a/CometSearch/search/SearchUtils.h +++ b/CometSearch/search/SearchUtils.h @@ -16,6 +16,8 @@ #include "Common.h" #include "CometDataInternal.h" +#include "CometSearch.h" +#include "CometPostAnalysis.h" // Shared inline utilities used by Pipeline and strategy classes. // All functions operate on globals (g_staticParams, g_cometStatus, etc.) @@ -168,6 +170,7 @@ inline static bool AllocateResultsMem(std::vector& queries) pQuery->iMatchPeptideCount = 0; pQuery->iDecoyMatchPeptideCount = 0; + memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram)); for (int j = 0; j < g_staticParams.options.iNumStored; ++j) { @@ -185,7 +188,6 @@ inline static bool AllocateResultsMem(std::vector& queries) pQuery->_pResults[j].pWhichProtein.clear(); pQuery->_pResults[j].sPeffOrigResidues.clear(); pQuery->_pResults[j].iPeffOrigResiduePosition = -9; - memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram)); if (g_staticParams.options.iDecoySearch) pQuery->_pResults[j].pWhichDecoyProtein.clear(); @@ -232,3 +234,75 @@ inline static bool compareByScanNumber(Query const* a, Query const* b) return (a->_spectrumInfoInternal.usiChargeState < b->_spectrumInfoInternal.usiChargeState); return (a->_spectrumInfoInternal.iScanNumber < b->_spectrumInfoInternal.iScanNumber); } + +// ----------------------------------------------------------------------- +// RunSearchAndPostAnalysis: shared batch-search body used by all strategies. 
+// Handles optional Mango reindexing, mass-range setup, RunSearch, and +// PostAnalysis. Set bLogPrePostAnalysis=true for FASTA-path verbose output. +// Called after LoadAndPreprocessSpectra + AllocateResultsMem succeed. +// ----------------------------------------------------------------------- +inline static bool RunSearchAndPostAnalysis(int iPercentStart, int iPercentEnd, + ThreadPool* tp, SearchSession& session, + bool bLogPrePostAnalysis = false) +{ + if (g_staticParams.options.bMango) + { + int iCurrentScanNumber = 0; + int iMangoIndex = 0; + + std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex); + + for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) + { + if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) + { + iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; + iMangoIndex = 0; + } + else + { + iMangoIndex++; + } + sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", + (int)iMangoIndex / 2, (iMangoIndex % 2) ? 
'B' : 'A'); + } + } + + std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); + + g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; + g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus; + g_massRange.bNarrowMassRange = (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass); + + bool bSucceeded = !session.statusRef.IsError() && !session.statusRef.IsCancel(); + if (!bSucceeded) + return false; + + session.statusRef.SetStatusMsg(string("Running search...")); + + if (session.bPerformDatabaseSearch) + bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); + if (bSucceeded && session.bPerformSpecLibSearch) + bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); + + if (!bSucceeded) + return false; + + bSucceeded = !session.statusRef.IsError() && !session.statusRef.IsCancel(); + if (!bSucceeded) + return false; + + if (bLogPrePostAnalysis && !g_staticParams.options.bOutputSqtStream) + { + logout(" - Post analysis:"); + fflush(stdout); + } + + if (session.bPerformDatabaseSearch) + { + session.statusRef.SetStatusMsg(string("Performing post-search analysis ...")); + bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); + } + + return bSucceeded; +} diff --git a/CometSearch/threading/SearchMemoryPool.cpp b/CometSearch/threading/SearchMemoryPool.cpp index 63b3103d..0fbc39f7 100644 --- a/CometSearch/threading/SearchMemoryPool.cpp +++ b/CometSearch/threading/SearchMemoryPool.cpp @@ -26,7 +26,7 @@ bool SearchMemoryPool::allocate(int nSlots, int iArraySize) try { _inUse = new bool[nSlots](); - _pool = new bool*[nSlots]; + _pool = new bool*[nSlots](); // value-init to nullptr so partial allocs are safe to delete[] for (int i = 0; i < nSlots; ++i) _pool[i] = new bool[iArraySize](); _nSlots = nSlots; @@ -35,6 +35,16 @@ bool SearchMemoryPool::allocate(int nSlots, int 
iArraySize) } catch (const std::bad_alloc& ba) { + // Free whatever was allocated before the throw. + if (_pool) + { + for (int k = 0; k < nSlots; ++k) + delete[] _pool[k]; // safe: unset slots are nullptr after value-init above + delete[] _pool; + _pool = nullptr; + } + delete[] _inUse; + _inUse = nullptr; std::string strErrorMsg = " Error - SearchMemoryPool::allocate failed. bad_alloc: " + std::string(ba.what()) + ".\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg); diff --git a/CometSearch/threading/SearchMemoryPool.h b/CometSearch/threading/SearchMemoryPool.h index 4a69b22b..d88ed5dd 100644 --- a/CometSearch/threading/SearchMemoryPool.h +++ b/CometSearch/threading/SearchMemoryPool.h @@ -21,6 +21,7 @@ #ifndef _SEARCHMEMORYPOOL_H_ #define _SEARCHMEMORYPOOL_H_ +#include #include #include @@ -44,7 +45,7 @@ class SearchMemoryPool void releaseSlot(int slot); // Returns the duplicate-fragment scratch array for a claimed slot. - bool* duplFragmentArr(int slot) const { return _pool[slot]; } + bool* duplFragmentArr(int slot) const { assert(slot >= 0 && slot < _nSlots); return _pool[slot]; } bool isAllocated() const { return _allocated; } int slotCount() const { return _nSlots; } From 0e10e71f532f9dc05d3685e556b604d521eedfc7 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Wed, 17 Jun 2026 12:11:05 -0700 Subject: [PATCH 10/15] fix: close exception-safety gap in SearchMemoryPool slot release, finish operator= and SearchUtils cleanup Fix pass for the architecture_update Strategy/Pipeline refactor: replace drift-prone hand-written operator= bodies in Params.h with = default, extract SearchUtils.h's non-trivial functions into SearchUtils.cpp plus a shared executeBatchLegacy() used by all three strategies, and correct the FusedLoadAndSearchSpectra batch-size counter to reflect processed rather than queued spectra. 
Also closes a gap found while re-reviewing that fix pass: the original SlotGuard added only to SearchThreadProc has been replaced with a shared SearchMemoryPoolSlotGuard (threading/SearchMemoryPool.h) applied at all five AcquirePoolSlot()/releaseSlot() sites in CometSearch.cpp, so an exception thrown out of any FI/PI search body -- not just the FASTA batch path -- can no longer leak a pool slot and stall the next acquireSlot() caller for up to 240s. The batch-FI per-query thread-pool lambda now also surfaces an AcquirePoolSlot() failure via bSucceeded instead of silently dropping the query. Co-Authored-By: Claude Sonnet 4.6 --- CometSearch/CometPreprocess.cpp | 4 +- CometSearch/CometSearch.cpp | 26 +- CometSearch/CometSearch.h | 7 +- CometSearch/CometSearch.vcxproj | 1 + CometSearch/CometSearchManager.cpp | 2 +- CometSearch/Makefile | 2 +- CometSearch/core/Params.h | 216 +-------------- CometSearch/search/FastaStrategy.cpp | 34 +-- CometSearch/search/FiStrategy.cpp | 26 +- CometSearch/search/PiStrategy.cpp | 26 +- CometSearch/search/Pipeline.cpp | 2 + CometSearch/search/SearchSession.h | 9 +- CometSearch/search/SearchUtils.cpp | 309 ++++++++++++++++++++++ CometSearch/search/SearchUtils.h | 292 ++------------------- CometSearch/threading/SearchMemoryPool.h | 11 + docs/20260617_codereview.md | 317 +++++++++++++++++++++++ docs/20260617_codereview2.md | 186 +++++++++++++ 17 files changed, 892 insertions(+), 578 deletions(-) create mode 100644 CometSearch/search/SearchUtils.cpp create mode 100644 docs/20260617_codereview.md create mode 100644 docs/20260617_codereview2.md diff --git a/CometSearch/CometPreprocess.cpp b/CometSearch/CometPreprocess.cpp index 23ca5197..241804f3 100644 --- a/CometSearch/CometPreprocess.cpp +++ b/CometSearch/CometPreprocess.cpp @@ -3253,7 +3253,6 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, int iFileLastScan = -1; int iScanNumber = 0; int iTotalScans = 0; - int iNumSpectraLoaded = 0; int iTmpCount = 0; Spectrum 
mstSpectrum; @@ -3360,7 +3359,6 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, if (CheckActivationMethodFilter(mstSpectrum.getActivationMethod())) { queue.push(std::move(mstSpectrum)); - iNumSpectraLoaded++; } } @@ -3384,7 +3382,7 @@ bool CometPreprocess::FusedLoadAndSearchSpectra(MSReader& mstReader, { std::lock_guard lk(session.queriesMutex); if (CheckExit(iAnalysisType, iScanNumber, iTotalScans, iLastScan, - mstReader.getLastScan(), iNumSpectraLoaded, 0)) + mstReader.getLastScan(), (int)session.queries.size(), 0)) { break; } diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index 1c1c3694..4dc013df 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -16,6 +16,7 @@ #include "CometSearch.h" #include "CometFragmentIndexReader.h" #include "threading/SearchMemoryPool.h" +#include #include @@ -124,8 +125,8 @@ bool CometSearch::RunSearch(Query* pQuery) logerr(" Error - could not acquire memory pool slot for thread-local FI search.\n"); return false; } + SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; SearchFragmentIndex(pQuery, _ppbDuplFragmentArr[iSlot]); - s_pool.releaseSlot(iSlot); } else if (g_staticParams.iDbType == DbType::PI_DB) // peptide index { @@ -166,8 +167,8 @@ bool CometSearch::RunSearch(Query* pQuery) logerr(" Error - could not acquire memory pool slot for thread-local PI search.\n"); return false; } + SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; SearchPeptideIndex(pQuery, _ppbDuplFragmentArr[iSlot]); - s_pool.releaseSlot(iSlot); } else { @@ -210,8 +211,8 @@ bool CometSearch::RunSearch(ThreadPool *tp, vector& queries) logerr(" Error - could not acquire memory pool slot for single-query FI search.\n"); return false; } + SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; SearchFragmentIndex(queries.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); - s_pool.releaseSlot(iSlot); } else if (g_staticParams.iDbType == DbType::PI_DB) // peptide index { @@ -252,23 +253,32 @@ bool 
CometSearch::RunSearch(int iPercentStart, ThreadPool* pSearchThreadPool = tp; size_t iEnd = queries.size(); + std::atomic bAllSlotsAcquired(true); for (size_t iWhichQuery = 0; iWhichQuery < iEnd; ++iWhichQuery) { - pSearchThreadPool->doJob([iWhichQuery, &queries]() { + pSearchThreadPool->doJob([iWhichQuery, &queries, &bAllSlotsAcquired]() { int iSlot = AcquirePoolSlot(); if (iSlot < 0) { logerr(" Error - could not acquire memory pool slot for batch FI search thread.\n"); + bAllSlotsAcquired = false; return; } + SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; SearchFragmentIndex(queries.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); - s_pool.releaseSlot(iSlot); }); } pSearchThreadPool->wait_on_threads(); + if (!bAllSlotsAcquired) + { + string strErrorMsg = " Error - one or more batch FI search queries could not acquire a memory pool slot.\n"; + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + bSucceeded = false; + } + if (!g_staticParams.options.bOutputSqtStream && !(g_staticParams.databaseInfo.iTotalNumProteins % 500)) { char szTmp[128]; @@ -921,8 +931,7 @@ bool CometSearch::RunSearch(int iPercentStart, // Now search sequence entry; add threading here so that // each protein sequence is passed to a separate thread. - SearchThreadData *pSearchThreadData = new SearchThreadData(dbe); - pSearchThreadData->pQueries = &queries; + SearchThreadData *pSearchThreadData = new SearchThreadData(dbe, &queries); pSearchThreadPool->doJob(std::bind(SearchThreadProc, pSearchThreadData, pSearchThreadPool)); @@ -1261,6 +1270,8 @@ void CometSearch::SearchThreadProc(SearchThreadData *pSearchThreadData, return; } + SearchMemoryPoolSlotGuard guard{s_pool, i}; + // Heap-allocate to avoid thread stack overflow: CometSearch has ~295 KB of // member arrays (_uiBinnedIonMasses, etc.) that would exhaust the 1 MB thread // stack in debug builds when combined with the deep DoSearch call chain. 
@@ -1268,7 +1279,6 @@ void CometSearch::SearchThreadProc(SearchThreadData *pSearchThreadData, sqSearch->_iSlot = i; sqSearch->DoSearch(pSearchThreadData->dbEntry, _ppbDuplFragmentArr[i], *pSearchThreadData->pQueries); delete sqSearch; - s_pool.releaseSlot(i); delete pSearchThreadData; pSearchThreadData = NULL; diff --git a/CometSearch/CometSearch.h b/CometSearch/CometSearch.h index d812b9b8..8b440567 100644 --- a/CometSearch/CometSearch.h +++ b/CometSearch/CometSearch.h @@ -40,11 +40,10 @@ struct SearchThreadData { sDBEntry dbEntry; ThreadPool* tp; - const vector* pQueries; // batch query list; set before dispatch + const vector* pQueries; - SearchThreadData() = default; - SearchThreadData(const sDBEntry& dbEntry_in) - : dbEntry(dbEntry_in), tp(nullptr), pQueries(nullptr) { + SearchThreadData(const sDBEntry& dbEntry_in, const vector* pQueries_in) + : dbEntry(dbEntry_in), tp(nullptr), pQueries(pQueries_in) { } ~SearchThreadData() diff --git a/CometSearch/CometSearch.vcxproj b/CometSearch/CometSearch.vcxproj index e0844c2b..81952f45 100644 --- a/CometSearch/CometSearch.vcxproj +++ b/CometSearch/CometSearch.vcxproj @@ -154,6 +154,7 @@ + diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 3e16fe7d..99e46e58 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -2122,7 +2122,7 @@ bool CometSearchManager::DoSearch() CometSpecLib::LoadSpecLib(g_staticParams.speclibInfo.strSpecLibFile); // Build search session with run-level flags. 
- SearchSession session(g_staticParams, g_cometStatus); + SearchSession session(g_cometStatus); session.bPerformDatabaseSearch = g_bPerformDatabaseSearch; session.bPerformSpecLibSearch = g_bPerformSpecLibSearch; diff --git a/CometSearch/Makefile b/CometSearch/Makefile index a61ce6f3..8cc8f509 100644 --- a/CometSearch/Makefile +++ b/CometSearch/Makefile @@ -29,7 +29,7 @@ COMETSEARCH_SRC = Threading CometInterfaces CometSearch CometPreprocess CometPos THREADING_SRC = threading/SearchMemoryPool -SEARCH_SRC = search/FiStrategy search/FastaStrategy search/PiStrategy search/Pipeline +SEARCH_SRC = search/SearchUtils search/FiStrategy search/FastaStrategy search/PiStrategy search/Pipeline COMETSEARCH_OBJ = $(addprefix $(OBJDIR)/, $(addsuffix .o, $(COMETSEARCH_SRC))) \ $(addprefix $(OBJDIR)/, $(addsuffix .o, $(THREADING_SRC))) \ diff --git a/CometSearch/core/Params.h b/CometSearch/core/Params.h index 178a9943..4d2eef53 100644 --- a/CometSearch/core/Params.h +++ b/CometSearch/core/Params.h @@ -95,72 +95,7 @@ struct Options char szActivationMethod[24]; // mzXML only string sPinProteinDelimiter; // PIN file protein delimiter; default tab - Options& operator=(Options& a) - { - iNumPeptideOutputLines = a.iNumPeptideOutputLines; - iWhichReadingFrame = a.iWhichReadingFrame; - iEnzymeTermini = a.iEnzymeTermini; - iNumStored = a.iNumStored; - iMaxDuplicateProteins = a.iMaxDuplicateProteins; - iSpectrumBatchSize = a.iSpectrumBatchSize; - iStartCharge = a.iStartCharge; - iEndCharge = a.iEndCharge; - iMaxFragmentCharge = a.iMaxFragmentCharge; - iMinPrecursorCharge = a.iMinPrecursorCharge; - iMaxPrecursorCharge = a.iMaxPrecursorCharge ; - iMSLevel = a.iMSLevel; - iMinPeaks = a.iMinPeaks; - iRemovePrecursor = a.iRemovePrecursor; - iDecoySearch = a.iDecoySearch; - iNumThreads = a.iNumThreads; - bResolveFullPaths = a.bResolveFullPaths; - bOutputSqtStream = a.bOutputSqtStream; - bOutputSqtFile = a.bOutputSqtFile; - bOutputTxtFile = a.bOutputTxtFile; - bOutputPepXMLFile = 
a.bOutputPepXMLFile; - iOutputMzIdentMLFile = a.iOutputMzIdentMLFile; - bOutputPercolatorFile = a.bOutputPercolatorFile; - bClipNtermMet = a.bClipNtermMet; - bClipNtermAA = a.bClipNtermAA; - bMango = a.bMango; - bScaleFragmentNL = a.bScaleFragmentNL; - bCreatePeptideIndex = a.bCreatePeptideIndex; - bCreateFragmentIndex = a.bCreateFragmentIndex; - bFastPlainPeptideIdx = a.bFastPlainPeptideIdx; - bVerboseOutput = a.bVerboseOutput; - bExplicitDeltaCn = a.bExplicitDeltaCn; - bPrintExpectScore = a.bPrintExpectScore; - iPrintAScoreProScore = a.iPrintAScoreProScore; - bExportAdditionalScoresPepXML = a.bExportAdditionalScoresPepXML; - iOverrideCharge = a.iOverrideCharge; - bCorrectMass = a.bCorrectMass; - bTreatSameIL = a.bTreatSameIL; - iMaxIndexRunTime = a.iMaxIndexRunTime; - lMaxIterations = a.lMaxIterations; - dMinIntensity = a.dMinIntensity; - dMinPercentageIntensity = a.dMinPercentageIntensity; - dRemovePrecursorTol = a.dRemovePrecursorTol; - dPeptideMassLow = a.dPeptideMassLow; - dPeptideMassHigh = a.dPeptideMassHigh; - dMinimumXcorr = a.dMinimumXcorr; - scanRange = a.scanRange; - peptideLengthRange = a.peptideLengthRange; - clearMzRange = a.clearMzRange; - strcpy(szActivationMethod, a.szActivationMethod); - sPinProteinDelimiter = a.sPinProteinDelimiter; - - dFragIndexMinMass = a.dFragIndexMinMass; - dFragIndexMaxMass = a.dFragIndexMaxMass; - iFragIndexMinIonsScore = a.iFragIndexMinIonsScore; - iFragIndexMinIonsReport = a.iFragIndexMinIonsReport ; - iFragIndexNumSpectrumPeaks = a.iFragIndexNumSpectrumPeaks; - iFragIndexSkipReadPrecursors = a.iFragIndexSkipReadPrecursors; - - dMS1MinMass = a.dMS1MinMass; - dMS1MaxMass = a.dMS1MaxMass; - - return *this; - } + Options& operator=(const Options&) = default; }; // The minimum and maximum mass range of all peptides to consider @@ -183,15 +118,7 @@ struct DBInfo int iTotalNumProteins; unsigned long int uliTotAACount; - DBInfo& operator=(DBInfo& a) - { - strcpy(szDatabase, a.szDatabase); - strcpy(szFileName, a.szFileName); - 
iTotalNumProteins = a.iTotalNumProteins; - uliTotAACount = a.uliTotAACount; - - return *this; - } + DBInfo& operator=(const DBInfo&) = default; }; struct SpecLibInfo // why a struct for just a string??? @@ -213,20 +140,7 @@ struct StaticMod double dAddNterminusProtein; double pdStaticMods[SIZE_MASS]; - StaticMod& operator=(StaticMod& a) - { - dAddCterminusPeptide = a.dAddCterminusPeptide; - dAddNterminusPeptide = a.dAddNterminusPeptide; - dAddCterminusProtein = a.dAddCterminusProtein; - dAddNterminusProtein = a.dAddNterminusProtein; - - for (int i = 0; i < SIZE_MASS; ++i) - { - pdStaticMods[i] = a.pdStaticMods[i]; - } - - return *this; - } + StaticMod& operator=(const StaticMod&) = default; }; struct PrecalcMasses @@ -237,16 +151,7 @@ struct PrecalcMasses int iMinus17; // BIN'd value of mass(NH3) int iMinus18; // BIN'd value of mass(H2O) - PrecalcMasses& operator=(PrecalcMasses& a) - { - dNtermProton = a.dNtermProton; - dCtermOH2Proton = a.dCtermOH2Proton; - dOH2ProtonCtermNterm = a.dOH2ProtonCtermNterm; - iMinus17 = a.iMinus17; - iMinus18 = a.iMinus18; - - return *this; - } + PrecalcMasses& operator=(const PrecalcMasses&) = default; }; struct VarModParams @@ -254,7 +159,7 @@ struct VarModParams bool bVarModSearch; // set to true if variable mods are specified bool bVarTermModSearch; // set to true if any n-term/c-term variable mods are specified bool bVarProteinNTermMod; // set to true if a protein n-term variable mod specified - bool bVarProteinCTermMod; // set to true if a protein c-term variable mod specified + bool bVarProteinCTermMod; // set to true if a protein c-term variable mod specified bool bBinaryModSearch; // set to true if any of the variable mods are of binary mod variety bool bUseFragmentNeutralLoss; // set to true if any custom NL is set; applied only to 1+ and 2+ fragments bool bRareVarModPresent; // set to true if any of iRequireThisMod == -1 @@ -270,32 +175,7 @@ struct VarModParams vector vdCompoundMasses; // sorted, deduplicated list of masses 
read from sCompoundModsFile unsigned int uiNumCompoundMasses; // vdCompoundMasses.size(); 0 when feature is disabled - VarModParams& operator=(VarModParams& a) - { - bVarModSearch = a.bVarModSearch; - bVarTermModSearch = a.bVarTermModSearch; - bVarProteinNTermMod = a.bVarProteinNTermMod; - bVarProteinCTermMod = a.bVarProteinCTermMod; - bBinaryModSearch = a.bBinaryModSearch; - bUseFragmentNeutralLoss = a.bUseFragmentNeutralLoss; - bRareVarModPresent = a.bRareVarModPresent; - bVarModProteinFilter = a.bVarModProteinFilter; - iRequireVarMod = a.iRequireVarMod; - iMaxVarModPerPeptide = a.iMaxVarModPerPeptide; - iMaxPermutations = a.iMaxPermutations; - - for (int i = 0; i < VMODS; ++i) - { - varModList[i] = a.varModList[i]; - cModCode[i] = a.cModCode[i]; - } - - sCompoundModsFile = a.sCompoundModsFile; - vdCompoundMasses = a.vdCompoundMasses; - uiNumCompoundMasses = a.uiNumCompoundMasses; - - return *this; - } + VarModParams& operator=(const VarModParams&) = default; }; struct MassUtil @@ -313,27 +193,7 @@ struct MassUtil double pdAAMassFragment[SIZE_MASS]; double pdAAMassUser[SIZE_MASS]; // user defined default amino acid masses - MassUtil& operator=(MassUtil& a) - { - bMonoMassesParent = a.bMonoMassesParent; - bMonoMassesFragment = a.bMonoMassesFragment; - dCO = a.dCO; - dNH3 = a.dNH3; - dNH2 = a.dNH2; - dH2O = a.dH2O; - dCOminusH2 = a.dCOminusH2; - dOH2fragment = a.dOH2fragment; - dOH2parent = a.dOH2parent; - - for (int i = 0; i < SIZE_MASS; ++i) - { - pdAAMassParent[i] = a.pdAAMassParent[i]; - pdAAMassFragment[i] = a.pdAAMassFragment[i]; - pdAAMassUser[i] = a.pdAAMassUser[i]; - } - - return *this; - } + MassUtil& operator=(const MassUtil&) = default; }; struct ToleranceParams @@ -348,20 +208,7 @@ struct ToleranceParams double dMS1BinSize; double dMS1BinStartOffset; - ToleranceParams& operator=(ToleranceParams& a) - { - iMassToleranceUnits = a.iMassToleranceUnits; - iMassToleranceType = a.iMassToleranceType; - iIsotopeError = a.iIsotopeError; - dInputToleranceMinus = 
a.dInputToleranceMinus; - dInputTolerancePlus = a.dInputTolerancePlus; - dFragmentBinSize = a.dFragmentBinSize; - dFragmentBinStartOffset = a.dFragmentBinStartOffset; - dMS1BinSize = a.dMS1BinSize; - dMS1BinStartOffset = a.dMS1BinStartOffset; - - return *this; - } + ToleranceParams& operator=(const ToleranceParams&) = default; }; struct IonInfo @@ -372,20 +219,7 @@ struct IonInfo int iTheoreticalFragmentIons; int iIonVal[NUM_ION_SERIES]; - IonInfo& operator=(IonInfo& a) - { - iNumIonSeriesUsed = a.iNumIonSeriesUsed; - bUseWaterAmmoniaLoss = a.bUseWaterAmmoniaLoss; - iTheoreticalFragmentIons = a.iTheoreticalFragmentIons; - - for (int i = 0; i < NUM_ION_SERIES; ++i) - { - piSelectedIonSeries[i] = a.piSelectedIonSeries[i]; - iIonVal[i] = a.iIonVal[i]; - } - - return *this; - } + IonInfo& operator=(const IonInfo&) = default; }; // static user params, won't change per thread - can make global! @@ -430,37 +264,7 @@ struct StaticParams RestoreDefaults(); } - StaticParams& operator=(StaticParams& a) - { - sHostName = a.sHostName; - strcpy(szMod, a.szMod); - strcpy(szDecoyPrefix, a.szDecoyPrefix); - strcpy(szOutputSuffix, a.szOutputSuffix); - strcpy(szTxtFileExt, a.szTxtFileExt); - vectorMassOffsets = a.vectorMassOffsets; - precursorNLIons= a.precursorNLIons; - iPrecursorNLSize = a.iPrecursorNLSize; - iOldModsEncoding = a.iOldModsEncoding; - iElapseTime = a.iElapseTime; - strcpy(szDate, a.szDate); - options = a.options; - databaseInfo = a.databaseInfo; - speclibInfo = a.speclibInfo; - inputFile = a.inputFile; - bPrintDuplReferences = a.bPrintDuplReferences; - variableModParameters = a.variableModParameters; - tolerances = a.tolerances; - staticModifications = a.staticModifications; - precalcMasses = a.precalcMasses; - enzymeInformation = a.enzymeInformation; - massUtility = a.massUtility; - dInverseBinWidth = a.dInverseBinWidth; - iArraySizeGlobal = a.iArraySizeGlobal; - dOneMinusBinOffset = a.dOneMinusBinOffset; - iXcorrProcessingOffset = a.iXcorrProcessingOffset; - 
ionInformation = a.ionInformation; - return *this; - } + StaticParams& operator=(const StaticParams&) = default; void RestoreDefaults() { diff --git a/CometSearch/search/FastaStrategy.cpp b/CometSearch/search/FastaStrategy.cpp index e417a823..dc0ab427 100644 --- a/CometSearch/search/FastaStrategy.cpp +++ b/CometSearch/search/FastaStrategy.cpp @@ -62,38 +62,8 @@ bool FastaStrategy::executeBatch(MSToolkit::MSReader& mstReader, int& iPercentStart, int& iPercentEnd, ThreadPool* tp, SearchSession& session) { - if (!g_staticParams.options.bOutputSqtStream) - { - logout(" - Load spectra:"); - fflush(stdout); - } - - session.statusRef.SetStatusMsg(string("Loading and processing input spectra")); - - bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( - mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); - - iPercentStart = iPercentEnd; - iPercentEnd = mstReader.getPercent(); - - if (!bSucceeded) - return false; - - if (session.queries.empty()) - return true; - - bSucceeded = AllocateResultsMem(session.queries); - if (!bSucceeded) - return false; - - { - string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); - if (!g_staticParams.options.bOutputSqtStream) - logout(strStatusMsg); - session.statusRef.SetStatusMsg(strStatusMsg); - } - - return RunSearchAndPostAnalysis(iPercentStart, iPercentEnd, tp, session, true); + return executeBatchLegacy(mstReader, iFirstScan, iLastScan, iAnalysisType, + iPercentStart, iPercentEnd, tp, session, true); } void FastaStrategy::closeFiles(FILE* fpfasta, FILE* fpidx) diff --git a/CometSearch/search/FiStrategy.cpp b/CometSearch/search/FiStrategy.cpp index c646ccdd..4c7c45d9 100644 --- a/CometSearch/search/FiStrategy.cpp +++ b/CometSearch/search/FiStrategy.cpp @@ -145,30 +145,8 @@ bool FiStrategy::executeBatch(MSToolkit::MSReader& mstReader, // Legacy three-sweep path: LoadAndPreprocess -> AllocateResults -> // sort-by-mass -> RunSearch -> PostAnalysis. 
- session.statusRef.SetStatusMsg(string("Loading and processing input spectra")); - - bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( - mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); - - iPercentStart = iPercentEnd; - iPercentEnd = mstReader.getPercent(); - - if (!bSucceeded) - return false; - - if (session.queries.empty()) - return true; // no spectra in this batch; caller will continue to next - - bSucceeded = AllocateResultsMem(session.queries); - if (!bSucceeded) - return false; - - { - string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); - session.statusRef.SetStatusMsg(strStatusMsg); - } - - return RunSearchAndPostAnalysis(iPercentStart, iPercentEnd, tp, session); + return executeBatchLegacy(mstReader, iFirstScan, iLastScan, iAnalysisType, + iPercentStart, iPercentEnd, tp, session, false); } void FiStrategy::closeFiles(FILE* fpfasta, FILE* fpidx) diff --git a/CometSearch/search/PiStrategy.cpp b/CometSearch/search/PiStrategy.cpp index e75af1f4..d4344764 100644 --- a/CometSearch/search/PiStrategy.cpp +++ b/CometSearch/search/PiStrategy.cpp @@ -77,30 +77,8 @@ bool PiStrategy::executeBatch(MSToolkit::MSReader& mstReader, int& iPercentStart, int& iPercentEnd, ThreadPool* tp, SearchSession& session) { - session.statusRef.SetStatusMsg(string("Loading and processing input spectra")); - - bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( - mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); - - iPercentStart = iPercentEnd; - iPercentEnd = mstReader.getPercent(); - - if (!bSucceeded) - return false; - - if (session.queries.empty()) - return true; - - bSucceeded = AllocateResultsMem(session.queries); - if (!bSucceeded) - return false; - - { - string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); - session.statusRef.SetStatusMsg(strStatusMsg); - } - - return RunSearchAndPostAnalysis(iPercentStart, iPercentEnd, tp, session); + return executeBatchLegacy(mstReader, 
iFirstScan, iLastScan, iAnalysisType, + iPercentStart, iPercentEnd, tp, session, false); } void PiStrategy::closeFiles(FILE* fpfasta, FILE* fpidx) diff --git a/CometSearch/search/Pipeline.cpp b/CometSearch/search/Pipeline.cpp index 2da9a96b..4f677ff5 100644 --- a/CometSearch/search/Pipeline.cpp +++ b/CometSearch/search/Pipeline.cpp @@ -137,6 +137,8 @@ bool Pipeline::run(SearchSession& session, { for (auto* q : session.queries) delete q; session.queries.clear(); + for (auto* q : session.ms1Queries) delete q; + session.ms1Queries.clear(); }; while (!CometPreprocess::DoneProcessingAllSpectra()) diff --git a/CometSearch/search/SearchSession.h b/CometSearch/search/SearchSession.h index 8184fa51..e595c06c 100644 --- a/CometSearch/search/SearchSession.h +++ b/CometSearch/search/SearchSession.h @@ -17,8 +17,8 @@ // Passed by reference to pipeline functions that read or write per-run state. // // Read-only index globals (g_iFragmentIndex, g_vFragmentPeptides, g_vRawPeptides, -// g_vSpecLib, g_pvProteinsList, g_pvProteinNameCache, g_pvDBIndex, …) are NOT moved -// here — they are large, initialised once, and shared read-only across all threads. +// g_vSpecLib, g_pvProteinsList, g_pvProteinNameCache, g_pvDBIndex, ...) are NOT moved +// here -- they are large, initialised once, and shared read-only across all threads. // // Phase 4 migration note: // g_pvQueryMutex, g_bPlainPeptideIndexRead, and g_bSpecLibRead remain as globals @@ -44,9 +44,6 @@ struct SearchSession { - // Run parameters — set once before the file loop, then read-only. - const StaticParams& params; - // Per-batch MS2 result accumulator. // Guarded by queriesMutex in the batch path. std::vector queries; @@ -65,7 +62,7 @@ struct SearchSession // Reference to the process-wide status singleton (g_cometStatus). 
CometStatus& statusRef; - explicit SearchSession(const StaticParams& p, CometStatus& st) : params(p), statusRef(st) {} + explicit SearchSession(CometStatus& st) : statusRef(st) {} SearchSession(const SearchSession&) = delete; SearchSession& operator=(const SearchSession&) = delete; }; diff --git a/CometSearch/search/SearchUtils.cpp b/CometSearch/search/SearchUtils.cpp new file mode 100644 index 00000000..554c1ccb --- /dev/null +++ b/CometSearch/search/SearchUtils.cpp @@ -0,0 +1,309 @@ +// Copyright 2023 Jimmy Eng +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "SearchUtils.h" + +static InputType GetInputType(const char* pszFileName) +{ + int iLen = (int)strlen(pszFileName); + + if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 6, ".mzXML") + || !STRCMP_IGNORE_CASE(pszFileName + iLen - 5, ".mzML") + || !STRCMP_IGNORE_CASE(pszFileName + iLen - 9, ".mzXML.gz") + || !STRCMP_IGNORE_CASE(pszFileName + iLen - 8, ".mzML.gz")) + { + return InputType_MZXML; + } + else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".raw")) + { + return InputType_RAW; + } + else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".ms2") + || !STRCMP_IGNORE_CASE(pszFileName + iLen - 5, ".cms2")) + { + return InputType_MS2; + } + else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".mgf")) + { + return InputType_MGF; + } + + return InputType_UNKNOWN; +} + + +bool UpdateInputFile(InputFileInfo* pFileInfo) +{ + bool bUpdateBaseName = false; + char szTmpBaseName[SIZE_FILE]; + + if (g_staticParams.inputFile.szBaseName[0] == '\0' || g_pvInputFiles.size() > 1) + bUpdateBaseName = true; + else + strcpy(szTmpBaseName, g_staticParams.inputFile.szBaseName); + + g_staticParams.inputFile = *pFileInfo; + g_staticParams.inputFile.iInputType = GetInputType(g_staticParams.inputFile.szFileName); + + if (InputType_UNKNOWN == g_staticParams.inputFile.iInputType) + return false; + + FILE* fp; + if ((fp = fopen(g_staticParams.inputFile.szFileName, "r")) == NULL) + { + string strErrorMsg = " Error - cannot read input file \"" + string(g_staticParams.inputFile.szFileName) + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(strErrorMsg); + return false; + } + else + { + fclose(fp); + } + +#ifndef CRUX + if (bUpdateBaseName) + { + char* pStr; + int iLen = (int)strlen(g_staticParams.inputFile.szFileName); + + strcpy(g_staticParams.inputFile.szBaseName, g_staticParams.inputFile.szFileName); + + if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '.'))) + *pStr = '\0'; + + if (!STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen 
- 9, ".mzXML.gz") + || !STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 8, ".mzML.gz")) + { + if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '.'))) + *pStr = '\0'; + } + } + else + { + strcpy(g_staticParams.inputFile.szBaseName, szTmpBaseName); + } +#endif + + return true; +} + + +void SetMSLevelFilter(MSReader& mstReader) +{ + vector msLevel; + + if (g_staticParams.options.iMSLevel == 3) + msLevel.push_back(MS3); + else if (g_staticParams.options.iMSLevel == 2) + msLevel.push_back(MS2); + else if (g_staticParams.options.iMSLevel == 1) + msLevel.push_back(MS1); + + mstReader.setFilter(msLevel); +} + + +bool AllocateResultsMem(std::vector& queries) +{ + for (std::vector::iterator it = queries.begin(); it != queries.end(); ++it) + { + Query* pQuery = *it; + + try + { + pQuery->_pResults = new Results[g_staticParams.options.iNumStored]; + } + catch (std::bad_alloc& ba) + { + string strErrorMsg = " Error - new(_pResults[]). bad_alloc: \"" + std::string(ba.what()) + "\".\n"; + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(strErrorMsg); + return false; + } + + if (g_staticParams.options.iDecoySearch == 2) + { + try + { + pQuery->_pDecoys = new Results[g_staticParams.options.iNumStored]; + } + catch (std::bad_alloc& ba) + { + string strErrorMsg = " Error - new(_pDecoys[]). 
bad_alloc: " + std::string(ba.what()) + "\n"; + g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); + logerr(strErrorMsg); + return false; + } + } + + pQuery->iMatchPeptideCount = 0; + pQuery->iDecoyMatchPeptideCount = 0; + memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram)); + + for (int j = 0; j < g_staticParams.options.iNumStored; ++j) + { + pQuery->_pResults[j].dPepMass = 0.0; + pQuery->_pResults[j].dExpect = 999; + pQuery->_pResults[j].fScoreSp = 0.0; + pQuery->_pResults[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; + pQuery->_pResults[j].fAScorePro = 0.0; + pQuery->_pResults[j].usiLenPeptide = 0; + pQuery->_pResults[j].usiRankSp = 0; + pQuery->_pResults[j].usiMatchedIons = 0; + pQuery->_pResults[j].usiTotalIons = 0; + pQuery->_pResults[j].szPeptide[0] = '\0'; + pQuery->_pResults[j].sAScoreProSiteScores.clear(); + pQuery->_pResults[j].pWhichProtein.clear(); + pQuery->_pResults[j].sPeffOrigResidues.clear(); + pQuery->_pResults[j].iPeffOrigResiduePosition = -9; + + if (g_staticParams.options.iDecoySearch) + pQuery->_pResults[j].pWhichDecoyProtein.clear(); + + if (g_staticParams.options.iDecoySearch == 2) + { + pQuery->_pDecoys[j].dPepMass = 0.0; + pQuery->_pDecoys[j].dExpect = 999; + pQuery->_pDecoys[j].fScoreSp = 0.0; + pQuery->_pDecoys[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; + pQuery->_pDecoys[j].fAScorePro = 0.0; + pQuery->_pDecoys[j].usiLenPeptide = 0; + pQuery->_pDecoys[j].usiRankSp = 0; + pQuery->_pDecoys[j].usiMatchedIons = 0; + pQuery->_pDecoys[j].usiTotalIons = 0; + pQuery->_pDecoys[j].szPeptide[0] = '\0'; + pQuery->_pDecoys[j].sAScoreProSiteScores.clear(); + pQuery->_pDecoys[j].pWhichProtein.clear(); + pQuery->_pDecoys[j].sPeffOrigResidues.clear(); + pQuery->_pDecoys[j].iPeffOrigResiduePosition = -9; + } + } + } + + return true; +} + + +bool RunSearchAndPostAnalysis(int iPercentStart, int iPercentEnd, + ThreadPool* tp, SearchSession& session, + bool bLogPrePostAnalysis) +{ + if 
(g_staticParams.options.bMango) + { + int iCurrentScanNumber = 0; + int iMangoIndex = 0; + + std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex); + + for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) + { + if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) + { + iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; + iMangoIndex = 0; + } + else + { + iMangoIndex++; + } + sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", + (int)iMangoIndex / 2, (iMangoIndex % 2) ? 'B' : 'A'); + } + } + + std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); + + g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; + g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus; + g_massRange.bNarrowMassRange = (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass); + + bool bSucceeded = !session.statusRef.IsError() && !session.statusRef.IsCancel(); + if (!bSucceeded) + return false; + + session.statusRef.SetStatusMsg(string("Running search...")); + + if (session.bPerformDatabaseSearch) + bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); + if (bSucceeded && session.bPerformSpecLibSearch) + bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); + // TODO(batch-MS1): CometSearch::RunMS1Search(tp, dRT, dMaxMS1RTDiff, dMaxSpecLibRT, + // dMaxQueryRT, session.ms1Queries) must be called here when the batch MS1 speclib + // path is implemented. It requires a second reader pass over the file at + // iSpecLibMSLevel to populate session.ms1Queries, plus per-file RT range values + // from CometSpecLib::LoadSpecLibMS1Raw. Neither exists in the batch pipeline yet. 
+ + if (!bSucceeded) + return false; + + bSucceeded = !session.statusRef.IsError() && !session.statusRef.IsCancel(); + if (!bSucceeded) + return false; + + if (bLogPrePostAnalysis && !g_staticParams.options.bOutputSqtStream) + { + logout(" - Post analysis:"); + fflush(stdout); + } + + if (session.bPerformDatabaseSearch) + { + session.statusRef.SetStatusMsg(string("Performing post-search analysis ...")); + bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); + } + + return bSucceeded; +} + + +bool executeBatchLegacy(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session, + bool bVerbose) +{ + if (bVerbose && !g_staticParams.options.bOutputSqtStream) + { + logout(" - Load spectra:"); + fflush(stdout); + } + + session.statusRef.SetStatusMsg(string("Loading and processing input spectra")); + + bool bSucceeded = CometPreprocess::LoadAndPreprocessSpectra( + mstReader, iFirstScan, iLastScan, iAnalysisType, tp, session); + + iPercentStart = iPercentEnd; + iPercentEnd = mstReader.getPercent(); + + if (!bSucceeded) + return false; + + if (session.queries.empty()) + return true; + + bSucceeded = AllocateResultsMem(session.queries); + if (!bSucceeded) + return false; + + { + string strStatusMsg = " " + std::to_string(session.queries.size()) + string("\n"); + if (bVerbose && !g_staticParams.options.bOutputSqtStream) + logout(strStatusMsg); + session.statusRef.SetStatusMsg(strStatusMsg); + } + + return RunSearchAndPostAnalysis(iPercentStart, iPercentEnd, tp, session, bVerbose); +} diff --git a/CometSearch/search/SearchUtils.h b/CometSearch/search/SearchUtils.h index 371a5214..e0c3d6e3 100644 --- a/CometSearch/search/SearchUtils.h +++ b/CometSearch/search/SearchUtils.h @@ -18,291 +18,45 @@ #include "CometDataInternal.h" #include "CometSearch.h" #include "CometPostAnalysis.h" +#include "CometPreprocess.h" +#include "MSReader.h" +#include "SearchSession.h" -// 
Shared inline utilities used by Pipeline and strategy classes. -// All functions operate on globals (g_staticParams, g_cometStatus, etc.) -// which are declared in CometDataInternal.h / Common.h. +// Shared utilities used by Pipeline and strategy classes. -// ----------------------------------------------------------------------- -// Input type detection (file extension -> InputType enum) -// ----------------------------------------------------------------------- -inline static InputType GetInputType(const char* pszFileName) -{ - int iLen = (int)strlen(pszFileName); - - if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 6, ".mzXML") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 5, ".mzML") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 9, ".mzXML.gz") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 8, ".mzML.gz")) - { - return InputType_MZXML; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".raw")) - { - return InputType_RAW; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".ms2") - || !STRCMP_IGNORE_CASE(pszFileName + iLen - 5, ".cms2")) - { - return InputType_MS2; - } - else if (!STRCMP_IGNORE_CASE(pszFileName + iLen - 4, ".mgf")) - { - return InputType_MGF; - } - - return InputType_UNKNOWN; -} - -// ----------------------------------------------------------------------- -// UpdateInputFile: sets g_staticParams.inputFile from pFileInfo. -// Returns false on unknown type or if file cannot be opened. 
-// ----------------------------------------------------------------------- -inline static bool UpdateInputFile(InputFileInfo* pFileInfo) -{ - bool bUpdateBaseName = false; - char szTmpBaseName[SIZE_FILE]; - - if (g_staticParams.inputFile.szBaseName[0] == '\0' || g_pvInputFiles.size() > 1) - bUpdateBaseName = true; - else - strcpy(szTmpBaseName, g_staticParams.inputFile.szBaseName); - - g_staticParams.inputFile = *pFileInfo; - g_staticParams.inputFile.iInputType = GetInputType(g_staticParams.inputFile.szFileName); - - if (InputType_UNKNOWN == g_staticParams.inputFile.iInputType) - return false; - - FILE* fp; - if ((fp = fopen(g_staticParams.inputFile.szFileName, "r")) == NULL) - { - string strErrorMsg = " Error - cannot read input file \"" + string(g_staticParams.inputFile.szFileName) + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - else - { - fclose(fp); - } +bool UpdateInputFile(InputFileInfo* pFileInfo); +void SetMSLevelFilter(MSReader& mstReader); +bool AllocateResultsMem(std::vector& queries); +bool RunSearchAndPostAnalysis(int iPercentStart, int iPercentEnd, + ThreadPool* tp, SearchSession& session, + bool bLogPrePostAnalysis = false); -#ifndef CRUX - if (bUpdateBaseName) - { - char* pStr; - int iLen = (int)strlen(g_staticParams.inputFile.szFileName); - - strcpy(g_staticParams.inputFile.szBaseName, g_staticParams.inputFile.szFileName); - - if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '.'))) - *pStr = '\0'; - - if (!STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 9, ".mzXML.gz") - || !STRCMP_IGNORE_CASE(g_staticParams.inputFile.szFileName + iLen - 8, ".mzML.gz")) - { - if ((pStr = strrchr(g_staticParams.inputFile.szBaseName, '.'))) - *pStr = '\0'; - } - } - else - { - strcpy(g_staticParams.inputFile.szBaseName, szTmpBaseName); - } -#endif - - return true; -} - -// ----------------------------------------------------------------------- -// SetMSLevelFilter: configure 
MSReader to read the right MS level. -// ----------------------------------------------------------------------- -inline static void SetMSLevelFilter(MSReader& mstReader) -{ - vector msLevel; - - if (g_staticParams.options.iMSLevel == 3) - msLevel.push_back(MS3); - else if (g_staticParams.options.iMSLevel == 2) - msLevel.push_back(MS2); - else if (g_staticParams.options.iMSLevel == 1) - msLevel.push_back(MS1); - - mstReader.setFilter(msLevel); -} +// Legacy three-sweep batch body: LoadAndPreprocess -> AllocateResults -> +// RunSearchAndPostAnalysis. Used by FiStrategy (non-fused fallback), +// FastaStrategy, and PiStrategy. Pass bVerbose=true for FASTA-path +// console progress output. +bool executeBatchLegacy(MSToolkit::MSReader& mstReader, + int iFirstScan, int iLastScan, int iAnalysisType, + int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session, + bool bVerbose); // ----------------------------------------------------------------------- -// AllocateResultsMem: allocate _pResults (and optionally _pDecoys) for -// every Query* in the batch, and zero-initialize scoring fields. +// Query sort comparators -- kept inline; single-expression each. // ----------------------------------------------------------------------- -inline static bool AllocateResultsMem(std::vector& queries) -{ - for (std::vector::iterator it = queries.begin(); it != queries.end(); ++it) - { - Query* pQuery = *it; - - try - { - pQuery->_pResults = new Results[g_staticParams.options.iNumStored]; - } - catch (std::bad_alloc& ba) - { - string strErrorMsg = " Error - new(_pResults[]). bad_alloc: \"" + std::string(ba.what()) + "\".\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - - if (g_staticParams.options.iDecoySearch == 2) - { - try - { - pQuery->_pDecoys = new Results[g_staticParams.options.iNumStored]; - } - catch (std::bad_alloc& ba) - { - string strErrorMsg = " Error - new(_pDecoys[]). 
bad_alloc: " + std::string(ba.what()) + "\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - } - - pQuery->iMatchPeptideCount = 0; - pQuery->iDecoyMatchPeptideCount = 0; - memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram)); - - for (int j = 0; j < g_staticParams.options.iNumStored; ++j) - { - pQuery->_pResults[j].dPepMass = 0.0; - pQuery->_pResults[j].dExpect = 999; - pQuery->_pResults[j].fScoreSp = 0.0; - pQuery->_pResults[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; - pQuery->_pResults[j].fAScorePro = 0.0; - pQuery->_pResults[j].usiLenPeptide = 0; - pQuery->_pResults[j].usiRankSp = 0; - pQuery->_pResults[j].usiMatchedIons = 0; - pQuery->_pResults[j].usiTotalIons = 0; - pQuery->_pResults[j].szPeptide[0] = '\0'; - pQuery->_pResults[j].sAScoreProSiteScores.clear(); - pQuery->_pResults[j].pWhichProtein.clear(); - pQuery->_pResults[j].sPeffOrigResidues.clear(); - pQuery->_pResults[j].iPeffOrigResiduePosition = -9; - - if (g_staticParams.options.iDecoySearch) - pQuery->_pResults[j].pWhichDecoyProtein.clear(); - - if (g_staticParams.options.iDecoySearch == 2) - { - pQuery->_pDecoys[j].dPepMass = 0.0; - pQuery->_pDecoys[j].dExpect = 999; - pQuery->_pDecoys[j].fScoreSp = 0.0; - pQuery->_pDecoys[j].fXcorr = (float)g_staticParams.options.dMinimumXcorr; - pQuery->_pDecoys[j].fAScorePro = 0.0; - pQuery->_pDecoys[j].usiLenPeptide = 0; - pQuery->_pDecoys[j].usiRankSp = 0; - pQuery->_pDecoys[j].usiMatchedIons = 0; - pQuery->_pDecoys[j].usiTotalIons = 0; - pQuery->_pDecoys[j].szPeptide[0] = '\0'; - pQuery->_pDecoys[j].sAScoreProSiteScores.clear(); - pQuery->_pDecoys[j].pWhichProtein.clear(); - pQuery->_pDecoys[j].sPeffOrigResidues.clear(); - pQuery->_pDecoys[j].iPeffOrigResiduePosition = -9; - } - } - } - - return true; -} - -// ----------------------------------------------------------------------- -// Query sort comparators -// 
----------------------------------------------------------------------- -inline static bool compareByPeptideMass(Query const* a, Query const* b) +inline bool compareByPeptideMass(Query const* a, Query const* b) { return (a->_pepMassInfo.dExpPepMass < b->_pepMassInfo.dExpPepMass); } -inline static bool compareByMangoIndex(Query const* a, Query const* b) +inline bool compareByMangoIndex(Query const* a, Query const* b) { return (a->dMangoIndex < b->dMangoIndex); } -inline static bool compareByScanNumber(Query const* a, Query const* b) +inline bool compareByScanNumber(Query const* a, Query const* b) { if (a->_spectrumInfoInternal.iScanNumber == b->_spectrumInfoInternal.iScanNumber) return (a->_spectrumInfoInternal.usiChargeState < b->_spectrumInfoInternal.usiChargeState); return (a->_spectrumInfoInternal.iScanNumber < b->_spectrumInfoInternal.iScanNumber); } - -// ----------------------------------------------------------------------- -// RunSearchAndPostAnalysis: shared batch-search body used by all strategies. -// Handles optional Mango reindexing, mass-range setup, RunSearch, and -// PostAnalysis. Set bLogPrePostAnalysis=true for FASTA-path verbose output. -// Called after LoadAndPreprocessSpectra + AllocateResultsMem succeed. 
-// ----------------------------------------------------------------------- -inline static bool RunSearchAndPostAnalysis(int iPercentStart, int iPercentEnd, - ThreadPool* tp, SearchSession& session, - bool bLogPrePostAnalysis = false) -{ - if (g_staticParams.options.bMango) - { - int iCurrentScanNumber = 0; - int iMangoIndex = 0; - - std::sort(session.queries.begin(), session.queries.end(), compareByMangoIndex); - - for (std::vector::iterator it = session.queries.begin(); it != session.queries.end(); ++it) - { - if ((*it)->_spectrumInfoInternal.iScanNumber != iCurrentScanNumber) - { - iCurrentScanNumber = (*it)->_spectrumInfoInternal.iScanNumber; - iMangoIndex = 0; - } - else - { - iMangoIndex++; - } - sprintf((*it)->_spectrumInfoInternal.szMango, "%03d_%c", - (int)iMangoIndex / 2, (iMangoIndex % 2) ? 'B' : 'A'); - } - } - - std::sort(session.queries.begin(), session.queries.end(), compareByPeptideMass); - - g_massRange.dMinMass = session.queries.at(0)->_pepMassInfo.dPeptideMassToleranceMinus; - g_massRange.dMaxMass = session.queries.at(session.queries.size() - 1)->_pepMassInfo.dPeptideMassTolerancePlus; - g_massRange.bNarrowMassRange = (g_massRange.dMaxMass - g_massRange.dMinMass > g_massRange.dMinMass); - - bool bSucceeded = !session.statusRef.IsError() && !session.statusRef.IsCancel(); - if (!bSucceeded) - return false; - - session.statusRef.SetStatusMsg(string("Running search...")); - - if (session.bPerformDatabaseSearch) - bSucceeded = CometSearch::RunSearch(iPercentStart, iPercentEnd, tp, session.queries); - if (bSucceeded && session.bPerformSpecLibSearch) - bSucceeded = CometSearch::RunSpecLibSearch(iPercentStart, iPercentEnd, tp, session.queries); - - if (!bSucceeded) - return false; - - bSucceeded = !session.statusRef.IsError() && !session.statusRef.IsCancel(); - if (!bSucceeded) - return false; - - if (bLogPrePostAnalysis && !g_staticParams.options.bOutputSqtStream) - { - logout(" - Post analysis:"); - fflush(stdout); - } - - if 
(session.bPerformDatabaseSearch) - { - session.statusRef.SetStatusMsg(string("Performing post-search analysis ...")); - bSucceeded = CometPostAnalysis::PostAnalysis(tp, session.queries); - } - - return bSucceeded; -} diff --git a/CometSearch/threading/SearchMemoryPool.h b/CometSearch/threading/SearchMemoryPool.h index d88ed5dd..0cf1a719 100644 --- a/CometSearch/threading/SearchMemoryPool.h +++ b/CometSearch/threading/SearchMemoryPool.h @@ -62,4 +62,15 @@ class SearchMemoryPool std::condition_variable _cv; }; +// RAII guard for a slot acquired via SearchMemoryPool::acquireSlot(). Releases the +// slot on scope exit (normal return or exception unwind) so a throw out of the +// search body never leaks the slot and stalls the next acquireSlot() caller for up +// to 240 s. Construct only after checking the acquired slot is >= 0. +struct SearchMemoryPoolSlotGuard +{ + SearchMemoryPool& pool; + int slot; + ~SearchMemoryPoolSlotGuard() { if (slot >= 0) pool.releaseSlot(slot); } +}; + #endif // _SEARCHMEMORYPOOL_H_ diff --git a/docs/20260617_codereview.md b/docs/20260617_codereview.md new file mode 100644 index 00000000..a1c67f78 --- /dev/null +++ b/docs/20260617_codereview.md @@ -0,0 +1,317 @@ +# Code Review: architecture_update branch (2026-06-17) + +## Scope + +Deep review of the `architecture_update` branch versus `master` (commit `c971a2dd`). +The diff covers the Strategy/Pipeline refactor: `ISearchStrategy` + `Pipeline` replace +the monolithic `CometSearchManager::DoSearch` per-file loop; `SearchSession` replaces the +batch-path globals `g_pvQuery` / `g_pvQueryMS1`; `SearchMemoryPool` encapsulates the +thread scratch-array pool; and a new `output/IResultWriter` layer wraps the existing +`CometWrite*` classes. 
+ +Review method: 8 parallel finder angles (line-by-line diff scan, removed-behavior audit, +cross-file tracer, reuse, simplification, efficiency, altitude, conventions), each +surfacing up to 6 candidates, followed by a 1-vote verification pass on the strongest +findings. + +--- + +## 1. Summary + +The refactor successfully decouples per-batch mutable state from process-wide globals and +introduces a clean strategy/pipeline separation. The CometWrapper layer is fully insulated +(all calls go through the unchanged `ICometSearchManager` vtable). However, three +correctness bugs were introduced -- two silent data-corruption paths in hand-written +`operator=` overloads, and one functional regression that drops the batch MS1 spectral- +library search path entirely. + +--- + +## 2. Critical Issues + +### 2a. Batch MS1 speclib search silently dead (functional regression) + +**File:** `CometSearch/search/SearchUtils.h:283` + +`RunSearchAndPostAnalysis` (the shared batch body called by all three strategies) invokes +`CometSearch::RunSearch` and `CometSearch::RunSpecLibSearch` but never calls +`CometSearch::RunMS1Search(ThreadPool*, ...)`. Separately, `CometPreprocess:: +PreprocessMS1SingleSpectrum(session&)` -- the only function that populates +`session.ms1Queries` -- has zero callers in any strategy or pipeline code path. + +Result: a batch run with `bPerformSpecLibSearch = true` produces no MS1 spectral-library +matches and emits no error or warning. The MS1 speclib batch path was present in +`CometSearchManager::DoSearch` on `master` and is now dead code. + +**Fix:** wire `PreprocessMS1SingleSpectrum(session)` and `RunMS1Search(tp, ..., +session.ms1Queries)` into `RunSearchAndPostAnalysis` when `session.bPerformSpecLibSearch` +is true, mirroring the MS2 speclib path already present. 
+ +**Status (2026-06-17):** Investigation confirmed this was not a regression -- the batch +MS1 speclib path (`PreprocessMS1SingleSpectrum` / `RunMS1Search(ThreadPool*,...)`) had zero +callers on `master` as well. Two partial fixes applied: (1) `Pipeline::cleanupBatch` lambda +now also deletes and clears `session.ms1Queries` so any future wiring will not leak; (2) a +TODO comment at `SearchUtils.h:287` documents the RT-range parameters required before the +batch MS1 path can be wired in. + +--- + +### 2b. VarModParams::operator= drops two fields -- protein-filter var-mod searches silently broken + +**File:** `CometSearch/core/Params.h:273` + +`VarModParams::operator=` (called via `StaticParams::operator=` line 451) assigns every +field except `sProteinLModsListFile` (std::string) and `mmapProteinModsList` +(multimap). After any `StaticParams` copy, `bVarModProteinFilter` is true but +`mmapProteinModsList` is empty, so the filter silently matches every protein and the +restriction is ignored. + +`Options::operator=` (line 98) has the same structural problem: `iSpecLibMSLevel` (int, +declared line 48) is never assigned. After copy, the speclib MS-level filter uses +whatever value was already in the destination. + +**Root cause:** All five hand-written `operator=` bodies in `Params.h` (`Options`, +`DBInfo`, `StaticMod`, `PrecalcMasses`, `VarModParams`) copy fields one by one and have +drifted from their struct declarations. The compiler-synthesised `operator=` would copy +all members correctly for free -- every member is a trivially-copyable scalar, a fixed +array of scalars, or a `std::string` / `std::vector` / `std::multimap` with correct copy +semantics. + +**Fix:** Delete all five hand-written `operator=` definitions and rely on the compiler- +generated versions. If explicit copy control is needed for a specific reason, add a +static_assert or a comment naming that reason. + +**Status (2026-06-17):** Fixed. 
All nine hand-written `operator=` bodies in `Params.h` +replaced with `= default` (correct `const Type&` signature). The full scope was larger +than initially identified -- beyond the five listed above, `MassUtil`, `ToleranceParams`, +`IonInfo`, and `StaticParams` had the same drift bug. `StaticParams::operator=` was missing +`peffInfo`, `iDbType`, `sDecoyPrefix` (string), `bSkipToStartScan`, and `tRealTimeStart`. +Build verified clean after replacement. + +--- + +### 2c. SearchThreadProc has no RAII guard for the pool slot -- bad_alloc during index build causes 240-second deadlock + +**File:** `CometSearch/CometSearch.cpp:1253` + +```cpp +int i = AcquirePoolSlot(); +// ... +CometSearch* sqSearch = new CometSearch(); +sqSearch->DoSearch(...); +delete sqSearch; +s_pool.releaseSlot(i); // never reached if DoSearch throws +``` + +`DoSearch` contains two re-throwing `catch` blocks (lines ~3563 and ~7558) inside +`g_pvDBIndex.push_back()` failure paths, reachable when `bCreateFragmentIndex` or +`bCreatePeptideIndex` is set. If the system OOMs mid index-build, the exception propagates +past `releaseSlot`. The old `SearchThreadData::~SearchThreadData` released the slot +unconditionally; that safety net was removed in this diff. + +**Fix:** Wrap the slot in a simple RAII guard: + +```cpp +struct SlotGuard { + int slot; + ~SlotGuard() { if (slot >= 0) s_pool.releaseSlot(slot); } +}; +SlotGuard guard{i}; +``` + +**Status (2026-06-17):** Fixed. Local `SlotGuard` struct added to `SearchThreadProc` +immediately after the slot is acquired. The explicit `s_pool.releaseSlot(i)` call was +removed; the guard destructor handles release on both normal exit and exception unwind. + +--- + +## 3. Code Quality & Maintainability + +### 3a. FusedLoadAndSearchSpectra batch-size check fires early + +**File:** `CometSearch/CometPreprocess.cpp:3362` + +`iNumSpectraLoaded` is incremented when a spectrum is pushed onto the bounded queue +(before any consumer thread processes it). 
`CheckExit` fires when +`iNumSpectraLoaded >= iSpectrumBatchSize`. With a queue depth of `iNumThreads * 4`, the +read loop can stop up to `iNumThreads * 4` entries before the configured batch size is +actually searched. + +The non-fused `LoadAndPreprocessSpectra` path sets `iNumSpectraLoaded = +session.queries.size()` (post-preprocessing count), so the two paths have different +batch-size semantics. Users relying on `spectrum_batch_size` for memory control in FI_DB +mode will observe smaller-than-configured batches. + +**Status (2026-06-17):** Fixed. Removed the local `iNumSpectraLoaded` variable and its +queue-push increment from `FusedLoadAndSearchSpectra`. The `CheckExit` call (which already +holds `session.queriesMutex`) now passes `(int)session.queries.size()` directly, matching +the non-fused path semantics: the count reflects spectra that have been fully preprocessed +and stored in `session.queries`. + +### 3b. SearchThreadData::pQueries latent null deref + +**File:** `CometSearch/CometSearch.h:43` / `CometSearch/CometSearch.cpp:1269` + +Both `SearchThreadData` constructors initialise `pQueries = nullptr`. `SearchThreadProc` +dereferences it at line 1269 with no null check. All current callers correctly set +`pQueries = &queries` before dispatching, but the type provides no enforcement. A future +dispatch path that forgets the assignment will crash inside a thread with no useful +diagnostic. + +**Fix:** make `pQueries` a required constructor parameter (remove the default-null +initialiser) or add an assert before the dereference. + +**Status (2026-06-17):** Fixed. Removed the no-arg `= default` constructor (unused). +`pQueries` is now a required second parameter of the `sDBEntry` constructor: +`SearchThreadData(const sDBEntry&, const vector<Query*>*)`. The one call site in +`RunSearch` updated to `new SearchThreadData(dbe, &queries)`, eliminating the +post-construction assignment step. + +### 3c. 
Pipeline::cleanupBatch skips session.ms1Queries + +**File:** `CometSearch/search/Pipeline.cpp:136` + +The `cleanupBatch` lambda deletes and clears `session.queries` but never touches +`session.ms1Queries`. Currently `session.ms1Queries` is never populated (see 2a above), +so there is no active leak. If batch MS1 search is re-wired, every batch will leak its +`QueryMS1*` objects across all batches and all input files. + +**Status (2026-06-17):** Fixed as part of 2a. `cleanupBatch` now also iterates and +deletes `session.ms1Queries` and calls `session.ms1Queries.clear()`. + +### 3d. session.params member is vestigial + +**File:** `CometSearch/search/SearchSession.h:48` + +`SearchSession` carries `const StaticParams& params` that is never read by any caller. +Every strategy, pipeline, and utility accesses `g_staticParams` directly. The member +implies an in-progress migration that has not started, misleading future readers. + +**Status (2026-06-17):** Fixed. `const StaticParams& params` member and the accompanying +comment removed from `SearchSession`. Constructor simplified to +`explicit SearchSession(CometStatus& st)`. The one construction site in +`CometSearchManager.cpp` updated accordingly. + +### 3e. Non-ASCII characters in SearchSession.h + +**File:** `CometSearch/search/SearchSession.h:20,21,47` + +Lines 20 (U+2026 HORIZONTAL ELLIPSIS) and 21, 47 (U+2014 EM DASH) are UTF-8 multi-byte +sequences. CLAUDE.md rule: \"No non-ASCII characters allowed in the code or documentation.\" +All other new files are pure ASCII. Replace with ASCII equivalents (`...` and `--`). + +**Status (2026-06-17):** Fixed. The EM DASH on old line 47 was removed along with the +vestigial `params` member (3d). The HORIZONTAL ELLIPSIS on line 20 replaced with `...` +and the EM DASH on line 21 replaced with `--`. Verified with `grep -P "[^\x00-\x7F]"`: +no non-ASCII bytes remain. + +### 3f. 
Trailing whitespace in Params.h + +**File:** `CometSearch/core/Params.h:154,155,257` + +Line 154 has 4 trailing spaces, line 155 has 2 trailing spaces and a stray space before +the semicolon (`iFragIndexMinIonsReport ;`), and line 257 has 1 trailing space. CLAUDE.md +rule: \"No trailing whitespace.\" + +**Status (2026-06-17):** Fixed. The stray space before the semicolon and the two lines of +trailing spaces on old lines 154-155 were eliminated when the hand-written `operator=` +bodies were replaced with `= default` (issue 2b), which removed those lines entirely. The +remaining trailing space on old line 257 (`bVarProteinCTermMod` declaration, now line 162) +was stripped directly. Verified with `grep -P "[\t ][\r]?$"`: no trailing whitespace +remains. + +--- + +## 4. Actionable Improvements + +### 4a. Delete hand-written operator= in Params.h + +Replace all five with `= default` or remove them entirely: + +```cpp +// Before (drift-prone): +Options& operator=(Options& a) { iNumPeptideOutputLines = a.iNumPeptideOutputLines; ... } + +// After: +Options& operator=(const Options&) = default; +``` + +If the non-const signature `operator=(Options& a)` was intentional (e.g., to allow +modification of the source), document why; otherwise make it `const Options&`. + +**Status (2026-06-17):** Done as part of issue 2b. All nine hand-written `operator=` +bodies (not just the five originally identified) were replaced with `= default` using the +correct `const Type&` signature. + +### 4b. Move RunSearchAndPostAnalysis out of SearchUtils.h + +**File:** `CometSearch/search/SearchUtils.h:244` + +`SearchUtils.h` is included by 5 translation units and contains 65-line non-trivial +functions marked `inline static`. Each TU gets its own copy. Move `RunSearchAndPostAnalysis`, +`AllocateResultsMem`, and `UpdateInputFile` into a `SearchUtils.cpp` and keep only +declarations in the header. The three small comparator helpers +(`compareByPeptideMass`, etc.) 
are genuinely inline-worthy and can stay. + +**Status (2026-06-17):** Done. Created `CometSearch/search/SearchUtils.cpp` containing the +definitions of `UpdateInputFile`, `SetMSLevelFilter`, `AllocateResultsMem`, and +`RunSearchAndPostAnalysis`. `GetInputType` became a `static` helper in that .cpp (not +exported). `SearchUtils.h` now contains only declarations plus the three inline comparators; +added self-contained includes (`MSReader.h`, `SearchSession.h`) so the header compiles +standalone. `search/SearchUtils` added to `SEARCH_SRC` in the Makefile and +`search\SearchUtils.cpp` added to `CometSearch.vcxproj`. + +### 4c. Factor out the shared legacy batch body in FiStrategy and FastaStrategy + +**Files:** `CometSearch/search/FiStrategy.cpp:147`, `CometSearch/search/FastaStrategy.cpp:60` + +The two \"legacy three-sweep\" paths (`LoadAndPreprocess` -> `AllocateResultsMem` -> +`RunSearchAndPostAnalysis`) are structurally identical except for a verbosity flag. The +difference is already encoded in the `bLogPrePostAnalysis` parameter that `RunSearchAndPostAnalysis` +accepts. Extract a shared free function: + +```cpp +bool executeBatchLegacy(MSToolkit::MSReader& mstReader, int iFirstScan, int iLastScan, + int iAnalysisType, int& iPercentStart, int& iPercentEnd, + ThreadPool* tp, SearchSession& session, bool bVerbose); +``` + +**Status (2026-06-17):** Done. `executeBatchLegacy` added to `SearchUtils.cpp` / +declared in `SearchUtils.h`. The `bVerbose` flag controls the three per-strategy +differences: the \"Load spectra:\" console log before loading, the spectra-count +`logout` after allocation, and whether to pass `bLogPrePostAnalysis=true` to +`RunSearchAndPostAnalysis`. All three strategy `executeBatch` bodies replaced with a +single call; this covered `PiStrategy` as well (not mentioned in the original finding +but structurally identical to the `FiStrategy` non-fused path). + +### 4d. 
Fix iNumSpectraLoaded semantics in FusedLoadAndSearchSpectra + +**File:** `CometSearch/CometPreprocess.cpp:3362` + +Either (a) increment `iNumSpectraLoaded` inside `FusedSearchSpectrum` after a spectrum +completes preprocessing (requires an atomic counter shared with the worker lambdas), or +(b) document that the fused-path batch size is approximate (+/- queue depth) and update +any user-facing documentation for `spectrum_batch_size` accordingly. + +**Status (2026-06-17):** Done as part of 3a (option a). The local `iNumSpectraLoaded` +variable and its queue-push increment were removed entirely. `CheckExit` now receives +`(int)session.queries.size()` directly under the already-held `queriesMutex`, which counts +only spectra that have been fully preprocessed -- the same semantics as the non-fused path. + +--- + +## Appendix: Findings Not Requiring Code Changes + +- **CometWrapper isolation confirmed**: all CometWrapper calls go through the + `ICometSearchManager` vtable; no internal signature changes propagate to the wrapper + layer. +- **s_pool singleton (TODO acknowledged)**: the file-static `SearchMemoryPool s_pool` in + `CometSearch.cpp` prevents multiple concurrent RTS instances. The TODO comment at line + 30 correctly identifies this. No concurrent RTS path currently invokes the batch pool, + so this is a known deferred item, not a regression. +- **FiStrategy::finalize() redundant iDbType check**: the `if (g_staticParams.iDbType == + DbType::FI_DB)` guard is always true when called by the pipeline (which selected + FiStrategy precisely because iDbType == FI_DB). Harmless today. +- **Redundant #include lines in CometSearchManager.cpp**: the five `CometWrite*.h` + includes at lines 21-25 are already pulled in transitively by the new `output/*Writer.h` + includes. Dead includes, no functional impact. 
diff --git a/docs/20260617_codereview2.md b/docs/20260617_codereview2.md new file mode 100644 index 00000000..9908b224 --- /dev/null +++ b/docs/20260617_codereview2.md @@ -0,0 +1,186 @@ +Code Review: architecture_update branch, uncommitted working-tree diff (2026-06-17) +===================================================================================== + +Scope +----- +Reviewed the current uncommitted changes on top of commit c971a2dd (13 modified +files + 1 new file, +58/-573 lines). This diff implements the fix pass for the +findings recorded earlier today in docs/20260617_codereview.md: replacing +hand-written `operator=` bodies in Params.h with `= default`, adding a SlotGuard +RAII wrapper in SearchThreadProc, fixing the FusedLoadAndSearchSpectra batch-size +check, extracting SearchUtils.h's non-trivial functions into a new SearchUtils.cpp, +and factoring the three strategies' batch bodies into a shared executeBatchLegacy +helper. + +Method: verified each "Status: Fixed" claim against the actual diff line-by-line, +rebuilt from clean (`make cclean && make`), ran the full unit suite, checked CRLF / +non-ASCII / trailing-whitespace compliance per CLAUDE.md, then searched for +structurally identical instances of the bug pattern that was just fixed. + +--- + +1. Summary +---------- +All six fixes claimed in docs/20260617_codereview.md are present in the diff and +verified correct: the `operator=` replacements are safe (every member of every +affected struct is a value type with correct default-copy semantics, no owning raw +pointers), the SlotGuard correctly releases the pool slot on exceptional exit, the +batch-size counter now reflects processed rather than queued spectra, and the +SearchUtils split / executeBatchLegacy extraction preserve behavior exactly +(FastaStrategy keeps `bVerbose=true`, FiStrategy/PiStrategy keep `bVerbose=false`, +matching their pre-diff behavior). Clean rebuild produces zero warnings; all 17 unit +tests pass. 
One gap was found: the SlotGuard fix addressed only one of five call +sites that share the identical acquire-slot/run/release-slot pattern, leaving the +production batch-FI hot path and the RTS single-spectrum path exposed to the same +240-second slot-leak hazard the fix was written to close. + +**Status (2026-06-17): all items closed.** The critical issue (2a) and both +actionable improvements (4a, 4b) have been fixed -- see per-item status notes below. +Rebuilt clean (`make cclean && make`, zero warnings) and re-ran the full unit suite +(17 passed, 0 failed, 0 skipped) after the fix. + +--- + +2. Critical Issues +------------------- + +### 2a. SlotGuard fix is incomplete -- four sibling call sites still leak the pool + slot on exception + +**Files:** `CometSearch/CometSearch.cpp:128, 170, 214, 266` + +The diff adds a `SlotGuard` RAII wrapper around the one call site in +`SearchThreadProc` (line ~1263) so `s_pool.releaseSlot()` fires even if `DoSearch` +throws. The same bare `AcquirePoolSlot() -> run -> s_pool.releaseSlot()` pattern, +with no guard, exists at four other sites that were not touched: + +- `CometSearch::RunSearch(Query*)` line 128 (RTS thread-local FI search -- the + documented concurrent RTS path in CLAUDE.md) +- same function, line 170 (RTS thread-local PI search) +- `CometSearch::RunSearch(ThreadPool*, vector&)` line 214 (single-query FI + fallback) +- `CometSearch::RunSearch(int, int, ThreadPool*, vector&)` line 266 -- inside + a per-query lambda dispatched to the thread pool; this is the production batch-FI + search hot path, executed once per query in every FI_DB batch + +`SearchFragmentIndex` (called at all four sites) builds a +`std::unordered_map`, a `std::vector>` via +`push_back`, and calls `std::sort` -- all of which can throw `std::bad_alloc` under +memory pressure, the same failure mode that motivated the original fix. +`SearchPeptideIndex` (lines 170, 244) has equivalent allocations. 
+ +If any of these throw, the slot is never released. `SearchMemoryPool::acquireSlot()` +(threading/SearchMemoryPool.cpp:76) then blocks every subsequent caller for up to +240 seconds (the same symptom described for the issue that was just fixed) before +giving up and returning -1. For the RTS single-spectrum path this directly +contradicts the threading-model guarantee in CLAUDE.md that the RTS path stays +responsive; for the batch FI path it can stall an entire search batch. + +**Fix:** lift `SlotGuard` out of `SearchThreadProc` into a shared location (e.g. +`SearchMemoryPool.h`, since `s_pool` already lives in that translation unit) and +apply it at all five acquire/release sites, or wrap the post-acquire body of each +site in a `try { ... } catch (...) { s_pool.releaseSlot(slot); throw; }`. Since this +is the same author and same diff that recognized and fixed the pattern once, doing +it everywhere now is cheap; finding the next instance after a production stall is +not. + +**Status (2026-06-17):** Fixed. Added `SearchMemoryPoolSlotGuard` to +`threading/SearchMemoryPool.h` (a small RAII struct holding a `SearchMemoryPool&` +and the slot index, releasing in its destructor) and applied it at all five +acquire/release sites in `CometSearch.cpp`: the two thread-local RTS overloads +(`RunSearch(Query*)`, FI and PI branches), the single-query FI fallback +(`RunSearch(ThreadPool*, vector&)`), the batch-FI per-query lambda +(`RunSearch(int, int, ThreadPool*, vector&)`), and the original +`SearchThreadProc` site (whose function-local `SlotGuard` struct was removed in +favor of the shared one). All five bare `s_pool.releaseSlot(...)` calls following a +search body were removed; the guard now owns release in every case, including +exception unwind. + +--- + +3. Code Quality & Maintainability +---------------------------------- + +Nothing new beyond what docs/20260617_codereview.md already recorded and the diff +already fixed. 
No trailing whitespace, no non-ASCII characters, and CRLF line +endings are correct in every changed/added line (verified with `file` and +`grep -P "[^\x00-\x7F]"` / `grep -P "[\t ][\r]?$"` restricted to lines actually +touched by this diff -- the unrelated pre-existing trailing-whitespace lines found +elsewhere in CometSearch.cpp/CometPreprocess.cpp/CometSearch.h/CometSearchManager.cpp +are untouched by this diff and out of scope). + +--- + +4. Actionable Improvements +---------------------------- + +### 4a. Share one SlotGuard definition instead of risking drift + +`SlotGuard` is currently a function-local struct defined only inside +`SearchThreadProc`. Move it next to `SearchMemoryPool` (e.g. as a nested type or a +free struct in `threading/SearchMemoryPool.h`) so the four other call sites in 2a +can reuse it directly: + +```cpp +// threading/SearchMemoryPool.h +struct SearchMemoryPoolSlotGuard +{ + SearchMemoryPool& pool; + int slot; + ~SearchMemoryPoolSlotGuard() { if (slot >= 0) pool.releaseSlot(slot); } +}; +``` + +```cpp +int iSlot = AcquirePoolSlot(); +if (iSlot < 0) { logerr(...); return false; } +SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; +SearchFragmentIndex(pQuery, _ppbDuplFragmentArr[iSlot]); +``` + +**Status (2026-06-17):** Done as part of fixing 2a -- `SearchMemoryPoolSlotGuard` was +added to `threading/SearchMemoryPool.h` exactly as proposed and is now the only +release mechanism used anywhere in `CometSearch.cpp`. + +### 4b. Batch-FI lambda swallows AcquirePoolSlot failure + +**File:** `CometSearch/CometSearch.cpp:258-266` (pre-existing, not introduced by +this diff, surfaced while tracing 2a) + +When `AcquirePoolSlot()` returns -1 inside the per-query lambda, the lambda logs and +returns, but `RunSearch`'s `bSucceeded` is never set to `false` -- the query is +silently dropped from the batch with no caller-visible failure. 
Not in scope for +this diff's fix pass, but worth a follow-up ticket since it compounds 2a (a slot +leaked by one query makes the next query's acquire more likely to time out and be +silently dropped too). + +**Status (2026-06-17):** Fixed. Added a `std::atomic<bool> bAllSlotsAcquired(true)` +captured by reference in the per-query lambda; on `AcquirePoolSlot() < 0` the lambda +now sets it `false` (in addition to the existing `logerr`) instead of just +returning. After `wait_on_threads()`, `RunSearch` checks the flag and, if any query +failed to acquire a slot, calls `g_cometStatus.SetStatus(CometResult_Failed, ...)` +and sets `bSucceeded = false` before returning, making the failure visible to the +caller instead of silently dropping the affected queries from the batch. + +--- + +Appendix: Verified, no changes needed +---------------------------------------- +- `Options`/`DBInfo`/`StaticMod`/`PrecalcMasses`/`VarModParams`/`MassUtil`/ + `ToleranceParams`/`IonInfo`/`StaticParams` `operator= = default`: every member of + every struct is a value type (POD scalar, fixed array of scalars, `std::string`, + `std::vector`, `std::multimap`, `std::chrono::time_point`) -- no owning raw + pointers anywhere in `Params.h`, so compiler-generated copy is correct and copies + every field, closing the original drift bug for good rather than just patching the + fields named in the original finding. +- `executeBatchLegacy` / `SearchUtils.cpp` extraction: behavior-preserving: verbose + flag wiring matches each strategy's pre-diff console output exactly; locking around + `CheckExit`'s new `session.queries.size()` argument is consistent with the existing + `queriesMutex` discipline at the push site (CometPreprocess.cpp:3236). +- `Pipeline::cleanupBatch` now also drains `session.ms1Queries` -- consistent with + the dead/not-yet-wired batch MS1 path noted in the prior review; no active leak + today, but correct hygiene if that path is wired in later. 
+- Build (pre-fix and post-fix): `make cclean && make -j20` from a clean tree -- + zero warnings both times. +- Tests (pre-fix and post-fix): `python3 tests/unit/run_tests.py --comet + ./comet.exe` -- 17 passed, 0 failed, 0 skipped both times. From afe4960de24d9da47691b80edfce0e214fcc4841 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Wed, 17 Jun 2026 20:39:01 -0700 Subject: [PATCH 11/15] fix: AScorePro stale-mod ordering and PI_DB batch search segfault - Move AScorePro init/teardown from CometSearchManager::DoSearch() into Pipeline::run(), after strategy initialize() and around finalize(), so SetAScoreOptions() reads variableModParameters after FiStrategy has already overwritten it from the .idx file's VariableMod: header instead of stale/default values. - Fix CometSearch::SearchPeptideIndex(ThreadPool*, vector&) never assigning _pQueries, left over from the Strategy/Pipeline refactor; this left BinarySearchMass() dereferencing a null pointer and silently crashing every PI_DB (-j) batch search on the first scored candidate. - Add t19 (FI_DB + AScorePro ordering) and t20 (PI_DB regression) tests, each empirically verified to fail pre-fix and pass post-fix. - Tidy up: writer close()-after-failed-open contract documented, isIndexBased() doc tightened, stale SearchSession.h migration comment resolved, redundant Params.h operator= declarations removed. - Update DataStructures.md/GlobalVariables.md/RealTimeSearch.md to match current SearchSession fields, SearchMemoryPool (replacing g_searchMemoryPoolMutex), and the _pQueries discipline; record both fixes in docs/20260617_codereview3.md. 
Co-Authored-By: Claude Sonnet 4.6 --- CometSearch/CometSearch.cpp | 5 + CometSearch/CometSearchManager.cpp | 16 +- CometSearch/core/Params.h | 18 -- CometSearch/output/IResultWriter.h | 6 + CometSearch/search/ISearchStrategy.h | 8 +- CometSearch/search/Pipeline.cpp | 23 ++ CometSearch/search/SearchSession.h | 26 +- docs/20260617_codereview3.md | 367 ++++++++++++++++++++++++++ docs/DataStructures.md | 15 +- docs/GlobalVariables.md | 10 +- docs/RealTimeSearch.md | 8 +- tests/unit/data/t19_ascore_fidb.fasta | 2 + tests/unit/data/t19_ascore_fidb.ms2 | 20 ++ tests/unit/run_tests.py | 324 +++++++++++++++++++++++ 14 files changed, 791 insertions(+), 57 deletions(-) create mode 100644 docs/20260617_codereview3.md create mode 100644 tests/unit/data/t19_ascore_fidb.fasta create mode 100644 tests/unit/data/t19_ascore_fidb.ms2 diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index 4dc013df..01d10d26 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -1864,6 +1864,11 @@ bool CometSearch::SearchPeptideIndex(ThreadPool* /*tp*/, vector& queries comet_fileoffset_t lEndOfStruct; FILE* fp; + // BinarySearchMass() and AnalyzePeptideIndex() read the query list through + // _pQueries rather than a parameter (mirroring CometSearch::DoSearch()); without + // this assignment _pQueries stays nullptr on a freshly constructed CometSearch + // instance and the first dereference below segfaults. 
+ _pQueries = &queries; CometPostAnalysis cpa; diff --git a/CometSearch/CometSearchManager.cpp b/CometSearch/CometSearchManager.cpp index 99e46e58..a696eee3 100644 --- a/CometSearch/CometSearchManager.cpp +++ b/CometSearch/CometSearchManager.cpp @@ -2106,17 +2106,8 @@ bool CometSearchManager::DoSearch() return bSucceeded; // index written; caller (InitializeSingleSpectrumSearch) will load it } - // AScore initialization (once for entire DoSearch run) - if (g_staticParams.options.iPrintAScoreProScore) - { - SetAScoreOptions(g_AScoreOptions); - g_AScoreInterface = CreateAScoreDllInterface(); - if (!g_AScoreInterface) - { - std::cerr << "Failed to create AScore interface." << std::endl; - return false; - } - } + // AScore initialization happens inside Pipeline::run(), after the strategy has + // loaded its database/index -- see the comment there for why the ordering matters. if (g_bPerformSpecLibSearch) CometSpecLib::LoadSpecLib(g_staticParams.speclibInfo.strSpecLibFile); @@ -2151,9 +2142,6 @@ bool CometSearchManager::DoSearch() Pipeline pipeline(std::move(pStrategy), std::move(vWriters), this); bSucceeded = pipeline.run(session, g_pvInputFiles, *tp); - if (g_staticParams.options.iPrintAScoreProScore) - DeleteAScoreDllInterface(g_AScoreInterface); - return bSucceeded; } diff --git a/CometSearch/core/Params.h b/CometSearch/core/Params.h index 4d2eef53..37f1aebe 100644 --- a/CometSearch/core/Params.h +++ b/CometSearch/core/Params.h @@ -94,8 +94,6 @@ struct Options DoubleRange clearMzRange; char szActivationMethod[24]; // mzXML only string sPinProteinDelimiter; // PIN file protein delimiter; default tab - - Options& operator=(const Options&) = default; }; // The minimum and maximum mass range of all peptides to consider @@ -117,8 +115,6 @@ struct DBInfo char szFileName[SIZE_FILE]; int iTotalNumProteins; unsigned long int uliTotAACount; - - DBInfo& operator=(const DBInfo&) = default; }; struct SpecLibInfo // why a struct for just a string??? 
@@ -139,8 +135,6 @@ struct StaticMod double dAddCterminusProtein; double dAddNterminusProtein; double pdStaticMods[SIZE_MASS]; - - StaticMod& operator=(const StaticMod&) = default; }; struct PrecalcMasses @@ -150,8 +144,6 @@ struct PrecalcMasses double dOH2ProtonCtermNterm; // dOH2parent + PROTON_MASS + dAddCterminusPeptide + dAddNterminusPeptide int iMinus17; // BIN'd value of mass(NH3) int iMinus18; // BIN'd value of mass(H2O) - - PrecalcMasses& operator=(const PrecalcMasses&) = default; }; struct VarModParams @@ -174,8 +166,6 @@ struct VarModParams string sCompoundModsFile; // path to compound mods mass file; empty = disabled vector vdCompoundMasses; // sorted, deduplicated list of masses read from sCompoundModsFile unsigned int uiNumCompoundMasses; // vdCompoundMasses.size(); 0 when feature is disabled - - VarModParams& operator=(const VarModParams&) = default; }; struct MassUtil @@ -192,8 +182,6 @@ struct MassUtil double pdAAMassParent[SIZE_MASS]; double pdAAMassFragment[SIZE_MASS]; double pdAAMassUser[SIZE_MASS]; // user defined default amino acid masses - - MassUtil& operator=(const MassUtil&) = default; }; struct ToleranceParams @@ -207,8 +195,6 @@ struct ToleranceParams double dFragmentBinStartOffset; double dMS1BinSize; double dMS1BinStartOffset; - - ToleranceParams& operator=(const ToleranceParams&) = default; }; struct IonInfo @@ -218,8 +204,6 @@ struct IonInfo bool bUseWaterAmmoniaLoss; // ammonia, water loss int iTheoreticalFragmentIons; int iIonVal[NUM_ION_SERIES]; - - IonInfo& operator=(const IonInfo&) = default; }; // static user params, won't change per thread - can make global! 
@@ -264,8 +248,6 @@ struct StaticParams RestoreDefaults(); } - StaticParams& operator=(const StaticParams&) = default; - void RestoreDefaults() { int i; diff --git a/CometSearch/output/IResultWriter.h b/CometSearch/output/IResultWriter.h index 9625fdb6..84ea7c0e 100644 --- a/CometSearch/output/IResultWriter.h +++ b/CometSearch/output/IResultWriter.h @@ -62,6 +62,12 @@ class IResultWriter // Write format footer (if any), close file(s), and optionally remove // them (bEmpty = iTotalSpectraSearched == 0). + // + // Contract: must be safe to call even if open() was never called, or returned + // false partway through. Pipeline::run() calls close(false, false) on every + // writer in the vector -- including ones after the one whose open() failed -- + // when any writer's open() fails, so implementations must null-check their file + // handles before touching them. virtual void close(bool bSucceeded, bool bEmpty) = 0; protected: diff --git a/CometSearch/search/ISearchStrategy.h b/CometSearch/search/ISearchStrategy.h index 02d48e49..f979d027 100644 --- a/CometSearch/search/ISearchStrategy.h +++ b/CometSearch/search/ISearchStrategy.h @@ -67,7 +67,11 @@ class ISearchStrategy // Frees memory pools and (for FI_DB) the fragment index arrays. virtual void finalize() = 0; - // Returns true for index-based searches (FI_DB, PI_DB). - // Pipeline uses this to select progress-message style. + // Returns true for index-based searches (FI_DB, PI_DB), false for FASTA_DB. + // Pipeline::run() is the only consumer, and uses it solely to choose between the + // compact index-style progress line ("- searching ... done") and the verbose + // FASTA-style per-file banners ("Search start:"/"Search end:", "done" per batch). + // This flag carries no other semantics -- it must not be used to gate actual + // search behavior; that belongs in the strategy's own initialize()/executeBatch(). 
virtual bool isIndexBased() const = 0; }; diff --git a/CometSearch/search/Pipeline.cpp b/CometSearch/search/Pipeline.cpp index 4f677ff5..513eeeaf 100644 --- a/CometSearch/search/Pipeline.cpp +++ b/CometSearch/search/Pipeline.cpp @@ -18,6 +18,7 @@ #include "CometPreprocess.h" #include "CometMassSpecUtils.h" #include "MSReader.h" +#include "AScoreFactory.h" Pipeline::Pipeline(std::unique_ptr strategy, std::vector> writers, @@ -40,6 +41,25 @@ bool Pipeline::run(SearchSession& session, return false; } + // AScore initialization happens here -- after the strategy has loaded its + // database/index -- rather than earlier in DoSearch(), because FI_DB's + // ReadPlainPeptideIndex() (called from FiStrategy::initialize() above) overwrites + // g_staticParams.variableModParameters.varModList[] from the .idx file's + // VariableMod: header. SetAScoreOptions() reads those same fields to build its + // differential-mod list, so it must run after the index load, not before, or it + // configures AScore from stale/default mod values. + if (g_staticParams.options.iPrintAScoreProScore) + { + _pMgr->SetAScoreOptions(g_AScoreOptions); + g_AScoreInterface = CreateAScoreDllInterface(); + if (!g_AScoreInterface) + { + std::cerr << "Failed to create AScore interface." << std::endl; + _strategy->finalize(); + return false; + } + } + bool bSucceeded = true; int iTotalAllFiles = 0; // spectra searched across all files (for blank-file check) @@ -261,6 +281,9 @@ bool Pipeline::run(SearchSession& session, _strategy->finalize(); + if (g_staticParams.options.iPrintAScoreProScore) + DeleteAScoreDllInterface(g_AScoreInterface); + // Print overall "done" banner for index-based searches. 
if (_strategy->isIndexBased()) { diff --git a/CometSearch/search/SearchSession.h b/CometSearch/search/SearchSession.h index e595c06c..cc1f9e2f 100644 --- a/CometSearch/search/SearchSession.h +++ b/CometSearch/search/SearchSession.h @@ -20,18 +20,22 @@ // g_vSpecLib, g_pvProteinsList, g_pvProteinNameCache, g_pvDBIndex, ...) are NOT moved // here -- they are large, initialised once, and shared read-only across all threads. // -// Phase 4 migration note: -// g_pvQueryMutex, g_bPlainPeptideIndexRead, and g_bSpecLibRead remain as globals -// because they are also accessed from the RTS path (InitializeSingleSpectrumSearch / -// DoSingleSpectrumSearchMultiResults), which does not use SearchSession. -// SearchSession does not shadow these globals; all code reads the globals directly. -// Full removal is deferred to Phase 5. +// g_pvQueryMutex, g_bPlainPeptideIndexRead, and g_bSpecLibRead remain as globals, +// not SearchSession members, and this is permanent rather than a pending migration +// step: they are also read/written by the RTS path (InitializeSingleSpectrumSearch / +// DoSingleSpectrumSearchMultiResults), which is intentionally not moved into the +// strategy/Pipeline pattern (see docs/20260612_architecture_migration.md, "RTS path" -- +// the RTS entry points are wrapper-compatibility-sensitive and out of scope for the +// migration). Since a single process can serve both RTS and batch requests, this +// once-per-process init state must stay process-global so both paths observe the same +// value; it cannot move into a per-batch-run SearchSession. SearchSession does not +// shadow these globals; all code reads the globals directly. // -// g_cometStatus is exposed here as statusRef: a reference to the process-wide -// singleton. Pipeline and strategy code use session.statusRef so they are not -// coupled to the global name; deep core files (CometSearch.cpp, CometPreprocess.cpp, -// etc.) 
still reference g_cometStatus directly because they have no SearchSession -// in scope. Both spellings touch the same object. +// g_cometStatus is exposed here as statusRef: a reference to the process-wide +// singleton. Pipeline and strategy code use session.statusRef so they are not +// coupled to the global name; deep core files (CometSearch.cpp, CometPreprocess.cpp, +// etc.) still reference g_cometStatus directly because they have no SearchSession +// in scope. Both spellings touch the same object. #ifndef _SEARCHSESSION_H_ #define _SEARCHSESSION_H_ diff --git a/docs/20260617_codereview3.md b/docs/20260617_codereview3.md new file mode 100644 index 00000000..b524cc62 --- /dev/null +++ b/docs/20260617_codereview3.md @@ -0,0 +1,367 @@ +# Code Review: architecture_update branch (2026-06-17) -- independent pass + +## Scope + +Independent review of the `architecture_update` branch versus `master`, at branch tip +commit `0e10e71f` (74 files changed, +5,936/-3,275 lines). Performed without reference +to the same-day reviews in `docs/20260617_codereview.md` and `docs/20260617_codereview2.md`, +per request, as a second independent pass over the Strategy/Pipeline refactor: +`ISearchStrategy` (`FiStrategy` / `PiStrategy` / `FastaStrategy`) + `Pipeline` replacing +the monolithic `CometSearchManager::DoSearch` per-file loop, `SearchSession` replacing +the batch-path globals, `SearchMemoryPool` with RAII slot guards, and a new +`output/IResultWriter` layer wrapping the existing `CometWrite*` classes. 
+ +Method: clean rebuild (`make cclean && make -j$(nproc)`) with a warning scan; full unit ++ integration test run (19/19 passed, including the T18 byte-identical determinism +check); manual line-by-line trace of the ~1,232 lines removed from +`CometSearchManager.cpp` against their new homes in `SearchUtils.cpp` / the strategy +classes to confirm behavior was preserved; targeted reads of `SearchMemoryPool`, +`Pipeline`, `SearchSession`, all three strategies, all five `IResultWriter` +implementations, and `core/Params.h` / `Types.h` / `Constants.h`. + +--- + +## 1. Summary + +Build is clean under `-Wall -Wextra` (zero warnings) and all 19 tests pass. The +extraction of `DoSearch`'s per-file loop into `Pipeline` + strategy classes is largely +faithful -- the diff was traced line-by-line and the removed logic reappears intact in +`SearchUtils.cpp` and the three strategy `.cpp` files, including the per-batch writer +open/write/close lifecycle and the FASTA/idx file-handle handling. The latest commit's +exception-safety fix (`SearchMemoryPoolSlotGuard` applied at all five +acquire/release sites) is correctly done. One concrete correctness regression was found: +reordering AScore initialization relative to fragment-index loading silently breaks +AScorePro phosphosite scoring for batch FI_DB searches. A few maintainability gaps in +the new abstraction are also worth hardening before this lands on `master`. + +**Status (2026-06-17): all items closed, plus one additional critical bug (2b) found +during live testing after this review.** Issue 2a, 2b, all of section 3, and all of +section 4 have been fixed -- see the per-item status notes below, including two new +regression tests (`t19`, `t20`) each verified to fail against its respective pre-fix +code and pass against the fix. Rebuilt clean (`make cclean && make -j$(nproc)`, zero +warnings) and re-ran the full unit + integration suite (21 passed, 0 failed, 0 skipped) +after the final round of fixes. + +--- + +## 2. 
Critical Issues + +### 2a. AScorePro configured with stale variable-mod data for batch FI_DB searches + +**Files:** `CometSearch/CometSearchManager.cpp:2110-2119` (new AScore-init call site) +vs. `CometSearch/search/FiStrategy.cpp:67-83` (index load, now run afterward) + +`SetAScoreOptions(g_AScoreOptions)` is now called once, unconditionally, near the top +of `DoSearch()` -- *before* `Pipeline::run()` constructs and initializes the strategy. +For `FI_DB` (fragment-index) searches, `FiStrategy::initialize()` subsequently calls +`CometFragmentIndex::ReadPlainPeptideIndex()`, which **overwrites** +`g_staticParams.variableModParameters.varModList[].dVarModMass / szVarModChar / +dNeutralLoss` from the `.idx` file's `VariableMod:` header line +(`CometFragmentIndex.cpp:1276-1310`). `SetAScoreOptions()` reads exactly those fields +(`CometSearchManager.cpp:3225-3258`) to build the AScore differential-mod list. + +Pre-refactor, this was correctly sequenced: the diff shows the old code ran +`ReadPlainPeptideIndex()` / `CreateFragmentIndex()` *first*, then `SetAScoreOptions()` +second, inside the per-file loop guarded by a `bPerformAScoreInitialization` flag. The +RTS path (`InitializeSingleSpectrumSearch`, `CometSearchManager.cpp:2268-2287`) still +gets this right and even carries a comment explaining why: *"normally set at end of +InitializeStaticParams; must do here again after ReadPlainPeptideIndex for single +spectrum search."* The same re-sync was not preserved for the batch path after the +refactor. + +`PI_DB` is not affected: `CometSearch::SearchPeptideIndex` (`CometSearch.cpp:1880-1903`) +lazily re-parses the index header and re-calls `SetAScoreOptions` on first invocation, +guarded by `g_bPeptideIndexRead`, so it self-heals. `FI_DB` has no equivalent internal +correction. 
+ +**Impact:** any batch search against a prebuilt fragment index with +`print_ascore_score` enabled will configure AScorePro using whatever variable-mod +values happened to already be in `g_staticParams` *before* the index header was parsed +-- commonly empty/default, since FI_DB search-time params files don't need to redeclare +variable mods (they're embedded in the index). AScore site-localization scores would +silently be computed against the wrong (or no) differential mod, with no error raised. + +**Fix:** move the `SetAScoreOptions` / `CreateAScoreDllInterface` block in `DoSearch()` +to after the strategy's `initialize()` has run, e.g.: + +```cpp +// CometSearchManager.cpp, after strategy selection +if (!pStrategy->initialize(session, tp)) { pStrategy->finalize(); return false; } +if (g_staticParams.options.iPrintAScoreProScore) +{ + SetAScoreOptions(g_AScoreOptions); + g_AScoreInterface = CreateAScoreDllInterface(); + if (!g_AScoreInterface) { std::cerr << "Failed to create AScore interface." << std::endl; return false; } +} +``` + +This also avoids creating an AScore interface for an `FI_DB` run whose index fails to +load. + +**Status (2026-06-17):** Fixed. The AScore init/teardown block was moved out of +`CometSearchManager::DoSearch()` and into `Pipeline::run()` (`CometSearch/search/ +Pipeline.cpp`), rather than patched in place, so the fix covers every strategy through +one call site instead of duplicating the re-sync logic per strategy (see Actionable +Improvement 4b, also closed by this change). `SetAScoreOptions()` / +`CreateAScoreDllInterface()` now run immediately after `_strategy->initialize(session, +&tp)` succeeds -- i.e. after `FiStrategy::initialize()` has already called +`ReadPlainPeptideIndex()` for FI_DB runs -- and `DeleteAScoreDllInterface()` now runs +right after `_strategy->finalize()` at the end of `run()`, matching the original +unconditional teardown. 
A failure to create the AScore interface now also calls +`_strategy->finalize()` before returning, so the strategy's allocated memory pools are +not leaked on that error path (a small improvement over the pre-fix code, which +returned without finalizing on this same error). Verified with a clean rebuild (zero +warnings) and the full 19-test unit + integration suite. + +--- + +### 2b. Batch PI_DB search crashes on the first scored candidate (`_pQueries` never assigned) + +**File:** `CometSearch/CometSearch.cpp:1862` (`SearchPeptideIndex(ThreadPool*, vector<Query*>&)`) + +**Discovered:** 2026-06-17, reported against the VS-built Windows binary running a real +peptide-index (`-j`) search via WSL interop: the process printed +`- searching "" ...` and then exited with no further output, no error message, +and no result file -- a silent crash, not a hang. + +`CometSearch::BinarySearchMass()` and the `AnalyzePeptideIndex(int iWhichQuery, ...)` +overload read the active query list through a `CometSearch` member, `_pQueries`, +rather than a parameter. `CometSearch::DoSearch()` (the FASTA path) sets +`_pQueries = &queries;` at entry for exactly this reason. The architecture refactor +changed `BinarySearchMass()` from reading the old global `g_pvQuery` directly to +reading it through `_pQueries`, and updated `DoSearch()` accordingly, but +`SearchPeptideIndex(ThreadPool*, vector<Query*>&)` -- the PI_DB batch path, called from +a freshly constructed `CometSearch* sqSearch = new CometSearch();` in +`CometSearch::RunSearch(int, int, ThreadPool*, vector<Query*>&)` -- was never updated +to set `_pQueries`. It stayed `nullptr` (the class's default member initializer), and +the first call into `BinarySearchMass()` dereferenced it, segfaulting before any +output was written. 
+ +**Reproduced locally** with a minimal fixture (T19's phospho peptide/spectrum, built as +a PI_DB index instead of FI_DB) and confirmed via `gdb` backtrace: + +``` +#0 CometSearch::BinarySearchMass(int, int, double) const +#1 CometSearch::SearchPeptideIndex(ThreadPool*, vector&) +#2 CometSearch::RunSearch(int, int, ThreadPool*, vector&) +#3 RunSearchAndPostAnalysis(int, int, ThreadPool*, SearchSession&, bool) +#4 Pipeline::run(SearchSession&, vector const&, ThreadPool&) +#5 CometSearchManager::DoSearch() +``` + +matching the reported symptom exactly: the crash happens after the `"- searching ..."` +progress print and before any batch completes. + +**Fix:** added `_pQueries = &queries;` at the top of +`SearchPeptideIndex(ThreadPool*, vector&)`, mirroring `DoSearch()`. + +**Status (2026-06-17):** Fixed and empirically validated both directions, not just +inspected. With the fix: a PI_DB search of the fixture completes and scores correctly +(`xcorr=3.4260`, `ascorepro=330.7289`, phospho correctly localized to position 7). +Then `git stash`-reverted just this one-line fix, rebuilt, and re-ran the same search: +it reproduced the identical segfault inside `BinarySearchMass`, confirming the fix is +both necessary and sufficient. Restored the fix and confirmed the full test suite +(21 tests, including the two new ones below) passes cleanly with zero build warnings. + +Added two regression tests to `tests/unit/run_tests.py`: +- **t19** (already added for issue 2a) continues to cover the FI_DB AScore-ordering fix. +- **t20** (new) reuses T19's phospho fixture but builds a PI_DB (`-j`) index instead of + an FI_DB (`-i`) index, then runs the same search and asserts it exits cleanly (rc=0) + and produces the correct PSM. Verified to fail (non-zero exit from the crash) against + the pre-fix code and pass against the fix, the same way 2a's test was validated. + +--- + +## 3. Code Quality & Maintainability + +### 3a. 
Pipeline relies on an undocumented "close() is always safe on an unopened writer" contract + +**File:** `CometSearch/search/Pipeline.cpp:104-118` + +When a writer's `open()` fails partway through the writer list, `close(false, false)` +is called on *every* writer, including ones whose `open()` was never reached. This only +works because every concrete `IResultWriter` happens to null-check its file handle +first in `close()`. The invariant is real and currently upheld by all five writers, but +it is not stated anywhere in `IResultWriter.h`; a future writer that forgets the +null-check will crash on a partial-open failure with no compiler or test signal. + +**Fix:** add a one-line contract comment above `IResultWriter::close()` stating that +`close()` must be safe to call even if `open()` was never called or failed. + +**Status (2026-06-17):** Fixed. Added a contract note to `IResultWriter::close()` in +`CometSearch/output/IResultWriter.h` stating that implementations must be safe to call +even when `open()` was never invoked or returned false, and explaining why +(`Pipeline::run()` calls `close(false, false)` on every writer in the vector, including +ones after the one whose `open()` failed). No behavior change; all five existing +writers already satisfy the contract. + +### 3b. Stale "Phase 5" migration note in SearchSession.h + +**File:** `CometSearch/search/SearchSession.h:23-28` + +The header still says *"g_pvQueryMutex, g_bPlainPeptideIndexRead, and g_bSpecLibRead +remain as globals... Full removal is deferred to Phase 5."* Per `docs/20260612 +_architecture_migration.md`'s own phase numbering, Phase 5 (Pipeline/Strategy) is +the work already present in this branch. The comment now reads as an open TODO with no +tracked follow-up. 
Either the deferral is permanent (the RTS path will never adopt +`SearchSession`) and the comment should say so plainly, or there is real follow-up work +that should be filed somewhere visible instead of living only in a header comment. + +**Status (2026-06-17):** Fixed -- closed as part of fixing Actionable Improvement 4d, +which addressed this same `SearchSession.h:23-28` comment block ("state plainly" branch +chosen there). See 4d's status note for the detail; recorded separately here only +because this finding and 4d originally described the same fix as two different +write-ups (a critique and its corresponding improvement) rather than one item. + +### 3c. `isIndexBased()` conflates two unrelated concerns + +**File:** `CometSearch/search/ISearchStrategy.h:70-72`, used throughout +`CometSearch/search/Pipeline.cpp` + +`Pipeline::run()` branches on `_strategy->isIndexBased()` both to decide whether to +print the FASTA-style "Search start:" banner / per-spectrum verbose logging *and* to +decide whether to print the index-style "searching... done" progress line. These are +really one decision ("which strategy is this") wearing the trappings of two +unrelated questions (whether reading the database needs an index, and which console +output style to use). Today the mapping happens to be 1:1, so it costs nothing, but a +fourth strategy with index-based storage and FASTA-style verbose logging (or vice +versa) would have no way to express that without a behavior change at every call site. +Not urgent, but worth a `progressStyle()`-type accessor if a fourth strategy is ever +added. + +**Status (2026-06-17):** Addressed, narrower than originally framed. Re-checked every +call site of `isIndexBased()` (`grep` across `CometSearch/`): all nine are in +`Pipeline.cpp`, and every one of them is purely a console-output style switch (verbose +FASTA banners vs. 
the compact index-style progress line, including the "Reading all +spectra into memory" warning, which only ever changes what gets *printed*, not what +the strategy does). On closer inspection there isn't a second concern hiding in current +usage -- the original framing overstated the issue. Splitting the interface into two +accessors for a distinction the code doesn't actually have yet would be the kind of +premature abstraction this codebase's conventions warn against, so instead the +doc comment on `ISearchStrategy::isIndexBased()` was tightened to state explicitly that +`Pipeline::run()` is the only consumer, name the exact banners/lines it switches +between, and warn that the flag must not be used to gate actual search behavior. If a +fourth strategy ever needs index-based storage with FASTA-style verbose logging (or +vice versa), that is the trigger to revisit the split, not before. + +### 3d. Redundant `operator=` declaration left over from the Params.h cleanup + +**File:** `CometSearch/core/Params.h:98` (and similarly for the other structs touched +by the same cleanup) + +`Options& operator=(const Options&) = default;` is now redundant: with no other +user-declared special member, the compiler already generates an identical copy +assignment implicitly. Harmless, but it's leftover noise from the "replace +hand-written operator= with = default" pass -- could simply be deleted now that the +hand-written bodies are gone. + +**Status (2026-06-17):** Fixed. Removed all nine redundant `operator= = default` +declarations from `core/Params.h` (`Options`, `DBInfo`, `StaticMod`, `PrecalcMasses`, +`VarModParams`, `MassUtil`, `ToleranceParams`, `IonInfo`, `StaticParams`). 
None of these +structs declares a destructor, copy constructor, move constructor, or move assignment, +so a user-declared default constructor (only `StaticParams` has one) does not suppress +the implicit copy assignment operator either -- the compiler generates the identical +member-wise copy with the declarations removed. Verified with a clean rebuild (zero +warnings) and the full 19-test suite. + +--- + +## 4. Actionable Improvements + +### 4a. Add a regression test for AScore + FI_DB + +No test in `tests/unit/` exercises `print_ascore_score` against a fragment index, +which is exactly why issue 2a was not caught by CI. A minimal test that builds a tiny +FI_DB index with a variable mod, runs a search with `print_ascore_score` set and a +deliberately different/blank `variable_mod01` in the search-time params, and asserts +the AScore differential-mod symbol/mass reflects the `.idx` file's value (not the +params file's) would catch this entire class of ordering bug permanently and guard +against it recurring during future refactors. + +**Status (2026-06-17):** Fixed. Added `t19` to `tests/unit/run_tests.py`, with fixtures +`tests/unit/data/t19_ascore_fidb.fasta` (single 8-residue protein, one phospho-acceptor +S) and `t19_ascore_fidb.ms2` (synthetic singly-charged b/y ions for +`ACDEFGS[+79.966331]K`, precomputed from monoisotopic residue masses). The test builds +an FI_DB index with a real `variable_mod01` (phospho on S), then searches it with +`print_ascorepro_score=1` but a deliberately blank `variable_mod01` in the search-time +params -- the realistic case, since FI_DB search params don't need to redeclare mods +already baked into the index. It asserts the rank-1 PSM's `ascorepro` column is `> 0`. 
+ +Verified the test actually discriminates the bug, not just incidentally passes: with +the fix in place it reports `ascorepro = 330.7289`; temporarily reverting +`CometSearchManager.cpp`/`Pipeline.cpp` to the pre-fix ordering (`git stash`, rebuild) +reproduces the exact failure mode predicted in 2a's analysis -- `ascorepro` comes back +as the untouched default sentinel `0.0`, because with the bug `g_AScoreOptions`'s +symbol never gets set to the mod's index (`CometSearch.cpp:5584-5585`'s +`iVal == g_AScoreOptions.getSymbol() - '0'` check fails), so `cHasVariableMod` is never +set to `HasVariableModType_AScorePro` and `CometPostAnalysis::CalculateAScorePro()` +returns immediately without running. Restored the fix afterward and confirmed the full +20-test suite (19 prior + t19) passes cleanly with zero build warnings. + +### 4b. Fix issue 2a at a single call site rather than inside FiStrategy + +Patching `FiStrategy::initialize()` to re-call `SetAScoreOptions()` after +`ReadPlainPeptideIndex()` would work but duplicates a process-wide concern (AScore +setup) inside a per-strategy class. Fixing the ordering at one shared call site keeps +the AScore lifecycle in one place and automatically covers any future strategy that +loads an index with embedded mod definitions. + +**Status (2026-06-17):** Done as part of fixing 2a. The shared call site chosen was +`Pipeline::run()` rather than `DoSearch()` itself, since `Pipeline::run()` is what +actually invokes `_strategy->initialize()`/`finalize()` and is already the single +caller of both -- placing the AScore lifecycle there means no strategy subclass needs +its own re-sync logic, present or future. + +### 4c. Document the writer close()-after-failed-open contract + +One sentence on `IResultWriter::close()` (issue 3a) removes the only undocumented +cross-class invariant `Pipeline::run()` currently depends on. + +**Status (2026-06-17):** Done as part of fixing 3a -- see that item's status note. + +### 4d. 
Resolve the stale Phase 5 comment + +Either state plainly in `SearchSession.h` that the RTS-path globals are permanently +out of scope for `SearchSession`, or file the remaining migration work so it is +discoverable outside of a header comment (issue 3b). + +**Status (2026-06-17):** Done -- "state plainly" branch chosen. Checked +`docs/20260612_architecture_migration.md`'s own phase plan: Phase 5 (Pipeline/Strategy) +is the last phase defined, and its own "RTS path" section already states the RTS entry +points are "explicitly out of scope for Phase 5" because they are +wrapper-compatibility-sensitive -- there is no Phase 6 deferring further removal. +Rewrote the comment block in `SearchSession.h` to say plainly that +`g_pvQueryMutex`/`g_bPlainPeptideIndexRead`/`g_bSpecLibRead` remaining as globals is +permanent, not a pending migration step: a single process can serve both RTS and batch +requests, so this once-per-process init state must stay process-global rather than move +into a per-batch-run `SearchSession`. Also re-flowed the trailing `g_cometStatus` +paragraph, which had been nested under the now-removed "Phase 4 migration note:" +sub-header, to match the rest of the comment's indentation. + +--- + +## Appendix: Findings Not Requiring Code Changes + +- **SearchMemoryPool / RAII slot guards**: `SearchMemoryPoolSlotGuard` is applied at + all five `AcquirePoolSlot()` / `releaseSlot()` sites in `CometSearch.cpp` + (`CometSearch::RunSearch(Query*)` FI and PI branches, the single-query FI fallback, + the batch-FI per-query lambda, and `SearchThreadProc`). `SearchMemoryPool::allocate()` + correctly unwinds partial allocations on `bad_alloc`. No exception-safety gaps found. +- **g_bIndexPrecursors alloc/free**: allocated with `malloc` in + `CometSearchManager.cpp:1552`, freed with `free()` in `FiStrategy::finalize()` -- + consistent, no mismatched allocator. 
+- **Output writers**: `TxtWriter`, `SqtWriter`, `PercolatorWriter`, `PepXmlWriter`, + `MzIdentMlWriter` all null-check their file handles in `close()`, including + `MzIdentMlWriter`'s more involved temp-file merge/rename lifecycle (`FinalizeOne`). + No double-close, no leaked temp files in the `bEmpty` or failed-merge paths observed. +- **RunSearchAndPostAnalysis / executeBatchLegacy**: only ever called with a + non-empty `session.queries` (guarded by the empty-check in `executeBatchLegacy` + before `RunSearchAndPostAnalysis` is invoked), so the unchecked + `session.queries.at(0)` / `.at(size()-1)` mass-range calculation inside it is safe in + every current call path. +- **Fused FI search path (`FusedLoadAndSearchSpectra` / `FusedSearchSpectrum`)**: + pushes into `session.queries` under `session.queriesMutex`, consistent with the + non-fused path's locking discipline; `Pipeline`'s post-batch stats and + empty-batch handling work correctly for both paths. +- **Build / tests**: `make cclean && make -j$(nproc)` clean, zero warnings. 17 unit + 2 + integration tests (T17 peptide-count range, T18 byte-identical determinism) all pass. diff --git a/docs/DataStructures.md b/docs/DataStructures.md index 9bb602d1..011a332f 100644 --- a/docs/DataStructures.md +++ b/docs/DataStructures.md @@ -239,16 +239,15 @@ struct SearchSession // search/SearchSession.h | Field | Purpose | |-------|---------| -| `params` | `const StaticParams&` -- read-only reference to `g_staticParams`. | | `queries` | `vector<Query*>` -- per-batch MS2 query accumulator (replaces global `g_pvQuery` for the batch path). Protected by `queriesMutex`. | | `ms1Queries` | `vector<QueryMS1*>` -- per-batch MS1 query accumulator (replaces global `g_pvQueryMS1`). | | `queriesMutex` | `std::mutex` -- guards `queries` and `ms1Queries` during parallel spectrum loading. | | `bPerformDatabaseSearch` | Replaces the former global `g_bPerformDatabaseSearch`. | | `bPerformSpecLibSearch` | Replaces the former global `g_bPerformSpecLibSearch`.
| | `bIdxNoFasta` | Replaces the former global `g_bIdxNoFasta`. | -| `bPlainPeptideIndexRead` | Local copy of index-read state for this run. | -| `bSpecLibRead` | Local copy of speclib-read state for this run. | -| `status` | Per-run `CometStatus`; `g_cometStatus` remains as a global for the RTS path. | +| `statusRef` | `CometStatus&` -- a **reference** to the process-wide singleton `g_cometStatus`, not a per-run copy. Pipeline and strategy code use `session.statusRef` so they are not coupled to the global name, but both spellings touch the same object. | + +`SearchSession` has no `params` member -- code reads `g_staticParams` directly throughout; an earlier draft carried a `const StaticParams& params` field but it was unused and removed. There is also no `bPlainPeptideIndexRead` / `bSpecLibRead` member: `g_bPlainPeptideIndexRead`, `g_bSpecLibRead`, and `g_pvQueryMutex` remain plain globals rather than `SearchSession` fields, specifically because the RTS path (which never constructs a `SearchSession`) also reads/writes them -- see the header comment in `search/SearchSession.h` and the `g_pvQueryMutex` entry in `docs/GlobalVariables.md`. `SearchSession` is non-copyable. The RTS paths (`DoSingleSpectrumSearchMultiResults`, `DoMS1SearchMultiResults`) do **not** use `SearchSession`; they use per-call `Query*`/`QueryMS1*` objects directly. @@ -272,7 +271,7 @@ class Pipeline // search/Pipeline.h | `executeBatch(mstReader, firstScan, lastScan, analysisType, iPercentStart, iPercentEnd, tp, session)` | Once per batch | Preprocess + search + post-analysis for one spectrum batch; fills `session.queries`. | | `closeFiles(fpfasta, fpidx)` | Once per file | Close file handles. | | `finalize()` | Once after all files | Free memory pools and index arrays. | -| `isIndexBased()` | Any time | `true` for `FiStrategy`/`PiStrategy`; selects progress-message style in `Pipeline`. | +| `isIndexBased()` | Any time | `true` for `FiStrategy`/`PiStrategy`. 
`Pipeline::run()` is the only consumer, and uses it solely to choose between the compact index-style progress line and the verbose FASTA-style per-file banners -- it carries no other semantics and must not be used to gate search behavior. | **Concrete strategies:** @@ -282,7 +281,11 @@ class Pipeline // search/Pipeline.h | `FastaStrategy` | `search/FastaStrategy.cpp` | `FASTA_DB` | Classic three-sweep (load -> allocate -> RunSearch -> PostAnalysis). | | `PiStrategy` | `search/PiStrategy.cpp` | `PI_DB` | Three-sweep like FASTA but against the plain peptide index; no Mango block. | -**IResultWriter** (`output/IResultWriter.h`) is the parallel output abstraction. Each format (`TxtWriter`, `PepXmlWriter`, `SqtWriter`, `PercolatorWriter`, `MzIdentMlWriter`) implements `open()`, `write()`, `close()`. `Pipeline` holds a `vector<unique_ptr<IResultWriter>>` and calls them around the batch loop. +**AScore lifecycle:** `Pipeline::run()` -- not `DoSearch()` -- owns `SetAScoreOptions()` / `CreateAScoreDllInterface()` / `DeleteAScoreDllInterface()` for the batch path, called immediately after `_strategy->initialize()` succeeds and immediately after `_strategy->finalize()` runs. This ordering matters: for `FI_DB`, `FiStrategy::initialize()` calls `ReadPlainPeptideIndex()`, which overwrites `g_staticParams.variableModParameters.varModList[]` from the `.idx` file's `VariableMod:` header -- `SetAScoreOptions()` must run after that overwrite, not before, or it configures AScore from stale/default mod values. (The RTS path's `InitializeSingleSpectrumSearch()` has its own, separate, already-correctly-ordered AScore setup and is not affected by this.) + +**`_pQueries` discipline (PI_DB):** `CometSearch::BinarySearchMass()` and the `AnalyzePeptideIndex(int iWhichQuery, ...)` overload read the query list through the `CometSearch` member `_pQueries` rather than a parameter -- mirroring `CometSearch::DoSearch()` (the FASTA path), which sets `_pQueries = &queries` at entry.
`CometSearch::SearchPeptideIndex(ThreadPool*, vector<Query*>&)` (the PI_DB batch path, called from a freshly constructed `CometSearch` instance in `RunSearch()`) must do the same at its own entry; omitting it leaves `_pQueries` `nullptr` and crashes on the first call into `BinarySearchMass()`. Any new code path that calls into these two functions needs the same assignment first. + +**IResultWriter** (`output/IResultWriter.h`) is the parallel output abstraction. Each format (`TxtWriter`, `PepXmlWriter`, `SqtWriter`, `PercolatorWriter`, `MzIdentMlWriter`) implements `open()`, `write()`, `close()`. `Pipeline` holds a `vector<unique_ptr<IResultWriter>>` and calls them around the batch loop. `close()` must be safe to call even if `open()` was never invoked or returned false: when one writer's `open()` fails, `Pipeline::run()` calls `close(false, false)` on every writer in the vector, including ones after the failed one. --- diff --git a/docs/GlobalVariables.md b/docs/GlobalVariables.md index 59bc5ea0..e3793802 100644 --- a/docs/GlobalVariables.md +++ b/docs/GlobalVariables.md @@ -96,10 +96,14 @@ Used by the variable mod permutation engine (`CometModificationsPermuter`). |----------|------|-------| | `g_pvDBIndexMutex` | `Mutex` | Protects database index reads where concurrent access is possible. | | `g_preprocessMemoryPoolMutex` | `Mutex` | Protects the shared preprocessing memory pool. | -| `g_searchMemoryPoolMutex` | `Mutex` | Protects the shared search memory pool. | +| `g_pvQueryMutex` | `Mutex` | Protects `g_vSpecLib` load/access (`CometSpecLib.cpp`, `CometPreprocess.cpp`). Name is a holdover from before the architecture migration, when it also guarded the now-removed `g_pvQuery` global; it was repurposed rather than renamed. Remains a global (not a `SearchSession` member) because it is also used by the RTS path -- see `search/SearchSession.h`'s header comment. | | `g_ms1AlignerMutex` | `Mutex` | Protects `RetentionMatchHistory` updates in `DoMS1SearchMultiResults`.
| -| `g_vSpecLibMutex` | `Mutex` | Protects speclib access where needed. | -| `g_dbIndexMutex` | `Mutex` | Protects DB index access where needed. | + +**Note:** `g_searchMemoryPoolMutex` and the paired `g_searchPoolCV` condition variable were removed during the architecture migration; the search memory pool's locking is now encapsulated inside the `SearchMemoryPool` class (see below) instead of living as bare globals. + +### SearchMemoryPool (`threading/SearchMemoryPool.h`) + +Not a global variable, but the direct replacement for the old `_pbSearchMemoryPool` static array, `g_searchMemoryPoolMutex`, and `g_searchPoolCV` trio, so it is documented here for anyone updating this table. `CometSearch.cpp` holds a single file-static instance, `s_pool`, owning its own `std::mutex` and `std::condition_variable`. `CometSearch::AllocateMemory(N)` calls `s_pool.allocate(N, g_staticParams.iArraySizeGlobal)`; `AcquirePoolSlot()` / `releaseSlot()` forward to `s_pool.acquireSlot()` / `s_pool.releaseSlot()`. Every acquire site wraps the returned slot in a `SearchMemoryPoolSlotGuard` (RAII; releases on scope exit, including exception unwind) so a throw out of a search body cannot leak a slot and stall the next `acquireSlot()` caller for up to 240 s. 
--- diff --git a/docs/RealTimeSearch.md b/docs/RealTimeSearch.md index e778c452..a277e170 100644 --- a/docs/RealTimeSearch.md +++ b/docs/RealTimeSearch.md @@ -62,8 +62,9 @@ slow path: mutex-guarded check + initialization -> ValidateSequenceDatabaseFile() validates FASTA / index; sets bCreateFragmentIndex=true if .idx is absent but FASTA exists -> CometPreprocess::AllocateMemory() preprocessing thread buffers - -> CometSearch::AllocateMemory() search thread pool (_pbSearchMemoryPool, - _ppbDuplFragmentArr) used by AcquirePoolSlot() + -> CometSearch::AllocateMemory() search thread pool (s_pool, a SearchMemoryPool + instance; aliased into _ppbDuplFragmentArr) + used by AcquirePoolSlot() -> tp->fillPool() -> if iDbType == FI_DB: if bCreateFragmentIndex: @@ -258,7 +259,8 @@ The timeout clock is a `chrono::time_point tRealTimeStart` local to each call, p **Shared pools (allocated once at init, reused across calls):** - `CometPreprocess::AllocateMemory(N)` -- per-thread preprocessing buffers for the batch path. The RTS thread-local path bypasses this pool and allocates directly. -- `CometSearch::AllocateMemory(N)` -- allocates `_pbSearchMemoryPool[N]` and `_ppbDuplFragmentArr[N][]`, used by `AcquirePoolSlot()` to hand each concurrent call a dedicated duplicate-fragment scratch buffer. Must be valid before any call reaches `RunSearch(Query*, ...)`. If the index-build path was taken during init, this pool is freed inside `DoSearch()` and re-allocated by `InitializeSingleSpectrumSearch()` before proceeding. +- `CometSearch::AllocateMemory(N)` -- calls `s_pool.allocate(N, g_staticParams.iArraySizeGlobal)` (`s_pool` is a file-static `SearchMemoryPool` instance in `CometSearch.cpp`; see `threading/SearchMemoryPool.h`) and aliases each slot's scratch buffer into `_ppbDuplFragmentArr[N][]`. `AcquirePoolSlot()` / `releaseSlot()` forward to `s_pool.acquireSlot()` / `s_pool.releaseSlot()`. 
Every acquire site wraps the slot in a `SearchMemoryPoolSlotGuard` so the slot is released on scope exit even if the search body throws. Must be valid before any call reaches `RunSearch(Query*, ...)`. If the index-build path was taken during init, this pool is freed inside `DoSearch()` and re-allocated by `InitializeSingleSpectrumSearch()` before proceeding. +- **Known limitation:** `s_pool` is a single process-wide instance, so it does not support multiple concurrent `ICometSearchManager` instances performing RTS searches against different fragment indexes in the same process -- see the `TODO` comment at the top of `CometSearch.cpp` and `docs/20260615_multiple_rts_instances.md`. --- diff --git a/tests/unit/data/t19_ascore_fidb.fasta b/tests/unit/data/t19_ascore_fidb.fasta new file mode 100644 index 00000000..1fb45b10 --- /dev/null +++ b/tests/unit/data/t19_ascore_fidb.fasta @@ -0,0 +1,2 @@ +>sp|T19|ASCORE single short protein with one phospho-acceptor S +ACDEFGSK diff --git a/tests/unit/data/t19_ascore_fidb.ms2 b/tests/unit/data/t19_ascore_fidb.ms2 new file mode 100644 index 00000000..ad774dc8 --- /dev/null +++ b/tests/unit/data/t19_ascore_fidb.ms2 @@ -0,0 +1,20 @@ +H CreationDate synthetic +H Comment T19 AScore+FI_DB regression fixture: peptide ACDEFGS[+79.966331]K, charge 2+ +H Comment b/y ions (singly charged) computed from monoisotopic residue masses; +H Comment phospho (+79.966331) applied to S at position 7 (1-based). 
+S 1 1 468.662069 +Z 2 936.316862 +72.044386 100.0 +147.112801 105.0 +175.053576 120.0 +290.080516 110.0 +314.111162 135.0 +371.132622 118.0 +419.123106 130.0 +518.201032 122.0 +566.191516 125.0 +623.212976 115.0 +647.243622 128.0 +762.270562 132.0 +790.211337 140.0 +865.279752 138.0 diff --git a/tests/unit/run_tests.py b/tests/unit/run_tests.py index 92675378..d8fdf161 100644 --- a/tests/unit/run_tests.py +++ b/tests/unit/run_tests.py @@ -916,6 +916,330 @@ def test_t18(comet_exe): return failures +# --------------------------------------------------------------------------- +# T19 -- AScore + FI_DB regression (docs/20260617_codereview3.md issue 2a) +# --------------------------------------------------------------------------- +# +# CometSearchManager::SetAScoreOptions() reads g_staticParams.variableModParameters. +# varModList[] to configure AScorePro's differential-mod list. For an FI_DB (.idx) +# search, FiStrategy::initialize() loads the index and overwrites that same +# varModList[] from the .idx file's "VariableMod:" header line. If AScore were +# configured *before* that overwrite, it would be left with whatever (possibly blank) +# variable_mod01 the search-time params declared instead of the index's actual mod -- +# see the ordering comment in CometSearch/search/Pipeline.cpp. This test builds an +# FI_DB index with a real variable mod, then searches it with print_ascorepro_score +# enabled but a deliberately blank variable_mod01 in the search-time params (the +# common real-world case, since FI_DB search params don't need to redeclare mods +# already baked into the index), and checks that AScorePro actually ran rather than +# being silently skipped. +# +# Fixture peptide: ACDEFGS[+79.966331]K (charge 2+), the only candidate in the index +# within the configured mass range, with a single phospho-acceptor S so localization +# is unambiguous. 
tests/unit/data/t19_ascore_fidb.ms2 contains the matching singly +# charged b/y ions, precomputed from monoisotopic residue masses. + +T19_PARAMS_TEMPLATE = textwrap.dedent("""\ +# comet_version {comet_version} +database_name = {database} +decoy_search = 0 +num_threads = 4 +print_ascorepro_score = {ascorepro} +peptide_mass_tolerance_upper = 20.0 +peptide_mass_tolerance_lower = -20.0 +peptide_mass_units = 2 +precursor_tolerance_type = 1 +isotope_error = 0 +search_enzyme_number = 0 +search_enzyme2_number = 0 +sample_enzyme_number = 0 +num_enzyme_termini = 2 +allowed_missed_cleavage = 0 +variable_mod01 = {mod1} +variable_mod02 = 0.0 X 0 3 -1 0 0 0.0 +variable_mod03 = 0.0 X 0 3 -1 0 0 0.0 +variable_mod04 = 0.0 X 0 3 -1 0 0 0.0 +variable_mod05 = 0.0 X 0 3 -1 0 0 0.0 +max_variable_mods_in_peptide = 1 +require_variable_mod = 0 +fragment_bin_tol = 0.02 +fragment_bin_offset = 0.0 +theoretical_fragment_ions = 0 +use_A_ions = 0 +use_B_ions = 1 +use_C_ions = 0 +use_X_ions = 0 +use_Y_ions = 1 +use_Z_ions = 0 +use_Z1_ions = 0 +use_NL_ions = 0 +output_sqtfile = 0 +output_txtfile = 1 +output_pepxmlfile = 0 +output_mzidentmlfile = 0 +output_percolatorfile = 0 +num_output_lines = 1 +scan_range = 0 0 +precursor_charge = 0 0 +override_charge = 0 +ms_level = 2 +activation_method = ALL +digest_mass_range = 200.0 2000.0 +peptide_length_range = 8 8 +max_duplicate_proteins = -1 +max_fragment_charge = 3 +min_precursor_charge = 1 +max_precursor_charge = 6 +clip_nterm_methionine = 0 +spectrum_batch_size = 15000 +decoy_prefix = DECOY_ +equal_I_and_L = 0 +mass_offsets = +minimum_peaks = 10 +minimum_intensity = 0 +remove_precursor_peak = 0 +remove_precursor_tolerance = 1.5 +clear_mz_range = 0.0 0.0 +percentage_base_peak = 0.0 +add_Cterm_peptide = 0.0 +add_Nterm_peptide = 0.0 +add_Cterm_protein = 0.0 +add_Nterm_protein = 0.0 +add_G_glycine = 0.0 +add_A_alanine = 0.0 +add_S_serine = 0.0 +add_P_proline = 0.0 +add_V_valine = 0.0 +add_T_threonine = 0.0 +add_C_cysteine = 0.0 +add_L_leucine = 0.0 
+add_I_isoleucine = 0.0 +add_N_asparagine = 0.0 +add_D_aspartic_acid = 0.0 +add_Q_glutamine = 0.0 +add_K_lysine = 0.0 +add_E_glutamic_acid = 0.0 +add_M_methionine = 0.0 +add_H_histidine = 0.0 +add_F_phenylalanine = 0.0 +add_U_selenocysteine = 0.0 +add_R_arginine = 0.0 +add_Y_tyrosine = 0.0 +add_W_tryptophan = 0.0 +add_O_pyrrolysine = 0.0 +add_B_user_amino_acid = 0.0 +add_J_user_amino_acid = 0.0 +add_X_user_amino_acid = 0.0 +add_Z_user_amino_acid = 0.0 +[COMET_ENZYME_INFO] +0. Cut_everywhere 0 - - +1. Trypsin 1 KR P +2. Trypsin/P 1 KR - +""") + + +def _run_t19_step(comet_exe, args, timeout=120): + """Run comet_exe with args, return (returncode, combined stdout+stderr).""" + result = subprocess.run( + [str(comet_exe)] + args, capture_output=True, text=True, timeout=timeout, + ) + return result.returncode, result.stdout + result.stderr + + +@register("t19") +def test_t19(comet_exe): + """T19: AScore + FI_DB regression -- AScore must use the .idx file's variable mod, + not the search-time params' (blank) mod, for FI_DB searches.""" + failures = [] + + fasta = DATA_DIR / "t19_ascore_fidb.fasta" + ms2 = DATA_DIR / "t19_ascore_fidb.ms2" + idx = fasta.with_suffix(".fasta.idx") + txt = ms2.with_suffix(".txt") + + use_win = _binary_uses_win_paths(comet_exe) + fmt = _to_win if use_win else str + + # Step 1: build an FI_DB index with a real phospho-S mod baked into its header. + if idx.exists(): + idx.unlink() + + build_params = T19_PARAMS_TEMPLATE.format( + comet_version="2026.02 rev. 
0", database=fmt(fasta), + ascorepro=0, mod1="79.966331 S 0 1 -1 0 0 0.0", + ) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".params", dir=str(DATA_DIR), delete=False + ) as pf: + pf.write(build_params) + build_params_file = Path(pf.name) + + try: + rc, out = _run_t19_step(comet_exe, ["-i", f"-P{fmt(build_params_file)}"]) + if rc != 0 or not idx.exists(): + failures.append(f"index build failed (rc={rc}):\n{out}") + return failures + finally: + build_params_file.unlink(missing_ok=True) + + # Step 2: search the index with print_ascorepro_score enabled but a blank + # variable_mod01 in the search-time params. + if txt.exists(): + txt.unlink() + + search_params = T19_PARAMS_TEMPLATE.format( + comet_version="2026.02 rev. 0", database=fmt(idx), + ascorepro=1, mod1="0.0 X 0 3 -1 0 0 0.0", + ) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".params", dir=str(DATA_DIR), delete=False + ) as pf: + pf.write(search_params) + search_params_file = Path(pf.name) + + try: + rc, out = _run_t19_step(comet_exe, [f"-P{fmt(search_params_file)}", fmt(ms2)]) + if rc != 0: + failures.append(f"search failed (rc={rc}):\n{out}") + return failures + if not txt.exists(): + failures.append(f".txt not created. 
Comet output:\n{out}") + return failures + + lines = txt.read_text().splitlines() + header = lines[1].split("\t") # line 0 is the CometVersion/.../database line + rows = [l.split("\t") for l in lines[2:] if l.strip()] + + check(len(rows) == 1, f"expected exactly 1 PSM row, got {len(rows)}", failures) + if not rows: + return failures + + row = dict(zip(header, rows[0])) + + check(row.get("plain_peptide") == "ACDEFGSK", + f"plain_peptide: expected ACDEFGSK, got {row.get('plain_peptide')!r}", failures) + check("7_V_79.966331" in row.get("modifications", ""), + f"modifications: expected to contain 7_V_79.966331, got " + f"{row.get('modifications')!r}", failures) + + ascorepro = float(row.get("ascorepro", "0") or "0") + check(ascorepro > 0.0, + f"ascorepro: expected > 0 (AScore must run using the .idx file's mod, " + f"not the search-time params' blank mod), got {ascorepro}", failures) + finally: + search_params_file.unlink(missing_ok=True) + idx.unlink(missing_ok=True) + txt.unlink(missing_ok=True) + + return failures + + +# --------------------------------------------------------------------------- +# T20 -- PI_DB batch search regression (_pQueries never assigned) +# --------------------------------------------------------------------------- +# +# CometSearch::BinarySearchMass() and AnalyzePeptideIndex() read the query list +# through the _pQueries member (mirroring CometSearch::DoSearch(), the FASTA path, +# which sets _pQueries = &queries at entry) rather than through a parameter. The +# batch PI_DB path, CometSearch::SearchPeptideIndex(ThreadPool*, vector&), +# never set _pQueries, so it stayed nullptr on the freshly constructed CometSearch +# instance RunSearch() uses for PI_DB, and the first dereference inside +# BinarySearchMass() segfaulted -- silently, with only the "- searching ..." progress +# message printed and no error text, exactly as reported against the VS-built +# Windows binary. 
This test reuses T19's phospho fixture but builds a PI_DB (plain +# peptide) index instead of an FI_DB (fragment ion) index, to cover the code path +# that crashed. + +@register("t20") +def test_t20(comet_exe): + """T20: PI_DB batch search regression -- a peptide-index (-j) search must + complete and score correctly, not crash on the first scored candidate.""" + failures = [] + + fasta = DATA_DIR / "t19_ascore_fidb.fasta" + ms2 = DATA_DIR / "t19_ascore_fidb.ms2" + idx = fasta.with_suffix(".fasta.idx") + txt = ms2.with_suffix(".txt") + + use_win = _binary_uses_win_paths(comet_exe) + fmt = _to_win if use_win else str + + # Step 1: build a PI_DB (peptide index) with a real phospho-S mod baked into + # its header. "-j" selects create_peptide_index, unlike T19's "-i" + # (create_fragment_index). + if idx.exists(): + idx.unlink() + + build_params = T19_PARAMS_TEMPLATE.format( + comet_version="2026.02 rev. 0", database=fmt(fasta), + ascorepro=0, mod1="79.966331 S 0 1 -1 0 0 0.0", + ) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".params", dir=str(DATA_DIR), delete=False + ) as pf: + pf.write(build_params) + build_params_file = Path(pf.name) + + try: + rc, out = _run_t19_step(comet_exe, ["-j", f"-P{fmt(build_params_file)}"]) + if rc != 0 or not idx.exists(): + failures.append(f"index build failed (rc={rc}):\n{out}") + return failures + finally: + build_params_file.unlink(missing_ok=True) + + # Step 2: search the PI_DB index. This is the call sequence that previously + # segfaulted inside CometSearch::BinarySearchMass() before any output was + # written, so a non-crashing exit with the expected PSM is the regression check. + if txt.exists(): + txt.unlink() + + search_params = T19_PARAMS_TEMPLATE.format( + comet_version="2026.02 rev. 
0", database=fmt(idx), + ascorepro=1, mod1="0.0 X 0 3 -1 0 0 0.0", + ) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".params", dir=str(DATA_DIR), delete=False + ) as pf: + pf.write(search_params) + search_params_file = Path(pf.name) + + try: + rc, out = _run_t19_step(comet_exe, [f"-P{fmt(search_params_file)}", fmt(ms2)]) + if rc != 0: + failures.append(f"search exited rc={rc} (expected 0, i.e. no crash):\n{out}") + return failures + check(True, "search exited cleanly (rc=0)", failures) + if not txt.exists(): + failures.append(f".txt not created. Comet output:\n{out}") + return failures + + lines = txt.read_text().splitlines() + header = lines[1].split("\t") # line 0 is the CometVersion/.../database line + rows = [l.split("\t") for l in lines[2:] if l.strip()] + + check(len(rows) == 1, f"expected exactly 1 PSM row, got {len(rows)}", failures) + if not rows: + return failures + + row = dict(zip(header, rows[0])) + + check(row.get("plain_peptide") == "ACDEFGSK", + f"plain_peptide: expected ACDEFGSK, got {row.get('plain_peptide')!r}", failures) + check("7_V_79.966331" in row.get("modifications", ""), + f"modifications: expected to contain 7_V_79.966331, got " + f"{row.get('modifications')!r}", failures) + + ascorepro = float(row.get("ascorepro", "0") or "0") + check(ascorepro > 0.0, + f"ascorepro: expected > 0, got {ascorepro}", failures) + finally: + search_params_file.unlink(missing_ok=True) + idx.unlink(missing_ok=True) + txt.unlink(missing_ok=True) + + return failures + + # --------------------------------------------------------------------------- # main # --------------------------------------------------------------------------- From 4087d0f61968416c2097da459b29b0ac2dc9a6ed Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Thu, 18 Jun 2026 09:48:48 -0700 Subject: [PATCH 12/15] fix: close fragment-index UAF risk, harden writer context, drop dead code - FiStrategy::finalize() now resets g_iFragmentIndex/g_iFragmentIndexOffset/ g_bIndexPrecursors to nullptr 
and g_bPlainPeptideIndexRead to false after freeing, so a second DoSearch() in the same process (or RTS-then-batch in one process) rebuilds the index instead of using freed pointers. - WriterOpenCtx::pStatus is now set via a mandatory constructor parameter instead of a defaulted-nullable field, turning "forgot to set status" into a compile error instead of a potential null-deref in any writer's open() failure path. - Delete the dead CometSearch::RunSearch(ThreadPool*, vector<Query*>&) overload and its misleading "called by DoSingleSpectrumSearchMultiResults" comment (it had zero callers). - Fix a stale IResultWriter.h comment referencing the removed g_pvQuery global. - Replace throwaway heap allocations (new/delete CometFragmentIndex, CometSearch) with stack locals in RunSearch(int,int,ThreadPool*,vector<Query*>&). - Replace SearchMemoryPool's O(n) linear scan with an O(1) free-list stack for acquireSlot()/releaseSlot(); benchmarked before/after (see docs/20260618_mutexserialization.md for methodology and results).
Co-Authored-By: Claude Sonnet 4.6 --- CometSearch/CometSearch.cpp | 55 ++-------------------- CometSearch/CometSearch.h | 2 - CometSearch/output/IResultWriter.h | 29 +++++++----- CometSearch/search/FiStrategy.cpp | 9 ++++ CometSearch/search/Pipeline.cpp | 3 +- CometSearch/threading/SearchMemoryPool.cpp | 32 +++++-------- CometSearch/threading/SearchMemoryPool.h | 6 ++- 7 files changed, 50 insertions(+), 86 deletions(-) diff --git a/CometSearch/CometSearch.cpp b/CometSearch/CometSearch.cpp index 01d10d26..5de03edf 100644 --- a/CometSearch/CometSearch.cpp +++ b/CometSearch/CometSearch.cpp @@ -190,46 +190,6 @@ bool CometSearch::RunSearch(Query* pQuery, int iSlot) } -// called by DoSingleSpectrumSearchMultiResults -bool CometSearch::RunSearch(ThreadPool *tp, vector& queries) -{ - CometSearch sqSearch; - size_t iWhichQuery = 0; - - if (g_staticParams.iDbType == DbType::FI_DB) // fragment ion index - { - if (!g_bPlainPeptideIndexRead) - { - CometFragmentIndex sqFI; - sqFI.ReadPlainPeptideIndex(); - sqFI.CreateFragmentIndex(tp); - } - - int iSlot = AcquirePoolSlot(); - if (iSlot < 0) - { - logerr(" Error - could not acquire memory pool slot for single-query FI search.\n"); - return false; - } - SearchMemoryPoolSlotGuard guard{s_pool, iSlot}; - SearchFragmentIndex(queries.at(iWhichQuery), _ppbDuplFragmentArr[iSlot]); - } - else if (g_staticParams.iDbType == DbType::PI_DB) // peptide index - { - sqSearch.SearchPeptideIndex(tp, queries); - } - else - { - string strErrorMsg = " Error - index search but iDbType = " + std::to_string(static_cast(g_staticParams.iDbType)) + "\n"; - g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); - logerr(strErrorMsg); - return false; - } - - return true; -} - - bool CometSearch::RunSearch(int iPercentStart, int iPercentEnd, ThreadPool* tp, @@ -239,17 +199,14 @@ bool CometSearch::RunSearch(int iPercentStart, if (g_staticParams.iDbType == DbType::FI_DB) { - CometFragmentIndex* sqFI = new CometFragmentIndex(); - CometSearch* sqSearch = new 
CometSearch(); + CometFragmentIndex sqFI; if (!g_bPlainPeptideIndexRead) { - sqFI->ReadPlainPeptideIndex(); - sqFI->CreateFragmentIndex(tp); + sqFI.ReadPlainPeptideIndex(); + sqFI.CreateFragmentIndex(tp); } - delete sqFI; - ThreadPool* pSearchThreadPool = tp; size_t iEnd = queries.size(); @@ -288,14 +245,12 @@ bool CometSearch::RunSearch(int iPercentStart, logout("\b\b\b\b"); } - delete sqSearch; return bSucceeded; } else if (g_staticParams.iDbType == DbType::PI_DB) { - CometSearch* sqSearch = new CometSearch(); - sqSearch->SearchPeptideIndex(tp, queries); - delete sqSearch; + CometSearch sqSearch; + sqSearch.SearchPeptideIndex(tp, queries); return bSucceeded; } else diff --git a/CometSearch/CometSearch.h b/CometSearch/CometSearch.h index 8b440567..5c2875b8 100644 --- a/CometSearch/CometSearch.h +++ b/CometSearch/CometSearch.h @@ -68,8 +68,6 @@ class CometSearch int iPercentEnd, ThreadPool* tp, vector& queries); - static bool RunSearch(ThreadPool* tp, - vector& queries); // Task 1.3: Thread-local overload: searches a caller-owned Query* without // touching g_pvQuery. Allocates its own pbDuplFragment scratch buffer. diff --git a/CometSearch/output/IResultWriter.h b/CometSearch/output/IResultWriter.h index 84ea7c0e..d0d6d0cf 100644 --- a/CometSearch/output/IResultWriter.h +++ b/CometSearch/output/IResultWriter.h @@ -26,16 +26,23 @@ struct Query; // Parameters passed to each writer's open() method. 
struct WriterOpenCtx { - const char* szBaseName; - const char* szOutputSuffix; - const char* szTxtFileExt; // TxtWriter only - bool bEntireFile; // true => no scan-range suffix on output name - int iFirstScan; - int iLastScan; - int iDecoySearch; // 0=off, 1=concat, 2=separate - bool bIdxNoFasta; // .idx DB with no companion .fasta (mzIdentML) - CometSearchManager* pMgr; // for format headers that need ICometSearchManager - CometStatus* pStatus = nullptr; // session error/cancel state (always set by Pipeline) + // pStatus is required: every writer's open() dereferences it unconditionally on + // the file-open-failure path with no null check, so making it constructor-only + // (rather than a default-nullptr field set later like the rest of this struct) + // turns "forgot to set pStatus" from a runtime null-pointer crash into a compile + // error at the construction site. + explicit WriterOpenCtx(CometStatus& status) : pStatus(&status) {} + + const char* szBaseName = nullptr; + const char* szOutputSuffix = nullptr; + const char* szTxtFileExt = nullptr; // TxtWriter only + bool bEntireFile = false; // true => no scan-range suffix on output name + int iFirstScan = 0; + int iLastScan = 0; + int iDecoySearch = 0; // 0=off, 1=concat, 2=separate + bool bIdxNoFasta = false; // .idx DB with no companion .fasta (mzIdentML) + CometSearchManager* pMgr = nullptr; // for format headers that need ICometSearchManager + CometStatus* const pStatus; // session error/cancel state; never null, set once above }; // Parameters passed to each writer's write() method (per-batch). @@ -56,7 +63,7 @@ class IResultWriter // Returns false on error. virtual bool open(const WriterOpenCtx& ctx) = 0; - // Write all results in g_pvQuery for one batch. + // Write all results in ctx.pQueries (the current batch's session.queries) for one batch. // Returns false on error. 
virtual bool write(const WriterWriteCtx& ctx) = 0; diff --git a/CometSearch/search/FiStrategy.cpp b/CometSearch/search/FiStrategy.cpp index 4c7c45d9..7b766289 100644 --- a/CometSearch/search/FiStrategy.cpp +++ b/CometSearch/search/FiStrategy.cpp @@ -162,6 +162,15 @@ void FiStrategy::finalize() free(g_bIndexPrecursors); delete[] g_iFragmentIndex; delete[] g_iFragmentIndexOffset; + + // Reset so a subsequent DoSearch() in the same process (batch run after an + // RTS session, or a second batch run) rebuilds the index instead of reusing + // these now-freed pointers; g_bPlainPeptideIndexRead gates that rebuild in + // FiStrategy::initialize() and is otherwise never reset to false. + g_bIndexPrecursors = nullptr; + g_iFragmentIndex = nullptr; + g_iFragmentIndexOffset = nullptr; + g_bPlainPeptideIndexRead = false; } CometPreprocess::DeallocateMemory(g_staticParams.options.iNumThreads); diff --git a/CometSearch/search/Pipeline.cpp b/CometSearch/search/Pipeline.cpp index 513eeeaf..4407beec 100644 --- a/CometSearch/search/Pipeline.cpp +++ b/CometSearch/search/Pipeline.cpp @@ -109,7 +109,7 @@ bool Pipeline::run(SearchSession& session, } // Open writers (after openFiles so session.bIdxNoFasta is correctly set). 
- WriterOpenCtx woctx; + WriterOpenCtx woctx(session.statusRef); woctx.szBaseName = g_staticParams.inputFile.szBaseName; woctx.szOutputSuffix = g_staticParams.szOutputSuffix; woctx.szTxtFileExt = g_staticParams.szTxtFileExt; @@ -119,7 +119,6 @@ bool Pipeline::run(SearchSession& session, woctx.iDecoySearch = g_staticParams.options.iDecoySearch; woctx.bIdxNoFasta = session.bIdxNoFasta; woctx.pMgr = _pMgr; - woctx.pStatus = &session.statusRef; for (auto& pw : _writers) { diff --git a/CometSearch/threading/SearchMemoryPool.cpp b/CometSearch/threading/SearchMemoryPool.cpp index 0fbc39f7..5fecabf7 100644 --- a/CometSearch/threading/SearchMemoryPool.cpp +++ b/CometSearch/threading/SearchMemoryPool.cpp @@ -25,10 +25,12 @@ bool SearchMemoryPool::allocate(int nSlots, int iArraySize) try { - _inUse = new bool[nSlots](); _pool = new bool*[nSlots](); // value-init to nullptr so partial allocs are safe to delete[] for (int i = 0; i < nSlots; ++i) _pool[i] = new bool[iArraySize](); + _freeSlots.reserve(nSlots); + for (int i = 0; i < nSlots; ++i) + _freeSlots.push_back(i); _nSlots = nSlots; _allocated = true; return true; @@ -43,8 +45,7 @@ bool SearchMemoryPool::allocate(int nSlots, int iArraySize) delete[] _pool; _pool = nullptr; } - delete[] _inUse; - _inUse = nullptr; + _freeSlots.clear(); std::string strErrorMsg = " Error - SearchMemoryPool::allocate failed. 
bad_alloc: " + std::string(ba.what()) + ".\n"; g_cometStatus.SetStatus(CometResult_Failed, strErrorMsg); logerr(strErrorMsg); @@ -56,12 +57,11 @@ bool SearchMemoryPool::allocate(int nSlots, int iArraySize) void SearchMemoryPool::_deallocate(int nSlots) { - delete[] _inUse; for (int i = 0; i < nSlots; ++i) delete[] _pool[i]; delete[] _pool; - _inUse = nullptr; _pool = nullptr; + _freeSlots.clear(); _allocated = false; } @@ -75,26 +75,18 @@ void SearchMemoryPool::deallocate() int SearchMemoryPool::acquireSlot() { - int i = -1; std::unique_lock lock(_mutex); - bool found = _cv.wait_for(lock, std::chrono::seconds(240), [&i, this]() { - for (int j = 0; j < _nSlots; ++j) - { - if (!_inUse[j]) - { - _inUse[j] = true; - i = j; - return true; - } - } - return false; - }); - return found ? i : -1; + bool found = _cv.wait_for(lock, std::chrono::seconds(240), [this]() { return !_freeSlots.empty(); }); + if (!found) + return -1; + int slot = _freeSlots.back(); + _freeSlots.pop_back(); + return slot; } void SearchMemoryPool::releaseSlot(int slot) { - { std::lock_guard lk(_mutex); _inUse[slot] = false; } + { std::lock_guard lk(_mutex); _freeSlots.push_back(slot); } _cv.notify_one(); } diff --git a/CometSearch/threading/SearchMemoryPool.h b/CometSearch/threading/SearchMemoryPool.h index 0cf1a719..47477b87 100644 --- a/CometSearch/threading/SearchMemoryPool.h +++ b/CometSearch/threading/SearchMemoryPool.h @@ -24,6 +24,7 @@ #include #include #include +#include class SearchMemoryPool { @@ -54,10 +55,13 @@ class SearchMemoryPool void _deallocate(int nSlots); int _nSlots = 0; - bool* _inUse = nullptr; // [_nSlots]: true = slot claimed by a thread bool** _pool = nullptr; // [_nSlots][iArraySize]: scratch buffers bool _allocated = false; + // Stack of currently-free slot indices. A slot's presence here (rather than a + // separate bool[] scanned linearly) is the sole source of truth for "is free", + // so acquire/release are O(1) instead of O(nSlots) regardless of pool size. 
+ std::vector _freeSlots; std::mutex _mutex; std::condition_variable _cv; }; From 0e422706ce10bc232f6dd8913ed5f6d6bfba2d81 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Thu, 18 Jun 2026 09:49:00 -0700 Subject: [PATCH 13/15] docs: add review/planning docs, rename 20260617 review for numbering - 20260612_producerConsumerQueue.md: producer/consumer queue design for the fused batch FI_DB path. - 20260616_codereview1.md, 20260616_codereview2.md: architecture_update branch code reviews. - 20260617_codereview.md -> 20260617_codereview1.md: renamed for consistent numbering alongside the other dated review docs. - 20260618_mutexserialization.md: problem description and phased plan for SearchMemoryPool's mutex serialization, including benchmark methodology and results. Co-Authored-By: Claude Sonnet 4.6 --- docs/20260612_producerConsumerQueue.md | 209 +++++++++++++ docs/20260616_codereview1.md | 218 +++++++++++++ docs/20260616_codereview2.md | 214 +++++++++++++ ..._codereview.md => 20260617_codereview1.md} | 0 docs/20260618_mutexserialization.md | 293 ++++++++++++++++++ 5 files changed, 934 insertions(+) create mode 100644 docs/20260612_producerConsumerQueue.md create mode 100644 docs/20260616_codereview1.md create mode 100644 docs/20260616_codereview2.md rename docs/{20260617_codereview.md => 20260617_codereview1.md} (100%) create mode 100644 docs/20260618_mutexserialization.md diff --git a/docs/20260612_producerConsumerQueue.md b/docs/20260612_producerConsumerQueue.md new file mode 100644 index 00000000..4a04f935 --- /dev/null +++ b/docs/20260612_producerConsumerQueue.md @@ -0,0 +1,209 @@ +# Producer/Consumer Queue for Fused Batch FI_DB Path + +## Context + +`FusedLoadAndSearchSpectra` (added in `batch_FI_optimization`) eliminated the +three-sweep DRAM anti-scaling problem by fusing preprocess -> search -> +post-analysis per spectrum in one pass. 
However, it still reads the entire batch +into `std::vector<Spectrum> vSpectra` before dispatching any worker, because the +original work-stealing design required the full vector to be present before +`fetch_add` indexing could begin. + +This two-phase structure has a measurable RAM cost: + +- `MSToolkit::Spectrum` stores its peaks in a heap-allocated `vector` + (12 bytes per peak: 8-byte `double mz` + 4-byte `float intensity`). +- A typical HeLa MS2 spectrum has ~600-800 peaks: ~8 KB per spectrum. +- A 302 MB HeLa `.raw` file contains ~40k MS2 spectra: **~320 MB** held in + `vSpectra` simultaneously before a single spectrum is processed. +- Peak RAM for the HeLa benchmark is 10.5 GB; that 320 MB is recoverable + with no algorithmic loss. + +There is no correctness reason to read ahead more than one spectrum beyond what +workers can immediately consume. A bounded producer/consumer queue lets the +read loop and the worker pool run concurrently, capping peak spectrum RAM to +`O(iNumThreads)` regardless of file size. + +## Goal + +Replace the two-phase (read-all -> process-all) structure of +`FusedLoadAndSearchSpectra` with a single-pass pipeline: + +- **Producer** (calling thread): reads spectra from the raw file one at a time + and pushes them into a bounded concurrent queue, blocking when the queue is + full. +- **Consumers** (`iNumThreads` workers): pop from the queue and call + `FusedSearchSpectrum` immediately, with no change to `FusedSearchSpectrum` + itself. + +I/O and compute overlap; peak spectrum RAM drops from ~320 MB to a few hundred KB +(queue depth x spectrum size). + +## Confirmed facts the design relies on + +- `FusedSearchSpectrum(Spectrum spec, int iSlot)` takes `Spectrum` by value + (already a copy); the queue can safely `std::move` spectra into and out of + storage. No pointer aliasing issue. +- The pool slot index `iSlot` is a per-worker constant (0..iNumThreads-1). 
+ Each consumer lambda captures its own `t` at launch time -- same as the current + `fetch_add` dispatch. The `_ppbDuplFragmentArr` lifetime is the full batch, + not per-spectrum. This is unchanged. +- `CheckExit` / `g_pvQueryMutex` are called on the producer thread inside the + read loop. This is unchanged; only the producer runs the loop. +- `_bDoneProcessingAllSpectra` is set by the read loop before the function + returns. The outer `CometSearchManager` batch while loop reads it after + `FusedLoadAndSearchSpectra` returns. This is unchanged. +- `g_pvQuery` is pushed under `g_pvQueryMutex` inside `FusedSearchSpectrum`. + Multiple consumer threads already do this in the current implementation; + no change needed. +- PSM output is sorted by scan number (`compareByScanNumber`) in + `CometSearchManager` after `FusedLoadAndSearchSpectra` returns. Consumer + execution order therefore does not need to match read order; only the sort + at the end matters. **PSM output remains bit-identical to the current + fused path** (same `FusedSearchSpectrum`, same post-sort). +- `tp->wait_on_threads()` already blocks until all active jobs finish; + no new synchronization primitive is needed at the outer level. + +## Design: BoundedSpectrumQueue + +A simple mutex + two condition-variable queue is sufficient. The bottleneck +is `FusedSearchSpectrum` (~1.4 ms/spectrum), not queue throughput. Lock-free +structures would add complexity with no measurable benefit. + +```cpp +struct BoundedSpectrumQueue +{ + std::queue q; + std::mutex mtx; + std::condition_variable cvNotFull; + std::condition_variable cvNotEmpty; + size_t maxDepth; + bool bDone = false; + + explicit BoundedSpectrumQueue(size_t depth) : maxDepth(depth) {} + + // Producer calls this. Blocks when queue is full. 
+ void push(Spectrum&& spec) + { + std::unique_lock lk(mtx); + cvNotFull.wait(lk, [&]{ return q.size() < maxDepth || bDone; }); + if (!bDone) + { + q.push(std::move(spec)); + cvNotEmpty.notify_one(); + } + } + + // Consumer calls this. Returns false when done and queue is empty. + bool pop(Spectrum& spec) + { + std::unique_lock lk(mtx); + cvNotEmpty.wait(lk, [&]{ return !q.empty() || bDone; }); + if (q.empty()) return false; + spec = std::move(q.front()); + q.pop(); + cvNotFull.notify_one(); + return true; + } + + // Producer calls after the read loop ends. + void finish() + { + std::unique_lock lk(mtx); + bDone = true; + cvNotEmpty.notify_all(); + cvNotFull.notify_all(); + } +}; +``` + +**Queue depth**: `iNumThreads * 4`. At steady state, each consumer holds one +spectrum (inside `FusedSearchSpectrum`). A depth of 4x threads means the +producer can stay up to 4 spectra/thread ahead without blocking. For 20 threads, +peak in-flight spectra = 20 (being processed) + 80 (in queue) = 100 spectra x +~8 KB = **800 KB**, down from ~320 MB. + +## Implementation changes + +### Stage 1 -- Add `BoundedSpectrumQueue` (CometPreprocess.cpp) + +Define the struct near the top of `CometPreprocess.cpp`, alongside the +`RtsScratch` definition. It is a local implementation detail and does not need +its own header. + +### Stage 2 -- Restructure `FusedLoadAndSearchSpectra` + +Remove `std::vector vSpectra` and the `std::atomic ctr` +dispatch block. Replace with: + +``` +1. Construct BoundedSpectrumQueue with depth = iNumThreads * 4. + +2. Launch iNumThreads consumer workers BEFORE the read loop: + + for (int t = 0; t < iNumSlots; ++t) + { + tp->doJob([&queue, t]() + { + Spectrum spec; + while (queue.pop(spec)) + FusedSearchSpectrum(std::move(spec), t); + }); + } + +3. Run the read loop on the calling thread (unchanged logic). + Replace: + vSpectra.push_back(mstSpectrum); + with: + queue.push(std::move(mstSpectrum)); + +4. After the read loop: call queue.finish(). + +5. 
tp->wait_on_threads() (unchanged). +``` + +Note: workers are launched before reading starts so that the first spectrum +pushed is consumed immediately with no dead time. If launched after, the read +loop could fill the queue and stall before any worker starts. + +### Stage 3 -- Error/cancel handling + +If `g_cometStatus.IsError()` or `IsCancel()` is detected inside the read loop +(via `CheckExit`), the read loop breaks. `queue.finish()` is called +unconditionally after the loop and before `wait_on_threads`, so consumers drain +any buffered spectra and exit cleanly. This matches the current behavior where +spectra already in `vSpectra` were still processed after an early break. If +strict cancellation is desired (drop buffered spectra on error), consumers can +check `g_cometStatus` at the top of their loop and call `queue.finish()` +themselves to unblock the producer. + +## Files changed + +| File | Change | +|------|--------| +| `CometSearch/CometPreprocess.cpp` | Add `BoundedSpectrumQueue`; restructure `FusedLoadAndSearchSpectra` | +| `CometSearch/CometPreprocess.h` | No change (no new public API) | +| Everything else | No change | + +## Memory impact summary + +| Metric | Before (batch_FI_optimization) | After (this plan) | +|--------|-------------------------------|-------------------| +| Spectrum buffer RAM | ~320 MB (40k spectra) | ~800 KB (100 spectra) | +| Peak total (HeLa) | 10.5 GB | ~10.2 GB (est.) | +| Dominant cost | Fragment index (~9.5 GB) | Fragment index (~9.5 GB) | + +The fragment index dominates; this change recovers the spectrum-buffer overhead +entirely. + +## Verification + +1. **Unit tests**: `python tests/unit/run_tests.py --comet comet.exe` -- all 17 + tests must pass. +2. **PSM parity**: Run on HeLa `.raw` with both the `batch_FI_optimization` + binary (before this change) and the new binary. `diff` on the `.txt` outputs + must show only the header line (run name + timestamp), as verified for the + prior change. 
`tools/qvalue.py --diff` must show zero unique PSMs at 1% and + 5% FDR. +3. **Memory**: Run under `/usr/bin/time -v` and confirm `Maximum resident set + size` drops by ~300 MB relative to the prior binary on the same HeLa file. diff --git a/docs/20260616_codereview1.md b/docs/20260616_codereview1.md new file mode 100644 index 00000000..cf54c974 --- /dev/null +++ b/docs/20260616_codereview1.md @@ -0,0 +1,218 @@ +# Code Review — architecture_update branch +# 2026-06-16 + +Reviewed by: Claude Sonnet 4.6 (high-effort, 7-angle finder + per-candidate verification) +Scope: `git diff master...HEAD` — 70 files, +5542 / -3127 lines + +--- + +## Summary + +This branch introduces a major architectural refactor: Strategy pattern for search +(`FastaStrategy`, `FiStrategy`, `PiStrategy`), a `Pipeline` orchestrator, a `SearchSession` +object, an `IResultWriter` interface with four concrete writer classes, a `SearchMemoryPool`, +and a `core/` split of `CometDataInternal.h` into `Constants.h`, `Params.h`, and `Types.h`. +The structural changes are sound, but the refactor introduced four confirmed bugs (two of +which corrupt search results or violate real-time latency guarantees) and two plausible bugs +on error paths. + +--- + +## Critical Issues + +### 1. `StorePeptideI` ignores `bDecoyPep` — decoys written to target list (FDR corruption) +**File:** `CometSearch/CometSearch.cpp` ~line 8618 +**Severity:** Critical — wrong results, silent + +The new `Query*`-based overload of `StorePeptideI` (added for the FI/PI index path in +Task 1.2) comments out the `bDecoyPep` parameter: + +```cpp +void CometSearch::StorePeptideI(Query* pQuery, ..., bool /*bDecoyPep*/, ...) { +``` + +The parameter is dead. The function body always writes to `pQuery->_pResults` and +increments `iMatchPeptideCount`, regardless of whether the peptide is a decoy. +`pQuery->_pDecoys` and `iDecoyMatchPeptideCount` are never touched by this overload. 
+ +The callers at lines ~8591 and ~8608 correctly pass `bDecoyPep=true` for decoy hits. +The old `StorePeptide` overload (line ~5221) has the correct branch: + +```cpp +if (g_staticParams.options.iDecoySearch == 2 && bDecoyPep) + // write to _pDecoys +``` + +The new overload is missing this branch entirely. + +**Impact:** Any FI_DB or PI_DB search with `iDecoySearch=2` (separate decoy mode) silently +mixes all decoy PSMs into the target result list. FDR estimation is corrupted for every +index-path search in separate-decoy mode. + +**Fix:** Restore the `iDecoySearch==2 && bDecoyPep` branch in `StorePeptideI`, writing +to `pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex]` and comparing against +`pQuery->dLowestDecoyXcorrScore` when the decoy condition holds. + +--- + +### 2. `SearchMS1Library` uses global `g_pvQueryMutex` instead of per-query lock +**File:** `CometSearch/CometSearch.cpp` ~line 3275 +**Severity:** High — RTS latency violation; cross-path serialization + +`SearchMS1Library` (the MS1 real-time search path) guards score updates on a caller-owned +`QueryMS1*` with the process-wide `g_pvQueryMutex`: + +```cpp +ThreadMutexLock(&g_pvQueryMutex); // line ~3275 +// update pMS1Query->dBestXcorr, etc. +ThreadMutexUnlock(&g_pvQueryMutex); +``` + +The MS2 RTS path (`RunSearch`) correctly uses `pQuery->accessMutex` for per-query +isolation (lines ~5135, ~8554). `SearchMS1Library` should do the same. + +`g_pvQueryMutex` is also held during batch speclib loading +(`CometPreprocess.cpp:1007`). A running batch search therefore blocks every concurrent +RTS MS1 thread for the full duration of the speclib load, violating the real-time latency +guarantee. + +**Fix:** Add an `accessMutex` field to `QueryMS1` (mirroring `Query::accessMutex`) and +use it in `SearchMS1Library` for score-update critical sections. + +--- + +### 3. 
`MzIdentMlWriter::FinalizeOne` silently produces invalid `.mzid` on temp-file reopen failure +**File:** `CometSearch/output/MzIdentMlWriter.h` ~line 116 +**Severity:** High — silent data corruption, no error reported + +`FinalizeOne()` closes the temp file then immediately reopens it for reading: + +```cpp +fclose(fpTmp); // line 115 +fpTmp = fopen(sTmp.c_str(), "r"); // line 116 +if (fpTmp) { // line 117 + CometWriteMzIdentML::WriteMzIdentML(...); + fclose(fpTmp); +} +fclose(fpFinal); // line 129 +``` + +If the `fopen` at line 116 fails (network filesystem, external cleanup, non-atomic +close-reopen), the `if` block is skipped: `WriteMzIdentML` is never called, the +spectrum results are never appended, and the output file is closed at line 129 containing +only the XML header — no spectrum results, no closing tags. `g_cometStatus` is never +updated; `DoSearch` returns `true`. Downstream tools receive a structurally invalid file. + +**Fix:** Check the return value of the second `fopen` and call +`g_cometStatus.SetStatus(CometResult_Failed, ...)` on failure before returning. + +--- + +## Code Quality & Maintainability + +### 4. `Pipeline::run` — writer `open()` failure leaves already-opened writers unclosed +**File:** `CometSearch/search/Pipeline.cpp` ~line 108 +**Severity:** Medium — FILE* handle leak, truncated output files + +When a writer's `open()` fails, the inner loop breaks and Pipeline calls +`_strategy->closeFiles()` then breaks out of the file loop entirely, bypassing the +`pw->close()` block at lines ~243–247. Writers that already opened successfully (with +partially-written headers on disk) are never closed. + +**Fix:** On `open()` failure, iterate all writers that have already been successfully +opened and call their `close()` before returning `false`. + +--- + +### 5. 
`Pipeline::run` returns on `initialize()` failure without calling `finalize()` (memory leak) +**File:** `CometSearch/search/Pipeline.cpp` ~line 38 +**Severity:** Medium — memory leak on error path + +```cpp +if (!_strategy->initialize(session)) return false; // line 38 +// ... +_strategy->finalize(session); // line ~256, only reached on success +``` + +`finalize()` is the sole cleanup point for memory allocated by `initialize()` (thread-pool +scratch buffers, precursor arrays). If `initialize()` returns `false` midway — e.g., +`CometPreprocess::AllocateMemory` succeeds then `ReadPrecursors` fails — those allocations +are never freed. On repeated calls (C# wrapper retrying after a failed search), each +failed init accumulates leaked memory. + +**Fix:** Call `_strategy->finalize(session)` before returning `false` at line 38, or +structure the function with a `goto cleanup` / RAII guard so `finalize` always runs. + +--- + +### 6. `WithinMassTolerancePeff` seek-back loop uses wrong reference mass +**File:** `CometSearch/CometSearch.cpp` ~line 4380 +**Severity:** Medium — false negatives in PEFF searches + +After `BinarySearchMass` locates the correct position for `dCalcPepMass + dMassAddition`, +the seek-back while-loop compares against bare `dCalcPepMass` instead of +`dCalcPepMass + dMassAddition`. With a large positive PEFF modification (e.g., +80 Da +for phospho), the found position is 80 Da ahead of `dCalcPepMass` in the sorted index; +the seek-back stops far too early, and candidate peptides that are within tolerance of +the modified mass are never evaluated. + +**Fix:** Change the seek-back comparison operand from `dCalcPepMass` to +`dCalcPepMass + dMassAddition`, mirroring the value passed to `BinarySearchMass`. + +--- + +### 7. 
`SearchSession::bPlainPeptideIndexRead` / `bSpecLibRead` are dead fields +**File:** `CometSearch/search/SearchSession.h` ~line 44 +**Severity:** Low — architectural drift; misleads about ownership + +`SearchSession` declares `bPlainPeptideIndexRead` and `bSpecLibRead` as session-owned +state, but `FiStrategy::initialize` reads the global `g_bPlainPeptideIndexRead` — not +`session.bPlainPeptideIndexRead`. The session fields are never set or checked by any +code path. A reader auditing `SearchSession` to understand index state will draw the +wrong conclusion about where the authoritative value lives. + +**Fix:** Either wire `FiStrategy::initialize` to read and write `session.bPlainPeptideIndexRead` +and retire the global, or remove the dead session fields until the migration is ready. + +--- + +## Actionable Improvements + +### 8. `FiStrategy::executeBatch` is a near-copy of `FastaStrategy::executeBatch` with a dead Mango block +**File:** `CometSearch/search/FiStrategy.cpp` ~line 59 + +The non-fused `executeBatch` body is an almost-exact copy of `FastaStrategy::executeBatch`, +including a Mango sort block that can never execute in this branch (`bFused` is false only +when `bMango || bSpecLib` is true, meaning the fused path is taken instead). Any future +change to the shared preprocessing sequence must be applied in both files. + +Extract the shared preprocessing sequence into a free function in `SearchUtils.h` and call +it from both strategies. + +--- + +### 9. `BuildNames()` copy-pasted verbatim into all four writer classes +**Files:** `CometSearch/output/SqtWriter.h`, `TxtWriter.h`, `PepXmlWriter.h`, `MzIdentMlWriter.h` ~line 43 each + +Each concrete writer class contains an identical private static `BuildNames()` method; +only the default file extension string differs at the call site. Any fix to filename +construction logic (CRUX mode suffix, range-number format, path separator) must be applied +in four places and will inevitably diverge. 
+ +```cpp +// Replace four copies with one free function in IResultWriter.h: +static void BuildNames(const std::string& defaultExt, + std::string& sBaseName, + std::vector& vFileNames); +``` + +--- + +### 10. `PrintPercolatorSearchHit` takes `vector<string>` by value — per-PSM copy overhead +**File:** `CometSearch/CometWritePercolator.h` ~line 43 + +`PrintPercolatorSearchHit` accepts `vProteinTargets` and `vProteinDecoys` by value, +copying up to `iMaxDuplicateProteins` (default 20) `std::string` objects per PSM. The +vectors are assembled by the caller immediately before the call and used read-only inside +the function. Change to `const std::vector<std::string>&` to eliminate the per-PSM +allocation/copy/destruction with no other change required. diff --git a/docs/20260616_codereview2.md b/docs/20260616_codereview2.md new file mode 100644 index 00000000..10e3acb4 --- /dev/null +++ b/docs/20260616_codereview2.md @@ -0,0 +1,214 @@ +# Code Review — architecture_update (Follow-Up) + +**Date:** 2026-06-16 +**Reviewer:** Claude Code (claude-sonnet-4-6) +**Scope:** Follow-up review of the 9 fixes applied after the initial code review (20260616_codereview1.md). +Branch: `architecture_update` vs `master` (working tree included). +**Method:** 7-angle Phase 1 (A-G, up to 6 candidates each) + Phase 2 per-candidate +verification (CONFIRMED / PLAUSIBLE / REFUTED). Only CONFIRMED and PLAUSIBLE findings +are reported below. + +--- + +## 1. Summary + +The nine fixes from the first review are largely sound: the mutex, PEFF seek-back, +dead-field removal, BuildNames consolidation, and Percolator const-ref changes are all +correct and clean. Three confirmed defects remain in newly added code: a file-descriptor +leak in MzIdentMlWriter, a misplaced memset in AllocateResultsMem, and an int-to-short +narrowing in StorePeptideI's new decoy branch. One plausible concurrency hazard exists +in the dual slot-tracking representation carried over from the refactor. + +--- + +## 2. 
Critical Issues + +### [C1] MzIdentMlWriter -- mkstemp fd leaked on every OpenTmp() call (Linux) + +**File:** `CometSearch/output/MzIdentMlWriter.h` +**Lines:** ~94-101 + +On Linux, `OpenTmp()` calls `mkstemp(&sTmp[0])` (which creates and opens the temp file, +returning a live fd), uses the return value only as an error sentinel (`== -1`), then +calls `fopen(sTmp.c_str(), "w")` to open a second handle to the same path. The fd +returned by `mkstemp()` is never passed to `close()`. One fd is leaked per `OpenTmp()` +invocation -- once per mzIdentML output file per search batch. + +**Failure scenario:** With a small `spectrum_batch_size` or many concurrent mzIdentML +writers, the process exhausts its open-fd limit, causing subsequent `fopen()` calls to +return `nullptr` and triggering "cannot write to temporary mzIdentML file" errors that +abort the search. + +**Fix:** +```cpp +int fd = mkstemp(&sTmp[0]); +if (fd == -1) +{ + // error path + return false; +} +close(fd); // release the fd; fopen below opens its own handle +fp = fopen(sTmp.c_str(), "w"); +``` + +--- + +### [C2] SearchUtils.h -- iXcorrHistogram memset inside per-result slot loop + +**File:** `CometSearch/search/SearchUtils.h` +**Lines:** ~190 (inside `AllocateResultsMem`) + +`iXcorrHistogram` is a per-`Query` array (declared `int iXcorrHistogram[HISTO_SIZE]` on +the `Query` struct in `core/Types.h:593`), not a per-`Results` slot field. The +`memset(pQuery->iXcorrHistogram, 0, sizeof(pQuery->iXcorrHistogram))` call is placed +inside the inner `for (int j = 0; j < g_staticParams.options.iNumStored; ++j)` loop, so +it zeroes the same query-level array `iNumStored` times instead of once. On iterations +j > 0, it resets the histogram, destroying any accumulation from prior j iterations. + +**Failure scenario:** Currently harmless because histogram population happens after +`AllocateResultsMem` returns (during the search phase). 
However, if histogram data were +ever partially populated before the j-loop completes, iteration j=1 would silently +destroy accumulations from j=0. It also wastes `iNumStored - 1` redundant memset calls +per query. + +**Fix:** Move `memset(pQuery->iXcorrHistogram, ...)` to just after +`pQuery->iDecoyMatchPeptideCount = 0`, before the for-j loop begins, so it executes +exactly once per query. + +--- + +### [C3] CometSearch.cpp -- int-to-short narrowing in StorePeptideI decoy index + +**File:** `CometSearch/CometSearch.cpp` +**Lines:** ~8724-8733 (new decoy branch in `StorePeptideI`) + +The new decoy branch recomputes the lowest-scoring decoy slot index with +`for (int i = 1; ...)` and assigns `siLowestDecoyXcorrScoreIndex = i` where the local +variable is declared `short`. This is an implicit int-to-short narrowing conversion. +The analogous loop in `StorePeptide()` (FASTA path, line ~5227) uses `short siA` +throughout, keeping the type consistent with the `short siLowestDecoyXcorrScoreIndex` +field on `Query` (declared `core/Types.h:603`). + +**Failure scenario:** Safe at current `iNumStored` values (typically <= 10). If +`iNumStored` were ever set to >= 32,768 the narrowing truncation would produce a wrong +or negative index, causing `_pDecoys[]` to be accessed out of bounds in the next +`StorePeptideI` call and silently corrupting decoy results. + +**Fix:** Change the loop variable to `short` to match `StorePeptide()`: +```cpp +for (short siA = 1; siA < (short)g_staticParams.options.iNumStored; ++siA) +{ + if (pQuery->_pDecoys[siA].fXcorr < pQuery->_pDecoys[siLowestDecoyXcorrScoreIndex].fXcorr) + siLowestDecoyXcorrScoreIndex = siA; +} +pQuery->siLowestDecoyXcorrScoreIndex = siLowestDecoyXcorrScoreIndex; +``` + +--- + +## 3. 
Code Quality and Maintainability + +### [C4] CometSearch -- dual slot-tracking systems alias the same scratch buffers + +**File:** `CometSearch/CometSearch.cpp`, `CometSearch/threading/SearchMemoryPool.h` +**Lines:** `CometSearch.cpp:1267`, `SearchMemoryPool.cpp:80` + +The refactor introduced `SearchMemoryPool` (`s_pool`) but retained the legacy +`_pbSearchMemoryPool[]` + `g_searchMemoryPoolMutex` slot-tracking used by the FASTA +batch path (`SearchThreadProc`). The RTS path calls `s_pool.acquireSlot()` / +`releaseSlot()` (guarded by `s_pool._mutex`), while `SearchThreadProc` scans +`_pbSearchMemoryPool[]` under `g_searchMemoryPoolMutex`. Both systems alias the same +physical scratch buffers (`_ppbDuplFragmentArr[i]` = `s_pool._pool[i]`), but neither +delegates to the other -- they are genuinely independent availability-tracking arrays. + +**Failure scenario (PLAUSIBLE):** If FASTA batch search and RTS search ever ran +concurrently in the same process, slot `i` could be claimed by `SearchThreadProc` via +`_pbSearchMemoryPool[i]` and simultaneously by `AcquirePoolSlot()` via +`s_pool._inUse[i]`, handing the same scratch buffer to two threads and silently +corrupting XCorr scores. The `TODO(Phase N)` comment at `CometSearch.cpp:31` +acknowledges the singleton design is not yet multi-instance safe. + +**Recommendation:** Route `SearchThreadProc` through `s_pool.acquireSlot()` / +`releaseSlot()` and remove `_pbSearchMemoryPool`, `g_searchMemoryPoolMutex`, and +`g_searchPoolCV` once all paths use the single `s_pool` authority. + +--- + +### [C5] CometSearch -- dead RunSpecLibSearch(ThreadPool*) overload + +**File:** `CometSearch/CometSearch.cpp` (~line 1000), `CometSearch/CometSearch.h` (~line 94) + +The 1-argument overload `RunSpecLibSearch(ThreadPool* /*tp*/)` is declared and defined +but has no callers in the current codebase. Its body is a commented-out debug printf +followed by `return true`. 
The live path is the 4-argument overload +`RunSpecLibSearch(int, int, ThreadPool*, vector&)` called from +`SearchUtils.h::RunSearchAndPostAnalysis()`. + +**Failure scenario:** If any future code resolves a call with a single `ThreadPool*` +argument to this overload -- by mistake or through a partial refactor -- all speclib +scoring is silently skipped with no error. The two overloads are visually similar and +the compiler produces no diagnostic. + +**Fix:** Remove the dead 1-argument overload from both the `.h` declaration and the +`.cpp` definition. + +--- + +### [C6] FastaStrategy -- dead if-block in initialize() + +**File:** `CometSearch/search/FastaStrategy.cpp` +**Lines:** ~27-48 + +The block conditioned on `session.bPerformDatabaseSearch && sProteinLModsListFile.length() > 0` +in `FastaStrategy::initialize()` contains only a multi-line comment explaining why +nothing is done here (the filter is loaded before `makeStrategy()` is called). There +are no executable statements inside the block. + +**Failure scenario:** No runtime defect. The risk is a future developer placing +initialization code inside this block expecting it to execute, unaware that the comment +explains the work is already complete by the time `initialize()` is called. + +**Fix:** Remove the dead block entirely, or replace it with a one-line comment at the +top of `initialize()` stating the precondition. + +--- + +## 4. Actionable Improvements + +### [I1] PercolatorWriter -- inline filename construction should use BuildNames + +**File:** `CometSearch/output/PercolatorWriter.h` +**Lines:** ~28-35 + +`PercolatorWriter::open()` constructs its output filename using the same +`base + range + ".pin"` pattern as `IResultWriter::BuildNames()`, but does so inline +rather than calling the shared helper. It is the only concrete writer that does not call +`BuildNames()`. Any future change to naming conventions (e.g., a new suffix format or +CRUX conditional) must be applied in two places. 
+ +**Fix:** Call `BuildNames(ctx, ".pin", ".decoy.pin", ".target.pin", _sPath, _sDecoyPath)` +and drop the local `base`/`range` variables, matching the pattern used by all other +writers. + +--- + +### [I2] IResultWriter::BuildNames -- extTargetCrux should default to nullptr + +**File:** `CometSearch/output/IResultWriter.h` +**Lines:** ~72-86 + +The `extTargetCrux` parameter of `BuildNames()` is unconditionally `(void)`-cast and +discarded in non-CRUX builds. All four call sites must pass a dummy string literal that +is silently ignored at compile time, leaking the CRUX/non-CRUX conditional into every +call site. + +**Improvement:** Add `= nullptr` as the default for `extTargetCrux`: +```cpp +static void BuildNames(const WriterOpenCtx& ctx, + const char* ext, + const char* extDecoy, + std::string& sTarget, + std::string& sDecoy, + const char* extTargetCrux = nullptr); +``` +Non-CRUX callers can then omit the argument entirely. diff --git a/docs/20260617_codereview.md b/docs/20260617_codereview1.md similarity index 100% rename from docs/20260617_codereview.md rename to docs/20260617_codereview1.md diff --git a/docs/20260618_mutexserialization.md b/docs/20260618_mutexserialization.md new file mode 100644 index 00000000..41616123 --- /dev/null +++ b/docs/20260618_mutexserialization.md @@ -0,0 +1,293 @@ +# Mutex Serialization in SearchMemoryPool -- Problem and Optimization Plan + +## Context + +`SearchMemoryPool` (`CometSearch/threading/SearchMemoryPool.h/.cpp`) hands out +duplicate-fragment scratch-buffer slots to search threads. Every call into +`CometSearch::AcquirePoolSlot()` / `releaseSlot()` takes the pool's single +`std::mutex` -- first to scan/pop a free slot, then again to push it back. 
A prior +benchmarking pass (see Appendix) replaced the original O(n) linear scan of a +`bool[]` array with an O(1) free-list stack, confirming the scan itself was not +the bottleneck: total throughput across all threads stayed flat at roughly +3.8-5M ops/sec from 8 threads up to 512 threads, regardless of slot count. Flat +throughput under increasing thread count, on an operation with no inherent +ordering requirement, is the signature of a single global serialization point -- +in this case, the pool's one mutex. This document describes that problem in more +detail and lays out a measurement-gated plan for removing the serialization from +the hottest call site. + +## The problem + +`acquireSlot()`/`releaseSlot()` (`threading/SearchMemoryPool.cpp`) take the same +`std::mutex _mutex` on every call: + +```cpp +int SearchMemoryPool::acquireSlot() +{ + std::unique_lock lock(_mutex); + bool found = _cv.wait_for(lock, std::chrono::seconds(240), [this]() { return !_freeSlots.empty(); }); + if (!found) return -1; + int slot = _freeSlots.back(); + _freeSlots.pop_back(); + return slot; +} + +void SearchMemoryPool::releaseSlot(int slot) +{ + { std::lock_guard lk(_mutex); _freeSlots.push_back(slot); } + _cv.notify_one(); +} +``` + +Conceptually, "give me any one free slot" and "give this slot back" do not need a +total order across all callers -- any free slot will do, and releases don't need +to be sequenced relative to other releases. But the current design forces every +acquire and every release through one mutex, so N threads doing this concurrently +serialize to roughly the same total throughput as 1 thread doing it N times. The +benchmark in the Appendix confirms this directly: per-operation latency stayed in +the 200-310 ns range across all tested slot/thread counts (8 through 512), with +*total* throughput never scaling up with thread count the way a genuinely +parallel operation would. 
+ +### Where this is actually hot + +Not every caller of `AcquirePoolSlot()` is on a tight per-spectrum loop. Current +call sites, in descending order of call frequency: + +| Call site | File:line | Frequency | Notes | +|---|---|---|---| +| `CometSearch::RunSearch(Query* pQuery)` | `CometSearch.cpp:110` (acquire at lines 122, 164) | **Once per spectrum, per RTS call** | This is `DoSingleSpectrumSearchMultiResults`'s search path -- the RTS thread-local entry point that the project already benchmarks for per-spectrum Hz. Every concurrent RTS caller takes the global mutex twice (acquire + release) per spectrum. | +| `CometSearch::RunSearch(int,int,ThreadPool*,vector&)` FI_DB branch | `CometSearch.cpp:218` | Once per query, per batch | Only reached for the legacy (non-fused) batch path, i.e. when Mango or a spectral-library search forces `FiStrategy::executeBatch()` away from the fused path (`search/FiStrategy.cpp:129-131`). | +| `CometSearch::SearchThreadProc` | `CometSearch.cpp:1220` | Once per protein-search job dispatch | Classic FASTA three-sweep search. Per-job, not per-spectrum; each job is comparatively expensive (protein-by-protein FASTA scoring), so lock overhead is a much smaller fraction of total work here. | + +### The pattern that already avoids this problem + +The fused batch FI_DB path (`CometPreprocess::FusedLoadAndSearchSpectra`, +`CometPreprocess.cpp:3246-3278`) does **not** call `AcquirePoolSlot()` at all. 
It +launches exactly `iNumThreads` long-lived consumer jobs up front, each one closed +over a fixed slot index `t`: + +```cpp +const int iNumSlots = g_staticParams.options.iNumThreads; +BoundedSpectrumQueue queue(static_cast<size_t>(iNumSlots) * 4); + +for (int t = 0; t < iNumSlots; ++t) +{ + tp->doJob([&queue, t, &session]() + { + Spectrum spec; + while (queue.pop(spec)) + FusedSearchSpectrum(std::move(spec), t, session); // pre-assigned slot, no lock + }); +} +``` + +Each worker keeps its slot for the worker's entire lifetime instead of +acquiring/releasing it per spectrum. `RunSearch(Query*, int iSlot)` +(`CometSearch.cpp:186`) exists specifically to take this pre-assigned slot, +bypassing `AcquirePoolSlot()` entirely. This is proven, already-shipping code -- +it is the model for Phase 1 below, not a new design. + +## Why this matters (and the honest caveat) + +The RTS path's entire purpose is per-spectrum throughput (Hz) under concurrent +load from multiple C# `Task` threads. Every `DoSingleSpectrumSearchMultiResults` +call pays for two lock/unlock pairs on a global mutex shared with every other +concurrent caller, even when no actual contention exists. + +**Caveat:** the Appendix benchmark measures the synchronization primitive in +isolation, with a near-zero critical-section hold time (touch one byte). Real +search work (`SearchFragmentIndex` / `SearchPeptideIndex`) holds the slot for +however long the actual XCorr/peptide-index search takes -- almost certainly +microseconds to low milliseconds, not nanoseconds. If that real hold time +dominates, the relative cost of the lock itself may be a small fraction of total +per-spectrum latency, and this entire effort would not show up in real Hz +numbers. **This needs to be measured in situ before committing to Phase 2.** +Phase 0 below exists specifically to answer that question first.
+ +## Proposed plan + +### Phase 0 -- Measure in situ before optimizing further + +Use the existing `RTS_TIMING` build flag (see the `comet-build` skill; +`CometSearch.vcxproj` Release config, or `RTS_TIMING_OFF`/`RTS_TIMING` +preprocessor define) to instrument real per-spectrum timing inside +`DoSingleSpectrumSearchMultiResults`, and drive it with a synthetic +high-concurrency load (many concurrent RTS calls, thread count well above +`iNumThreads` to force the pool into contention). Compare: + +- Wall time spent inside `AcquirePoolSlot()`/`releaseSlot()` vs. total per-spectrum + wall time. +- futex wait counts / `perf lock` contention stats under sustained concurrent load, + if available on the target platform. + +**Only proceed to Phase 1/2 if this shows a non-negligible fraction of +per-spectrum latency** (a reasonable bar: >5-10%), or direct evidence of lock +contention at realistic concurrent RTS thread counts. If the real hold time of +the search work dominates, stop here -- the isolated microbenchmark result does +not by itself justify the added code complexity. + +### Phase 1 -- Extend the existing pre-assigned-slot pattern to RTS (low risk) + +The fused batch path can pre-assign slots because it owns a fixed-size worker +pool it creates itself. RTS callers arrive on whatever thread the .NET `Task` +scheduler happens to run them on, so there is no equivalent fixed "worker index" +threaded through `CometWrapper` today. + +Proposed mechanism: **thread-local lazy slot pinning.** On the first call into +`RunSearch(Query*)` from a given OS thread, claim a slot once via the existing +(mutex-protected) `acquireSlot()` and cache it in a `thread_local int`. Every +subsequent call from that same OS thread reuses the cached slot directly -- +no further lock operations for the rest of that thread's lifetime. 
If the pool +is already fully claimed by other threads when a new thread needs a permanent +slot, fall back to today's per-call dynamic acquire/release for that thread +(graceful degradation, not a hard failure). + +- **Implementation surface:** a thin wrapper at the two `AcquirePoolSlot()` call + sites inside `RunSearch(Query* pQuery)` (`CometSearch.cpp:122,164`). No change + to `SearchMemoryPool` itself. +- **Risk:** low. Reuses the existing tested mutex/free-list code for the + one-time claim and the overflow fallback; adds only a `thread_local` cache. +- **Open question to confirm, not assume:** this only pays off if the number of + distinct OS threads that ever call into RTS search stays bounded near + `iNumThreads`. `RealtimeSearch.cs`'s `Parallel.ForEach` over the scan queue + does not currently pin a fixed degree of parallelism matching `iNumThreads` -- + .NET's `ThreadPool` can grow under sustained load. Confirm actual concurrent + thread counts in production-like load before relying on this assumption; if + unbounded, either cap `Parallel.ForEach`'s `MaxDegreeOfParallelism` to + `iNumThreads` on the C# side, or size the pool to comfortably exceed observed + peak concurrency. + +### Phase 2 -- Lock-free fast path (only if Phase 0 justifies it) + +If Phase 0 shows the mutex matters and Phase 1's thread-affinity assumption +doesn't hold for some caller, replace the mutex+condition_variable+vector design +with a lock-free atomic bitmask: + +- `std::atomic<uint64_t>` for pools up to 64 slots (two words for up to 128; + `iNumThreads` realistically never exceeds this range). +- `acquireSlot()`: CAS loop -- find the lowest set bit (a free slot), atomically + clear it. O(1), wait-free in the common case. +- `releaseSlot()`: atomic fetch-or to set the bit back. O(1), wait-free. +- Keep the existing mutex+condition_variable only as the rare fallback path for + "pool fully exhausted, must block" -- its only remaining job, instead of being + on every call.
+ +This is more general than Phase 1 (no assumption about caller thread identity or +lifetime) but carries more implementation and review risk: lock-free bitmask +code is straightforward to write but easy to get subtly wrong (memory ordering, +the exhausted-pool fallback path), and is harder to verify by inspection than a +`thread_local` cache. Pursue only if Phase 1 doesn't fully close the gap Phase 0 +identified. + +### Phase 3 -- Re-benchmark and re-measure after each phase + +- Re-run the standalone `SearchMemoryPool` benchmark (Appendix) after each phase + to confirm the synchronization primitive itself improved. +- Re-run the Phase 0 in-situ `RTS_TIMING` measurement after each phase to confirm + the improvement is visible in real per-spectrum Hz numbers, not just the + isolated microbenchmark. A microbenchmark win that doesn't move real Hz numbers + is not worth the added code complexity or review risk -- don't merge Phase 2 + on the strength of the isolated benchmark alone. + +## Other shared mutexes considered and ruled out of scope (for now) + +`docs/GlobalVariables.md` lists several other process-wide mutexes: +`g_pvDBIndexMutex`, `g_preprocessMemoryPoolMutex`, `g_ms1AlignerMutex`, +`g_pvQueryMutex`. None of these sit on the per-spectrum hot path the way +`SearchMemoryPool`'s mutex does -- they guard one-time initialization work (DB +index reads, spectral-library loading) or comparatively low-frequency updates +(MS1 RT alignment history, once per MS1 RTS call on a lower-volume path). Revisit +only if a Phase-0-style measurement on one of those specific paths shows an +actual problem; don't speculatively rewrite them without evidence -- that was +exactly the mistake this document is trying to avoid by leading with Phase 0. 
+ +## Appendix: benchmark methodology + +Standalone harness, compiled directly against the real +`threading/SearchMemoryPool.cpp` and `Threading.cpp` (no need to link the rest of +Comet's dependency tree -- `logout`/`logerr` are macros over `cout`/`cerr`, and +`CometStatus` is fully defined inline in its header): + +```cpp +// bench_pool.cpp +#include "threading/SearchMemoryPool.h" +#include "CometStatus.h" +#include <atomic> +#include <chrono> +#include <cstdio> +#include <cstdlib> +#include <thread> +#include <vector> + +CometStatus g_cometStatus; // extern required by SearchMemoryPool.cpp's bad_alloc path + +int main(int argc, char** argv) +{ + int nSlots = argc > 1 ? atoi(argv[1]) : 8; + int nThreads = argc > 2 ? atoi(argv[2]) : 32; + long nIters = argc > 3 ? atol(argv[3]) : 200000; + + SearchMemoryPool pool; + if (!pool.allocate(nSlots, 16)) { fprintf(stderr, "allocate failed\n"); return 1; } + + std::atomic<long> totalOps{0}; + std::vector<std::thread> threads; + auto tStart = std::chrono::steady_clock::now(); + + for (int t = 0; t < nThreads; ++t) + { + threads.emplace_back([&pool, nIters, &totalOps]() { + for (long i = 0; i < nIters; ++i) + { + int slot = pool.acquireSlot(); + if (slot < 0) continue; + SearchMemoryPoolSlotGuard guard{pool, slot}; + volatile bool* p = pool.duplFragmentArr(slot); + p[0] = !p[0]; // simulate minimal real work + totalOps.fetch_add(1, std::memory_order_relaxed); + } + }); + } + for (auto& th : threads) th.join(); + + double sec = std::chrono::duration<double>(std::chrono::steady_clock::now() - tStart).count(); + long ops = totalOps.load(); + printf("slots=%d threads=%d total_ops=%ld time=%.4fs ops/sec=%.0f avg_latency_ns=%.1f\n", + nSlots, nThreads, ops, sec, ops / sec, (sec * 1e9) / ops); + + pool.deallocate(); + return 0; +} +``` + +Compile from `CometSearch/`: + +```bash +g++ -O3 -std=c++20 -fpermissive -I.
-I../MSToolkit/include \ + -I../MSToolkit/extern/expat-2.2.9/lib -I../MSToolkit/extern/zlib-1.2.11 \ + -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D__LINUX__ -D_NOSQLITE \ + bench_pool.cpp threading/SearchMemoryPool.cpp Threading.cpp -lpthread -o bench_pool +./bench_pool +``` + +Results from the prior benchmarking pass (O(n) linear-scan implementation vs. the +O(1) free-list that replaced it -- both still mutex-bound, which is the point of +this document): + +| Slots/Threads | O(n) scan ops/sec | O(1) free-list ops/sec | Delta | +|---|---|---|---| +| 8/8 | 3.81M | 4.00M | +5% | +| 8/32 | 4.28M | 4.05M | -5% (noise) | +| 16/64 | 3.91M | 3.87M | -1% (noise) | +| 32/128 | 3.89M | 4.11M | +6% | +| 256/256 | 3.81M | 5.05M | +33% | +| 512/512 | 3.26M | 4.39M | +35% | + +At realistic pool sizes (`iNumThreads`, typically <= 64), throughput is flat +across both implementations within noise -- confirming the mutex, not the scan, +sets the ceiling. The free-list version only pulls ahead once slot counts grow +well past any realistic `iNumThreads` value, which is informative for +understanding *why* the scan wasn't the bottleneck but does not by itself +indicate a production-relevant win. Phase 0 of this plan is how to find out +whether removing the mutex itself would be. From 862ff0d47b65626efb992c4420163558d341a3c5 Mon Sep 17 00:00:00 2001 From: Jimmy Eng Date: Thu, 18 Jun 2026 13:30:25 -0700 Subject: [PATCH 14/15] test: add internal-decoy regression variants and Windows .raw file support test Extend run_regression.py with decoy_search=1/2 params variants (fasta/pi only, fi unsupported) and decoy-file comparison. Add test_raw_vs_mzxml.py to confirm the Windows binary's direct .raw reading matches .mzXML search results across all 5 output formats. Document both in tests/tests.md. 
Co-Authored-By: Claude Sonnet 4.6 --- ...ms => comet_phospho_internaldecoy1.params} | 23 +- ...ms => comet_phospho_internaldecoy2.params} | 21 +- data/comet_small.params | 158 --------- tests/regression/run_regression.py | 165 +++++++--- tests/regression/test_raw_vs_mzxml.py | 310 ++++++++++++++++++ tests/tests.md | 197 +++++++++++ 6 files changed, 661 insertions(+), 213 deletions(-) rename data/{comet_canonical.params => comet_phospho_internaldecoy1.params} (91%) rename data/{comet.params => comet_phospho_internaldecoy2.params} (92%) delete mode 100644 data/comet_small.params create mode 100644 tests/regression/test_raw_vs_mzxml.py create mode 100644 tests/tests.md diff --git a/data/comet_canonical.params b/data/comet_phospho_internaldecoy1.params similarity index 91% rename from data/comet_canonical.params rename to data/comet_phospho_internaldecoy1.params index 8b89258f..7b9f4a58 100644 --- a/data/comet_canonical.params +++ b/data/comet_phospho_internaldecoy1.params @@ -2,8 +2,13 @@ # Comet MS/MS search engine parameters file. # Everything following the '#' symbol is treated as a comment. # -database_name = C:\\Work\\Comet-master\\data\\human.canonical.fasta -decoy_search = 0 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate +# Variant of comet_phospho.params for tests/regression/run_regression.py's +# "internaldecoy1" decoy variant: decoy_search=1 (internal decoy, concatenated). +# Used for the fasta and pi modes only -- FI searches do not support Comet's +# internal decoy generation. 
+# +database_name = human.target-decoy.fasta +decoy_search = 1 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate num_threads = 0 # 0=poll CPU to set num threads; else specify num threads directly (max 128) @@ -12,7 +17,7 @@ print_ascorepro_score = -1 # 0=no, 0 to 5 to localize variable_mod0 # # masses # -peptide_mass_tolerance_upper = -20.0 # upper bound of the precursor mass tolerance +peptide_mass_tolerance_upper = 20.0 # upper bound of the precursor mass tolerance peptide_mass_tolerance_lower = -20.0 # lower bound of the precursor mass tolerance; USUALLY NEGATIVE TO BE LOWER THAN 0 peptide_mass_units = 2 # 0=amu, 1=mmu, 2=ppm precursor_tolerance_type = 1 # 0=MH+ (default), 1=precursor m/z; only valid for amu/mmu tolerances @@ -21,9 +26,9 @@ isotope_error = 0 # 0=off, 1=0/1 (C13 error), 2=0/1/2, 3=0/ # # search enzyme # -search_enzyme_number = 0 # choose from list at end of this params file +search_enzyme_number = 1 # choose from list at end of this params file search_enzyme2_number = 0 # second enzyme; set to 0 if no second enzyme -sample_enzyme_number = 0 # specifies the sample enzyme which is possibly different than the one applied to the search; +sample_enzyme_number = 1 # specifies the sample enzyme which is possibly different than the one applied to the search; # used by PeptideProphet to calculate NTT & NMC in pepXML output (default=1 for trypsin). num_enzyme_termini = 2 # 1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search @@ -33,8 +38,8 @@ allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search # format: <0=variable/else binary> # e.g. 
79.966331 STY 0 3 -1 0 0 97.976896 # -#variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 -#variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 +variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 +variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 variable_mod03 = 0.0 X 0 3 -1 0 0 0.0 variable_mod04 = 0.0 X 0 3 -1 0 0 0.0 variable_mod05 = 0.0 X 0 3 -1 0 0 0.0 @@ -64,7 +69,7 @@ use_NL_ions = 0 # 0=no, 1=yes to consider NH3/H2O neutral # output_sqtfile = 0 # 0=no, 1=yes write sqt file output_txtfile = 1 # 0=no, 1=yes, 2=Crux-formatted write tab-delimited txt file -output_pepxmlfile = 1 # 0=no, 1=yes write pepXML file +output_pepxmlfile = 0 # 0=no, 1=yes write pepXML file output_mzidentmlfile = 0 # 0=no, 1=yes write mzIdentML file output_percolatorfile = 0 # 0=no, 1=yes write Percolator pin file num_output_lines = 1 # num peptide results to show @@ -82,7 +87,7 @@ activation_method = ALL # activation method; used if activation m # misc parameters # digest_mass_range = 800.0 5000.0 # MH+ peptide mass range to analyze -peptide_length_range = 8 25 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) +peptide_length_range = 8 50 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) max_duplicate_proteins = 10 # maximum number of additional duplicate protein names to report for each peptide ID; -1 reports all duplicates max_fragment_charge = 3 # set maximum fragment charge state to analyze (allowed max 5) min_precursor_charge = 1 # set minimum precursor charge state to analyze (1 if missing) diff --git a/data/comet.params b/data/comet_phospho_internaldecoy2.params similarity index 92% rename from data/comet.params rename to data/comet_phospho_internaldecoy2.params index a0facab0..32fc99b0 100644 --- a/data/comet.params +++ b/data/comet_phospho_internaldecoy2.params @@ -2,8 +2,13 @@ # Comet MS/MS search engine parameters file. # Everything following the '#' symbol is treated as a comment. 
# +# Variant of comet_phospho.params for tests/regression/run_regression.py's +# "internaldecoy2" decoy variant: decoy_search=2 (internal decoy, separate). +# Used for the fasta and pi modes only -- FI searches do not support Comet's +# internal decoy generation. +# database_name = human.target-decoy.fasta -decoy_search = 0 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate +decoy_search = 2 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate num_threads = 0 # 0=poll CPU to set num threads; else specify num threads directly (max 128) @@ -12,7 +17,7 @@ print_ascorepro_score = -1 # 0=no, 0 to 5 to localize variable_mod0 # # masses # -peptide_mass_tolerance_upper = -20.0 # upper bound of the precursor mass tolerance +peptide_mass_tolerance_upper = 20.0 # upper bound of the precursor mass tolerance peptide_mass_tolerance_lower = -20.0 # lower bound of the precursor mass tolerance; USUALLY NEGATIVE TO BE LOWER THAN 0 peptide_mass_units = 2 # 0=amu, 1=mmu, 2=ppm precursor_tolerance_type = 1 # 0=MH+ (default), 1=precursor m/z; only valid for amu/mmu tolerances @@ -21,9 +26,9 @@ isotope_error = 0 # 0=off, 1=0/1 (C13 error), 2=0/1/2, 3=0/ # # search enzyme # -search_enzyme_number = 0 # choose from list at end of this params file +search_enzyme_number = 1 # choose from list at end of this params file search_enzyme2_number = 0 # second enzyme; set to 0 if no second enzyme -sample_enzyme_number = 0 # specifies the sample enzyme which is possibly different than the one applied to the search; +sample_enzyme_number = 1 # specifies the sample enzyme which is possibly different than the one applied to the search; # used by PeptideProphet to calculate NTT & NMC in pepXML output (default=1 for trypsin). 
num_enzyme_termini = 2 # 1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search @@ -33,8 +38,8 @@ allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search # format: <0=variable/else binary> # e.g. 79.966331 STY 0 3 -1 0 0 97.976896 # -#variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 -#variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 +variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 +variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 variable_mod03 = 0.0 X 0 3 -1 0 0 0.0 variable_mod04 = 0.0 X 0 3 -1 0 0 0.0 variable_mod05 = 0.0 X 0 3 -1 0 0 0.0 @@ -64,7 +69,7 @@ use_NL_ions = 0 # 0=no, 1=yes to consider NH3/H2O neutral # output_sqtfile = 0 # 0=no, 1=yes write sqt file output_txtfile = 1 # 0=no, 1=yes, 2=Crux-formatted write tab-delimited txt file -output_pepxmlfile = 1 # 0=no, 1=yes write pepXML file +output_pepxmlfile = 0 # 0=no, 1=yes write pepXML file output_mzidentmlfile = 0 # 0=no, 1=yes write mzIdentML file output_percolatorfile = 0 # 0=no, 1=yes write Percolator pin file num_output_lines = 1 # num peptide results to show @@ -82,7 +87,7 @@ activation_method = ALL # activation method; used if activation m # misc parameters # digest_mass_range = 800.0 5000.0 # MH+ peptide mass range to analyze -peptide_length_range = 8 25 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) +peptide_length_range = 8 50 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) max_duplicate_proteins = 10 # maximum number of additional duplicate protein names to report for each peptide ID; -1 reports all duplicates max_fragment_charge = 3 # set maximum fragment charge state to analyze (allowed max 5) min_precursor_charge = 1 # set minimum precursor charge state to analyze (1 if missing) diff --git a/data/comet_small.params b/data/comet_small.params deleted file mode 100644 index 62976967..00000000 --- 
a/data/comet_small.params +++ /dev/null @@ -1,158 +0,0 @@ -# comet_version 2026.01 rev. 0 -# Comet MS/MS search engine parameters file. -# Everything following the '#' symbol is treated as a comment. -# -database_name = C:\\Work\\Comet-master\\data\\human.small.fasta -decoy_search = 0 # 0=no (default), 1=internal decoy concatenated, 2=internal decoy separate - -num_threads = 0 # 0=poll CPU to set num threads; else specify num threads directly (max 128) - -print_ascorepro_score = -1 # 0=no, 0 to 5 to localize variable_mod01 to _mod05; -1 to localize all variable mods - -# -# masses -# -peptide_mass_tolerance_upper = -20.0 # upper bound of the precursor mass tolerance -peptide_mass_tolerance_lower = -20.0 # lower bound of the precursor mass tolerance; USUALLY NEGATIVE TO BE LOWER THAN 0 -peptide_mass_units = 2 # 0=amu, 1=mmu, 2=ppm -precursor_tolerance_type = 1 # 0=MH+ (default), 1=precursor m/z; only valid for amu/mmu tolerances -isotope_error = 0 # 0=off, 1=0/1 (C13 error), 2=0/1/2, 3=0/1/2/3, 4=-1/0/1/2/3, 5=-1/0/1 - -# -# search enzyme -# -search_enzyme_number = 0 # choose from list at end of this params file -search_enzyme2_number = 0 # second enzyme; set to 0 if no second enzyme -sample_enzyme_number = 0 # specifies the sample enzyme which is possibly different than the one applied to the search; - # used by PeptideProphet to calculate NTT & NMC in pepXML output (default=1 for trypsin). -num_enzyme_termini = 2 # 1 (semi-digested), 2 (fully digested, default), 8 C-term unspecific , 9 N-term unspecific -allowed_missed_cleavage = 2 # maximum value is 5; for enzyme search - -# -# Up to 15 variable_mod entries are supported for a standard search; manually add additional entries as needed -# format: <0=variable/else binary> -# e.g. 
79.966331 STY 0 3 -1 0 0 97.976896 -# -#variable_mod01 = 15.9949 M 0 2 -1 0 0 0.0 -#variable_mod02 = 79.966331 STY 0 2 -1 0 0 97.976896 -variable_mod03 = 0.0 X 0 3 -1 0 0 0.0 -variable_mod04 = 0.0 X 0 3 -1 0 0 0.0 -variable_mod05 = 0.0 X 0 3 -1 0 0 0.0 -max_variable_mods_in_peptide = 4 -require_variable_mod = 0 - -# -# fragment ions -# -# ion trap ms/ms: 1.0005 tolerance, 0.4 offset (mono masses), theoretical_fragment_ions = 1 -# high res ms/ms: 0.02 tolerance, 0.0 offset (mono masses), theoretical_fragment_ions = 0, spectrum_batch_size = 15000 -# -fragment_bin_tol = 0.02 # binning to use on fragment ions -fragment_bin_offset = 0.0 # offset position to start the binning (0.0 to 1.0) -theoretical_fragment_ions = 0 # 0=use flanking peaks, 1=M peak only -use_A_ions = 0 -use_B_ions = 1 -use_C_ions = 0 -use_X_ions = 0 -use_Y_ions = 1 -use_Z_ions = 0 -use_Z1_ions = 0 -use_NL_ions = 0 # 0=no, 1=yes to consider NH3/H2O neutral loss peaks - -# -# output -# -output_sqtfile = 0 # 0=no, 1=yes write sqt file -output_txtfile = 1 # 0=no, 1=yes, 2=Crux-formatted write tab-delimited txt file -output_pepxmlfile = 1 # 0=no, 1=yes write pepXML file -output_mzidentmlfile = 0 # 0=no, 1=yes write mzIdentML file -output_percolatorfile = 0 # 0=no, 1=yes write Percolator pin file -num_output_lines = 1 # num peptide results to show - -# -# mzXML/mzML/raw file parameters -# -scan_range = 0 0 # start and end scan range to search; either entry can be set independently -precursor_charge = 0 0 # precursor charge range to analyze; does not override any existing charge; 0 as 1st entry ignores parameter -override_charge = 0 # 0=no, 1=override precursor charge states, 2=ignore precursor charges outside precursor_charge range, 3=see online -ms_level = 2 # MS level to analyze, valid are levels 2 (default) or 3 -activation_method = ALL # activation method; used if activation method set; allowed ALL, CID, ECD, ETD, ETD+SA, PQD, HCD, IRMPD, SID - -# -# misc parameters -# -digest_mass_range = 800.0 5000.0 
# MH+ peptide mass range to analyze -peptide_length_range = 8 25 # minimum and maximum peptide length to analyze (default min 1 to allowed max 51) -max_duplicate_proteins = 10 # maximum number of additional duplicate protein names to report for each peptide ID; -1 reports all duplicates -max_fragment_charge = 3 # set maximum fragment charge state to analyze (allowed max 5) -min_precursor_charge = 1 # set minimum precursor charge state to analyze (1 if missing) -max_precursor_charge = 6 # set maximum precursor charge state to analyze (allowed max 9) -clip_nterm_methionine = 0 # 0=leave protein sequences as-is; 1=also consider sequence w/o N-term methionine -spectrum_batch_size = 75000 # max. # of spectra to search at a time; 0 to search the entire scan range in one loop -decoy_prefix = DECOY_ # decoy entries are denoted by this string which is pre-pended to each protein accession -equal_I_and_L = 1 # 0=treat I and L as different; 1=treat I and L as same -mass_offsets = # one or more mass offsets to search (values substracted from deconvoluted precursor mass) - -# -# spectral processing -# -minimum_peaks = 10 # required minimum number of peaks in spectrum to search (default 10) -minimum_intensity = 0 # minimum intensity value to read in -remove_precursor_peak = 0 # 0=no, 1=yes, 2=all charge reduced precursor peaks (for ETD), 3=phosphate neutral loss peaks -remove_precursor_tolerance = 1.5 # +- Da tolerance for precursor removal -clear_mz_range = 0.0 0.0 # clear out all peaks in the specified m/z range e.g. remove reporter ion region of TMT spectra -percentage_base_peak = 0.0 # specify a percentage (e.g. "0.05" for 5%) of the base peak intensity as a minimum intensity threshold - -# -# static modifications -# -add_Cterm_peptide = 0.0 -add_Nterm_peptide = 0.0 -add_Cterm_protein = 0.0 -add_Nterm_protein = 0.0 - -add_G_glycine = 0.0000 # added to G - avg. 57.0513, mono. 57.02146 -add_A_alanine = 0.0000 # added to A - avg. 71.0779, mono. 
71.03711 -add_S_serine = 0.0000 # added to S - avg. 87.0773, mono. 87.03203 -add_P_proline = 0.0000 # added to P - avg. 97.1152, mono. 97.05276 -add_V_valine = 0.0000 # added to V - avg. 99.1311, mono. 99.06841 -add_T_threonine = 0.0000 # added to T - avg. 101.1038, mono. 101.04768 -add_C_cysteine = 57.021464 # added to C - avg. 103.1429, mono. 103.00918 -add_L_leucine = 0.0000 # added to L - avg. 113.1576, mono. 113.08406 -add_I_isoleucine = 0.0000 # added to I - avg. 113.1576, mono. 113.08406 -add_N_asparagine = 0.0000 # added to N - avg. 114.1026, mono. 114.04293 -add_D_aspartic_acid = 0.0000 # added to D - avg. 115.0874, mono. 115.02694 -add_Q_glutamine = 0.0000 # added to Q - avg. 128.1292, mono. 128.05858 -add_K_lysine = 0.0000 # added to K - avg. 128.1723, mono. 128.09496 -add_E_glutamic_acid = 0.0000 # added to E - avg. 129.1140, mono. 129.04259 -add_M_methionine = 0.0000 # added to M - avg. 131.1961, mono. 131.04048 -add_H_histidine = 0.0000 # added to H - avg. 137.1393, mono. 137.05891 -add_F_phenylalanine = 0.0000 # added to F - avg. 147.1739, mono. 147.06841 -add_U_selenocysteine = 0.0000 # added to U - avg. 150.0379, mono. 150.95363 -add_R_arginine = 0.0000 # added to R - avg. 156.1857, mono. 156.10111 -add_Y_tyrosine = 0.0000 # added to Y - avg. 163.0633, mono. 163.06333 -add_W_tryptophan = 0.0000 # added to W - avg. 186.0793, mono. 186.07931 -add_O_pyrrolysine = 0.0000 # added to O - avg. 237.2982, mono 237.14773 -add_B_user_amino_acid = 0.0000 # added to B - avg. 0.0000, mono. 0.00000 -add_J_user_amino_acid = 0.0000 # added to J - avg. 0.0000, mono. 0.00000 -add_X_user_amino_acid = 0.0000 # added to X - avg. 0.0000, mono. 0.00000 -add_Z_user_amino_acid = 0.0000 # added to Z - avg. 0.0000, mono. 0.00000 - -# -# COMET_ENZYME_INFO _must_ be at the end of this parameters file -# Enzyme entries can be added/deleted/edited -# -[COMET_ENZYME_INFO] -0. Cut_everywhere 0 - - -1. Trypsin 1 KR P -2. Trypsin/P 1 KR - -3. Lys_C 1 K P -4. Lys_N 0 K - -5. 
Arg_C 1 R P -6. Asp_N 0 DN - -7. CNBr 1 M - -8. Asp-N_ambic 1 DE - -9. PepsinA 1 FL - -10. Chymotrypsin 1 FWYL P -11. No_cut 1 @ @ - diff --git a/tests/regression/run_regression.py b/tests/regression/run_regression.py index 2e8d119f..f9bc92a0 100644 --- a/tests/regression/run_regression.py +++ b/tests/regression/run_regression.py @@ -7,20 +7,34 @@ fi -- fragment ion index search (index built fresh by each binary) pi -- peptide index search (index built fresh by each binary) -Comparison metrics per mode: +Each mode is run under one or more decoy variants (each backed by its own +params file, decoy_search baked in): + nodecoy -- decoy_search = 0 (comet_phospho.params) + internaldecoy1 -- decoy_search = 1, internal decoy concatenated + (comet_phospho_internaldecoy1.params) + internaldecoy2 -- decoy_search = 2, internal decoy separate + (comet_phospho_internaldecoy2.params) +internaldecoy1/internaldecoy2 only run against fasta and pi -- FI does not +support Comet's internal (on-the-fly) decoy generation, so that combination +is skipped automatically. + +Comparison metrics per mode/variant: - Wall-clock search time (seconds); index build time reported separately - PSM count (number of lines in .txt output above xcorr threshold) - PSM overlap: fraction of scans where both binaries agree on the top peptide + - For internaldecoy2 (decoy_search=2 writes a separate .decoy.txt): + the same PSM count/overlap comparison is also run on that decoy-only file. Usage: # 1. Fetch baseline binary first: python setup_baselines.py - # 2. Run all three modes against default baseline tag(s): + # 2. Run all modes x all decoy variants against default baseline tag(s): python run_regression.py - # 3. Restrict modes or tags: + # 3. Restrict modes, decoy variants, or tags: python run_regression.py --modes fasta fi + python run_regression.py --decoy-variants nodecoy internaldecoy2 python run_regression.py --tags v2026.01.1 # 4. 
Point at non-default binaries or data: @@ -28,8 +42,9 @@ python run_regression.py --data ../../data Output: - results/_/report.txt human-readable summary - results/_/report.json machine-readable metrics + results/_/report.txt human-readable summary (all variants x modes) + results/_/report.json machine-readable metrics + results/_///... raw per-run Comet output (baseline.txt, current.txt, etc.) """ import argparse @@ -67,6 +82,22 @@ DEFAULT_TAGS = ["v2026.01.1"] MODES = ["fasta", "fi", "pi"] +# Decoy variants: filename (relative to the effective data dir) for each variant's +# params file. Each is identical to comet_phospho.params except decoy_search. +DECOY_VARIANT_FILENAMES = { + "nodecoy": "comet_phospho.params", + "internaldecoy1": "comet_phospho_internaldecoy1.params", + "internaldecoy2": "comet_phospho_internaldecoy2.params", +} +# Modes each variant is valid for. FI does not support Comet's internal +# (on-the-fly) decoy generation, so internaldecoy1/internaldecoy2 are fasta/pi only. +DECOY_VARIANT_MODES = { + "nodecoy": {"fasta", "fi", "pi"}, + "internaldecoy1": {"fasta", "pi"}, + "internaldecoy2": {"fasta", "pi"}, +} +DEFAULT_DECOY_VARIANTS = list(DECOY_VARIANT_FILENAMES.keys()) + XCORR_THRESHOLD = 2.5 # minimum xcorr to count a PSM @@ -161,18 +192,27 @@ def build_index(binary: Path, params_path: Path, mode: str, work_dir: Path) -> f def run_search(binary: Path, params_path: Path, mzxml: Path, - work_dir: Path) -> tuple[float, Path]: - """Run a search; return (elapsed_seconds, path_to_.txt_output).""" - # Comet writes output next to the mzXML; remove stale file first. - out_txt = mzxml.with_suffix(".txt") + work_dir: Path) -> tuple[float, Path, Path]: + """ + Run a search; return (elapsed_seconds, path_to_.txt_output, path_to_decoy_txt_output). 
+ + The decoy path is only populated by Comet when decoy_search=2 (separate + target/decoy output, see TxtWriter::open() / IResultWriter::BuildNames()); + for decoy_search=0/1 it simply won't exist and callers should check before use. + """ + # Comet writes output next to the mzXML; remove stale files first. + out_txt = mzxml.with_suffix(".txt") + out_decoy = mzxml.parent / (mzxml.stem + ".decoy.txt") if out_txt.exists(): out_txt.unlink() + if out_decoy.exists(): + out_decoy.unlink() elapsed, _ = run_comet( binary, [f"-P{comet_path(params_path)}", comet_path(mzxml)], work_dir, ) - return elapsed, out_txt + return elapsed, out_txt, out_decoy # --------------------------------------------------------------------------- @@ -289,7 +329,7 @@ def run_mode(mode: str, current_bin: Path, baseline_bin: Path, for label, binary in [("baseline", baseline_bin), ("current", current_bin)]: print(f" [fasta] running {label} ...") try: - t, txt_src = run_search(binary, params_path, MZXML_FILE, run_dir) + t, txt_src, decoy_src = run_search(binary, params_path, MZXML_FILE, run_dir) metrics[f"search_time_{label}_s"] = round(t, 2) except RuntimeError as e: print(f" ERROR: {e}", file=sys.stderr) @@ -298,6 +338,8 @@ def run_mode(mode: str, current_bin: Path, baseline_bin: Path, dest = run_dir / f"{label}.txt" if txt_src.exists(): shutil.copy(txt_src, dest) + if decoy_src.exists(): + shutil.copy(decoy_src, run_dir / f"{label}.decoy.txt") else: # ---- FI / PI: build index per binary in its own subdirectory ---- @@ -343,7 +385,7 @@ def run_mode(mode: str, current_bin: Path, baseline_bin: Path, print(f" [{mode}] running {label} search ...") try: - t, txt_src = run_search(binary, search_params_path, MZXML_FILE, sub) + t, txt_src, decoy_src = run_search(binary, search_params_path, MZXML_FILE, sub) metrics[f"search_time_{label}_s"] = round(t, 2) except RuntimeError as e: print(f" ERROR in search: {e}", file=sys.stderr) @@ -353,12 +395,24 @@ def run_mode(mode: str, current_bin: Path, baseline_bin: 
Path, dest = run_dir / f"{label}.txt" if txt_src.exists(): shutil.copy(txt_src, dest) + if decoy_src.exists(): + shutil.copy(decoy_src, run_dir / f"{label}.decoy.txt") # ---- Compare ---- print(f" [{mode}] comparing results ...") base_psms = parse_txt(run_dir / "baseline.txt") curr_psms = parse_txt(run_dir / "current.txt") metrics.update(compare_results(base_psms, curr_psms)) + + # decoy_search=2 writes a separate .decoy.txt; compare that too + # if either binary produced one (decoy_search=0/1 never will). + base_decoy_path = run_dir / "baseline.decoy.txt" + curr_decoy_path = run_dir / "current.decoy.txt" + if base_decoy_path.exists() or curr_decoy_path.exists(): + base_decoy_psms = parse_txt(base_decoy_path) + curr_decoy_psms = parse_txt(curr_decoy_path) + metrics["decoy_file"] = compare_results(base_decoy_psms, curr_decoy_psms) + return metrics @@ -383,25 +437,39 @@ def print_report(all_metrics: list[dict], current_bin: Path, baseline_tag: str): print(f" current : {current_bin}") print(f" xcorr threshold for PSM count: >= {XCORR_THRESHOLD}") print(sep) + def print_comparison(prefix: str, c: dict): + print(f" {prefix}PSMs >= {XCORR_THRESHOLD} (baseline) : {fmt(c.get('base_psm_count'))}") + print(f" {prefix}PSMs >= {XCORR_THRESHOLD} (current) : {fmt(c.get('curr_psm_count'))}") + af = c.get("agree_frac") + if af is not None: + pct = af * 100 + print(f" {prefix}top-peptide agreement : {c['agree_top_peptide']:>8} / " + f"{c['common_scans']} common scans ({pct:.2f}%)") + print(f" {prefix}only in baseline : {fmt(c.get('only_in_baseline'))}") + print(f" {prefix}only in current : {fmt(c.get('only_in_current'))}") + else: + print(f" {prefix}top-peptide agreement : {'N/A':>8}") + for m in all_metrics: - mode = m["mode"] - print(f"\nMode: {mode.upper()}") + mode = m["mode"] + variant = m.get("decoy_variant", "nodecoy") + print(f"\nVariant: {variant} Mode: {mode.upper()}") + + if m.get("skipped"): + print(f" SKIPPED: {m.get('skip_reason', 'not applicable')}") + continue + if 
m.get("index_build_time_baseline_s", "absent") != "absent": print(f" index build (baseline) : {fmt(m['index_build_time_baseline_s'], decimals=1)} s") print(f" index build (current) : {fmt(m.get('index_build_time_current_s'), decimals=1)} s") print(f" search time (baseline) : {fmt(m.get('search_time_baseline_s'), decimals=1)} s") print(f" search time (current) : {fmt(m.get('search_time_current_s'), decimals=1)} s") - print(f" PSMs >= {XCORR_THRESHOLD} (baseline) : {fmt(m.get('base_psm_count'))}") - print(f" PSMs >= {XCORR_THRESHOLD} (current) : {fmt(m.get('curr_psm_count'))}") - af = m.get("agree_frac") - if af is not None: - pct = af * 100 - print(f" top-peptide agreement : {m['agree_top_peptide']:>8} / " - f"{m['common_scans']} common scans ({pct:.2f}%)") - print(f" only in baseline : {fmt(m.get('only_in_baseline'))}") - print(f" only in current : {fmt(m.get('only_in_current'))}") - else: - print(f" top-peptide agreement : {'N/A':>8}") + print_comparison("", m) + + decoy_file = m.get("decoy_file") + if decoy_file is not None: + print(f" -- separate decoy file (decoy_search=2) --") + print_comparison("decoy ", decoy_file) print(sep) @@ -418,6 +486,11 @@ def main(): help=f"baseline release tags (default: {DEFAULT_TAGS})") parser.add_argument("--modes", nargs="+", default=MODES, choices=MODES, help=f"search modes to run (default: all)") + parser.add_argument("--decoy-variants", nargs="+", default=DEFAULT_DECOY_VARIANTS, + choices=DEFAULT_DECOY_VARIANTS, + help=f"decoy_search configurations to test (default: all). 
" + f"internaldecoy1/internaldecoy2 are skipped for the fi mode -- " + f"FI does not support Comet's internal decoy generation.") parser.add_argument("--data", type=Path, default=DATA_DIR, help=f"directory with FASTA, mzXML, and params (default: {DATA_DIR})") args = parser.parse_args() @@ -429,7 +502,13 @@ def main(): MZXML_FILE = args.data / MZXML_FILE.name PARAMS_FILE = args.data / PARAMS_FILE.name - for req, label in [(FASTA_FILE, "FASTA"), (MZXML_FILE, "mzXML"), (PARAMS_FILE, "params")]: + decoy_variant_paths = { + v: (args.data / DECOY_VARIANT_FILENAMES[v]) for v in args.decoy_variants + } + + required = [(FASTA_FILE, "FASTA"), (MZXML_FILE, "mzXML")] + required += [(p, f"{v} params") for v, p in decoy_variant_paths.items()] + for req, label in required: if not req.exists(): print(f"ERROR: {label} file not found: {req}", file=sys.stderr) sys.exit(1) @@ -438,9 +517,9 @@ def main(): print(f"ERROR: current binary not found: {args.current}", file=sys.stderr) sys.exit(1) - base_params = load_params(PARAMS_FILE) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - had_error = False + decoy_variant_params = {v: load_params(p) for v, p in decoy_variant_paths.items()} + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + had_error = False for tag in args.tags: baseline_bin = BASELINES_DIR / tag / ("Comet.exe" if IS_WINDOWS else "comet") @@ -456,15 +535,25 @@ def main(): run_root = RESULTS_DIR / f"{timestamp}_{tag}" tag_metrics = [] - for mode in args.modes: - try: - m = run_mode(mode, args.current, baseline_bin, - base_params, run_root / mode) - except Exception as e: - print(f" [{mode}] FAILED: {e}", file=sys.stderr) - m = {"mode": mode, "error": str(e)} - had_error = True - tag_metrics.append(m) + for variant in args.decoy_variants: + for mode in args.modes: + if mode not in DECOY_VARIANT_MODES[variant]: + print(f" [{variant}/{mode}] SKIPPED: FI does not support " + f"Comet's internal decoy generation") + tag_metrics.append({ + "mode": mode, "decoy_variant": 
variant, "skipped": True, + "skip_reason": "FI does not support Comet's internal decoy generation", + }) + continue + try: + m = run_mode(mode, args.current, baseline_bin, + decoy_variant_params[variant], run_root / variant / mode) + except Exception as e: + print(f" [{variant}/{mode}] FAILED: {e}", file=sys.stderr) + m = {"mode": mode, "error": str(e)} + had_error = True + m["decoy_variant"] = variant + tag_metrics.append(m) print_report(tag_metrics, args.current, tag) diff --git a/tests/regression/test_raw_vs_mzxml.py b/tests/regression/test_raw_vs_mzxml.py new file mode 100644 index 00000000..11f4172c --- /dev/null +++ b/tests/regression/test_raw_vs_mzxml.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +""" +Windows .raw file support test -- compares the same Windows Comet binary +searching the identical Hela run via .mzXML vs .raw, across all 5 output +formats (txt, sqt, pep.xml, mzid, pin). + +Only the Windows release reads .raw files directly (Thermo vendor library); +this test is SKIPPED (exit 0, not a failure) when given a non-Windows binary +or when the .raw fixture is absent, since both are expected/documented +conditions rather than test failures. + +Goal: confirm (a) .raw file reading works correctly -- the .mzXML and .raw +searches should agree "near exactly" (same underlying spectra, two different +encodings, so tiny floating-point/centroiding differences are tolerated but +not large disagreements) -- and (b) every enabled output format is valid and +non-empty for both input formats, not just the default .txt. + +Usage: + python test_raw_vs_mzxml.py + python test_raw_vs_mzxml.py --comet ../../x64/Release/Comet.exe + python test_raw_vs_mzxml.py --data ../../data +""" + +import argparse +import re +import sys +import time +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.resolve())) +import run_regression as rr # reuse run_comet/parse_txt/compare_results/patch_params/etc. 
+ +REGRESSION_DIR = Path(__file__).parent.resolve() +REPO_ROOT = REGRESSION_DIR.parent.parent +DATA_DIR = REPO_ROOT / "data" +DEFAULT_COMET_WIN = REPO_ROOT / "x64" / "Release" / "Comet.exe" + +FASTA_FILE = DATA_DIR / "human.small.fasta" +MZXML_FILE = DATA_DIR / "20250520_Hela_60min_06.mzXML" +RAW_FILE = DATA_DIR / "20250520_Hela_60min_06.raw" +PARAMS_FILE = DATA_DIR / "comet_phospho.params" + +XCORR_THRESHOLD = 2.5 # same bar used by run_regression.py's .txt PSM comparison +MIN_AGREE_FRAC = 0.99 # "near exact" bar -- not byte-exact, since .raw and .mzXML + # are two different encodings of the same underlying spectra +MAX_COUNT_DRIFT = 0.01 # 1% tolerance on record counts for the non-txt formats (these + # are spectrum-processed counts, not scoring-threshold-sensitive, + # so they should track each other tightly) +MAX_PSM_COUNT_DRIFT = 0.05 # 5% tolerance on the xcorr>=threshold PSM count itself -- + # looser than MAX_COUNT_DRIFT because this count is sensitive to + # borderline scores flipping across the threshold from the tiny + # numeric differences between vendor-raw and converted-mzXML peaks + +# format label -> (params flag to enable it, output file extension) +OUTPUT_FORMATS = { + "txt": ("output_txtfile", ".txt"), + "sqt": ("output_sqtfile", ".sqt"), + "pepxml": ("output_pepxmlfile", ".pep.xml"), + "mzidentml": ("output_mzidentmlfile", ".mzid"), + "percolator": ("output_percolatorfile", ".pin"), +} + + +# --------------------------------------------------------------------------- +# Windows-binary / path helpers (binary-driven, not host-OS-driven -- this +# script is meant to invoke a Windows .exe from any host, e.g. 
via WSL interop) +# --------------------------------------------------------------------------- + +def is_windows_binary(path: Path) -> bool: + try: + with open(path, "rb") as f: + return f.read(2) == b"MZ" + except Exception: + return False + + +def to_win_path(p: Path) -> str: + s = str(p) + if s.startswith("/mnt/"): + parts = s[5:].split("/", 1) + drive = parts[0].upper() + ":" + rest = parts[1].replace("/", "\\") if len(parts) > 1 else "" + return drive + "\\" + rest + return s + + +# --------------------------------------------------------------------------- +# Lightweight per-format record counters (just enough to confirm "valid and +# not blank", plus a count to compare between the mzXML and .raw runs). +# Full peptide-level comparison is only done for .txt, via run_regression's +# already-proven parse_txt()/compare_results(). +# --------------------------------------------------------------------------- + +def count_sqt_spectra(path: Path) -> int: + if not path.exists(): + return 0 + n = 0 + with open(path, encoding="utf-8", errors="replace") as fh: + for line in fh: + if line.startswith("S\t"): + n += 1 + return n + + +def count_pepxml_spectra(path: Path) -> int: + if not path.exists(): + return 0 + text = path.read_text(encoding="utf-8", errors="replace") + return text.count(" int: + if not path.exists(): + return 0 + text = path.read_text(encoding="utf-8", errors="replace") + return len(re.findall(r" bool: + if a == b: + return True + denom = max(a, b, 1) + return abs(a - b) / denom <= tol + + +# --------------------------------------------------------------------------- +# Search execution +# --------------------------------------------------------------------------- + +def run_one_search(comet: Path, params_path: Path, input_file: Path, work_dir: Path): + """Run comet against input_file; return elapsed seconds.""" + elapsed, _ = rr.run_comet( + comet, + [f"-P{to_win_path(params_path)}", to_win_path(input_file)], + work_dir, + ) + return elapsed + + 
+def collect_outputs(input_file: Path, dest_dir: Path, label: str) -> dict: + """ + Move (not copy) every produced output file from next to input_file into + dest_dir/