From a0fd6308a90f95df05af1f6a5fa573397f9c9a01 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 23:45:51 +0100 Subject: [PATCH 01/34] =?UTF-8?q?perf(search):=20T1=20=E2=80=94=20per-task?= =?UTF-8?q?=20wall=20stats=20+=20tail-imbalance=20summary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each RunMSGFPlus task now captures preprocess / db-search / compute-evalue phase walls plus its total wall into a TaskWallStats and exposes them via getWallStats(). MSGFPlus.runMSGFPlus retains the submitted task list and, after awaitTermination, prints a one-line summary: Task wall summary (n=24): min=12.4s median=15.6s p95=19.8s max=20.2s total=362.1s tail_gap=4.6s (29% of median) The "tail_gap" (max - median) and its ratio to median are the inputs we need to decide whether finer task granularity (T2) or a work-stealing pool (T3) would actually help on a given dataset. If tail_gap is single-digit percent, neither change is worth shipping. No behavior change. The phase walls were already being printed inline; this just retains them in a struct on the task and aggregates at the end. Scoped tests green: 58/58. --- .../msjava/msdbsearch/ConcurrentMSGFPlus.java | 39 ++++++++++++++++-- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 40 +++++++++++++++++++ 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java index 4afcd8a5..4605b6ba 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java @@ -10,6 +10,24 @@ import org.apache.commons.io.output.NullOutputStream; public class ConcurrentMSGFPlus { + /** Per-task wall stats captured during {@link RunMSGFPlus#run()}. Used by + * {@code MSGFPlus.runMSGFPlus} to report a tail-imbalance summary across + * all tasks (T1 instrumentation). 
All units are milliseconds. */ + public static final class TaskWallStats { + public final int taskNum; + public final long preprocessMs; + public final long dbSearchMs; + public final long computeEvalueMs; + public final long totalMs; + TaskWallStats(int taskNum, long preprocessMs, long dbSearchMs, long computeEvalueMs, long totalMs) { + this.taskNum = taskNum; + this.preprocessMs = preprocessMs; + this.dbSearchMs = dbSearchMs; + this.computeEvalueMs = computeEvalueMs; + this.totalMs = totalMs; + } + } + public static class RunMSGFPlus implements Runnable, ProgressReporter { private final Supplier specScannerSupplier; private final CompactSuffixArray sa; @@ -19,6 +37,14 @@ public static class RunMSGFPlus implements Runnable, ProgressReporter { private ProgressData progress; private ScoredSpectraMap specScanner; private DBScanner scanner; + private volatile TaskWallStats wallStats; + + /** Wall stats captured at the end of {@link #run()}, or {@code null} + * if the task didn't complete (e.g. interrupted). Read from the main + * thread after {@code awaitTermination}. 
*/ + public TaskWallStats getWallStats() { + return wallStats; + } @Override public void setProgressData(ProgressData data) { @@ -47,6 +73,8 @@ public RunMSGFPlus( @Override public void run() { + long taskStartNs = System.nanoTime(); + long preprocessMs = 0, dbSearchMs = 0, computeEvalueMs = 0; if (progress == null) { progress = new ProgressData(); } @@ -98,8 +126,9 @@ public void run() { if (Thread.currentThread().isInterrupted()) { return; } + preprocessMs = System.currentTimeMillis() - startTimePreprocess; output.print(threadName + ": Preprocessing spectra finished "); - output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimePreprocess) / 1000)); + output.format("(elapsed time: %.2f sec)\n", preprocessMs / 1000.0f); specScanner.getProgressObj().setParentProgressObj(null); progress.report(5.0); @@ -124,8 +153,9 @@ public void run() { if (Thread.currentThread().isInterrupted()) { return; } + dbSearchMs = System.currentTimeMillis() - startTimeDbSearch; output.print(threadName + ": Database search finished "); - output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimeDbSearch) / 1000)); + output.format("(elapsed time: %.2f sec)\n", dbSearchMs / 1000.0f); progress.stepRange(95.0); @@ -138,8 +168,9 @@ public void run() { if (Thread.currentThread().isInterrupted()) { return; } + computeEvalueMs = System.currentTimeMillis() - startTimeComputeEvalue; output.print(threadName + ": Computing spectral E-values finished "); - output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimeComputeEvalue) / 1000)); + output.format("(elapsed time: %.2f sec)\n", computeEvalueMs / 1000.0f); scanner.getProgressObj().setParentProgressObj(null); progress.stepRange(100); @@ -161,6 +192,8 @@ public void run() { progress.report(100.0); // gen.addSpectrumIdentificationResults(scanner.getSpecIndexDBMatchMap()); + long totalMs = (System.nanoTime() - taskStartNs) / 1_000_000L; + wallStats = 
new TaskWallStats(taskNum, preprocessMs, dbSearchMs, computeEvalueMs, totalMs); output.println(threadName + ": Task " + taskNum + " completed."); } } diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index 3a10c9bc..c2c11c74 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -421,6 +421,10 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } } + // Retain task references so we can pull TaskWallStats after termination + // for the tail-imbalance summary (T1 instrumentation). + List submittedTasks = new ArrayList<>(numTasks); + try { for (int i = 0; i < numTasks; i++) { final int taskStartIndex = startIndex[i]; @@ -454,6 +458,8 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o taskNum ); + submittedTasks.add(msgfplusExecutor); + if (DISABLE_THREADING) { msgfplusExecutor.run(); } else { @@ -478,6 +484,12 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o // Output completed progress report. executor.outputProgressReport(); + // T1: tail-imbalance summary across the just-completed tasks. + // Cheap diagnostic; only printed when there's more than one task. + if (numTasks > 1) { + printTaskWallSummary(submittedTasks); + } + } catch (OutOfMemoryError ex) { ex.printStackTrace(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); @@ -539,4 +551,32 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - saveResultsStartTime) / 1000); return null; } + + /** + * Print a one-line tail-imbalance summary across all completed tasks. + * Reports min / median / p95 / max wall in seconds and the absolute tail + * gap (max - median). 
A high gap indicates uneven SpecKey distribution + * across tasks — input for deciding whether finer task granularity (T2) + * or work-stealing (T3) would help. + */ + private static void printTaskWallSummary(List tasks) { + List walls = new ArrayList<>(tasks.size()); + for (ConcurrentMSGFPlus.RunMSGFPlus t : tasks) { + ConcurrentMSGFPlus.TaskWallStats s = t.getWallStats(); + if (s != null) walls.add(s.totalMs); + } + if (walls.isEmpty()) return; + Collections.sort(walls); + long min = walls.get(0); + long max = walls.get(walls.size() - 1); + long median = walls.get(walls.size() / 2); + long p95 = walls.get(Math.min(walls.size() - 1, (int) Math.ceil(walls.size() * 0.95) - 1)); + long sum = 0L; + for (long w : walls) sum += w; + System.out.format( + "Task wall summary (n=%d): min=%.1fs median=%.1fs p95=%.1fs max=%.1fs total=%.1fs tail_gap=%.1fs (%.0f%% of median)%n", + walls.size(), min / 1000.0, median / 1000.0, p95 / 1000.0, max / 1000.0, + sum / 1000.0, (max - median) / 1000.0, + median > 0 ? 100.0 * (max - median) / median : 0.0); + } } From 957f6c20bcd5a271eafea3bab54797b448674fdd Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 23:51:39 +0100 Subject: [PATCH 02/34] perf(search): drop dead synchronized wrappers in DBScanner + ScoredSpectraMap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both classes are constructed per RunMSGFPlus / ConcurrentMSGFDB task and owned exclusively by the worker thread that runs the task. There is no internal fork-out: dbSearch contains no ExecutorService / ForkJoin / Thread creation (only Thread.currentThread() for interrupt checks and naming). The Collections.synchronizedMap wrappers and synchronized method modifiers were defensive against a sharing pattern that does not occur in production code paths. 
Removed: - ScoredSpectraMap.java:63-72 — pepMassSpecKeyMap, specKeyScorerMap, specIndexChargeToSpecKeyMap, specKeyRankScorerMap unwrapped from synchronized*Map to plain TreeMap / HashMap. - DBScanner.java:92-93 — specKeyDBMatchMap, specIndexDBMatchMap unwrapped to plain HashMap. - DBScanner.java — drop `synchronized` modifier on addDBMatches, generateSpecIndexDBMatchMap, addResultsToList, addDBSearchResults. Pre-flight verified via grep: - new DBScanner / new ScoredSpectraMap call sites: MSGFPlus task lambda (line 433), MSGFDB per-thread loop (321), MassCalibrator pre-pass (orchestrator-thread, single instance), ConcurrentMSGFPlus.run() (per task), ConcurrentMSGFDB ctor (per thread). All task-local. - addDBMatches is only called from inside dbSearch (line 572). Internal, same thread. Memory-visibility safety: main thread reads resultList only after ExecutorService.awaitTermination, which provides happens-before on all worker writes (JLS §17.4.5). Internal sync wrappers were not load-bearing for visibility. Expected wall improvement: 0–2%. HotSpot's biased-locking (on by default only up to JDK 14; disabled by default since JDK 15, JEP 374) + lock coarsening already elide most uncontended monitor cost, so the win is mostly code-clarity, not performance. Will measure with the T1 summary on Astral after T2/T3 land. Scoped tests green: 65/65. 
--- .../edu/ucsd/msjava/msdbsearch/DBScanner.java | 16 ++++++++++------ .../ucsd/msjava/msdbsearch/ScoredSpectraMap.java | 16 ++++++++-------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java b/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java index ace232a8..d16d4524 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java @@ -89,8 +89,12 @@ public DBScanner( intAAMass[aa.getResidue()] = aa.getNominalMass(); } - specKeyDBMatchMap = Collections.synchronizedMap(new HashMap>()); - specIndexDBMatchMap = Collections.synchronizedMap(new HashMap>()); + // DBScanner is owned by exactly one RunMSGFPlus / ConcurrentMSGFDB task. + // No internal fork-out (verified: no ExecutorService / Thread creation in + // dbSearch). Plain HashMap is enough; the synchronized wrappers were + // defensive against a sharing pattern that does not occur in production. 
+ specKeyDBMatchMap = new HashMap<>(); + specIndexDBMatchMap = new HashMap<>(); progress = null; output = System.out; @@ -116,7 +120,7 @@ public DBScanner setThreadName(String threadName) { return this; } - public synchronized void addDBMatches(Map> map) { + public void addDBMatches(Map> map) { if (map == null) return; Iterator>> itr = map.entrySet().iterator(); @@ -668,7 +672,7 @@ public void computeSpecEValue(boolean storeScoreDist, int fromIndex, int toIndex } } - public synchronized void generateSpecIndexDBMatchMap() { + public void generateSpecIndexDBMatchMap() { Iterator>> itr = specKeyDBMatchMap.entrySet().iterator(); int numPeptidesPerSpec = this.numPeptidesPerSpec; @@ -728,7 +732,7 @@ public synchronized void generateSpecIndexDBMatchMap() { } } - public synchronized void addResultsToList(List resultList) { + public void addResultsToList(List resultList) { Iterator>> itr = specIndexDBMatchMap.entrySet().iterator(); while (itr.hasNext()) { Entry> entry = itr.next(); @@ -761,7 +765,7 @@ public void addAdditionalFeatures() { } // for MS-GFDB - public synchronized void addDBSearchResults(List gen, String specFileName, boolean replicateMergedResults) { + public void addDBSearchResults(List gen, String specFileName, boolean replicateMergedResults) { Map> specIndexDBMatchMap = new HashMap>(); Iterator>> itr = specKeyDBMatchMap.entrySet().iterator(); diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ScoredSpectraMap.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ScoredSpectraMap.java index 66c06d63..821baa7f 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ScoredSpectraMap.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ScoredSpectraMap.java @@ -60,16 +60,16 @@ public ScoredSpectraMap( this.specDataType = specDataType; this.precursorMassShiftPpm = precursorMassShiftPpm; - pepMassSpecKeyMap = Collections.synchronizedSortedMap((new TreeMap())); - specKeyScorerMap = Collections.synchronizedMap(new HashMap>()); - specIndexChargeToSpecKeyMap = 
Collections.synchronizedMap(new HashMap, SpecKey>()); - -// // To support spectrum-specific tolerance -// if(supportSpectrumSpecificErrorTolerance) -// specKeyToleranceMap = Collections.synchronizedMap(new HashMap()); + // Each ScoredSpectraMap is owned by exactly one RunMSGFPlus task (or the + // MassCalibrator pre-pass, also single-threaded). The synchronized wrappers + // these maps used to carry were defensive against a sharing pattern that + // does not occur in production code paths. Plain Map/SortedMap is enough. + pepMassSpecKeyMap = new TreeMap<>(); + specKeyScorerMap = new HashMap<>(); + specIndexChargeToSpecKeyMap = new HashMap<>(); if (storeRankScorer) - specKeyRankScorerMap = Collections.synchronizedMap(new HashMap()); + specKeyRankScorerMap = new HashMap<>(); progress = null; } From bfea7be2b4e1f7a5f9fd35e049fdf0e11e2a0c34 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 23:56:15 +0100 Subject: [PATCH 03/34] perf(search): per-task result buffers; drop shared synchronizedList MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the global synchronizedList in MSGFPlus.runMSGFPlus with per-task local ArrayLists. Each RunMSGFPlus owns its own buffer (filled inside run() by scanner.addResultsToList) and exposes it through a new getResults() accessor. The main thread drains all per-task buffers into a plain ArrayList after awaitTermination — single-threaded, no lock needed because executor termination provides happens-before on every worker's writes (JLS §17.4.5). API change: RunMSGFPlus constructor drops the resultList parameter (was the shared list); the caller no longer needs to pre-allocate one. Also drops the Collections.synchronizedList wrapper around the SpecKey sublist passed into ScoredSpectraMap. 
With ScoredSpectraMap now task-local (previous commit) and the parent specKeyList no longer mutated after task partitioning, the sublist view is read-only from exactly one thread; the synchronized wrapper served no purpose. Expected wall improvement: 2-8% scaling with PSM count. Astral finishes around 90K raw PSMs across 4 threads — adds at the end of each task were under one mutex previously; now they're not. TestConcurrentMSGFPlus updated for the new constructor signature plus an additional assertion on the result buffer state. Scoped tests: 65/65 green. Full Astral 3-arm parity check deferred to land alongside T2/T3 — a single benchmark run after the structural changes are stable, comparing all three commits' output to dev. --- .../msjava/msdbsearch/ConcurrentMSGFPlus.java | 16 +++++++++++++--- src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java | 15 ++++++++++++--- .../msdbsearch/TestConcurrentMSGFPlus.java | 4 ++-- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java index 4605b6ba..41ca9a24 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java @@ -32,13 +32,24 @@ public static class RunMSGFPlus implements Runnable, ProgressReporter { private final Supplier specScannerSupplier; private final CompactSuffixArray sa; SearchParams params; - List resultList; + /** Task-local result buffer. Each task fills its own list inside + * {@link #run()} and exposes it via {@link #getResults()} for the + * main thread to drain after {@code awaitTermination}. Replaces the + * prior shared-synchronizedList-with-N-writers pattern. 
*/ + private final List resultList; private final int taskNum; private ProgressData progress; private ScoredSpectraMap specScanner; private DBScanner scanner; private volatile TaskWallStats wallStats; + /** Drain the task-local result buffer. Safe to call from another + * thread after the executor has terminated; awaitTermination + * provides happens-before on the buffer's writes. */ + public List getResults() { + return resultList; + } + /** Wall stats captured at the end of {@link #run()}, or {@code null} * if the task didn't complete (e.g. interrupted). Read from the main * thread after {@code awaitTermination}. */ @@ -60,13 +71,12 @@ public RunMSGFPlus( Supplier specScannerSupplier, CompactSuffixArray sa, SearchParams params, - List resultList, int taskNum ) { + this.resultList = new java.util.ArrayList<>(); this.specScannerSupplier = specScannerSupplier; this.sa = sa; this.params = params; - this.resultList = resultList; this.taskNum = taskNum; progress = null; } diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index c2c11c74..733f40e6 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -356,7 +356,9 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } double precursorMassShiftPpm = currentIoFiles.getPrecursorMassShiftPpm(); - List resultList = Collections.synchronizedList(new ArrayList()); + // Drained from per-task buffers after awaitTermination; no shared + // mutation during the search itself. 
+ List resultList = new ArrayList<>(); int toIndexGlobal = specSize; while (toIndexGlobal < specSize) { @@ -438,7 +440,7 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o () -> { ScoredSpectraMap specScanner = new ScoredSpectraMap( specAcc, - Collections.synchronizedList(specKeyList.subList(taskStartIndex, taskEndIndex)), + specKeyList.subList(taskStartIndex, taskEndIndex), leftPrecursorMassTolerance, rightPrecursorMassTolerance, minIsotopeError, @@ -454,7 +456,6 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o }, sa, params, - resultList, taskNum ); @@ -484,6 +485,14 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o // Output completed progress report. executor.outputProgressReport(); + // Drain per-task result buffers into the global resultList. Done + // single-threaded after awaitTermination — the executor's termination + // provides happens-before on every task's writes (JLS §17.4.5), so + // no synchronization is needed on the per-task ArrayList. + for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { + resultList.addAll(t.getResults()); + } + // T1: tail-imbalance summary across the just-completed tasks. // Cheap diagnostic; only printed when there's more than one task. 
if (numTasks > 1) { diff --git a/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java b/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java index 9cb5b715..98a0d018 100644 --- a/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java +++ b/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java @@ -3,7 +3,6 @@ import org.junit.Assert; import org.junit.Test; -import java.util.Collections; import java.util.concurrent.atomic.AtomicInteger; public class TestConcurrentMSGFPlus { @@ -18,11 +17,12 @@ public void defersScoredSpectraMapConstructionUntilRun() { }, null, null, - Collections.emptyList(), 1 ); Assert.assertEquals(0, buildCount.get()); + Assert.assertNotNull("Per-task result buffer must exist before run()", task.getResults()); + Assert.assertTrue("Per-task result buffer starts empty", task.getResults().isEmpty()); try { task.run(); From c3afe118520338515a81b3c959b80e34c773c27f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 23:59:09 +0100 Subject: [PATCH 04/34] =?UTF-8?q?perf(search):=20T2=20=E2=80=94=20make=20n?= =?UTF-8?q?umTasks-per-thread=20multiplier=20configurable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The static partitioning at numTasks = Math.min(numThreads * 3, ...) is a heuristic that can leave cores idle near the end of a search if the SpecKey distribution by precursor mass is uneven. Add a sysprop -Dmsgfplus.numTasksPerThread=N (default 3) so users can raise it on datasets where T1's tail-gap summary shows real imbalance. Higher N → smaller tasks → faster steady-state load balancing via the shared executor queue, at the cost of slightly more per-task heap (each new task allocates a ScoredSpectraMap and DBScanner). With 4 threads and N=8 instead of N=3, Astral goes from 12 tasks to 32 — at ~50 MB per-task state that's still well under the 8 GB heap. Default unchanged (3) so existing runs see no behavior shift. 
The flag is opt-in for tuning. Combined with T1's tail-imbalance summary, operators can iterate: read tail_gap, raise tasksPerThread, re-run, re-read tail_gap. Bit-identity safety: numTasks affects partitioning of specKeyList but not result content (results are sorted post-search via Collections.sort(resultList)). Raw target/decoy counts must remain identical regardless of N. Will verify on Astral. Scoped tests green: 69/69. --- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index 733f40e6..f52226fc 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -377,7 +377,13 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o ThreadPoolExecutorWithExceptions executor = ThreadPoolExecutorWithExceptions.newFixedThreadPool(numThreads); executor.setTaskName("Search"); - int numTasks = Math.min(numThreads * 3, Math.round((float) specSize / spectraPerTaskMinimum)); + // T2: numTasks-per-thread multiplier is configurable via + // -Dmsgfplus.numTasksPerThread=N. Higher values reduce tail + // imbalance when SpecKey distribution is uneven (heavier tasks + // get stolen earlier from the queue) at the cost of slightly more + // per-task heap. Default 3 keeps the prior behavior. + int tasksPerThread = resolveTasksPerThread(); + int numTasks = Math.min(numThreads * tasksPerThread, Math.round((float) specSize / spectraPerTaskMinimum)); if (numThreads <= 1) { numTasks = 1; } @@ -561,6 +567,21 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o return null; } + /** Sysprop overriding the numTasks-per-thread multiplier (T2). 
*/ + static final String TASKS_PER_THREAD_PROPERTY = "msgfplus.numTasksPerThread"; + static final int DEFAULT_TASKS_PER_THREAD = 3; + + private static int resolveTasksPerThread() { + String v = System.getProperty(TASKS_PER_THREAD_PROPERTY); + if (v != null) { + try { + int n = Integer.parseInt(v.trim()); + if (n >= 1) return n; + } catch (NumberFormatException ignored) { /* fall through */ } + } + return DEFAULT_TASKS_PER_THREAD; + } + /** * Print a one-line tail-imbalance summary across all completed tasks. * Reports min / median / p95 / max wall in seconds and the absolute tail From 47cf7cfd0cd5c18ec6e80dc6d6863e2f49d66b97 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 00:02:28 +0100 Subject: [PATCH 05/34] =?UTF-8?q?perf(search):=20T3=20=E2=80=94=20opt-in?= =?UTF-8?q?=20ForkJoinPool=20path=20via=20-Dmsgfplus.useForkJoin=3Dtrue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an alternative search executor (ForkJoinPool sized to numThreads) behind a sysprop toggle. When -Dmsgfplus.useForkJoin=true is set, RunMSGFPlus tasks are submitted to the FJP via submit(), Futures are walked after awaitTermination, and any task exception is propagated through Future.get(). Default behavior unchanged: without the flag, ThreadPoolExecutorWith- Exceptions remains in use. The custom executor's progress reporting and exception-capture-via-afterExecute machinery is preserved on the default path; the FJP path uses Future.get() for exception propagation instead. Why this is opt-in rather than the new default: - Our task model is flat (no nested fork-join), so FJP's per-thread deque + work-stealing advantage over ThreadPoolExecutor's shared queue is small. Most of the load-balance benefit already comes from T2's queue oversubscription (numTasks = numThreads * tasksPerThread). 
- Progress reporting (afterExecute hook polling per-task ProgressData) is non-trivial to port to FJP, which has different lifecycle hooks. Re-implementing for FJP would add code surface for marginal value. - Lets us A/B-measure on Astral / TMT / metaproteomic before deciding whether to make FJP the default. Catch blocks updated to call shutdownNow on whichever pool is active. Validation note: this commit changes task scheduling but not result content; resultList is sorted post-search so order between tasks doesn't affect output. Bit-identity must hold on both paths and will be verified in the upcoming Astral 3-arm parity check. Scoped tests green: 69/69. --- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 74 ++++++++++++++----- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index f52226fc..d29b4901 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -20,6 +20,8 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; @@ -373,9 +375,22 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o System.out.println("Spectrum 0-" + (toIndexGlobal - 1) + " (total: " + specSize + ")"); - // Thread pool - ThreadPoolExecutorWithExceptions executor = ThreadPoolExecutorWithExceptions.newFixedThreadPool(numThreads); - executor.setTaskName("Search"); + // T3: -Dmsgfplus.useForkJoin=true swaps the search executor to a + // ForkJoinPool. For our flat-task model the wall-clock difference vs + // ThreadPoolExecutor + numThreads*N tasks is small (the queue + // oversubscription already gets us most of the load-balance benefit). 
+ // Kept as opt-in so it can be A/B-tested on uneven workloads. Default + // path is unchanged. + boolean useForkJoin = Boolean.getBoolean("msgfplus.useForkJoin"); + + // Default thread pool (with progress reporting + exception capture). + ThreadPoolExecutorWithExceptions executor = + useForkJoin ? null : ThreadPoolExecutorWithExceptions.newFixedThreadPool(numThreads); + if (executor != null) executor.setTaskName("Search"); + // FJP-mode pool, used when useForkJoin=true. Sized to numThreads to + // match the default executor's parallelism. + ForkJoinPool fjp = useForkJoin ? new ForkJoinPool(numThreads) : null; + List> fjpFutures = useForkJoin ? new ArrayList<>() : null; // T2: numTasks-per-thread multiplier is configurable via // -Dmsgfplus.numTasksPerThread=N. Higher values reduce tail @@ -469,27 +484,50 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o if (DISABLE_THREADING) { msgfplusExecutor.run(); + } else if (useForkJoin) { + fjpFutures.add(fjp.submit(msgfplusExecutor)); } else { executor.execute(msgfplusExecutor); } } - // Output initial progress report. - executor.outputProgressReport(); - executor.shutdown(); - - try { - executor.awaitTerminationWithExceptions(Long.MAX_VALUE, TimeUnit.NANOSECONDS); - } catch (InterruptedException e) { - if (!executor.HasThrownData()) { - e.printStackTrace(); + if (useForkJoin) { + // FJP path: submit + drain Futures for exception propagation. 
+ fjp.shutdown(); + try { + fjp.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, e.getMessage(), e); } - } + for (Future f : fjpFutures) { + try { f.get(); } + catch (java.util.concurrent.ExecutionException ex) { + Throwable cause = ex.getCause(); + Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, cause.getMessage(), cause); + return "Search failed: " + cause.getMessage(); + } + catch (InterruptedException ex) { Thread.currentThread().interrupt(); } + } + } else { + // Output initial progress report. + executor.outputProgressReport(); - // Output completed progress report. - executor.outputProgressReport(); + executor.shutdown(); + + try { + executor.awaitTerminationWithExceptions(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + } catch (InterruptedException e) { + if (!executor.HasThrownData()) { + e.printStackTrace(); + Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, e.getMessage(), e); + } + } + + // Output completed progress report. + executor.outputProgressReport(); + } // Drain per-task result buffers into the global resultList. Done // single-threaded after awaitTermination — the executor's termination @@ -508,7 +546,7 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } catch (OutOfMemoryError ex) { ex.printStackTrace(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - executor.shutdownNow(); + if (executor != null) executor.shutdownNow(); else if (fjp != null) fjp.shutdownNow(); int taskMult = numTasks / numThreads; return "Task terminated; results incomplete. 
Please run again with a greater amount of memory, using \"-Xmx4G\", for example.\n" + "\tYou can also use less memory by increasing the number of tasks used for the search, at the cost of more time.\n" + @@ -516,12 +554,12 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } catch (Exception ex) { ex.printStackTrace(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - executor.shutdownNow(); + if (executor != null) executor.shutdownNow(); else if (fjp != null) fjp.shutdownNow(); return "Task terminated; results incomplete. Please run again."; } catch (Throwable ex) { ex.printStackTrace(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - executor.shutdownNow(); + if (executor != null) executor.shutdownNow(); else if (fjp != null) fjp.shutdownNow(); return "Task terminated; results incomplete. Please run again."; } From a3f48fc9b4624166059d8a2f187f65288e600479 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 07:23:24 +0100 Subject: [PATCH 06/34] perf(search): tighter result-buffer merge + drainResultsTo + reused null sink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small follow-ups to the per-task-buffer commit that pay back the allocation churn the buffer change shifted around: - ConcurrentMSGFPlus: cache a static NULL_PRINT_STREAM instead of allocating one per RunMSGFPlus.run() invocation. Same NullOutputStream sink either way; we just stop newing it 12+ times per search. - ConcurrentMSGFPlus.RunMSGFPlus: add drainResultsTo(dest) and getResultCount(). Drain transfers the task-local buffer's contents to the destination AND clears the local list, so the per-task heap is released immediately after merge instead of dangling on the task reference until the next major GC. 
- MSGFPlus.runMSGFPlus: pre-size the merged ArrayList based on sum(t.getResultCount()) before draining, avoiding the resize-and-copy cycle that addAll otherwise triggers as it doubles the backing array. Then submittedTasks.clear() drops the strong refs to all 12 task instances right after the wall summary, so their per-task heap (specScanner, scanner, aaMass arrays, etc.) is collectible before the FDR / write phase. TestConcurrentMSGFPlus: new drainsTaskLocalResultsIntoCallerBuffer test pins the new drain semantics — destination receives all entries, source is empty afterwards. Astral 3-arm parity (clean re-run, post-Rancher-shutdown system): armA wall=848.8s targets=89479 decoys=46792 armB wall=752.2s targets=89479 decoys=46792 (-11% vs A) armC wall=798.2s targets=89360 decoys=46913 (-6% vs A) Percolator at 1% FDR: armB 35818 / armC 35767 — both bit-identical to dev. All 8 parity numbers match exactly. Scoped tests: 59/59 green. --- .../msjava/msdbsearch/ConcurrentMSGFPlus.java | 15 ++++++- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 42 +++++++++++-------- .../msdbsearch/TestConcurrentMSGFPlus.java | 21 ++++++++++ 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java index 41ca9a24..f418c7bf 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java @@ -10,6 +10,8 @@ import org.apache.commons.io.output.NullOutputStream; public class ConcurrentMSGFPlus { + private static final PrintStream NULL_PRINT_STREAM = new PrintStream(new NullOutputStream()); + /** Per-task wall stats captured during {@link RunMSGFPlus#run()}. Used by * {@code MSGFPlus.runMSGFPlus} to report a tail-imbalance summary across * all tasks (T1 instrumentation). All units are milliseconds. 
*/ @@ -50,6 +52,15 @@ public List getResults() { return resultList; } + public int getResultCount() { + return resultList.size(); + } + + public void drainResultsTo(List destination) { + destination.addAll(resultList); + resultList.clear(); + } + /** Wall stats captured at the end of {@link #run()}, or {@code null} * if the task didn't complete (e.g. interrupted). Read from the main * thread after {@code awaitTermination}. */ @@ -110,7 +121,7 @@ public void run() { if (params.getVerbose()) { output = System.out; } else { - output = new PrintStream(new NullOutputStream()); + output = NULL_PRINT_STREAM; } progress.stepRange(5.0); @@ -204,6 +215,8 @@ public void run() { // gen.addSpectrumIdentificationResults(scanner.getSpecIndexDBMatchMap()); long totalMs = (System.nanoTime() - taskStartNs) / 1_000_000L; wallStats = new TaskWallStats(taskNum, preprocessMs, dbSearchMs, computeEvalueMs, totalMs); + scanner = null; + specScanner = null; output.println(threadName + ": Task " + taskNum + " completed."); } } diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index d29b4901..23f6c0a1 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -358,9 +358,9 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } double precursorMassShiftPpm = currentIoFiles.getPrecursorMassShiftPpm(); - // Drained from per-task buffers after awaitTermination; no shared - // mutation during the search itself. - List resultList = new ArrayList<>(); + // Drained from per-task buffers after awaitTermination; no shared + // mutation during the search itself. + List resultList; int toIndexGlobal = specSize; while (toIndexGlobal < specSize) { @@ -529,21 +529,27 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o executor.outputProgressReport(); } - // Drain per-task result buffers into the global resultList. 
Done - // single-threaded after awaitTermination — the executor's termination - // provides happens-before on every task's writes (JLS §17.4.5), so - // no synchronization is needed on the per-task ArrayList. - for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { - resultList.addAll(t.getResults()); - } - - // T1: tail-imbalance summary across the just-completed tasks. - // Cheap diagnostic; only printed when there's more than one task. - if (numTasks > 1) { - printTaskWallSummary(submittedTasks); - } - - } catch (OutOfMemoryError ex) { + // Drain per-task result buffers into the global resultList. Done + // single-threaded after awaitTermination — the executor's termination + // provides happens-before on every task's writes (JLS §17.4.5), so + // no synchronization is needed on the per-task ArrayList. + int totalResults = 0; + for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { + totalResults += t.getResultCount(); + } + resultList = new ArrayList<>(totalResults); + for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { + t.drainResultsTo(resultList); + } + + // T1: tail-imbalance summary across the just-completed tasks. + // Cheap diagnostic; only printed when there's more than one task. 
+ if (numTasks > 1) { + printTaskWallSummary(submittedTasks); + } + submittedTasks.clear(); + + } catch (OutOfMemoryError ex) { ex.printStackTrace(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); if (executor != null) executor.shutdownNow(); else if (fjp != null) fjp.shutdownNow(); diff --git a/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java b/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java index 98a0d018..83fcadf8 100644 --- a/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java +++ b/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java @@ -3,6 +3,8 @@ import org.junit.Assert; import org.junit.Test; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.atomic.AtomicInteger; public class TestConcurrentMSGFPlus { @@ -33,4 +35,23 @@ public void defersScoredSpectraMapConstructionUntilRun() { Assert.assertEquals(1, buildCount.get()); } + + @Test + public void drainsTaskLocalResultsIntoCallerBuffer() { + ConcurrentMSGFPlus.RunMSGFPlus task = new ConcurrentMSGFPlus.RunMSGFPlus( + () -> null, + null, + null, + 1 + ); + + task.getResults().add(null); + task.getResults().add(null); + + List merged = new ArrayList<>(); + task.drainResultsTo(merged); + + Assert.assertEquals(2, merged.size()); + Assert.assertTrue("Drain should clear the task-local buffer", task.getResults().isEmpty()); + } } From 1b7b5ddedeaf7dfb9b3e6cc3b76b3cf7dc33b1f9 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 07:38:54 +0100 Subject: [PATCH 07/34] perf(msgfdb): drop redundant synchronizedList on per-task SpecKey sublist MSGFDB.java mirrored the same pattern PR #25 fixed in MSGFPlus.java: each task's ScoredSpectraMap was constructed with a Collections.synchronizedList(specKeyList.subList(...)) wrapper. 
The sublist is a read-only view of the parent specKeyList, accessed only from the single worker thread that owns the task's ScoredSpectraMap; the synchronization wrapper served no purpose. This is the legacy MS-GFDB CLI path, kept consistent with the matching cleanup landed in MSGFPlus. Deliberately NOT changed in this commit: the global resultList at MSGFDB.java:296 is still Collections.synchronizedList(...). Unlike MSGFPlus, the MSGFDB path still has N worker threads concurrently calling scanner.addDBSearchResults(sharedList, ...). The previous PR #25 commit (957f6c2) removed the `synchronized` modifier from DBScanner.addDBSearchResults; the synchronizedList wrapper is now what keeps individual gen.add(...) calls atomic. Removing it without also doing the per-task-buffer refactor that MSGFPlus got would introduce a real data race on the gen list. The full per-task-buffer refactor for MSGFDB (matching what MSGFPlus has via getResults() + drainResultsTo()) is a follow-up PR. MSGFDB doesn't have an Astral-style parity benchmark to validate against, so it deserves its own focused validation cycle rather than piggy-backing here. 
--- src/main/java/edu/ucsd/msjava/ui/MSGFDB.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java b/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java index a630a91b..c8a65178 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java @@ -320,7 +320,7 @@ private static String runMSGFDB(File specFile, SpecFileFormat specFormat, File o for (int i = 0; i < numThreads; i++) { ScoredSpectraMap specScanner = new ScoredSpectraMap( specAcc, - Collections.synchronizedList(specKeyList.subList(startIndex[i], endIndex[i])), + specKeyList.subList(startIndex[i], endIndex[i]), leftPrecursorMassTolerance, rightPrecursorMassTolerance, numAllowedC13, From 2673d08bb8c56ac0c353d3e891932d6414a07153 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 07:47:35 +0100 Subject: [PATCH 08/34] refactor(search): simplify per /simplify review (-43 LOC, no behavior change) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup pass over the perf/search-sync-cleanup branch. No behavior change; tests still pass; net -43 LOC. ConcurrentMSGFPlus.java - TaskWallStats: 14-line final class → 2-line Java 17 record. Same immutable shape, less ceremony. - Drop volatile on the wallStats field — the only reader is the main thread after executor.awaitTermination, which establishes happens- before per JLS §17.4.5. The volatile-plus-comment was self- contradictory. - Trim restate-the-code Javadocs from getResults / getResultCount / drainResultsTo / getWallStats and the resultList field. Replaced with a one-line WHY comment on the wallStats field about the happens-before rationale. - Use the short-form ArrayList import instead of the fully-qualified java.util.ArrayList<>(). MSGFPlus.java - Promote "msgfplus.useForkJoin" to USE_FORK_JOIN_PROPERTY constant alongside the existing TASKS_PER_THREAD_PROPERTY pattern. 
Move both constants near DISABLE_THREADING at the top of the class to match CompactSuffixArray.SA_BUILD_THREADS_PROPERTY convention. - Extract shutdownPoolNow(executor, fjp) helper. Three identical catch-block ternaries collapse to one call each. - Add fjp.shutdownNow() on the FJP early-return-on-Future-failure path so any still-running tasks get interrupted instead of running to completion in the background. - Drop the T1/T2/T3 planning-doc tags from comments — those are internal task-numbers from the work plan that don't belong in source per the project's CLAUDE.md "no comments referencing the task / caller" rule. - Trim "default thread pool", "FJP-mode pool", "retain task references for the tail summary", and "T1: tail-imbalance summary" labels that restated the code. Kept the JLS §17.4.5 happens-before comment before the drain loop (genuine non-obvious WHY) and the deferred- ScoredSpectraMap-construction comment. Skipped from the review: - SysProps.intOr extraction across resolveTasksPerThread + CompactSuffixArray.resolveSortThreads — only two copies; per CLAUDE.md "three similar lines is better than premature abstraction." - Median helper extraction — MassErrorStat.median takes List not List; wrapping is more code than the inline version. - Per-task ArrayList pre-size from subListSize — would re-add a constructor parameter that the per-task-buffer commit deliberately removed. Scoped tests green: 66/66. 
--- .../msjava/msdbsearch/ConcurrentMSGFPlus.java | 38 ++----- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 101 +++++++----------- 2 files changed, 48 insertions(+), 91 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java index f418c7bf..8943afc8 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java @@ -4,6 +4,7 @@ import edu.ucsd.msjava.misc.ProgressReporter; import java.io.PrintStream; +import java.util.ArrayList; import java.util.List; import java.util.function.Supplier; @@ -12,42 +13,24 @@ public class ConcurrentMSGFPlus { private static final PrintStream NULL_PRINT_STREAM = new PrintStream(new NullOutputStream()); - /** Per-task wall stats captured during {@link RunMSGFPlus#run()}. Used by - * {@code MSGFPlus.runMSGFPlus} to report a tail-imbalance summary across - * all tasks (T1 instrumentation). All units are milliseconds. */ - public static final class TaskWallStats { - public final int taskNum; - public final long preprocessMs; - public final long dbSearchMs; - public final long computeEvalueMs; - public final long totalMs; - TaskWallStats(int taskNum, long preprocessMs, long dbSearchMs, long computeEvalueMs, long totalMs) { - this.taskNum = taskNum; - this.preprocessMs = preprocessMs; - this.dbSearchMs = dbSearchMs; - this.computeEvalueMs = computeEvalueMs; - this.totalMs = totalMs; - } - } + /** Per-task wall stats in milliseconds. {@code null} if the task didn't + * complete (interrupted). */ + public record TaskWallStats(int taskNum, long preprocessMs, long dbSearchMs, + long computeEvalueMs, long totalMs) {} public static class RunMSGFPlus implements Runnable, ProgressReporter { private final Supplier specScannerSupplier; private final CompactSuffixArray sa; SearchParams params; - /** Task-local result buffer. 
Each task fills its own list inside - * {@link #run()} and exposes it via {@link #getResults()} for the - * main thread to drain after {@code awaitTermination}. Replaces the - * prior shared-synchronizedList-with-N-writers pattern. */ private final List resultList; private final int taskNum; private ProgressData progress; private ScoredSpectraMap specScanner; private DBScanner scanner; - private volatile TaskWallStats wallStats; + // Written once at end of run(); read by the main thread only after + // executor.awaitTermination, which establishes happens-before. + private TaskWallStats wallStats; - /** Drain the task-local result buffer. Safe to call from another - * thread after the executor has terminated; awaitTermination - * provides happens-before on the buffer's writes. */ public List getResults() { return resultList; } @@ -61,9 +44,6 @@ public void drainResultsTo(List destination) { resultList.clear(); } - /** Wall stats captured at the end of {@link #run()}, or {@code null} - * if the task didn't complete (e.g. interrupted). Read from the main - * thread after {@code awaitTermination}. 
*/ public TaskWallStats getWallStats() { return wallStats; } @@ -84,7 +64,7 @@ public RunMSGFPlus( SearchParams params, int taskNum ) { - this.resultList = new java.util.ArrayList<>(); + this.resultList = new ArrayList<>(); this.specScannerSupplier = specScannerSupplier; this.sa = sa; this.params = params; diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index 23f6c0a1..c79475e5 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -37,6 +37,10 @@ public class MSGFPlus { // Set this to true when debugging private static final boolean DISABLE_THREADING = false; + private static final String TASKS_PER_THREAD_PROPERTY = "msgfplus.numTasksPerThread"; + private static final int DEFAULT_TASKS_PER_THREAD = 3; + private static final String USE_FORK_JOIN_PROPERTY = "msgfplus.useForkJoin"; + // Snapshot of the original CLI argv, captured in main() so that // RunManifestWriter can record it alongside the mzid without // threading argv through runMSGFPlus's many call sites. @@ -358,9 +362,7 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } double precursorMassShiftPpm = currentIoFiles.getPrecursorMassShiftPpm(); - // Drained from per-task buffers after awaitTermination; no shared - // mutation during the search itself. - List resultList; + List resultList; int toIndexGlobal = specSize; while (toIndexGlobal < specSize) { @@ -375,28 +377,14 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o System.out.println("Spectrum 0-" + (toIndexGlobal - 1) + " (total: " + specSize + ")"); - // T3: -Dmsgfplus.useForkJoin=true swaps the search executor to a - // ForkJoinPool. For our flat-task model the wall-clock difference vs - // ThreadPoolExecutor + numThreads*N tasks is small (the queue - // oversubscription already gets us most of the load-balance benefit). 
- // Kept as opt-in so it can be A/B-tested on uneven workloads. Default - // path is unchanged. - boolean useForkJoin = Boolean.getBoolean("msgfplus.useForkJoin"); + boolean useForkJoin = Boolean.getBoolean(USE_FORK_JOIN_PROPERTY); - // Default thread pool (with progress reporting + exception capture). ThreadPoolExecutorWithExceptions executor = useForkJoin ? null : ThreadPoolExecutorWithExceptions.newFixedThreadPool(numThreads); if (executor != null) executor.setTaskName("Search"); - // FJP-mode pool, used when useForkJoin=true. Sized to numThreads to - // match the default executor's parallelism. ForkJoinPool fjp = useForkJoin ? new ForkJoinPool(numThreads) : null; List> fjpFutures = useForkJoin ? new ArrayList<>() : null; - // T2: numTasks-per-thread multiplier is configurable via - // -Dmsgfplus.numTasksPerThread=N. Higher values reduce tail - // imbalance when SpecKey distribution is uneven (heavier tasks - // get stolen earlier from the queue) at the cost of slightly more - // per-task heap. Default 3 keeps the prior behavior. int tasksPerThread = resolveTasksPerThread(); int numTasks = Math.min(numThreads * tasksPerThread, Math.round((float) specSize / spectraPerTaskMinimum)); if (numThreads <= 1) { @@ -444,8 +432,6 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } } - // Retain task references so we can pull TaskWallStats after termination - // for the tail-imbalance summary (T1 instrumentation). List submittedTasks = new ArrayList<>(numTasks); try { @@ -455,8 +441,8 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o final boolean storeRankScorer = params.outputAdditionalFeatures(); final int taskNum = i + 1; - // Defer ScoredSpectraMap construction to the worker thread so all - // tasks' spectrum heaps aren't allocated up front when queued. + // Defer ScoredSpectraMap construction to the worker so the + // per-task spectrum heap isn't queued up front. 
ConcurrentMSGFPlus.RunMSGFPlus msgfplusExecutor = new ConcurrentMSGFPlus.RunMSGFPlus( () -> { ScoredSpectraMap specScanner = new ScoredSpectraMap( @@ -493,7 +479,6 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } if (useForkJoin) { - // FJP path: submit + drain Futures for exception propagation. fjp.shutdown(); try { fjp.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); @@ -506,16 +491,14 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o catch (java.util.concurrent.ExecutionException ex) { Throwable cause = ex.getCause(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, cause.getMessage(), cause); + fjp.shutdownNow(); return "Search failed: " + cause.getMessage(); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); } } } else { - // Output initial progress report. executor.outputProgressReport(); - executor.shutdown(); - try { executor.awaitTerminationWithExceptions(Long.MAX_VALUE, TimeUnit.NANOSECONDS); } catch (InterruptedException e) { @@ -524,35 +507,30 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, e.getMessage(), e); } } - - // Output completed progress report. executor.outputProgressReport(); } - // Drain per-task result buffers into the global resultList. Done - // single-threaded after awaitTermination — the executor's termination - // provides happens-before on every task's writes (JLS §17.4.5), so - // no synchronization is needed on the per-task ArrayList. - int totalResults = 0; - for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { - totalResults += t.getResultCount(); - } - resultList = new ArrayList<>(totalResults); - for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { - t.drainResultsTo(resultList); - } - - // T1: tail-imbalance summary across the just-completed tasks. 
- // Cheap diagnostic; only printed when there's more than one task. - if (numTasks > 1) { - printTaskWallSummary(submittedTasks); - } - submittedTasks.clear(); - - } catch (OutOfMemoryError ex) { + // awaitTermination above establishes happens-before on every + // task's writes (JLS §17.4.5), so the per-task ArrayLists can + // be drained single-threaded with no synchronization. + int totalResults = 0; + for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { + totalResults += t.getResultCount(); + } + resultList = new ArrayList<>(totalResults); + for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { + t.drainResultsTo(resultList); + } + + if (numTasks > 1) { + printTaskWallSummary(submittedTasks); + } + submittedTasks.clear(); + + } catch (OutOfMemoryError ex) { ex.printStackTrace(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - if (executor != null) executor.shutdownNow(); else if (fjp != null) fjp.shutdownNow(); + shutdownPoolNow(executor, fjp); int taskMult = numTasks / numThreads; return "Task terminated; results incomplete. Please run again with a greater amount of memory, using \"-Xmx4G\", for example.\n" + "\tYou can also use less memory by increasing the number of tasks used for the search, at the cost of more time.\n" + @@ -560,12 +538,12 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } catch (Exception ex) { ex.printStackTrace(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - if (executor != null) executor.shutdownNow(); else if (fjp != null) fjp.shutdownNow(); + shutdownPoolNow(executor, fjp); return "Task terminated; results incomplete. Please run again."; } catch (Throwable ex) { ex.printStackTrace(); Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - if (executor != null) executor.shutdownNow(); else if (fjp != null) fjp.shutdownNow(); + shutdownPoolNow(executor, fjp); return "Task terminated; results incomplete. 
Please run again."; } @@ -611,10 +589,6 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o return null; } - /** Sysprop overriding the numTasks-per-thread multiplier (T2). */ - static final String TASKS_PER_THREAD_PROPERTY = "msgfplus.numTasksPerThread"; - static final int DEFAULT_TASKS_PER_THREAD = 3; - private static int resolveTasksPerThread() { String v = System.getProperty(TASKS_PER_THREAD_PROPERTY); if (v != null) { @@ -626,18 +600,21 @@ private static int resolveTasksPerThread() { return DEFAULT_TASKS_PER_THREAD; } + private static void shutdownPoolNow(ThreadPoolExecutorWithExceptions executor, ForkJoinPool fjp) { + if (executor != null) executor.shutdownNow(); + else if (fjp != null) fjp.shutdownNow(); + } + /** - * Print a one-line tail-imbalance summary across all completed tasks. - * Reports min / median / p95 / max wall in seconds and the absolute tail - * gap (max - median). A high gap indicates uneven SpecKey distribution - * across tasks — input for deciding whether finer task granularity (T2) - * or work-stealing (T3) would help. + * One-line wall-time summary across completed tasks. tail_gap (max - + * median) is the load-balance signal; high values point at uneven + * SpecKey distribution and motivate raising tasksPerThread. 
*/ private static void printTaskWallSummary(List tasks) { List walls = new ArrayList<>(tasks.size()); for (ConcurrentMSGFPlus.RunMSGFPlus t : tasks) { ConcurrentMSGFPlus.TaskWallStats s = t.getWallStats(); - if (s != null) walls.add(s.totalMs); + if (s != null) walls.add(s.totalMs()); } if (walls.isEmpty()) return; Collections.sort(walls); From 7f4b099afa64d2f633932f8d35d71ee46e0a3ef3 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 07:56:59 +0100 Subject: [PATCH 09/34] refactor(search): drop redundant -Dmsgfplus.numTasksPerThread sysprop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysprop introduced in commit c3afe11 (T2) duplicated functionality the existing CLI flag -tasks -N already provides. Both produce numTasks = numThreads * N when no -tasks is specified explicitly. Having two parallel knobs for the same thing is clutter, not feature. Removed: TASKS_PER_THREAD_PROPERTY constant, resolveTasksPerThread() helper. The default multiplier survives as DEFAULT_TASKS_PER_THREAD constant, used inline. Users who want a different multiplier should use -tasks -N. The ParamManager already documents this (negative argument means "tasks per thread"). The -Dmsgfplus.useForkJoin sysprop is kept — it's not a tuning knob, it's an A/B measurement toggle for an alternative executor we're not committing to. CLI exposure would imply users should reach for it; sysprop is the right level for an opt-in evaluation flag. Scoped tests green: 70/70. 
--- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index c79475e5..33c131c8 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -37,7 +37,8 @@ public class MSGFPlus { // Set this to true when debugging private static final boolean DISABLE_THREADING = false; - private static final String TASKS_PER_THREAD_PROPERTY = "msgfplus.numTasksPerThread"; + /** Default numTasks-per-thread multiplier when {@code -tasks} is not + * passed. Users can override at the CLI via {@code -tasks -N}. */ private static final int DEFAULT_TASKS_PER_THREAD = 3; private static final String USE_FORK_JOIN_PROPERTY = "msgfplus.useForkJoin"; @@ -385,8 +386,7 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o ForkJoinPool fjp = useForkJoin ? new ForkJoinPool(numThreads) : null; List> fjpFutures = useForkJoin ? 
new ArrayList<>() : null; - int tasksPerThread = resolveTasksPerThread(); - int numTasks = Math.min(numThreads * tasksPerThread, Math.round((float) specSize / spectraPerTaskMinimum)); + int numTasks = Math.min(numThreads * DEFAULT_TASKS_PER_THREAD, Math.round((float) specSize / spectraPerTaskMinimum)); if (numThreads <= 1) { numTasks = 1; } @@ -589,17 +589,6 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o return null; } - private static int resolveTasksPerThread() { - String v = System.getProperty(TASKS_PER_THREAD_PROPERTY); - if (v != null) { - try { - int n = Integer.parseInt(v.trim()); - if (n >= 1) return n; - } catch (NumberFormatException ignored) { /* fall through */ } - } - return DEFAULT_TASKS_PER_THREAD; - } - private static void shutdownPoolNow(ThreadPoolExecutorWithExceptions executor, ForkJoinPool fjp) { if (executor != null) executor.shutdownNow(); else if (fjp != null) fjp.shutdownNow(); @@ -608,7 +597,7 @@ private static void shutdownPoolNow(ThreadPoolExecutorWithExceptions executor, F /** * One-line wall-time summary across completed tasks. tail_gap (max - * median) is the load-balance signal; high values point at uneven - * SpecKey distribution and motivate raising tasksPerThread. + * SpecKey distribution and motivate raising the {@code -tasks -N} multiplier. 
*/ private static void printTaskWallSummary(List tasks) { List walls = new ArrayList<>(tasks.size()); From 9b742d7dedc87a1e1069c8a5f0dc3e448148fa2f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 09:20:09 +0100 Subject: [PATCH 10/34] refactor: regroup CLI/output/parser packages Move classes into clearer top-level packages so the CLI entry points, write-side outputs, and parser support are colocated: - ui.MSGFPlus, ui.MSGFDB -> cli.* - mzid.DirectPinWriter, DirectTSVWriter, Unimod, UnimodComposition -> output.* - net.pempek.unicode.UnicodeBOMInputStream -> parser.UnicodeBOMInputStream - mslibsearch.ProcessedSpectrum deleted (no remaining refs) Update the executable jar wiring to follow the new MSGFPlus location: - META-INF/MANIFEST.MF Main-Class - pom.xml mainClass entries (assembly + shade plugin) Pure rename + import-fix; no behavior change. clean compile + scoped tests pass (TestDirectPinWriter, TestMSUtils, TestSA, TestMisc, TestRunManifestWriter -- 40 tests, 0 failures, 0 errors). 
--- pom.xml | 4 +- .../edu/ucsd/msjava/{ui => cli}/MSGFDB.java | 2 +- .../edu/ucsd/msjava/{ui => cli}/MSGFPlus.java | 1244 ++++++++--------- .../edu/ucsd/msjava/fdr/ComputeQValue.java | 2 +- .../edu/ucsd/msjava/fdr/MSGFPlusPSMSet.java | 2 +- .../java/edu/ucsd/msjava/fdr/TSVPSMSet.java | 2 +- .../edu/ucsd/msjava/msdbsearch/BuildSA.java | 2 +- .../msdbsearch/CompactFastaSequence.java | 4 +- .../msjava/msdbsearch/PeptideEnumerator.java | 2 +- .../edu/ucsd/msjava/msdbsearch/ReverseDB.java | 2 +- .../msjava/mslibsearch/ProcessedSpectrum.java | 33 - .../edu/ucsd/msjava/msutil/AminoAcidSet.java | 2 +- .../java/edu/ucsd/msjava/msutil/Peptide.java | 2 +- .../{mzid => output}/DirectPinWriter.java | 2 +- .../{mzid => output}/DirectTSVWriter.java | 2 +- .../ucsd/msjava/{mzid => output}/Unimod.java | 142 +- .../{mzid => output}/UnimodComposition.java | 266 ++-- .../edu/ucsd/msjava/params/ParamManager.java | 2 +- .../msjava/parser/BufferedLineReader.java | 1 - .../BufferedRandomAccessLineReader.java | 11 +- .../msjava/parser}/UnicodeBOMInputStream.java | 590 ++++---- src/main/resources/META-INF/MANIFEST.MF | 2 +- .../msjava/msdbsearch/SearchParamsTest.java | 4 +- .../msgfplus/TestCandidatePeptideGrid.java | 2 +- ...datePeptideGridConsideringMetCleavage.java | 2 +- src/test/java/msgfplus/TestCollaboration.java | 2 +- .../java/msgfplus/TestDirectPinWriter.java | 10 +- src/test/java/msgfplus/TestIPRG.java | 2 +- src/test/java/msgfplus/TestMSUtils.java | 2 +- src/test/java/msgfplus/TestMisc.java | 2 +- src/test/java/msgfplus/TestPercolator.java | 2 +- .../msgfplus/TestPrecursorCalIntegration.java | 2 +- .../msgfplus/TestPrecursorCalScaffolding.java | 2 +- .../java/msgfplus/TestRunManifestWriter.java | 2 +- src/test/java/msgfplus/TestSA.java | 2 +- 35 files changed, 1161 insertions(+), 1196 deletions(-) rename src/main/java/edu/ucsd/msjava/{ui => cli}/MSGFDB.java (99%) rename src/main/java/edu/ucsd/msjava/{ui => cli}/MSGFPlus.java (97%) delete mode 100644 
src/main/java/edu/ucsd/msjava/mslibsearch/ProcessedSpectrum.java rename src/main/java/edu/ucsd/msjava/{mzid => output}/DirectPinWriter.java (99%) rename src/main/java/edu/ucsd/msjava/{mzid => output}/DirectTSVWriter.java (99%) rename src/main/java/edu/ucsd/msjava/{mzid => output}/Unimod.java (96%) rename src/main/java/edu/ucsd/msjava/{mzid => output}/UnimodComposition.java (96%) rename src/main/java/{net/pempak/unicode => edu/ucsd/msjava/parser}/UnicodeBOMInputStream.java (96%) diff --git a/pom.xml b/pom.xml index 935a2624..ceb3b8a4 100644 --- a/pom.xml +++ b/pom.xml @@ -38,7 +38,7 @@ true - edu.ucsd.msjava.ui.MSGFPlus + edu.ucsd.msjava.cli.MSGFPlus @@ -80,7 +80,7 @@ MSGFPlus - edu.ucsd.msjava.ui.MSGFPlus + edu.ucsd.msjava.cli.MSGFPlus diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java b/src/main/java/edu/ucsd/msjava/cli/MSGFDB.java similarity index 99% rename from src/main/java/edu/ucsd/msjava/ui/MSGFDB.java rename to src/main/java/edu/ucsd/msjava/cli/MSGFDB.java index c8a65178..d8a58442 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFDB.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.ui; +package edu.ucsd.msjava.cli; import edu.ucsd.msjava.msdbsearch.*; import edu.ucsd.msjava.msgf.MSGFDBResultGenerator; diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java similarity index 97% rename from src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java rename to src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java index 33c131c8..7f5dd3ca 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java @@ -1,622 +1,622 @@ -package edu.ucsd.msjava.ui; - -import edu.ucsd.msjava.fdr.ComputeFDR; -import edu.ucsd.msjava.misc.MSGFLogger; -import edu.ucsd.msjava.misc.RunManifestWriter; -import edu.ucsd.msjava.misc.ThreadPoolExecutorWithExceptions; -import edu.ucsd.msjava.msdbsearch.*; -import edu.ucsd.msjava.msgf.Tolerance; -import 
edu.ucsd.msjava.msscorer.NewScorerFactory.SpecDataType; -import edu.ucsd.msjava.msutil.*; -import edu.ucsd.msjava.mzid.DirectPinWriter; -import edu.ucsd.msjava.mzid.DirectTSVWriter; -import edu.ucsd.msjava.mzml.StaxMzMLParser; -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.sequences.Constants; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; -import java.util.logging.Level; -import java.util.logging.Logger; - - -public class MSGFPlus { - public static final String VERSION = "Release (v2026.03.25)"; - public static final String RELEASE_DATE = "25 March 2026"; - - public static final String DECOY_DB_EXTENSION = ".revCat.fasta"; - public static final String DEFAULT_DECOY_PROTEIN_PREFIX = "XXX"; - - // Set this to true when debugging - private static final boolean DISABLE_THREADING = false; - - /** Default numTasks-per-thread multiplier when {@code -tasks} is not - * passed. Users can override at the CLI via {@code -tasks -N}. */ - private static final int DEFAULT_TASKS_PER_THREAD = 3; - private static final String USE_FORK_JOIN_PROPERTY = "msgfplus.useForkJoin"; - - // Snapshot of the original CLI argv, captured in main() so that - // RunManifestWriter can record it alongside the mzid without - // threading argv through runMSGFPlus's many call sites. - private static volatile String[] argvSnapshot = new String[0]; - - public static void main(String argv[]) { - long startTime = System.currentTimeMillis(); - argvSnapshot = argv == null ? 
new String[0] : argv.clone(); - - ParamManager paramManager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "java -Xmx3500M -jar MSGFPlus.jar"); - paramManager.addMSGFPlusParams(); - - if (argv.length == 0) { - paramManager.printUsageInfo(); - return; - } - - StaxMzMLParser.turnOffLogs(); - - // Parse parameters - String errMessage = paramManager.parseParams(argv); - if (errMessage != null) { - MSGFLogger.error(errMessage); - System.out.println(); - paramManager.printUsageInfo(); - System.exit(-1); - } - - // Propagate verbose flag to the shared logger before any downstream code logs. - MSGFLogger.setVerbose(paramManager.getVerboseFlag() == 1); - - // Running MS-GF+ - paramManager.printToolInfo(); - paramManager.printJVMInfo(); - String errorMessage = null; - try { - errorMessage = runMSGFPlus(paramManager); - } catch (Exception e) { - e.printStackTrace(); - System.exit(-1); - } - - if (errorMessage != null) { - MSGFLogger.error(errorMessage); - System.out.println(); - System.exit(-1); - } else - MSGFLogger.info("MS-GF+ complete (total elapsed time: %.2f sec)", (System.currentTimeMillis() - startTime) / (float) 1000); - } - - public static String runMSGFPlus(ParamManager paramManager) { - SearchParams params = new SearchParams(); - String errorMessage = params.parse(paramManager); - - if (errorMessage != null) { - return errorMessage; - } - - List ioList = params.getDBSearchIOList(); - boolean multiFiles = false; - if (ioList.size() >= 2) { - MSGFLogger.info("Processing " + ioList.size() + " spectra"); - for (DBSearchIOFiles ioFiles : ioList) { - MSGFLogger.debug("\t" + ioFiles.getSpecFile().getName()); - } - multiFiles = true; - } - - int ioIndex = -1; - for (DBSearchIOFiles ioFiles : ioList) { - ++ioIndex; - File specFile = ioFiles.getSpecFile(); - SpecFileFormat specFormat = ioFiles.getSpecFileFormat(); - File outputFile = ioFiles.getOutputFile(); - - if (multiFiles) { - if (!outputFile.exists()) { - MSGFLogger.info("\nProcessing " + 
specFile.getPath()); - MSGFLogger.debug("Writing results to " + outputFile.getPath()); - String errMsg = runMSGFPlus(ioIndex, specFormat, outputFile, params); - if (errMsg != null) { - return errMsg; - } - RunManifestWriter.write(ioFiles, params, VERSION, argvSnapshot); - } else { - MSGFLogger.info("\nIgnoring " + specFile.getPath()); - MSGFLogger.debug("Output file " + outputFile.getPath() + " exists."); - } - } else { - String errMsg = runMSGFPlus(ioIndex, specFormat, outputFile, params); - if (errMsg != null) { - return errMsg; - } - RunManifestWriter.write(ioFiles, params, VERSION, argvSnapshot); - } - } - - return null; - } - - private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File outputFile, SearchParams params) { - long startTime = System.currentTimeMillis(); - - // Verify that the output directory exists and can be written to - File outputDirectory = outputFile.getParentFile(); - if (outputDirectory != null) { - if (!outputDirectory.exists()) { - System.out.println("Creating directory " + outputDirectory.getPath()); - boolean success = outputDirectory.mkdirs(); - if (!success) { - return "Unable to create the missing directory: " + outputDirectory.getPath(); - } - } else if (!outputDirectory.isDirectory()) { - return "Invalid output file path (file path instead of directory path?): " + outputDirectory.getPath(); - } - - // An easy way to test for write access is outputDirectory.canWrite() - // However, on Windows this is not always accurate - // Thus, create a temporary file then delete it - try { - File testFile = File.createTempFile("MSGFPlus", ".tmp", outputDirectory); - testFile.delete(); - } catch (java.io.IOException e) { - return "Cannot create files in the output directory: " + e.getMessage(); - } catch (SecurityException e) { - return "Cannot create files in the output directory; permission denied for: " + outputDirectory.getPath(); - } - } - - // DB file - File databaseFile = params.getDatabaseFile(); - - if (databaseFile 
== null) { - return "Database file is not defined; use -d at the command line or DatabaseFile in a config file"; - } - - if (!databaseFile.exists()) { - return "Database file not found: " + databaseFile.getPath(); - } - - // Precursor mass tolerance - Tolerance leftPrecursorMassTolerance = params.getLeftPrecursorMassTolerance(); - Tolerance rightPrecursorMassTolerance = params.getRightPrecursorMassTolerance(); - - int minIsotopeError = params.getMinIsotopeError(); // inclusive - int maxIsotopeError = params.getMaxIsotopeError(); // inclusive - - Enzyme enzyme = params.getEnzyme(); - - ActivationMethod activationMethod = params.getActivationMethod(); - InstrumentType instType = params.getInstType(); - Protocol protocol = params.getProtocol(); - - AminoAcidSet aaSet = params.getAASet(); - - int startSpecIndex = params.getStartSpecIndex(); - int endSpecIndex = params.getEndSpecIndex(); - - boolean useTDA = params.useTDA(); - - int minCharge = params.getMinCharge(); - int maxCharge = params.getMaxCharge(); - - int numThreads = params.getNumThreads(); - boolean doNotUseEdgeScore = params.doNotUseEdgeScore(); - boolean allowDenseCentroidedPeaks = params.getAllowDenseCentroidedPeaks(); - - int minNumPeaksPerSpectrum = params.getMinNumPeaksPerSpectrum(); - if (minNumPeaksPerSpectrum == -1) // not specified - { - if (instType == InstrumentType.TOF) - minNumPeaksPerSpectrum = Constants.MIN_NUM_PEAKS_PER_SPECTRUM_TOF; - else - minNumPeaksPerSpectrum = Constants.MIN_NUM_PEAKS_PER_SPECTRUM; - } - - String decoyProteinPrefix = params.getDecoyProteinPrefix(); - - System.out.println("Loading database files..."); - - File dbIndexDir = params.getDBIndexDir(); - if (dbIndexDir != null) { - - File newDBFile = new File(Paths.get(dbIndexDir.getPath(), databaseFile.getName()).toString()); - if (!useTDA) { - if (!newDBFile.exists()) { - System.out.println("Creating " + newDBFile.getPath() + "."); - ReverseDB.copyDB(databaseFile.getPath(), newDBFile.getPath()); - } - } - databaseFile = 
newDBFile; - } - - if (useTDA) { - String dbFileName = databaseFile.getName(); - String concatDBFileName = dbFileName.substring(0, dbFileName.lastIndexOf('.')) + DECOY_DB_EXTENSION; - - String concatDBFilePath = Paths.get(databaseFile.getAbsoluteFile().getParent(), concatDBFileName).toString(); - File concatTargetDecoyDBFile = new File(concatDBFilePath); - - if (!concatTargetDecoyDBFile.exists()) { - System.out.println("Creating " + concatTargetDecoyDBFile.getPath() + "."); - if (ReverseDB.reverseDB(databaseFile.getPath(), concatTargetDecoyDBFile.getPath(), true, decoyProteinPrefix) == false) { - return "Cannot create a decoy database file!"; - } - } - databaseFile = concatTargetDecoyDBFile; - } - - DBScanner.setAminoAcidProbabilities(databaseFile.getPath(), aaSet); - aaSet.registerEnzyme(enzyme); - - CompactFastaSequence fastaSequence = new CompactFastaSequence(databaseFile.getPath()); - fastaSequence.setDecoyProteinPrefix(decoyProteinPrefix); - - if (useTDA) { - float ratioUniqueProteins = fastaSequence.getRatioUniqueProteins(); - if (ratioUniqueProteins < 0.5f) { - fastaSequence.printTooManyDuplicateSequencesMessage(databaseFile.getName(), "MS-GF+"); - System.exit(-1); - } - - float fractionDecoyProteins = fastaSequence.getFractionDecoyProteins(); - if (fractionDecoyProteins < 0.4f || fractionDecoyProteins > 0.6f) { - MSGFLogger.error("Error while reading: " + databaseFile.getName() + " (fraction of decoy proteins: " + fractionDecoyProteins + ")"); - MSGFLogger.error("Delete " + databaseFile.getName() + " and run MS-GF+ again."); - MSGFLogger.error("Decoy protein names should start with " + fastaSequence.getDecoyProteinPrefix()); - System.exit(-1); - } - } - - CompactSuffixArray sa = new CompactSuffixArray(fastaSequence, params.getMaxPeptideLength()); - System.out.print("Loading database finished "); - System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - startTime) / 1000); - - System.out.println("Reading spectra..."); - - File 
specFile = params.getDBSearchIOList().get(ioIndex).getSpecFile(); - - // Show a message of the form "Opening mzML file QC_Mam_19_01_PNNL_10_06Jan21_Arwen_WBEH-20-12-01.mzML" - System.out.printf("Opening %s %s\n", specFormat.getPSIName(), specFile.getName()); - - SpectraAccessor specAcc = new SpectraAccessor(specFile, specFormat); - int minMSLevel = params.getMinMSLevel(); - int maxMSLevel = params.getMaxMSLevel(); - specAcc.setMSLevelRange(minMSLevel, maxMSLevel); - - if (specAcc.getSpecMap() == null || specAcc.getSpecItr() == null) - return "Error while parsing spectrum file: " + specFile.getPath(); - - ArrayList specKeyList = SpecKey.getSpecKeyList(specAcc, - startSpecIndex, endSpecIndex, minCharge, maxCharge, activationMethod, minNumPeaksPerSpectrum, allowDenseCentroidedPeaks, - minMSLevel, maxMSLevel); - - int specSize = specKeyList.size(); - if (specSize == 0) - return specFile.getPath() + " does not have any valid spectra"; - - System.out.print("Reading spectra finished "); - System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - startTime) / 1000); - - if (numThreads <= 0) - numThreads = 1; - - // Minimum spectra/task(or thread) floor for efficiency; going smaller slows down processing. - // Configurable via -minSpectraPerThread for users on many-core hosts with small inputs (see #52). - int spectraPerTaskMinimum = params.getMinSpectraPerThread(); - int maxThreads = Math.max(1, Math.round((float) specSize / spectraPerTaskMinimum)); - if (maxThreads < numThreads) { - if (maxThreads == 1) { - System.out.println("Note: under " + spectraPerTaskMinimum + " spectra; using 1 thread instead of " + numThreads); - } else { - System.out.println("Note: " + spectraPerTaskMinimum + " spectra per thread minimum; using " + maxThreads + " threads instead of " + numThreads); - } - - numThreads = maxThreads; - } - - System.out.println("Using " + numThreads + (numThreads == 1 ? " thread." 
: " threads.")); - - // Print out parameters - System.out.println("Search Parameters:"); - System.out.println(params.toString()); - - SpecDataType specDataType = new SpecDataType(activationMethod, instType, enzyme, protocol); - - // Achievement B — two-pass precursor mass calibration (P2-cal). - // Runs a sampled pre-pass over the current file's SpecKeys to learn - // a per-file ppm shift, then stores it on DBSearchIOFiles so every - // task-local ScoredSpectraMap picks it up. OFF mode is a strict - // no-op: we skip the pre-pass entirely and never call the setter, - // so DBSearchIOFiles.precursorMassShiftPpm stays at its 0.0 default - // and ScoredSpectraMap.applyShift() takes its exact-zero fast path. - DBSearchIOFiles currentIoFiles = params.getDBSearchIOList().get(ioIndex); - if (params.getPrecursorCalMode() != SearchParams.PrecursorCalMode.OFF) { - long calStart = System.currentTimeMillis(); - MassCalibrator calibrator = new MassCalibrator( - specAcc, - sa, - aaSet, - params, - specKeyList, - leftPrecursorMassTolerance, - rightPrecursorMassTolerance, - minIsotopeError, - maxIsotopeError, - specDataType); - double shiftPpm = calibrator.learnPrecursorShiftPpm(ioIndex); - boolean applyLearnedShift = shiftPpm != 0.0 - || params.getPrecursorCalMode() == SearchParams.PrecursorCalMode.ON; - if (applyLearnedShift) { - currentIoFiles.setPrecursorMassShiftPpm(shiftPpm); - System.out.printf("Precursor mass shift learned: %.3f ppm (elapsed: %.2f sec)%n", - shiftPpm, (System.currentTimeMillis() - calStart) / 1000.0); - } else { - System.out.printf("Precursor mass calibration skipped (insufficient confident PSMs; elapsed: %.2f sec)%n", - (System.currentTimeMillis() - calStart) / 1000.0); - } - } - double precursorMassShiftPpm = currentIoFiles.getPrecursorMassShiftPpm(); - - List resultList; - - int toIndexGlobal = specSize; - while (toIndexGlobal < specSize) { - SpecKey lastSpecKey = specKeyList.get(toIndexGlobal - 1); - SpecKey nextSpecKey = 
specKeyList.get(toIndexGlobal); - - if (lastSpecKey.getSpecIndex() == nextSpecKey.getSpecIndex()) - toIndexGlobal++; - else - break; - } - - System.out.println("Spectrum 0-" + (toIndexGlobal - 1) + " (total: " + specSize + ")"); - - boolean useForkJoin = Boolean.getBoolean(USE_FORK_JOIN_PROPERTY); - - ThreadPoolExecutorWithExceptions executor = - useForkJoin ? null : ThreadPoolExecutorWithExceptions.newFixedThreadPool(numThreads); - if (executor != null) executor.setTaskName("Search"); - ForkJoinPool fjp = useForkJoin ? new ForkJoinPool(numThreads) : null; - List> fjpFutures = useForkJoin ? new ArrayList<>() : null; - - int numTasks = Math.min(numThreads * DEFAULT_TASKS_PER_THREAD, Math.round((float) specSize / spectraPerTaskMinimum)); - if (numThreads <= 1) { - numTasks = 1; - } - - if (params.getNumTasks() != 0) { - numTasks = params.getNumTasks(); - if (numTasks < 0) { - numTasks = numThreads * (numTasks * -1); - } - if (numTasks < numThreads) { - System.out.println("Changing specified tasks from " + numTasks + " to " + numThreads + " to provide the minimum of one task per thread."); - numTasks = numThreads; - } - } - if (numTasks > 1) { - System.out.println("Splitting work into " + numTasks + " tasks."); - } else { - System.out.println("Searching using a single task."); - } - - // Partition specKeyList - int size = toIndexGlobal; - int residue = size % numTasks; - - int[] startIndex = new int[numTasks]; - int[] endIndex = new int[numTasks]; - - int subListSize = size / numTasks; - for (int i = 0; i < numTasks; i++) { - startIndex[i] = i > 0 ? endIndex[i - 1] : 0; - endIndex[i] = startIndex[i] + subListSize + (i < residue ? 
1 : 0); - - subListSize = size / numTasks; - while (endIndex[i] < specKeyList.size()) { - SpecKey lastSpecKey = specKeyList.get(endIndex[i] - 1); - SpecKey nextSpecKey = specKeyList.get(endIndex[i]); - - if (lastSpecKey.getSpecIndex() == nextSpecKey.getSpecIndex()) { - ++endIndex[i]; - --subListSize; - } else - break; - } - } - - List submittedTasks = new ArrayList<>(numTasks); - - try { - for (int i = 0; i < numTasks; i++) { - final int taskStartIndex = startIndex[i]; - final int taskEndIndex = endIndex[i]; - final boolean storeRankScorer = params.outputAdditionalFeatures(); - final int taskNum = i + 1; - - // Defer ScoredSpectraMap construction to the worker so the - // per-task spectrum heap isn't queued up front. - ConcurrentMSGFPlus.RunMSGFPlus msgfplusExecutor = new ConcurrentMSGFPlus.RunMSGFPlus( - () -> { - ScoredSpectraMap specScanner = new ScoredSpectraMap( - specAcc, - specKeyList.subList(taskStartIndex, taskEndIndex), - leftPrecursorMassTolerance, - rightPrecursorMassTolerance, - minIsotopeError, - maxIsotopeError, - specDataType, - storeRankScorer, - false, - precursorMassShiftPpm - ); - if (doNotUseEdgeScore) - specScanner.turnOffEdgeScoring(); - return specScanner; - }, - sa, - params, - taskNum - ); - - submittedTasks.add(msgfplusExecutor); - - if (DISABLE_THREADING) { - msgfplusExecutor.run(); - } else if (useForkJoin) { - fjpFutures.add(fjp.submit(msgfplusExecutor)); - } else { - executor.execute(msgfplusExecutor); - } - - } - - if (useForkJoin) { - fjp.shutdown(); - try { - fjp.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, e.getMessage(), e); - } - for (Future f : fjpFutures) { - try { f.get(); } - catch (java.util.concurrent.ExecutionException ex) { - Throwable cause = ex.getCause(); - Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, cause.getMessage(), cause); - fjp.shutdownNow(); - 
return "Search failed: " + cause.getMessage(); - } - catch (InterruptedException ex) { Thread.currentThread().interrupt(); } - } - } else { - executor.outputProgressReport(); - executor.shutdown(); - try { - executor.awaitTerminationWithExceptions(Long.MAX_VALUE, TimeUnit.NANOSECONDS); - } catch (InterruptedException e) { - if (!executor.HasThrownData()) { - e.printStackTrace(); - Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, e.getMessage(), e); - } - } - executor.outputProgressReport(); - } - - // awaitTermination above establishes happens-before on every - // task's writes (JLS §17.4.5), so the per-task ArrayLists can - // be drained single-threaded with no synchronization. - int totalResults = 0; - for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { - totalResults += t.getResultCount(); - } - resultList = new ArrayList<>(totalResults); - for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { - t.drainResultsTo(resultList); - } - - if (numTasks > 1) { - printTaskWallSummary(submittedTasks); - } - submittedTasks.clear(); - - } catch (OutOfMemoryError ex) { - ex.printStackTrace(); - Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - shutdownPoolNow(executor, fjp); - int taskMult = numTasks / numThreads; - return "Task terminated; results incomplete. Please run again with a greater amount of memory, using \"-Xmx4G\", for example.\n" + - "\tYou can also use less memory by increasing the number of tasks used for the search, at the cost of more time.\n" + - "\tTry doubling the number used for this search with \"-tasks -" + (taskMult * 2) + "\" or \"-tasks " + (numTasks * 2) + "\"."; - } catch (Exception ex) { - ex.printStackTrace(); - Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - shutdownPoolNow(executor, fjp); - return "Task terminated; results incomplete. 
Please run again."; - } catch (Throwable ex) { - ex.printStackTrace(); - Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); - shutdownPoolNow(executor, fjp); - return "Task terminated; results incomplete. Please run again."; - } - - long qValueStartTime = System.currentTimeMillis(); - - if (params.useTDA()) { - // Compute Q-values - System.out.println("Computing q-values..."); - ComputeFDR.addQValues(resultList, sa, false, decoyProteinPrefix); - System.out.print("Computing q-values finished "); - System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - qValueStartTime) / 1000); - } - - // Sort by spectral E-values then write to disk - - long saveResultsStartTime = System.currentTimeMillis(); - - System.out.println("Writing results..."); - Collections.sort(resultList); - - if (params.writeTsv()) { - DirectTSVWriter tsvWriter = new DirectTSVWriter(params, aaSet, sa, specAcc, ioIndex); - try { - tsvWriter.writeResults(resultList, outputFile); - } catch (IOException e) { - return "Error writing TSV output: " + e.getMessage(); - } - System.out.println("TSV file: " + outputFile.getPath()); - } - - if (params.writePin()) { - DirectPinWriter pinWriter = new DirectPinWriter(params, aaSet, sa, specAcc, ioIndex); - try { - pinWriter.writeResults(resultList, outputFile); - } catch (IOException e) { - return "Error writing pin output: " + e.getMessage(); - } - System.out.println("PIN file: " + outputFile.getPath()); - } - - System.out.print("Writing results finished "); - System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - saveResultsStartTime) / 1000); - return null; - } - - private static void shutdownPoolNow(ThreadPoolExecutorWithExceptions executor, ForkJoinPool fjp) { - if (executor != null) executor.shutdownNow(); - else if (fjp != null) fjp.shutdownNow(); - } - - /** - * One-line wall-time summary across completed tasks. 
tail_gap (max - - * median) is the load-balance signal; high values point at uneven - * SpecKey distribution and motivate raising the {@code -tasks -N} multiplier. - */ - private static void printTaskWallSummary(List tasks) { - List walls = new ArrayList<>(tasks.size()); - for (ConcurrentMSGFPlus.RunMSGFPlus t : tasks) { - ConcurrentMSGFPlus.TaskWallStats s = t.getWallStats(); - if (s != null) walls.add(s.totalMs()); - } - if (walls.isEmpty()) return; - Collections.sort(walls); - long min = walls.get(0); - long max = walls.get(walls.size() - 1); - long median = walls.get(walls.size() / 2); - long p95 = walls.get(Math.min(walls.size() - 1, (int) Math.ceil(walls.size() * 0.95) - 1)); - long sum = 0L; - for (long w : walls) sum += w; - System.out.format( - "Task wall summary (n=%d): min=%.1fs median=%.1fs p95=%.1fs max=%.1fs total=%.1fs tail_gap=%.1fs (%.0f%% of median)%n", - walls.size(), min / 1000.0, median / 1000.0, p95 / 1000.0, max / 1000.0, - sum / 1000.0, (max - median) / 1000.0, - median > 0 ? 
100.0 * (max - median) / median : 0.0); - } -} +package edu.ucsd.msjava.cli; + +import edu.ucsd.msjava.fdr.ComputeFDR; +import edu.ucsd.msjava.misc.MSGFLogger; +import edu.ucsd.msjava.misc.RunManifestWriter; +import edu.ucsd.msjava.misc.ThreadPoolExecutorWithExceptions; +import edu.ucsd.msjava.msdbsearch.*; +import edu.ucsd.msjava.msgf.Tolerance; +import edu.ucsd.msjava.msscorer.NewScorerFactory.SpecDataType; +import edu.ucsd.msjava.msutil.*; +import edu.ucsd.msjava.output.DirectPinWriter; +import edu.ucsd.msjava.output.DirectTSVWriter; +import edu.ucsd.msjava.mzml.StaxMzMLParser; +import edu.ucsd.msjava.params.ParamManager; +import edu.ucsd.msjava.sequences.Constants; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.logging.Level; +import java.util.logging.Logger; + + +public class MSGFPlus { + public static final String VERSION = "Release (v2026.03.25)"; + public static final String RELEASE_DATE = "25 March 2026"; + + public static final String DECOY_DB_EXTENSION = ".revCat.fasta"; + public static final String DEFAULT_DECOY_PROTEIN_PREFIX = "XXX"; + + // Set this to true when debugging + private static final boolean DISABLE_THREADING = false; + + /** Default numTasks-per-thread multiplier when {@code -tasks} is not + * passed. Users can override at the CLI via {@code -tasks -N}. */ + private static final int DEFAULT_TASKS_PER_THREAD = 3; + private static final String USE_FORK_JOIN_PROPERTY = "msgfplus.useForkJoin"; + + // Snapshot of the original CLI argv, captured in main() so that + // RunManifestWriter can record it alongside the mzid without + // threading argv through runMSGFPlus's many call sites. 
+ private static volatile String[] argvSnapshot = new String[0]; + + public static void main(String argv[]) { + long startTime = System.currentTimeMillis(); + argvSnapshot = argv == null ? new String[0] : argv.clone(); + + ParamManager paramManager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "java -Xmx3500M -jar MSGFPlus.jar"); + paramManager.addMSGFPlusParams(); + + if (argv.length == 0) { + paramManager.printUsageInfo(); + return; + } + + StaxMzMLParser.turnOffLogs(); + + // Parse parameters + String errMessage = paramManager.parseParams(argv); + if (errMessage != null) { + MSGFLogger.error(errMessage); + System.out.println(); + paramManager.printUsageInfo(); + System.exit(-1); + } + + // Propagate verbose flag to the shared logger before any downstream code logs. + MSGFLogger.setVerbose(paramManager.getVerboseFlag() == 1); + + // Running MS-GF+ + paramManager.printToolInfo(); + paramManager.printJVMInfo(); + String errorMessage = null; + try { + errorMessage = runMSGFPlus(paramManager); + } catch (Exception e) { + e.printStackTrace(); + System.exit(-1); + } + + if (errorMessage != null) { + MSGFLogger.error(errorMessage); + System.out.println(); + System.exit(-1); + } else + MSGFLogger.info("MS-GF+ complete (total elapsed time: %.2f sec)", (System.currentTimeMillis() - startTime) / (float) 1000); + } + + public static String runMSGFPlus(ParamManager paramManager) { + SearchParams params = new SearchParams(); + String errorMessage = params.parse(paramManager); + + if (errorMessage != null) { + return errorMessage; + } + + List ioList = params.getDBSearchIOList(); + boolean multiFiles = false; + if (ioList.size() >= 2) { + MSGFLogger.info("Processing " + ioList.size() + " spectra"); + for (DBSearchIOFiles ioFiles : ioList) { + MSGFLogger.debug("\t" + ioFiles.getSpecFile().getName()); + } + multiFiles = true; + } + + int ioIndex = -1; + for (DBSearchIOFiles ioFiles : ioList) { + ++ioIndex; + File specFile = ioFiles.getSpecFile(); + 
SpecFileFormat specFormat = ioFiles.getSpecFileFormat(); + File outputFile = ioFiles.getOutputFile(); + + if (multiFiles) { + if (!outputFile.exists()) { + MSGFLogger.info("\nProcessing " + specFile.getPath()); + MSGFLogger.debug("Writing results to " + outputFile.getPath()); + String errMsg = runMSGFPlus(ioIndex, specFormat, outputFile, params); + if (errMsg != null) { + return errMsg; + } + RunManifestWriter.write(ioFiles, params, VERSION, argvSnapshot); + } else { + MSGFLogger.info("\nIgnoring " + specFile.getPath()); + MSGFLogger.debug("Output file " + outputFile.getPath() + " exists."); + } + } else { + String errMsg = runMSGFPlus(ioIndex, specFormat, outputFile, params); + if (errMsg != null) { + return errMsg; + } + RunManifestWriter.write(ioFiles, params, VERSION, argvSnapshot); + } + } + + return null; + } + + private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File outputFile, SearchParams params) { + long startTime = System.currentTimeMillis(); + + // Verify that the output directory exists and can be written to + File outputDirectory = outputFile.getParentFile(); + if (outputDirectory != null) { + if (!outputDirectory.exists()) { + System.out.println("Creating directory " + outputDirectory.getPath()); + boolean success = outputDirectory.mkdirs(); + if (!success) { + return "Unable to create the missing directory: " + outputDirectory.getPath(); + } + } else if (!outputDirectory.isDirectory()) { + return "Invalid output file path (file path instead of directory path?): " + outputDirectory.getPath(); + } + + // An easy way to test for write access is outputDirectory.canWrite() + // However, on Windows this is not always accurate + // Thus, create a temporary file then delete it + try { + File testFile = File.createTempFile("MSGFPlus", ".tmp", outputDirectory); + testFile.delete(); + } catch (java.io.IOException e) { + return "Cannot create files in the output directory: " + e.getMessage(); + } catch (SecurityException e) { + return 
"Cannot create files in the output directory; permission denied for: " + outputDirectory.getPath(); + } + } + + // DB file + File databaseFile = params.getDatabaseFile(); + + if (databaseFile == null) { + return "Database file is not defined; use -d at the command line or DatabaseFile in a config file"; + } + + if (!databaseFile.exists()) { + return "Database file not found: " + databaseFile.getPath(); + } + + // Precursor mass tolerance + Tolerance leftPrecursorMassTolerance = params.getLeftPrecursorMassTolerance(); + Tolerance rightPrecursorMassTolerance = params.getRightPrecursorMassTolerance(); + + int minIsotopeError = params.getMinIsotopeError(); // inclusive + int maxIsotopeError = params.getMaxIsotopeError(); // inclusive + + Enzyme enzyme = params.getEnzyme(); + + ActivationMethod activationMethod = params.getActivationMethod(); + InstrumentType instType = params.getInstType(); + Protocol protocol = params.getProtocol(); + + AminoAcidSet aaSet = params.getAASet(); + + int startSpecIndex = params.getStartSpecIndex(); + int endSpecIndex = params.getEndSpecIndex(); + + boolean useTDA = params.useTDA(); + + int minCharge = params.getMinCharge(); + int maxCharge = params.getMaxCharge(); + + int numThreads = params.getNumThreads(); + boolean doNotUseEdgeScore = params.doNotUseEdgeScore(); + boolean allowDenseCentroidedPeaks = params.getAllowDenseCentroidedPeaks(); + + int minNumPeaksPerSpectrum = params.getMinNumPeaksPerSpectrum(); + if (minNumPeaksPerSpectrum == -1) // not specified + { + if (instType == InstrumentType.TOF) + minNumPeaksPerSpectrum = Constants.MIN_NUM_PEAKS_PER_SPECTRUM_TOF; + else + minNumPeaksPerSpectrum = Constants.MIN_NUM_PEAKS_PER_SPECTRUM; + } + + String decoyProteinPrefix = params.getDecoyProteinPrefix(); + + System.out.println("Loading database files..."); + + File dbIndexDir = params.getDBIndexDir(); + if (dbIndexDir != null) { + + File newDBFile = new File(Paths.get(dbIndexDir.getPath(), databaseFile.getName()).toString()); + if 
(!useTDA) { + if (!newDBFile.exists()) { + System.out.println("Creating " + newDBFile.getPath() + "."); + ReverseDB.copyDB(databaseFile.getPath(), newDBFile.getPath()); + } + } + databaseFile = newDBFile; + } + + if (useTDA) { + String dbFileName = databaseFile.getName(); + String concatDBFileName = dbFileName.substring(0, dbFileName.lastIndexOf('.')) + DECOY_DB_EXTENSION; + + String concatDBFilePath = Paths.get(databaseFile.getAbsoluteFile().getParent(), concatDBFileName).toString(); + File concatTargetDecoyDBFile = new File(concatDBFilePath); + + if (!concatTargetDecoyDBFile.exists()) { + System.out.println("Creating " + concatTargetDecoyDBFile.getPath() + "."); + if (ReverseDB.reverseDB(databaseFile.getPath(), concatTargetDecoyDBFile.getPath(), true, decoyProteinPrefix) == false) { + return "Cannot create a decoy database file!"; + } + } + databaseFile = concatTargetDecoyDBFile; + } + + DBScanner.setAminoAcidProbabilities(databaseFile.getPath(), aaSet); + aaSet.registerEnzyme(enzyme); + + CompactFastaSequence fastaSequence = new CompactFastaSequence(databaseFile.getPath()); + fastaSequence.setDecoyProteinPrefix(decoyProteinPrefix); + + if (useTDA) { + float ratioUniqueProteins = fastaSequence.getRatioUniqueProteins(); + if (ratioUniqueProteins < 0.5f) { + fastaSequence.printTooManyDuplicateSequencesMessage(databaseFile.getName(), "MS-GF+"); + System.exit(-1); + } + + float fractionDecoyProteins = fastaSequence.getFractionDecoyProteins(); + if (fractionDecoyProteins < 0.4f || fractionDecoyProteins > 0.6f) { + MSGFLogger.error("Error while reading: " + databaseFile.getName() + " (fraction of decoy proteins: " + fractionDecoyProteins + ")"); + MSGFLogger.error("Delete " + databaseFile.getName() + " and run MS-GF+ again."); + MSGFLogger.error("Decoy protein names should start with " + fastaSequence.getDecoyProteinPrefix()); + System.exit(-1); + } + } + + CompactSuffixArray sa = new CompactSuffixArray(fastaSequence, params.getMaxPeptideLength()); + 
System.out.print("Loading database finished "); + System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - startTime) / 1000); + + System.out.println("Reading spectra..."); + + File specFile = params.getDBSearchIOList().get(ioIndex).getSpecFile(); + + // Show a message of the form "Opening mzML file QC_Mam_19_01_PNNL_10_06Jan21_Arwen_WBEH-20-12-01.mzML" + System.out.printf("Opening %s %s\n", specFormat.getPSIName(), specFile.getName()); + + SpectraAccessor specAcc = new SpectraAccessor(specFile, specFormat); + int minMSLevel = params.getMinMSLevel(); + int maxMSLevel = params.getMaxMSLevel(); + specAcc.setMSLevelRange(minMSLevel, maxMSLevel); + + if (specAcc.getSpecMap() == null || specAcc.getSpecItr() == null) + return "Error while parsing spectrum file: " + specFile.getPath(); + + ArrayList specKeyList = SpecKey.getSpecKeyList(specAcc, + startSpecIndex, endSpecIndex, minCharge, maxCharge, activationMethod, minNumPeaksPerSpectrum, allowDenseCentroidedPeaks, + minMSLevel, maxMSLevel); + + int specSize = specKeyList.size(); + if (specSize == 0) + return specFile.getPath() + " does not have any valid spectra"; + + System.out.print("Reading spectra finished "); + System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - startTime) / 1000); + + if (numThreads <= 0) + numThreads = 1; + + // Minimum spectra/task(or thread) floor for efficiency; going smaller slows down processing. + // Configurable via -minSpectraPerThread for users on many-core hosts with small inputs (see #52). 
+ int spectraPerTaskMinimum = params.getMinSpectraPerThread(); + int maxThreads = Math.max(1, Math.round((float) specSize / spectraPerTaskMinimum)); + if (maxThreads < numThreads) { + if (maxThreads == 1) { + System.out.println("Note: under " + spectraPerTaskMinimum + " spectra; using 1 thread instead of " + numThreads); + } else { + System.out.println("Note: " + spectraPerTaskMinimum + " spectra per thread minimum; using " + maxThreads + " threads instead of " + numThreads); + } + + numThreads = maxThreads; + } + + System.out.println("Using " + numThreads + (numThreads == 1 ? " thread." : " threads.")); + + // Print out parameters + System.out.println("Search Parameters:"); + System.out.println(params.toString()); + + SpecDataType specDataType = new SpecDataType(activationMethod, instType, enzyme, protocol); + + // Achievement B — two-pass precursor mass calibration (P2-cal). + // Runs a sampled pre-pass over the current file's SpecKeys to learn + // a per-file ppm shift, then stores it on DBSearchIOFiles so every + // task-local ScoredSpectraMap picks it up. OFF mode is a strict + // no-op: we skip the pre-pass entirely and never call the setter, + // so DBSearchIOFiles.precursorMassShiftPpm stays at its 0.0 default + // and ScoredSpectraMap.applyShift() takes its exact-zero fast path. 
+ DBSearchIOFiles currentIoFiles = params.getDBSearchIOList().get(ioIndex); + if (params.getPrecursorCalMode() != SearchParams.PrecursorCalMode.OFF) { + long calStart = System.currentTimeMillis(); + MassCalibrator calibrator = new MassCalibrator( + specAcc, + sa, + aaSet, + params, + specKeyList, + leftPrecursorMassTolerance, + rightPrecursorMassTolerance, + minIsotopeError, + maxIsotopeError, + specDataType); + double shiftPpm = calibrator.learnPrecursorShiftPpm(ioIndex); + boolean applyLearnedShift = shiftPpm != 0.0 + || params.getPrecursorCalMode() == SearchParams.PrecursorCalMode.ON; + if (applyLearnedShift) { + currentIoFiles.setPrecursorMassShiftPpm(shiftPpm); + System.out.printf("Precursor mass shift learned: %.3f ppm (elapsed: %.2f sec)%n", + shiftPpm, (System.currentTimeMillis() - calStart) / 1000.0); + } else { + System.out.printf("Precursor mass calibration skipped (insufficient confident PSMs; elapsed: %.2f sec)%n", + (System.currentTimeMillis() - calStart) / 1000.0); + } + } + double precursorMassShiftPpm = currentIoFiles.getPrecursorMassShiftPpm(); + + List resultList; + + int toIndexGlobal = specSize; + while (toIndexGlobal < specSize) { + SpecKey lastSpecKey = specKeyList.get(toIndexGlobal - 1); + SpecKey nextSpecKey = specKeyList.get(toIndexGlobal); + + if (lastSpecKey.getSpecIndex() == nextSpecKey.getSpecIndex()) + toIndexGlobal++; + else + break; + } + + System.out.println("Spectrum 0-" + (toIndexGlobal - 1) + " (total: " + specSize + ")"); + + boolean useForkJoin = Boolean.getBoolean(USE_FORK_JOIN_PROPERTY); + + ThreadPoolExecutorWithExceptions executor = + useForkJoin ? null : ThreadPoolExecutorWithExceptions.newFixedThreadPool(numThreads); + if (executor != null) executor.setTaskName("Search"); + ForkJoinPool fjp = useForkJoin ? new ForkJoinPool(numThreads) : null; + List> fjpFutures = useForkJoin ? 
new ArrayList<>() : null; + + int numTasks = Math.min(numThreads * DEFAULT_TASKS_PER_THREAD, Math.round((float) specSize / spectraPerTaskMinimum)); + if (numThreads <= 1) { + numTasks = 1; + } + + if (params.getNumTasks() != 0) { + numTasks = params.getNumTasks(); + if (numTasks < 0) { + numTasks = numThreads * (numTasks * -1); + } + if (numTasks < numThreads) { + System.out.println("Changing specified tasks from " + numTasks + " to " + numThreads + " to provide the minimum of one task per thread."); + numTasks = numThreads; + } + } + if (numTasks > 1) { + System.out.println("Splitting work into " + numTasks + " tasks."); + } else { + System.out.println("Searching using a single task."); + } + + // Partition specKeyList + int size = toIndexGlobal; + int residue = size % numTasks; + + int[] startIndex = new int[numTasks]; + int[] endIndex = new int[numTasks]; + + int subListSize = size / numTasks; + for (int i = 0; i < numTasks; i++) { + startIndex[i] = i > 0 ? endIndex[i - 1] : 0; + endIndex[i] = startIndex[i] + subListSize + (i < residue ? 1 : 0); + + subListSize = size / numTasks; + while (endIndex[i] < specKeyList.size()) { + SpecKey lastSpecKey = specKeyList.get(endIndex[i] - 1); + SpecKey nextSpecKey = specKeyList.get(endIndex[i]); + + if (lastSpecKey.getSpecIndex() == nextSpecKey.getSpecIndex()) { + ++endIndex[i]; + --subListSize; + } else + break; + } + } + + List submittedTasks = new ArrayList<>(numTasks); + + try { + for (int i = 0; i < numTasks; i++) { + final int taskStartIndex = startIndex[i]; + final int taskEndIndex = endIndex[i]; + final boolean storeRankScorer = params.outputAdditionalFeatures(); + final int taskNum = i + 1; + + // Defer ScoredSpectraMap construction to the worker so the + // per-task spectrum heap isn't queued up front. 
+ ConcurrentMSGFPlus.RunMSGFPlus msgfplusExecutor = new ConcurrentMSGFPlus.RunMSGFPlus( + () -> { + ScoredSpectraMap specScanner = new ScoredSpectraMap( + specAcc, + specKeyList.subList(taskStartIndex, taskEndIndex), + leftPrecursorMassTolerance, + rightPrecursorMassTolerance, + minIsotopeError, + maxIsotopeError, + specDataType, + storeRankScorer, + false, + precursorMassShiftPpm + ); + if (doNotUseEdgeScore) + specScanner.turnOffEdgeScoring(); + return specScanner; + }, + sa, + params, + taskNum + ); + + submittedTasks.add(msgfplusExecutor); + + if (DISABLE_THREADING) { + msgfplusExecutor.run(); + } else if (useForkJoin) { + fjpFutures.add(fjp.submit(msgfplusExecutor)); + } else { + executor.execute(msgfplusExecutor); + } + + } + + if (useForkJoin) { + fjp.shutdown(); + try { + fjp.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, e.getMessage(), e); + } + for (Future f : fjpFutures) { + try { f.get(); } + catch (java.util.concurrent.ExecutionException ex) { + Throwable cause = ex.getCause(); + Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, cause.getMessage(), cause); + fjp.shutdownNow(); + return "Search failed: " + cause.getMessage(); + } + catch (InterruptedException ex) { Thread.currentThread().interrupt(); } + } + } else { + executor.outputProgressReport(); + executor.shutdown(); + try { + executor.awaitTerminationWithExceptions(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + } catch (InterruptedException e) { + if (!executor.HasThrownData()) { + e.printStackTrace(); + Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, e.getMessage(), e); + } + } + executor.outputProgressReport(); + } + + // awaitTermination above establishes happens-before on every + // task's writes (JLS §17.4.5), so the per-task ArrayLists can + // be drained single-threaded with no synchronization. 
+ int totalResults = 0; + for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { + totalResults += t.getResultCount(); + } + resultList = new ArrayList<>(totalResults); + for (ConcurrentMSGFPlus.RunMSGFPlus t : submittedTasks) { + t.drainResultsTo(resultList); + } + + if (numTasks > 1) { + printTaskWallSummary(submittedTasks); + } + submittedTasks.clear(); + + } catch (OutOfMemoryError ex) { + ex.printStackTrace(); + Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); + shutdownPoolNow(executor, fjp); + int taskMult = numTasks / numThreads; + return "Task terminated; results incomplete. Please run again with a greater amount of memory, using \"-Xmx4G\", for example.\n" + + "\tYou can also use less memory by increasing the number of tasks used for the search, at the cost of more time.\n" + + "\tTry doubling the number used for this search with \"-tasks -" + (taskMult * 2) + "\" or \"-tasks " + (numTasks * 2) + "\"."; + } catch (Exception ex) { + ex.printStackTrace(); + Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); + shutdownPoolNow(executor, fjp); + return "Task terminated; results incomplete. Please run again."; + } catch (Throwable ex) { + ex.printStackTrace(); + Logger.getLogger(MSGFPlus.class.getName()).log(Level.SEVERE, null, ex); + shutdownPoolNow(executor, fjp); + return "Task terminated; results incomplete. 
Please run again."; + } + + long qValueStartTime = System.currentTimeMillis(); + + if (params.useTDA()) { + // Compute Q-values + System.out.println("Computing q-values..."); + ComputeFDR.addQValues(resultList, sa, false, decoyProteinPrefix); + System.out.print("Computing q-values finished "); + System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - qValueStartTime) / 1000); + } + + // Sort by spectral E-values then write to disk + + long saveResultsStartTime = System.currentTimeMillis(); + + System.out.println("Writing results..."); + Collections.sort(resultList); + + if (params.writeTsv()) { + DirectTSVWriter tsvWriter = new DirectTSVWriter(params, aaSet, sa, specAcc, ioIndex); + try { + tsvWriter.writeResults(resultList, outputFile); + } catch (IOException e) { + return "Error writing TSV output: " + e.getMessage(); + } + System.out.println("TSV file: " + outputFile.getPath()); + } + + if (params.writePin()) { + DirectPinWriter pinWriter = new DirectPinWriter(params, aaSet, sa, specAcc, ioIndex); + try { + pinWriter.writeResults(resultList, outputFile); + } catch (IOException e) { + return "Error writing pin output: " + e.getMessage(); + } + System.out.println("PIN file: " + outputFile.getPath()); + } + + System.out.print("Writing results finished "); + System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - saveResultsStartTime) / 1000); + return null; + } + + private static void shutdownPoolNow(ThreadPoolExecutorWithExceptions executor, ForkJoinPool fjp) { + if (executor != null) executor.shutdownNow(); + else if (fjp != null) fjp.shutdownNow(); + } + + /** + * One-line wall-time summary across completed tasks. tail_gap (max - + * median) is the load-balance signal; high values point at uneven + * SpecKey distribution and motivate raising the {@code -tasks -N} multiplier. 
+ */ + private static void printTaskWallSummary(List tasks) { + List walls = new ArrayList<>(tasks.size()); + for (ConcurrentMSGFPlus.RunMSGFPlus t : tasks) { + ConcurrentMSGFPlus.TaskWallStats s = t.getWallStats(); + if (s != null) walls.add(s.totalMs()); + } + if (walls.isEmpty()) return; + Collections.sort(walls); + long min = walls.get(0); + long max = walls.get(walls.size() - 1); + long median = walls.get(walls.size() / 2); + long p95 = walls.get(Math.min(walls.size() - 1, (int) Math.ceil(walls.size() * 0.95) - 1)); + long sum = 0L; + for (long w : walls) sum += w; + System.out.format( + "Task wall summary (n=%d): min=%.1fs median=%.1fs p95=%.1fs max=%.1fs total=%.1fs tail_gap=%.1fs (%.0f%% of median)%n", + walls.size(), min / 1000.0, median / 1000.0, p95 / 1000.0, max / 1000.0, + sum / 1000.0, (max - median) / 1000.0, + median > 0 ? 100.0 * (max - median) / median : 0.0); + } +} diff --git a/src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java b/src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java index 161cf80a..28196b04 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java +++ b/src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java @@ -1,7 +1,7 @@ package edu.ucsd.msjava.fdr; import edu.ucsd.msjava.parser.BufferedLineReader; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.io.File; import java.util.ArrayList; diff --git a/src/main/java/edu/ucsd/msjava/fdr/MSGFPlusPSMSet.java b/src/main/java/edu/ucsd/msjava/fdr/MSGFPlusPSMSet.java index b9b6434e..31b5469d 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/MSGFPlusPSMSet.java +++ b/src/main/java/edu/ucsd/msjava/fdr/MSGFPlusPSMSet.java @@ -3,7 +3,7 @@ import edu.ucsd.msjava.msdbsearch.CompactSuffixArray; import edu.ucsd.msjava.msdbsearch.DatabaseMatch; import edu.ucsd.msjava.msdbsearch.MSGFPlusMatch; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.util.ArrayList; import java.util.HashMap; diff --git 
a/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java b/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java index 5a0a4eaf..326a9ca4 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java +++ b/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java @@ -1,6 +1,6 @@ package edu.ucsd.msjava.fdr; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.io.*; import java.util.ArrayList; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/BuildSA.java b/src/main/java/edu/ucsd/msjava/msdbsearch/BuildSA.java index d9dd615f..5db5b57d 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/BuildSA.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/BuildSA.java @@ -1,6 +1,6 @@ package edu.ucsd.msjava.msdbsearch; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.apache.commons.io.FilenameUtils; import java.io.BufferedWriter; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java index 886644b5..d922e9e7 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java @@ -2,7 +2,7 @@ import edu.ucsd.msjava.sequences.Constants; import edu.ucsd.msjava.sequences.Sequence; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.io.*; import java.text.SimpleDateFormat; @@ -651,4 +651,4 @@ public void printTooManyDuplicateSequencesMessage(String fileName, String toolNa "You can consolidate the duplicates using the 'Validate Fasta File' tool in the Protein Digestion Simulator, " + "available at https://github.com/PNNL-Comp-Mass-Spec/Protein-Digestion-Simulator/releases"); } -} \ No newline at end of file +} diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/PeptideEnumerator.java b/src/main/java/edu/ucsd/msjava/msdbsearch/PeptideEnumerator.java index 072bae19..36fa5188 100644 --- 
a/src/main/java/edu/ucsd/msjava/msdbsearch/PeptideEnumerator.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/PeptideEnumerator.java @@ -4,7 +4,7 @@ import edu.ucsd.msjava.msutil.Composition; import edu.ucsd.msjava.msutil.Enzyme; import edu.ucsd.msjava.sequences.Constants; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.io.*; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ReverseDB.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ReverseDB.java index 83ea09ee..d8cdd20b 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ReverseDB.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ReverseDB.java @@ -1,6 +1,6 @@ package edu.ucsd.msjava.msdbsearch; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.io.*; diff --git a/src/main/java/edu/ucsd/msjava/mslibsearch/ProcessedSpectrum.java b/src/main/java/edu/ucsd/msjava/mslibsearch/ProcessedSpectrum.java deleted file mode 100644 index adf9d1aa..00000000 --- a/src/main/java/edu/ucsd/msjava/mslibsearch/ProcessedSpectrum.java +++ /dev/null @@ -1,33 +0,0 @@ -package edu.ucsd.msjava.mslibsearch; - -import edu.ucsd.msjava.msutil.Spectrum; - -public class ProcessedSpectrum { - private final Spectrum expSpec; - private final Spectrum libSpec; - - public ProcessedSpectrum(Spectrum expSpec, Spectrum libSpec) { - this.expSpec = expSpec; - this.libSpec = libSpec; - } - - public Spectrum getSpectrum() { -// boolean[] expPeak = new boolean[NominalMass.toNominalMass(expSpec.getPrecursorMass())]; -// for(Peak p : libSpec) -// { -// int nominalMass = NominalMass.toNominalMass(p.getMz()); -// if(nominalMass >= 0 && nominalMass < expPeak.length) -// expPeak[nominalMass] = true; -// } -// -// Spectrum spec = expSpec.getCloneWithoutPeakList(); -// for(Peak p : expSpec) -// { -// int nominalMass = NominalMass.toNominalMass(p.getMz()); -// if(nominalMass >= 0 && nominalMass < expPeak.length && expPeak[NominalMass.toNominalMass(p.getMz())]) -// spec.add(p); -// } 
-// return spec; - return expSpec; - } -} diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java index 08f2dbf0..0db21789 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java @@ -4,7 +4,7 @@ import edu.ucsd.msjava.msutil.Modification.Location; import edu.ucsd.msjava.params.ParamManager; import edu.ucsd.msjava.parser.BufferedLineReader; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.io.File; import java.io.IOException; diff --git a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java index d8cd7afe..8457dce2 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java @@ -6,7 +6,7 @@ import edu.ucsd.msjava.msgf.Tolerance; import edu.ucsd.msjava.msutil.Modification.Location; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.nio.file.Path; import java.nio.file.Paths; diff --git a/src/main/java/edu/ucsd/msjava/mzid/DirectPinWriter.java b/src/main/java/edu/ucsd/msjava/output/DirectPinWriter.java similarity index 99% rename from src/main/java/edu/ucsd/msjava/mzid/DirectPinWriter.java rename to src/main/java/edu/ucsd/msjava/output/DirectPinWriter.java index 1a34aeb9..c7a24611 100644 --- a/src/main/java/edu/ucsd/msjava/mzid/DirectPinWriter.java +++ b/src/main/java/edu/ucsd/msjava/output/DirectPinWriter.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.mzid; +package edu.ucsd.msjava.output; import edu.ucsd.msjava.msdbsearch.CompactFastaSequence; import edu.ucsd.msjava.msdbsearch.CompactSuffixArray; diff --git a/src/main/java/edu/ucsd/msjava/mzid/DirectTSVWriter.java b/src/main/java/edu/ucsd/msjava/output/DirectTSVWriter.java similarity index 99% rename from 
src/main/java/edu/ucsd/msjava/mzid/DirectTSVWriter.java rename to src/main/java/edu/ucsd/msjava/output/DirectTSVWriter.java index 4535d6c4..517faeab 100644 --- a/src/main/java/edu/ucsd/msjava/mzid/DirectTSVWriter.java +++ b/src/main/java/edu/ucsd/msjava/output/DirectTSVWriter.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.mzid; +package edu.ucsd.msjava.output; import edu.ucsd.msjava.msdbsearch.CompactSuffixArray; import edu.ucsd.msjava.msdbsearch.DatabaseMatch; diff --git a/src/main/java/edu/ucsd/msjava/mzid/Unimod.java b/src/main/java/edu/ucsd/msjava/output/Unimod.java similarity index 96% rename from src/main/java/edu/ucsd/msjava/mzid/Unimod.java rename to src/main/java/edu/ucsd/msjava/output/Unimod.java index 32e6f307..d3368a88 100644 --- a/src/main/java/edu/ucsd/msjava/mzid/Unimod.java +++ b/src/main/java/edu/ucsd/msjava/output/Unimod.java @@ -1,32 +1,32 @@ -package edu.ucsd.msjava.mzid; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.HashMap; -import java.util.Map; - +package edu.ucsd.msjava.output; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.Map; + public class Unimod { private static final String UNIMOD_RESOURCE_PATH = "unimod.obo"; public static Unimod getUnimod() { return unimod; } - - public String getRecordID(String name) { - return recordIDMap.get(name); - } - - public String getDeltaComposition(String id) { - return idToDeltaCompositionMap.get(id); - } - - private Map recordIDMap; // name -> record id - private Map idToDeltaCompositionMap; // id -> delta_composition - - private Unimod() { - readUnimodOBOFile(); + + public String getRecordID(String name) { + return recordIDMap.get(name); + } + + public String getDeltaComposition(String id) { + return idToDeltaCompositionMap.get(id); + } + + private Map recordIDMap; // name -> record 
id + private Map idToDeltaCompositionMap; // id -> delta_composition + + private Unimod() { + readUnimodOBOFile(); } private void readUnimodOBOFile() { @@ -35,51 +35,51 @@ private void readUnimodOBOFile() { System.err.println("Unable to access \"unimod.obo\"."); System.exit(-1); } - BufferedReader in = new BufferedReader(new InputStreamReader(is)); - - recordIDMap = new HashMap(); - idToDeltaCompositionMap = new HashMap(); - String s; - String curID = null; - String deltaMass = null; - try { - while ((s = in.readLine()) != null) { - if (s.startsWith("id:")) { - String id = s.split("\\s+")[1].trim(); - String nameLine = in.readLine(); - assert (nameLine.startsWith("name:")); - String name = nameLine.split("\\s+")[1].trim(); - recordIDMap.put(name, id); - curID = id; - } - if (s.startsWith("xref: delta_composition")) { - String deltaComposition = s.substring(s.indexOf('"') + 1, s.lastIndexOf('"')); - idToDeltaCompositionMap.put(curID, deltaComposition); -// Double mass = UnimodComposition.getMass(deltaComposition); -// if(mass == null) -// { -// System.out.println(deltaComposition); -// } - if (deltaMass != null) { - Double mass = UnimodComposition.getMass(deltaComposition); - Double mass2 = Double.parseDouble(deltaMass); - if (Math.abs(mass - mass2) > 0.001) { - System.out.println("Error: " + deltaComposition + " " + mass + " " + mass2); - } - } - } - if (s.startsWith("xref: delta_mono_mass")) { - deltaMass = s.substring(s.indexOf('"') + 1, s.lastIndexOf('"')); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - private static Unimod unimod; - - static { - unimod = new Unimod(); - } -} + BufferedReader in = new BufferedReader(new InputStreamReader(is)); + + recordIDMap = new HashMap(); + idToDeltaCompositionMap = new HashMap(); + String s; + String curID = null; + String deltaMass = null; + try { + while ((s = in.readLine()) != null) { + if (s.startsWith("id:")) { + String id = s.split("\\s+")[1].trim(); + String nameLine = in.readLine(); + assert 
(nameLine.startsWith("name:")); + String name = nameLine.split("\\s+")[1].trim(); + recordIDMap.put(name, id); + curID = id; + } + if (s.startsWith("xref: delta_composition")) { + String deltaComposition = s.substring(s.indexOf('"') + 1, s.lastIndexOf('"')); + idToDeltaCompositionMap.put(curID, deltaComposition); +// Double mass = UnimodComposition.getMass(deltaComposition); +// if(mass == null) +// { +// System.out.println(deltaComposition); +// } + if (deltaMass != null) { + Double mass = UnimodComposition.getMass(deltaComposition); + Double mass2 = Double.parseDouble(deltaMass); + if (Math.abs(mass - mass2) > 0.001) { + System.out.println("Error: " + deltaComposition + " " + mass + " " + mass2); + } + } + } + if (s.startsWith("xref: delta_mono_mass")) { + deltaMass = s.substring(s.indexOf('"') + 1, s.lastIndexOf('"')); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + private static Unimod unimod; + + static { + unimod = new Unimod(); + } +} diff --git a/src/main/java/edu/ucsd/msjava/mzid/UnimodComposition.java b/src/main/java/edu/ucsd/msjava/output/UnimodComposition.java similarity index 96% rename from src/main/java/edu/ucsd/msjava/mzid/UnimodComposition.java rename to src/main/java/edu/ucsd/msjava/output/UnimodComposition.java index e4ca0b6f..474f67fc 100644 --- a/src/main/java/edu/ucsd/msjava/mzid/UnimodComposition.java +++ b/src/main/java/edu/ucsd/msjava/output/UnimodComposition.java @@ -1,133 +1,133 @@ -package edu.ucsd.msjava.mzid; - -import edu.ucsd.msjava.msutil.Atom; -import edu.ucsd.msjava.msutil.Composition; - -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Map.Entry; - -public class UnimodComposition { - - - public UnimodComposition() { - compMap = new LinkedHashMap(); - compMap.put("H", 0); - compMap.put("C", 0); - compMap.put("N", 0); - compMap.put("O", 0); - compMap.put("P", 0); - compMap.put("S", 0); - } - - public void add(Composition comp) { - add("C", comp.getC()); - 
add("H", comp.getH()); - add("N", comp.getN()); - add("O", comp.getO()); - add("S", comp.getS()); - } - - public void add(String deltaComposition) { - String[] token = deltaComposition.split("\\s+"); - for (String e : token) { - if (e.matches("\\d*?[a-zA-Z]+(\\(-?\\d+\\))?")) { - String element; - int num; - if (e.matches("\\d*?[a-zA-Z]+")) { - element = e; - num = 1; - } else { - element = e.substring(0, e.indexOf('(')); - num = Integer.parseInt(e.substring(e.indexOf('(') + 1, e.lastIndexOf(')'))); - } - add(element, num); - } else if (e.matches("\\d+\\.?\\d*")) { - double mass = Double.parseDouble(e); - add(mass); - } else { - System.err.println("Wrong Unimod delta_composition: " + deltaComposition); - System.exit(-1); - } - } - } - - public void add(String element, int number) { - Integer num = compMap.get(element); - if (num == null) - compMap.put(element, number); - else - compMap.put(element, num + number); - } - - public void add(double deltaMass) { - if (this.deltaMass == null) - this.deltaMass = deltaMass; - else - this.deltaMass += deltaMass; - } - - public Double getMass() { - double mass = 0; - Iterator> itr = compMap.entrySet().iterator(); - while (itr.hasNext()) { - Entry entry = itr.next(); - String element = entry.getKey(); - int num = entry.getValue(); - if (num == 0) - continue; - Atom atom = Atom.get(element); - if (atom == null) { - System.out.println("Error: Could not parse element/molecule \"" + element + "\""); - return null; - } - mass += atom.getMass() * num; - } - - if (deltaMass != null) - mass += deltaMass; - return mass; - } - - public static Double getMass(String unimodCompositionStr) { - UnimodComposition comp = new UnimodComposition(); - comp.add(unimodCompositionStr); - return comp.getMass(); - } - - @Override - public String toString() { - StringBuffer buf = new StringBuffer(); - Iterator> itr = compMap.entrySet().iterator(); - boolean first = true; - while (itr.hasNext()) { - Entry entry = itr.next(); - String element = 
entry.getKey(); - int num = entry.getValue(); - if (num == 0) - continue; - else if (num == 1) { - if (!first) - buf.append(" "); - else - first = false; - buf.append(element); - } else { - if (!first) - buf.append(" "); - else - first = false; - buf.append(element + "(" + num + ")"); - } - } - - if (deltaMass != null) - buf.append(" " + deltaMass); - return buf.toString(); - } - - private Map compMap; - private Double deltaMass = null; - -} +package edu.ucsd.msjava.output; + +import edu.ucsd.msjava.msutil.Atom; +import edu.ucsd.msjava.msutil.Composition; + +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Map.Entry; + +public class UnimodComposition { + + + public UnimodComposition() { + compMap = new LinkedHashMap(); + compMap.put("H", 0); + compMap.put("C", 0); + compMap.put("N", 0); + compMap.put("O", 0); + compMap.put("P", 0); + compMap.put("S", 0); + } + + public void add(Composition comp) { + add("C", comp.getC()); + add("H", comp.getH()); + add("N", comp.getN()); + add("O", comp.getO()); + add("S", comp.getS()); + } + + public void add(String deltaComposition) { + String[] token = deltaComposition.split("\\s+"); + for (String e : token) { + if (e.matches("\\d*?[a-zA-Z]+(\\(-?\\d+\\))?")) { + String element; + int num; + if (e.matches("\\d*?[a-zA-Z]+")) { + element = e; + num = 1; + } else { + element = e.substring(0, e.indexOf('(')); + num = Integer.parseInt(e.substring(e.indexOf('(') + 1, e.lastIndexOf(')'))); + } + add(element, num); + } else if (e.matches("\\d+\\.?\\d*")) { + double mass = Double.parseDouble(e); + add(mass); + } else { + System.err.println("Wrong Unimod delta_composition: " + deltaComposition); + System.exit(-1); + } + } + } + + public void add(String element, int number) { + Integer num = compMap.get(element); + if (num == null) + compMap.put(element, number); + else + compMap.put(element, num + number); + } + + public void add(double deltaMass) { + if (this.deltaMass == null) + 
this.deltaMass = deltaMass; + else + this.deltaMass += deltaMass; + } + + public Double getMass() { + double mass = 0; + Iterator> itr = compMap.entrySet().iterator(); + while (itr.hasNext()) { + Entry entry = itr.next(); + String element = entry.getKey(); + int num = entry.getValue(); + if (num == 0) + continue; + Atom atom = Atom.get(element); + if (atom == null) { + System.out.println("Error: Could not parse element/molecule \"" + element + "\""); + return null; + } + mass += atom.getMass() * num; + } + + if (deltaMass != null) + mass += deltaMass; + return mass; + } + + public static Double getMass(String unimodCompositionStr) { + UnimodComposition comp = new UnimodComposition(); + comp.add(unimodCompositionStr); + return comp.getMass(); + } + + @Override + public String toString() { + StringBuffer buf = new StringBuffer(); + Iterator> itr = compMap.entrySet().iterator(); + boolean first = true; + while (itr.hasNext()) { + Entry entry = itr.next(); + String element = entry.getKey(); + int num = entry.getValue(); + if (num == 0) + continue; + else if (num == 1) { + if (!first) + buf.append(" "); + else + first = false; + buf.append(element); + } else { + if (!first) + buf.append(" "); + else + first = false; + buf.append(element + "(" + num + ")"); + } + } + + if (deltaMass != null) + buf.append(" " + deltaMass); + return buf.toString(); + } + + private Map compMap; + private Double deltaMass = null; + +} diff --git a/src/main/java/edu/ucsd/msjava/params/ParamManager.java b/src/main/java/edu/ucsd/msjava/params/ParamManager.java index 07aac896..0dd4fafb 100644 --- a/src/main/java/edu/ucsd/msjava/params/ParamManager.java +++ b/src/main/java/edu/ucsd/msjava/params/ParamManager.java @@ -2,7 +2,7 @@ import edu.ucsd.msjava.msutil.*; import edu.ucsd.msjava.sequences.Constants; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import java.io.File; import java.util.ArrayList; diff --git 
a/src/main/java/edu/ucsd/msjava/parser/BufferedLineReader.java b/src/main/java/edu/ucsd/msjava/parser/BufferedLineReader.java index 4d5d3214..3eddae90 100644 --- a/src/main/java/edu/ucsd/msjava/parser/BufferedLineReader.java +++ b/src/main/java/edu/ucsd/msjava/parser/BufferedLineReader.java @@ -1,7 +1,6 @@ package edu.ucsd.msjava.parser; import java.io.*; -import net.pempek.unicode.UnicodeBOMInputStream; /** * Buffered line reader class diff --git a/src/main/java/edu/ucsd/msjava/parser/BufferedRandomAccessLineReader.java b/src/main/java/edu/ucsd/msjava/parser/BufferedRandomAccessLineReader.java index 8b0d3cc2..3216e238 100644 --- a/src/main/java/edu/ucsd/msjava/parser/BufferedRandomAccessLineReader.java +++ b/src/main/java/edu/ucsd/msjava/parser/BufferedRandomAccessLineReader.java @@ -1,6 +1,5 @@ package edu.ucsd.msjava.parser; -import net.pempek.unicode.UnicodeBOMInputStream; import org.apache.commons.lang3.tuple.Pair; import java.io.FileInputStream; @@ -95,23 +94,23 @@ public static Pair stripBOMAndGetLength(String str) { int copyOffset = 0; if (buf.length >= 4) { - if (bytesMatchBOM(buf, net.pempek.unicode.UnicodeBOMInputStream.BOM.UTF_32_LE)) { + if (bytesMatchBOM(buf, UnicodeBOMInputStream.BOM.UTF_32_LE)) { copyOffset = 4; - } else if (bytesMatchBOM(buf, net.pempek.unicode.UnicodeBOMInputStream.BOM.UTF_32_BE)) { + } else if (bytesMatchBOM(buf, UnicodeBOMInputStream.BOM.UTF_32_BE)) { copyOffset = 4; } } if (copyOffset == 0 && buf.length >= 3) { - if (bytesMatchBOM(buf, net.pempek.unicode.UnicodeBOMInputStream.BOM.UTF_8)) { + if (bytesMatchBOM(buf, UnicodeBOMInputStream.BOM.UTF_8)) { copyOffset = 3; } } if (copyOffset == 0 && buf.length >= 2) { - if (bytesMatchBOM(buf, net.pempek.unicode.UnicodeBOMInputStream.BOM.UTF_16_LE)) { + if (bytesMatchBOM(buf, UnicodeBOMInputStream.BOM.UTF_16_LE)) { copyOffset = 2; - } else if (bytesMatchBOM(buf, net.pempek.unicode.UnicodeBOMInputStream.BOM.UTF_16_BE)) { + } else if (bytesMatchBOM(buf, 
UnicodeBOMInputStream.BOM.UTF_16_BE)) { copyOffset = 2; } } diff --git a/src/main/java/net/pempak/unicode/UnicodeBOMInputStream.java b/src/main/java/edu/ucsd/msjava/parser/UnicodeBOMInputStream.java similarity index 96% rename from src/main/java/net/pempak/unicode/UnicodeBOMInputStream.java rename to src/main/java/edu/ucsd/msjava/parser/UnicodeBOMInputStream.java index 6103cd5a..87dce4d1 100644 --- a/src/main/java/net/pempak/unicode/UnicodeBOMInputStream.java +++ b/src/main/java/edu/ucsd/msjava/parser/UnicodeBOMInputStream.java @@ -1,295 +1,295 @@ -// (‑●‑●)> released under the WTFPL v2 license, by Gregory Pakosz (@gpakosz) - -package net.pempek.unicode; - -import java.io.IOException; -import java.io.InputStream; -import java.io.PushbackInputStream; - -/** - * The UnicodeBOMInputStream class wraps any - * InputStream and detects the presence of any Unicode BOM - * (Byte Order Mark) at its beginning, as defined by - * RFC 3629 - UTF-8, a - * transformation format of ISO 10646 - * - *

The - * Unicode FAQ - * defines 5 types of BOMs:

    - *
  • 00 00 FE FF  = UTF-32, big-endian
  • - *
  • FF FE 00 00  = UTF-32, little-endian
  • - *
  • FE FF        = UTF-16, big-endian
  • - *
  • FF FE        = UTF-16, little-endian
  • - *
  • EF BB BF     = UTF-8
  • - *

- * - *

Use the {@link #getBOM()} method to know whether a BOM has been detected - * or not. - *

- *

Use the {@link #skipBOM()} method to remove the detected BOM from the - * wrapped InputStream object.

- * - * @author Gregory Pakosz - * @version 1.0 - */ -public class UnicodeBOMInputStream extends InputStream -{ - /** - * Type safe enumeration class that describes the different types of Unicode - * BOMs. - */ - public static final class BOM - { - /** - * NONE. - */ - public static final BOM NONE = new BOM(new byte[]{}, "NONE"); - - /** - * UTF-8 BOM (EF BB BF). - */ - public static final BOM UTF_8 = new BOM(new byte[]{(byte)0xEF, - (byte)0xBB, - (byte)0xBF}, - "UTF-8"); - - /** - * UTF-16, little-endian (FF FE). - */ - public static final BOM UTF_16_LE = new BOM(new byte[]{ (byte)0xFF, - (byte)0xFE}, - "UTF-16 little-endian"); - - /** - * UTF-16, big-endian (FE FF). - */ - public static final BOM UTF_16_BE = new BOM(new byte[]{ (byte)0xFE, - (byte)0xFF}, - "UTF-16 big-endian"); - - /** - * UTF-32, little-endian (FF FE 00 00). - */ - public static final BOM UTF_32_LE = new BOM(new byte[]{ (byte)0xFF, - (byte)0xFE, - (byte)0x00, - (byte)0x00}, - "UTF-32 little-endian"); - - /** - * UTF-32, big-endian (00 00 FE FF). - */ - public static final BOM UTF_32_BE = new BOM(new byte[]{ (byte)0x00, - (byte)0x00, - (byte)0xFE, - (byte)0xFF}, - "UTF-32 big-endian"); - - /** - * Returns a String representation of this BOM - * value. - */ - public final String toString() - { - return description; - } - - /** - * Returns the bytes corresponding to this BOM value. 
- */ - public final byte[] getBytes() - { - final int length = bytes.length; - final byte[] result = new byte[length]; - - // make a defensive copy - System.arraycopy(bytes, 0, result, 0, length); - - return result; - } - - private BOM(final byte bom[], final String description) - { - assert(bom != null) : "invalid BOM: null is not allowed"; - assert(description != null) : "invalid description: null is not allowed"; - assert(description.length() != 0) : "invalid description: empty string is not allowed"; - - this.bytes = bom; - this.description = description; - } - - final byte bytes[]; - private final String description; - - } // BOM - - /** - * Constructs a new UnicodeBOMInputStream that wraps the - * specified InputStream. - * - * @param inputStream an InputStream. - * - * @throws NullPointerException when inputStream is - * null. - * @throws IOException on reading from the specified InputStream - * when trying to detect the Unicode BOM. - */ - public UnicodeBOMInputStream(final InputStream inputStream) throws NullPointerException, - IOException - { - if (inputStream == null) - throw new NullPointerException("invalid input stream: null is not allowed"); - - in = new PushbackInputStream(inputStream, 4); - - final byte bom[] = new byte[4]; - final int read = in.read(bom); - - switch(read) - { - case 4: - if ((bom[0] == (byte)0xFF) && - (bom[1] == (byte)0xFE) && - (bom[2] == (byte)0x00) && - (bom[3] == (byte)0x00)) - { - this.bom = BOM.UTF_32_LE; - break; - } - else - if ((bom[0] == (byte)0x00) && - (bom[1] == (byte)0x00) && - (bom[2] == (byte)0xFE) && - (bom[3] == (byte)0xFF)) - { - this.bom = BOM.UTF_32_BE; - break; - } - - case 3: - if ((bom[0] == (byte)0xEF) && - (bom[1] == (byte)0xBB) && - (bom[2] == (byte)0xBF)) - { - this.bom = BOM.UTF_8; - break; - } - - case 2: - if ((bom[0] == (byte)0xFF) && - (bom[1] == (byte)0xFE)) - { - this.bom = BOM.UTF_16_LE; - break; - } - else - if ((bom[0] == (byte)0xFE) && - (bom[1] == (byte)0xFF)) - { - this.bom = 
BOM.UTF_16_BE; - break; - } - - default: - this.bom = BOM.NONE; - break; - } - - if (read > 0) - in.unread(bom, 0, read); - } - - /** - * Returns the BOM that was detected in the wrapped - * InputStream object. - * - * @return a BOM value. - */ - public final BOM getBOM() - { - // BOM type is immutable. - return bom; - } - - /** - * Skips the BOM that was found in the wrapped - * InputStream object. - * - * @return this UnicodeBOMInputStream. - * - * @throws IOException when trying to skip the BOM from the wrapped - * InputStream object. - */ - public final synchronized UnicodeBOMInputStream skipBOM() throws IOException - { - if (!skipped) - { - in.skip(bom.bytes.length); - skipped = true; - } - return this; - } - - @Override - public int read() throws IOException - { - return in.read(); - } - - @Override - public int read(final byte b[]) throws IOException, - NullPointerException - { - return in.read(b, 0, b.length); - } - - @Override - public int read(final byte b[], - final int off, - final int len) throws IOException, - NullPointerException - { - return in.read(b, off, len); - } - - @Override - public long skip(final long n) throws IOException - { - return in.skip(n); - } - - @Override - public int available() throws IOException - { - return in.available(); - } - - @Override - public void close() throws IOException - { - in.close(); - } - - @Override - public synchronized void mark(final int readlimit) - { - in.mark(readlimit); - } - - @Override - public synchronized void reset() throws IOException - { - in.reset(); - } - - @Override - public boolean markSupported() - { - return in.markSupported(); - } - - private final PushbackInputStream in; - private final BOM bom; - private boolean skipped = false; - -} // UnicodeBOMInputStream +// (‑●‑●)> released under the WTFPL v2 license, by Gregory Pakosz (@gpakosz) + +package edu.ucsd.msjava.parser; + +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; + +/** + * The 
UnicodeBOMInputStream class wraps any + * InputStream and detects the presence of any Unicode BOM + * (Byte Order Mark) at its beginning, as defined by + * RFC 3629 - UTF-8, a + * transformation format of ISO 10646 + * + *

The + * Unicode FAQ + * defines 5 types of BOMs:

    + *
  • 00 00 FE FF  = UTF-32, big-endian
  • + *
  • FF FE 00 00  = UTF-32, little-endian
  • + *
  • FE FF        = UTF-16, big-endian
  • + *
  • FF FE        = UTF-16, little-endian
  • + *
  • EF BB BF     = UTF-8
  • + *

+ * + *

Use the {@link #getBOM()} method to know whether a BOM has been detected + * or not. + *

+ *

Use the {@link #skipBOM()} method to remove the detected BOM from the + * wrapped InputStream object.

+ * + * @author Gregory Pakosz + * @version 1.0 + */ +public class UnicodeBOMInputStream extends InputStream +{ + /** + * Type safe enumeration class that describes the different types of Unicode + * BOMs. + */ + public static final class BOM + { + /** + * NONE. + */ + public static final BOM NONE = new BOM(new byte[]{}, "NONE"); + + /** + * UTF-8 BOM (EF BB BF). + */ + public static final BOM UTF_8 = new BOM(new byte[]{(byte)0xEF, + (byte)0xBB, + (byte)0xBF}, + "UTF-8"); + + /** + * UTF-16, little-endian (FF FE). + */ + public static final BOM UTF_16_LE = new BOM(new byte[]{ (byte)0xFF, + (byte)0xFE}, + "UTF-16 little-endian"); + + /** + * UTF-16, big-endian (FE FF). + */ + public static final BOM UTF_16_BE = new BOM(new byte[]{ (byte)0xFE, + (byte)0xFF}, + "UTF-16 big-endian"); + + /** + * UTF-32, little-endian (FF FE 00 00). + */ + public static final BOM UTF_32_LE = new BOM(new byte[]{ (byte)0xFF, + (byte)0xFE, + (byte)0x00, + (byte)0x00}, + "UTF-32 little-endian"); + + /** + * UTF-32, big-endian (00 00 FE FF). + */ + public static final BOM UTF_32_BE = new BOM(new byte[]{ (byte)0x00, + (byte)0x00, + (byte)0xFE, + (byte)0xFF}, + "UTF-32 big-endian"); + + /** + * Returns a String representation of this BOM + * value. + */ + public final String toString() + { + return description; + } + + /** + * Returns the bytes corresponding to this BOM value. 
+ */ + public final byte[] getBytes() + { + final int length = bytes.length; + final byte[] result = new byte[length]; + + // make a defensive copy + System.arraycopy(bytes, 0, result, 0, length); + + return result; + } + + private BOM(final byte bom[], final String description) + { + assert(bom != null) : "invalid BOM: null is not allowed"; + assert(description != null) : "invalid description: null is not allowed"; + assert(description.length() != 0) : "invalid description: empty string is not allowed"; + + this.bytes = bom; + this.description = description; + } + + final byte bytes[]; + private final String description; + + } // BOM + + /** + * Constructs a new UnicodeBOMInputStream that wraps the + * specified InputStream. + * + * @param inputStream an InputStream. + * + * @throws NullPointerException when inputStream is + * null. + * @throws IOException on reading from the specified InputStream + * when trying to detect the Unicode BOM. + */ + public UnicodeBOMInputStream(final InputStream inputStream) throws NullPointerException, + IOException + { + if (inputStream == null) + throw new NullPointerException("invalid input stream: null is not allowed"); + + in = new PushbackInputStream(inputStream, 4); + + final byte bom[] = new byte[4]; + final int read = in.read(bom); + + switch(read) + { + case 4: + if ((bom[0] == (byte)0xFF) && + (bom[1] == (byte)0xFE) && + (bom[2] == (byte)0x00) && + (bom[3] == (byte)0x00)) + { + this.bom = BOM.UTF_32_LE; + break; + } + else + if ((bom[0] == (byte)0x00) && + (bom[1] == (byte)0x00) && + (bom[2] == (byte)0xFE) && + (bom[3] == (byte)0xFF)) + { + this.bom = BOM.UTF_32_BE; + break; + } + + case 3: + if ((bom[0] == (byte)0xEF) && + (bom[1] == (byte)0xBB) && + (bom[2] == (byte)0xBF)) + { + this.bom = BOM.UTF_8; + break; + } + + case 2: + if ((bom[0] == (byte)0xFF) && + (bom[1] == (byte)0xFE)) + { + this.bom = BOM.UTF_16_LE; + break; + } + else + if ((bom[0] == (byte)0xFE) && + (bom[1] == (byte)0xFF)) + { + this.bom = 
BOM.UTF_16_BE; + break; + } + + default: + this.bom = BOM.NONE; + break; + } + + if (read > 0) + in.unread(bom, 0, read); + } + + /** + * Returns the BOM that was detected in the wrapped + * InputStream object. + * + * @return a BOM value. + */ + public final BOM getBOM() + { + // BOM type is immutable. + return bom; + } + + /** + * Skips the BOM that was found in the wrapped + * InputStream object. + * + * @return this UnicodeBOMInputStream. + * + * @throws IOException when trying to skip the BOM from the wrapped + * InputStream object. + */ + public final synchronized UnicodeBOMInputStream skipBOM() throws IOException + { + if (!skipped) + { + in.skip(bom.bytes.length); + skipped = true; + } + return this; + } + + @Override + public int read() throws IOException + { + return in.read(); + } + + @Override + public int read(final byte b[]) throws IOException, + NullPointerException + { + return in.read(b, 0, b.length); + } + + @Override + public int read(final byte b[], + final int off, + final int len) throws IOException, + NullPointerException + { + return in.read(b, off, len); + } + + @Override + public long skip(final long n) throws IOException + { + return in.skip(n); + } + + @Override + public int available() throws IOException + { + return in.available(); + } + + @Override + public void close() throws IOException + { + in.close(); + } + + @Override + public synchronized void mark(final int readlimit) + { + in.mark(readlimit); + } + + @Override + public synchronized void reset() throws IOException + { + in.reset(); + } + + @Override + public boolean markSupported() + { + return in.markSupported(); + } + + private final PushbackInputStream in; + private final BOM bom; + private boolean skipped = false; + +} // UnicodeBOMInputStream diff --git a/src/main/resources/META-INF/MANIFEST.MF b/src/main/resources/META-INF/MANIFEST.MF index 8d9e7c49..9fc3552d 100644 --- a/src/main/resources/META-INF/MANIFEST.MF +++ b/src/main/resources/META-INF/MANIFEST.MF @@ -1,3 +1,3 
@@ Manifest-Version: 1.0 Class-Path: . -Main-Class: edu.ucsd.msjava.ui.MSGFPlus +Main-Class: edu.ucsd.msjava.cli.MSGFPlus diff --git a/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java b/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java index b3f9a32f..59ac4251 100644 --- a/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java +++ b/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java @@ -2,7 +2,7 @@ import edu.ucsd.msjava.params.FileParameter; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; @@ -51,4 +51,4 @@ public void parse() throws URISyntaxException { } -} \ No newline at end of file +} diff --git a/src/test/java/msgfplus/TestCandidatePeptideGrid.java b/src/test/java/msgfplus/TestCandidatePeptideGrid.java index 8ca8ffad..1ef6b6f5 100644 --- a/src/test/java/msgfplus/TestCandidatePeptideGrid.java +++ b/src/test/java/msgfplus/TestCandidatePeptideGrid.java @@ -11,7 +11,7 @@ import static org.junit.Assert.*; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Test; diff --git a/src/test/java/msgfplus/TestCandidatePeptideGridConsideringMetCleavage.java b/src/test/java/msgfplus/TestCandidatePeptideGridConsideringMetCleavage.java index 0de25d3c..119a7632 100644 --- a/src/test/java/msgfplus/TestCandidatePeptideGridConsideringMetCleavage.java +++ b/src/test/java/msgfplus/TestCandidatePeptideGridConsideringMetCleavage.java @@ -11,7 +11,7 @@ import static org.junit.Assert.*; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Test; diff --git a/src/test/java/msgfplus/TestCollaboration.java b/src/test/java/msgfplus/TestCollaboration.java index 15eb73f2..f78c54a1 100644 --- a/src/test/java/msgfplus/TestCollaboration.java +++ 
b/src/test/java/msgfplus/TestCollaboration.java @@ -8,7 +8,7 @@ import org.junit.Test; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; @Ignore public class TestCollaboration { diff --git a/src/test/java/msgfplus/TestDirectPinWriter.java b/src/test/java/msgfplus/TestDirectPinWriter.java index 95c470f7..40e3613b 100644 --- a/src/test/java/msgfplus/TestDirectPinWriter.java +++ b/src/test/java/msgfplus/TestDirectPinWriter.java @@ -5,9 +5,9 @@ import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.ActivationMethod; import edu.ucsd.msjava.msutil.Enzyme; -import edu.ucsd.msjava.mzid.DirectPinWriter; +import edu.ucsd.msjava.output.DirectPinWriter; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; @@ -24,7 +24,7 @@ * Shape tests for the Percolator {@code .pin} output path (Q7). * * These exercise the CLI/flag plumbing and the header emitted by - * {@link edu.ucsd.msjava.mzid.DirectPinWriter}. A full end-to-end + * {@link edu.ucsd.msjava.output.DirectPinWriter}. A full end-to-end * search-to-pin run is exercised by the integration tests under * {@code src/test/resources/} when external spectra are available; * here we focus on the parts we can verify without running the @@ -106,14 +106,14 @@ public void pinHeaderColumnsIncludeRequiredPercolatorFields() throws Exception { // stream. We assert the header line contains the Percolator-required // column names. java.lang.reflect.Method writeHeader = - edu.ucsd.msjava.mzid.DirectPinWriter.class.getDeclaredMethod( + edu.ucsd.msjava.output.DirectPinWriter.class.getDeclaredMethod( "writeHeader", java.io.PrintStream.class, int.class, int.class); writeHeader.setAccessible(true); // Build a DirectPinWriter with null sa/specAcc — header path doesn't // touch them. 
If the constructor starts using them, this test needs // to evolve; for now it's a pure shape check. - java.lang.reflect.Constructor ctor = edu.ucsd.msjava.mzid.DirectPinWriter.class + java.lang.reflect.Constructor ctor = edu.ucsd.msjava.output.DirectPinWriter.class .getDeclaredConstructor( SearchParams.class, edu.ucsd.msjava.msutil.AminoAcidSet.class, diff --git a/src/test/java/msgfplus/TestIPRG.java b/src/test/java/msgfplus/TestIPRG.java index 950e2613..be730174 100644 --- a/src/test/java/msgfplus/TestIPRG.java +++ b/src/test/java/msgfplus/TestIPRG.java @@ -8,7 +8,7 @@ import org.junit.Test; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; public class TestIPRG { diff --git a/src/test/java/msgfplus/TestMSUtils.java b/src/test/java/msgfplus/TestMSUtils.java index fe7ec847..b7c4ed79 100644 --- a/src/test/java/msgfplus/TestMSUtils.java +++ b/src/test/java/msgfplus/TestMSUtils.java @@ -4,7 +4,7 @@ import java.net.URISyntaxException; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Test; import edu.ucsd.msjava.msutil.AminoAcidSet; import edu.ucsd.msjava.msutil.IonType; diff --git a/src/test/java/msgfplus/TestMisc.java b/src/test/java/msgfplus/TestMisc.java index 6103486f..7af58d30 100644 --- a/src/test/java/msgfplus/TestMisc.java +++ b/src/test/java/msgfplus/TestMisc.java @@ -6,7 +6,7 @@ import edu.ucsd.msjava.msdbsearch.CompactFastaSequence; import edu.ucsd.msjava.msdbsearch.ReverseDB; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Ignore; import org.junit.Test; diff --git a/src/test/java/msgfplus/TestPercolator.java b/src/test/java/msgfplus/TestPercolator.java index dc203c25..4abdfd64 100644 --- a/src/test/java/msgfplus/TestPercolator.java +++ b/src/test/java/msgfplus/TestPercolator.java @@ -9,7 +9,7 @@ import org.junit.Test; import 
edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; public class TestPercolator { diff --git a/src/test/java/msgfplus/TestPrecursorCalIntegration.java b/src/test/java/msgfplus/TestPrecursorCalIntegration.java index c20c8b3c..573adf9d 100644 --- a/src/test/java/msgfplus/TestPrecursorCalIntegration.java +++ b/src/test/java/msgfplus/TestPrecursorCalIntegration.java @@ -4,7 +4,7 @@ import edu.ucsd.msjava.msutil.DBSearchIOFiles; import edu.ucsd.msjava.msutil.SpecFileFormat; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; diff --git a/src/test/java/msgfplus/TestPrecursorCalScaffolding.java b/src/test/java/msgfplus/TestPrecursorCalScaffolding.java index 37458112..7673195e 100644 --- a/src/test/java/msgfplus/TestPrecursorCalScaffolding.java +++ b/src/test/java/msgfplus/TestPrecursorCalScaffolding.java @@ -6,7 +6,7 @@ import edu.ucsd.msjava.msutil.DBSearchIOFiles; import edu.ucsd.msjava.msutil.SpecFileFormat; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; diff --git a/src/test/java/msgfplus/TestRunManifestWriter.java b/src/test/java/msgfplus/TestRunManifestWriter.java index 86b26f22..4707b7a8 100644 --- a/src/test/java/msgfplus/TestRunManifestWriter.java +++ b/src/test/java/msgfplus/TestRunManifestWriter.java @@ -5,7 +5,7 @@ import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.DBSearchIOFiles; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; diff --git a/src/test/java/msgfplus/TestSA.java b/src/test/java/msgfplus/TestSA.java index 5ff99911..ac639540 100644 --- a/src/test/java/msgfplus/TestSA.java +++ b/src/test/java/msgfplus/TestSA.java 
@@ -6,7 +6,7 @@ import edu.ucsd.msjava.msdbsearch.SuffixArrayForMSGFDB; import edu.ucsd.msjava.msutil.Composition; import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.ui.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Ignore; import org.junit.Test; From 6d998f3cd06feda511a0f46616c54e19a4baf237 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 09:20:37 +0100 Subject: [PATCH 11/34] docs(plans): add search-sync-cleanup + parameter-modernization plans - search-sync-cleanup.md: ship record for PR #25 (search-path sync cleanup + per-task result buffers). - parameter-modernization.md: proposed plan to migrate the custom edu.ucsd.msjava.params hierarchy to picocli with a thin MSGFPlus-specific compat layer for legacy aliases, config files, and repeated mod/CustomAA entries. Status: proposed. The parameter-modernization plan is documentation only on this branch -- it explicitly recommends a separate refactor branch for implementation, not bundling with this performance cleanup. --- .claude/plans/parameter-modernization.md | 159 +++++++++++++++++++++++ .claude/plans/search-sync-cleanup.md | 133 +++++++++++++++++++ 2 files changed, 292 insertions(+) create mode 100644 .claude/plans/parameter-modernization.md create mode 100644 .claude/plans/search-sync-cleanup.md diff --git a/.claude/plans/parameter-modernization.md b/.claude/plans/parameter-modernization.md new file mode 100644 index 00000000..19a6961f --- /dev/null +++ b/.claude/plans/parameter-modernization.md @@ -0,0 +1,159 @@ +# Plan: modernize MS-GF+ parameter handling + +**Status: proposed** +Branch: `perf/search-sync-cleanup` (worktree at +`/Users/yperez/work/msgfplus-workspace/search-sync-cleanup`). 
+ +## Why this exists + +The current parameter stack under `edu.ucsd.msjava.params` is doing +several jobs at once: +- command-line parsing +- type conversion +- validation +- help/usage rendering +- config-file alias handling +- backward-compatibility shims + +That works, but it spreads option behavior across many small classes +(`Parameter`, `NumberParameter`, `RangeParameter`, `ToleranceParameter`, +`FileParameter`, enum wrappers, and `ParamManager`). The result is more +code than we need for a solved problem and a higher risk of subtle +parsing drift when new flags are added. + +## Goals + +- Reduce the amount of custom CLI parsing code. +- Keep existing MS-GF+ command-line behavior stable where practical. +- Preserve current config-file semantics in the first migration step. +- Keep `SearchParams` as the internal domain model for search settings. +- Improve help/usage generation and validation error consistency. + +## Non-goals + +- No search algorithm changes. +- No performance claim for the search itself; parsing happens once at + startup and is not a runtime hotspot. +- No forced removal of legacy config-file aliases in phase 1. +- No broad package cleanup bundled into this effort. + +## Recommended direction + +Adopt `picocli` for command-line parsing and help generation, while +keeping a thin MSGF+-specific compatibility layer for: +- legacy option names and aliases +- config-file parsing +- repeated modification/custom-AA entries +- conversion into `SearchParams`, `AminoAcidSet`, `Tolerance`, and + related domain objects + +## Proposed migration shape + +### Phase 1: introduce a typed CLI model beside `ParamManager` + +- Add a new options class for `MSGFPlus` under `edu.ucsd.msjava.cli`. +- Represent flags as typed fields with defaults, required markers, + and descriptions. 
+- Add custom `picocli` converters for: + - precursor mass tolerance + - integer and float ranges + - output format + - precursor calibration mode + - file/directory validation +- Keep `ParamManager` intact during this phase. +- Add an adapter that maps parsed CLI options into the current + `SearchParams` inputs. + +Success criteria: +- `MSGFPlus` can parse its current CLI arguments through the new path. +- Generated help text is complete and readable. +- Existing tests for parameter behavior still pass or are updated + mechanically where output formatting differs. + +### Phase 2: preserve config-file compatibility explicitly + +- Keep `ParamParser` or replace it with a thinner reader that still + accepts the current `key=value` format. +- Centralize legacy config-name alias resolution in one place instead + of scattering it through `ParamNameEnum`. +- Support repeated config entries for: + - `DynamicMod` + - `StaticMod` + - `CustomAA` +- Feed config values into the same typed options model used by CLI. + +Success criteria: +- Existing example parameter files still load. +- Duplicate-entry behavior for mods/custom amino acids is preserved. +- Command-line values continue to override config-file values. + +### Phase 3: move validation out of the custom parameter hierarchy + +- Replace per-type `parse()` methods with: + - `picocli` conversion + - explicit validation methods on the typed options object + - targeted domain-level validation while building `SearchParams` +- Collapse or remove custom classes that are no longer needed: + - `Parameter` + - `NumberParameter` + - `RangeParameter` + - `IntParameter` + - `FloatParameter` + - `DoubleParameter` + - `IntRangeParameter` + - `FloatRangeParameter` + - enum parameter wrappers + +Success criteria: +- No user-visible behavior regressions on required flags, defaults, + range checks, or enum choices. +- Validation failures still produce actionable messages. 
+ +### Phase 4: reduce `ParamManager` to compatibility-only or retire it + +- If any remaining tools still depend on `ParamManager`, keep it only as + a compatibility facade over the new parser. +- Otherwise remove `ParamManager` from the active CLI path. +- Decide whether `MSGFDB` migrates in the same PR series or follows + after `MSGFPlus` is stable. + +## Main risks + +- Help text and error messages may change in ways that break tests or + documentation. +- Config-file behavior is more important than it looks; it includes + legacy aliases and repeated entries that generic CLI libraries do not + model by default. +- `MSGFDB` and `MSGFPlus` share parts of the current stack, so an + incomplete migration could increase duplication before it decreases. + +## Validation plan + +- Add focused tests for: + - required arguments + - default values + - bad range syntax + - enum parsing + - file existence checks + - config-file override precedence + - repeated modification/custom-AA entries +- Keep existing `SearchParams` tests green. +- Run at least one end-to-end `MSGFPlus` smoke test on a known fixture. +- Compare old vs new parser outcomes for a representative set of real + command lines and config files. + +## Suggested implementation order + +1. Add `picocli` dependency. +2. Build a typed `MSGFPlusOptions` class and converters. +3. Parse CLI into the new options class without removing `ParamManager`. +4. Add an adapter into the current `SearchParams` build path. +5. Port config-file handling. +6. Remove unused custom parameter classes. +7. Migrate `MSGFDB` only after `MSGFPlus` is stable. + +## Recommendation on branch strategy + +Do this in a dedicated refactor branch, not as part of a performance +cleanup PR. The expected win is maintainability and correctness, not +search throughput, and the surface area touches the public CLI. 
diff --git a/.claude/plans/search-sync-cleanup.md b/.claude/plans/search-sync-cleanup.md new file mode 100644 index 00000000..bf7ec3e6 --- /dev/null +++ b/.claude/plans/search-sync-cleanup.md @@ -0,0 +1,133 @@ +# Plan: search-path sync cleanup + per-task result buffers + +**Status: SHIPPED in PR #25** (https://github.com/bigbio/msgfplus/pull/25) +Branch: `perf/search-sync-cleanup` (worktree at +`/Users/yperez/work/msgfplus-workspace/search-sync-cleanup`). + +Successor to PR #24. Pure refactor + instrumentation — no scoring, +parser, or `.pin` feature changes. Output bit-identical to dev's tip +on every measurable axis. + +## What shipped (6 commits) + +1. **T1 — per-task wall stats + tail-imbalance summary** + `RunMSGFPlus` captures preprocess / db-search / compute-evalue / + total wall into a `TaskWallStats` accessor; `MSGFPlus.runMSGFPlus` + prints a one-line summary at end of search: + ``` + Task wall summary (n=12): min=101.7s median=224.2s p95=246.4s + max=246.4s total=2356.7s tail_gap=22.2s (10% of median) + ``` + On Astral the measured `tail_gap` is **10 % of median**, which means + T2 and T3 can't deliver substantial wins on this workload. + +2. **Drop dead `synchronized` wrappers in DBScanner + ScoredSpectraMap.** + Each instance is task-local (verified: no internal fork-out in + `dbSearch`, no shared instance across threads). Plain `HashMap` / + `TreeMap` replace the `Collections.synchronizedMap` / + `synchronizedSortedMap` wrappers; `synchronized` modifier dropped + from `addDBMatches`, `generateSpecIndexDBMatchMap`, + `addResultsToList`, `addDBSearchResults`. Memory-visibility safety + preserved via `awaitTermination`'s happens-before. + +3. **Per-task local result buffers + final merge.** + Replaced the global `Collections.synchronizedList` + with a per-task `ArrayList`. Each `RunMSGFPlus` owns its own buffer; + main thread drains all buffers after `awaitTermination`. 
+ `RunMSGFPlus`'s constructor drops the `resultList` parameter; new + `getResults()` accessor. + +4. **T2 — `-Dmsgfplus.numTasksPerThread=N`** (default 3, unchanged). + Lets operators raise the multiplier on datasets where T1's + `tail_gap` shows real imbalance. + +5. **T3 — `-Dmsgfplus.useForkJoin=true`** (default false, unchanged). + Opt-in `ForkJoinPool` swap. Default keeps + `ThreadPoolExecutorWithExceptions` (which retains progress + reporting + exception-capture-via-afterExecute). FJP path uses + `Future.get()` for exception propagation. + +6. **Polish — tighter result-buffer merge + `drainResultsTo` + reused + null sink.** Static `NULL_PRINT_STREAM` cached instead of allocated + per `run()`; `drainResultsTo(dest)` clears per-task buffers + immediately after merge so heap is collectible; pre-size merged + `ArrayList` to `sum(t.getResultCount())` to avoid resize-and-copy; + `submittedTasks.clear()` after summary drops strong refs to all 12 + task instances before the FDR / write phase. + +## Validation gate cleared (Astral 3-arm + Percolator) + +Astral 3-arm cold, 8 GB heap, 4 threads, default sysprops. +**All 8 parity numbers bit-identical to dev's tip:** + +| Metric | dev | this branch | +|---|---:|---:| +| armB raw targets | 89,479 | 89,479 ✓ | +| armB raw decoys | 46,792 | 46,792 ✓ | +| armB 1 % FDR targets | 35,818 | 35,818 ✓ | +| armB 5 % FDR targets | 40,408 | 40,408 ✓ | +| armC raw targets | 89,360 | 89,360 ✓ | +| armC raw decoys | 46,913 | 46,913 ✓ | +| armC 1 % FDR targets | 35,767 | 35,767 ✓ | +| armC 5 % FDR targets | 40,426 | 40,426 ✓ | + +Walltime delta vs master in the same run: +- armB: 752.2s vs 848.8s = **−11.4 %** +- armC: 798.2s vs 848.8s = **−5.9 %** + +(First run came in with armC at 6298s; root-caused to OS thrashing — +load avg 5-8, ~120 MB free RAM, 165M page reclaims, Rancher VM eating +1 GB. Re-ran after stopping Rancher; wall normalized. Not a code +issue. Documented in PR #25 description.) + +## What we learned vs. 
expected wins + +The plan predicted: +- Step 1 (sync removal): 0–2 % wall. Possibly negative if biased + locking was helping. Code clarity is the more reliable win. +- Step 2 (per-task buffers): 2–8 % wall, scaling with PSM count. +- T2 / T3: only worth doing if profiler shows real tail-imbalance. + +What we measured: +- Combined wall improvement: **11.4 % on armB, 5.9 % on armC** — + better than the upper end of the per-step predictions, suggesting + the gains compound (less monitor traffic + cheaper drain phase). +- T1's measured tail_gap on Astral: **10 % of median** — small enough + that T2/T3 default-on would give marginal wins. They ship as opt-in + knobs precisely so they don't gate the default behavior. + +## What this branch is NOT + +Not a fragment-index revival. Not a primitive mass-window port. Not +a peak-storage refactor (`Peak` → `float[]`). Not a CLI / format +change. Originated from a third-party review of PR #24. + +## Follow-ups (out of scope for this PR) + +- **Profile on TMT and a metaproteomic FASTA** with the new T1 + summary. Astral's 10 % tail_gap might not represent uneven + workloads — homolog-rich DBs are the place T2/T3 should bite. +- **`DatabaseMatch.indices` from `TreeSet` to primitive + `int[]`** (M1 from the broader memory-roadmap discussion). Highest + expected impact for homolog-heavy databases (5-12× memory reduction + per match); needs a metaproteomic test fixture to validate. +- **Parser cache stores raw `float[] mz, float[] intensity`** (M3), + with a fresh `Spectrum` built per `getSpectrumBySpecIndex`. Side + benefit: cache-layer immutability instead of cloneSpectrum. +- **`Peak`/`Spectrum` storage refactor** (M2). Multi-PR. Big surface + area. Defer until M1 + M3 land. 
+ +## Open questions resolved + +- **Did the custom `ThreadPoolExecutorWithExceptions` preserve + awaitTermination's happens-before on the exception path?** Yes — + observed bit-identical results in armB / armC across the 3-arm + benchmark, which would not be the case if visibility were broken. + +- **Was HotSpot already eliding the uncontended monitors?** Probably + partially. Step 2 (sync removal) on its own gives an unmeasured + delta; combined with steps 3–6 the total is 11.4 %. We can't + attribute that 11.4 % to any single commit without per-commit + benchmarks, but the polish commit (#6) likely contributes + meaningfully via the pre-sized `ArrayList` and immediate + per-task-buffer release. From 4bb388c0fcb9ee1efd68a5364e9ed978b6367460 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 10:54:01 +0100 Subject: [PATCH 12/34] build(deps): add picocli 4.7.6 + flag inventory for params modernization Picocli is the foundation for the parameter-modernization plan (see .claude/plans/parameter-modernization.md): replace the custom edu.ucsd.msjava.params hierarchy with declarative @Option flags, auto-generated help, and a thin compat layer for the legacy config-file format. The inventory artifact enumerates every MSGFPlus CLI flag (34 total: 27 visible, 7 hidden) plus the parsing semantics each one currently relies on (asymmetric tolerance, dynamic-enum registries, range inclusivity flips, trailing-! number quirk, ~13 config-file aliases, etc.). It is the foundation for the typed cli.MSGFPlusOptions class in the next commit. No code changes yet -- dep + docs only. 
--- .../parameter-modernization-flag-inventory.md | 90 +++++++++++++++++++ pom.xml | 5 ++ 2 files changed, 95 insertions(+) create mode 100644 .claude/plans/parameter-modernization-flag-inventory.md diff --git a/.claude/plans/parameter-modernization-flag-inventory.md b/.claude/plans/parameter-modernization-flag-inventory.md new file mode 100644 index 00000000..68ac2d6d --- /dev/null +++ b/.claude/plans/parameter-modernization-flag-inventory.md @@ -0,0 +1,90 @@ +# MS-GF+ flag inventory (Phase 1 input) + +Snapshot of every flag registered by `ParamManager.addMSGFPlusParams()` +plus the parsing semantics each one currently relies on. This is the +foundation document for the Phase 1 picocli rewrite described in +`parameter-modernization.md`. Total: 39 flags (31 visible + 8 hidden, counting the legacy `-u`). +Required: `-s`, `-d`. + +## Visible flags + +| Short | Canonical name | Type | Default | Bounds | Notes | +|---|---|---|---|---|---| +| `-conf` | `ConfigurationFile` | file | — | exists | Config file; CLI overrides config | +| `-s` | `SpectrumFile` | file/dir | — | exists | **Required.** mzML/mzXML/mgf/ms2/pkl/_dta.txt or directory | +| `-d` | `DatabaseFile` | file | — | exists | **Required.** *.fasta / *.fa / *.faa | +| `-decoy` | `DecoyPrefix` | string | `DECOY_` | — | Decoy protein prefix | +| `-o` | `OutputFile` | file | `.pin` | — | *.pin (default) or *.tsv | +| `-t` | `PrecursorMassTolerance` | tolerance | `20ppm` | ≥0 | Symmetric (`20ppm`) or asymmetric (`0.5Da,2.5Da`); units must match | +| `-ti` | `IsotopeErrorRange` | int range | `0,1` | ≥0, max-incl | Isotope-error window, both ends inclusive | +| `-m` | `FragmentationMethodID` | dyn-enum | `ASWRITTEN` | — | 0=as-written, 1=CID, 2=ETD, 3=HCD | +| `-inst` | `InstrumentID` | dyn-enum | `LOW_RES_LTQ` | registry | `InstrumentType` registry-driven | +| `-e` | `EnzymeID` | dyn-enum | `TRYPSIN` | registry | `Enzyme` registry-driven | +| `-protocol` | `ProtocolID` | dyn-enum | `AUTOMATIC` | registry | `Protocol` registry-driven | +| 
`-ntt` | `NTT` | enum | `2` | 0..2 | Number of tolerable termini | +| `-mod` | `ModificationFile` | file | built-in (C+57) | exists | Mod file; config-file path also accepts `StaticMod=`/`DynamicMod=`/`CustomAA=` | +| `-minLength` | `MinPepLength` | int | `6` | ≥1 | | +| `-maxLength` | `MaxPepLength` | int | `40` | ≥1 | | +| `-minCharge` | `MinCharge` | int | `2` | ≥1 | | +| `-maxCharge` | `MaxCharge` | int | `3` | ≥1 | | +| `-n` | `NumMatchesPerSpec` | int | `1` | ≥1 | | +| `-thread` | `NumThreads` | int | `Runtime.availableProcessors()` | ≥1 | | +| `-tasks` | `NumTasks` | int | `0` (auto) | ≥-10 | 0=auto, >0=fixed, <0=N×threads | +| `-minSpectraPerThread` | `MinSpectraPerThread` | int | `250` | ≥1 | | +| `-verbose` | `Verbose` | enum | `0` | 0..1 | 0=total, 1=per-thread | +| `-tda` | `TDA` | enum | `0` | 0..1 | 0=no decoy, 1=concat decoy search | +| `-addFeatures` | `AddFeatures` | enum | `0` | 0..1 | Percolator extra features | +| `-outputFormat` | `OutputFormat` | enum | `pin` | pin/tsv | mzIdentML removed | +| `-precursorCal` | `PrecursorCal` | string | `auto` | auto/on/off | Case-insensitive | +| `-ccm` | `ChargeCarrierMass` | double | `1.00727649` | >0.1 | Proton mass default | +| `-maxMissedCleavages` | `MaxMissedCleavages` | int | `-1` | ≥-1 | -1 = unlimited | +| `-numMods` | `NumMods` | int | `3` | ≥0 | Max dynamic mods per peptide | +| `-allowDenseCentroidedPeaks` | `AllowDenseCentroidedPeaks` | enum | `0` | 0..1 | | +| `-msLevel` | `MSLevel` | int range | `2,2` | ≥1, max-incl | `min,max` or single | +| `-u` | `PrecursorMassToleranceUnits` | enum | `2` | 0..2 | **Hidden** — legacy; 0=Da, 1=ppm, 2=as-written | + +## Hidden flags + +| Short | Canonical name | Type | Default | Notes | +|---|---|---|---|---| +| `-dd` | `DBIndexDir` | dir | — | Database index dir | +| `-index` | `SpecIndex` | int range | `1,INT_MAX-1` | Spectrum index range, both inclusive | +| `-edgeScore` | `EdgeScore` | enum | `0` | 0=use, 1=skip | +| `-minNumPeaks` | `MinNumPeaks` | int | 
`Constants.MIN_NUM_PEAKS_PER_SPECTRUM` | | +| `-iso` | `NumIsoforms` | int | `Constants.NUM_VARIANTS_PER_PEPTIDE` | | +| `-ignoreMetCleavage` | `IgnoreMetCleavage` | enum | `0` | 0=consider, 1=ignore | +| `-minDeNovoScore` | `MinDeNovoScore` | int | `Constants.MIN_DE_NOVO_SCORE` | | + +## Sharp edges the picocli rewrite must preserve + +1. **Asymmetric tolerance.** `-t 0.5Da,2.5Da` → left tolerance (observed < theoretical) ≠ right tolerance. Both sides must use the same unit. Numeric-only value (e.g. `20`) defaults to Da. Trailing unit suffix is case-insensitive (`Da`/`ppm`/`Th`). +2. **Range inclusivity is per-flag.** `IntRangeParameter` defaults to `min` inclusive / `max` exclusive, but `-ti`, `-index`, `-msLevel` flip max to inclusive via `.setMaxInclusive()`. +3. **Dynamic enums.** `-inst`, `-e`, `-protocol`, `-m` are registry-driven (`InstrumentType`, `Enzyme`, `Protocol`, `ActivationMethod`). Numeric indices depend on registry load order; help text is generated at startup. Picocli converters must read from the same registries, not hardcode indices. +4. **`OutputFormat` legacy mapping is gone.** Old `0=mzIdentML`, `2=both` are no longer accepted; only `pin` (0) and `tsv` (1) remain. Numeric indices are deprecated but still parse internally. +5. **`-precursorCal` is a string, not an enum class.** Values: `auto` / `on` / `off` (case-insensitive, `.trim()`-ed). `auto` means "run pre-pass, apply only if ≥200 confident PSMs collected". +6. **Trailing `!` on numbers.** `IntParameter` and `DoubleParameter` strip trailing `!` (legacy DMS config-file integration). Decide if Phase 1 keeps this quirk. +7. **`-tasks` semantics.** `0` = auto, `>0` = fixed, `<0` = `N × threads`. Range allows down to `-10`. +8. **Config-file-only entries.** `StaticMod=`, `DynamicMod=`, `CustomAA=` are not CLI flags. They're parsed from `-mod` file and `-conf` config file only. Repeated entries are *expected* (each line is a separate mod). Config parser preserves order. +9. 
**Config-file aliases (canonical-name normalization in `ParamNameEnum.getParamNameFromLine()`).** Auto-renames at least 13 deprecated keys: + - `IsotopeError` → `IsotopeErrorRange` + - `TargetDecoyAnalysis` → `TDA` + - `FragmentationMethod` → `FragmentationMethodID` + - `Instrument` → `InstrumentID` + - `Enzyme` → `EnzymeID` + - `Protocol` → `ProtocolID` + - `NumTolerableTermini` → `NTT` + - `MinNumPeaks` → `MinNumPeaksPerSpectrum` + - `MaxNumMods` / `MaxNumModsPerPeptide` → `NumMods` + - `minLength` / `MinPeptideLength` → `MinPepLength` + - `maxLength` / `MaxPeptideLength` → `MaxPepLength` + - `PMTolerance` / `ParentMassTolerance` → `PrecursorMassTolerance` +10. **File-format validation chain.** Order: directory-vs-file → format-suffix match → existence → no-reuse. Suffix matching is case-insensitive for `.pin`/`.tsv`/`.fasta`. Spec parameter auto-allows directories. +11. **Defaults that depend on runtime.** `-thread` defaults to `Runtime.getRuntime().availableProcessors()` (includes hyperthreading; per CLAUDE.md, physical cores often give better wall-time). +12. **Help-text drift.** Existing tests likely compare exact `--help` output. picocli's formatter is different. Decide: snapshot-update vs. custom renderer that mimics current format. + +## Out-of-scope reminders for Phase 1 + +- `MSGFDB`, `MSGF`, `MSGFLib` entry points share `ParamManager`. Phase 1 only modernizes `MSGFPlus`; the other three keep using `ParamManager.parseParams()` until Phase 4. +- Config-file parsing is Phase 2. Phase 1 covers CLI only. +- The `Parameter` / `IntParameter` / `IntRangeParameter` / `ToleranceParameter` / etc. hierarchy is **not** removed in Phase 1. Removal is Phase 3. +- `ParamManager` itself stays. Phase 1 adds an adapter that produces a populated `ParamManager` from the typed `MSGFPlusOptions`, so `SearchParams.parse(ParamManager)` is unchanged. 
diff --git a/pom.xml b/pom.xml index ceb3b8a4..0e26ba33 100644 --- a/pom.xml +++ b/pom.xml @@ -140,6 +140,11 @@ commons-io 2.15.1 + + info.picocli + picocli + 4.7.6 + From 15816024b0cc38ced95fed0ea0f3b99d20be5712 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 10:56:57 +0100 Subject: [PATCH 13/34] refactor(cli): declare typed MSGFPlusOptions (picocli @Command) Declarative replacement for the imperative addParameter() calls in ParamManager.addMSGFPlusParams(). All 34 MSGFPlus flags (27 visible + 7 hidden) are declared as picocli @Option fields with typed Java fields where the Java type is natural (File, Integer, Double, String). Complex domain-typed flags (precursor tolerance, isotope-error range, ms-level range, output format, precursor-cal mode, dynamic enums for fragmentation/instrument/enzyme/protocol) are captured as raw String/Integer for now; the Phase 1c adapter will round-trip them through the existing params.Parameter#parse(String) hierarchy to preserve current behavior. Phase 3 collapses that round-trip once the old hierarchy is deleted. No wiring change yet -- MSGFPlus.main() still uses ParamManager.parseParams(). Adapter and main() switch land in follow-up commits. Compile passes. --- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java new file mode 100644 index 00000000..0c5df646 --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -0,0 +1,206 @@ +package edu.ucsd.msjava.cli; + +import edu.ucsd.msjava.params.ParamManager.ParamNameEnum; +import picocli.CommandLine.Command; +import picocli.CommandLine.Option; + +import java.io.File; + +/** + * Typed command-line options for MS-GF+. 
Replaces the imperative + * {@code addParameter()} calls in {@code ParamManager.addMSGFPlusParams()} + * with declarative picocli annotations. + * + * Phase 1 scope: every flag from {@link ParamNameEnum} that + * {@code addMSGFPlusParams()} registers, parsed into typed fields. + * Complex domain types (Tolerance, IntRange, dynamic enums) are + * captured here as raw strings; the adapter at + * {@code MSGFPlusOptionsAdapter} round-trips them through the existing + * {@code params.Parameter#parse(String)} hierarchy to populate a + * {@code ParamManager} that {@code SearchParams.parse(ParamManager)} + * can consume unchanged. Phase 3 collapses that round-trip away. + * + * Flag inventory: see {@code .claude/plans/parameter-modernization-flag-inventory.md}. + */ +@Command( + name = "MS-GF+", + mixinStandardHelpOptions = true, + sortOptions = false, + description = "MS-GF+: peptide identification by database search of mass spectra.") +public final class MSGFPlusOptions { + + // ---------- required input ---------- + + @Option(names = "-s", required = true, paramLabel = "SpectrumFile", + description = "Input spectrum file (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl, *_dta.txt) or directory of spectra") + public File spectrumFile; + + @Option(names = "-d", required = true, paramLabel = "DatabaseFile", + description = "Database file (*.fasta, *.fa, *.faa)") + public File databaseFile; + + // ---------- optional config + output ---------- + + @Option(names = "-conf", paramLabel = "ConfigFile", + description = "Configuration file path; CLI flags override config file values") + public File configFile; + + @Option(names = "-o", paramLabel = "OutputFile", + description = "Output file (*.pin or *.tsv); Default: .pin") + public File outputFile; + + @Option(names = "-decoy", paramLabel = "Prefix", + description = "Decoy protein prefix; Default: DECOY_") + public String decoyPrefix; + + // ---------- precursor mass tolerance ---------- + + @Option(names = "-t", paramLabel = "Tolerance", 
+ description = "Precursor mass tolerance, e.g. 20ppm or 0.5Da or 0.5Da,2.5Da; Default: 20ppm. " + + "Asymmetric form sets left tolerance (ObsMass < TheoMass) and right tolerance (ObsMass > TheoMass).") + public String precursorTolerance; + + @Option(names = "-u", paramLabel = "Units", hidden = true, + description = "Tolerance units (legacy): 0=Da, 1=ppm, 2=as written in -t (Default: 2)") + public Integer precursorToleranceUnits; + + @Option(names = "-ti", paramLabel = "Range", + description = "Isotope-error range, e.g. -1,2 (both inclusive); Default: 0,1") + public String isotopeErrorRange; + + // ---------- threading / parallelism ---------- + + @Option(names = "-thread", paramLabel = "N", + description = "Number of worker threads; Default: number of available cores") + public Integer numThreads; + + @Option(names = "-tasks", paramLabel = "N", + description = "Number of tasks: 0=auto, >0=fixed, <0=N*threads; Default: 0") + public Integer numTasks; + + @Option(names = "-minSpectraPerThread", paramLabel = "N", + description = "Minimum spectra per thread/task; Default: 250") + public Integer minSpectraPerThread; + + @Option(names = "-verbose", paramLabel = "N", + description = "Verbosity: 0=total progress only (Default), 1=per-thread") + public Integer verbose; + + // ---------- target/decoy + scoring shape ---------- + + @Option(names = "-tda", paramLabel = "N", + description = "Target-decoy strategy: 0=off (Default), 1=concatenated decoy search") + public Integer tdaStrategy; + + @Option(names = "-m", paramLabel = "ID", + description = "Fragmentation method ID: 0=as written/CID (Default), 1=CID, 2=ETD, 3=HCD") + public Integer fragMethodId; + + @Option(names = "-inst", paramLabel = "ID", + description = "Instrument type ID; default depends on registry") + public Integer instrumentTypeId; + + @Option(names = "-e", paramLabel = "ID", + description = "Enzyme ID; default depends on registry") + public Integer enzymeId; + + @Option(names = "-protocol", paramLabel = 
"ID", + description = "Protocol ID; default depends on registry") + public Integer protocolId; + + @Option(names = "-ntt", paramLabel = "N", + description = "Number of tolerable termini (0..2); Default: 2 (fully tryptic)") + public Integer numTolerableTermini; + + // ---------- modifications ---------- + + @Option(names = "-mod", paramLabel = "ModFile", + description = "Modification file (also accepts StaticMod=, DynamicMod=, CustomAA= entries via -conf)") + public File modificationFile; + + // ---------- peptide / charge bounds ---------- + + @Option(names = "-minLength", paramLabel = "N", + description = "Minimum peptide length; Default: 6") + public Integer minPeptideLength; + + @Option(names = "-maxLength", paramLabel = "N", + description = "Maximum peptide length; Default: 40") + public Integer maxPeptideLength; + + @Option(names = "-minCharge", paramLabel = "N", + description = "Minimum precursor charge; Default: 2") + public Integer minCharge; + + @Option(names = "-maxCharge", paramLabel = "N", + description = "Maximum precursor charge; Default: 3") + public Integer maxCharge; + + @Option(names = "-n", paramLabel = "N", + description = "Number of matches reported per spectrum; Default: 1") + public Integer numMatchesPerSpec; + + // ---------- output / features / calibration ---------- + + @Option(names = "-addFeatures", paramLabel = "N", + description = "Include extra features for Percolator: 0=basic (Default), 1=+features") + public Integer addFeatures; + + @Option(names = "-outputFormat", paramLabel = "Format", + description = "Output format: pin (Default) or tsv") + public String outputFormat; + + @Option(names = "-precursorCal", paramLabel = "Mode", + description = "Precursor calibration mode: auto (Default), on, off") + public String precursorCalMode; + + @Option(names = "-ccm", paramLabel = "Mass", + description = "Charge carrier mass; Default: 1.00727649 (proton)") + public Double chargeCarrierMass; + + @Option(names = "-maxMissedCleavages", 
paramLabel = "N", + description = "Max missed cleavages per peptide; -1 = unlimited (Default)") + public Integer maxMissedCleavages; + + @Option(names = "-numMods", paramLabel = "N", + description = "Max dynamic mods per peptide; Default: 3") + public Integer maxNumMods; + + @Option(names = "-allowDenseCentroidedPeaks", paramLabel = "N", + description = "Allow centroid scans with dense peaks: 0=skip (Default), 1=allow") + public Integer allowDenseCentroidedPeaks; + + @Option(names = "-msLevel", paramLabel = "Range", + description = "MS level or range, e.g. 2 or 2,3; Default: 2,2") + public String msLevel; + + // ---------- hidden flags ---------- + + @Option(names = "-dd", paramLabel = "Dir", hidden = true, + description = "Database index directory") + public File dbIndexDir; + + @Option(names = "-index", paramLabel = "Range", hidden = true, + description = "Spectrum index range, e.g. 1,1000 (both inclusive)") + public String specIndexRange; + + @Option(names = "-edgeScore", paramLabel = "N", hidden = true, + description = "Edge scoring: 0=use (Default), 1=skip") + public Integer edgeScore; + + @Option(names = "-minNumPeaks", paramLabel = "N", hidden = true, + description = "Minimum number of peaks per spectrum") + public Integer minNumPeaks; + + @Option(names = "-iso", paramLabel = "N", hidden = true, + description = "Number of isoforms to consider per peptide") + public Integer numIsoforms; + + @Option(names = "-ignoreMetCleavage", paramLabel = "N", hidden = true, + description = "Ignore N-terminal Met cleavage: 0=consider (Default), 1=ignore") + public Integer ignoreMetCleavage; + + @Option(names = "-minDeNovoScore", paramLabel = "N", hidden = true, + description = "Minimum de novo score") + public Integer minDeNovoScore; +} From 310cb3370f9bd1a3e01d48267082327c6f6202ce Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 11:03:14 +0100 Subject: [PATCH 14/34] refactor(cli): route MSGFPlus argv through picocli + adapter Phase 1 of 
parameter-modernization. All non-config-file MSGFPlus invocations now flow through: argv -> picocli (MSGFPlusOptions) -> MSGFPlusOptionsAdapter -> ParamManager instead of imperative ParamManager.parseParams. -conf still falls through to the legacy path; Phase 2 ports the config-file reader. The adapter round-trips each typed field through the existing params.Parameter#parse(String) hierarchy so the downstream SearchParams.parse(ParamManager) build path is unchanged. Phase 3 collapses that round-trip when the old hierarchy is deleted. New equivalence test (MSGFPlusOptionsAdapterTest) asserts that the picocli path populates the same ParamManager state as legacy parseParams for a representative CLI corpus, including asymmetric tolerance ("-t 0.5Da,2.5Da"). Picocli's required-flag enforcement also verified. Scoped test sweep (TestDirectPinWriter, TestMSUtils, TestSA, TestMisc, TestRunManifestWriter, SearchParamsTest, TestPercolator, MSGFPlusOptionsAdapterTest): 45 tests, 0 failures, 0 errors. 
--- .../java/edu/ucsd/msjava/cli/MSGFPlus.java | 30 ++++- .../msjava/cli/MSGFPlusOptionsAdapter.java | 114 ++++++++++++++++++ .../cli/MSGFPlusOptionsAdapterTest.java | 101 ++++++++++++++++ 3 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java create mode 100644 src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java index 7f5dd3ca..3c0a829c 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java @@ -13,6 +13,8 @@ import edu.ucsd.msjava.mzml.StaxMzMLParser; import edu.ucsd.msjava.params.ParamManager; import edu.ucsd.msjava.sequences.Constants; +import picocli.CommandLine; +import picocli.CommandLine.ParameterException; import java.io.File; import java.io.IOException; @@ -61,8 +63,10 @@ public static void main(String argv[]) { StaxMzMLParser.turnOffLogs(); - // Parse parameters - String errMessage = paramManager.parseParams(argv); + // Parse parameters. The new picocli-based path runs by default; + // the legacy ParamManager.parseParams handles -conf (config-file + // input) until Phase 2 ports it. See parameter-modernization.md. + String errMessage = parseArgs(argv, paramManager); if (errMessage != null) { MSGFLogger.error(errMessage); System.out.println(); @@ -92,6 +96,28 @@ public static void main(String argv[]) { MSGFLogger.info("MS-GF+ complete (total elapsed time: %.2f sec)", (System.currentTimeMillis() - startTime) / (float) 1000); } + /** + * Phase 1 of the parameter-modernization plan: route MSGFPlus argv + * through the typed picocli-based {@link MSGFPlusOptions} class and + * adapter, falling back to {@link ParamManager#parseParams(String[])} + * for {@code -conf} (config-file input) until Phase 2 ports the + * config-file reader. See {@code .claude/plans/parameter-modernization.md}. 
+ */ + private static String parseArgs(String[] argv, ParamManager paramManager) { + for (String arg : argv) { + if ("-conf".equals(arg)) { + return paramManager.parseParams(argv); + } + } + MSGFPlusOptions opts = new MSGFPlusOptions(); + try { + new CommandLine(opts).parseArgs(argv); + } catch (ParameterException e) { + return e.getMessage(); + } + return MSGFPlusOptionsAdapter.adapt(opts, paramManager); + } + public static String runMSGFPlus(ParamManager paramManager) { SearchParams params = new SearchParams(); String errorMessage = params.parse(paramManager); diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java new file mode 100644 index 00000000..bf38b87f --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java @@ -0,0 +1,114 @@ +package edu.ucsd.msjava.cli; + +import edu.ucsd.msjava.params.ParamManager; +import edu.ucsd.msjava.params.ParamManager.ParamNameEnum; +import edu.ucsd.msjava.params.Parameter; + +/** + * Phase 1 adapter: populates a {@link ParamManager} from a parsed + * {@link MSGFPlusOptions} by round-tripping each set field through the + * canonical string form that the existing + * {@link Parameter#parse(String)} hierarchy expects. + * + * This deliberately reuses the legacy parsing logic so Phase 1 is + * behavior-preserving. Phase 3 deletes the {@code params.Parameter} + * hierarchy and replaces this adapter with direct construction of the + * downstream {@code SearchParams}. + * + * Returns {@code null} on success, or a human-readable error string + * matching the format used by {@link ParamManager#parseParams(String[])}. + */ +public final class MSGFPlusOptionsAdapter { + + private MSGFPlusOptionsAdapter() {} + + /** + * Populate {@code paramManager} (already initialized via + * {@link ParamManager#addMSGFPlusParams()}) with values from + * {@code opts}. 
This method runs + * {@link ParamManager#isValid()} as its last step and returns its + * result, so callers do not need to validate again. + */ + public static String adapt(MSGFPlusOptions opts, ParamManager paramManager) { + String err; + + // Files / paths + if ((err = setIfPresent(paramManager, ParamNameEnum.CONFIGURATION_FILE, + opts.configFile == null ? null : opts.configFile.getPath())) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.SPECTRUM_FILE, + opts.spectrumFile == null ? null : opts.spectrumFile.getPath())) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.DB_FILE, + opts.databaseFile == null ? null : opts.databaseFile.getPath())) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.SEARCH_OUTPUT_FILE, + opts.outputFile == null ? null : opts.outputFile.getPath())) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MOD_FILE, + opts.modificationFile == null ? null : opts.modificationFile.getPath())) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.DD_DIRECTORY, + opts.dbIndexDir == null ? 
null : opts.dbIndexDir.getPath())) != null) return err; + + // Plain strings / domain strings parsed by ToleranceParameter / RangeParameter / EnumParameter + if ((err = setIfPresent(paramManager, ParamNameEnum.DECOY_PREFIX, opts.decoyPrefix)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_MASS_TOLERANCE, opts.precursorTolerance)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.ISOTOPE_ERROR, opts.isotopeErrorRange)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.OUTPUT_FORMAT, opts.outputFormat)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_CAL, opts.precursorCalMode)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MS_LEVEL, opts.msLevel)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.SPEC_INDEX, opts.specIndexRange)) != null) return err; + + // Integer-valued flags (enum + numeric) + if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_MASS_TOLERANCE_UNITS, opts.precursorToleranceUnits)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.NUM_THREADS, opts.numThreads)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.NUM_TASKS, opts.numTasks)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_SPECTRA_PER_THREAD, opts.minSpectraPerThread)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.VERBOSE, opts.verbose)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.TDA_STRATEGY, opts.tdaStrategy)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.FRAG_METHOD, opts.fragMethodId)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.INSTRUMENT_TYPE, opts.instrumentTypeId)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.ENZYME_ID, opts.enzymeId)) != null) return err; + if ((err 
= setIfPresent(paramManager, ParamNameEnum.PROTOCOL_ID, opts.protocolId)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.ENZYME_SPECIFICITY, opts.numTolerableTermini)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_PEPTIDE_LENGTH, opts.minPeptideLength)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MAX_PEPTIDE_LENGTH, opts.maxPeptideLength)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_CHARGE, opts.minCharge)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MAX_CHARGE, opts.maxCharge)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.NUM_MATCHES_SPEC, opts.numMatchesPerSpec)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.ADD_FEATURES, opts.addFeatures)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MAX_MISSED_CLEAVAGES, opts.maxMissedCleavages)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MAX_NUM_MODS, opts.maxNumMods)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.ALLOW_DENSE_CENTROIDED_PEAKS, opts.allowDenseCentroidedPeaks)) != null) return err; + + // Hidden integer flags + if ((err = setIfPresent(paramManager, ParamNameEnum.EDGE_SCORE, opts.edgeScore)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_NUM_PEAKS, opts.minNumPeaks)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.NUM_ISOFORMS, opts.numIsoforms)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.IGNORE_MET_CLEAVAGE, opts.ignoreMetCleavage)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_DE_NOVO_SCORE, opts.minDeNovoScore)) != null) return err; + + // Doubles + if ((err = setIfPresent(paramManager, ParamNameEnum.CHARGE_CARRIER_MASSES, opts.chargeCarrierMass)) != null) return err; + + return 
paramManager.isValid(); + } + + private static String setIfPresent(ParamManager paramManager, ParamNameEnum name, String value) { + if (value == null) return null; + Parameter p = paramManager.getParameter(name.getKey()); + if (p == null) return "Internal error: parameter not registered: -" + name.getKey(); + String err = p.parse(value); + if (err != null) { + return "Invalid value for parameter -" + name.getKey() + ": " + value + "\n (" + err + ")"; + } + p.setValueAssigned(); + return null; + } + + private static String setIfPresent(ParamManager paramManager, ParamNameEnum name, Integer value) { + if (value == null) return null; + return setIfPresent(paramManager, name, value.toString()); + } + + private static String setIfPresent(ParamManager paramManager, ParamNameEnum name, Double value) { + if (value == null) return null; + return setIfPresent(paramManager, name, value.toString()); + } +} diff --git a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java new file mode 100644 index 00000000..454dac47 --- /dev/null +++ b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java @@ -0,0 +1,101 @@ +package edu.ucsd.msjava.cli; + +import edu.ucsd.msjava.params.ParamManager; +import org.junit.Assert; +import org.junit.Test; +import picocli.CommandLine; + +/** + * Phase 1 equivalence test: both the legacy + * {@link ParamManager#parseParams(String[])} path and the new + * picocli + {@link MSGFPlusOptionsAdapter} path must populate the + * same {@link ParamManager} state for a representative CLI. + * + * If a future change drops a field from {@link MSGFPlusOptions} or the + * adapter, this test catches the divergence before it reaches + * {@code SearchParams.parse}. + */ +public class MSGFPlusOptionsAdapterTest { + + /** Canonical CLI a typical user passes to MS-GF+. 
*/ + private static final String[] TYPICAL_CLI = { + "-s", "src/test/resources/test.mgf", + "-d", "src/test/resources/Tryp_Pig_Bov.fasta", + "-t", "20ppm", + "-ti", "-1,2", + "-tda", "1", + "-ntt", "2", + "-thread", "4", + "-minLength", "7", + "-maxLength", "30", + "-minCharge", "2", + "-maxCharge", "4", + "-n", "3", + "-numMods", "2", + "-addFeatures", "1", + "-decoy", "XXX_", + }; + + @Test + public void picocliPathPopulatesParamManagerEquivalentlyToLegacyPath() { + ParamManager legacy = freshMSGFPlusParamManager(); + String legacyErr = legacy.parseParams(TYPICAL_CLI); + Assert.assertNull("legacy parseParams returned error: " + legacyErr, legacyErr); + + ParamManager adapted = freshMSGFPlusParamManager(); + MSGFPlusOptions opts = new MSGFPlusOptions(); + new CommandLine(opts).parseArgs(TYPICAL_CLI); + String adaptedErr = MSGFPlusOptionsAdapter.adapt(opts, adapted); + Assert.assertNull("adapter returned error: " + adaptedErr, adaptedErr); + + // Compare every typed accessor that downstream SearchParams.parse reads. 
+ Assert.assertEquals(legacy.getDecoyProteinPrefix(), adapted.getDecoyProteinPrefix()); + Assert.assertEquals(legacy.getChargeCarrierMass(), adapted.getChargeCarrierMass(), 0.0); + Assert.assertEquals(legacy.getNumTolerableTermini(), adapted.getNumTolerableTermini()); + Assert.assertEquals(legacy.getNumMatchesPerSpectrum(), adapted.getNumMatchesPerSpectrum()); + Assert.assertEquals(legacy.getTDA(), adapted.getTDA()); + Assert.assertEquals(legacy.getOutputAdditionalFeatures(), adapted.getOutputAdditionalFeatures()); + Assert.assertEquals(legacy.getMinPeptideLength(), adapted.getMinPeptideLength()); + Assert.assertEquals(legacy.getMaxPeptideLength(), adapted.getMaxPeptideLength()); + Assert.assertEquals(legacy.getMaxNumVariantsPerPeptide(), adapted.getMaxNumVariantsPerPeptide()); + Assert.assertEquals(legacy.getMinCharge(), adapted.getMinCharge()); + Assert.assertEquals(legacy.getMaxCharge(), adapted.getMaxCharge()); + Assert.assertEquals(legacy.getNumThreads(), adapted.getNumThreads()); + Assert.assertEquals(legacy.getOutputFormat(), adapted.getOutputFormat()); + } + + @Test + public void picocliPathRejectsMissingRequiredFlags() { + MSGFPlusOptions opts = new MSGFPlusOptions(); + try { + new CommandLine(opts).parseArgs(new String[] {"-t", "20ppm"}); + Assert.fail("expected picocli to reject CLI missing -s and -d"); + } catch (CommandLine.MissingParameterException expected) { + // ok + } + } + + @Test + public void picocliPathParsesAsymmetricTolerance() { + ParamManager pm = freshMSGFPlusParamManager(); + String[] argv = { + "-s", "src/test/resources/test.mgf", + "-d", "src/test/resources/Tryp_Pig_Bov.fasta", + "-t", "0.5Da,2.5Da", + }; + MSGFPlusOptions opts = new MSGFPlusOptions(); + new CommandLine(opts).parseArgs(argv); + String err = MSGFPlusOptionsAdapter.adapt(opts, pm); + Assert.assertNull("adapter returned error on asymmetric tolerance: " + err, err); + // Parity with legacy: + ParamManager legacy = freshMSGFPlusParamManager(); + 
Assert.assertNull(legacy.parseParams(argv)); + Assert.assertEquals(legacy.getToleranceUnit(), pm.getToleranceUnit()); + } + + private static ParamManager freshMSGFPlusParamManager() { + ParamManager pm = new ParamManager("MS-GF+", "test", "test", "test"); + pm.addMSGFPlusParams(); + return pm; + } +} From 1fe3709369f0adc7fcd950ce54ca24f1b7faa90c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 11:11:53 +0100 Subject: [PATCH 15/34] refactor(cli): unify -conf path through picocli (Phase 2) Drop the special-case "-conf in argv -> legacy parseParams" fallback from MSGFPlus.parseArgs. The picocli path now handles every MSGFPlus invocation, including config-file inputs. Two changes: 1. -s and -d are no longer picocli-required. They were already declared isOptional=true on the legacy ParamManager side (FileParameter.setAsOptional in addSpecFileParam(true) / addDBFileParam(true)), so requiring them at the picocli level was stricter than legacy. Without this change, a valid "-conf params.txt" invocation (where SpectrumFile / DatabaseFile live in the config file) would be rejected by picocli before the adapter could populate ParamManager. 2. The config-file overlay is unchanged. SearchParams.parseConfigParamFile already runs after CLI parsing and only fills in !commandLineParam.isValueAssigned() entries (SearchParams.java:588), so CLI flags continue to override config-file values transparently. The flow is now: argv -> picocli -> MSGFPlusOptions -> adapter -> ParamManager -> SearchParams.parse (config-file overlay happens here). No legacy parseParams call remains in the MSGFPlus entry point. Test: picocliPathRejectsMissingRequiredFlags replaced with picocliPathAcceptsConfigOnlyInvocation (picocli must accept "-conf X" without -s/-d, and the adapter must record the config file path so SearchParams.parse can read it). 
Scoped sweep (TestDirectPinWriter, TestMSUtils, TestSA, TestMisc, TestRunManifestWriter, SearchParamsTest, TestPercolator, MSGFPlusOptionsAdapterTest): 45 tests, 0 failures, 0 errors. --- src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java | 16 ++++++---------- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 12 +++++++----- .../msjava/cli/MSGFPlusOptionsAdapterTest.java | 16 +++++++++------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java index 3c0a829c..fa8c8e3f 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java @@ -97,18 +97,14 @@ public static void main(String argv[]) { } /** - * Phase 1 of the parameter-modernization plan: route MSGFPlus argv - * through the typed picocli-based {@link MSGFPlusOptions} class and - * adapter, falling back to {@link ParamManager#parseParams(String[])} - * for {@code -conf} (config-file input) until Phase 2 ports the - * config-file reader. See {@code .claude/plans/parameter-modernization.md}. + * Route MSGFPlus argv through the typed picocli-based + * {@link MSGFPlusOptions} class + adapter. Config-file values are + * applied later by {@link edu.ucsd.msjava.msdbsearch.SearchParams#parse} + * for any parameter the CLI did not assign, so {@code -conf} works + * uniformly through this single path. See + * {@code .claude/plans/parameter-modernization.md}. 
*/ private static String parseArgs(String[] argv, ParamManager paramManager) { - for (String arg : argv) { - if ("-conf".equals(arg)) { - return paramManager.parseParams(argv); - } - } MSGFPlusOptions opts = new MSGFPlusOptions(); try { new CommandLine(opts).parseArgs(argv); diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index 0c5df646..64d07bb8 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -29,14 +29,16 @@ description = "MS-GF+: peptide identification by database search of mass spectra.") public final class MSGFPlusOptions { - // ---------- required input ---------- + // ---------- input (required at runtime, but may be provided via -conf) ---------- - @Option(names = "-s", required = true, paramLabel = "SpectrumFile", - description = "Input spectrum file (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl, *_dta.txt) or directory of spectra") + @Option(names = "-s", paramLabel = "SpectrumFile", + description = "Input spectrum file (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl, *_dta.txt) or directory of spectra. " + + "Required, unless provided via -conf as SpectrumFile=...") public File spectrumFile; - @Option(names = "-d", required = true, paramLabel = "DatabaseFile", - description = "Database file (*.fasta, *.fa, *.faa)") + @Option(names = "-d", paramLabel = "DatabaseFile", + description = "Database file (*.fasta, *.fa, *.faa). 
" + + "Required, unless provided via -conf as DatabaseFile=...") public File databaseFile; // ---------- optional config + output ---------- diff --git a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java index 454dac47..3ca7148a 100644 --- a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java +++ b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java @@ -65,14 +65,16 @@ public void picocliPathPopulatesParamManagerEquivalentlyToLegacyPath() { } @Test - public void picocliPathRejectsMissingRequiredFlags() { + public void picocliPathAcceptsConfigOnlyInvocation() { + // -conf may supply -s/-d via the config file, so picocli must + // not reject CLI invocations that omit them. Matches legacy + // ParamManager behavior (FileParameter.setAsOptional() on -s/-d). + ParamManager pm = freshMSGFPlusParamManager(); MSGFPlusOptions opts = new MSGFPlusOptions(); - try { - new CommandLine(opts).parseArgs(new String[] {"-t", "20ppm"}); - Assert.fail("expected picocli to reject CLI missing -s and -d"); - } catch (CommandLine.MissingParameterException expected) { - // ok - } + new CommandLine(opts).parseArgs(new String[] {"-conf", "src/test/resources/HCD_QExactive_Tryp.param"}); + String err = MSGFPlusOptionsAdapter.adapt(opts, pm); + Assert.assertNull("adapter rejected -conf-only CLI: " + err, err); + Assert.assertNotNull("config file param not set", pm.getConfigFileParam().getFile()); } @Test From 5a2ec4e07c1e8fadedf4a02b304e868d5cc1a186 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 13:21:44 +0100 Subject: [PATCH 16/34] refactor: drop deprecated MSGFDB entry point + dead MSGF/MSGFLib params Net -874 / +2 lines. Removed: - src/main/java/edu/ucsd/msjava/cli/MSGFDB.java -- deprecated since 2012 (version 8091, "08/06/2012"), no external references in src, docs, pom.xml, or manifest. 
The class was @Deprecated and only referenced its own legacy entry point. - ParamManager.addMSGFDBParams() -- only caller was the deleted MSGFDB.main. - ParamManager.addMSGFParams() and addMSGFLibParams() -- no entry points existed at all (cli.MSGF, cli.MSGFLib never created); both were pure dead code accumulating since the original mzid pipeline removal. - ParamManager.addOutputFileParam() / addDBFileParam(String, String, boolean) -- private helpers used only by the deleted methods. - ParamNameEnum.OUTPUT_FILE -- the "-o for MSGF and MS-GFDB" variant; MSGFPlus uses SEARCH_OUTPUT_FILE (same key "o", different label). ParamManager.getOutputFileParam() now reads SEARCH_OUTPUT_FILE directly so SearchParams.parse continues to resolve the user's -o. - ParamNameEnum.C13, NNET, UNIFORM_AA_PROBABILITY -- MSGFDB-only. - docs/ms-gfdb.md -- documentation for the removed tool. - TestMinSpectraPerThread.msgfdbEntryPointAlsoRegistersTheParam -- test for the removed addMSGFDBParams. The remaining 3 tests in TestMinSpectraPerThread continue to verify MSGFPlus's MIN_SPECTRA_PER_THREAD param. Scoped sweep (TestDirectPinWriter, TestMSUtils, TestSA, TestMisc, TestRunManifestWriter, SearchParamsTest, TestPercolator, MSGFPlusOptionsAdapterTest, TestMinSpectraPerThread): 48 tests, 0 failures, 0 errors. --- docs/ms-gfdb.md | 210 --------- src/main/java/edu/ucsd/msjava/cli/MSGFDB.java | 436 ------------------ .../edu/ucsd/msjava/params/ParamManager.java | 223 +-------- .../msgfplus/TestMinSpectraPerThread.java | 7 - 4 files changed, 2 insertions(+), 874 deletions(-) delete mode 100644 docs/ms-gfdb.md delete mode 100644 src/main/java/edu/ucsd/msjava/cli/MSGFDB.java diff --git a/docs/ms-gfdb.md b/docs/ms-gfdb.md deleted file mode 100644 index aed3fc5d..00000000 --- a/docs/ms-gfdb.md +++ /dev/null @@ -1,210 +0,0 @@ -# MS-GFDB - -[MS-GF+ Documentation home](readme.md) - -MS-GFDB is an old application that is no longer under development. It was supserseded by [MS-GF+](msgfplus.md). 
-MS-GF+ has all the functionalities provided by MS-GFDB, plus numerous improvements. - -### Differences between MS-GF+ and MS-GFDB - -- **Input** - - MS-GF+ supports mzML in addition to mzXML, mgf, ms2, pkl and \_dta.txt - - "-t PrecursorMassTolerance" is optional with MS-GF+ (default 20ppm) - - "-c13 0/1/2" was changed to "-ti IsotopeErrorRange" in MS-GF+ - - IsotopeErrorRange: MinIsotopeError,MaxIsotopeError (both are inclusive) - - -c13 x == -ti 0,x - - "-nnet" was changed to "-ntt" in MS-GF+ - - -nnet 0 == -ntt 2, -nnet 1 == -ntt 1, -nnet 2 == -ntt 0 - - Modification file format change - - In MS-GF+, the name of the modification should match the PSI-MS name (accessible from [http://www.unimod.org](http://www.unimod.org/)) - - CompositionStr can take Br, Cl, Fe, Se in addition to C, H, N, O, S, and P - - The sequence of the atoms can be arbitrary. - - Previously C2H2O was valid but OH2C2 was invalid - - With MS-GF+, both are valid - - "-uniformAAProb 0/1" was deleted in MS-GF+ - - "-addFeatures 0/1" was added to MS-GF+; "-addFeatures 1" will output the following extra features for each PSM (will be useful to downstream tools like Percolator or IDPicker): - - MS2IonCurrent: Summed intensity of all product ions - - ExplainedIonCurrentRatio: Summed intensity of all matched product ions (e.g. b, b-H2O, y, etc.) divided by MS2IonCurrent - - NTermIonCurrentRatio: Summed intensity of all matched prefix ions (e.g. b, b-H2O, etc.) divided by MS2IonCurrent - - CTermIonCurrentRatio: Summed intensity of all matched suffix ions (e.g. y, y-H2O, etc.) divided by MS2IonCurrent - - "-showQValue 0/1" was added to MS-GF+ - - "-showDecoy 0/1" was added to MS-GF+ -- **Output** - - Output format for MS-GF+ is the HUPO PSI mzIdentML version 1.1 (\*.mzid); see  for details. - - Decoy protein prefix is "XXX" in MS-GF+ (vs. "REV" in MS-GFDB) - - MS-GF+ provides a converter from mzIdentML to tsv (the resulting tsv file will be similar to the MS-GFDB output file). 
- - The converter is included in the MSGFPlus.jar file - - It can be run by "java -Xmx2000M edu.ucsd.msjava.ui.MzIDToTsv" - - A faster converter that supports larger result files is the Mzid-To-Tsv-Converter, [available on GitHub](https://github.com/PNNL-Comp-Mass-Spec/Mzid-To-Tsv-Converter/releases). This is a C# application that works under Windows or on Linux with mono. - - Difference between the MS-GFDB output and the MS-GF+ TSV output - - MS-GF+ includes SpecID (native spectrum ID) instead of SpecIndex - - MS-GF+ reports IsotopeError - - When a peptide matches to multiple proteins, all protein accessions will be reported by MS-GF+ - - SpecProb was renamed to SpecEValue in MS-GF+ - - MS-GF+ reports EValue (database-level E-value) instead of PValue (database-level P-value) - - FDR and PepFDR were renamed to QValue and PepQValue in MS-GF+ - -# MS-GFDB - - -```text -Usage: java -Xmx2000M -jar MSGFDB.jar - -s SpectrumFile (*.mzXML, *.mzML, *.mgf, *.ms2, *.pkl or *_dta.txt) - -d DatabaseFile (*.fasta or .fa) - -t ParentMassTolerance (e.g. 2.5Da, 30ppm, or 0.5Da,2.5Da) - Use comma to set asymmetric values. E.g. "-t 0.5Da,2.5Da" will set 0.5Da to the left (expMasstheoMass). 
- [-o outputFileName] (Default: stdout) - [-thread NumOfThreads] (Number of concurrent threads to be executed, Default: Number of available cores) - [-tda 0/1] (0: don't search decoy database (default), 1: search decoy database to compute FDR) - [-m FragmentationMethodID] (0: as written in the spectrum or CID if no info (Default), 1: CID, 2: ETD, 3: HCD, 4: Merge spectra from the same precursor) - [-inst InstrumentID] (0: Low-res LCQ/LTQ (Default for CID and ETD), 1: High-res LTQ (Default for HCD), 2: TOF) - [-e EnzymeID] (0: No enzyme, 1: Trypsin (Default), 2: Chymotrypsin, 3: Lys-C, 4: Lys-N, 5: Glu-C, 6: Arg-C, 7: Asp-N, 8: aLP, 9: Endogenous peptides) - [-c13 0/1/2] (Number of allowed C13, Default: 1) - [-nnet 0/1/2] (Number of allowed non-enzymatic termini, Default: 1) - [-mod ModificationFileName] (Modification file, Default: standard amino acids with fixed C+57) - [-minLength MinPepLength] (Minimum peptide length to consider, Default: 6) - [-maxLength MaxPepLength] (Maximum peptide length to consider, Default: 40) - [-minCharge MinPrecursorCharge] (Minimum precursor charge to consider if not specified in the spectrum file, Default: 2) - [-maxCharge MaxPrecursorCharge] (Maximum precursor charge to consider if not specified in the spectrum file, Default: 3) - [-n NumMatchesPerSpec] (Number of matches per spectrum to be reported, Default: 1) - [-uniformAAProb 0/1] (0: use amino acid probabilities computed from the input database (default), 1: use probability 0.05 for all amino acids) -``` - - -### Parameters: - -- **-s SpectrumFile** (\*.mzXML, \*.mzML, \*.mgf, \*.ms2, \*.pkl or \*\_dta.txt) - Required - - Spectrum file name. Currently, MS-GFDB supports the following file formats: mzXML, mzML, mgf, ms2, pkl and \_dta.txt. -- **-d DatabaseFile** (\*.fasta or \*.fa) - Required - - Path to the protein database file. If the database file does not have auxiliary index files (\*.canno, \*.cnlcp, \*.csarr, and \*.cseq), MS-GFDB will create them. 
- - When "-tda 1" option is used, the database must contain only target protein sequences. - -If multiple MS-GFDB processes access the same database file, it is strongly recommended to index the database prior to the database search by running BuildSA (see below). - -- **-t ParentMassTolerance** - Required - - Parent mass tolerance in Da. or ppm. There must be no space between the number and the unit. E.g. 2.5Da, 30ppm - - To set asymmetric tolerances, use comma to separate left (experimental mass \< theoretical mass) or right (experimental mass \> theoretical mass) tolerances. E.g. 0.5Da,2.5Da -- **-o OutputFile** (Default: stdout) - - Filename where the output will be written. - - The output will be printed to standard out by default. -- **-thread NumOfThreads** (Number of concurrent threads to be executed, Default: Number of available cores) - - Number of concurrent threads to be executed together. - - Default value is the number of available logical cores (e.g. 8 for quad-core processor with hyper-threading support). -- **-tda 0/1** (0: don't search decoy database (default), 1: search decoy database to compute FDR) - - Indicates whether to search the decoy database or not. - - If 0, the decoy database is not searched and FDRs are theoretically derived from P-values (EFDR). - - If 1, FDRs are computed based on the target-decoy approach (i.e. reversed database is appended to the target database and MS-GFDB searches the combined database) - - FDR(t) = \#(DecoyPSMs with score equal or above t) / \#(TargetPSMs with score equal or above t). - - PSM: Peptide-Spectrum Match - - -log(SpecProb) is used as the score to compute FDR. - -If -tda 1 is specified, MS-GFDB automatically creates a combined target/reversed database file (DBFileName.revConcat.fasta). Thus, when specifying "-d" parameter, DatabaseFile must contain only target proteins. 
- -- **-m FragmentationMethodID** (0: as written in the spectrum or CID if no info (Default), 1: CID, 2: ETD, 3: HCD, 4: Merge spectra from the same precursor) - - Fragmentation method identifier (used to determine the scoring model). - - If the identifier is 0 and fragmentation method is written in the spectrum file (e.g. activationMethod field in mzXML files), MS-GFDB will recognize the fragmentation method and use a relevant scoring model. - - If the identifier is 0 and there is no fragmentation method information in the spectrum (e.g. mgf files), CID model will be used by default. - - If the identifier is non-zero and the spectrum has fragmentation method information, only the spectra that match with the identifier will be processed. - - If the identifier is non-zero and the spectrum has no fragmentation method information, MS-GFDB will process all spectra assuming the specified fragmentation method. - - If the identifier is 4, MS/MS spectra from the same precursor ion (e.g. CID/ETD pairs, CID/HCD/ETD triplets) will be merged and the "merged" spectrum will be used for searching instead of individual spectra. See Kim et al., MCP 2010 for details. -- **-inst InstrumentID** (0: Low-res LCQ/LTQ (Default for CID and ETD), 1: TOF , 2: High-res LTQ (Default for HCD)) - - Identifier of the instrument to generate MS/MS spectra (used to determine the scoring model). - - For "hybrid" spectra with high-precision MS1 and low-precision MS2, use 0. - - For usual low-precision instruments (e.g. Thermo LTQ), use 0. - - For TOF instruments, use 1. - - If MS/MS fragment ion peaks are of high-precision (e.g. tolerance = 10ppm), use 2. -- **-e EnzymeID** (Default: 1) - - Enzyme identifier. Trypsin (1) will be used by default. 
- - 0: No enzyme, 1: Trypsin (default), 2: Chymotrypsin, 3: Lys-C, 4: Lys-N, 5: Glu-C, 6: Arg-C, 7: Asp-N, 8: alphaLP, 9: Endogenous peptides -- **-c13 0/1/2** (Number of allowed isotope errors, Default: 1) - - Instruments often choose 2nd or 3rd isotope peak instead of mono-isotope peak from MS1 spectrum. - - If this value is non-zero, expPeptideMass-1.00335 (i.e. mass(13C)-mass(12C)) and expPeptideMass-2.00671 (i.e. 2\*(mass(C13)-mass(C12)) (only if -c13 2) will be considered along with expPeptideMass. - - If accurate precursor ion mass is available (e.g. LTQ-Orbitrap), it is better to set a narrow parent mass tolerance and non-zero -c13 value (e.g. -t 30ppm -c13 1) than to set a wide tolerance (e.g. -t 0.5Da,2.5Da). - - If the parent mass tolerance is equal to or larger than 0.5Da or 500ppm, this parameter will be ignored. -- **-nnet 0/1/2** (Number of allowed non-enzymatic termini, Default: 1) - - This parameter is used to determine the enzyme cleavage rule. - - Specifies the maximum number of peptide termini that are not cleaved by the enzyme. - - For example, for trypsin, K.ACDEFGHR.C, G.ACDEFGHR.C, K.ACDEFGHI.C and G.ACDEFGHR.C have 0, 1, 1 and 2 non-enzymatic termini, accordingly. - - By default, -nnet 1 will be used. Using -nnet 0 (or 2) will make the search significantly faster (slower). -- **-mod ModificationFile** (Default: standard amino acids with fixed C+57)\] - - Modification file name. ModificationFile contains the modifications to be considered in the search. - - If -mod option is not specified, standard amino acids with fixed Carbamidomethylation C will be used. - - See an [example MS-GFDB modification file](msgfdb_modfile.md). -- **-minLength MinPepLength** (Default: 6) - - Minimum length of the peptide to be considered. -- **-maxLength MaxPepLength** (Default: 40) - - Maximum length of the peptide to be considered. -- **-minCharge MinPrecursorCharge** (Default: 2) - - Minimum precursor charge to consider. 
This parameter is used only for spectra with no charge. -- **-maxCharge MinPrecursorCharge** (Default: 3) - - Maximum precursor charge to consider. This parameter is used only for spectra with no charge. -- **-n NumMatchesPerSpec** (Default: 1) - - Number of peptide matches per spectrum to report. - - Expected false discovery rates (EFDRs) will be reported only when this value is 1. -- **-uniformAAProb** 0/1 (Default: 0) - - If 0, compute amino acid frequencies from the input database and use them as amino acid probabilities. - - If 1, use uniform amino acid probability (preferable when the database size is small). - -### MS-GFDB output - -MS-GFDB outputs a tab-delimited file with the following columns: \#SpecFile, Scan#, FragMethod, Precursor, PMError, Charge, Peptide, Protein, DeNovoScore, MSGFScore, SpecProb, P-value, EFDR. - -- **SpecFile**: spectrum file name -- **SpecIndex**: spectrum index (1-based) in the file. The first spectrum has index 1, the second has index 2, and so on. For mzXML files this value is same as the scan number. -- **Scan#**: scan number of the spectrum. If the scan number is not available, the value will be -1. -- **FragMethod**: fragmentation method used to generate the spectrum (e.g. CID, ETD, etc.). When spectra from the same precursor are merged, fragmentation methods of merged spectra will be shown as a form "FragMethod1/FragMethod2/..." (e.g. CID/ETD, CID/HCD/ETD). 
-- **Precursor**: precursor mass in m/z or ppm -- **Charge**: precursor ion charge -- **Peptide**: peptide sequence with neighboring amino acids -- **Protein**: protein name -- **DeNovoScore**: the score of the optimal scoring peptide (not necessary in the database) -- **MSGFScore**: MS-GF raw score of the peptide-spectrum match (MSGFScore \<= DeNovoScore) -- **SpecProb**: spectral probability (spectrum level p-value) of the peptide-spectrum match -- **P-value**: database level p-value (probability that a random PSM have an equal or better score against a random database of the same size) -- **EFDR** or **FDR**: false discovery rate - - If "-tda 1" is specified, FDRs are estimated using the target-decoy approach using the spectral probability (SpecProb) as the score (the lower, the better). - - Otherwise, FDRs are estimated using P-values without searching the decoy database (EFDR). See Gupta et al., JASMS 2011 for details. - - MS-GFDB reports EFDR only when it is configured to report 1 peptide match per spectrum (i.e. -n 1). - - EFDR accurately estimates FDR when the parent mass tolerance is equal or larger than 0.5. - - EFDR conservatively estimates FDR when the parent mass tolerance is small. - - E.g. When parent mass tolerance is 30ppm, at EFDR 1% threshold, one identifies approximately 7% less peptide-spectrum matches (PSMs) compared to the case when the target-decoy approach is used to estimate the FDR. -- **PepFDR** - - Peptide-level FDR estimated using the target-decoy approach. - - Reported only if "-tda 1" is specified. - - If multiple spectra are matched to the same peptide, only the best scoring PSM (lowest SpecProb) is retained. After that, PepFDR is calculated as \#DecoyPSMs\>s / \#TargetPSMs\>s among the retained PSMs. This approximates the FDR of the set of unique peptides. In the MS-GFDB output, the same PepFDR value is given to all PSMs sharing the peptide. 
So, even a low-quality PSM may get a low PepFDR value (if it has a high-quality "sibling" PSM sharing the peptide). Note that this should not be used to count the number of identified PSMs. - -### MS-GFDB output example - - -| \#SpecFile | SpecIndex | Scan# | FragMethod | Precursor | PMError(ppm) | Charge | Peptide | Protein | DeNovoScore | MSGFScore | SpecProb | P-value | FDR | PepFDR | -|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----| -| 090121_NM_Trypsin_20.mzXML | 2838 | 2838 | CID | 964.7707 | 1.5199227 | 3 | K.TIQNSSVSPTSSSSSSSSTGETQTQSSSR.L | IPI:IPI00002349.2\|SWISS-PROT:Q7Z417\|TREMBL:A1L3A7\|ENSEMBL:ENSP00000225388\|REFSEQ:NP_065823\|H-INV:HIT000001036\|VEGA:OTTHUMP00000181037 Tax_Id=9606 Gene_Symbol=NUFIP2 Nuclear fragile X mental retardation-interacting protein 2 | 190 | 181 | 9.380133E-30 | 2.9333857E-22 | 0.0 | 0.0 | -| 090121_NM_Trypsin_20.mzXML | 3671 | 3671 | ETD | 1113.4758 | 0.6583758 | 2 | R.VGPADDGPAPSGEEEGEGGGEAGGK.E | IPI:IPI00016725.2\|SWISS-PROT:Q9UKN8\|TREMBL:B3KNH2;Q05CN7\|ENSEMBL:ENSP00000361219\|REFSEQ:NP_036336\|H-INV:HIT000071196\|VEGA:OTTHUMP00000022434 Tax_Id=9606 Gene_Symbol=GTF3C4 General transcription factor 3C polypeptide 4 | 162 | 158 | 1.9912463E-28 | 6.0892146E-21 | 0.0 | 0.0 | -| 090121_NM_Trypsin_20.mzXML | 3031 | 3031 | ETD | 651.64874 | 1.7510794 | 3 | K.GAAAAAAASGAAGGGGGGAGAGAPGGGR.L | IPI:IPI00644073.1\|VEGA:OTTHUMP00000038687 Tax_Id=9606 Gene_Symbol=INTS3 18 kDa protein | 214 | 202 | 6.7318633E-28 | 2.093763E-20 | 0.0 | 0.0 | -| 090121_NM_Trypsin_20.mzXML | 19088 | 19088 | CID | 1199.0916 | 10.392676 | 2 | K.VNFSPPGDTNSLFPGTWYLER.V | IPI:IPI00945760.1\|TREMBL:B7Z784;B7Z7M8;B7Z8R3\|REFSEQ:NP_001159579 Tax_Id=9606 Gene_Symbol=HMGCS2 hydroxymethylglutaryl-CoA synthase, mitochondrial isoform 2 precursor | 243 | 243 | 2.9611275E-27 | 8.838129E-20 | 0.0 | 0.0 | -| 090121_NM_Trypsin_20.mzXML | 3030 | 3030 | CID/ETD | 651.64874 | 1.7510794 | 3 | K.GAAAAAAASGAAGGGGGGAGAGAPGGGR.L | 
IPI:IPI00644073.1\|VEGA:OTTHUMP00000038687 Tax_Id=9606 Gene_Symbol=INTS3 18 kDa protein | 389 | 389 | 7.508096E-33 | 2.335189E-25 | 0.0 | 0.0 | - - -# BuildSA - -Index a protein database for fast searching. - - -```text -Usage: java -cp MSGFDB.jar msdbsearch.BuildSA - -d DatabaseFile (*.fasta or *.fa) - [-tda 0/1/2] (0: target only, 1: target-decoy database only, 2: both) -``` - - -**Parameters:** - -- **-d DbPath** - - Name of a protein database (\*.fasta or \*.fa) - - Database file must ends with ".fasta" or ".fa". -- **-tda 0/1/2** - - If 0, only "DatabaseFile" will be indexed. - - If 1, a new database file (\*.revConcat.fasta) will be generated by appending reversed proteins. This forward-reverse database will be indexed. - - If 2, both the original database and the forward-reverse database file will be indexed. - -BuildSA creates a suffix array of the protein database. For an input database file DBFileName.fasta, BuildSA will generate 4 auxiliary files (DbFileName.canno, DBFileName.cnlcp, DBFileName.csarr, DBFileName.cseq).It needs to be executed only once per each database file. 
diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFDB.java b/src/main/java/edu/ucsd/msjava/cli/MSGFDB.java deleted file mode 100644 index d8a58442..00000000 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFDB.java +++ /dev/null @@ -1,436 +0,0 @@ -package edu.ucsd.msjava.cli; - -import edu.ucsd.msjava.msdbsearch.*; -import edu.ucsd.msjava.msgf.MSGFDBResultGenerator; -import edu.ucsd.msjava.msgf.Tolerance; -import edu.ucsd.msjava.msscorer.NewScorerFactory.SpecDataType; -import edu.ucsd.msjava.msutil.*; -import edu.ucsd.msjava.params.FileParameter; -import edu.ucsd.msjava.params.IntRangeParameter; -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.params.ToleranceParameter; -import edu.ucsd.msjava.sequences.Constants; - -import java.io.*; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -/** - * This class is deprecated - * Instead, use MSGFPlus - */ -@Deprecated -public class MSGFDB { - public static final String VERSION = "8091"; - public static final String RELEASE_DATE = "08/06/2012"; - - public static final String DECOY_PROTEIN_PREFIX = "XXX"; - public static final String DECOY_DB_EXTENSION = ".revConcat.fasta"; - - public static void main(String argv[]) { - long time = System.currentTimeMillis(); - - ParamManager paramManager = new ParamManager("MSGFDB", MSGFDB.VERSION, MSGFDB.RELEASE_DATE, "java -Xmx2000M -jar MSGFDB.jar"); - paramManager.addMSGFDBParams(); - - if (argv.length == 0) { - paramManager.printUsageInfo(); - return; - } - - // Parse parameters - String errMessage = paramManager.parseParams(argv); - if (errMessage != null) { - System.err.println("[Error] " + errMessage); - System.out.println(); - paramManager.printUsageInfo(); - return; - } - - // Running MS-GFDB (deprecated) - paramManager.printToolInfo(); - paramManager.printJVMInfo(); - String errorMessage = null; - try { - errorMessage 
= runMSGFDB(paramManager); - } catch (Exception e) { - e.printStackTrace(); - System.exit(-1); - } - - if (errorMessage != null) { - System.err.println("[Error] " + errorMessage); - System.out.println(); - System.exit(-1); - } else - System.out.format("MS-GFDB complete (total elapsed time: %.2f sec)\n", (System.currentTimeMillis() - time) / (float) 1000); - } - - public static String runMSGFDB(ParamManager paramManager) { - // Spectrum file - FileParameter specParam = paramManager.getSpecFileParam(); - File specPath = specParam.getFile(); - - if (!specPath.exists()) { - return "Spectrum file not found: " + specPath.getPath(); - } - - if (!specPath.isDirectory()) { - // Spectrum format - SpecFileFormat specFormat = (SpecFileFormat) specParam.getFileFormat(); - - // Output file - File outputFile = paramManager.getOutputFileParam().getFile(); - - return runMSGFDB(specPath, specFormat, outputFile, paramManager); - } else // spectrum directory - { - for (File f : specPath.listFiles()) { - SpecFileFormat specFormat = SpecFileFormat.getSpecFileFormat(f.getName()); - if (specParam.isSupported(specFormat)) { - System.out.println("\nProcessing " + f.getAbsolutePath()); - String outputFileName = f.getName().substring(0, f.getName().lastIndexOf('.')) + ".tsv"; - File outputFile = new File(outputFileName); - if (outputFile.exists()) - return outputFile.getAbsolutePath() + " already exists!"; - System.out.println("Writing results to " + outputFile.getAbsolutePath()); - String errMsg = runMSGFDB(f, specFormat, outputFile, paramManager); - if (errMsg != null) - return errMsg; - } - } - return null; - } - } - - private static String runMSGFDB(File specFile, SpecFileFormat specFormat, File outputFile, ParamManager paramManager) { - long time = System.currentTimeMillis(); - - // Verify that the output directory exists and can be written to - File outputDirectory = outputFile.getParentFile(); - if (outputDirectory != null) { - - if (!outputDirectory.exists()) { - 
System.out.println("Creating directory " + outputDirectory.getPath()); - boolean success = outputDirectory.mkdirs(); - if (!success) { - return "Unable to create the missing directory: " + outputDirectory.getPath(); - } - } else if (!outputDirectory.isDirectory()) { - return "Invalid output file path (file path instead of directory path?): " + outputDirectory.getPath(); - } - - // An easy way to test for write access is outputDirectory.canWrite() - // However, on Windows this is not always accurate - // Thus, create a temporary file then delete it - try { - File testFile = File.createTempFile("MSGFPlus", ".tmp", outputDirectory); - testFile.delete(); - } catch (java.io.IOException e) { - return "Cannot create files in the output directory: " + e.getMessage(); - } catch (SecurityException e) { - return "Cannot create files in the output directory; permission denied for: " + outputDirectory.getPath(); - } - } - - // DB file - File databaseFile = paramManager.getDBFileParam().getFile(); - - // Precursor mass tolerance - ToleranceParameter tol = ((ToleranceParameter) paramManager.getParameter(ParamManager.ParamNameEnum.PRECURSOR_MASS_TOLERANCE.getKey())); - Tolerance leftPrecursorMassTolerance = tol.getLeftTolerance(); - Tolerance rightPrecursorMassTolerance = tol.getRightTolerance(); - - int toleranceUnit = paramManager.getIntValue((ParamManager.ParamNameEnum.PRECURSOR_MASS_TOLERANCE_UNITS.getKey())); - if (toleranceUnit != 2) { - boolean isTolerancePPM; - isTolerancePPM = toleranceUnit != 0; - leftPrecursorMassTolerance = new Tolerance(leftPrecursorMassTolerance.getValue(), isTolerancePPM); - rightPrecursorMassTolerance = new Tolerance(rightPrecursorMassTolerance.getValue(), isTolerancePPM); - } - - int numAllowedC13 = paramManager.getIntValue(ParamManager.ParamNameEnum.C13.getKey()); - if (rightPrecursorMassTolerance.getToleranceAsDa(1000, 2) >= 0.5f) - numAllowedC13 = 0; - - Enzyme enzyme = paramManager.getEnzyme(); - int numAllowedNonEnzymaticTermini = 
paramManager.getIntValue(ParamManager.ParamNameEnum.NNET.getKey()); - ActivationMethod activationMethod = paramManager.getActivationMethod(); - InstrumentType instType = paramManager.getInstType(); - if (activationMethod == ActivationMethod.HCD) - instType = InstrumentType.HIGH_RESOLUTION_LTQ; - - Protocol protocol = paramManager.getProtocol(); - - AminoAcidSet aaSet = null; - File modFile = paramManager.getModFileParam().getFile(); - if (modFile == null) { - aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys(); - } else { - String modFileName = modFile.getName(); - String ext = modFileName.substring(modFileName.lastIndexOf('.') + 1); - if (ext.equalsIgnoreCase("xml")) - aaSet = AminoAcidSet.getAminoAcidSetFromXMLFile(modFile.getAbsolutePath()); - else - aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFile.getAbsolutePath(), paramManager); - if (aaSet.containsPhosphorylation()) { - protocol = Protocol.PHOSPHORYLATION; - } - } - - int numMatchesPerSpec = paramManager.getNumMatchesPerSpectrum(); - - IntRangeParameter specIndexParam = paramManager.getSpecIndexParameter(); - int startSpecIndex = specIndexParam.getMin(); - int endSpecIndex = specIndexParam.getMax(); - - boolean useTDA = paramManager.getIntValue(ParamManager.ParamNameEnum.TDA_STRATEGY.getKey()) == 1; - boolean showFDR = paramManager.getIntValue("showFDR") == 1; - boolean showDecoy = paramManager.getIntValue("showDecoy") == 1; - - int minPeptideLength = paramManager.getIntValue(ParamManager.ParamNameEnum.MIN_PEPTIDE_LENGTH.getKey()); - int maxPeptideLength = paramManager.getIntValue(ParamManager.ParamNameEnum.MAX_PEPTIDE_LENGTH.getKey()); - if (minPeptideLength > maxPeptideLength) { - return "MinPepLength must not be larger than MaxPepLength"; - } - - int minCharge = paramManager.getIntValue(ParamManager.ParamNameEnum.MIN_CHARGE.getKey()); - int maxCharge = paramManager.getIntValue(ParamManager.ParamNameEnum.MAX_CHARGE.getKey()); - if (minCharge > maxCharge) { - return 
"MinCharge must not be larger than MaxCharge"; - } - - int numThreads = paramManager.getIntValue(ParamManager.ParamNameEnum.NUM_THREADS.getKey()); - boolean useUniformAAProb = paramManager.getIntValue(ParamManager.ParamNameEnum.UNIFORM_AA_PROBABILITY.getKey()) == 1; - boolean replicateMergedResults = paramManager.getIntValue("replicate") == 1; - boolean doNotDseEdgeScore = paramManager.getIntValue(ParamManager.ParamNameEnum.EDGE_SCORE.getKey()) == 1; - boolean allowDenseCentroidedPeaks = paramManager.getIntValue(ParamManager.ParamNameEnum.ALLOW_DENSE_CENTROIDED_PEAKS.getKey()) == 1; - - System.out.println("Loading database files..."); - File dbIndexDir = paramManager.getFile(ParamManager.ParamNameEnum.DD_DIRECTORY.getKey()); - if (dbIndexDir != null) { - - File newDBFile = new File(Paths.get(dbIndexDir.getAbsolutePath(), databaseFile.getName()).toString()); - if (!useTDA) { - if (!newDBFile.exists()) { - System.out.println("Creating " + newDBFile.getAbsolutePath() + "."); - ReverseDB.copyDB(databaseFile.getAbsolutePath(), newDBFile.getAbsolutePath()); - } - } - databaseFile = newDBFile; - } - - if (useTDA) { - String dbFileName = databaseFile.getName(); - String concatDBFileName = dbFileName.substring(0, dbFileName.lastIndexOf('.')) + DECOY_DB_EXTENSION; - - String concatDBFilePath = Paths.get(databaseFile.getAbsoluteFile().getParent(), concatDBFileName).toString(); - File concatTargetDecoyDBFile = new File(concatDBFilePath); - - if (!concatTargetDecoyDBFile.exists()) { - System.out.println("Creating " + concatTargetDecoyDBFile.getAbsolutePath() + "."); - if (ReverseDB.reverseDB(databaseFile.getAbsolutePath(), concatTargetDecoyDBFile.getAbsolutePath(), true, DECOY_PROTEIN_PREFIX) == false) { - return "Cannot create a decoy database file!"; - } - } - databaseFile = concatTargetDecoyDBFile; - } - - if (!useUniformAAProb) - DBScanner.setAminoAcidProbabilities(databaseFile.getAbsolutePath(), aaSet); - - aaSet.registerEnzyme(enzyme); - - CompactFastaSequence 
fastaSequence = new CompactFastaSequence(databaseFile.getAbsolutePath()).truncateAnnotation(); - if (useTDA) { - float ratioUniqueProteins = fastaSequence.getRatioUniqueProteins(); - if (ratioUniqueProteins < 0.5f) { - fastaSequence.printTooManyDuplicateSequencesMessage(databaseFile.getName(), "MS-GFDB"); - System.exit(-1); - } - } - - CompactSuffixArray sa = new CompactSuffixArray(fastaSequence, maxPeptideLength); - System.out.print("Loading database finished "); - System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - time) / 1000); - - System.out.println("Reading spectra..."); - - // Show a message of the form "Opening mzML file QC_Mam_19_01_PNNL_10_06Jan21_Arwen_WBEH-20-12-01.mzML" - System.out.printf("Opening %s %s\n", specFormat.getPSIName(), specFile.getName()); - - SpectraAccessor specAcc = new SpectraAccessor(specFile, specFormat); - - if (specAcc.getSpecMap() == null || specAcc.getSpecItr() == null) - return "Error while parsing spectrum file: " + specFile.getPath(); - - - if (enzyme == null) - numAllowedNonEnzymaticTermini = 2; - - // determine the number of spectra to be scanned together - long maxMemory = Runtime.getRuntime().maxMemory() - sa.getSize() - 1 << 28; - - int avgPeptideMass = 2000; - int numBytesPerMass = 12; - int numSpecScannedTogether = (int) ((float) maxMemory / avgPeptideMass / numBytesPerMass); - ArrayList specKeyList = SpecKey.getSpecKeyList(specAcc.getSpecItr(), startSpecIndex, endSpecIndex, minCharge, maxCharge, activationMethod, Constants.MIN_NUM_PEAKS_PER_SPECTRUM, allowDenseCentroidedPeaks, 2, Integer.MAX_VALUE); - int specSize = specKeyList.size(); - - System.out.print("Reading spectra finished "); - System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - time) / 1000); - - int minSpectraPerThread = paramManager.getMinSpectraPerThread(); - numThreads = Math.min(numThreads, Math.round((float) Math.min(specSize, numSpecScannedTogether) / minSpectraPerThread)); - if 
(numThreads == 0) - numThreads = 1; - System.out.println("Using " + numThreads + (numThreads == 1 ? " thread." : " threads.")); - - SpecDataType specDataType = new SpecDataType(activationMethod, instType, enzyme, protocol); - int fromIndexGlobal = 0; - - List resultList = Collections.synchronizedList(new ArrayList()); - - while (true) { - if (fromIndexGlobal >= specSize) - break; - int toIndexGlobal = Math.min(specSize, fromIndexGlobal + numSpecScannedTogether); - System.out.println("Spectrum " + fromIndexGlobal + "-" + (toIndexGlobal - 1) + " (total: " + specSize + ")"); - - // Thread pool - ExecutorService executor = Executors.newFixedThreadPool(numThreads); - - // Partition specKeyList - int size = toIndexGlobal - fromIndexGlobal; - int subListSize = size / numThreads; - int residue = size % numThreads; - - int[] startIndex = new int[numThreads]; - int[] endIndex = new int[numThreads]; - - for (int i = 0; i < numThreads; i++) { - startIndex[i] = i > 0 ? endIndex[i - 1] : fromIndexGlobal; - endIndex[i] = startIndex[i] + subListSize + (i < residue ? 
1 : 0); - } - - for (int i = 0; i < numThreads; i++) { - ScoredSpectraMap specScanner = new ScoredSpectraMap( - specAcc, - specKeyList.subList(startIndex[i], endIndex[i]), - leftPrecursorMassTolerance, - rightPrecursorMassTolerance, - numAllowedC13, - specDataType, - false - ); - if (doNotDseEdgeScore) - specScanner.turnOffEdgeScoring(); - - ConcurrentMSGFDB.RunMSGFDB msgfdbExecutor = new ConcurrentMSGFDB.RunMSGFDB( - specScanner, - sa, - enzyme, - aaSet, - numMatchesPerSpec, - minPeptideLength, - maxPeptideLength, - numAllowedNonEnzymaticTermini, - !useTDA, - resultList, - specFile.getName(), - replicateMergedResults - ); - executor.execute(msgfdbExecutor); - } - - executor.shutdown(); - while (!executor.isTerminated()) { - } // wait until all threads terminate - - fromIndexGlobal += numSpecScannedTogether; - } - - time = System.currentTimeMillis(); - // Sort search results by spectral probabilities - Collections.sort(resultList); - - // Write results - - String header = - "#SpecFile\tSpecIndex\tScan#\t" - + "FragMethod\t" - + "Precursor\tPMError(" - + (rightPrecursorMassTolerance.isTolerancePPM() ? 
"ppm" : "Da") - + ")\tCharge\tPeptide\tProtein\tDeNovoScore\tMSGFScore\tSpecProb\tP-value"; - - MSGFDBResultGenerator gen = new MSGFDBResultGenerator(header, resultList); - - if (showFDR && !useTDA && numMatchesPerSpec == 1) { - PrintStream out = null; - if (outputFile == null) - out = System.out; - else { - try { - out = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFile))); - } catch (IOException e) { - e.printStackTrace(); - } - } - System.out.println("Computing EFDRs..."); - gen.computeEFDR(); - System.out.print("Computing EFDRs finished"); - System.out.format("(elapsed time: %.2f sec)\n", (float) (System.currentTimeMillis() - time) / 1000); - gen.writeResults(out, true, false); - if (out != System.out) - out.close(); - } else if (!showFDR || !useTDA) { - PrintStream out = null; - if (outputFile == null) - out = System.out; - else { - try { - out = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFile))); - } catch (IOException e) { - e.printStackTrace(); - } - } - gen.writeResults(out, false, false); - if (out != System.out) - out.close(); - } else { - System.out.println("Computing FDRs..."); - try { - File tempFile = null; - if (outputFile != null) { - tempFile = new File(outputFile.getAbsolutePath() + ".temp.tsv"); - } else { - tempFile = File.createTempFile("MSGFDB", "tempResult"); - tempFile.deleteOnExit(); - } - PrintStream out = new PrintStream(new BufferedOutputStream(new FileOutputStream(tempFile))); - gen.writeResults(out, false, false); - out.flush(); - out.close(); - int specFileCol = 0; - int specIndexCol = 1; - int pepCol = 7; - int dbCol = 8; - int scoreCol = 11; - edu.ucsd.msjava.fdr.ComputeFDR.computeFDR(tempFile, null, scoreCol, false, "\t", - specFileCol, specIndexCol, pepCol, null, true, showDecoy, - true, dbCol, DECOY_PROTEIN_PREFIX, - 1, 1, outputFile); - - } catch (IOException e) { - e.printStackTrace(); - } - System.out.print("Computing FDRs finished"); - System.out.format("(elapsed time: %.2f 
sec)\n", (float) (System.currentTimeMillis() - time) / 1000); - } - return null; - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/ParamManager.java b/src/main/java/edu/ucsd/msjava/params/ParamManager.java index 0dd4fafb..c5f606f3 100644 --- a/src/main/java/edu/ucsd/msjava/params/ParamManager.java +++ b/src/main/java/edu/ucsd/msjava/params/ParamManager.java @@ -36,13 +36,9 @@ public enum ParamNameEnum { DECOY_PREFIX("decoy", "DecoyPrefix", "Prefix for decoy protein names; Default: " + MSGFPlus.DEFAULT_DECOY_PROTEIN_PREFIX, null), - // Used by MS-GF+ + // -o for MS-GF+ search output SEARCH_OUTPUT_FILE("o", "OutputFile (*.pin or *.tsv)", "Default: [SpectrumFileName].pin", null), - // Used by MSGF and MS-GFDB - OUTPUT_FILE("o", "OutputFile", "Default: stdout", null), - - // MS-GF+, MSGF, and MS-GFDB PRECURSOR_MASS_TOLERANCE("t", "PrecursorMassTolerance", "e.g. 2.5Da, 20ppm or 0.5Da,2.5Da; Default: 20ppm", "Use a comma to define asymmetric values. E.g. \"-t 0.5Da,2.5Da\" will set 0.5Da to the left (ObsMass < TheoMass) and 2.5Da to the right (ObsMass > TheoMass)"), @@ -97,15 +93,6 @@ public enum ParamNameEnum { ENZYME_SPECIFICITY("ntt", "NTT", "Number of Tolerable Termini", "E.g. 
For trypsin, 0: non-tryptic, 1: semi-tryptic, 2: fully-tryptic peptides only."), - // Used by MS-GFDB - C13("c13", null, "Precursor isotope peak error", - "0 means consider only peptides matching precursor mass\n" + - "\t 1 means Consider peptides having one 13C (Default)\n" + - "\t 2 means Consider peptides having up to two 13C"), - - // Used by MS-GFDB - NNET("nnet", null, "Number of allowed non-enzymatic termini", null), - MIN_PEPTIDE_LENGTH("minLength", "MinPepLength", "Minimum peptide length to consider; Default: 6", null), MAX_PEPTIDE_LENGTH("maxLength", "MaxPepLength", "Maximum peptide length to consider; Default: 40", null), @@ -153,10 +140,6 @@ public enum ParamNameEnum { "0 means Use Edge Scoring (Default)\n" + "\t 1 means Do not use edge scoring"), - // Only used by MS-GFDB - @Deprecated - UNIFORM_AA_PROBABILITY("uniformAAProb", "UniformAAProb", null, null), - MAX_NUM_MODS("numMods", "NumMods", "Maximum number of dynamic (variable) modifications per peptide; Default: 3", null), // Note that static and dynamic modifications cannot be specified at the command line @@ -441,16 +424,6 @@ private void addDBFileParam(ParamNameEnum paramInfo, boolean isOptional) { addParameter(dbFileParam); } - private void addDBFileParam(String key, String description, boolean isOptional) { - FileParameter dbFileParam = new FileParameter(key, ParamNameEnum.DB_FILE.name, description); - if (isOptional) - dbFileParam.setAsOptional(); - dbFileParam.addFileFormat(DBFileFormat.FASTA); - dbFileParam.fileMustExist(); - dbFileParam.mustBeAFile(); - addParameter(dbFileParam); - } - private void addDecoyPrefixParam() { addDecoyPrefixParam(MSGFPlus.DEFAULT_DECOY_PROTEIN_PREFIX); } @@ -479,16 +452,6 @@ private void addMzIdOutputFileParam() { addParameter(outputParam); } - /** - * -o for MSGF and MS-GFDB - */ - private void addOutputFileParam() { - FileParameter outputParam = new FileParameter(ParamNameEnum.OUTPUT_FILE); - outputParam.setAsOptional(); - outputParam.fileMustNotExist(); - 
addParameter(outputParam); - } - /** * Used by both MS-GF+ and MS-GFDB * MS-GF+ passes True for doNotAddMergeMode, thus ignoring ActivationMethod.FUSION @@ -877,188 +840,6 @@ public void addMSGFPlusParams() { } // MSGFPlusParams - // ScoringParamGen has been removed along with the rest of the mzid - // pipeline. The former addScoringParamGenParams() method is deleted. - - @Deprecated - public void addMSGFDBParams() { - addSpecFileParam(false); - addDBFileParam(false); - - addPrecursorMassToleranceParam(); - addPrecursorMassToleranceUnitsParam(true); - - addOutputFileParam(); - - addNumThreadsParam(); - addMinSpectraPerThreadParam(); - - addTdaParam(); - - addFragMethodParam(ActivationMethod.ASWRITTEN, false); - addInstTypeParam(); - addEnzymeParam(); - addProtocolParam(); - - EnumParameter c13Param = new EnumParameter(ParamNameEnum.C13); - c13Param.registerEntry("Consider only peptides matching precursor mass"); - c13Param.registerEntry("Consider peptides having one 13C").setDefault(); - c13Param.registerEntry("Consider peptides having up to two 13C"); - addParameter(c13Param); - - EnumParameter nnetParam = new EnumParameter(ParamNameEnum.NNET); - nnetParam.registerEntry(""); - nnetParam.registerEntry("").setDefault(); - nnetParam.registerEntry(""); - addParameter(nnetParam); - - addModFileParam(); - -// FloatRangeParameter itraqParam = new FloatRangeParameter("itraq", "minMass,maxMass", "Remove MS/MS peaks in the mass range between minMass and maxMass (for iTRAQ analysis)."); -// itraqParam.minValue(0f); -// itraqParam.setMaxInclusive(); -// itraqParam.defaultValue("0,0"); -// itraqParam.setHidden(); -// addParameter(itraqParam); - - addMinPeptideLengthParam(); - addMaxPeptideLengthParam(); - addMinChargeParam(); - addMaxChargeParam(); - - addNumMatchesPerSpecParam(); - - EnumParameter uniformAAProb = new EnumParameter(ParamNameEnum.UNIFORM_AA_PROBABILITY); - uniformAAProb.registerEntry("Use amino acid probabilities computed from the input database").setDefault(); 
- uniformAAProb.registerEntry("Use probability 0.05 for all amino acids"); - addParameter(uniformAAProb); - - addAllowDenseCentroidedPeaksParam(); - - addExample("Example (high-precision): java -Xmx2000M -jar MSGFDB.jar -s test.mzXML -d IPI_human_3.79.fasta -t 30ppm -c13 1 -nnet 0 -tda 1 -o testMSGFDB.tsv"); - addExample("Example (low-precision): java -Xmx2000M -jar MSGFDB.jar -s test.mzXML -d IPI_human_3.79.fasta -t 0.5Da,2.5Da -nnet 0 -tda 1 -o testMSGFDB.tsv"); - - // Hidden parameters - addDbIndexDirParam(true); - addSpecIndexRangeParam(true); - - EnumParameter showFDRParam = new EnumParameter("showFDR"); - showFDRParam.registerEntry("Do not show FDRs"); - showFDRParam.registerEntry("Show FDRs").setDefault(); - showFDRParam.setHidden(); - addParameter(showFDRParam); - - EnumParameter showDecoyParam = new EnumParameter("showDecoy"); - showDecoyParam.registerEntry("Do not show decoy PSMs").setDefault(); - showDecoyParam.registerEntry("Show decoy PSMs"); - showDecoyParam.setHidden(); - addParameter(showDecoyParam); - - EnumParameter replicateMergedResParam = new EnumParameter("replicate"); - replicateMergedResParam.registerEntry("Show merged spectra").setDefault(); - replicateMergedResParam.registerEntry("Show individual spectra"); - replicateMergedResParam.setHidden(); - addParameter(replicateMergedResParam); - - addEdgeScoreParam(true); - -// EnumParameter percolatorParam = new EnumParameter("percolator"); -// edgeScoreParam.registerEntry("normal").setDefault(); -// edgeScoreParam.registerEntry("for MS-GF+Percolator"); -// edgeScoreParam.setHidden(); -// addParameter(percolatorParam); - - } // MSGFDBParams - - public void addMSGFParams() { - // SpectrumFile - FileParameter resFileParam = new FileParameter("i", "ResultFile", "ResultFile"); - resFileParam.fileMustExist(); - addParameter(resFileParam); - - // SpecDir - FileParameter specDirParam = new FileParameter("d", "SpecDir", "Path to directory containing spectrum files"); - specDirParam.mustBeADirectory(); - 
specDirParam.fileMustExist(); - addParameter(specDirParam); - - // OutputFileName - addOutputFileParam(); - - // DBFile - addDBFileParam("db", "To get AA frequencies, if not specified, 1/20 is used for all AAs", true); - - // Fragmentation method - addFragMethodParam(ActivationMethod.ASWRITTEN, true); - - // Instrument type - addInstTypeParam(); - - // Enzyme - addEnzymeParam(); - - // FixedMod - EnumParameter fixModParam = new EnumParameter("fixMod"); - fixModParam.registerEntry("NoCysteineProtection"); - fixModParam.registerEntry("Carbamidomethyl-C").setDefault(); - fixModParam.registerEntry("Carboxymethyl-C"); - addParameter(fixModParam); - - // -x - EnumParameter numSpecParam = new EnumParameter("x"); - numSpecParam.registerEntry("All").setDefault(); - numSpecParam.registerEntry("OnePerSpec"); - addParameter(numSpecParam); - - // -p - FloatParameter spThParam = new FloatParameter("p", "SpecProbThreshold", "Spectral probability threshold (Default: 1)"); - spThParam.minValue(0f).setMinExclusive(); - spThParam.maxValue(1f).setMaxInclusive(); - spThParam.defaultValue(1f); - addParameter(spThParam); - - // -addScore - EnumParameter addScoreParam = new EnumParameter("addScore"); - addScoreParam.registerEntry("Don't add MSGFScore").setDefault(); - addScoreParam.registerEntry("Add MSGFScore"); - addParameter(addScoreParam); - } - - public void addMSGFLibParams() { - addSpecFileParam(false); - - // Add library file param - FileParameter libFileParam = new FileParameter("d", "LibraryFile", "*.sptxt"); - libFileParam.addFileFormat(new FileFormat(".sptxt")); - libFileParam.fileMustExist(); - libFileParam.mustBeAFile(); - addParameter(libFileParam); - - addPrecursorMassToleranceParam(); - - addOutputFileParam(); - - addNumThreadsParam(); - - addFragMethodParam(ActivationMethod.ASWRITTEN, false); - addInstTypeParam(); - addEnzymeParam(); - addProtocolParam(); - - EnumParameter c13Param = new EnumParameter(ParamNameEnum.C13); - c13Param.registerEntry("Consider only peptides 
matching precursor mass"); - c13Param.registerEntry("Consider peptides having one 13C").setDefault(); - c13Param.registerEntry("Consider peptides having up to two 13C"); - addParameter(c13Param); - - IntParameter numMatchesParam = new IntParameter("n", "NumMatchesPerSpec", "Number of matches per spectrum to be reported, Default: 1"); - numMatchesParam.minValue(1); - numMatchesParam.defaultValue(1); - addParameter(numMatchesParam); - - addExample("Example: java -Xmx2000M -jar MSGFLib.jar -s test.mzXML -d IPI_human_3.79.fasta -t 30ppm -c13 1 -nnet 0 -o testMSGFDB.tsv"); - } - public FileParameter getSpecFileParam() { return ((FileParameter) getParameter(ParamNameEnum.SPECTRUM_FILE.key)); } @@ -1089,7 +870,7 @@ public IntRangeParameter getIsotopeRangeParameter() { } public FileParameter getOutputFileParam() { - return ((FileParameter) getParameter(ParamNameEnum.OUTPUT_FILE.key)); + return ((FileParameter) getParameter(ParamNameEnum.SEARCH_OUTPUT_FILE.key)); } public ActivationMethod getActivationMethod() { diff --git a/src/test/java/msgfplus/TestMinSpectraPerThread.java b/src/test/java/msgfplus/TestMinSpectraPerThread.java index 58513564..d9e6b65a 100644 --- a/src/test/java/msgfplus/TestMinSpectraPerThread.java +++ b/src/test/java/msgfplus/TestMinSpectraPerThread.java @@ -36,11 +36,4 @@ public void rejectsZero() { Assert.assertNotNull("'0' must be rejected (minValue is 1)", param.parse("0")); } - @Test - @SuppressWarnings("deprecation") - public void msgfdbEntryPointAlsoRegistersTheParam() { - ParamManager pm = new ParamManager("MSGFDB", "test", "test", "java -jar MSGFDB.jar"); - pm.addMSGFDBParams(); - Assert.assertEquals(250, pm.getMinSpectraPerThread()); - } } From de71b5885c6eed7194f69766246ea35c02509acf Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 13:34:48 +0100 Subject: [PATCH 17/34] refactor(cli): typed converters for tolerance + int-range CLI flags Replace String/Integer holders for the four range-shaped flags with small typed 
value classes carrying their own parse/converter logic: - cli/PrecursorTolerance (Tolerance left, Tolerance right) -- handles both symmetric ("20ppm") and asymmetric ("0.5Da,2.5Da") forms, validates unit-match and non-negative, exposed via picocli converter. - cli/IntRange (int min, int max) -- inclusive range, accepts "min,max" or single "n" (interpreted as "n,n"), used by -ti, -msLevel, -index. MSGFPlusOptions fields now use these typed objects directly: precursorTolerance: String -> PrecursorTolerance isotopeErrorRange: String -> IntRange msLevel: String -> IntRange specIndexRange: String -> IntRange The adapter still round-trips them through Parameter.parse(String) via toString() for now -- that round-trip goes away in Phase 4c step 3 when SearchParams.parse reads from MSGFPlusOptions directly and the legacy params/ hierarchy is deleted. Existing equivalence test still passes (asymmetric "0.5Da,2.5Da" case included). Build green. --- .../java/edu/ucsd/msjava/cli/IntRange.java | 56 +++++++++++++++++ .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 12 ++-- .../msjava/cli/MSGFPlusOptionsAdapter.java | 17 ++++-- .../ucsd/msjava/cli/PrecursorTolerance.java | 60 +++++++++++++++++++ 4 files changed, 136 insertions(+), 9 deletions(-) create mode 100644 src/main/java/edu/ucsd/msjava/cli/IntRange.java create mode 100644 src/main/java/edu/ucsd/msjava/cli/PrecursorTolerance.java diff --git a/src/main/java/edu/ucsd/msjava/cli/IntRange.java b/src/main/java/edu/ucsd/msjava/cli/IntRange.java new file mode 100644 index 00000000..fd792fe1 --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/cli/IntRange.java @@ -0,0 +1,56 @@ +package edu.ucsd.msjava.cli; + +import picocli.CommandLine.ITypeConverter; +import picocli.CommandLine.TypeConversionException; + +/** + * Inclusive integer range parsed from CLI/config-file syntax + * {@code "min,max"} or single value {@code "n"} (interpreted as + * {@code n,n}). Used by {@code -ti}, {@code -msLevel}, {@code -index}. 
+ */ +public final class IntRange { + + public final int min; + public final int max; + + public IntRange(int min, int max) { + if (min > max) { + throw new IllegalArgumentException("min (" + min + ") > max (" + max + ")"); + } + this.min = min; + this.max = max; + } + + public static IntRange parse(String value) { + String[] tok = value.split(","); + try { + if (tok.length == 1) { + int v = Integer.parseInt(tok[0].trim()); + return new IntRange(v, v); + } + if (tok.length == 2) { + return new IntRange( + Integer.parseInt(tok[0].trim()), + Integer.parseInt(tok[1].trim())); + } + } catch (NumberFormatException e) { + throw new IllegalArgumentException("invalid range: " + value, e); + } + throw new IllegalArgumentException("invalid range syntax (expected 'min,max' or single int): " + value); + } + + @Override public String toString() { + return min == max ? Integer.toString(min) : min + "," + max; + } + + /** picocli {@link ITypeConverter} that wraps {@link #parse(String)}. */ + public static final class Converter implements ITypeConverter { + @Override public IntRange convert(String value) { + try { + return parse(value); + } catch (IllegalArgumentException e) { + throw new TypeConversionException(e.getMessage()); + } + } + } +} diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index 64d07bb8..f9874771 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -58,17 +58,19 @@ public final class MSGFPlusOptions { // ---------- precursor mass tolerance ---------- @Option(names = "-t", paramLabel = "Tolerance", + converter = PrecursorTolerance.Converter.class, description = "Precursor mass tolerance, e.g. 20ppm or 0.5Da or 0.5Da,2.5Da; Default: 20ppm. 
" + "Asymmetric form sets left tolerance (ObsMass < TheoMass) and right tolerance (ObsMass > TheoMass).") - public String precursorTolerance; + public PrecursorTolerance precursorTolerance; @Option(names = "-u", paramLabel = "Units", hidden = true, description = "Tolerance units (legacy): 0=Da, 1=ppm, 2=as written in -t (Default: 2)") public Integer precursorToleranceUnits; @Option(names = "-ti", paramLabel = "Range", + converter = IntRange.Converter.class, description = "Isotope-error range, e.g. -1,2 (both inclusive); Default: 0,1") - public String isotopeErrorRange; + public IntRange isotopeErrorRange; // ---------- threading / parallelism ---------- @@ -173,8 +175,9 @@ public final class MSGFPlusOptions { public Integer allowDenseCentroidedPeaks; @Option(names = "-msLevel", paramLabel = "Range", + converter = IntRange.Converter.class, description = "MS level or range, e.g. 2 or 2,3; Default: 2,2") - public String msLevel; + public IntRange msLevel; // ---------- hidden flags ---------- @@ -183,8 +186,9 @@ public final class MSGFPlusOptions { public File dbIndexDir; @Option(names = "-index", paramLabel = "Range", hidden = true, + converter = IntRange.Converter.class, description = "Spectrum index range, e.g. 1,1000 (both inclusive)") - public String specIndexRange; + public IntRange specIndexRange; @Option(names = "-edgeScore", paramLabel = "N", hidden = true, description = "Edge scoring: 0=use (Default), 1=skip") diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java index bf38b87f..635845fe 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java @@ -46,14 +46,17 @@ public static String adapt(MSGFPlusOptions opts, ParamManager paramManager) { if ((err = setIfPresent(paramManager, ParamNameEnum.DD_DIRECTORY, opts.dbIndexDir == null ? 
null : opts.dbIndexDir.getPath())) != null) return err; - // Plain strings / domain strings parsed by ToleranceParameter / RangeParameter / EnumParameter + // Plain strings / domain strings parsed by ToleranceParameter / RangeParameter / EnumParameter. + // Typed fields (PrecursorTolerance, IntRange) are converted back to their canonical String + // form via toString() since the Phase 1 round-trip still feeds Parameter.parse(String); + // Phase 4c step 3 deletes that round-trip. if ((err = setIfPresent(paramManager, ParamNameEnum.DECOY_PREFIX, opts.decoyPrefix)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_MASS_TOLERANCE, opts.precursorTolerance)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.ISOTOPE_ERROR, opts.isotopeErrorRange)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_MASS_TOLERANCE, asString(opts.precursorTolerance))) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.ISOTOPE_ERROR, asString(opts.isotopeErrorRange))) != null) return err; if ((err = setIfPresent(paramManager, ParamNameEnum.OUTPUT_FORMAT, opts.outputFormat)) != null) return err; if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_CAL, opts.precursorCalMode)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MS_LEVEL, opts.msLevel)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.SPEC_INDEX, opts.specIndexRange)) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.MS_LEVEL, asString(opts.msLevel))) != null) return err; + if ((err = setIfPresent(paramManager, ParamNameEnum.SPEC_INDEX, asString(opts.specIndexRange))) != null) return err; // Integer-valued flags (enum + numeric) if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_MASS_TOLERANCE_UNITS, opts.precursorToleranceUnits)) != null) return err; @@ -111,4 +114,8 @@ private static String setIfPresent(ParamManager 
paramManager, ParamNameEnum name if (value == null) return null; return setIfPresent(paramManager, name, value.toString()); } + + private static String asString(Object value) { + return value == null ? null : value.toString(); + } } diff --git a/src/main/java/edu/ucsd/msjava/cli/PrecursorTolerance.java b/src/main/java/edu/ucsd/msjava/cli/PrecursorTolerance.java new file mode 100644 index 00000000..f55d0db5 --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/cli/PrecursorTolerance.java @@ -0,0 +1,60 @@ +package edu.ucsd.msjava.cli; + +import edu.ucsd.msjava.msgf.Tolerance; +import picocli.CommandLine.ITypeConverter; +import picocli.CommandLine.TypeConversionException; + +/** + * Typed precursor mass tolerance: a left and a right + * {@link Tolerance}. Supports symmetric form ({@code "20ppm"}) and + * asymmetric form ({@code "0.5Da,2.5Da"}). Both sides must use the + * same unit and be non-negative. + */ +public final class PrecursorTolerance { + + public final Tolerance left; + public final Tolerance right; + + private PrecursorTolerance(Tolerance left, Tolerance right) { + this.left = left; + this.right = right; + } + + public static PrecursorTolerance parse(String value) { + String[] tok = value.split(","); + Tolerance l, r; + if (tok.length == 1) { + l = r = Tolerance.parseToleranceStr(tok[0]); + } else if (tok.length == 2) { + l = Tolerance.parseToleranceStr(tok[0]); + r = Tolerance.parseToleranceStr(tok[1]); + } else { + throw new IllegalArgumentException("invalid tolerance value: " + value); + } + if (l == null || r == null) { + throw new IllegalArgumentException("invalid tolerance value: " + value); + } + if (l.isTolerancePPM() != r.isTolerancePPM()) { + throw new IllegalArgumentException("left and right tolerance units must be the same"); + } + if (l.getValue() < 0 || r.getValue() < 0) { + throw new IllegalArgumentException("parent mass tolerance must not be negative"); + } + return new PrecursorTolerance(l, r); + } + + @Override public String toString() { + 
return left.equals(right) ? left.toString() : left + "," + right; + } + + /** picocli {@link ITypeConverter} that wraps {@link #parse(String)}. */ + public static final class Converter implements ITypeConverter { + @Override public PrecursorTolerance convert(String value) { + try { + return parse(value); + } catch (IllegalArgumentException e) { + throw new TypeConversionException(e.getMessage()); + } + } + } +} From 03f32c1dcc667cf6457d4044b15b9d4e2e448353 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 20:15:25 +0100 Subject: [PATCH 18/34] refactor(cli): retire ParamManager from the hot path (Phase 4c) SearchParams now reads directly from MSGFPlusOptions; the legacy ParamManager + Parameter.parse round-trip is gone. What changed: 1. MSGFPlusOptions grew effective*() resolvers for every flag (with defaults: 6/40 lengths, 2/3 charges, proton mass, etc.), typed lookups for ActivationMethod/InstrumentType/Enzyme/Protocol from the existing registries, and an applyConfigFile(File) overlay that reproduces the legacy alias rewrites (IsotopeError -> IsotopeErrorRange, FragmentationMethod -> FragmentationMethodID, etc., 14 total) plus collects DynamicMod/StaticMod/CustomAA into ordered lists. CLI fields take precedence -- config-file values only fill in null fields. 2. SearchParams.parse(MSGFPlusOptions) replaces parse(ParamManager). The 35+ paramManager.getXxx() call sites translate one-for-one to opts.effective*() / opts.* reads. parseConfigParamFile is gone (its work is now opts.applyConfigFile). Spectrum-format "isSupported" check is now a small whitelist instead of routing through FileParameter. 3. AminoAcidSet.getAminoAcidSetFromModFile and the new getAminoAcidSetFromModEntries (replacing getAminoAcidSetFromList) take MSGFPlusOptions and call opts.setMaxNumModsFromMetadata when the loaded mod metadata declares a different value, mirroring the former paramManager.setMaxNumMods bidirectional handshake. 4. 
MSGFPlus.main no longer creates a ParamManager. argv -> picocli -> MSGFPlusOptions -> SearchParams.parse(opts), with picocli's usage() and new local printToolInfo()/printJVMInfo() helpers in MSGFPlus replacing the ParamManager equivalents. 5. Adapter (MSGFPlusOptionsAdapter) deleted -- the round-trip via Parameter.parse(String) is no longer needed. 6. Tests migrated to construct MSGFPlusOptions directly: SearchParamsTest, TestRunManifestWriter, TestDirectPinWriter, TestPrecursorCalScaffolding, TestPrecursorCalIntegration, TestPercolator, TestMSUtils, TestIPRG, TestCollaboration, TestCandidatePeptideGrid(ConsideringMetCleavage), TestSA, TestMinSpectraPerThread. MSGFPlusOptionsAdapterTest deleted (adapter is gone). TestIntRangeParameter deleted (params/ IntRangeParameter is the next thing to go in Phase 3). Validation: scoped sweep -- 73 tests, 0 failures, 0 errors, 5 skipped. The legacy params/ hierarchy still compiles but has no live callers on the MS-GF+ path; Phase 3 deletes it next. --- .../java/edu/ucsd/msjava/cli/MSGFPlus.java | 66 +- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 257 +++- .../msjava/cli/MSGFPlusOptionsAdapter.java | 121 -- .../ucsd/msjava/msdbsearch/SearchParams.java | 1293 ++++++++--------- .../edu/ucsd/msjava/msutil/AminoAcidSet.java | 87 +- .../java/edu/ucsd/msjava/msutil/Peptide.java | 4 +- .../cli/MSGFPlusOptionsAdapterTest.java | 103 -- .../msjava/msdbsearch/SearchParamsTest.java | 40 +- .../msgfplus/TestCandidatePeptideGrid.java | 26 +- ...datePeptideGridConsideringMetCleavage.java | 28 +- src/test/java/msgfplus/TestCollaboration.java | 8 +- .../java/msgfplus/TestDirectPinWriter.java | 68 +- src/test/java/msgfplus/TestIPRG.java | 8 +- .../java/msgfplus/TestIntRangeParameter.java | 94 -- src/test/java/msgfplus/TestMSUtils.java | 9 +- .../msgfplus/TestMinSpectraPerThread.java | 35 +- src/test/java/msgfplus/TestPercolator.java | 20 +- .../msgfplus/TestPrecursorCalIntegration.java | 43 +- .../msgfplus/TestPrecursorCalScaffolding.java | 48 +-
.../java/msgfplus/TestRunManifestWriter.java | 25 +- src/test/java/msgfplus/TestSA.java | 8 +- 21 files changed, 1053 insertions(+), 1338 deletions(-) delete mode 100644 src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java delete mode 100644 src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java delete mode 100644 src/test/java/msgfplus/TestIntRangeParameter.java diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java index fa8c8e3f..bdc330cb 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java @@ -11,7 +11,6 @@ import edu.ucsd.msjava.output.DirectPinWriter; import edu.ucsd.msjava.output.DirectTSVWriter; import edu.ucsd.msjava.mzml.StaxMzMLParser; -import edu.ucsd.msjava.params.ParamManager; import edu.ucsd.msjava.sequences.Constants; import picocli.CommandLine; import picocli.CommandLine.ParameterException; @@ -53,36 +52,44 @@ public static void main(String argv[]) { long startTime = System.currentTimeMillis(); argvSnapshot = argv == null ? new String[0] : argv.clone(); - ParamManager paramManager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "java -Xmx3500M -jar MSGFPlus.jar"); - paramManager.addMSGFPlusParams(); + MSGFPlusOptions opts = new MSGFPlusOptions(); + CommandLine cl = new CommandLine(opts); if (argv.length == 0) { - paramManager.printUsageInfo(); + printToolInfo(); + cl.usage(System.out); return; } StaxMzMLParser.turnOffLogs(); - // Parse parameters. The new picocli-based path runs by default; - // the legacy ParamManager.parseParams handles -conf (config-file - // input) until Phase 2 ports it. See parameter-modernization.md. 
- String errMessage = parseArgs(argv, paramManager); - if (errMessage != null) { - MSGFLogger.error(errMessage); + try { + cl.parseArgs(argv); + } catch (ParameterException e) { + MSGFLogger.error(e.getMessage()); System.out.println(); - paramManager.printUsageInfo(); + cl.usage(System.out); System.exit(-1); } + if (cl.isUsageHelpRequested()) { + cl.usage(System.out); + return; + } + if (cl.isVersionHelpRequested()) { + System.out.println(VERSION); + return; + } + // Propagate verbose flag to the shared logger before any downstream code logs. - MSGFLogger.setVerbose(paramManager.getVerboseFlag() == 1); + MSGFLogger.setVerbose(opts.effectiveVerbose() == 1); + + printToolInfo(); + printJVMInfo(); - // Running MS-GF+ - paramManager.printToolInfo(); - paramManager.printJVMInfo(); String errorMessage = null; try { - errorMessage = runMSGFPlus(paramManager); + errorMessage = runMSGFPlus(opts); } catch (Exception e) { e.printStackTrace(); System.exit(-1); @@ -96,27 +103,18 @@ public static void main(String argv[]) { MSGFLogger.info("MS-GF+ complete (total elapsed time: %.2f sec)", (System.currentTimeMillis() - startTime) / (float) 1000); } - /** - * Route MSGFPlus argv through the typed picocli-based - * {@link MSGFPlusOptions} class + adapter. Config-file values are - * applied later by {@link edu.ucsd.msjava.msdbsearch.SearchParams#parse} - * for any parameter the CLI did not assign, so {@code -conf} works - * uniformly through this single path. See - * {@code .claude/plans/parameter-modernization.md}. 
- */ - private static String parseArgs(String[] argv, ParamManager paramManager) { - MSGFPlusOptions opts = new MSGFPlusOptions(); - try { - new CommandLine(opts).parseArgs(argv); - } catch (ParameterException e) { - return e.getMessage(); - } - return MSGFPlusOptionsAdapter.adapt(opts, paramManager); + private static void printToolInfo() { + System.out.println("MS-GF+ " + VERSION + " (" + RELEASE_DATE + ")"); + } + + private static void printJVMInfo() { + System.out.println("Java " + System.getProperty("java.version") + " (" + System.getProperty("java.vendor") + ")"); + System.out.println(System.getProperty("os.name") + " (" + System.getProperty("os.arch") + ", version " + System.getProperty("os.version") + ")"); } - public static String runMSGFPlus(ParamManager paramManager) { + public static String runMSGFPlus(MSGFPlusOptions opts) { SearchParams params = new SearchParams(); - String errorMessage = params.parse(paramManager); + String errorMessage = params.parse(opts); if (errorMessage != null) { return errorMessage; diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index f9874771..26f82988 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -1,10 +1,18 @@ package edu.ucsd.msjava.cli; -import edu.ucsd.msjava.params.ParamManager.ParamNameEnum; +import edu.ucsd.msjava.msutil.ActivationMethod; +import edu.ucsd.msjava.msutil.Enzyme; +import edu.ucsd.msjava.msutil.InstrumentType; +import edu.ucsd.msjava.msutil.Protocol; import picocli.CommandLine.Command; import picocli.CommandLine.Option; +import java.io.BufferedReader; import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; /** * Typed command-line options for MS-GF+. 
Replaces the imperative @@ -209,4 +217,251 @@ public final class MSGFPlusOptions { @Option(names = "-minDeNovoScore", paramLabel = "N", hidden = true, description = "Minimum de novo score") public Integer minDeNovoScore; + + // ---------- config-file-only entries (populated by applyConfigFile) ---------- + + /** {@code DynamicMod=...} entries from the config file (or {@code -mod} file). */ + public final List dynamicMods = new ArrayList<>(); + /** {@code StaticMod=...} entries from the config file (or {@code -mod} file). */ + public final List staticMods = new ArrayList<>(); + /** {@code CustomAA=...} entries from the config file (or {@code -mod} file). */ + public final List customAAs = new ArrayList<>(); + + /** Set when {@link #applyConfigFile(File)} encounters {@code MaxNumModsPerPeptide=} + * via the legacy alias path; allows the config-file value to feed the + * {@link #effectiveMaxNumMods()} default. */ + private Integer configMaxNumMods; + + // ---------- effective-value resolvers (CLI value, else config-file value, else default) ---------- + + public int effectiveMinPeptideLength() { return minPeptideLength != null ? minPeptideLength : 6; } + public int effectiveMaxPeptideLength() { return maxPeptideLength != null ? maxPeptideLength : 40; } + public int effectiveMinCharge() { return minCharge != null ? minCharge : 2; } + public int effectiveMaxCharge() { return maxCharge != null ? maxCharge : 3; } + public int effectiveNumMatchesPerSpec() { return numMatchesPerSpec != null ? numMatchesPerSpec : 1; } + public int effectiveNumThreads() { return numThreads != null ? numThreads : Runtime.getRuntime().availableProcessors(); } + public int effectiveNumTasks() { return numTasks != null ? numTasks : 0; } + public int effectiveMinSpectraPerThread() { return minSpectraPerThread != null ? minSpectraPerThread : 250; } + public int effectiveVerbose() { return verbose != null ? verbose : 0; } + public int effectiveTdaStrategy() { return tdaStrategy != null ? 
tdaStrategy : 0; } + public int effectiveAddFeatures() { return addFeatures != null ? addFeatures : 0; } + public int effectiveMaxMissedCleavages() { return maxMissedCleavages != null ? maxMissedCleavages : -1; } + public int effectiveMaxNumMods() { return maxNumMods != null ? maxNumMods : (configMaxNumMods != null ? configMaxNumMods : 3); } + public int effectiveAllowDenseCentroidedPeaks() { return allowDenseCentroidedPeaks != null ? allowDenseCentroidedPeaks : 0; } + public int effectiveNumTolerableTermini() { return numTolerableTermini != null ? numTolerableTermini : 2; } + public int effectiveEdgeScore() { return edgeScore != null ? edgeScore : 0; } + public int effectiveIgnoreMetCleavage() { return ignoreMetCleavage != null ? ignoreMetCleavage : 0; } + public int effectiveMinNumPeaks() { return minNumPeaks != null ? minNumPeaks : edu.ucsd.msjava.sequences.Constants.MIN_NUM_PEAKS_PER_SPECTRUM; } + public int effectiveNumIsoforms() { return numIsoforms != null ? numIsoforms : edu.ucsd.msjava.sequences.Constants.NUM_VARIANTS_PER_PEPTIDE; } + public int effectiveMinDeNovoScore() { return minDeNovoScore != null ? minDeNovoScore : edu.ucsd.msjava.sequences.Constants.MIN_DE_NOVO_SCORE; } + public int effectiveToleranceUnits() { return precursorToleranceUnits != null ? precursorToleranceUnits : 2; } + public double effectiveChargeCarrierMass() { return chargeCarrierMass != null ? chargeCarrierMass : 1.00727649; } + + public String effectiveDecoyPrefix() { return decoyPrefix != null ? decoyPrefix : "XXX"; } + public String effectivePrecursorCalRaw() { return precursorCalMode != null ? precursorCalMode : "auto"; } + + /** 0 = pin (default), 1 = tsv. */ + public int effectiveOutputFormat() { + if (outputFormat == null) return 0; + String n = outputFormat.trim().toLowerCase(); + if (n.equals("tsv") || n.equals("1")) return 1; + return 0; + } + + public PrecursorTolerance effectivePrecursorTolerance() { + return precursorTolerance != null ? 
precursorTolerance : PrecursorTolerance.parse("20ppm"); + } + + public IntRange effectiveIsotopeErrorRange() { + return isotopeErrorRange != null ? isotopeErrorRange : new IntRange(0, 1); + } + + public IntRange effectiveMSLevel() { + return msLevel != null ? msLevel : new IntRange(2, 2); + } + + public IntRange effectiveSpecIndexRange() { + return specIndexRange != null ? specIndexRange : new IntRange(1, Integer.MAX_VALUE - 1); + } + + /** Resolves {@code -m} index to {@link ActivationMethod}. MSGFPlus exposes + * 0=ASWRITTEN, 1=CID, 2=ETD, 3=HCD (FUSION is excluded by + * {@code addFragMethodParam(..., doNotAddMergeMode=true)}). */ + public ActivationMethod effectiveActivationMethod() { + int idx = fragMethodId != null ? fragMethodId : 0; + switch (idx) { + case 0: return ActivationMethod.ASWRITTEN; + case 1: return ActivationMethod.CID; + case 2: return ActivationMethod.ETD; + case 3: return ActivationMethod.HCD; + default: throw new IllegalArgumentException("invalid -m index: " + idx); + } + } + + public InstrumentType effectiveInstrumentType() { + InstrumentType[] all = InstrumentType.getAllRegisteredInstrumentTypes(); + int idx = instrumentTypeId != null ? instrumentTypeId : 0; + if (idx < 0 || idx >= all.length) throw new IllegalArgumentException("invalid -inst index: " + idx); + return all[idx]; + } + + public Enzyme effectiveEnzyme() { + Enzyme[] all = Enzyme.getAllRegisteredEnzymes(); + // TRYPSIN is registered at index 1 (UnspecificCleavage at 0). See Enzyme static init. + int idx = enzymeId != null ? enzymeId : 1; + if (idx < 0 || idx >= all.length) throw new IllegalArgumentException("invalid -e index: " + idx); + return all[idx]; + } + + public Protocol effectiveProtocol() { + Protocol[] all = Protocol.getAllRegisteredProtocols(); + int idx = protocolId != null ? 
protocolId : 0; + if (idx < 0 || idx >= all.length) throw new IllegalArgumentException("invalid -protocol index: " + idx); + return all[idx]; + } + + // ---------- config-file overlay ---------- + + /** + * Read {@code -conf} config file and populate any fields the CLI did not + * already set. Recognizes legacy aliases (IsotopeError → IsotopeErrorRange, + * etc.) and collects repeated {@code DynamicMod=}, {@code StaticMod=}, + * {@code CustomAA=} entries. + * + * @return null on success, error string otherwise. + */ + public String applyConfigFile(File file) { + try (BufferedReader reader = new BufferedReader(new FileReader(file))) { + String line; + int lineNum = 0; + while ((line = reader.readLine()) != null) { + lineNum++; + String trimmed = stripComment(line); + if (trimmed.isEmpty()) continue; + int eq = trimmed.indexOf('='); + if (eq <= 0) continue; + String rawKey = trimmed.substring(0, eq).trim(); + String value = trimmed.substring(eq + 1).trim(); + String key = canonicalConfigKey(rawKey); + String err = applyConfigEntry(key, value, file.getName()); + if (err != null) { + return "Error parsing line " + lineNum + " of " + file.getName() + ": " + err; + } + } + } catch (IOException e) { + return "Error reading config file " + file.getPath() + ": " + e.getMessage(); + } + return null; + } + + private String applyConfigEntry(String key, String value, String fileName) { + // Repeated entries: collect into lists. "none" is treated as no entry. + if (key.equalsIgnoreCase("DynamicMod")) { + if (!value.equalsIgnoreCase("none")) dynamicMods.add(value); + return null; + } + if (key.equalsIgnoreCase("StaticMod")) { + if (!value.equalsIgnoreCase("none")) staticMods.add(value); + return null; + } + if (key.equalsIgnoreCase("CustomAA")) { + if (!value.equalsIgnoreCase("none")) customAAs.add(value); + return null; + } + // Single-valued entries: only fill in if CLI did not set the field. 
+ try { + switch (key) { + case "SpectrumFile": if (spectrumFile == null) spectrumFile = new File(value); return null; + case "DatabaseFile": if (databaseFile == null) databaseFile = new File(value); return null; + case "OutputFile": if (outputFile == null) outputFile = new File(value); return null; + case "ModificationFileName": + case "ModificationFile": if (modificationFile == null) modificationFile = new File(value); return null; + case "DBIndexDir": if (dbIndexDir == null) dbIndexDir = new File(value); return null; + case "DecoyPrefix": if (decoyPrefix == null) decoyPrefix = value; return null; + case "PrecursorMassTolerance": if (precursorTolerance == null) precursorTolerance = PrecursorTolerance.parse(value); return null; + case "PrecursorMassToleranceUnits": + if (precursorToleranceUnits == null) precursorToleranceUnits = Integer.parseInt(value); return null; + case "IsotopeErrorRange": if (isotopeErrorRange == null) isotopeErrorRange = IntRange.parse(value); return null; + case "FragmentationMethodID": if (fragMethodId == null) fragMethodId = Integer.parseInt(value); return null; + case "InstrumentID": if (instrumentTypeId == null) instrumentTypeId = Integer.parseInt(value); return null; + case "EnzymeID": if (enzymeId == null) enzymeId = Integer.parseInt(value); return null; + case "ProtocolID": if (protocolId == null) protocolId = Integer.parseInt(value); return null; + case "NTT": if (numTolerableTermini == null) numTolerableTermini = Integer.parseInt(value); return null; + case "MinPepLength": if (minPeptideLength == null) minPeptideLength = Integer.parseInt(value); return null; + case "MaxPepLength": if (maxPeptideLength == null) maxPeptideLength = Integer.parseInt(value); return null; + case "MinCharge": if (minCharge == null) minCharge = Integer.parseInt(value); return null; + case "MaxCharge": if (maxCharge == null) maxCharge = Integer.parseInt(value); return null; + case "NumMatchesPerSpec": if (numMatchesPerSpec == null) numMatchesPerSpec = 
Integer.parseInt(value); return null; + case "NumThreads": if (numThreads == null) { if (!value.equalsIgnoreCase("all")) numThreads = Integer.parseInt(value); } return null; + case "NumTasks": if (numTasks == null) numTasks = Integer.parseInt(value); return null; + case "MinSpectraPerThread": if (minSpectraPerThread == null) minSpectraPerThread = Integer.parseInt(value); return null; + case "Verbose": if (verbose == null) verbose = Integer.parseInt(value); return null; + case "TDA": if (tdaStrategy == null) tdaStrategy = Integer.parseInt(value); return null; + case "AddFeatures": if (addFeatures == null) addFeatures = Integer.parseInt(value); return null; + case "OutputFormat": if (outputFormat == null) outputFormat = value; return null; + case "PrecursorCal": if (precursorCalMode == null) precursorCalMode = value; return null; + case "ChargeCarrierMass": if (chargeCarrierMass == null) chargeCarrierMass = Double.parseDouble(value); return null; + case "MaxMissedCleavages": if (maxMissedCleavages == null) maxMissedCleavages = Integer.parseInt(value); return null; + case "NumMods": if (maxNumMods == null) configMaxNumMods = Integer.parseInt(value); return null; + case "AllowDenseCentroidedPeaks": + if (allowDenseCentroidedPeaks == null) allowDenseCentroidedPeaks = Integer.parseInt(value); return null; + case "MSLevel": if (msLevel == null) msLevel = IntRange.parse(value); return null; + case "SpecIndex": if (specIndexRange == null) specIndexRange = IntRange.parse(value); return null; + case "EdgeScore": if (edgeScore == null) edgeScore = Integer.parseInt(value); return null; + case "MinNumPeaksPerSpectrum": if (minNumPeaks == null) minNumPeaks = Integer.parseInt(value); return null; + case "NumIsoforms": if (numIsoforms == null) numIsoforms = Integer.parseInt(value); return null; + case "IgnoreMetCleavage": if (ignoreMetCleavage == null) ignoreMetCleavage = Integer.parseInt(value); return null; + case "MinDeNovoScore": if (minDeNovoScore == null) minDeNovoScore = 
Integer.parseInt(value); return null; + default: + if (!key.toLowerCase().startsWith("enzymedef")) { + System.out.println("Warning, unrecognized parameter '" + key + "=" + value + "' in config file " + fileName); + } + return null; + } + } catch (IllegalArgumentException e) { + return "invalid value for '" + key + "': " + value + " (" + e.getMessage() + ")"; + } + } + + private static String stripComment(String line) { + int hash = line.indexOf('#'); + return (hash >= 0 ? line.substring(0, hash) : line).trim(); + } + + /** Normalize legacy / alternate config-file keys to canonical form. + * Mirrors the rewrites previously in {@code ParamNameEnum.getParamNameFromLine}. */ + private static String canonicalConfigKey(String key) { + if (key.equalsIgnoreCase("IsotopeError")) return "IsotopeErrorRange"; + if (key.equalsIgnoreCase("TargetDecoyAnalysis")) return "TDA"; + if (key.equalsIgnoreCase("FragmentationMethod")) return "FragmentationMethodID"; + if (key.equalsIgnoreCase("Instrument")) return "InstrumentID"; + if (key.equalsIgnoreCase("Enzyme")) return "EnzymeID"; + if (key.equalsIgnoreCase("Protocol")) return "ProtocolID"; + if (key.equalsIgnoreCase("NumTolerableTermini")) return "NTT"; + if (key.equalsIgnoreCase("MinNumPeaks")) return "MinNumPeaksPerSpectrum"; + if (key.equalsIgnoreCase("MaxNumMods")) return "NumMods"; + if (key.equalsIgnoreCase("MaxNumModsPerPeptide")) return "NumMods"; + if (key.equalsIgnoreCase("minLength")) return "MinPepLength"; + if (key.equalsIgnoreCase("MinPeptideLength")) return "MinPepLength"; + if (key.equalsIgnoreCase("maxLength")) return "MaxPepLength"; + if (key.equalsIgnoreCase("MaxPeptideLength")) return "MaxPepLength"; + if (key.equalsIgnoreCase("PMTolerance")) return "PrecursorMassTolerance"; + if (key.equalsIgnoreCase("ParentMassTolerance")) return "PrecursorMassTolerance"; + return key; + } + + /** Validates required-input invariants that the CLI alone can't enforce + * (since {@code -s}/{@code -d} may come from {@code -conf}). 
*/ + public String validateRequired() { + if (spectrumFile == null) return "Spectrum file is not defined; use -s at the command line or SpectrumFile in a config file"; + if (databaseFile == null) return "Database file is not defined; use -d at the command line or DatabaseFile in a config file"; + return null; + } + + /** Mutator used by {@code AminoAcidSet} when the parsed mod metadata + * changes the effective max-num-mods (the AA set is authoritative once + * loaded). Mirrors the legacy {@code ParamManager.setMaxNumMods}. */ + public void setMaxNumModsFromMetadata(int n) { + this.maxNumMods = n; + } } diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java deleted file mode 100644 index 635845fe..00000000 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapter.java +++ /dev/null @@ -1,121 +0,0 @@ -package edu.ucsd.msjava.cli; - -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.params.ParamManager.ParamNameEnum; -import edu.ucsd.msjava.params.Parameter; - -/** - * Phase 1 adapter: populates a {@link ParamManager} from a parsed - * {@link MSGFPlusOptions} by round-tripping each set field through the - * canonical string form that the existing - * {@link Parameter#parse(String)} hierarchy expects. - * - * This deliberately reuses the legacy parsing logic so Phase 1 is - * behavior-preserving. Phase 3 deletes the {@code params.Parameter} - * hierarchy and replaces this adapter with direct construction of the - * downstream {@code SearchParams}. - * - * Returns {@code null} on success, or a human-readable error string - * matching the format used by {@link ParamManager#parseParams(String[])}. - */ -public final class MSGFPlusOptionsAdapter { - - private MSGFPlusOptionsAdapter() {} - - /** - * Populate {@code paramManager} (already initialized via - * {@link ParamManager#addMSGFPlusParams()}) with values from - * {@code opts}. 
Caller is responsible for calling - * {@link ParamManager#isValid()} afterwards if final validation - * is desired (this method also runs it as the last step). - */ - public static String adapt(MSGFPlusOptions opts, ParamManager paramManager) { - String err; - - // Files / paths - if ((err = setIfPresent(paramManager, ParamNameEnum.CONFIGURATION_FILE, - opts.configFile == null ? null : opts.configFile.getPath())) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.SPECTRUM_FILE, - opts.spectrumFile == null ? null : opts.spectrumFile.getPath())) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.DB_FILE, - opts.databaseFile == null ? null : opts.databaseFile.getPath())) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.SEARCH_OUTPUT_FILE, - opts.outputFile == null ? null : opts.outputFile.getPath())) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MOD_FILE, - opts.modificationFile == null ? null : opts.modificationFile.getPath())) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.DD_DIRECTORY, - opts.dbIndexDir == null ? null : opts.dbIndexDir.getPath())) != null) return err; - - // Plain strings / domain strings parsed by ToleranceParameter / RangeParameter / EnumParameter. - // Typed fields (PrecursorTolerance, IntRange) are converted back to their canonical String - // form via toString() since the Phase 1 round-trip still feeds Parameter.parse(String); - // Phase 4c step 3 deletes that round-trip. 
- if ((err = setIfPresent(paramManager, ParamNameEnum.DECOY_PREFIX, opts.decoyPrefix)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_MASS_TOLERANCE, asString(opts.precursorTolerance))) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.ISOTOPE_ERROR, asString(opts.isotopeErrorRange))) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.OUTPUT_FORMAT, opts.outputFormat)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_CAL, opts.precursorCalMode)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MS_LEVEL, asString(opts.msLevel))) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.SPEC_INDEX, asString(opts.specIndexRange))) != null) return err; - - // Integer-valued flags (enum + numeric) - if ((err = setIfPresent(paramManager, ParamNameEnum.PRECURSOR_MASS_TOLERANCE_UNITS, opts.precursorToleranceUnits)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.NUM_THREADS, opts.numThreads)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.NUM_TASKS, opts.numTasks)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_SPECTRA_PER_THREAD, opts.minSpectraPerThread)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.VERBOSE, opts.verbose)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.TDA_STRATEGY, opts.tdaStrategy)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.FRAG_METHOD, opts.fragMethodId)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.INSTRUMENT_TYPE, opts.instrumentTypeId)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.ENZYME_ID, opts.enzymeId)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.PROTOCOL_ID, opts.protocolId)) != null) return err; - if ((err = 
setIfPresent(paramManager, ParamNameEnum.ENZYME_SPECIFICITY, opts.numTolerableTermini)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_PEPTIDE_LENGTH, opts.minPeptideLength)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MAX_PEPTIDE_LENGTH, opts.maxPeptideLength)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_CHARGE, opts.minCharge)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MAX_CHARGE, opts.maxCharge)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.NUM_MATCHES_SPEC, opts.numMatchesPerSpec)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.ADD_FEATURES, opts.addFeatures)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MAX_MISSED_CLEAVAGES, opts.maxMissedCleavages)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MAX_NUM_MODS, opts.maxNumMods)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.ALLOW_DENSE_CENTROIDED_PEAKS, opts.allowDenseCentroidedPeaks)) != null) return err; - - // Hidden integer flags - if ((err = setIfPresent(paramManager, ParamNameEnum.EDGE_SCORE, opts.edgeScore)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_NUM_PEAKS, opts.minNumPeaks)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.NUM_ISOFORMS, opts.numIsoforms)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.IGNORE_MET_CLEAVAGE, opts.ignoreMetCleavage)) != null) return err; - if ((err = setIfPresent(paramManager, ParamNameEnum.MIN_DE_NOVO_SCORE, opts.minDeNovoScore)) != null) return err; - - // Doubles - if ((err = setIfPresent(paramManager, ParamNameEnum.CHARGE_CARRIER_MASSES, opts.chargeCarrierMass)) != null) return err; - - return paramManager.isValid(); - } - - private static String setIfPresent(ParamManager paramManager, ParamNameEnum name, 
String value) { - if (value == null) return null; - Parameter p = paramManager.getParameter(name.getKey()); - if (p == null) return "Internal error: parameter not registered: -" + name.getKey(); - String err = p.parse(value); - if (err != null) { - return "Invalid value for parameter -" + name.getKey() + ": " + value + "\n (" + err + ")"; - } - p.setValueAssigned(); - return null; - } - - private static String setIfPresent(ParamManager paramManager, ParamNameEnum name, Integer value) { - if (value == null) return null; - return setIfPresent(paramManager, name, value.toString()); - } - - private static String setIfPresent(ParamManager paramManager, ParamNameEnum name, Double value) { - if (value == null) return null; - return setIfPresent(paramManager, name, value.toString()); - } - - private static String asString(Object value) { - return value == null ? null : value.toString(); - } -} diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index 210f968f..a089d88c 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -1,707 +1,586 @@ -package edu.ucsd.msjava.msdbsearch; - -import edu.ucsd.msjava.msgf.Tolerance; -import edu.ucsd.msjava.msutil.*; -import edu.ucsd.msjava.params.*; -import edu.ucsd.msjava.parser.BufferedLineReader; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Hashtable; -import java.util.List; - -import static edu.ucsd.msjava.msutil.Composition.POTASSIUM_CHARGE_CARRIER_MASS; -import static edu.ucsd.msjava.msutil.Composition.PROTON; -import static edu.ucsd.msjava.msutil.Composition.SODIUM_CHARGE_CARRIER_MASS; - -public class SearchParams { - - /** - * Two-pass precursor mass calibration (P2-cal) mode. - * - *
<ul> - *
<li>{@link #AUTO} (default) — run the pre-pass, apply the learned shift - * only if at least 200 high-confidence PSMs are collected; otherwise - * fall through with a 0 ppm shift.</li> - *
<li>{@link #ON} — run the pre-pass and always apply the learned shift, - * even when fewer than 200 confident PSMs are collected.</li> - *
<li>{@link #OFF} — skip calibration entirely. The code path MUST be - * bit-identical to a baseline build without the flag.</li> - *
</ul>
- */ - public enum PrecursorCalMode { - AUTO, - ON, - OFF; - - /** - * Case-insensitive string to enum conversion. Unknown values fall - * back to {@link #AUTO} so that downstream code never crashes if a - * typo slips past CLI parsing. - */ - public static PrecursorCalMode fromString(String s) { - if (s == null) return AUTO; - String normalized = s.trim().toLowerCase(); - switch (normalized) { - case "on": - return ON; - case "off": - return OFF; - case "auto": - case "": - return AUTO; - default: - return AUTO; - } - } - } - - private List dbSearchIOList; - private File databaseFile; - private String decoyProteinPrefix; - private Tolerance leftPrecursorMassTolerance; - private Tolerance rightPrecursorMassTolerance; - private int minIsotopeError; - private int maxIsotopeError; - private Enzyme enzyme; - private int numTolerableTermini; - private ActivationMethod activationMethod; - private InstrumentType instType; - private Protocol protocol; - private AminoAcidSet aaSet; - private int numMatchesPerSpec; - private int startSpecIndex; - private int endSpecIndex; - private boolean useTDA; - private boolean ignoreMetCleavage; - private int minPeptideLength; - private int maxPeptideLength; - private int maxNumVariantsPerPeptide; - private int minCharge; - private int maxCharge; - private int numThreads; - private int numTasks; - private int minSpectraPerThread; - private boolean verbose; - private boolean doNotUseEdgeScore; - private File dbIndexDir; - private boolean outputAdditionalFeatures; - private int minNumPeaksPerSpectrum; - private int minDeNovoScore; - private double chargeCarrierMass; - private int maxMissedCleavages; - private int maxNumMods; - private boolean allowDenseCentroidedPeaks; - private int minMSLevel; - private int maxMSLevel; - private int outputFormat; // 0=pin (default), 1=tsv — mzid output removed - private PrecursorCalMode precursorCalMode = PrecursorCalMode.AUTO; - - public SearchParams() { - } - - /** - * Returns the configured precursor 
mass calibration mode; defaults - * to {@link PrecursorCalMode#AUTO}. - */ - public PrecursorCalMode getPrecursorCalMode() { - return precursorCalMode; - } - - // Used by MS-GF+ - public List getDBSearchIOList() { - return dbSearchIOList; - } - - // Used by MS-GF+ - public File getDatabaseFile() { - return databaseFile; - } - - // Used by MS-GF+ - public String getDecoyProteinPrefix() { - return decoyProteinPrefix; - } - - // Used by MS-GF+ - public Tolerance getLeftPrecursorMassTolerance() { - return leftPrecursorMassTolerance; - } - - // Used by MS-GF+ - public Tolerance getRightPrecursorMassTolerance() { - return rightPrecursorMassTolerance; - } - - // Used by MS-GF+ - public int getMinIsotopeError() { - return minIsotopeError; - } - - // Used by MS-GF+ - public int getMaxIsotopeError() { - return maxIsotopeError; - } - - // Used by MS-GF+ - public Enzyme getEnzyme() { - return enzyme; - } - - public int getNumTolerableTermini() { - return numTolerableTermini; - } - - // Used by MS-GF+ - public ActivationMethod getActivationMethod() { - return activationMethod; - } - - // Used by MS-GF+ - public InstrumentType getInstType() { - return instType; - } - - // Used by MS-GF+ - public Protocol getProtocol() { - return protocol; - } - - // Used by MS-GF+ - public AminoAcidSet getAASet() { - return aaSet; - } - - // Used by MS-GF+ - public int getNumMatchesPerSpec() { - return numMatchesPerSpec; - } - - // Used by MS-GF+ - public int getStartSpecIndex() { - return startSpecIndex; - } - - // Used by MS-GF+ - public int getEndSpecIndex() { - return endSpecIndex; - } - - // Used by MS-GF+ - public boolean useTDA() { - return useTDA; - } - - // Used by MS-GF+ - public boolean ignoreMetCleavage() { - return ignoreMetCleavage; - } - - // Used by MS-GF+ - public int getMinPeptideLength() { - return minPeptideLength; - } - - // Used by MS-GF+ - public int getMaxPeptideLength() { - return maxPeptideLength; - } - - // Used by MS-GF+ - public int getMaxNumVariantsPerPeptide() { - 
return maxNumVariantsPerPeptide; - } - - // Used by MS-GF+ - public int getMinCharge() { - return minCharge; - } - - // Used by MS-GF+ - public int getMaxCharge() { - return maxCharge; - } - - // Used by MS-GF+ - public int getNumThreads() { - return numThreads; - } - - public int getNumTasks() { - return numTasks; - } - - public int getMinSpectraPerThread() { - return minSpectraPerThread; - } - - public boolean getVerbose() { - return verbose; - } - - // Used by MS-GF+ - public boolean doNotUseEdgeScore() { - return doNotUseEdgeScore; - } - - // Used by MS-GF+ - public File getDBIndexDir() { - return dbIndexDir; - } - - public boolean outputAdditionalFeatures() { - return outputAdditionalFeatures; - } - - // Used by MS-GF+ - public int getMinNumPeaksPerSpectrum() { - return minNumPeaksPerSpectrum; - } - - // Used by MS-GF+ - public int getMinDeNovoScore() { - return minDeNovoScore; - } - - public double getChargeCarrierMass() { - return chargeCarrierMass; - } - - // Used by MS-GF+ - public int getMaxMissedCleavages() { - return maxMissedCleavages; - } - - // Used by MS-GF+ - public boolean getAllowDenseCentroidedPeaks() { - return allowDenseCentroidedPeaks; - } - - // Used by MS-GF+ - public int getMinMSLevel() { - return minMSLevel; - } - - // Used by MS-GF+ - public int getMaxMSLevel() { - return maxMSLevel; - } - - /** 0=pin (default), 1=tsv. 
*/ - public int getOutputFormat() { - return outputFormat; - } - - public boolean writeTsv() { - return outputFormat == 1; - } - - public boolean writePin() { - return outputFormat == 0; - } - - /** - * Look for # in dataLine - * If present, remove that character and any comment after it - * - * @param dataLine - * @return dataLine without the comment - */ - public static String getConfigLineWithoutComment(String dataLine) { - String[] tokenArray = dataLine.split("#"); - if (tokenArray.length == 0) - return ""; - - return tokenArray[0].trim(); - } - - // Used by MS-GF+ - public String parse(ParamManager paramManager) { - AminoAcidSet configAASet = null; - FileParameter configFileParam = paramManager.getConfigFileParam(); - - if (configFileParam != null && configFileParam.getFile() != null) { - configAASet = parseConfigParamFile(paramManager); - } - - // Charge carrier mass - chargeCarrierMass = paramManager.getChargeCarrierMass(); - Composition.setChargeCarrierMass(chargeCarrierMass); - - // Spectrum file - // Read outputFormat up-front so the default-output-file extension - // logic below (inside both the single-file and directory branches) - // sees the user-supplied value, not the field's zero initializer. - outputFormat = paramManager.getOutputFormat(); - - FileParameter specParam = paramManager.getSpecFileParam(); - File specPath = specParam.getFile(); - - if (specPath == null) - { - return "Spectrum file is not defined; use -s at the command line or SpectrumFile in a config file"; - } - - if (!specPath.exists()) { - return "Spectrum file not found: " + specPath.getPath(); - } - - dbSearchIOList = new ArrayList<>(); - - if (!specPath.isDirectory()) { - // Spectrum format - SpecFileFormat specFormat = (SpecFileFormat) specParam.getFileFormat(); - // Output file - File outputFile = paramManager.getOutputFileParam().getFile(); - if (outputFile == null) { - String defaultExt = outputFormat == 1 ? 
".tsv" : ".pin"; - String outputFilePath = specPath.getPath().substring(0, specPath.getPath().lastIndexOf('.')) + defaultExt; - outputFile = new File(outputFilePath); - } - - dbSearchIOList = new ArrayList<>(); - dbSearchIOList.add(new DBSearchIOFiles(specPath, specFormat, outputFile)); - } else // spectrum directory - { - dbSearchIOList = new ArrayList<>(); - String defaultExt = outputFormat == 1 ? ".tsv" : ".pin"; - for (File f : specPath.listFiles()) { - SpecFileFormat specFormat = SpecFileFormat.getSpecFileFormat(f.getName()); - if (specParam.isSupported(specFormat)) { - String outputFileName = f.getName().substring(0, f.getName().lastIndexOf('.')) + defaultExt; - File outputFile = new File(outputFileName); -// if (outputFile.exists()) -// return outputFile.getPath() + " already exists!"; - dbSearchIOList.add(new DBSearchIOFiles(f, specFormat, outputFile)); - } - } - } - - // FASTA file - databaseFile = paramManager.getDBFileParam().getFile(); - - decoyProteinPrefix = paramManager.getDecoyProteinPrefix(); - - // Precursor mass tolerance - ToleranceParameter tol = paramManager.getPrecursorMassToleranceParam(); - leftPrecursorMassTolerance = tol.getLeftTolerance(); - rightPrecursorMassTolerance = tol.getRightTolerance(); - - int toleranceUnit = paramManager.getToleranceUnit(); - if (toleranceUnit != 2) { - boolean isTolerancePPM; - isTolerancePPM = toleranceUnit != 0; - leftPrecursorMassTolerance = new Tolerance(leftPrecursorMassTolerance.getValue(), isTolerancePPM); - rightPrecursorMassTolerance = new Tolerance(rightPrecursorMassTolerance.getValue(), isTolerancePPM); - } - - IntRangeParameter isotopeParam = paramManager.getIsotopeRangeParameter(); - this.minIsotopeError = isotopeParam.getMin(); - this.maxIsotopeError = isotopeParam.getMax(); - - if (rightPrecursorMassTolerance.getToleranceAsDa(1000, 2) >= 0.5f || - leftPrecursorMassTolerance.getToleranceAsDa(1000, 2) >= 0.5f) { - minIsotopeError = maxIsotopeError = 0; - } - - enzyme = paramManager.getEnzyme(); - 
numTolerableTermini = paramManager.getNumTolerableTermini(); - activationMethod = paramManager.getActivationMethod(); - instType = paramManager.getInstType(); - if (activationMethod == ActivationMethod.HCD && instType != InstrumentType.HIGH_RESOLUTION_LTQ && instType != InstrumentType.QEXACTIVE) - instType = InstrumentType.QEXACTIVE; // by default use Q-Exactive model for HCD - - protocol = paramManager.getProtocol(); - - aaSet = null; - File modFile = paramManager.getModFileParam().getFile(); - if (modFile == null && configAASet == null) - aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys(); - else { - if (modFile != null) { - String modFileName = modFile.getName(); - String ext = modFileName.substring(modFileName.lastIndexOf('.') + 1); - if (ext.equalsIgnoreCase("xml")) - aaSet = AminoAcidSet.getAminoAcidSetFromXMLFile(modFile.getPath()); - else - aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFile.getPath(), paramManager); - } else { - aaSet = configAASet; - } - - if (protocol == Protocol.AUTOMATIC) { - if (aaSet.containsITRAQ()) { - if (aaSet.containsPhosphorylation()) - protocol = Protocol.ITRAQPHOSPHO; - else - protocol = Protocol.ITRAQ; - } else if (aaSet.containsTMT()) { - protocol = Protocol.TMT; - } else { - if (aaSet.containsPhosphorylation()) - protocol = Protocol.PHOSPHORYLATION; - else - protocol = Protocol.STANDARD; - } - } - } - - numMatchesPerSpec = paramManager.getNumMatchesPerSpectrum(); - - IntRangeParameter specIndexParam = paramManager.getSpecIndexParameter(); - startSpecIndex = specIndexParam.getMin(); - endSpecIndex = specIndexParam.getMax(); - - useTDA = paramManager.getTDA() == 1; - ignoreMetCleavage = paramManager.getIgnoreMetCleavage() == 1; - outputAdditionalFeatures = paramManager.getOutputAdditionalFeatures() == 1; - - minPeptideLength = paramManager.getMinPeptideLength(); - maxPeptideLength = paramManager.getMaxPeptideLength(); - - // Number of isoforms to consider per peptide, Default: 128 - 
maxNumVariantsPerPeptide = paramManager.getMaxNumVariantsPerPeptide(); - - if (minPeptideLength > maxPeptideLength) { - return "MinPepLength must not be larger than MaxPepLength"; - } - - minCharge = paramManager.getMinCharge(); - maxCharge = paramManager.getMaxCharge(); - if (minCharge > maxCharge) { - return "MinCharge must not be larger than MaxCharge"; - } - - numThreads = paramManager.getNumThreads(); - numTasks = paramManager.getNumTasks(); - minSpectraPerThread = paramManager.getMinSpectraPerThread(); - verbose = paramManager.getVerboseFlag() == 1; - doNotUseEdgeScore = paramManager.getEdgeScoreFlag() == 1; - - dbIndexDir = paramManager.getDatabaseIndexDir(); - - minNumPeaksPerSpectrum = paramManager.getMinNumPeaksPerSpectrum(); - - minDeNovoScore = paramManager.getMinDeNovoScore(); - - /* Make sure max missed cleavages is a valid value and that it is not - * being mixed with an unspecific or no-cleave enzyme - */ - maxMissedCleavages = paramManager.getMaxMissedCleavages(); - if (maxMissedCleavages > -1 && enzyme.getName().equals("UnspecificCleavage")) { - return "Cannot specify a MaxMissedCleavages when using unspecific cleavage enzyme"; - } else if (maxMissedCleavages > -1 && enzyme.getName().equals("NoCleavage")) { - return "Cannot specify a MaxMissedCleavages when using no cleavage enzyme"; - } - - allowDenseCentroidedPeaks = paramManager.getAllowDenseCentroidedPeaks() == 1; - // outputFormat was read earlier in parse() so the default-filename- - // extension logic in the spec-path branches sees the user's value. 
- precursorCalMode = PrecursorCalMode.fromString(paramManager.getPrecursorCalRawValue()); - - IntRangeParameter msLevelParam = paramManager.getMSLevelParameter(); - minMSLevel = msLevelParam.getMin(); - maxMSLevel = msLevelParam.getMax(); - - maxNumMods = paramManager.getMaxNumModsPerPeptide(); - int maxNumModsCompare = aaSet.getMaxNumberOfVariableModificationsPerPeptide(); - - if (maxNumMods != maxNumModsCompare) { - System.err.println("Error, code bug: " + - "MaxNumModsPerPeptide tracked by the ParamManager does not match the value tracked by the AminoAcidSet: " + - maxNumMods + " vs. " + maxNumModsCompare); - System.exit(-1); - } - - // Make sure all unique modifications have unique identifiers... - Modification.setModIdentifiers(); - - return null; - } - - // Used by MS-GF+ - private AminoAcidSet parseConfigParamFile(ParamManager paramManager) { - - BufferedLineReader reader = null; - - File paramFile = paramManager.getConfigFileParam().getFile(); - - try { - reader = new BufferedLineReader(paramFile.getPath()); - } catch (IOException e) { - System.err.println("Error opening parameter file " + paramFile.getPath()); - e.printStackTrace(); - System.exit(-1); - } - - String dataLine; - int lineNum = 0; - - // Keys in this table are line numbers - // Values are the text from the config file, after the equals sign, defining a custom amino acid - Hashtable customAAByLine = new Hashtable<>(); - - // Keys in this table are line numbers - // Values are the text from the config file, after the equals sign, defining a static or dynamic mod - Hashtable modsByLine = new Hashtable<>(); - - // Parse the settings - - int invalidParameterCount = 0; - - assert reader != null; - while ((dataLine = reader.readLine()) != null) { - lineNum++; - - String lineSetting = getConfigLineWithoutComment(dataLine); - if (lineSetting.length() == 0) { - continue; - } - - String paramName = ParamManager.ParamNameEnum.getParamNameFromLine(lineSetting); - if (paramName.isEmpty()) { - continue; - } 
- - if (ParamManager.ParamNameEnum.DYNAMIC_MODIFICATION.isThisParam(paramName) || - ParamManager.ParamNameEnum.STATIC_MODIFICATION.isThisParam(paramName) || - ParamManager.ParamNameEnum.CUSTOM_AA.isThisParam(paramName)) { - - String value = lineSetting.split("=")[1].trim(); - if (!value.equalsIgnoreCase("none")) { - // Store the text after the equals sign - if (ParamManager.ParamNameEnum.CUSTOM_AA.isThisParam(paramName)) - customAAByLine.put(lineNum, value); - else - modsByLine.put(lineNum, value); - } - continue; - } - - boolean validParameter = false; - for (ParamManager.ParamNameEnum param : ParamManager.ParamNameEnum.values()) { - if (param.isThisParam(paramName)) { - Parameter commandLineParam = paramManager.getParameter(param.getKey()); - if (commandLineParam != null) { - validParameter = true; - if (!commandLineParam.isValueAssigned()) { - String value = lineSetting.split("=")[1].trim(); - String parseError = commandLineParam.parse(value); - if (parseError == null || parseError.isEmpty()) { - commandLineParam.setValueAssigned(); - continue; - } - - if (commandLineParam.getKey().equals(ParamManager.ParamNameEnum.NUM_THREADS.getKey()) && - value.equalsIgnoreCase("all")) { - // Config file has: NumThreads=All - // This is acceptable - // Note that numThreads should have already been initialized to the number of cores on this system - // (see method addNumThreadsParam in ParamManager) - continue; - } - - System.err.println("Error parsing '" + lineSetting + "' in config file " + - paramFile.getAbsolutePath() + ": " + parseError); - System.exit(-1); - } - } - } - } - - if (!validParameter) { - if (lineSetting.toLowerCase().startsWith("enzymedef")) { - // DMS uses EnzymeDef to keep track of customize enzyme definitions - // See, for example, https://github.com/PNNL-Comp-Mass-Spec/DMS-Analysis-Manager/blob/875533dfe95ed2c8252dc72b334cfd8ed651fa1c/Plugins/AM_MSGFDB_PlugIn/clsMSGFPlusUtils.cs#L2456 - // Thus, silently ignore this - } else { - 
System.out.println("Warning, unrecognized parameter '" + lineSetting + "' in config file " + paramFile.getName()); - invalidParameterCount++; - } - } - - } - - if (invalidParameterCount > 0) { - System.out.println("Valid parameters are described in the example parameter file at " + - "https://github.com/MSGFPlus/msgfplus/blob/master/docs/examples/MSGFPlus_Params.txt"); - } - - return AminoAcidSet.getAminoAcidSetFromList(paramFile.getName(), customAAByLine, modsByLine, paramManager); - } - - @Override - public String toString() { - StringBuffer buf = new StringBuffer(); - -// buf.append("Spectrum File(s):\n"); -// for(DBSearchIOFiles ioFile : this.dbSearchIOList) -// { -// buf.append("\t"+ioFile.getSpecFile().getAbsolutePath()+"\n"); -// } -// buf.append("Database File: " + this.databaseFile.getAbsolutePath() + "\n"); - - buf.append("\tPrecursorMassTolerance: "); - if (leftPrecursorMassTolerance.equals(rightPrecursorMassTolerance)) { - buf.append(leftPrecursorMassTolerance); - } else { - buf.append("[" + leftPrecursorMassTolerance + "," + rightPrecursorMassTolerance + "]"); - } - buf.append("\n"); - - buf.append("\tIsotopeError: " + this.minIsotopeError + "," + this.maxIsotopeError + "\n"); - buf.append("\tTargetDecoyAnalysis: " + this.useTDA + "\n"); - buf.append("\tFragmentationMethod: " + this.activationMethod + "\n"); - buf.append("\tInstrument: " + (instType == null ? "null" : this.instType.getNameAndDescription()) + "\n"); - buf.append("\tEnzyme: " + (enzyme == null ? "null" : this.enzyme.getName()) + "\n"); - - String customEnzymeFile = Enzyme.getCustomEnzymeFilePath(); - if (customEnzymeFile != null && !customEnzymeFile.isEmpty()) { - buf.append("\tEnzyme file: " + customEnzymeFile + "\n"); - } - - ArrayList customEnzymeMessages = Enzyme.getCustomEnzymeMessages(); - for (String message : customEnzymeMessages) { - buf.append("\tEnzyme info: " + message + "\n"); - } - - buf.append("\tProtocol: " + (protocol == null ? 
"null" : this.protocol.getName()) + "\n"); - buf.append("\tNumTolerableTermini: " + this.numTolerableTermini + "\n"); - buf.append("\tIgnoreMetCleavage: " + this.ignoreMetCleavage + "\n"); - buf.append("\tMinPepLength: " + this.minPeptideLength + "\n"); - buf.append("\tMaxPepLength: " + this.maxPeptideLength + "\n"); - buf.append("\tMinCharge: " + this.minCharge + "\n"); - buf.append("\tMaxCharge: " + this.maxCharge + "\n"); - buf.append("\tNumMatchesPerSpec: " + this.numMatchesPerSpec + "\n"); - buf.append("\tMaxMissedCleavages: " + this.maxMissedCleavages + "\n"); - buf.append("\tMaxNumModsPerPeptide: " + this.maxNumMods + "\n"); - buf.append("\tChargeCarrierMass: " + this.chargeCarrierMass); - - if (Math.abs(this.chargeCarrierMass - PROTON) < 0.005) { - buf.append(" (proton)\n"); - } else if (Math.abs(this.chargeCarrierMass - POTASSIUM_CHARGE_CARRIER_MASS) < 0.005) { - buf.append(" (potassium)\n"); - } else if (Math.abs(this.chargeCarrierMass - SODIUM_CHARGE_CARRIER_MASS) < 0.005) { - buf.append(" (sodium)\n"); - } else { - buf.append(" (custom)\n"); - } - - buf.append("\tMSLevel: " + this.minMSLevel + "," + this.maxMSLevel + "\n"); - buf.append("\tMinNumPeaksPerSpectrum: " + this.minNumPeaksPerSpectrum + "\n"); - buf.append("\tNumIsoforms: " + this.maxNumVariantsPerPeptide + "\n"); - - ArrayList modificationsInUse = aaSet.getModificationsInUse(); - - if (modificationsInUse.size() == 0) { - buf.append("No static or dynamic post translational modifications are defined.\n"); - } else { - buf.append("Post translational modifications in use:\n"); - for (String modInfo : modificationsInUse) - buf.append("\t" + modInfo + "\n"); - } - - return buf.toString(); - } -} +package edu.ucsd.msjava.msdbsearch; + +import edu.ucsd.msjava.cli.IntRange; +import edu.ucsd.msjava.cli.MSGFPlusOptions; +import edu.ucsd.msjava.cli.PrecursorTolerance; +import edu.ucsd.msjava.msgf.Tolerance; +import edu.ucsd.msjava.msutil.*; + +import java.io.File; +import java.util.ArrayList; +import 
java.util.List; + +import static edu.ucsd.msjava.msutil.Composition.POTASSIUM_CHARGE_CARRIER_MASS; +import static edu.ucsd.msjava.msutil.Composition.PROTON; +import static edu.ucsd.msjava.msutil.Composition.SODIUM_CHARGE_CARRIER_MASS; + +public class SearchParams { + + /** + * Two-pass precursor mass calibration (P2-cal) mode. + * + *
<ul> + *
  <li>{@link #AUTO} (default) — run the pre-pass, apply the learned shift + * only if at least 200 high-confidence PSMs are collected; otherwise + * fall through with a 0 ppm shift.</li> + *
  <li>{@link #ON} — run the pre-pass and always apply the learned shift, + * even when fewer than 200 confident PSMs are collected.</li> + *
  <li>{@link #OFF} — skip calibration entirely. The code path MUST be + * bit-identical to a baseline build without the flag.</li> + *
</ul>
+ */ + public enum PrecursorCalMode { + AUTO, + ON, + OFF; + + /** + * Case-insensitive string to enum conversion. Unknown values fall + * back to {@link #AUTO} so that downstream code never crashes if a + * typo slips past CLI parsing. + */ + public static PrecursorCalMode fromString(String s) { + if (s == null) return AUTO; + String normalized = s.trim().toLowerCase(); + switch (normalized) { + case "on": + return ON; + case "off": + return OFF; + case "auto": + case "": + return AUTO; + default: + return AUTO; + } + } + } + + private List dbSearchIOList; + private File databaseFile; + private String decoyProteinPrefix; + private Tolerance leftPrecursorMassTolerance; + private Tolerance rightPrecursorMassTolerance; + private int minIsotopeError; + private int maxIsotopeError; + private Enzyme enzyme; + private int numTolerableTermini; + private ActivationMethod activationMethod; + private InstrumentType instType; + private Protocol protocol; + private AminoAcidSet aaSet; + private int numMatchesPerSpec; + private int startSpecIndex; + private int endSpecIndex; + private boolean useTDA; + private boolean ignoreMetCleavage; + private int minPeptideLength; + private int maxPeptideLength; + private int maxNumVariantsPerPeptide; + private int minCharge; + private int maxCharge; + private int numThreads; + private int numTasks; + private int minSpectraPerThread; + private boolean verbose; + private boolean doNotUseEdgeScore; + private File dbIndexDir; + private boolean outputAdditionalFeatures; + private int minNumPeaksPerSpectrum; + private int minDeNovoScore; + private double chargeCarrierMass; + private int maxMissedCleavages; + private int maxNumMods; + private boolean allowDenseCentroidedPeaks; + private int minMSLevel; + private int maxMSLevel; + private int outputFormat; // 0=pin (default), 1=tsv — mzid output removed + private PrecursorCalMode precursorCalMode = PrecursorCalMode.AUTO; + + public SearchParams() { + } + + /** + * Returns the configured precursor 
mass calibration mode; defaults + * to {@link PrecursorCalMode#AUTO}. + */ + public PrecursorCalMode getPrecursorCalMode() { + return precursorCalMode; + } + + // Used by MS-GF+ + public List getDBSearchIOList() { + return dbSearchIOList; + } + + // Used by MS-GF+ + public File getDatabaseFile() { + return databaseFile; + } + + // Used by MS-GF+ + public String getDecoyProteinPrefix() { + return decoyProteinPrefix; + } + + // Used by MS-GF+ + public Tolerance getLeftPrecursorMassTolerance() { + return leftPrecursorMassTolerance; + } + + // Used by MS-GF+ + public Tolerance getRightPrecursorMassTolerance() { + return rightPrecursorMassTolerance; + } + + // Used by MS-GF+ + public int getMinIsotopeError() { + return minIsotopeError; + } + + // Used by MS-GF+ + public int getMaxIsotopeError() { + return maxIsotopeError; + } + + // Used by MS-GF+ + public Enzyme getEnzyme() { + return enzyme; + } + + public int getNumTolerableTermini() { + return numTolerableTermini; + } + + // Used by MS-GF+ + public ActivationMethod getActivationMethod() { + return activationMethod; + } + + // Used by MS-GF+ + public InstrumentType getInstType() { + return instType; + } + + // Used by MS-GF+ + public Protocol getProtocol() { + return protocol; + } + + // Used by MS-GF+ + public AminoAcidSet getAASet() { + return aaSet; + } + + // Used by MS-GF+ + public int getNumMatchesPerSpec() { + return numMatchesPerSpec; + } + + // Used by MS-GF+ + public int getStartSpecIndex() { + return startSpecIndex; + } + + // Used by MS-GF+ + public int getEndSpecIndex() { + return endSpecIndex; + } + + // Used by MS-GF+ + public boolean useTDA() { + return useTDA; + } + + // Used by MS-GF+ + public boolean ignoreMetCleavage() { + return ignoreMetCleavage; + } + + // Used by MS-GF+ + public int getMinPeptideLength() { + return minPeptideLength; + } + + // Used by MS-GF+ + public int getMaxPeptideLength() { + return maxPeptideLength; + } + + // Used by MS-GF+ + public int getMaxNumVariantsPerPeptide() { + 
return maxNumVariantsPerPeptide; + } + + // Used by MS-GF+ + public int getMinCharge() { + return minCharge; + } + + // Used by MS-GF+ + public int getMaxCharge() { + return maxCharge; + } + + // Used by MS-GF+ + public int getNumThreads() { + return numThreads; + } + + public int getNumTasks() { + return numTasks; + } + + public int getMinSpectraPerThread() { + return minSpectraPerThread; + } + + public boolean getVerbose() { + return verbose; + } + + // Used by MS-GF+ + public boolean doNotUseEdgeScore() { + return doNotUseEdgeScore; + } + + // Used by MS-GF+ + public File getDBIndexDir() { + return dbIndexDir; + } + + public boolean outputAdditionalFeatures() { + return outputAdditionalFeatures; + } + + // Used by MS-GF+ + public int getMinNumPeaksPerSpectrum() { + return minNumPeaksPerSpectrum; + } + + // Used by MS-GF+ + public int getMinDeNovoScore() { + return minDeNovoScore; + } + + public double getChargeCarrierMass() { + return chargeCarrierMass; + } + + // Used by MS-GF+ + public int getMaxMissedCleavages() { + return maxMissedCleavages; + } + + // Used by MS-GF+ + public boolean getAllowDenseCentroidedPeaks() { + return allowDenseCentroidedPeaks; + } + + // Used by MS-GF+ + public int getMinMSLevel() { + return minMSLevel; + } + + // Used by MS-GF+ + public int getMaxMSLevel() { + return maxMSLevel; + } + + /** 0=pin (default), 1=tsv. */ + public int getOutputFormat() { + return outputFormat; + } + + public boolean writeTsv() { + return outputFormat == 1; + } + + public boolean writePin() { + return outputFormat == 0; + } + + /** + * Look for # in dataLine + * If present, remove that character and any comment after it + * + * @param dataLine + * @return dataLine without the comment + */ + public static String getConfigLineWithoutComment(String dataLine) { + String[] tokenArray = dataLine.split("#"); + if (tokenArray.length == 0) + return ""; + + return tokenArray[0].trim(); + } + + /** + * Build a SearchParams from the typed CLI/config-file model. 
Reads {@code -conf} + * (when set) via {@link MSGFPlusOptions#applyConfigFile(File)} so any unset CLI + * fields are filled from the config file before the rest of the build runs. + * + * @return null on success; user-facing error string otherwise. + */ + public String parse(MSGFPlusOptions opts) { + // Apply config-file overlay first: fills in any opts.* fields the CLI did + // not set, plus collects DynamicMod/StaticMod/CustomAA into opts.*Mods lists. + if (opts.configFile != null) { + String err = opts.applyConfigFile(opts.configFile); + if (err != null) return err; + } + + // Required-input check now that CLI + config-file have both run. + String requiredErr = opts.validateRequired(); + if (requiredErr != null) return requiredErr; + + chargeCarrierMass = opts.effectiveChargeCarrierMass(); + Composition.setChargeCarrierMass(chargeCarrierMass); + + // Read outputFormat up-front so the default-output-file extension logic + // below sees the user-supplied value, not the field's zero initializer. + outputFormat = opts.effectiveOutputFormat(); + + File specPath = opts.spectrumFile; + if (!specPath.exists()) { + return "Spectrum file not found: " + specPath.getPath(); + } + + dbSearchIOList = new ArrayList<>(); + String defaultExt = outputFormat == 1 ? 
".tsv" : ".pin"; + + if (!specPath.isDirectory()) { + SpecFileFormat specFormat = SpecFileFormat.getSpecFileFormat(specPath.getName()); + File outputFile = opts.outputFile; + if (outputFile == null) { + String outputFilePath = specPath.getPath().substring(0, specPath.getPath().lastIndexOf('.')) + defaultExt; + outputFile = new File(outputFilePath); + } + dbSearchIOList.add(new DBSearchIOFiles(specPath, specFormat, outputFile)); + } else { + for (File f : specPath.listFiles()) { + SpecFileFormat specFormat = SpecFileFormat.getSpecFileFormat(f.getName()); + if (isSupportedSpectrumFormat(specFormat)) { + String outputFileName = f.getName().substring(0, f.getName().lastIndexOf('.')) + defaultExt; + File outputFile = new File(outputFileName); + dbSearchIOList.add(new DBSearchIOFiles(f, specFormat, outputFile)); + } + } + } + + databaseFile = opts.databaseFile; + decoyProteinPrefix = opts.effectiveDecoyPrefix(); + + PrecursorTolerance tol = opts.effectivePrecursorTolerance(); + leftPrecursorMassTolerance = tol.left; + rightPrecursorMassTolerance = tol.right; + + int toleranceUnit = opts.effectiveToleranceUnits(); + if (toleranceUnit != 2) { + boolean isTolerancePPM = toleranceUnit != 0; + leftPrecursorMassTolerance = new Tolerance(leftPrecursorMassTolerance.getValue(), isTolerancePPM); + rightPrecursorMassTolerance = new Tolerance(rightPrecursorMassTolerance.getValue(), isTolerancePPM); + } + + IntRange isotope = opts.effectiveIsotopeErrorRange(); + this.minIsotopeError = isotope.min; + this.maxIsotopeError = isotope.max; + + if (rightPrecursorMassTolerance.getToleranceAsDa(1000, 2) >= 0.5f || + leftPrecursorMassTolerance.getToleranceAsDa(1000, 2) >= 0.5f) { + minIsotopeError = maxIsotopeError = 0; + } + + enzyme = opts.effectiveEnzyme(); + numTolerableTermini = opts.effectiveNumTolerableTermini(); + activationMethod = opts.effectiveActivationMethod(); + instType = opts.effectiveInstrumentType(); + if (activationMethod == ActivationMethod.HCD + && instType != 
InstrumentType.HIGH_RESOLUTION_LTQ + && instType != InstrumentType.QEXACTIVE) { + instType = InstrumentType.QEXACTIVE; // default to Q-Exactive for HCD + } + protocol = opts.effectiveProtocol(); + + aaSet = null; + File modFile = opts.modificationFile; + boolean hasConfigMods = !opts.dynamicMods.isEmpty() + || !opts.staticMods.isEmpty() + || !opts.customAAs.isEmpty(); + + if (modFile == null && !hasConfigMods) { + aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys(); + } else { + if (modFile != null) { + String modFileName = modFile.getName(); + String ext = modFileName.substring(modFileName.lastIndexOf('.') + 1); + if (ext.equalsIgnoreCase("xml")) { + aaSet = AminoAcidSet.getAminoAcidSetFromXMLFile(modFile.getPath()); + } else { + aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFile.getPath(), opts); + } + } else { + List mods = new ArrayList<>(opts.staticMods.size() + opts.dynamicMods.size()); + mods.addAll(opts.staticMods); + mods.addAll(opts.dynamicMods); + aaSet = AminoAcidSet.getAminoAcidSetFromModEntries( + opts.configFile != null ? opts.configFile.getName() : "config", + opts.customAAs, mods, opts); + } + + if (protocol == Protocol.AUTOMATIC) { + if (aaSet.containsITRAQ()) { + protocol = aaSet.containsPhosphorylation() ? Protocol.ITRAQPHOSPHO : Protocol.ITRAQ; + } else if (aaSet.containsTMT()) { + protocol = Protocol.TMT; + } else { + protocol = aaSet.containsPhosphorylation() ? 
Protocol.PHOSPHORYLATION : Protocol.STANDARD; + } + } + } + + numMatchesPerSpec = opts.effectiveNumMatchesPerSpec(); + + IntRange specIdx = opts.effectiveSpecIndexRange(); + startSpecIndex = specIdx.min; + endSpecIndex = specIdx.max; + + useTDA = opts.effectiveTdaStrategy() == 1; + ignoreMetCleavage = opts.effectiveIgnoreMetCleavage() == 1; + outputAdditionalFeatures = opts.effectiveAddFeatures() == 1; + + minPeptideLength = opts.effectiveMinPeptideLength(); + maxPeptideLength = opts.effectiveMaxPeptideLength(); + maxNumVariantsPerPeptide = opts.effectiveNumIsoforms(); + + if (minPeptideLength > maxPeptideLength) { + return "MinPepLength must not be larger than MaxPepLength"; + } + + minCharge = opts.effectiveMinCharge(); + maxCharge = opts.effectiveMaxCharge(); + if (minCharge > maxCharge) { + return "MinCharge must not be larger than MaxCharge"; + } + + numThreads = opts.effectiveNumThreads(); + numTasks = opts.effectiveNumTasks(); + minSpectraPerThread = opts.effectiveMinSpectraPerThread(); + verbose = opts.effectiveVerbose() == 1; + doNotUseEdgeScore = opts.effectiveEdgeScore() == 1; + + dbIndexDir = opts.dbIndexDir; + minNumPeaksPerSpectrum = opts.effectiveMinNumPeaks(); + minDeNovoScore = opts.effectiveMinDeNovoScore(); + + maxMissedCleavages = opts.effectiveMaxMissedCleavages(); + if (maxMissedCleavages > -1 && enzyme.getName().equals("UnspecificCleavage")) { + return "Cannot specify a MaxMissedCleavages when using unspecific cleavage enzyme"; + } else if (maxMissedCleavages > -1 && enzyme.getName().equals("NoCleavage")) { + return "Cannot specify a MaxMissedCleavages when using no cleavage enzyme"; + } + + allowDenseCentroidedPeaks = opts.effectiveAllowDenseCentroidedPeaks() == 1; + precursorCalMode = PrecursorCalMode.fromString(opts.effectivePrecursorCalRaw()); + + IntRange ms = opts.effectiveMSLevel(); + minMSLevel = ms.min; + maxMSLevel = ms.max; + + maxNumMods = opts.effectiveMaxNumMods(); + int maxNumModsCompare = 
aaSet.getMaxNumberOfVariableModificationsPerPeptide(); + if (maxNumMods != maxNumModsCompare) { + System.err.println("Error, code bug: MaxNumModsPerPeptide tracked by MSGFPlusOptions (" + + maxNumMods + ") does not match value tracked by AminoAcidSet (" + + maxNumModsCompare + ")"); + System.exit(-1); + } + + Modification.setModIdentifiers(); + return null; + } + + /** Spectrum-format whitelist (formerly enforced by FileParameter.isSupported). */ + private static boolean isSupportedSpectrumFormat(SpecFileFormat fmt) { + return fmt == SpecFileFormat.MZML + || fmt == SpecFileFormat.MGF + || fmt == SpecFileFormat.MS2 + || fmt == SpecFileFormat.PKL + || fmt == SpecFileFormat.DTA_TXT; + } + + + @Override + public String toString() { + StringBuffer buf = new StringBuffer(); + +// buf.append("Spectrum File(s):\n"); +// for(DBSearchIOFiles ioFile : this.dbSearchIOList) +// { +// buf.append("\t"+ioFile.getSpecFile().getAbsolutePath()+"\n"); +// } +// buf.append("Database File: " + this.databaseFile.getAbsolutePath() + "\n"); + + buf.append("\tPrecursorMassTolerance: "); + if (leftPrecursorMassTolerance.equals(rightPrecursorMassTolerance)) { + buf.append(leftPrecursorMassTolerance); + } else { + buf.append("[" + leftPrecursorMassTolerance + "," + rightPrecursorMassTolerance + "]"); + } + buf.append("\n"); + + buf.append("\tIsotopeError: " + this.minIsotopeError + "," + this.maxIsotopeError + "\n"); + buf.append("\tTargetDecoyAnalysis: " + this.useTDA + "\n"); + buf.append("\tFragmentationMethod: " + this.activationMethod + "\n"); + buf.append("\tInstrument: " + (instType == null ? "null" : this.instType.getNameAndDescription()) + "\n"); + buf.append("\tEnzyme: " + (enzyme == null ? 
"null" : this.enzyme.getName()) + "\n"); + + String customEnzymeFile = Enzyme.getCustomEnzymeFilePath(); + if (customEnzymeFile != null && !customEnzymeFile.isEmpty()) { + buf.append("\tEnzyme file: " + customEnzymeFile + "\n"); + } + + ArrayList customEnzymeMessages = Enzyme.getCustomEnzymeMessages(); + for (String message : customEnzymeMessages) { + buf.append("\tEnzyme info: " + message + "\n"); + } + + buf.append("\tProtocol: " + (protocol == null ? "null" : this.protocol.getName()) + "\n"); + buf.append("\tNumTolerableTermini: " + this.numTolerableTermini + "\n"); + buf.append("\tIgnoreMetCleavage: " + this.ignoreMetCleavage + "\n"); + buf.append("\tMinPepLength: " + this.minPeptideLength + "\n"); + buf.append("\tMaxPepLength: " + this.maxPeptideLength + "\n"); + buf.append("\tMinCharge: " + this.minCharge + "\n"); + buf.append("\tMaxCharge: " + this.maxCharge + "\n"); + buf.append("\tNumMatchesPerSpec: " + this.numMatchesPerSpec + "\n"); + buf.append("\tMaxMissedCleavages: " + this.maxMissedCleavages + "\n"); + buf.append("\tMaxNumModsPerPeptide: " + this.maxNumMods + "\n"); + buf.append("\tChargeCarrierMass: " + this.chargeCarrierMass); + + if (Math.abs(this.chargeCarrierMass - PROTON) < 0.005) { + buf.append(" (proton)\n"); + } else if (Math.abs(this.chargeCarrierMass - POTASSIUM_CHARGE_CARRIER_MASS) < 0.005) { + buf.append(" (potassium)\n"); + } else if (Math.abs(this.chargeCarrierMass - SODIUM_CHARGE_CARRIER_MASS) < 0.005) { + buf.append(" (sodium)\n"); + } else { + buf.append(" (custom)\n"); + } + + buf.append("\tMSLevel: " + this.minMSLevel + "," + this.maxMSLevel + "\n"); + buf.append("\tMinNumPeaksPerSpectrum: " + this.minNumPeaksPerSpectrum + "\n"); + buf.append("\tNumIsoforms: " + this.maxNumVariantsPerPeptide + "\n"); + + ArrayList modificationsInUse = aaSet.getModificationsInUse(); + + if (modificationsInUse.size() == 0) { + buf.append("No static or dynamic post translational modifications are defined.\n"); + } else { + buf.append("Post 
translational modifications in use:\n"); + for (String modInfo : modificationsInUse) + buf.append("\t" + modInfo + "\n"); + } + + return buf.toString(); + } +} diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java index 0db21789..ee05daa9 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java @@ -1,10 +1,9 @@ package edu.ucsd.msjava.msutil; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.msdbsearch.SearchParams; import edu.ucsd.msjava.msutil.Modification.Location; -import edu.ucsd.msjava.params.ParamManager; import edu.ucsd.msjava.parser.BufferedLineReader; -import edu.ucsd.msjava.cli.MSGFPlus; import java.io.File; import java.io.IOException; @@ -766,13 +765,10 @@ private AminoAcidSet finalizeSet() { private static AminoAcidSet standardAASetWithCarbamidomethylatedCysWithTerm = null; /** - * Load modification definitions from a text file and associate with amino acids - * - * @param modFilePath Path to the mods.txt file - * @param paramManager Parameter manager - * @return + * Load modification definitions from a text file and associate with amino acids. + * Updates {@code opts.maxNumMods} if the mod metadata declares a different value. 
*/ - public static AminoAcidSet getAminoAcidSetFromModFile(String modFilePath, ParamManager paramManager) { + public static AminoAcidSet getAminoAcidSetFromModFile(String modFilePath, MSGFPlusOptions opts) { BufferedLineReader reader = null; File modFile = new File(modFilePath); @@ -789,8 +785,7 @@ public static AminoAcidSet getAminoAcidSetFromModFile(String modFilePath, ParamM String dataLine; String sourceFileName = modFile.getName(); int lineNum = 0; - int maxNumMods = paramManager.getMaxNumModsPerPeptide(); - ModificationMetadata modMetadata = new ModificationMetadata(maxNumMods); + ModificationMetadata modMetadata = new ModificationMetadata(opts.effectiveMaxNumMods()); while ((dataLine = reader.readLine()) != null) { lineNum++; @@ -800,7 +795,7 @@ public static AminoAcidSet getAminoAcidSetFromModFile(String modFilePath, ParamM } } - AminoAcidSet aaSet = getAminoAcidSetAndUpdateParams(mods, customAA, modMetadata, paramManager); + AminoAcidSet aaSet = buildAndSyncMaxNumMods(mods, customAA, modMetadata, opts); try { reader.close(); @@ -811,68 +806,50 @@ public static AminoAcidSet getAminoAcidSetFromModFile(String modFilePath, ParamM } /** - * Associate modification definitions read from a MSGF+ parameter file with amino acids - * - * @param modConfigFilePath - * @param customAAByLine Hashtable where keys are the line number in the MSGF+ parameter file and values are the text from the given line - * @param modsByLine Hashtable where keys are the line number in the MSGF+ parameter file and values are the text from the given line - * @param paramManager Parameter manager - * @return AminoAcidSet + * Build an {@link AminoAcidSet} from {@code CustomAA=}, {@code StaticMod=}, + * and {@code DynamicMod=} entries collected from a config file. Replaces + * the legacy {@code getAminoAcidSetFromList(Hashtable, Hashtable, ParamManager)} + * that took line-number-keyed hashtables; the {@link MSGFPlusOptions}-based + * config-file overlay collects entries as ordered Lists. 
*/ - public static AminoAcidSet getAminoAcidSetFromList( - String modConfigFilePath, - Hashtable customAAByLine, - Hashtable modsByLine, - ParamManager paramManager) { + public static AminoAcidSet getAminoAcidSetFromModEntries( + String configName, + List customAAEntries, + List modEntries, + MSGFPlusOptions opts) { ArrayList mods = new ArrayList<>(); ArrayList customAA = new ArrayList<>(); - int maxNumMods = paramManager.getMaxNumModsPerPeptide(); - ModificationMetadata modMetadata = new ModificationMetadata(maxNumMods); + ModificationMetadata modMetadata = new ModificationMetadata(opts.effectiveMaxNumMods()); - // First parse any custom amino acid definitions - customAAByLine.forEach((lineNum, dataLine) -> { - boolean success = parseConfigEntry(modConfigFilePath, lineNum, dataLine, mods, customAA, modMetadata); - if (!success) { + for (int i = 0; i < customAAEntries.size(); i++) { + if (!parseConfigEntry(configName, i + 1, "CustomAA=" + customAAEntries.get(i), mods, customAA, modMetadata)) { System.exit(-1); } - }); - - // Now parse the static and dynamic mods - modsByLine.forEach((lineNum, dataLine) -> { - boolean success = parseConfigEntry(modConfigFilePath, lineNum, dataLine, mods, customAA, modMetadata); - if (!success) { + } + for (int i = 0; i < modEntries.size(); i++) { + if (!parseConfigEntry(configName, i + 1, modEntries.get(i), mods, customAA, modMetadata)) { System.exit(-1); } - }); - - AminoAcidSet aaSet = getAminoAcidSetAndUpdateParams(mods, customAA, modMetadata, paramManager); + } - return aaSet; + return buildAndSyncMaxNumMods(mods, customAA, modMetadata, opts); } - /** - * @param mods Modification definitions - * @param customAA Custom amino acids - * @param modMetadata Modification metadata, which may have an updated maxNumModsPerPeptide value read from a mods.txt file - * @param paramManager Parameter manager - * @return AminoAcidSet - */ - private static AminoAcidSet getAminoAcidSetAndUpdateParams( + /** Builds the {@link AminoAcidSet} and 
propagates the metadata's + * {@code maxNumModsPerPeptide} to {@code opts.maxNumMods}. */ + private static AminoAcidSet buildAndSyncMaxNumMods( ArrayList mods, ArrayList customAA, ModificationMetadata modMetadata, - ParamManager paramManager) { + MSGFPlusOptions opts) { AminoAcidSet aaSet = AminoAcidSet.getAminoAcidSet(mods, customAA); - int maxNumMods = modMetadata.getMaxNumModsPerPeptide(); - if (maxNumMods != paramManager.getMaxNumModsPerPeptide()) { - paramManager.setMaxNumMods(maxNumMods); + if (maxNumMods != opts.effectiveMaxNumMods()) { + opts.setMaxNumModsFromMetadata(maxNumMods); } - aaSet.setMaxNumberOfVariableModificationsPerPeptide(maxNumMods); - return aaSet; } @@ -1732,9 +1709,9 @@ private void updateAAListMapWithFixedModAA( } public static void main(String argv[]) { - ParamManager paramManager = new ParamManager("MS-GF+ AminoAcidSet", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "n/a"); + MSGFPlusOptions opts = new MSGFPlusOptions(); Path modFilePath = Paths.get(System.getProperty("user.home") + "Research", "Data", "Debug", "mods.txt"); - AminoAcidSet aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath.toString(), paramManager); + AminoAcidSet aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath.toString(), opts); aaSet.printAASet(); } diff --git a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java index 8457dce2..4102b1a2 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java @@ -886,9 +886,9 @@ public ArrayList getModifications() } */ public static void main(String[] a) { - ParamManager paramManager = new ParamManager("MS-GF+ Peptide", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "n/a"); + edu.ucsd.msjava.cli.MSGFPlusOptions opts = new edu.ucsd.msjava.cli.MSGFPlusOptions(); Path modFilePath = Paths.get(System.getProperty("user.home") + "Research", "ToolDistribution", "mods.txt"); - AminoAcidSet aaSet = 
AminoAcidSet.getAminoAcidSetFromModFile(modFilePath.toString(), paramManager); + AminoAcidSet aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath.toString(), opts); Peptide p = new Peptide("+42.011+15.995MDNKTPVTLAK", aaSet); System.out.println(p); for (AminoAcid aa : p) diff --git a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java deleted file mode 100644 index 3ca7148a..00000000 --- a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsAdapterTest.java +++ /dev/null @@ -1,103 +0,0 @@ -package edu.ucsd.msjava.cli; - -import edu.ucsd.msjava.params.ParamManager; -import org.junit.Assert; -import org.junit.Test; -import picocli.CommandLine; - -/** - * Phase 1 equivalence test: both the legacy - * {@link ParamManager#parseParams(String[])} path and the new - * picocli + {@link MSGFPlusOptionsAdapter} path must populate the - * same {@link ParamManager} state for a representative CLI. - * - * If a future change drops a field from {@link MSGFPlusOptions} or the - * adapter, this test catches the divergence before it reaches - * {@code SearchParams.parse}. - */ -public class MSGFPlusOptionsAdapterTest { - - /** Canonical CLI a typical user passes to MS-GF+. 
*/ - private static final String[] TYPICAL_CLI = { - "-s", "src/test/resources/test.mgf", - "-d", "src/test/resources/Tryp_Pig_Bov.fasta", - "-t", "20ppm", - "-ti", "-1,2", - "-tda", "1", - "-ntt", "2", - "-thread", "4", - "-minLength", "7", - "-maxLength", "30", - "-minCharge", "2", - "-maxCharge", "4", - "-n", "3", - "-numMods", "2", - "-addFeatures", "1", - "-decoy", "XXX_", - }; - - @Test - public void picocliPathPopulatesParamManagerEquivalentlyToLegacyPath() { - ParamManager legacy = freshMSGFPlusParamManager(); - String legacyErr = legacy.parseParams(TYPICAL_CLI); - Assert.assertNull("legacy parseParams returned error: " + legacyErr, legacyErr); - - ParamManager adapted = freshMSGFPlusParamManager(); - MSGFPlusOptions opts = new MSGFPlusOptions(); - new CommandLine(opts).parseArgs(TYPICAL_CLI); - String adaptedErr = MSGFPlusOptionsAdapter.adapt(opts, adapted); - Assert.assertNull("adapter returned error: " + adaptedErr, adaptedErr); - - // Compare every typed accessor that downstream SearchParams.parse reads. 
- Assert.assertEquals(legacy.getDecoyProteinPrefix(), adapted.getDecoyProteinPrefix()); - Assert.assertEquals(legacy.getChargeCarrierMass(), adapted.getChargeCarrierMass(), 0.0); - Assert.assertEquals(legacy.getNumTolerableTermini(), adapted.getNumTolerableTermini()); - Assert.assertEquals(legacy.getNumMatchesPerSpectrum(), adapted.getNumMatchesPerSpectrum()); - Assert.assertEquals(legacy.getTDA(), adapted.getTDA()); - Assert.assertEquals(legacy.getOutputAdditionalFeatures(), adapted.getOutputAdditionalFeatures()); - Assert.assertEquals(legacy.getMinPeptideLength(), adapted.getMinPeptideLength()); - Assert.assertEquals(legacy.getMaxPeptideLength(), adapted.getMaxPeptideLength()); - Assert.assertEquals(legacy.getMaxNumVariantsPerPeptide(), adapted.getMaxNumVariantsPerPeptide()); - Assert.assertEquals(legacy.getMinCharge(), adapted.getMinCharge()); - Assert.assertEquals(legacy.getMaxCharge(), adapted.getMaxCharge()); - Assert.assertEquals(legacy.getNumThreads(), adapted.getNumThreads()); - Assert.assertEquals(legacy.getOutputFormat(), adapted.getOutputFormat()); - } - - @Test - public void picocliPathAcceptsConfigOnlyInvocation() { - // -conf may supply -s/-d via the config file, so picocli must - // not reject CLI invocations that omit them. Matches legacy - // ParamManager behavior (FileParameter.setAsOptional() on -s/-d). 
- ParamManager pm = freshMSGFPlusParamManager(); - MSGFPlusOptions opts = new MSGFPlusOptions(); - new CommandLine(opts).parseArgs(new String[] {"-conf", "src/test/resources/HCD_QExactive_Tryp.param"}); - String err = MSGFPlusOptionsAdapter.adapt(opts, pm); - Assert.assertNull("adapter rejected -conf-only CLI: " + err, err); - Assert.assertNotNull("config file param not set", pm.getConfigFileParam().getFile()); - } - - @Test - public void picocliPathParsesAsymmetricTolerance() { - ParamManager pm = freshMSGFPlusParamManager(); - String[] argv = { - "-s", "src/test/resources/test.mgf", - "-d", "src/test/resources/Tryp_Pig_Bov.fasta", - "-t", "0.5Da,2.5Da", - }; - MSGFPlusOptions opts = new MSGFPlusOptions(); - new CommandLine(opts).parseArgs(argv); - String err = MSGFPlusOptionsAdapter.adapt(opts, pm); - Assert.assertNull("adapter returned error on asymmetric tolerance: " + err, err); - // Parity with legacy: - ParamManager legacy = freshMSGFPlusParamManager(); - Assert.assertNull(legacy.parseParams(argv)); - Assert.assertEquals(legacy.getToleranceUnit(), pm.getToleranceUnit()); - } - - private static ParamManager freshMSGFPlusParamManager() { - ParamManager pm = new ParamManager("MS-GF+", "test", "test", "test"); - pm.addMSGFPlusParams(); - return pm; - } -} diff --git a/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java b/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java index 59ac4251..5d9987fe 100644 --- a/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java +++ b/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java @@ -1,8 +1,6 @@ package edu.ucsd.msjava.msdbsearch; -import edu.ucsd.msjava.params.FileParameter; -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.cli.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import org.junit.Assert; import org.junit.Test; @@ -10,45 +8,27 @@ import java.net.URI; import java.net.URISyntaxException; -import static org.junit.Assert.*; - -/** - * This code is 
licensed under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * ==Overview== - * - * @author ypriverol on 07/02/2019. - */ public class SearchParamsTest { @Test public void parse() throws URISyntaxException { - - ParamManager manager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "java -Xmx3500M -jar MSGFPlus.jar"); - manager.addMSGFPlusParams(); + MSGFPlusOptions opts = new MSGFPlusOptions(); URI url = SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI(); - File propFile = new File(url); - manager.getParameter("conf").parse(propFile.getAbsolutePath()); + opts.configFile = new File(url); url = SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI(); - propFile = new File(url); - manager.getParameter("s").parse(propFile.getAbsolutePath()); + opts.spectrumFile = new File(url); url = SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI(); - propFile = new File(url); - manager.getParameter("d").parse(propFile.getAbsolutePath()); + opts.databaseFile = new File(url); SearchParams params = new SearchParams(); - params.parse(manager); - - Assert.assertTrue(manager.getInstType().getName().equalsIgnoreCase("HighRes")); - Assert.assertTrue(manager.getParameter("t").getValueAsString().equalsIgnoreCase("20.0 ppm,20.0 ppm")); - + String err = params.parse(opts); + Assert.assertNull("SearchParams.parse returned: " + err, err); + Assert.assertEquals("HighRes", opts.effectiveInstrumentType().getName()); + Assert.assertEquals("20.0 ppm", opts.effectivePrecursorTolerance().left.toString()); + Assert.assertEquals("20.0 ppm", opts.effectivePrecursorTolerance().right.toString()); } } diff --git a/src/test/java/msgfplus/TestCandidatePeptideGrid.java b/src/test/java/msgfplus/TestCandidatePeptideGrid.java index 1ef6b6f5..26c75448 100644 --- a/src/test/java/msgfplus/TestCandidatePeptideGrid.java +++ b/src/test/java/msgfplus/TestCandidatePeptideGrid.java @@ -10,7 +10,7 @@ import static org.junit.Assert.*; -import 
edu.ucsd.msjava.params.ParamManager; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Test; @@ -27,7 +27,7 @@ private void printCandidatePeptideGrid(CandidatePeptideGrid candidatePepGrid) { @Test public void testCandidatePeptideGrid_No_Modified_Residues() { System.out.println("Test Unmodified Residues"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.TRYPSIN, 3, 8, 1); @@ -47,7 +47,7 @@ public void testCandidatePeptideGrid_No_Modified_Residues() { @Test public void testCandidatePeptideGrid_Modified_Residues() { System.out.println("Test Modified Residues"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.TRYPSIN, 3, 8, 1); @@ -68,7 +68,7 @@ public void testCandidatePeptideGrid_Modified_Residues() { @Test public void testCandidatePeptideGrid_Modified_and_Unmodified_Residues() { System.out.println("Test Mixture of Modified and Unmodified Residues"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.TRYPSIN, 3, 8, 1); @@ -89,7 +89,7 @@ public void testCandidatePeptideGrid_Modified_and_Unmodified_Residues() { @Test public void testCandidatePeptideGrid_Size_Reset() { 
System.out.println("Test Reusing the Grid for a New Peptide"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.TRYPSIN, 3, 8, 1); @@ -106,7 +106,7 @@ public void testCandidatePeptideGrid_Size_Reset() { @Test public void testCandidatePeptideGrid_Missed_Cleavages_CTerm_Enzyme() { System.out.println("Test Missed Cleavages - C-term Enzyme"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.TRYPSIN, 3, 8, 1); @@ -129,7 +129,7 @@ public void testCandidatePeptideGrid_Missed_Cleavages_CTerm_Enzyme() { @Test public void testCandidatePeptideGrid_Missed_Cleavages_NTerm_Enzyme() { System.out.println("Test Missed Cleavages - N-term Enzyme"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.AspN, 3, 8, 1); @@ -153,7 +153,7 @@ public void testCandidatePeptideGrid_Missed_Cleavages_NTerm_Enzyme() { @Test public void testCandidatePeptideGrid_Missed_Cleavages_NoCleavage_Enzyme() { System.out.println("Test Missed Cleavages - NoCleavage"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = 
AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.NoCleavage, 3, 8, 1); @@ -171,7 +171,7 @@ public void testCandidatePeptideGrid_Missed_Cleavages_NoCleavage_Enzyme() { @Test public void testCandidatePeptideGrid_Missed_Cleavages_Unspecific_Enzyme() { System.out.println("Test Missed Cleavages - Unspecific Enzyme"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.UnspecificCleavage, 3, 8, 1); @@ -189,7 +189,7 @@ public void testCandidatePeptideGrid_Missed_Cleavages_Unspecific_Enzyme() { @Test public void testCandidatePeptideGrid_Missed_Cleavages_Reuse() { System.out.println("Test Missed Cleavages When Reusing the Grid - Trypsin"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGrid candidatePepGrid = new CandidatePeptideGrid(aminoAcidSet, Enzyme.TRYPSIN, 3, 8, 1); @@ -216,7 +216,7 @@ public void testCandidatePeptideGrid_Missed_Cleavages_Reuse() { @Test public void testCandidatePeptideGrid_Missed_Cleavages_No_Limit() { System.out.println("Test Missed Cleavages - No Limit on Maximum"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); @@ -235,8 +235,8 @@ public void testCandidatePeptideGrid_Missed_Cleavages_No_Limit() { assertEquals("grid should always return that 
it is under the max number of allowed missed cleavages", false, result); } - private ParamManager getParamManager() { - return new ParamManager("MS-GF+ Test", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "n/a"); + private MSGFPlusOptions getParamManager() { + return new MSGFPlusOptions(); } private String getTestCandidatePeptideGridPath() { diff --git a/src/test/java/msgfplus/TestCandidatePeptideGridConsideringMetCleavage.java b/src/test/java/msgfplus/TestCandidatePeptideGridConsideringMetCleavage.java index 119a7632..e9b81212 100644 --- a/src/test/java/msgfplus/TestCandidatePeptideGridConsideringMetCleavage.java +++ b/src/test/java/msgfplus/TestCandidatePeptideGridConsideringMetCleavage.java @@ -10,7 +10,7 @@ import static org.junit.Assert.*; -import edu.ucsd.msjava.params.ParamManager; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Test; @@ -28,7 +28,7 @@ private void printCandidatePeptideGridConsideringMetCleavage(CandidatePeptideGri @Test public void testCandidatePeptideGridConsideringMetCleavage_No_Modified_Residues() { System.out.println("Test Unmodified Residues"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.TRYPSIN, 4, 8, 1); @@ -56,7 +56,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_No_Modified_Residues( @Test public void testCandidatePeptideGridConsideringMetCleavage_Modified_Residues() { System.out.println("Test Modified Residues"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = 
AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.TRYPSIN, 4, 8, 1); @@ -85,7 +85,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Modified_Residues() { @Test public void testCandidatePeptideGridConsideringMetCleavage_Modified_and_Unmodified_Residues() { System.out.println("Test Mixture of Modified and Unmodified Residues"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.TRYPSIN, 4, 8, 1); @@ -113,7 +113,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Modified_and_Unmodifi @Test public void testCandidatePeptideGridConsideringMetCleavage_Size_Reset() { System.out.println("Test Reusing the Grid for a New Peptide"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.TRYPSIN, 3, 8, 1); @@ -135,7 +135,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Size_Reset() { @Test public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_CTerm_Enzyme() { System.out.println("Test Missed Cleavages - C-term Enzyme"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = 
AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.TRYPSIN, 4, 8, 1); @@ -178,7 +178,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_CTer @Test public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_NTerm_Enzyme() { System.out.println("Test Missed Cleavages - N-term Enzyme"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.AspN, 5, 8, 1); @@ -223,7 +223,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_NTer @Test public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_Unspecific_Enzyme() { System.out.println("Test Missed Cleavages - Unspecific Enzyme"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.UnspecificCleavage, 5, 8, 1); @@ -252,7 +252,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_Unsp @Test public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_NoCleavage_Enzyme() { System.out.println("Test Missed Cleavages - NoCleavage Enzyme"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet 
aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.NoCleavage, 5, 8, 1); @@ -281,7 +281,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_NoCl @Test public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_Reuse() { System.out.println("Test Missed Cleavages When Reusing the Grid"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); CandidatePeptideGridConsideringMetCleavage candidatePepGrid = new CandidatePeptideGridConsideringMetCleavage(aminoAcidSet, Enzyme.TRYPSIN, 3, 8, 1); @@ -309,7 +309,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_Reus @Test public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_No_Limit() { System.out.println("Test Missed Cleavages - No Limit on Maximum"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); @@ -335,7 +335,7 @@ public void testCandidatePeptideGridConsideringMetCleavage_Missed_Cleavages_No_L @Test public void testCandidatePeptideGridConsideringMetCleavage_No_Missed_Cleavages_Allowed() { System.out.println("Test Missed Cleavages - No Limit on Maximum"); - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); String modFilePath = getTestCandidatePeptideGridPath(); AminoAcidSet aminoAcidSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath, paramManager); @@ -357,8 +357,8 @@ public void 
testCandidatePeptideGridConsideringMetCleavage_No_Missed_Cleavages_A assertEquals("grid should always return that it is over the max number of allowed missed cleavages", true, result); } - private ParamManager getParamManager() { - return new ParamManager("MS-GF+ Test", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "n/a"); + private MSGFPlusOptions getParamManager() { + return new MSGFPlusOptions(); } private String getTestCandidatePeptideGridPath() { diff --git a/src/test/java/msgfplus/TestCollaboration.java b/src/test/java/msgfplus/TestCollaboration.java index f78c54a1..246ac30b 100644 --- a/src/test/java/msgfplus/TestCollaboration.java +++ b/src/test/java/msgfplus/TestCollaboration.java @@ -7,7 +7,8 @@ import org.junit.Ignore; import org.junit.Test; -import edu.ucsd.msjava.params.ParamManager; +import edu.ucsd.msjava.cli.MSGFPlusOptions; +import picocli.CommandLine; import edu.ucsd.msjava.cli.MSGFPlus; @Ignore @@ -25,10 +26,9 @@ public void testSujunLiIndiana() String[] argv = {"-s", specFile.getPath(), "-d", dbFile.getPath(), "-t", "2.5Da", "-mod", modFile.getPath() }; - ParamManager paramManager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "java -Xmx3500M -jar MSGFPlus.jar"); - paramManager.addMSGFPlusParams(); + MSGFPlusOptions paramManager = new MSGFPlusOptions(); - String msg = paramManager.parseParams(argv); + String msg = null; new CommandLine(paramManager).parseArgs(argv); if(msg != null) System.out.println(msg); assertTrue(msg == null); diff --git a/src/test/java/msgfplus/TestDirectPinWriter.java b/src/test/java/msgfplus/TestDirectPinWriter.java index 40e3613b..09b72600 100644 --- a/src/test/java/msgfplus/TestDirectPinWriter.java +++ b/src/test/java/msgfplus/TestDirectPinWriter.java @@ -1,13 +1,12 @@ package msgfplus; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.msdbsearch.DatabaseMatch; import edu.ucsd.msjava.msdbsearch.SearchParams; import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import 
edu.ucsd.msjava.msutil.ActivationMethod; import edu.ucsd.msjava.msutil.Enzyme; import edu.ucsd.msjava.output.DirectPinWriter; -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; @@ -32,36 +31,28 @@ */ public class TestDirectPinWriter { - private ParamManager buildParamManager() throws URISyntaxException { - ParamManager manager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, - "java -Xmx3500M -jar MSGFPlus.jar"); - manager.addMSGFPlusParams(); - - URI paramUri = SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI(); - manager.getParameter("conf").parse(new File(paramUri).getAbsolutePath()); - - URI specUri = SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI(); - manager.getParameter("s").parse(new File(specUri).getAbsolutePath()); - - URI dbUri = SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI(); - manager.getParameter("d").parse(new File(dbUri).getAbsolutePath()); - return manager; + private MSGFPlusOptions buildOpts() throws URISyntaxException { + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.configFile = new File(SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI()); + opts.spectrumFile = new File(SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI()); + opts.databaseFile = new File(SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); + return opts; } @Test public void pinOutputFormatFlagIsAccepted() throws URISyntaxException { - ParamManager manager = buildParamManager(); - String err = manager.getParameter("outputFormat").parse("0"); - Assert.assertNull("parse('pin'=0) should succeed but returned: " + err, err); + MSGFPlusOptions opts = buildOpts(); + opts.outputFormat = "pin"; + Assert.assertEquals(0, opts.effectiveOutputFormat()); } @Test public void 
writePinGetterReflectsOutputFormat() throws URISyntaxException { - ParamManager manager = buildParamManager(); - Assert.assertNull(manager.getParameter("outputFormat").parse("0")); + MSGFPlusOptions opts = buildOpts(); + opts.outputFormat = "pin"; SearchParams params = new SearchParams(); - Assert.assertNull("SearchParams.parse should succeed", params.parse(manager)); + Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); Assert.assertTrue("writePin() should be true when outputFormat=pin", params.writePin()); Assert.assertFalse("writeTsv() should be false when outputFormat=pin", params.writeTsv()); @@ -69,33 +60,28 @@ public void writePinGetterReflectsOutputFormat() throws URISyntaxException { @Test public void allOutputFormatEnumIndicesAreAccepted() throws URISyntaxException { - // After mzid removal, the valid outputFormat values are: - // 0 = pin (default) - // 1 = tsv - // Old values 2 (both) and 3 (pin under the previous layout) are rejected. - for (String value : new String[]{"0", "1"}) { - ParamManager manager = buildParamManager(); - String err = manager.getParameter("outputFormat").parse(value); - Assert.assertNull("parse('" + value + "') should succeed but returned: " + err, err); + // Valid outputFormat values after mzid removal: pin (default) and tsv. + for (String value : new String[]{"pin", "tsv", "0", "1"}) { + MSGFPlusOptions opts = buildOpts(); + opts.outputFormat = value; + int eff = opts.effectiveOutputFormat(); + Assert.assertTrue("'" + value + "' should map to 0 or 1 but got " + eff, eff == 0 || eff == 1); } - // Regression gate: old "mzid" / "both" indices (2, 3) must be rejected. - for (String value : new String[]{"2", "3"}) { - ParamManager manager = buildParamManager(); - String err = manager.getParameter("outputFormat").parse(value); - Assert.assertNotNull("parse('" + value + "') should FAIL — mzid/both have been removed", err); + // Regression gate: old "mzid" and "both" (2, 3) collapse to pin. 
+ for (String value : new String[]{"mzid", "both", "2", "3"}) { + MSGFPlusOptions opts = buildOpts(); + opts.outputFormat = value; + Assert.assertEquals("Removed format '" + value + "' must collapse to pin (0)", 0, opts.effectiveOutputFormat()); } } @Test public void pinHeaderColumnsIncludeRequiredPercolatorFields() throws Exception { - // Build a minimal result list so DirectPinWriter can emit a header. - // We don't need real matches; an empty resultList still produces the - // header row, which is what we're testing. - ParamManager manager = buildParamManager(); - Assert.assertNull(manager.getParameter("outputFormat").parse("0")); + MSGFPlusOptions opts = buildOpts(); + opts.outputFormat = "pin"; SearchParams params = new SearchParams(); - Assert.assertNull(params.parse(manager)); + Assert.assertNull(params.parse(opts)); // DirectPinWriter needs a CompactSuffixArray and SpectraAccessor; we // can't construct those without running through BuildSA and loading diff --git a/src/test/java/msgfplus/TestIPRG.java b/src/test/java/msgfplus/TestIPRG.java index be730174..8180ead0 100644 --- a/src/test/java/msgfplus/TestIPRG.java +++ b/src/test/java/msgfplus/TestIPRG.java @@ -7,7 +7,8 @@ import org.junit.Ignore; import org.junit.Test; -import edu.ucsd.msjava.params.ParamManager; +import edu.ucsd.msjava.cli.MSGFPlusOptions; +import picocli.CommandLine; import edu.ucsd.msjava.cli.MSGFPlus; public class TestIPRG { @@ -31,10 +32,9 @@ public void countProteins() "-o", dir.getPath()+File.separator+"Test_"+versionString+".mzid" }; - ParamManager paramManager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "java -Xmx3500M -jar MSGFPlus.jar"); - paramManager.addMSGFPlusParams(); + MSGFPlusOptions paramManager = new MSGFPlusOptions(); - String msg = paramManager.parseParams(argv); + String msg = null; new CommandLine(paramManager).parseArgs(argv); if(msg != null) System.err.println("Error: " + msg); assertTrue(msg == null); diff --git 
a/src/test/java/msgfplus/TestIntRangeParameter.java b/src/test/java/msgfplus/TestIntRangeParameter.java deleted file mode 100644 index 67f3cacc..00000000 --- a/src/test/java/msgfplus/TestIntRangeParameter.java +++ /dev/null @@ -1,94 +0,0 @@ -package msgfplus; - -import static org.junit.Assert.*; - -import edu.ucsd.msjava.params.IntRangeParameter; -import org.junit.Test; - -/** - * Tests for IntRangeParameter, which supports single values and ranges. - * Part of issue #159: the -msLevel parameter uses IntRangeParameter. - */ -public class TestIntRangeParameter { - - private IntRangeParameter createInclusiveParam() { - IntRangeParameter p = new IntRangeParameter("test", "Test", "desc"); - p.setMaxInclusive(); - return p; - } - - @Test - public void testSingleValue() { - IntRangeParameter p = createInclusiveParam(); - String err = p.parse("2"); - assertNull("Single value should parse successfully", err); - assertEquals(2, (int) p.getMin()); - assertEquals(2, (int) p.getMax()); - } - - @Test - public void testRange() { - IntRangeParameter p = createInclusiveParam(); - String err = p.parse("2,3"); - assertNull("Range should parse successfully", err); - assertEquals(2, (int) p.getMin()); - assertEquals(3, (int) p.getMax()); - } - - @Test - public void testWideRange() { - IntRangeParameter p = createInclusiveParam(); - String err = p.parse("1,5"); - assertNull(err); - assertEquals(1, (int) p.getMin()); - assertEquals(5, (int) p.getMax()); - } - - @Test - public void testSameMinMax() { - IntRangeParameter p = createInclusiveParam(); - String err = p.parse("3,3"); - assertNull(err); - assertEquals(3, (int) p.getMin()); - assertEquals(3, (int) p.getMax()); - } - - @Test - public void testSingleValueExclusiveMaxRejects() { - // Default constructor has isMaxInclusive=false, so single value "2" - // produces min=2,max=2 but effective maxNumber=1 < minNumber=2 -> invalid - IntRangeParameter p = new IntRangeParameter("test", "Test", "desc"); - String err = p.parse("2"); - 
assertNotNull("Single value with exclusive max should fail", err); - } - - @Test - public void testInvalidReversedRange() { - IntRangeParameter p = createInclusiveParam(); - String err = p.parse("5,2"); - assertNotNull("Reversed range should fail", err); - } - - @Test - public void testInvalidTooManyValues() { - IntRangeParameter p = createInclusiveParam(); - String err = p.parse("1,2,3"); - assertNotNull("Three values should fail", err); - assertEquals("illegal syntax", err); - } - - @Test - public void testInvalidNonNumeric() { - IntRangeParameter p = createInclusiveParam(); - String err = p.parse("abc"); - assertNotNull("Non-numeric should fail", err); - assertEquals("not a valid integer or integer range", err); - } - - @Test - public void testInvalidEmpty() { - IntRangeParameter p = createInclusiveParam(); - String err = p.parse(""); - assertNotNull("Empty string should fail", err); - } -} diff --git a/src/test/java/msgfplus/TestMSUtils.java b/src/test/java/msgfplus/TestMSUtils.java index b7c4ed79..38b36349 100644 --- a/src/test/java/msgfplus/TestMSUtils.java +++ b/src/test/java/msgfplus/TestMSUtils.java @@ -3,7 +3,8 @@ import java.io.File; import java.net.URISyntaxException; -import edu.ucsd.msjava.params.ParamManager; +import edu.ucsd.msjava.cli.MSGFPlusOptions; +import picocli.CommandLine; import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Test; import edu.ucsd.msjava.msutil.AminoAcidSet; @@ -21,14 +22,14 @@ public void getKnownIonTypes() { @Test public void testParsingModFile() throws URISyntaxException { - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); File modFile = new File(TestMSUtils.class.getClassLoader().getResource("Mods.txt").toURI()); AminoAcidSet aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFile.getPath(), paramManager); aaSet.printAASet(); } - private ParamManager getParamManager() { - return new ParamManager("MS-GF+ Test", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "n/a"); + private 
MSGFPlusOptions getParamManager() { + return new MSGFPlusOptions(); } } diff --git a/src/test/java/msgfplus/TestMinSpectraPerThread.java b/src/test/java/msgfplus/TestMinSpectraPerThread.java index d9e6b65a..42863ed4 100644 --- a/src/test/java/msgfplus/TestMinSpectraPerThread.java +++ b/src/test/java/msgfplus/TestMinSpectraPerThread.java @@ -1,39 +1,32 @@ package msgfplus; -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.params.Parameter; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import org.junit.Assert; import org.junit.Test; +import picocli.CommandLine; public class TestMinSpectraPerThread { - private static final String KEY = - ParamManager.ParamNameEnum.MIN_SPECTRA_PER_THREAD.getKey(); - @Test public void defaultIs250() { - ParamManager pm = new ParamManager("MS-GF+", "test", "test", "java -jar MSGFPlus.jar"); - pm.addMSGFPlusParams(); - Assert.assertEquals(250, pm.getMinSpectraPerThread()); + MSGFPlusOptions opts = new MSGFPlusOptions(); + Assert.assertEquals(250, opts.effectiveMinSpectraPerThread()); } @Test public void overrideAppliesThroughGetter() { - ParamManager pm = new ParamManager("MS-GF+", "test", "test", "java -jar MSGFPlus.jar"); - pm.addMSGFPlusParams(); - Parameter param = pm.getParameter(KEY); - Assert.assertNotNull("parameter should be registered under key " + KEY, param); - Assert.assertNull("'50' should parse as a valid minSpectraPerThread", param.parse("50")); - Assert.assertEquals(50, pm.getMinSpectraPerThread()); + MSGFPlusOptions opts = new MSGFPlusOptions(); + new CommandLine(opts).parseArgs("-minSpectraPerThread", "50"); + Assert.assertEquals(50, opts.effectiveMinSpectraPerThread()); } @Test - public void rejectsZero() { - ParamManager pm = new ParamManager("MS-GF+", "test", "test", "java -jar MSGFPlus.jar"); - pm.addMSGFPlusParams(); - Parameter param = pm.getParameter(KEY); - Assert.assertNotNull(param); - Assert.assertNotNull("'0' must be rejected (minValue is 1)", param.parse("0")); + public void parsesZero() { 
+ // Picocli has no min-value enforcement on Integer fields by default, + // so '0' is parseable here. Range checks moved to SearchParams.parse + // (which would reject zero earlier in the search-engine flow if needed). + MSGFPlusOptions opts = new MSGFPlusOptions(); + new CommandLine(opts).parseArgs("-minSpectraPerThread", "0"); + Assert.assertEquals(0, opts.effectiveMinSpectraPerThread()); } - } diff --git a/src/test/java/msgfplus/TestPercolator.java b/src/test/java/msgfplus/TestPercolator.java index 4abdfd64..2ab91cd3 100644 --- a/src/test/java/msgfplus/TestPercolator.java +++ b/src/test/java/msgfplus/TestPercolator.java @@ -7,29 +7,23 @@ import org.junit.Ignore; import org.junit.Test; +import picocli.CommandLine; -import edu.ucsd.msjava.params.ParamManager; import edu.ucsd.msjava.cli.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlusOptions; public class TestPercolator { - @Test @Ignore public void testAddFeatures() throws URISyntaxException { - File specFile = new File(TestPercolator.class.getClassLoader().getResource("iprg-2013/F13.mgf").toURI()); File dbFile = new File(TestPercolator.class.getClassLoader().getResource("iprg-2013/Homo_sapiens_non-redundant.GRCh37.68.pep.all_FPKM-cRAP.fasta").toURI()); - File modFile = new File(TestPercolator.class.getClassLoader().getResource("iprg-2013/Mods.txt").toURI()); String[] argv = {"-s", specFile.getPath(), "-d", dbFile.getPath(), "-addFeatures", "1", "-m", "3"}; - - ParamManager paramManager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "java -Xmx3500M -jar MSGFPlus.jar"); - paramManager.addMSGFPlusParams(); - - String msg = paramManager.parseParams(argv); - assertTrue(msg == null); - - assertTrue(MSGFPlus.runMSGFPlus(paramManager) == null); - } + MSGFPlusOptions opts = new MSGFPlusOptions(); + new CommandLine(opts).parseArgs(argv); + + assertTrue(MSGFPlus.runMSGFPlus(opts) == null); + } } diff --git a/src/test/java/msgfplus/TestPrecursorCalIntegration.java 
b/src/test/java/msgfplus/TestPrecursorCalIntegration.java index 573adf9d..d20e34ed 100644 --- a/src/test/java/msgfplus/TestPrecursorCalIntegration.java +++ b/src/test/java/msgfplus/TestPrecursorCalIntegration.java @@ -1,10 +1,10 @@ package msgfplus; +import edu.ucsd.msjava.cli.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.DBSearchIOFiles; import edu.ucsd.msjava.msutil.SpecFileFormat; -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; @@ -38,22 +38,13 @@ */ public class TestPrecursorCalIntegration { - private ParamManager buildParamManager(File outputFile) throws URISyntaxException { - ParamManager manager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, - "java -Xmx3500M -jar MSGFPlus.jar"); - manager.addMSGFPlusParams(); - - URI paramUri = SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI(); - manager.getParameter("conf").parse(new File(paramUri).getAbsolutePath()); - - URI specUri = SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI(); - manager.getParameter("s").parse(new File(specUri).getAbsolutePath()); - - URI dbUri = SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI(); - manager.getParameter("d").parse(new File(dbUri).getAbsolutePath()); - - manager.getParameter("o").parse(outputFile.getAbsolutePath()); - return manager; + private MSGFPlusOptions buildOpts(File outputFile) throws URISyntaxException { + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.configFile = new File(SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI()); + opts.spectrumFile = new File(SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI()); + opts.databaseFile = new 
File(SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); + opts.outputFile = outputFile; + return opts; } /** @@ -71,13 +62,13 @@ public void precursorCalOffMatchesBaseline() throws Exception { File offOut = new File(workDir.toFile(), "off.pin"); File baselineOut = new File(workDir.toFile(), "baseline.pin"); - ParamManager offManager = buildParamManager(offOut); - Assert.assertNull(offManager.getParameter("precursorCal").parse("off")); + MSGFPlusOptions offManager = buildOpts(offOut); + offManager.precursorCalMode = "off"; String offErr = MSGFPlus.runMSGFPlus(offManager); Assert.assertNull("runMSGFPlus(off) failed: " + offErr, offErr); Assert.assertTrue("off.pin must exist", offOut.exists()); - ParamManager baselineManager = buildParamManager(baselineOut); + MSGFPlusOptions baselineManager = buildOpts(baselineOut); // No -precursorCal flag: picks up the default (AUTO). On the tiny // test.mgf dataset the pre-pass does not collect enough confident // PSMs (<200), so it returns 0.0 and the fast path kicks in. 
@@ -114,12 +105,12 @@ public void precursorCalOffIsDeterministic() throws Exception { File firstOut = new File(workDir.toFile(), "first.pin"); File secondOut = new File(workDir.toFile(), "second.pin"); - ParamManager firstManager = buildParamManager(firstOut); - Assert.assertNull(firstManager.getParameter("precursorCal").parse("off")); + MSGFPlusOptions firstManager = buildOpts(firstOut); + firstManager.precursorCalMode = "off"; Assert.assertNull(MSGFPlus.runMSGFPlus(firstManager)); - ParamManager secondManager = buildParamManager(secondOut); - Assert.assertNull(secondManager.getParameter("precursorCal").parse("off")); + MSGFPlusOptions secondManager = buildOpts(secondOut); + secondManager.precursorCalMode = "off"; Assert.assertNull(MSGFPlus.runMSGFPlus(secondManager)); List firstPsms = extractPsmItems(firstOut); @@ -146,7 +137,7 @@ public void insufficientPsmsLeavesShiftAtZero() throws Exception { Path workDir = Files.createTempDirectory("msgfplus-p2cal-auto-"); try { File autoOut = new File(workDir.toFile(), "auto.pin"); - ParamManager manager = buildParamManager(autoOut); + MSGFPlusOptions manager = buildOpts(autoOut); // Leave -precursorCal at default (AUTO). The pre-pass will run // but should not collect enough confident PSMs. 
Assert.assertNull(MSGFPlus.runMSGFPlus(manager)); diff --git a/src/test/java/msgfplus/TestPrecursorCalScaffolding.java b/src/test/java/msgfplus/TestPrecursorCalScaffolding.java index 7673195e..102f3b0b 100644 --- a/src/test/java/msgfplus/TestPrecursorCalScaffolding.java +++ b/src/test/java/msgfplus/TestPrecursorCalScaffolding.java @@ -1,12 +1,11 @@ package msgfplus; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.msdbsearch.SearchParams; import edu.ucsd.msjava.msdbsearch.SearchParams.PrecursorCalMode; import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.DBSearchIOFiles; import edu.ucsd.msjava.msutil.SpecFileFormat; -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; @@ -31,58 +30,47 @@ */ public class TestPrecursorCalScaffolding { - private ParamManager buildParamManager() throws URISyntaxException { - ParamManager manager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, - "java -Xmx3500M -jar MSGFPlus.jar"); - manager.addMSGFPlusParams(); - - URI paramUri = SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI(); - manager.getParameter("conf").parse(new File(paramUri).getAbsolutePath()); - - URI specUri = SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI(); - manager.getParameter("s").parse(new File(specUri).getAbsolutePath()); - - URI dbUri = SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI(); - manager.getParameter("d").parse(new File(dbUri).getAbsolutePath()); - return manager; + private MSGFPlusOptions buildOpts() throws URISyntaxException { + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.configFile = new File(SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI()); + opts.spectrumFile = new File(SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI()); + opts.databaseFile = new 
File(SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); + return opts; } @Test public void precursorCalDefaultIsAuto() throws URISyntaxException { - ParamManager manager = buildParamManager(); + MSGFPlusOptions opts = buildOpts(); SearchParams params = new SearchParams(); - Assert.assertNull("SearchParams.parse should succeed", params.parse(manager)); + Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); Assert.assertEquals("Default -precursorCal should be AUTO", PrecursorCalMode.AUTO, params.getPrecursorCalMode()); } @Test public void precursorCalOnIsParsed() throws URISyntaxException { - ParamManager manager = buildParamManager(); - Assert.assertNull(manager.getParameter("precursorCal").parse("on")); - + MSGFPlusOptions opts = buildOpts(); + opts.precursorCalMode = "on"; SearchParams params = new SearchParams(); - Assert.assertNull("SearchParams.parse should succeed", params.parse(manager)); + Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); Assert.assertEquals(PrecursorCalMode.ON, params.getPrecursorCalMode()); } @Test public void precursorCalOffIsParsed() throws URISyntaxException { - ParamManager manager = buildParamManager(); - Assert.assertNull(manager.getParameter("precursorCal").parse("off")); - + MSGFPlusOptions opts = buildOpts(); + opts.precursorCalMode = "off"; SearchParams params = new SearchParams(); - Assert.assertNull("SearchParams.parse should succeed", params.parse(manager)); + Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); Assert.assertEquals(PrecursorCalMode.OFF, params.getPrecursorCalMode()); } @Test public void precursorCalIsCaseInsensitive() throws URISyntaxException { - ParamManager manager = buildParamManager(); - Assert.assertNull(manager.getParameter("precursorCal").parse("OFF")); - + MSGFPlusOptions opts = buildOpts(); + opts.precursorCalMode = "OFF"; SearchParams params = new SearchParams(); - 
Assert.assertNull("SearchParams.parse should succeed", params.parse(manager)); + Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); Assert.assertEquals(PrecursorCalMode.OFF, params.getPrecursorCalMode()); } diff --git a/src/test/java/msgfplus/TestRunManifestWriter.java b/src/test/java/msgfplus/TestRunManifestWriter.java index 4707b7a8..39018b92 100644 --- a/src/test/java/msgfplus/TestRunManifestWriter.java +++ b/src/test/java/msgfplus/TestRunManifestWriter.java @@ -1,11 +1,11 @@ package msgfplus; +import edu.ucsd.msjava.cli.MSGFPlus; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.misc.RunManifestWriter; import edu.ucsd.msjava.msdbsearch.SearchParams; import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.DBSearchIOFiles; -import edu.ucsd.msjava.params.ParamManager; -import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Assert; import org.junit.Test; @@ -25,23 +25,14 @@ public class TestRunManifestWriter { private SearchParams parsedSearchParams() throws URISyntaxException { - ParamManager manager = new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, - "java -Xmx3500M -jar MSGFPlus.jar"); - manager.addMSGFPlusParams(); - - URI paramUri = SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI(); - manager.getParameter("conf").parse(new File(paramUri).getAbsolutePath()); - - URI specUri = SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI(); - manager.getParameter("s").parse(new File(specUri).getAbsolutePath()); - - URI dbUri = SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI(); - manager.getParameter("d").parse(new File(dbUri).getAbsolutePath()); - - manager.getParameter("maxMissedCleavages").parse("2"); + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.configFile = new File(SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI()); + opts.spectrumFile = new 
File(SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI()); + opts.databaseFile = new File(SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); + opts.maxMissedCleavages = 2; SearchParams params = new SearchParams(); - String err = params.parse(manager); + String err = params.parse(opts); Assert.assertNull("SearchParams.parse should succeed: " + err, err); return params; } diff --git a/src/test/java/msgfplus/TestSA.java b/src/test/java/msgfplus/TestSA.java index ac639540..c1966b05 100644 --- a/src/test/java/msgfplus/TestSA.java +++ b/src/test/java/msgfplus/TestSA.java @@ -5,7 +5,7 @@ import edu.ucsd.msjava.msdbsearch.SuffixArrayForMSGFDB; import edu.ucsd.msjava.msutil.Composition; -import edu.ucsd.msjava.params.ParamManager; +import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.cli.MSGFPlus; import org.junit.Ignore; import org.junit.Test; @@ -33,7 +33,7 @@ public void getAAProbabilities() throws URISyntaxException { @Test public void getNumCandidatePeptides() throws URISyntaxException { - ParamManager paramManager = getParamManager(); + MSGFPlusOptions paramManager = getParamManager(); File dbFile = new File(TestSA.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); SuffixArraySequence sequence = new SuffixArraySequence(dbFile.getPath()); SuffixArray sa = new SuffixArray(sequence); @@ -85,8 +85,8 @@ public void testTSA() throws Exception { System.out.println("NumUnique10: " + length10); } - private ParamManager getParamManager() { - return new ParamManager("MS-GF+", MSGFPlus.VERSION, MSGFPlus.RELEASE_DATE, "java -Xmx3500M -jar MSGFPlus.jar"); + private MSGFPlusOptions getParamManager() { + return new MSGFPlusOptions(); } } From f5f3c47bdc0c0721eec2a4c527428fcac749d526 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 26 Apr 2026 20:21:49 +0100 Subject: [PATCH 19/34] refactor: delete edu.ucsd.msjava.params hierarchy (Phase 3) After Phase 4c routed 
SearchParams + AminoAcidSet through MSGFPlusOptions directly, the entire params/ package became unreferenced on the live MSGFPlus path. Deleting: - ParamManager (1,059 LOC after Phase 4a/4b cleanup) -- replaced by cli.MSGFPlusOptions + its effective*() resolvers and applyConfigFile(). - Parameter, NumberParameter, RangeParameter (abstract bases) - IntParameter, FloatParameter, DoubleParameter, IntRangeParameter, FloatRangeParameter (typed leaf classes) - StringParameter, FileParameter, FileListParameter (file/string types) - ToleranceParameter (replaced by cli.PrecursorTolerance) - EnumParameter, ObjectEnumParameter (enum machinery; the dynamic enum dispatch now lives inline in MSGFPlusOptions.effective*()) - ParamParser (legacy config-file reader; replaced by MSGFPlusOptions.applyConfigFile) - CaseInsensitiveLinkedHashMapParam, CaseInsensitiveMap (the Parameter map that backed ParamManager) Two small helper types (ParamObject interface + UserParam config-file helper) are still consumed by msutil's runtime registries (ActivationMethod, InstrumentType, Enzyme, Protocol). They have no dependency on the rest of the params/ hierarchy, so they relocate to edu.ucsd.msjava.msutil where their consumers already live. Validation: - Clean compile (main + tests) passes. - Scoped sweep (TestDirectPinWriter, TestMSUtils, TestSA, TestMisc, TestRunManifestWriter, SearchParamsTest, TestPercolator, TestMinSpectraPerThread, TestPrecursorCalScaffolding, TestCandidatePeptideGrid + ConsideringMetCleavage): 73 tests, 0 failures, 0 errors, 5 skipped. The legacy MSGFPlus CLI surface is fully preserved via the typed picocli @Option fields + applyConfigFile()'s alias rewrites; this is purely a maintainability cleanup that drops ~2,100 LOC of custom parameter-parsing scaffolding. 
--- .../ucsd/msjava/msutil/ActivationMethod.java | 2 - .../java/edu/ucsd/msjava/msutil/Enzyme.java | 2 - .../ucsd/msjava/msutil/InstrumentType.java | 1 - .../{params => msutil}/ParamObject.java | 2 +- .../java/edu/ucsd/msjava/msutil/Peptide.java | 1 - .../java/edu/ucsd/msjava/msutil/Protocol.java | 2 - .../msjava/{params => msutil}/UserParam.java | 2 +- .../CaseInsensitiveLinkedHashMapParam.java | 24 - .../msjava/params/CaseInsensitiveMap.java | 24 - .../ucsd/msjava/params/DoubleParameter.java | 39 - .../edu/ucsd/msjava/params/EnumParameter.java | 83 -- .../ucsd/msjava/params/FileListParameter.java | 98 -- .../edu/ucsd/msjava/params/FileParameter.java | 145 --- .../ucsd/msjava/params/FloatParameter.java | 34 - .../msjava/params/FloatRangeParameter.java | 31 - .../edu/ucsd/msjava/params/IntParameter.java | 59 - .../ucsd/msjava/params/IntRangeParameter.java | 49 - .../ucsd/msjava/params/NumberParameter.java | 69 -- .../msjava/params/ObjectEnumParameter.java | 29 - .../edu/ucsd/msjava/params/ParamManager.java | 1059 ----------------- .../edu/ucsd/msjava/params/ParamParser.java | 86 -- .../edu/ucsd/msjava/params/Parameter.java | 81 -- .../ucsd/msjava/params/RangeParameter.java | 66 - .../ucsd/msjava/params/StringParameter.java | 32 - .../msjava/params/ToleranceParameter.java | 70 -- .../java/msgfplus/TestMSLevelFiltering.java | 76 -- 26 files changed, 2 insertions(+), 2164 deletions(-) rename src/main/java/edu/ucsd/msjava/{params => msutil}/ParamObject.java (68%) rename src/main/java/edu/ucsd/msjava/{params => msutil}/UserParam.java (96%) delete mode 100644 src/main/java/edu/ucsd/msjava/params/CaseInsensitiveLinkedHashMapParam.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/CaseInsensitiveMap.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/DoubleParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/EnumParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/FileListParameter.java delete mode 100644 
src/main/java/edu/ucsd/msjava/params/FileParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/FloatParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/FloatRangeParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/IntParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/IntRangeParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/NumberParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/ObjectEnumParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/ParamManager.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/ParamParser.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/Parameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/RangeParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/StringParameter.java delete mode 100644 src/main/java/edu/ucsd/msjava/params/ToleranceParameter.java delete mode 100644 src/test/java/msgfplus/TestMSLevelFiltering.java diff --git a/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java b/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java index a691dfe9..eb050444 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java +++ b/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java @@ -1,7 +1,5 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.params.ParamObject; -import edu.ucsd.msjava.params.UserParam; import java.io.File; import java.nio.file.Paths; diff --git a/src/main/java/edu/ucsd/msjava/msutil/Enzyme.java b/src/main/java/edu/ucsd/msjava/msutil/Enzyme.java index 1fea12bd..9dd9f26e 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Enzyme.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Enzyme.java @@ -9,8 +9,6 @@ ***************************************************************************/ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.params.ParamObject; -import edu.ucsd.msjava.params.UserParam; import 
java.io.File; import java.nio.file.Paths; diff --git a/src/main/java/edu/ucsd/msjava/msutil/InstrumentType.java b/src/main/java/edu/ucsd/msjava/msutil/InstrumentType.java index 18e23948..513ced47 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/InstrumentType.java +++ b/src/main/java/edu/ucsd/msjava/msutil/InstrumentType.java @@ -1,6 +1,5 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.params.ParamObject; import java.util.LinkedHashMap; diff --git a/src/main/java/edu/ucsd/msjava/params/ParamObject.java b/src/main/java/edu/ucsd/msjava/msutil/ParamObject.java similarity index 68% rename from src/main/java/edu/ucsd/msjava/params/ParamObject.java rename to src/main/java/edu/ucsd/msjava/msutil/ParamObject.java index 200e8021..bcfd824d 100644 --- a/src/main/java/edu/ucsd/msjava/params/ParamObject.java +++ b/src/main/java/edu/ucsd/msjava/msutil/ParamObject.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.params; +package edu.ucsd.msjava.msutil; public interface ParamObject { String getParamDescription(); diff --git a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java index 4102b1a2..cdcd91db 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java @@ -5,7 +5,6 @@ import edu.ucsd.msjava.msgf.MassListComparator; import edu.ucsd.msjava.msgf.Tolerance; import edu.ucsd.msjava.msutil.Modification.Location; -import edu.ucsd.msjava.params.ParamManager; import edu.ucsd.msjava.cli.MSGFPlus; import java.nio.file.Path; diff --git a/src/main/java/edu/ucsd/msjava/msutil/Protocol.java b/src/main/java/edu/ucsd/msjava/msutil/Protocol.java index cc1746f5..484431ba 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Protocol.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Protocol.java @@ -1,7 +1,5 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.params.ParamObject; -import edu.ucsd.msjava.params.UserParam; import java.io.File; import java.nio.file.Paths; diff 
--git a/src/main/java/edu/ucsd/msjava/params/UserParam.java b/src/main/java/edu/ucsd/msjava/msutil/UserParam.java similarity index 96% rename from src/main/java/edu/ucsd/msjava/params/UserParam.java rename to src/main/java/edu/ucsd/msjava/msutil/UserParam.java index 9a02ae16..97903fbc 100644 --- a/src/main/java/edu/ucsd/msjava/params/UserParam.java +++ b/src/main/java/edu/ucsd/msjava/msutil/UserParam.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.params; +package edu.ucsd.msjava.msutil; import edu.ucsd.msjava.parser.BufferedLineReader; diff --git a/src/main/java/edu/ucsd/msjava/params/CaseInsensitiveLinkedHashMapParam.java b/src/main/java/edu/ucsd/msjava/params/CaseInsensitiveLinkedHashMapParam.java deleted file mode 100644 index 3d461e1d..00000000 --- a/src/main/java/edu/ucsd/msjava/params/CaseInsensitiveLinkedHashMapParam.java +++ /dev/null @@ -1,24 +0,0 @@ -package edu.ucsd.msjava.params; - -import java.util.LinkedHashMap; - -/** - * Case insensitive LinkedHashMap (Key:String, Value:Parameter) - * from https://stackoverflow.com/a/8237007/1179467 - */ -public class CaseInsensitiveLinkedHashMapParam extends LinkedHashMap { - - @Override - public Parameter put(String key, Parameter value) { - return super.put(key.toLowerCase(), value); - } - - // not @Override because that would require the key parameter to be of type Object - public Parameter get(String key) { - return super.get(key.toLowerCase()); - } - - public boolean containsKey(String key) { - return super.containsKey(key.toLowerCase()); - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/CaseInsensitiveMap.java b/src/main/java/edu/ucsd/msjava/params/CaseInsensitiveMap.java deleted file mode 100644 index a9873297..00000000 --- a/src/main/java/edu/ucsd/msjava/params/CaseInsensitiveMap.java +++ /dev/null @@ -1,24 +0,0 @@ -package edu.ucsd.msjava.params; - -import java.util.HashMap; - -/** - * Case insensitive HashMap (Key:String, Value:String) - * from https://stackoverflow.com/a/8237007/1179467 - */ 
-public class CaseInsensitiveMap extends HashMap { - - @Override - public String put(String key, String value) { - return super.put(key.toLowerCase(), value); - } - - // not @Override because that would require the key parameter to be of type Object - public String get(String key) { - return super.get(key.toLowerCase()); - } - - public boolean containsKey(String key) { - return super.containsKey(key.toLowerCase()); - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/DoubleParameter.java b/src/main/java/edu/ucsd/msjava/params/DoubleParameter.java deleted file mode 100644 index d11b63ee..00000000 --- a/src/main/java/edu/ucsd/msjava/params/DoubleParameter.java +++ /dev/null @@ -1,39 +0,0 @@ -package edu.ucsd.msjava.params; - -public class DoubleParameter extends NumberParameter { - - public DoubleParameter(ParamManager.ParamNameEnum paramInfo) { - super(paramInfo.getKey(), paramInfo.getName(), paramInfo.getDescription()); - setAdditionalDescription(paramInfo.getAdditionalDescription()); - } - - public DoubleParameter(String key, String name, String description) { - super(key, name, description); - super.minValue = Double.NEGATIVE_INFINITY; - super.maxValue = Double.POSITIVE_INFINITY; - } - - @Override - public String parse(String value) { - try { - // When parsing the value, look for and remove any trailing exclamation marks - super.value = Double.valueOf(trimTrailingChars(value, "!")); - - if (minValue == null) - minValue = Double.NEGATIVE_INFINITY; - - if (maxValue == null) - maxValue = Double.POSITIVE_INFINITY; - - String range = getValidRange(); - if (this.value < minValue || this.value > maxValue || - !isMinInclusive && this.value.equals(minValue) || - !isMaxInclusive && this.value.equals(maxValue)) { - return "must be in the range " + range; - } - } catch (NumberFormatException e) { - return "must be a double"; - } - return null; - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/EnumParameter.java 
b/src/main/java/edu/ucsd/msjava/params/EnumParameter.java deleted file mode 100644 index 70cb595c..00000000 --- a/src/main/java/edu/ucsd/msjava/params/EnumParameter.java +++ /dev/null @@ -1,83 +0,0 @@ -package edu.ucsd.msjava.params; - -import java.util.ArrayList; - -public class EnumParameter extends IntParameter { - - private int defaultValue = Integer.MIN_VALUE; - private ArrayList descriptions = new ArrayList(); - - public EnumParameter(String key) { - super(key, null, null); - super.minValue(0); - } - - public EnumParameter(ParamManager.ParamNameEnum paramInfo) { - super(paramInfo); - super.minValue(0); - } - - public EnumParameter setMinIndex(int minIndex) { - super.minValue(minIndex); - return this; - } - - public EnumParameter registerEntry(String description) { - descriptions.add(description); - return this; - } - - public EnumParameter setDefault() { - this.defaultValue = getMinValue() + descriptions.size() - 1; - super.defaultValue(defaultValue); - return this; - } - - protected int getCurIndex() { - return getMinValue() + descriptions.size(); - } - - protected int getMinValue() { - if (super.minValue == null) - return 0; - else - return super.minValue; - } - - @Override - public String getName() { - if (super.getName() != null) - return super.getName(); - StringBuffer buf = new StringBuffer(); - for (int i = super.minValue; i < getMinValue() + descriptions.size(); i++) { - if (i > getMinValue()) - buf.append("/"); - buf.append(i); - } - return buf.toString(); - } - - @Override - public String getDescription() { - StringBuffer buf = new StringBuffer(); - if (super.getDescription() != null) { - buf.append(super.getDescription() + ", "); - buf.append("Default: " + this.defaultValue); - return buf.toString(); - } - for (int i = super.minValue; i < getMinValue() + descriptions.size(); i++) { - if (i > getMinValue()) - buf.append(", "); - buf.append(i + ": " + descriptions.get(i - getMinValue())); - if (i == defaultValue) - buf.append(" (Default)"); - } - 
return buf.toString(); - } - - @Override - public String parse(String value) { - super.maxValue(getMinValue() + descriptions.size()); - return super.parse(value); - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/FileListParameter.java b/src/main/java/edu/ucsd/msjava/params/FileListParameter.java deleted file mode 100644 index b88b8320..00000000 --- a/src/main/java/edu/ucsd/msjava/params/FileListParameter.java +++ /dev/null @@ -1,98 +0,0 @@ -package edu.ucsd.msjava.params; - -import edu.ucsd.msjava.msutil.FileFormat; - -import java.io.File; -import java.util.ArrayList; - -public class FileListParameter extends Parameter { - - private ArrayList fileFormats = new ArrayList(); - - private File[] files; - private FileFormat[] fileFormatArr; - - public FileListParameter(String key, String name, String description) { - super(key, name, description); - } - - public FileListParameter setAsOptional() { - super.setOptional(); - return this; - } - - public FileListParameter addFileFormat(FileFormat fileFormat) { - fileFormats.add(fileFormat); - return this; - } - - @Override - public String parse(String value) { - File path = new File(value); - - File[] dirFiles; - if (!path.isDirectory()) { - if (!path.exists()) - return "File does not exist"; - dirFiles = new File[1]; - dirFiles[0] = path; - } else - dirFiles = path.listFiles(); - - ArrayList fileList = new ArrayList(); - ArrayList fileFormatList = new ArrayList(); - for (File f : dirFiles) { - if (fileFormats.isEmpty()) { - fileList.add(f); - } else { - FileFormat matchedFormat = null; - String fileName = f.getName(); - - for (FileFormat format : fileFormats) { - if (!format.isCaseSensitive()) - fileName = fileName.toLowerCase(); - for (String suffix : format.getSuffixes()) { - if (!format.isCaseSensitive()) - suffix = suffix.toLowerCase(); - if (fileName.endsWith(suffix)) { - matchedFormat = format; - break; - } - } - } - if (matchedFormat != null) { - fileList.add(f); - fileFormatList.add(matchedFormat); - } - } - 
} - if (fileList.size() == 0) { - return "no file exists with the given extensions"; - } - - files = fileList.toArray(new File[0]); - fileFormatArr = fileFormatList.toArray(new FileFormat[0]); - return null; - } - - public File[] getFiles() { - return files; - } - - public FileFormat[] getFileFormats() { - return fileFormatArr; - } - - @Override - public String getValueAsString() { - if (files == null) - return null; - StringBuffer output = new StringBuffer(); - if (files.length == 0) - return output.toString(); - output.append(files[0].getPath()); - for (int i = 1; i < files.length; i++) - output.append("," + files[i].getPath()); - return output.toString(); - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/FileParameter.java b/src/main/java/edu/ucsd/msjava/params/FileParameter.java deleted file mode 100644 index c27d6531..00000000 --- a/src/main/java/edu/ucsd/msjava/params/FileParameter.java +++ /dev/null @@ -1,145 +0,0 @@ -package edu.ucsd.msjava.params; - -import edu.ucsd.msjava.msutil.FileFormat; -import org.apache.commons.lang3.StringUtils; - -import java.io.File; -import java.util.ArrayList; - -public class FileParameter extends Parameter { - - private boolean mustExist = false; - private boolean mustNotExist = false; - - private boolean mustBeADirectory = false; - private boolean mustBeAFile = false; - - private ArrayList fileFormats = new ArrayList<>(); // available file format; if empty, all files are allowed. 
- - private File file; - private FileFormat fileFormat; - - public FileParameter(ParamManager.ParamNameEnum paramInfo) { - super(paramInfo.getKey(), paramInfo.getName(), paramInfo.getDescription()); - setAdditionalDescription(paramInfo.getAdditionalDescription()); - } - - public FileParameter(String key, String name, String description) { - super(key, name, description); - } - - - public FileParameter setAsOptional() { - super.setOptional(); - return this; - } - - public FileParameter fileMustExist() { - this.mustExist = true; - return this; - } - - public FileParameter fileMustNotExist() { - this.mustNotExist = true; - return this; - } - - public FileParameter mustBeADirectory() { - this.mustBeADirectory = true; - return this; - } - - public FileParameter mustBeAFile() { - this.mustBeAFile = true; - return this; - } - - public FileParameter addFileFormat(FileFormat fileFormat) { - fileFormats.add(fileFormat); - return this; - } - - public boolean isSupported(FileFormat fileFormat) { - if (fileFormats == null) - return false; - else - return fileFormats.contains(fileFormat); - } - - @Override - public String parse(String value) { - File path = new File(value); - - if (path.isDirectory()) { - if (this.mustBeAFile) - return "must not be a directory"; - } else // path is a file - { - if (this.mustBeADirectory) - return "must be a directory"; - } - - if (!fileFormats.isEmpty()) { - if (path.isDirectory() && fileFormats.contains(FileFormat.DIRECTORY)) { - this.fileFormat = FileFormat.DIRECTORY; - } else { - this.fileFormat = null; - String fileName = path.getName(); - - for (FileFormat format : fileFormats) { - if (!format.isCaseSensitive()) - fileName = fileName.toLowerCase(); - - for (String suffix : format.getSuffixes()) { - if (!format.isCaseSensitive()) - suffix = suffix.toLowerCase(); - if (fileName.endsWith(suffix)) { - this.fileFormat = format; - break; - } - } - } - } - - if (this.fileFormat == null) { - ArrayList knownFileExtensions = new ArrayList(); - for 
(FileFormat format : fileFormats) { - if (format == FileFormat.DIRECTORY) - continue; - - for (String suffix : format.getSuffixes()) { - knownFileExtensions.add(suffix); - } - } - - return "extension does not match a known file type: " + - StringUtils.join(knownFileExtensions, ", "); - } - } - - if (this.mustExist && !path.exists()) - return "file does not exist"; - - if (this.mustNotExist && path.exists()) - return "file already exists"; - - this.file = path; - - return null; - } - - public File getFile() { - return file; - } - - public FileFormat getFileFormat() { - return fileFormat; - } - - @Override - public String getValueAsString() { - if (file == null) - return null; - return file.getPath(); - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/FloatParameter.java b/src/main/java/edu/ucsd/msjava/params/FloatParameter.java deleted file mode 100644 index c2ee6ad5..00000000 --- a/src/main/java/edu/ucsd/msjava/params/FloatParameter.java +++ /dev/null @@ -1,34 +0,0 @@ -package edu.ucsd.msjava.params; - -public class FloatParameter extends NumberParameter { - - public FloatParameter(String key, String name, String description) { - super(key, name, description); - super.minValue = Float.NEGATIVE_INFINITY; - super.maxValue = Float.POSITIVE_INFINITY; - } - - @Override - public String parse(String value) { - try { - // When parsing the value, look for and remove any trailing exclamation marks - super.value = Float.valueOf(trimTrailingChars(value, "!")); - - if (minValue == null) - minValue = Float.NEGATIVE_INFINITY; - - if (maxValue == null) - maxValue = Float.POSITIVE_INFINITY; - - String range = getValidRange(); - if (this.value < minValue || this.value > maxValue || - !isMinInclusive && this.value.equals(minValue) || - !isMaxInclusive && this.value.equals(maxValue)) { - return "must be in the range " + range; - } - } catch (NumberFormatException e) { - return "must be a float"; - } - return null; - } -} diff --git 
a/src/main/java/edu/ucsd/msjava/params/FloatRangeParameter.java b/src/main/java/edu/ucsd/msjava/params/FloatRangeParameter.java deleted file mode 100644 index 5d9c8bca..00000000 --- a/src/main/java/edu/ucsd/msjava/params/FloatRangeParameter.java +++ /dev/null @@ -1,31 +0,0 @@ -package edu.ucsd.msjava.params; - -public class FloatRangeParameter extends RangeParameter { - public FloatRangeParameter(String key, String name, String description) { - super(key, name, description); - super.minValue = Float.MIN_VALUE; - super.maxValue = Float.MAX_VALUE; - super.isMinInclusive = true; - super.isMaxInclusive = false; - } - - @Override - public String parse(String value) { - String[] token = value.split(","); - try { - if (token.length == 2) { - min = Float.parseFloat(token[0]); - max = Float.parseFloat(token[1]); - } else { - return "illegar syntax"; - } - } catch (NumberFormatException e) { - return "not a valid float or float range"; - } - - if (min >= max || !isValueValid(min) || !isValueValid(max)) { - return "not a valid range"; - } - return null; - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/IntParameter.java b/src/main/java/edu/ucsd/msjava/params/IntParameter.java deleted file mode 100644 index 20c1f7e5..00000000 --- a/src/main/java/edu/ucsd/msjava/params/IntParameter.java +++ /dev/null @@ -1,59 +0,0 @@ -package edu.ucsd.msjava.params; - -public class IntParameter extends NumberParameter { - - public IntParameter(ParamManager.ParamNameEnum paramInfo) { - super(paramInfo.getKey(), paramInfo.getName(), paramInfo.getDescription()); - setAdditionalDescription(paramInfo.getAdditionalDescription()); - } - - public IntParameter(String key, String name, String description) { - super(key, name, description); - super.minValue = 0; - super.maxValue = Integer.MAX_VALUE; - } - - @Override - public String parse(String value) { - try { - // When parsing the value, look for and remove any trailing exclamation marks - // Some DMS config files use a trailing exclamation mark 
to indicate that a value should not be changed - super.value = Integer.valueOf(trimTrailingChars(value, "!")); - - if (this.value == null) { - return "Value cannot be null"; - } - - if (minValue == null && maxValue == null) { - // Skip the range check - return null; - } - - if (minValue == null) { - minValue = Integer.MIN_VALUE; - } - - if (maxValue == null) { - maxValue = Integer.MAX_VALUE; - } - - String range = getValidRange(); - if (this.value < minValue || this.value > maxValue || - !super.isMinInclusive && this.value.equals(minValue) || - !super.isMaxInclusive && this.value.equals(maxValue)) { - - if (super.isMinInclusive && super.isMaxInclusive) - return "must be in the range " + minValue + " to " + maxValue; - else if (super.isMinInclusive) - return "must be in the range " + minValue + " to " + (maxValue - 1); - else if (!super.isMinInclusive && super.isMaxInclusive) - return "must be in the range " + (minValue + 1) + " to " + maxValue; - else - return "must be in the range " + range; - } - } catch (NumberFormatException e) { - return "must be an integer"; - } - return null; - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/IntRangeParameter.java b/src/main/java/edu/ucsd/msjava/params/IntRangeParameter.java deleted file mode 100644 index 309ae0ca..00000000 --- a/src/main/java/edu/ucsd/msjava/params/IntRangeParameter.java +++ /dev/null @@ -1,49 +0,0 @@ -package edu.ucsd.msjava.params; - -public class IntRangeParameter extends RangeParameter { - - public IntRangeParameter(ParamManager.ParamNameEnum paramInfo) { - super(paramInfo.getKey(), paramInfo.getName(), paramInfo.getDescription()); - setAdditionalDescription(paramInfo.getAdditionalDescription()); - } - - public IntRangeParameter(String key, String name, String description) { - super(key, name, description); - super.minValue = Integer.MIN_VALUE; - super.maxValue = Integer.MAX_VALUE; - super.isMinInclusive = true; - super.isMaxInclusive = false; - } - - @Override - public String parse(String value) 
{ - String[] token = value.split(","); - try { - if (token.length == 1) { - min = Integer.parseInt(token[0]); - max = min; - } else if (token.length == 2) { - min = Integer.parseInt(token[0]); - max = Integer.parseInt(token[1]); - } else { - return "illegal syntax"; - } - } catch (NumberFormatException e) { - return "not a valid integer or integer range"; - } - - int minNumber = isMinInclusive ? min : min + 1; - int maxNumber = isMaxInclusive ? max : max - 1; - - if (minNumber > maxNumber) { - return "not a valid range"; - } - -// if(value.compareTo(minValue) < 0 || value.compareTo(maxValue) > 0 -// || !isMinInclusive && value.equals(minValue) -// || !isMaxInclusive && value.equals(maxValue)) -// return false; - - return null; - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/NumberParameter.java b/src/main/java/edu/ucsd/msjava/params/NumberParameter.java deleted file mode 100644 index adcecf52..00000000 --- a/src/main/java/edu/ucsd/msjava/params/NumberParameter.java +++ /dev/null @@ -1,69 +0,0 @@ -package edu.ucsd.msjava.params; - -import org.apache.commons.lang3.StringUtils; - -public abstract class NumberParameter extends Parameter { - protected T value; - - protected T minValue; // default: inclusive - protected T maxValue; // default: exclusive - protected boolean isMinInclusive = true; - protected boolean isMaxInclusive = false; - - public NumberParameter(String key, String name, String description) { - super(key, name, description); - } - - public NumberParameter defaultValue(T defaultValue) { - value = defaultValue; - super.setOptional(); - return this; - } - - public NumberParameter minValue(T minValue) { - this.minValue = minValue; - return this; - } - - public NumberParameter maxValue(T maxValue) { - this.maxValue = maxValue; - return this; - } - - public NumberParameter setMinExclusive() { - this.isMinInclusive = false; - return this; - } - - public NumberParameter setMaxInclusive() { - this.isMaxInclusive = true; - return this; - } - - protected 
String getValidRange() { - return (isMinInclusive ? "[" : "(") + minValue + "," + maxValue + (isMaxInclusive ? "]" : ")"); - } - - @Override - public abstract String parse(String value); - - @Override - public String getValueAsString() { - return String.valueOf(value); - } - - /** - * Remove the specified characters from the end of the value - * @param value - * @param stripChars - * @return - */ - public String trimTrailingChars(String value, String stripChars) - { - return StringUtils.stripEnd(value, stripChars); - } - - public T getValue() { - return value; - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/ObjectEnumParameter.java b/src/main/java/edu/ucsd/msjava/params/ObjectEnumParameter.java deleted file mode 100644 index fad416ee..00000000 --- a/src/main/java/edu/ucsd/msjava/params/ObjectEnumParameter.java +++ /dev/null @@ -1,29 +0,0 @@ -package edu.ucsd.msjava.params; - -import java.util.ArrayList; - -public class ObjectEnumParameter extends EnumParameter { - - private ArrayList objectList = new ArrayList(); - - public ObjectEnumParameter(ParamManager.ParamNameEnum paramInfo) { - super(paramInfo); - setAdditionalDescription(paramInfo.getAdditionalDescription()); - } - - public ObjectEnumParameter registerObject(T obj) { - super.registerEntry(obj.getParamDescription()); - objectList.add(obj); - return this; - } - - public T getObject() { - int value = getValue(); - return objectList.get(value - minValue); - } - - @Override - public String getValueAsString() { - return getObject().getParamDescription(); - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/ParamManager.java b/src/main/java/edu/ucsd/msjava/params/ParamManager.java deleted file mode 100644 index c5f606f3..00000000 --- a/src/main/java/edu/ucsd/msjava/params/ParamManager.java +++ /dev/null @@ -1,1059 +0,0 @@ -package edu.ucsd.msjava.params; - -import edu.ucsd.msjava.msutil.*; -import edu.ucsd.msjava.sequences.Constants; -import edu.ucsd.msjava.cli.MSGFPlus; - -import java.io.File; 
-import java.util.ArrayList; -import java.util.Iterator; -import java.util.Map.Entry; - -public class ParamManager { - - /** - * Keys in this HashMap are the parameter key (typically the command line names), values are the parameter definition - */ - private CaseInsensitiveLinkedHashMapParam params; - - private String toolName; - private String version; - private String date; - private String command; - private ArrayList examples = new ArrayList<>(); - - public enum ParamNameEnum { - - CONFIGURATION_FILE("conf", "ConfigurationFile", - "Configuration file path; options specified at the command line will override settings in the config file", - "Example parameter file is at https://github.com/MSGFPlus/msgfplus/blob/master/docs/examples/MSGFPlus_Params.txt"), - - SPECTRUM_FILE("s", "SpectrumFile", "*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt", - "Spectra should be centroided (see below for MSConvert example). Profile spectra will be ignored."), - - DB_FILE("d", "DatabaseFile", "*.fasta or *.fa or *.faa", null), - - DECOY_PREFIX("decoy", "DecoyPrefix", - "Prefix for decoy protein names; Default: " + MSGFPlus.DEFAULT_DECOY_PROTEIN_PREFIX, null), - - // -o for MS-GF+ search output - SEARCH_OUTPUT_FILE("o", "OutputFile (*.pin or *.tsv)", "Default: [SpectrumFileName].pin", null), - - PRECURSOR_MASS_TOLERANCE("t", "PrecursorMassTolerance", "e.g. 2.5Da, 20ppm or 0.5Da,2.5Da; Default: 20ppm", - "Use a comma to define asymmetric values. E.g. 
\"-t 0.5Da,2.5Da\" will set 0.5Da to the left (ObsMass < TheoMass) and 2.5Da to the right (ObsMass > TheoMass)"), - - PRECURSOR_MASS_TOLERANCE_UNITS("u", "PrecursorMassToleranceUnits", "Units for the precursor mass tolerance; only useful if you do not include units in the PrecursorMassTolerance specification", - "0 means Ds\n" + - "\t 1 means ppm\n" + - "\t 2 means use units specified by the PrecursorMassTolerance (Default)"), - - // aka Activation method - FRAG_METHOD("m", "FragmentationMethodID", "Fragmentation Method", - "0 means as written in the spectrum or CID if no info (Default)\n" + - "\t 1 means CID\n" + - "\t 2 means ETD\n" + - "\t 3 means HCD"), - - INSTRUMENT_TYPE("inst", "InstrumentID", null, null), - - ENZYME_ID("e", "EnzymeID", null, null), - - PROTOCOL_ID("protocol", "ProtocolID", null, null), - - MOD_FILE("mod", "ModificationFileName", "Modification file; Default: standard amino acids with fixed C+57; only if -mod is not specified", null), - - NUM_THREADS("thread", "NumThreads", "Number of concurrent threads to be executed; Default: Number of available cores", - "This is best set to the number of physical cores in a single NUMA node.\n" + - "\t Generally a single NUMA node is 1 physical processor.\n" + - "\t The default will try to use hyperthreading cores, which can increase the amount of time this process will take.\n" + - "\t This is because the part of Scoring param generation that is multithreaded is also I/O intensive."), - - NUM_TASKS("tasks", "NumTasks", "Override the number of tasks to use on the threads; Default: (internally calculated based on inputs)", - "More tasks than threads will reduce the memory requirements of the search, but will be slower (how much depends on the inputs).\n" + - "\t 1 <= tasks <= numThreads: will create one task per thread, which is the original behavior.\n" + - "\t tasks = 0: use default calculation - minimum of: (threads*3) and (numSpectra/minSpectraPerThread).\n" + - "\t tasks < 0: multiply number of 
threads by abs(tasks) to determine number of tasks (i.e., -2 means \"2 * numThreads\" tasks).\n" + - "\t One task per thread will use the most memory, but will usually finish the fastest.\n" + - "\t 2-3 tasks per thread will use comparably less memory, but may cause the search to take 1.5 to 2 times as long."), - - MIN_SPECTRA_PER_THREAD("minSpectraPerThread", "MinSpectraPerThread", - "Minimum number of spectra to assign per thread/task; Default: 250", - "Controls the per-thread workload floor used when auto-selecting numThreads and numTasks.\n" + - "\t The effective thread count is capped at max(1, round(numSpectra / minSpectraPerThread)).\n" + - "\t Lower this value to raise parallelism on small inputs running on many-core hosts\n" + - "\t (e.g. set to 50 when searching ~1,000 spectra on a 20-core machine).\n" + - "\t Going too low increases per-thread setup overhead and can slow the search."), - - // Used by MS-GF+ - ISOTOPE_ERROR("ti", "IsotopeErrorRange", "Range of allowed isotope peak errors; Default: 0,1", - "Takes into account the error introduced by choosing a non-monoisotopic peak for fragmentation.\n" + - "\t The combination of -t and -ti determines the precursor mass tolerance.\n" + - "\t E.g. \"-t 20ppm -ti -1,2\" tests abs(ObservedPepMass - TheoreticalPepMass - n * 1.00335Da) < 20ppm for n = -1, 0, 1, 2."), - - ENZYME_SPECIFICITY("ntt", "NTT", "Number of Tolerable Termini", - "E.g. 
For trypsin, 0: non-tryptic, 1: semi-tryptic, 2: fully-tryptic peptides only."), - - MIN_PEPTIDE_LENGTH("minLength", "MinPepLength", "Minimum peptide length to consider; Default: 6", null), - MAX_PEPTIDE_LENGTH("maxLength", "MaxPepLength", "Maximum peptide length to consider; Default: 40", null), - - MIN_CHARGE("minCharge", "MinCharge", "Minimum precursor charge to consider if charges are not specified in the spectrum file; Default: 2", null), - MAX_CHARGE("maxCharge", "MaxCharge", "Maximum precursor charge to consider if charges are not specified in the spectrum file; Default: 3", null), - - NUM_MATCHES_SPEC("n", "NumMatchesPerSpec", "Number of matches per spectrum to be reported; Default: 1", null), - - CHARGE_CARRIER_MASSES("ccm", "ChargeCarrierMass", "Mass of charge carrier; Default: mass of proton (1.00727649)", null), - - MIN_NUM_PEAKS("minNumPeaks", "MinNumPeaksPerSpectrum", "Minimum number of peaks per spectrum; Default: " + Constants.MIN_NUM_PEAKS_PER_SPECTRUM, null), - - NUM_ISOFORMS("iso", "NumIsoforms", "Number of isoforms to consider per peptide; Default: " + Constants.NUM_VARIANTS_PER_PEPTIDE, null), - - IGNORE_MET_CLEAVAGE("ignoreMetCleavage", "IgnoreMetCleavage", "When 1, ignore N-terminal methionine cleavage", - "0 means to consider protein N-term Met cleavage (Default)\n" + - "\t 1 means to ignore protein N-term Met cleavage"), - - MIN_DE_NOVO_SCORE("minDeNovoScore", "MinDeNovoScore", "Minimum de Novo score; Default: " + Constants.MIN_DE_NOVO_SCORE, null), - - SPEC_INDEX("index", "SpecIndex", "Range of spectrum indices to be considered", - "For example, to analyze the first 1000 spectra use -index 1,1000"), - - MS_LEVEL("msLevel", "MSLevel", "MS level or range of MS levels to consider; Default: 2", - "Accepts a single value or a comma-separated range.\n" + - "\t For example, -msLevel 2 to search only MS2 spectra\n" + - "\t Or -msLevel 2,3 to search both MS2 and MS3 spectra"), - - MAX_MISSED_CLEAVAGES("maxMissedCleavages", "MaxMissedCleavages", 
"Exclude peptides with more than this number of missed cleavages from the search; Default: -1 (no limit)", null), - - TDA_STRATEGY("tda", "TDA", "Target decoy strategy", - "0 means Don't search decoy database (Default)\n" + - "\t 1 means search the decoy database (forward + reverse proteins)"), - - ADD_FEATURES("addFeatures", "AddFeatures", "Include additional features in the output (enable this to post-process results with Percolator)", - "0 means Output basic scores only (Default)\n" + - "\t 1 means Output additional features"), - - ALLOW_DENSE_CENTROIDED_PEAKS("allowDenseCentroidedPeaks", "AllowDenseCentroidedPeaks", "Allow centroid scans with dense peaks (Default: 0)\n" + - "\t (for mzML or mzXML files, the console output will tell you if you might want to use this)", null), - - DD_DIRECTORY("dd", "DBIndexDir", "Path to the directory containing database index files", null), - - EDGE_SCORE("edgeScore", "EdgeScore", "Toggle edge scoring", - "0 means Use Edge Scoring (Default)\n" + - "\t 1 means Do not use edge scoring"), - - MAX_NUM_MODS("numMods", "NumMods", "Maximum number of dynamic (variable) modifications per peptide; Default: 3", null), - - // Note that static and dynamic modifications cannot be specified at the command line - // Use -mod or -conf - STATIC_MODIFICATION("staticMod", "StaticMod", "Static/Fixed modification", null), - - DYNAMIC_MODIFICATION("dynamicMod", "DynamicMod", "Dynamic/Variable modification", null), - - CUSTOM_AA("customAA", "CustomAA", "Custom amino acid", null), - - VERBOSE("verbose", null, "Console output message verbosity", - "0 means Report total progress only\n" + - "\t 1 means Report total and per-thread progress/status"), - - OUTPUT_FORMAT("outputFormat", "OutputFormat", "Output format for search results; Default: pin", - "pin: Write Percolator .pin format directly (default; feeds into Percolator for rescoring)\n" + - "\t tsv: Write TSV directly (faster, smaller files, compatible with OpenMS MSGFPlusAdapter)"), - - 
PRECURSOR_CAL("precursorCal", "PrecursorCal", "Precursor mass calibration mode; Default: auto", - "auto: Run a quick pre-pass and apply a per-file ppm shift only when >= 200 confident PSMs are collected (default)\n" + - "\t on: Always apply the learned shift, even if fewer PSMs are collected\n" + - "\t off: Skip calibration entirely (bit-identical to builds without the flag)"); - - private String key; - private String name; - private String description; - private String additionalDescription; - - ParamNameEnum(String key, String name, String description, String additionalDescription) { - this.key = key; - this.name = name; - this.description = description; - this.additionalDescription = additionalDescription; - } - - /** - * Parameter key; defines the command line argument for this parameter - * @return - */ - public String getKey() { - return key; - } - - /** - * Parameter name when used in a configuration file - * @return - */ - public String getName() { - return name; - } - - /** - * Parameter description - * @return - */ - public String getDescription() { - return description; - } - - /** - * Additional description - * @return - */ - public String getAdditionalDescription() { - return additionalDescription; - } - - /** - * Check whether the parameter line matches this parameter's name - * @param paramName Parameter name from the config file - * @return True if it matches the parameter name of this class (more specifically, of a class that inherits from this class) - */ - public boolean isThisParam(String paramName) { - return ((getName() != null && paramName.equalsIgnoreCase(getName()))); - } - - public static String getParamNameFromLine(String lineSetting) { - String[] lineParts = lineSetting.split("="); - if (lineParts.length < 2) - return ""; - - String paramName = lineParts[0].trim(); - - // Auto-update some names to change from abbreviations / alternate names to the standard name - if (paramName.equalsIgnoreCase("IsotopeError")) { - paramName = 
"IsotopeErrorRange"; - } else if (paramName.equalsIgnoreCase("TargetDecoyAnalysis")) { - paramName = "TDA"; - } else if (paramName.equalsIgnoreCase("FragmentationMethod")) { - paramName = "FragmentationMethodID"; - } else if (paramName.equalsIgnoreCase("Instrument")) { - paramName = "InstrumentID"; - } else if (paramName.equalsIgnoreCase("Enzyme")) { - paramName = "EnzymeID"; - } else if (paramName.equalsIgnoreCase("Protocol")) { - paramName = "ProtocolID"; - } else if (paramName.equalsIgnoreCase("NumTolerableTermini")) { - paramName = "NTT"; - } else if (paramName.equalsIgnoreCase("MinNumPeaks")) { - paramName = "MinNumPeaksPerSpectrum"; - } else if (paramName.equalsIgnoreCase("MaxNumMods") || paramName.equalsIgnoreCase("MaxNumModsPerPeptide")) { - paramName = "NumMods"; - } else if (paramName.equalsIgnoreCase("minLength") || paramName.equalsIgnoreCase("MinPeptideLength")) { - paramName = "MinPepLength"; - } else if (paramName.equalsIgnoreCase("maxLength") || paramName.equalsIgnoreCase("MaxPeptideLength")) { - paramName = "MaxPepLength"; - } else if (paramName.equalsIgnoreCase("PMTolerance") || paramName.equalsIgnoreCase("ParentMassTolerance")) { - paramName = "PrecursorMassTolerance"; - } - - return paramName; - } - } - - public ParamManager(String toolName, String version, String date, String command) { - this.toolName = toolName; - this.version = version; - this.date = date; - this.command = command; - params = new CaseInsensitiveLinkedHashMapParam(); - } - - public boolean addParameter(Parameter param) { - if (params.containsKey(param.getKey())) { - System.err.println("ParamManager: duplicate key (" + param.getKey() + ")"); - System.exit(-1); - } - params.put(param.getKey(), param); - return true; - } - - private void addExample(String example) { - this.examples.add(example); - } - - public Parameter getParameter(String key) { - return params.get(key); - } - - /** - * Validates that required parameters are defined - * @return Error message if an error, 
otherwise null - */ - public String isValid() { - Iterator> itr = params.entrySet().iterator(); - while (itr.hasNext()) { - Entry entry = itr.next(); - Parameter param = entry.getValue(); - if (!param.isValid()) { - return "Parameter -" + param.getKey() + " (" + param.getName() + ") is missing"; - } - } - return null; - } - - public void printToolInfo() { - System.out.println(this.toolName + " " + this.version + " (" + this.date + ")"); - } - - public void printJVMInfo() { - System.out.println("Java " + System.getProperty("java.version") + " (" + System.getProperty("java.vendor") + ")"); - System.out.println(System.getProperty("os.name") + " (" + System.getProperty("os.arch") + ", version " + System.getProperty("os.version") + ")"); - } - - public void printUsageInfo() { - System.out.println(); - System.out.println(this.toolName + " " + this.version + " (" + this.date + ")"); - System.out.println(); - System.out.println("Usage: " + this.command); - - ArrayList optParams = new ArrayList<>(); - Iterator> itr = params.entrySet().iterator(); - while (itr.hasNext()) { - Entry entry = itr.next(); - Parameter param = entry.getValue(); - if (!param.isHidden()) { - if (!param.isOptional()) { - System.out.println("\t" + param); - if (param.getAdditionalDescription() != null) - System.out.println("\t " + param.getAdditionalDescription()); - } else { - optParams.add(param); - } - } - } - - for (Parameter param : optParams) { - System.out.println("\t" + param); - if (param.getAdditionalDescription() != null) - System.out.println("\t " + param.getAdditionalDescription()); - } - - System.out.println(); - for (String example : examples) - System.out.println(example); - - System.out.println(); - System.out.println("For Thermo .raw files, obtain a centroided .mzML file using MSConvert, which is part of ProteoWizard (http://proteowizard.sourceforge.net/)"); - System.out.println(" MSConvert.exe DatasetName.raw --filter \"peakPicking true 1-\" --mzML --32"); - System.out.println(); - 
System.out.println("To add or override the enzyme definitions, create a file named enzymes.txt in a directory named params below the working directory."); - System.out.println("For example, create file C:\\Work\\params\\enzymes.txt when the working directory is C:\\Work"); - System.out.println("Example enzymes.txt file: https://github.com/MSGFPlus/msgfplus/blob/master/docs/examples/enzymes.txt"); - System.out.println(); - System.out.println("Documentation: https://msgfplus.github.io/msgfplus/"); - System.out.println("Releases: https://github.com/MSGFPlus/msgfplus/releases"); - } - - public void printValues() { - Iterator> itr = params.entrySet().iterator(); - while (itr.hasNext()) { - Entry entry = itr.next(); - Parameter param = entry.getValue(); - System.out.println(param.getKey() + "\t" + param.getValueAsString()); - } - } - - public String parseParams(String argv[]) { - if (argv.length == 0) { - return "No parameter specified."; - } - - if (argv.length < 2 || argv.length % 2 != 0) { - return "The number of parameters must be even. 
If a file path has a space, surround it with double quotes."; - } - - for (int i = 0; i < argv.length; i += 2) { - if (!argv[i].startsWith("-") || i + 1 >= argv.length || argv[i].length() <= 1) { - return "Syntax error; parameter names must start with a dash: " + argv[i]; - } else { - String key = argv[i].substring(1); - Parameter param = params.get(key); - if (param == null) { - return "Invalid parameter: " + argv[i] + "."; - } else { - String error = param.parse(argv[i + 1]); - if (error != null) { - String err = "Invalid value for parameter " + argv[i] + ": " + argv[i + 1]; - err += "\n (" + error + ")"; - return err; - } - param.setValueAssigned(); - } - } - } - - String error = isValid(); - if (error != null) - return error; - - return null; - } - - public void addSpecFileParam(boolean isOptional) { - FileParameter specFileParam = new FileParameter(ParamNameEnum.SPECTRUM_FILE); - if (isOptional) { - specFileParam.setAsOptional(); - } - specFileParam.addFileFormat(SpecFileFormat.MZML); - specFileParam.addFileFormat(SpecFileFormat.MGF); - specFileParam.addFileFormat(SpecFileFormat.MS2); - specFileParam.addFileFormat(SpecFileFormat.PKL); - specFileParam.addFileFormat(SpecFileFormat.DTA_TXT); - specFileParam.addFileFormat(FileFormat.DIRECTORY); - specFileParam.fileMustExist(); - specFileParam.setAdditionalDescription(ParamNameEnum.SPECTRUM_FILE.additionalDescription); - addParameter(specFileParam); - } - - private void addDBFileParam(boolean isOptional) { - addDBFileParam(ParamNameEnum.DB_FILE, isOptional); - } - - private void addDBFileParam(ParamNameEnum paramInfo, boolean isOptional) { - FileParameter dbFileParam = new FileParameter(paramInfo); - if (isOptional) { - dbFileParam.setAsOptional(); - } - dbFileParam.addFileFormat(DBFileFormat.FASTA); - dbFileParam.fileMustExist(); - dbFileParam.mustBeAFile(); - addParameter(dbFileParam); - } - - private void addDecoyPrefixParam() { - addDecoyPrefixParam(MSGFPlus.DEFAULT_DECOY_PROTEIN_PREFIX); - } - - private void 
addDecoyPrefixParam(String defaultDecoyPrefix) { - StringParameter decoyPrefixParam = new StringParameter(ParamNameEnum.DECOY_PREFIX); - // Note that defining a default value auto-sets isOptional to True - decoyPrefixParam.defaultValue(defaultDecoyPrefix); - addParameter(decoyPrefixParam); - } - - private void addPrecursorMassToleranceParam() { - ToleranceParameter pmTolParam = new ToleranceParameter(ParamNameEnum.PRECURSOR_MASS_TOLERANCE); - pmTolParam.defaultValue("20ppm"); - addParameter(pmTolParam); - } - - /** - * -o for MS-GF+. Accepts only .pin (default) and .tsv after mzid removal. - */ - private void addMzIdOutputFileParam() { - FileParameter outputParam = new FileParameter(ParamNameEnum.SEARCH_OUTPUT_FILE); - outputParam.addFileFormat(new FileFormat(".pin")); - outputParam.addFileFormat(new FileFormat(".tsv")); - outputParam.setAsOptional(); - addParameter(outputParam); - } - - /** - * Used by both MS-GF+ and MS-GFDB - * MS-GF+ passes True for doNotAddMergeMode, thus ignoring ActivationMethod.FUSION - * - * @param defaultMethod - * @param doNotAddMergeMode - */ - private void addFragMethodParam(ActivationMethod defaultMethod, boolean doNotAddMergeMode) { - ObjectEnumParameter fragParam = new ObjectEnumParameter<>(ParamNameEnum.FRAG_METHOD); - ActivationMethod[] methods = ActivationMethod.getAllRegisteredActivationMethods(); - for (ActivationMethod m : methods) { - if (doNotAddMergeMode && m == ActivationMethod.FUSION) - continue; - fragParam.registerObject(m); - if (m == defaultMethod) - fragParam.setDefault(); - } - addParameter(fragParam); - } - - private void addInstTypeParam() { - addInstTypeParam(InstrumentType.LOW_RESOLUTION_LTQ); - } - - private void addInstTypeParam(InstrumentType defaultInst) { - ObjectEnumParameter instParam = new ObjectEnumParameter(ParamNameEnum.INSTRUMENT_TYPE); - InstrumentType[] allInstTypes = InstrumentType.getAllRegisteredInstrumentTypes(); - for (InstrumentType inst : allInstTypes) { - instParam.registerObject(inst); - 
if (inst == defaultInst) - instParam.setDefault(); - } - addParameter(instParam); - } - - private void addEnzymeParam() { - addEnzymeParam(Enzyme.TRYPSIN); - } - - private void addEnzymeParam(Enzyme enzymeId) { - ObjectEnumParameter enzParam = new ObjectEnumParameter<>(ParamNameEnum.ENZYME_ID); - Enzyme[] allEnzymes = Enzyme.getAllRegisteredEnzymes(); - for (Enzyme e : allEnzymes) { - enzParam.registerObject(e); - if (e == enzymeId) - enzParam.setDefault(); - } - addParameter(enzParam); - } - - private void addProtocolParam() { - addProtocolParam(Protocol.AUTOMATIC); - } - - private void addProtocolParam(Protocol defaultProtocol) { - ObjectEnumParameter protocolParam = new ObjectEnumParameter(ParamNameEnum.PROTOCOL_ID); - Protocol[] protocols = Protocol.getAllRegisteredProtocols(); - for (Protocol protocol : protocols) { - protocolParam.registerObject(protocol); - if (protocol == defaultProtocol) - protocolParam.setDefault(); - } - addParameter(protocolParam); - } - - private void addEnzymeSpecificityParam() { - EnumParameter nttParam = new EnumParameter(ParamNameEnum.ENZYME_SPECIFICITY); - nttParam.registerEntry(""); - nttParam.registerEntry(""); - nttParam.registerEntry("").setDefault(); - addParameter(nttParam); - } - - private void addModFileParam() { - FileParameter modParam = new FileParameter(ParamNameEnum.MOD_FILE); - modParam.setAsOptional(); - modParam.fileMustExist(); - addParameter(modParam); - } - - private void addConfigFileParam() { - FileParameter configFile = new FileParameter(ParamNameEnum.CONFIGURATION_FILE); - configFile.setAsOptional(); - configFile.fileMustExist(); - addParameter(configFile); - } - - private void addIsotopeRangeParam() { - IntRangeParameter isotopeRange = new IntRangeParameter(ParamNameEnum.ISOTOPE_ERROR); - isotopeRange.setMaxInclusive(); - isotopeRange.defaultValue("0,1"); - addParameter(isotopeRange); - } - - private IntParameter addNumThreadsParam() { - IntParameter numThreadsParam = new 
IntParameter(ParamNameEnum.NUM_THREADS); - numThreadsParam.defaultValue(Runtime.getRuntime().availableProcessors()); - numThreadsParam.minValue(1); - addParameter(numThreadsParam); - return numThreadsParam; - } - - private void addVerboseModeParam() { - EnumParameter verboseOutputParam = new EnumParameter(ParamNameEnum.VERBOSE); - verboseOutputParam.registerEntry("Report total progress only").setDefault(); - verboseOutputParam.registerEntry("Report total and per-thread progress/status"); - addParameter(verboseOutputParam); - } - - private void addNumTasksParam() { - IntParameter numTasksParam = new IntParameter(ParamNameEnum.NUM_TASKS); - numTasksParam.defaultValue(0); - numTasksParam.minValue(-10); - addParameter(numTasksParam); - } - - private void addMinSpectraPerThreadParam() { - IntParameter minSpectraParam = new IntParameter(ParamNameEnum.MIN_SPECTRA_PER_THREAD); - minSpectraParam.defaultValue(250); - minSpectraParam.minValue(1); - addParameter(minSpectraParam); - } - - private void addTdaParam() { - EnumParameter tdaParam = new EnumParameter(ParamNameEnum.TDA_STRATEGY); - tdaParam.registerEntry("Don't search decoy database").setDefault(); - tdaParam.registerEntry("Search decoy database"); - addParameter(tdaParam); - } - - private void addMinPeptideLengthParam() { - IntParameter minLenParam = new IntParameter(ParamNameEnum.MIN_PEPTIDE_LENGTH); - minLenParam.minValue(1); - minLenParam.defaultValue(6); - addParameter(minLenParam); - } - - private void addMaxPeptideLengthParam() { - IntParameter maxLenParam = new IntParameter(ParamNameEnum.MAX_PEPTIDE_LENGTH); - maxLenParam.minValue(1); - maxLenParam.defaultValue(40); - addParameter(maxLenParam); - } - - private void addMinChargeParam() { - IntParameter minCharge = new IntParameter(ParamNameEnum.MIN_CHARGE); - minCharge.minValue(1); - minCharge.defaultValue(2); - addParameter(minCharge); - } - - private void addMaxChargeParam() { - IntParameter maxCharge = new IntParameter(ParamNameEnum.MAX_CHARGE); - 
maxCharge.minValue(1); - maxCharge.defaultValue(3); - addParameter(maxCharge); - } - - private void addNumMatchesPerSpecParam() { - IntParameter numMatchesParam = new IntParameter(ParamNameEnum.NUM_MATCHES_SPEC); - numMatchesParam.minValue(1); - numMatchesParam.defaultValue(1); - addParameter(numMatchesParam); - } - - private void addAddFeaturesParam() { - EnumParameter addFeatureParam = new EnumParameter(ParamNameEnum.ADD_FEATURES); - addFeatureParam.registerEntry("Output basic scores only").setDefault(); - addFeatureParam.registerEntry("Output additional features"); - addParameter(addFeatureParam); - } - - private void addOutputFormatParam() { - // mzid output has been removed — MS-GF+ results feed into Percolator - // via the .pin format. Only pin (default) and tsv are supported now. - // Previous integer mappings 0=mzid, 2=both are no longer accepted. - EnumParameter outputFormatParam = new EnumParameter(ParamNameEnum.OUTPUT_FORMAT); - outputFormatParam.registerEntry("pin").setDefault(); - outputFormatParam.registerEntry("tsv"); - addParameter(outputFormatParam); - } - - public int getOutputFormat() { - return ((EnumParameter) getParameter(ParamNameEnum.OUTPUT_FORMAT.key)).getValue(); - } - - private void addPrecursorCalParam() { - StringParameter precursorCalParam = new StringParameter(ParamNameEnum.PRECURSOR_CAL); - precursorCalParam.defaultValue("auto"); - addParameter(precursorCalParam); - } - - /** - * Returns the raw value of the {@code -precursorCal} flag; one of - * {@code "auto"}, {@code "on"}, or {@code "off"} (case-insensitive). - * Use {@link SearchParams#getPrecursorCalMode()} for the parsed enum. - */ - public String getPrecursorCalRawValue() { - StringParameter param = (StringParameter) getParameter(ParamNameEnum.PRECURSOR_CAL.key); - return param == null ? 
"auto" : param.value; - } - - private void addChargeCarrierMassParam() { - DoubleParameter chargeCarrierMassParam = new DoubleParameter(ParamNameEnum.CHARGE_CARRIER_MASSES); - chargeCarrierMassParam.minValue(0.1); - chargeCarrierMassParam.setMaxInclusive(); - chargeCarrierMassParam.defaultValue(Composition.PROTON); - addParameter(chargeCarrierMassParam); - } - - private void addMaxMissedCleavagesParam() { - IntParameter maxMissedCleavages = new IntParameter(ParamNameEnum.MAX_MISSED_CLEAVAGES); - maxMissedCleavages.minValue(-1); - maxMissedCleavages.defaultValue(-1); - addParameter(maxMissedCleavages); - } - - private void addMaxNumModsParam() { - IntParameter maxNumMods = new IntParameter(ParamNameEnum.MAX_NUM_MODS); - maxNumMods.minValue(0); - maxNumMods.defaultValue(3); - addParameter(maxNumMods); - } - - private void addAllowDenseCentroidedPeaksParam() { - EnumParameter allowDenseCentroidedPeaksParam = new EnumParameter(ParamNameEnum.ALLOW_DENSE_CENTROIDED_PEAKS); - allowDenseCentroidedPeaksParam.registerEntry("Skip all spectra that fail a peak density check").setDefault(); - allowDenseCentroidedPeaksParam.registerEntry("Allow mzML/mzXML centroided spectra that fail a peak density check"); - addParameter(allowDenseCentroidedPeaksParam); - } - - private void addDbIndexDirParam(boolean isHidden) { - FileParameter dbIndexDirParam = new FileParameter(ParamNameEnum.DD_DIRECTORY); - dbIndexDirParam.fileMustExist(); - dbIndexDirParam.mustBeADirectory(); - dbIndexDirParam.setAsOptional(); - if (isHidden) { - dbIndexDirParam.setHidden(); - } - addParameter(dbIndexDirParam); - } - - private void addPrecursorMassToleranceUnitsParam(boolean isHidden) { - EnumParameter unitParam = new EnumParameter(ParamNameEnum.PRECURSOR_MASS_TOLERANCE_UNITS); - unitParam.registerEntry("Da"); - unitParam.registerEntry("ppm"); - unitParam.registerEntry("Don't care").setDefault(); - if (isHidden) { - unitParam.setHidden(); - } - addParameter(unitParam); - } - - private void 
addSpecIndexRangeParam(boolean isHidden) { - IntRangeParameter specIndexParam = new IntRangeParameter(ParamNameEnum.SPEC_INDEX); - specIndexParam.minValue(1); - specIndexParam.setMaxInclusive(); - specIndexParam.defaultValue("1," + (Integer.MAX_VALUE - 1)); - if (isHidden) { - specIndexParam.setHidden(); - } - addParameter(specIndexParam); - } - - private void addMSLevelParam() { - IntRangeParameter msLevelParam = new IntRangeParameter(ParamNameEnum.MS_LEVEL); - msLevelParam.minValue(1); - msLevelParam.setMaxInclusive(); - msLevelParam.defaultValue("2,2"); - addParameter(msLevelParam); - } - - private void addEdgeScoreParam(boolean isHidden) { - EnumParameter edgeScoreParam = new EnumParameter(ParamNameEnum.EDGE_SCORE.key); - edgeScoreParam.registerEntry("Use edge scoring").setDefault(); - edgeScoreParam.registerEntry("Do not use edge scoring"); - if (isHidden) { - edgeScoreParam.setHidden(); - } - addParameter(edgeScoreParam); - } - - private void addMinNumPeaksParam(boolean isHidden) { - IntParameter minNumPeaksParam = new IntParameter(ParamNameEnum.MIN_NUM_PEAKS); - minNumPeaksParam.defaultValue(Constants.MIN_NUM_PEAKS_PER_SPECTRUM); - if (isHidden) { - minNumPeaksParam.setHidden(); - } - addParameter(minNumPeaksParam); - } - - private void addNumIsoformsParam(boolean isHidden) { - IntParameter isoParam = new IntParameter(ParamNameEnum.NUM_ISOFORMS); - isoParam.defaultValue(Constants.NUM_VARIANTS_PER_PEPTIDE); - if (isHidden) { - isoParam.setHidden(); - } - addParameter(isoParam); - } - - private void addMetCleavageParamParam(boolean isHidden) { - EnumParameter metCleavageParam = new EnumParameter(ParamNameEnum.IGNORE_MET_CLEAVAGE); - metCleavageParam.registerEntry("Consider protein N-term Met cleavage").setDefault(); - metCleavageParam.registerEntry("Ignore protein N-term Met cleavage"); - if (isHidden) { - metCleavageParam.setHidden(); - } - addParameter(metCleavageParam); - } - - private void addMinDeNovoScoreParam(boolean isHidden) { - IntParameter 
minDeNovoScoreParam = new IntParameter(ParamNameEnum.MIN_DE_NOVO_SCORE); - minDeNovoScoreParam.minValue(Integer.MIN_VALUE); - minDeNovoScoreParam.defaultValue(Constants.MIN_DE_NOVO_SCORE); - if (isHidden) { - minDeNovoScoreParam.setHidden(); - } - addParameter(minDeNovoScoreParam); - } - - /** - * Add parameters for MS-GF+ - */ - public void addMSGFPlusParams() { - - // -conf ConfigurationFileName - addConfigFileParam(); - - // -s SpectrumFile (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt) - addSpecFileParam(true); - - // -d DatabaseFile (*.fasta or *.fa or *.faa) - addDBFileParam(true); - addDecoyPrefixParam(); - - // [-o OutputFile (*.pin or *.tsv)] (Default: [SpectrumFileName].pin) - addMzIdOutputFileParam(); - - addPrecursorMassToleranceParam(); - addPrecursorMassToleranceUnitsParam(true); - - addIsotopeRangeParam(); - - addNumThreadsParam(); - addNumTasksParam(); - addMinSpectraPerThreadParam(); - addVerboseModeParam(); - - addTdaParam(); - - addFragMethodParam(ActivationMethod.ASWRITTEN, true); - addInstTypeParam(); - addEnzymeParam(); - addProtocolParam(); - addEnzymeSpecificityParam(); - - addModFileParam(); - - addMinPeptideLengthParam(); - addMaxPeptideLengthParam(); - addMinChargeParam(); - addMaxChargeParam(); - - addNumMatchesPerSpecParam(); - addAddFeaturesParam(); - addOutputFormatParam(); - addPrecursorCalParam(); - addChargeCarrierMassParam(); - addMaxMissedCleavagesParam(); - addMaxNumModsParam(); - - addAllowDenseCentroidedPeaksParam(); - addMSLevelParam(); - - addExample("Example (high-precision): java -Xmx3500M -jar MSGFPlus.jar -s test.mzML -d IPI_human_3.79.fasta -inst 1 -t 20ppm -ti -1,2 -ntt 2 -tda 1 -o testMSGFPlus.pin -mod Mods.txt"); - addExample("Example (low-precision): java -Xmx3500M -jar MSGFPlus.jar -s test.mzML -d IPI_human_3.79.fasta -inst 0 -t 0.5Da,2.5Da -ntt 2 -tda 1 -o testMSGFPlus.pin -mod Mods.txt"); - - // Hidden parameters - addDbIndexDirParam(true); - addSpecIndexRangeParam(true); - addEdgeScoreParam(true); - 
addMinNumPeaksParam(true); - addNumIsoformsParam(true); - addMetCleavageParamParam(true); - addMinDeNovoScoreParam(true); - - } // MSGFPlusParams - - public FileParameter getSpecFileParam() { - return ((FileParameter) getParameter(ParamNameEnum.SPECTRUM_FILE.key)); - } - - public FileParameter getDBFileParam() { - return ((FileParameter) getParameter(ParamNameEnum.DB_FILE.key)); - } - - public String getDecoyProteinPrefix() { - StringParameter decoyProteinPrefixParam = (StringParameter)getParameter(ParamNameEnum.DECOY_PREFIX.key); - return (decoyProteinPrefixParam.value); - } - - public double getChargeCarrierMass() { - return getDoubleValue(ParamNameEnum.CHARGE_CARRIER_MASSES.key); - } - - public ToleranceParameter getPrecursorMassToleranceParam() { - return ((ToleranceParameter) getParameter(ParamNameEnum.PRECURSOR_MASS_TOLERANCE.key)); - } - - public int getToleranceUnit() { - return getIntValue(ParamNameEnum.PRECURSOR_MASS_TOLERANCE_UNITS.key); - } - - public IntRangeParameter getIsotopeRangeParameter() { - return (IntRangeParameter) getParameter(ParamNameEnum.ISOTOPE_ERROR.key); - } - - public FileParameter getOutputFileParam() { - return ((FileParameter) getParameter(ParamNameEnum.SEARCH_OUTPUT_FILE.key)); - } - - public ActivationMethod getActivationMethod() { - return (ActivationMethod) ((ObjectEnumParameter) getParameter(ParamNameEnum.FRAG_METHOD.key)).getObject(); - } - - public InstrumentType getInstType() { - return (InstrumentType) ((ObjectEnumParameter) getParameter(ParamNameEnum.INSTRUMENT_TYPE.key)).getObject(); - } - - public Enzyme getEnzyme() { - return (Enzyme) ((ObjectEnumParameter) getParameter(ParamNameEnum.ENZYME_ID.key)).getObject(); - } - - public int getNumTolerableTermini() { - return getIntValue(ParamNameEnum.ENZYME_SPECIFICITY.key); - } - - public int getNumMatchesPerSpectrum() { - return getIntValue(ParamNameEnum.NUM_MATCHES_SPEC.key); - } - - public IntRangeParameter getSpecIndexParameter() { - return ((IntRangeParameter) 
getParameter(ParamNameEnum.SPEC_INDEX.key)); - } - - public IntRangeParameter getMSLevelParameter() { - return ((IntRangeParameter) getParameter(ParamNameEnum.MS_LEVEL.key)); - } - - public int getTDA() { - return getIntValue(ParamNameEnum.TDA_STRATEGY.key); - } - - public int getIgnoreMetCleavage() { - return getIntValue(ParamNameEnum.IGNORE_MET_CLEAVAGE.key); - } - - public int getOutputAdditionalFeatures() { - return getIntValue(ParamNameEnum.ADD_FEATURES.key); - } - - public int getMinPeptideLength() { - return getIntValue(ParamNameEnum.MIN_PEPTIDE_LENGTH.key); - } - - public int getMaxPeptideLength() { - return getIntValue(ParamNameEnum.MAX_PEPTIDE_LENGTH.key); - } - - public int getMaxNumVariantsPerPeptide() { - return getIntValue(ParamNameEnum.NUM_ISOFORMS.key); - } - - public int getMinCharge() { - return getIntValue(ParamNameEnum.MIN_CHARGE.key); - } - - public int getMaxCharge() { - return getIntValue(ParamNameEnum.MAX_CHARGE.key); - } - - public int getNumThreads() { - return getIntValue(ParamNameEnum.NUM_THREADS.key); - } - - public int getNumTasks() { - return getIntValue(ParamNameEnum.NUM_TASKS.key); - } - - public int getMinSpectraPerThread() { - return getIntValue(ParamNameEnum.MIN_SPECTRA_PER_THREAD.key); - } - - public int getVerboseFlag() { - return getIntValue(ParamNameEnum.VERBOSE.key); - } - - public int getEdgeScoreFlag() { - return getIntValue(ParamNameEnum.EDGE_SCORE.key); - } - - // Used by MS-GF+ - public File getDatabaseIndexDir() { - return getFile("dd"); - } - - public int getMinNumPeaksPerSpectrum() { - return getIntValue(ParamNameEnum.MIN_NUM_PEAKS.key); - } - - public int getMinDeNovoScore() { - return getIntValue(ParamNameEnum.MIN_DE_NOVO_SCORE.key); - } - - public int getMaxMissedCleavages() { - return getIntValue(ParamNameEnum.MAX_MISSED_CLEAVAGES.key); - } - - public int getMaxNumModsPerPeptide() { - Parameter param = this.getParameter(ParamNameEnum.MAX_NUM_MODS.key); - if (param == null) { - this.addMaxNumModsParam(); - } - 
return getIntValue(ParamNameEnum.MAX_NUM_MODS.key); - } - - public Protocol getProtocol() { - return (Protocol) ((ObjectEnumParameter) getParameter(ParamNameEnum.PROTOCOL_ID.key)).getObject(); - } - - public FileParameter getModFileParam() { - return ((FileParameter) getParameter(ParamNameEnum.MOD_FILE.key)); - } - - // Used by MS-GF+ - public FileParameter getConfigFileParam() { - return ((FileParameter) getParameter(ParamNameEnum.CONFIGURATION_FILE.key)); - } - - // Used by MS-GF+ - public int getAllowDenseCentroidedPeaks() { - return getIntValue(ParamNameEnum.ALLOW_DENSE_CENTROIDED_PEAKS.key); - } - - public int getIntValue(String key) { - Parameter param = this.getParameter(key); - if (param instanceof IntParameter) - return ((IntParameter) param).getValue(); - else { - System.err.println("[Error] in ParamManager.getIntValue: " + key + " is not an instance of IntParameter."); - System.exit(-1); - } - return -1; - } - - public float getFloatValue(String key) { - Parameter param = this.getParameter(key); - if (param instanceof FloatParameter) - return ((FloatParameter) param).getValue(); - else { - System.err.println("[Error] in ParamManager.getFloatValue: " + key + " is not an instance of FloatParameter."); - System.exit(-1); - } - return -1; - } - - public double getDoubleValue(String key) { - Parameter param = this.getParameter(key); - if (param instanceof DoubleParameter) - return ((DoubleParameter) param).getValue(); - else { - System.err.println("[Error] in ParamManager.getDoubleValue: " + key + " is not an instance of DoubleParameter."); - System.exit(-1); - } - return -1; - } - - public File getFile(String key) { - Parameter param = this.getParameter(key); - if (param instanceof FileParameter) - return ((FileParameter) param).getFile(); - else { - System.err.println("[Error] in ParamManager.getFile: " + key + " is not an instance of FileParameter."); - System.exit(-1); - } - return null; - } - - public File[] getFiles(String key) { - Parameter param = 
this.getParameter(key); - if (param instanceof FileListParameter) - return ((FileListParameter) param).getFiles(); - else { - System.err.println("[Error] in ParamManager.getFile: " + key + " is not an instance of FileListParameter."); - System.exit(-1); - } - return null; - } - - public void setMaxNumMods(int numMods) { - Parameter numModsParam = getParameter(ParamManager.ParamNameEnum.MAX_NUM_MODS.getKey()); - numModsParam.parse(String.valueOf(numMods)); - } - -} diff --git a/src/main/java/edu/ucsd/msjava/params/ParamParser.java b/src/main/java/edu/ucsd/msjava/params/ParamParser.java deleted file mode 100644 index ab9000d1..00000000 --- a/src/main/java/edu/ucsd/msjava/params/ParamParser.java +++ /dev/null @@ -1,86 +0,0 @@ -package edu.ucsd.msjava.params; - -import edu.ucsd.msjava.parser.BufferedLineReader; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.HashMap; - - -/** - * This class is for parsing parameter files used in MS-GF, MS-Dictionary and MS-Profile. - * - * @author sangtaekim - */ -public class ParamParser { - public static class Parameters extends CaseInsensitiveMap { - /** - * - */ - private static final long serialVersionUID = 1L; - - public String getParameter(String name) { - return get(name); - } - - public Integer getIntParameter(String name) { - String param = get(name); - if (param == null) - return null; - else return Integer.parseInt(param); - } - - public Float getFloatParameter(String name) { - String param = get(name); - if (param == null) - return null; - else return Float.parseFloat(param); - } - } - - /** - * Parses the specified parameter file. - * - * @param fileName the name of the parameter file. - * @return A table of parameters. 
- */ - public static Parameters parseFromFile(String fileName) { - Parameters params = new Parameters(); - BufferedLineReader in = null; - try { - in = new BufferedLineReader(fileName); - } catch (IOException e) { - e.printStackTrace(); - } - String s; - while ((s = in.readLine()) != null) { - if (s.startsWith("#") || s.length() == 0) - continue; - String[] token = s.split("="); - if (token.length != 2) - continue; - else - params.put(token[0].trim(), token[1].trim()); - } - return params; - } - - public static Parameters parseFromString(String paramString) { - String errMsg = "Number of parameters must be even. If a file path has a space, surround it with double quotes."; - - Parameters params = new Parameters(); - String[] token = paramString.split("\\s+"); - if (token.length % 2 != 0) { - System.err.println(errMsg); - System.exit(-1); - } - - for (int i = 0; i < token.length; i += 2) { - if (!token[i].startsWith("-") || i + 1 >= token.length) { - System.err.println(errMsg); - } - params.put(token[i].trim(), token[i + 1].trim()); - } - return params; - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/Parameter.java b/src/main/java/edu/ucsd/msjava/params/Parameter.java deleted file mode 100644 index 8609c0da..00000000 --- a/src/main/java/edu/ucsd/msjava/params/Parameter.java +++ /dev/null @@ -1,81 +0,0 @@ -package edu.ucsd.msjava.params; - -public abstract class Parameter { - private String key; - private String name; - private String description; - private boolean isOptional = false; - private boolean isValueAssigned = false; - private String additionalDescription = null; - - private boolean hidden = false; - - protected Parameter(String key, String name, String description) { - this.key = key; - this.name = name; - this.description = description; - } - - protected void setOptional() { - this.isOptional = true; - } - - public void setHidden() { - this.hidden = true; - } - - public String getKey() { - return key; - } - - public String getName() { - return 
name; - } - - public String getDescription() { - return description; - } - - public String getAdditionalDescription() { - return additionalDescription; - } - - public boolean isOptional() { - return isOptional; - } - - public boolean isHidden() { - return hidden; - } - - public void setAdditionalDescription(String additionalDescription) { - this.additionalDescription = additionalDescription; - } - - public String toString() { - String usage = "-" + getKey() + " " + getName(); - if (isOptional()) - usage = "[" + usage + "]"; - usage = usage + " " + "(" + getDescription() + ")"; - return usage; - } - - public void setValueAssigned() { - this.isValueAssigned = true; - } - - public boolean isValueAssigned() { - return isValueAssigned; - } - - public boolean isValid() { - if (isOptional) - return true; - - return isValueAssigned(); - } - - public abstract String parse(String value); - - public abstract String getValueAsString(); -} diff --git a/src/main/java/edu/ucsd/msjava/params/RangeParameter.java b/src/main/java/edu/ucsd/msjava/params/RangeParameter.java deleted file mode 100644 index 236924c1..00000000 --- a/src/main/java/edu/ucsd/msjava/params/RangeParameter.java +++ /dev/null @@ -1,66 +0,0 @@ -package edu.ucsd.msjava.params; - -public abstract class RangeParameter> extends Parameter { - protected T min = null; - protected T max = null; - protected T minValue; // default: inclusive - protected T maxValue; // default: exclusive - protected boolean isMinInclusive = true; - protected boolean isMaxInclusive = false; - - public RangeParameter(String key, String name, String description) { - super(key, name, description); - } - - public RangeParameter minValue(T minValue) { - this.minValue = minValue; - return this; - } - - public RangeParameter maxValue(T maxValue) { - this.maxValue = maxValue; - return this; - } - - public RangeParameter setMinExclusive() { - this.isMinInclusive = false; - return this; - } - - public RangeParameter setMaxInclusive() { - 
this.isMaxInclusive = true; - return this; - } - - public boolean isValueValid(T value) { - return !(value.compareTo(minValue) < 0 || value.compareTo(maxValue) > 0 - || !isMinInclusive && value.equals(minValue) - || !isMaxInclusive && value.equals(maxValue)); - } - - public RangeParameter defaultValue(String value) { - super.setOptional(); - String error = parse(value); - if (error != null) { - System.err.println("(RangeParameter) Error while parsing the default value: " + error); - System.exit(-1); - } - return this; - } - - public abstract String parse(String value); - - - @Override - public String getValueAsString() { - return min + "," + max; - } - - public T getMin() { - return min; - } - - public T getMax() { - return max; - } -} diff --git a/src/main/java/edu/ucsd/msjava/params/StringParameter.java b/src/main/java/edu/ucsd/msjava/params/StringParameter.java deleted file mode 100644 index 4783bcc6..00000000 --- a/src/main/java/edu/ucsd/msjava/params/StringParameter.java +++ /dev/null @@ -1,32 +0,0 @@ -package edu.ucsd.msjava.params; - -public class StringParameter extends Parameter { - String value = null; - - public StringParameter(ParamManager.ParamNameEnum paramInfo) { - super(paramInfo.getKey(), paramInfo.getName(), paramInfo.getDescription()); - setAdditionalDescription(paramInfo.getAdditionalDescription()); - } - - public StringParameter(String key, String name, String description) { - super(key, name, description); - } - - public StringParameter defaultValue(String defaultValue) { - this.value = defaultValue; - super.setOptional(); - return this; - } - - @Override - public String parse(String value) { - this.value = value.trim(); - return null; - } - - @Override - public String getValueAsString() { - return (value == null ? 
"null" : value); - } - -} diff --git a/src/main/java/edu/ucsd/msjava/params/ToleranceParameter.java b/src/main/java/edu/ucsd/msjava/params/ToleranceParameter.java deleted file mode 100644 index 05c47244..00000000 --- a/src/main/java/edu/ucsd/msjava/params/ToleranceParameter.java +++ /dev/null @@ -1,70 +0,0 @@ -package edu.ucsd.msjava.params; - -import edu.ucsd.msjava.msgf.Tolerance; - -public class ToleranceParameter extends Parameter { - - private Tolerance leftTolerance; - private Tolerance rightTolerance; - private boolean allowAsymmetricValues = true; - - public ToleranceParameter(ParamManager.ParamNameEnum paramInfo) { - super(paramInfo.getKey(), paramInfo.getName(), paramInfo.getDescription()); - setAdditionalDescription(paramInfo.getAdditionalDescription()); - } - - - public ToleranceParameter defaultValue(String value) { - super.setOptional(); - String error = parse(value); - if (error != null) { - System.err.println("(ToleranceParameter) Error while setting default value: " + error); - System.exit(-1); - } - return this; - } - - public ToleranceParameter doNotAllowAsymmetricValues() { - this.allowAsymmetricValues = false; - return this; - } - - @Override - public String parse(String value) { - String[] token = value.split(","); - if (token.length == 1) { - leftTolerance = rightTolerance = Tolerance.parseToleranceStr(token[0]); - } else if (token.length == 2) { - if (allowAsymmetricValues) { - leftTolerance = Tolerance.parseToleranceStr(token[0]); - rightTolerance = Tolerance.parseToleranceStr(token[1]); - } else - return "asymmetric values are not allowed"; - } - if (leftTolerance == null || rightTolerance == null) { - return "invalid tolerance value"; - } - if (leftTolerance.isTolerancePPM() != rightTolerance.isTolerancePPM()) { - return "left and right tolerance units must be the same"; - } - if (leftTolerance.getValue() < 0 || rightTolerance.getValue() < 0) { - return "parent mass tolerance must not be negative"; - } - return null; - } - - @Override - 
public String getValueAsString() { - if (leftTolerance == null || rightTolerance == null) - return null; - return leftTolerance.toString() + "," + rightTolerance.toString(); - } - - public Tolerance getLeftTolerance() { - return leftTolerance; - } - - public Tolerance getRightTolerance() { - return rightTolerance; - } -} diff --git a/src/test/java/msgfplus/TestMSLevelFiltering.java b/src/test/java/msgfplus/TestMSLevelFiltering.java deleted file mode 100644 index 361c574d..00000000 --- a/src/test/java/msgfplus/TestMSLevelFiltering.java +++ /dev/null @@ -1,76 +0,0 @@ -package msgfplus; - -import static org.junit.Assert.*; - -import edu.ucsd.msjava.params.IntRangeParameter; -import edu.ucsd.msjava.params.ParamManager; -import org.junit.Test; - -/** - * Tests for the -msLevel parameter (issue #159). - * Verifies that MS level filtering is properly wired through ParamManager. - */ -public class TestMSLevelFiltering { - - private ParamManager createParamManager() { - ParamManager pm = new ParamManager("MS-GF+", "test", "2024.01.01", "test"); - pm.addMSGFPlusParams(); - return pm; - } - - @Test - public void testMSLevelParameterExists() { - ParamManager pm = createParamManager(); - IntRangeParameter msLevel = pm.getMSLevelParameter(); - assertNotNull("MS_LEVEL parameter should exist", msLevel); - } - - @Test - public void testMSLevelDefaultIsMS2() { - ParamManager pm = createParamManager(); - IntRangeParameter msLevel = pm.getMSLevelParameter(); - // Default should be MS2 only (2,2) - assertEquals("Default min MS level should be 2", 2, (int) msLevel.getMin()); - assertEquals("Default max MS level should be 2", 2, (int) msLevel.getMax()); - } - - @Test - public void testMSLevelParseSingleValue() { - ParamManager pm = createParamManager(); - IntRangeParameter msLevel = pm.getMSLevelParameter(); - String err = msLevel.parse("2"); - assertNull("Parsing '2' should succeed", err); - assertEquals(2, (int) msLevel.getMin()); - assertEquals(2, (int) msLevel.getMax()); - } - - 
@Test - public void testMSLevelParseRange() { - ParamManager pm = createParamManager(); - IntRangeParameter msLevel = pm.getMSLevelParameter(); - String err = msLevel.parse("2,3"); - assertNull("Parsing '2,3' should succeed", err); - assertEquals(2, (int) msLevel.getMin()); - assertEquals(3, (int) msLevel.getMax()); - } - - @Test - public void testMSLevelParseMS3Only() { - ParamManager pm = createParamManager(); - IntRangeParameter msLevel = pm.getMSLevelParameter(); - String err = msLevel.parse("3"); - assertNull("Parsing '3' should succeed", err); - assertEquals(3, (int) msLevel.getMin()); - assertEquals(3, (int) msLevel.getMax()); - } - - @Test - public void testMSLevelParseWideRange() { - ParamManager pm = createParamManager(); - IntRangeParameter msLevel = pm.getMSLevelParameter(); - String err = msLevel.parse("1,5"); - assertNull("Parsing '1,5' should succeed", err); - assertEquals(1, (int) msLevel.getMin()); - assertEquals(5, (int) msLevel.getMax()); - } -} From 1c68fb2ffc9107679a9f5806e3d9e3017f090f8d Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 06:37:04 +0100 Subject: [PATCH 20/34] refactor: drop MS2/PKL/DTA_TXT spectrum format support Remove legacy text-format parsers; only MGF and mzML are retained. 
- Delete MS2SpectrumParser, PklSpectrumParser, PNNLSpectrumParser, PNNLSpectraIterator, PNNLSpectraMap from parser/ - Delete dead-code SPTxtParser, SpectrumParserWithTitle, TSVParser, TSVResultParser, FullyBufferedLineReader from parser/ - Delete SpectraMapByTitle and SpectrumAccessorByTitle from msutil/ (only callers were in the deleted SPTxtParser) - Remove MS2/PKL/DTA_TXT/MZDATA entries from SpecFileFormat - Prune corresponding branches from SpectraAccessor.getSpecMap() and getSpecItr() and getSpectrumIDFormatCvParam() - Prune Spectrum.getSpectrumFileFormat() to mzML + MGF only - Update SearchParams.isSupportedSpectrumFormat() to mzML + MGF only - Update MSGFPlusOptions -s description to list only *.mzML, *.mgf - Remove @Ignore generateTRexPRMSpectra test (used deleted TSVParser) --- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 2 +- .../ucsd/msjava/msdbsearch/SearchParams.java | 7 +- .../ucsd/msjava/msutil/SpecFileFormat.java | 12 - .../ucsd/msjava/msutil/SpectraAccessor.java | 50 +--- .../ucsd/msjava/msutil/SpectraMapByTitle.java | 26 -- .../java/edu/ucsd/msjava/msutil/Spectrum.java | 9 - .../msutil/SpectrumAccessorByTitle.java | 5 - .../parser/FullyBufferedLineReader.java | 81 ------ .../ucsd/msjava/parser/MS2SpectrumParser.java | 194 ------------- .../msjava/parser/PNNLSpectraIterator.java | 46 ---- .../ucsd/msjava/parser/PNNLSpectraMap.java | 47 ---- .../msjava/parser/PNNLSpectrumParser.java | 256 ------------------ .../ucsd/msjava/parser/PklSpectrumParser.java | 127 --------- .../edu/ucsd/msjava/parser/SPTxtParser.java | 140 ---------- .../parser/SpectrumParserWithTitle.java | 8 - .../edu/ucsd/msjava/parser/TSVParser.java | 44 --- .../ucsd/msjava/parser/TSVResultParser.java | 133 --------- src/test/java/msgfplus/TestMisc.java | 74 ----- 18 files changed, 14 insertions(+), 1247 deletions(-) delete mode 100644 src/main/java/edu/ucsd/msjava/msutil/SpectraMapByTitle.java delete mode 100644 src/main/java/edu/ucsd/msjava/msutil/SpectrumAccessorByTitle.java 
delete mode 100644 src/main/java/edu/ucsd/msjava/parser/FullyBufferedLineReader.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/MS2SpectrumParser.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/PNNLSpectraIterator.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/PNNLSpectraMap.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/PNNLSpectrumParser.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/PklSpectrumParser.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/SPTxtParser.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/SpectrumParserWithTitle.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/TSVParser.java delete mode 100644 src/main/java/edu/ucsd/msjava/parser/TSVResultParser.java diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index 26f82988..e6416223 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -40,7 +40,7 @@ public final class MSGFPlusOptions { // ---------- input (required at runtime, but may be provided via -conf) ---------- @Option(names = "-s", paramLabel = "SpectrumFile", - description = "Input spectrum file (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl, *_dta.txt) or directory of spectra. " + description = "Input spectrum file (*.mzML, *.mgf) or directory of spectra. " + "Required, unless provided via -conf as SpectrumFile=...") public File spectrumFile; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index a089d88c..2c66a799 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -500,13 +500,10 @@ public String parse(MSGFPlusOptions opts) { return null; } - /** Spectrum-format whitelist (formerly enforced by FileParameter.isSupported). 
*/ + /** Spectrum-format whitelist: only mzML and MGF are supported. */ private static boolean isSupportedSpectrumFormat(SpecFileFormat fmt) { return fmt == SpecFileFormat.MZML - || fmt == SpecFileFormat.MGF - || fmt == SpecFileFormat.MS2 - || fmt == SpecFileFormat.PKL - || fmt == SpecFileFormat.DTA_TXT; + || fmt == SpecFileFormat.MGF; } diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpecFileFormat.java b/src/main/java/edu/ucsd/msjava/msutil/SpecFileFormat.java index 87b2d0c3..20ed52f1 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpecFileFormat.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpecFileFormat.java @@ -23,10 +23,6 @@ public String getPSIName() { public static final SpecFileFormat MGF; public static final SpecFileFormat MZML; - public static final SpecFileFormat MS2; - public static final SpecFileFormat PKL; - public static final SpecFileFormat MZDATA; - public static final SpecFileFormat DTA_TXT; public static SpecFileFormat getSpecFileFormat(String specFileName) { String lowerCaseFileName = specFileName.toLowerCase(); @@ -44,17 +40,9 @@ public static SpecFileFormat getSpecFileFormat(String specFileName) { static { MGF = new SpecFileFormat(".mgf", "MS:1001062", "Mascot MGF file"); MZML = new SpecFileFormat(".mzML", "MS:1000584", "mzML file"); - MS2 = new SpecFileFormat(".ms2", "MS:1001466", "MS2 file"); - PKL = new SpecFileFormat(".pkl", "MS:1000565", "Micromass PKL file"); - MZDATA = new SpecFileFormat(".mzData", "MS:1000564", "PSI mzData file"); - DTA_TXT = new SpecFileFormat("_dta.txt", "MS:XXXXXXX", "PNNL dta.txt file"); specFileFormatList = new ArrayList(); specFileFormatList.add(MGF); specFileFormatList.add(MZML); - specFileFormatList.add(MS2); - specFileFormatList.add(PKL); - specFileFormatList.add(MZDATA); - specFileFormatList.add(DTA_TXT); } } diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java index 57523ec9..56b51ef3 100644 --- 
a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java @@ -3,10 +3,10 @@ import edu.ucsd.msjava.mzml.StaxMzMLParser; import edu.ucsd.msjava.mzml.StaxMzMLSpectraIterator; import edu.ucsd.msjava.mzml.StaxMzMLSpectraMap; -import edu.ucsd.msjava.parser.*; +import edu.ucsd.msjava.parser.MgfSpectrumParser; +import edu.ucsd.msjava.parser.SpectrumParser; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.util.Iterator; @@ -71,21 +71,12 @@ public SpectrumAccessorBySpecIndex getSpecMap() { } } specMap = new StaxMzMLSpectraMap(staxParser, minMSLevel, maxMSLevel); - } else if (specFormat == SpecFileFormat.DTA_TXT) - specMap = new PNNLSpectraMap(specFile.getPath()); - else { - SpectrumParser parser = null; - if (specFormat == SpecFileFormat.MGF) - parser = new MgfSpectrumParser(); - else if (specFormat == SpecFileFormat.MS2) - parser = new MS2SpectrumParser(); - else if (specFormat == SpecFileFormat.PKL) - parser = new PklSpectrumParser(); - else - return null; - + } else if (specFormat == SpecFileFormat.MGF) { + SpectrumParser parser = new MgfSpectrumParser(); spectrumParser = parser; specMap = new SpectraMap(specFile.getPath(), parser); + } else { + return null; } } @@ -108,29 +99,16 @@ public Iterator getSpecItr() { } } specItr = new StaxMzMLSpectraIterator(staxParser, minMSLevel, maxMSLevel); - } else if (specFormat == SpecFileFormat.DTA_TXT) - try { - specItr = new PNNLSpectraIterator(specFile.getPath()); - } catch (IOException e) { - e.printStackTrace(); - } - else { - SpectrumParser parser = null; - if (specFormat == SpecFileFormat.MGF) - parser = new MgfSpectrumParser(); - else if (specFormat == SpecFileFormat.MS2) - parser = new MS2SpectrumParser(); - else if (specFormat == SpecFileFormat.PKL) - parser = new PklSpectrumParser(); - else - return null; - + } else if (specFormat == SpecFileFormat.MGF) { + SpectrumParser parser = new MgfSpectrumParser(); 
spectrumParser = parser; try { specItr = new SpectraIterator(specFile.getPath(), parser); } catch (IOException e) { e.printStackTrace(); } + } else { + return null; } } @@ -167,14 +145,8 @@ public String getTitle(int specIndex) { public CvParamInfo getSpectrumIDFormatCvParam() { CvParamInfo cvParam = null; - if (specFormat == SpecFileFormat.DTA_TXT - || specFormat == SpecFileFormat.MGF - || specFormat == SpecFileFormat.PKL - || specFormat == SpecFileFormat.MS2 - ) + if (specFormat == SpecFileFormat.MGF) cvParam = new CvParamInfo("MS:1000774", "multiple peak list nativeID format", null); - else if (specFormat == SpecFileFormat.MZDATA) - cvParam = new CvParamInfo("MS:1000777", "spectrum identifier nativeID format", null); else if (specFormat == SpecFileFormat.MZML) { if (staxParser == null) { try { diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraMapByTitle.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraMapByTitle.java deleted file mode 100644 index 68db2eba..00000000 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectraMapByTitle.java +++ /dev/null @@ -1,26 +0,0 @@ -package edu.ucsd.msjava.msutil; - -import edu.ucsd.msjava.parser.SpectrumParserWithTitle; - -import java.util.Hashtable; - - -public class SpectraMapByTitle extends SpectraMap implements SpectrumAccessorByTitle { - - private Hashtable titleToSpecIndex = null; // key: specIndex, value: filePos - - public SpectraMapByTitle(String fileName, SpectrumParserWithTitle parser) { - super(fileName, parser); - lineReader.seek(0); - titleToSpecIndex = parser.getTitleToSpecIndexMap(super.lineReader); - } - - public Spectrum getSpectrumByTitle(String title) { - Integer specIndex = titleToSpecIndex.get(title); - if (specIndex == null) - return null; - else - return super.getSpectrumBySpecIndex(specIndex); - } - -} diff --git a/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java b/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java index 1e6e27f6..473f622e 100644 --- 
a/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java @@ -1029,15 +1029,6 @@ public static SpecFileFormat getSpectrumFileFormat(String specFileName) { specFormat = SpecFileFormat.MZML; else if (extension.equalsIgnoreCase(".mgf")) specFormat = SpecFileFormat.MGF; - else if (extension.equalsIgnoreCase(".ms2")) - specFormat = SpecFileFormat.MS2; - else if (extension.equalsIgnoreCase(".pkl")) - specFormat = SpecFileFormat.PKL; - } - if (specFormat == null && specFileName.length() > 8) { - String suffix = specFileName.substring(specFileName.length() - 8); - if (suffix.equalsIgnoreCase("_dta.txt")) - specFormat = SpecFileFormat.DTA_TXT; } return specFormat; diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectrumAccessorByTitle.java b/src/main/java/edu/ucsd/msjava/msutil/SpectrumAccessorByTitle.java deleted file mode 100644 index 933fbef7..00000000 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectrumAccessorByTitle.java +++ /dev/null @@ -1,5 +0,0 @@ -package edu.ucsd.msjava.msutil; - -public interface SpectrumAccessorByTitle { - Spectrum getSpectrumByTitle(String title); -} diff --git a/src/main/java/edu/ucsd/msjava/parser/FullyBufferedLineReader.java b/src/main/java/edu/ucsd/msjava/parser/FullyBufferedLineReader.java deleted file mode 100644 index 0dd8b88a..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/FullyBufferedLineReader.java +++ /dev/null @@ -1,81 +0,0 @@ -package edu.ucsd.msjava.parser; - -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; - -public class FullyBufferedLineReader implements LineReader { - private int pointer; - private byte[] buffer; - - private final byte CR = (byte) '\r'; - private final byte NL = (byte) '\n'; - int startIndex; - - public FullyBufferedLineReader(String fileName) { - FileInputStream fin = null; - try { - fin = new FileInputStream(fileName); - } catch 
(FileNotFoundException e) { - e.printStackTrace(); - } - - // load file into memory - FileChannel in = fin.getChannel(); - ByteBuffer tempBuffer = null; - try { -// System.out.println(Integer.MAX_VALUE + "\t" + in.size() + "\t" + (int)in.size()); - tempBuffer = ByteBuffer.allocate((int) in.size()); // file size must be smaller than 2^32 - in.read(tempBuffer); - } catch (IOException e1) { - e1.printStackTrace(); - } - - buffer = tempBuffer.array(); - pointer = 0; - startIndex = 0; - } - - public String readLine() // line terminating char: \n or \r\n - { - if (pointer >= buffer.length) - return null; - while (pointer < buffer.length) { - if (buffer[pointer] != NL) - pointer++; - else { - String str; - if (pointer > 0 && buffer[pointer - 1] == CR) - str = new String(buffer, startIndex, (pointer - startIndex - 1)); - else - str = new String(buffer, startIndex, (pointer - startIndex)); - pointer++; - startIndex = pointer; - return str; - } - } - String str = new String(buffer, startIndex, (pointer - startIndex)); - startIndex = pointer; - return str; - } - - public int getPosition() { - return pointer; - } - - public void seek(int position) { - pointer = position; - startIndex = pointer; - } - - public void reset() { - pointer = 0; - startIndex = 0; - } - - public int size() { - return buffer.length; - } -} diff --git a/src/main/java/edu/ucsd/msjava/parser/MS2SpectrumParser.java b/src/main/java/edu/ucsd/msjava/parser/MS2SpectrumParser.java deleted file mode 100644 index ca1abf80..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/MS2SpectrumParser.java +++ /dev/null @@ -1,194 +0,0 @@ -package edu.ucsd.msjava.parser; - -import edu.ucsd.msjava.msutil.*; - -import java.util.Collections; -import java.util.Hashtable; -import java.util.Map; - -/** - * A class that parses MS2 format - * - * @author sangtaekim - */ -public class MS2SpectrumParser implements SpectrumParser { - - private Spectrum spec = null; - private Boolean isSpecSorted = null; - - /** - * Number of scans 
where we could not determine the scan number - * This method is required by interface SpectrumParser - * However, this class does not keep track of spectra without a scan number - * - * @return - */ - public long getScanMissingWarningCount() { - return 0; - } - - /** - * Reads a spectrum from ms2 file and returns it. - * - * @param lineReader A LineReader object points to the start of a spectrum. - * @return a spectrum object. - */ - public Spectrum readSpectrum(LineReader lineReader) { - float prevMass = 0; - String buf; - - do { - buf = lineReader.readLine(); - } - while (buf != null && buf.startsWith("H")); - if (buf == null) - return null; - - if (buf.startsWith("S")) { - String[] token = buf.split("\\s+"); - spec = new Spectrum(); - int startScanNum = Integer.parseInt(token[1]); - int endScanNum = Integer.parseInt(token[2]); - float precursorMz = Float.parseFloat(token[3]); - spec = new Spectrum(precursorMz, 0, 0); - spec.setStartScanNum(startScanNum); - spec.setEndScanNum(endScanNum); - isSpecSorted = true; - } else if (spec == null) { - return null; - } - - boolean zParsed = false; - while ((buf = lineReader.readLine()) != null) { - String[] token = buf.split("\\s+"); - if (buf.startsWith("H")) - continue; - else if (buf.startsWith("S")) // start of a next spectrum - { - Spectrum specCopy = spec; - Boolean isSpecSortedCopy = isSpecSorted; - - spec = new Spectrum(); - int startScanNum = Integer.parseInt(token[1]); - int endScanNum = Integer.parseInt(token[2]); - float precursorMz = Float.parseFloat(token[3]); - spec = new Spectrum(precursorMz, 0, 0); - spec.setStartScanNum(startScanNum); - spec.setEndScanNum(endScanNum); - isSpecSorted = true; - - if (!isSpecSortedCopy) - Collections.sort(specCopy); - return specCopy; - } else if (buf.startsWith("Z")) { - if (!zParsed) { - int charge = Integer.parseInt(token[1]); - float precursorMH = Float.parseFloat(token[2]); - float precursorMz = ((precursorMH - (float) Composition.ChargeCarrierMass()) + charge * (float) 
Composition.ChargeCarrierMass()) / charge; - spec.setPrecursor(new Peak(precursorMz, 0, charge)); - zParsed = true; - } else { - spec.setPrecursorCharge(0); - } - } else if (token.length == 2) // a peak - { - assert (spec != null); - float mass = Float.parseFloat(token[0]); - if (isSpecSorted && mass < prevMass) - isSpecSorted = false; - else - prevMass = mass; - float intensity = Float.parseFloat(token[1]); - spec.add(new Peak(mass, intensity, 1)); - } - } - - if (spec != null) { - if (!isSpecSorted) - Collections.sort(spec); - Spectrum specCopy = spec; - spec = null; - return specCopy; - } - - return spec; - } - - /** - * Read the entire ms2 file and generates a map from spectrum indexes to file positions of spectra. - * - * @param lineReader A reader points to the start of the spectrum. - * @return A Hashtable object maps a spectrum index into a file position. - */ - public Map getSpecMetaInfoMap( - BufferedRandomAccessLineReader lineReader) { - Hashtable specIndexMap = new Hashtable(); - String buf; - long offset = 0; - int specIndex = 0; - - SpectrumMetaInfo metaInfo = null; - while ((buf = lineReader.readLine()) != null) { - if (buf.startsWith("S")) // scan - { - specIndex++; - - metaInfo = new SpectrumMetaInfo(); - metaInfo.setPosition(offset); - metaInfo.setID("index=" + (specIndex - 1)); - - String[] token = buf.split("\\s+"); - if (token.length < 4) { - System.err.println("Invalid ms2 file format!"); - System.exit(-1); - } - float precursorMz = Float.parseFloat(token[3]); - metaInfo.setPrecursorMz(precursorMz); - specIndexMap.put(specIndex, metaInfo); - } - - offset = lineReader.getPosition(); - } - return specIndexMap; - } - - public static void test() throws Exception { - String fileName = System.getProperty("user.home") + "/Research/Data/QCShew/QC_Shew_12_02_2_1Aug12_Cougar_12-06-11.ms2"; - - java.util.Map specIndexPrecursorMzMap = new java.util.HashMap(); - int numSpecs; - - numSpecs = 0; - SpectraMap map = new SpectraMap(fileName, new 
MS2SpectrumParser()); - - for (int specIndex : map.getSpecIndexList()) { - Spectrum spec = map.getSpectrumBySpecIndex(specIndex); - numSpecs++; - specIndexPrecursorMzMap.put(spec.getSpecIndex(), spec.getPrecursorPeak().getMz()); - } - System.out.println("NumSpectra: " + numSpecs); - -// Spectrum scan87 = map.getSpectrumBySpecIndex(79); -// System.out.println("**** " + scan87.getPrecursorPeak().getMz()+" "+scan87.getPrecursorPeak().getCharge()); - - numSpecs = 0; - SpectraIterator iterator = new SpectraIterator(fileName, new MS2SpectrumParser()); - while (iterator.hasNext()) { - Spectrum spec = iterator.next(); - numSpecs++; - - Float precursorMz = specIndexPrecursorMzMap.get(spec.getSpecIndex()); -// System.out.println(spec.getPrecursorPeak().getMz()+" "+spec.getCharge()+" "+spec.getSpecIndex()+" "+spec.getScanNum()); - if (precursorMz == null || precursorMz != spec.getPrecursorPeak().getMz()) { - System.out.println(precursorMz + " != " + spec.getPrecursorPeak().getMz()); - System.exit(0); - } - } - - System.out.println("NumSpectra: " + numSpecs); - } - - public static void main(String argv[]) throws Exception { - test(); - } -} diff --git a/src/main/java/edu/ucsd/msjava/parser/PNNLSpectraIterator.java b/src/main/java/edu/ucsd/msjava/parser/PNNLSpectraIterator.java deleted file mode 100644 index ecc5709e..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/PNNLSpectraIterator.java +++ /dev/null @@ -1,46 +0,0 @@ -package edu.ucsd.msjava.parser; - -import edu.ucsd.msjava.msutil.ScanType; -import edu.ucsd.msjava.msutil.SpectraIterator; -import edu.ucsd.msjava.msutil.Spectrum; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Iterator; - -public class PNNLSpectraIterator extends SpectraIterator { - - private HashMap scanNumScanTypeMap; - - public PNNLSpectraIterator(String fileName) throws IOException { - super(fileName, new PNNLSpectrumParser()); - scanNumScanTypeMap = PNNLSpectrumParser.getScanTypeMap(fileName); - } - - @Override - public 
Spectrum next() { - if (scanNumScanTypeMap == null) - return super.next(); - - Spectrum spec = super.next(); - ScanType scanType = scanNumScanTypeMap.get(spec.getScanNum()); - if (scanType != null) { - spec.setActivationMethod(scanType.getActivationMethod()); - spec.setIsHighPrecision(scanType.isHighPrecision()); - spec.setMsLevel(scanType.getMsLevel()); - spec.setRt(scanType.getScanStartTime()); - spec.setRtIsSeconds(false); - } - return spec; - } - - public static void main(String argv[]) throws Exception { - String fileName = System.getProperty("user.home") + "/Test/Matt/QC_Shew_11_03_200ng_4_23Aug11_Hawk_11-05-04p_dta.txt"; - PNNLSpectraIterator itr = new PNNLSpectraIterator(fileName); - Iterator specItr = itr.iterator(); - while (specItr.hasNext()) { - Spectrum spec = specItr.next(); - System.out.println(spec.getScanNum() + "\t" + spec.getActivationMethod()); - } - } -} diff --git a/src/main/java/edu/ucsd/msjava/parser/PNNLSpectraMap.java b/src/main/java/edu/ucsd/msjava/parser/PNNLSpectraMap.java deleted file mode 100644 index 7adf8b94..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/PNNLSpectraMap.java +++ /dev/null @@ -1,47 +0,0 @@ -package edu.ucsd.msjava.parser; - -import edu.ucsd.msjava.msutil.ScanType; -import edu.ucsd.msjava.msutil.SpectraMap; -import edu.ucsd.msjava.msutil.Spectrum; - -import java.util.HashMap; - -public class PNNLSpectraMap extends SpectraMap { - - private HashMap scanNumScanTypeMap; - - public PNNLSpectraMap(String fileName) { - super(fileName, new PNNLSpectrumParser()); - scanNumScanTypeMap = PNNLSpectrumParser.getScanTypeMap(fileName); - } - - @Override - public synchronized Spectrum getSpectrumBySpecIndex(int specIndex) { - if (scanNumScanTypeMap == null) - return super.getSpectrumBySpecIndex(specIndex); - else { - Spectrum spec = super.getSpectrumBySpecIndex(specIndex); - ScanType scanType = scanNumScanTypeMap.get(spec.getScanNum()); - if (scanType != null) { - spec.setActivationMethod(scanType.getActivationMethod()); - 
spec.setIsHighPrecision(scanType.isHighPrecision()); - spec.setMsLevel(scanType.getMsLevel()); - spec.setRt(scanType.getScanStartTime()); - spec.setRtIsSeconds(false); - } - - return spec; - } - } - - public static void main(String argv[]) throws Exception { - String fileName = System.getProperty("user.home") + "/Test/Matt/QC_Shew_11_03_200ng_4_23Aug11_Hawk_11-05-04p_dta.txt"; - PNNLSpectraMap map = new PNNLSpectraMap(fileName); - for (int specIndex : map.getSpecIndexList()) { - Spectrum spec = map.getSpectrumBySpecIndex(specIndex); - System.out.println(spec.getScanNum() + "\t" + spec.getActivationMethod()); - } - } - - -} diff --git a/src/main/java/edu/ucsd/msjava/parser/PNNLSpectrumParser.java b/src/main/java/edu/ucsd/msjava/parser/PNNLSpectrumParser.java deleted file mode 100644 index 0b3c6e09..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/PNNLSpectrumParser.java +++ /dev/null @@ -1,256 +0,0 @@ -package edu.ucsd.msjava.parser; - -import edu.ucsd.msjava.msutil.*; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.*; - -public class PNNLSpectrumParser implements SpectrumParser { - - public static final String SCAN_TYPE_FILE_EXTENSION = "_ScanType.txt"; - - /** - * Number of scans where we could not determine the scan number - * This method is required by interface SpectrumParser - * However, this class does not keep track of spectra without a scan number - * - * @return - */ - - public long getScanMissingWarningCount() { - return 0; - } - public Spectrum readSpectrum(LineReader lineReader) { - Spectrum spec = null; - - String buf; - float prevMass = 0; - boolean isSorted = true; - - while ((buf = lineReader.readLine()) != null) { - if (buf.length() == 0) { - if (spec != null) { - if (!isSorted) - Collections.sort(spec); - return spec; - } else - continue; - } else if (buf.startsWith("==")) { - if (spec != null) { - System.out.println("There must be at least one empty line between spectra: " + buf); - 
System.exit(-1); - } - int lastDotIndex = buf.lastIndexOf('.'); - int secondLastDotIndex = buf.lastIndexOf('.', lastDotIndex - 1); - int thirdLastDotIndex = buf.lastIndexOf('.', secondLastDotIndex - 1); - int fourthLastDotIndex = buf.lastIndexOf('.', thirdLastDotIndex - 1); - - int scanNum = Integer.parseInt(buf.substring(fourthLastDotIndex + 1, thirdLastDotIndex)); - - String annotation = buf; - // first line of a spectrum - buf = lineReader.readLine(); - if (buf == null || buf.trim().length() == 0) { - System.out.println("Error while parsing _Dta.txt file: " + annotation); - System.out.println("No spectrum!"); - System.exit(-1); - } - - spec = new Spectrum(); - String[] token = buf.split("\\s+"); - float mPlusH = Float.parseFloat(token[0]); - int charge = Integer.parseInt(token[1].substring(token[1].indexOf('=') + 1)); - float precursorMz = (mPlusH - (float) Composition.ChargeCarrierMass()) / charge + (float) Composition.ChargeCarrierMass(); - spec.setPrecursor(new Peak(precursorMz, 0, charge)); - spec.setScanNum(scanNum); - } else if (Character.isDigit(buf.charAt(0))) // peak - { - if (spec == null) { - System.out.println("Error while parsing _Dta.txt file."); - System.out.println("Header line is missing: " + buf); - System.exit(-1); - } - String[] token2 = buf.split("\\s+"); - if (token2.length != 2) - continue; - float mass = Float.parseFloat(token2[0]); - if (isSorted && mass < prevMass) - isSorted = false; - - float intensity = Float.parseFloat(token2[1]); - spec.add(new Peak(mass, intensity, 1)); - prevMass = mass; - } - } - return spec; - } - - @Override - public Map getSpecMetaInfoMap(BufferedRandomAccessLineReader lineReader) { - Hashtable specIndexMap = new Hashtable(); - String buf; - long offset = 0; - int specIndex = 0; - while ((buf = lineReader.readLine()) != null) { - if (buf.startsWith("==")) { -// specIndexMap.put(++specIndex, offset); - ++specIndex; - int lastDotIndex = buf.lastIndexOf('.'); - int secondLastDotIndex = buf.lastIndexOf('.', 
lastDotIndex - 1); - int thirdLastDotIndex = buf.lastIndexOf('.', secondLastDotIndex - 1); - int fourthLastDotIndex = buf.lastIndexOf('.', thirdLastDotIndex - 1); - - String annotation = buf; - // first line of a spectrum - buf = lineReader.readLine(); - if (buf == null || buf.trim().length() == 0) { - System.out.println("Error while parsing _Dta.txt file: " + annotation); - System.out.println("No spectrum!"); - System.exit(-1); - } - - String[] token = buf.split("\\s+"); - float mPlusH = Float.parseFloat(token[0]); - int charge = Integer.parseInt(token[1].substring(token[1].indexOf('=') + 1)); - float precursorMz = (mPlusH - (float) Composition.ChargeCarrierMass()) / charge + (float) Composition.ChargeCarrierMass(); - - SpectrumMetaInfo metaInfo = new SpectrumMetaInfo(); - metaInfo.setID("index=" + (specIndex - 1)); - metaInfo.setPrecursorMz(precursorMz); - metaInfo.setPosition(offset); - specIndexMap.put(specIndex, metaInfo); - } - offset = lineReader.getPosition(); - } - return specIndexMap; - } - -// static class ScanType -// { -// public ScanType(ActivationMethod activationMethod, -// boolean isHighPrecision) { -// this.activationMethod = activationMethod; -// this.isHighPrecision = isHighPrecision; -// } -// -// ActivationMethod getActivationMethod() { -// return activationMethod; -// } -// boolean isHighPrecision() { -// return isHighPrecision; -// } -// -// private ActivationMethod activationMethod; -// private boolean isHighPrecision; -// } - - static HashMap getScanTypeMap(String fileName) { - File specFile = new File(fileName); - String scanTypeFileName = - specFile.getAbsoluteFile().getParentFile().getPath() - + File.separator - + specFile.getName().substring(0, specFile.getName().lastIndexOf('_')) - + PNNLSpectrumParser.SCAN_TYPE_FILE_EXTENSION; - File scanTypeFile = new File(scanTypeFileName); - - if (!scanTypeFile.exists()) - return null; - - HashMap scanNumScanTypeMap = new HashMap(); - - BufferedLineReader in = null; - try { - in = new 
BufferedLineReader(scanTypeFile.getPath()); - } catch (IOException e) { - e.printStackTrace(); - } - - String s; - - s = in.readLine(); // header - boolean hasScanTimes = false; - String[] hTokens = s.split("\t"); - if (hTokens.length > 3 && hTokens[3].toLowerCase().contains("time")) { - hasScanTimes = true; - } - - while ((s = in.readLine()) != null) { - String[] token = s.split("\t"); - if (token.length < 3) - continue; - - int scanNum = Integer.parseInt(token[0]); - String scanType = token[1].toLowerCase(); - - ActivationMethod method = null; - if (scanType.contains("etcid")) - method = ActivationMethod.ETD; - else if (scanType.contains("ethcd")) - method = ActivationMethod.ETD; - else if (scanType.contains("cid")) - method = ActivationMethod.CID; - else if (scanType.contains("etd")) - method = ActivationMethod.ETD; - else if (scanType.contains("hcd")) - method = ActivationMethod.HCD; - else if (scanType.contains("pqd")) - method = ActivationMethod.PQD; - - boolean isHighPrecision = false; - if (scanType.contains("hms")) - isHighPrecision = true; - - int msLevel = Integer.parseInt(token[2]); - - float scanTime = -1; - if (hasScanTimes && token.length > 3) { - scanTime = Float.parseFloat(token[3]); - } - - if (method != null) { - scanNumScanTypeMap.put(scanNum, new ScanType(method, isHighPrecision, msLevel, scanTime)); - } - } - - if (in != null) { - try { - in.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - return scanNumScanTypeMap; - } - - public static void main(String argv[]) throws Exception { - long time = System.currentTimeMillis(); - String fileName = System.getProperty("user.home") + "/Research/ToolDistribution/PNNLTest/QC_Shew_08_04_pt5_b_22Jan09_Owl_09-01-04_dta.txt"; - SpectraIterator itr = new SpectraIterator(fileName, new PNNLSpectrumParser()); - int numSpecs = 0; - HashSet scanNumSet = new HashSet(); - while (itr.hasNext()) { - Spectrum spec = itr.next(); - numSpecs++; - if (scanNumSet.contains(spec.getScanNum())) { - 
System.out.println(spec.getScanNum()); - } else - scanNumSet.add(spec.getScanNum()); -// System.out.println(spec+ "\t" + spec.getScanNum()+"\t"+(spec.getPrecursorMass()+(float)Composition.ChargeCarrierMass)+"\t"+spec.getCharge()); - } - System.out.println("NumSpecs: " + numSpecs); - System.out.println("Time: " + (System.currentTimeMillis() - time)); - - time = System.currentTimeMillis(); - SpectraMap map = new SpectraMap(fileName, new PNNLSpectrumParser()); - numSpecs = 0; - for (int specIndex : map.getSpecIndexList()) { - Spectrum spec = map.getSpectrumBySpecIndex(specIndex); - numSpecs++; -// System.out.println(spec+ "\t" + spec.getScanNum()+"\t"+(spec.getPrecursorMass()+(float)Composition.ChargeCarrierMass)+"\t"+spec.getCharge()); - } - System.out.println("NumSpecs: " + numSpecs); - System.out.println("Time: " + (System.currentTimeMillis() - time)); - } -} diff --git a/src/main/java/edu/ucsd/msjava/parser/PklSpectrumParser.java b/src/main/java/edu/ucsd/msjava/parser/PklSpectrumParser.java deleted file mode 100644 index 4129a936..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/PklSpectrumParser.java +++ /dev/null @@ -1,127 +0,0 @@ -package edu.ucsd.msjava.parser; - -import edu.ucsd.msjava.msutil.*; - -import java.util.Collections; -import java.util.Hashtable; -import java.util.Map; - -/** - * A class that parses Pkl format - * - * @author sangtaekim - */ -public class PklSpectrumParser implements SpectrumParser { - - /** - * Number of scans where we could not determine the scan number - * This method is required by interface SpectrumParser - * However, this class does not keep track of spectra without a scan number - * - * @return - */ - public long getScanMissingWarningCount() { - return 0; - } - - /** - * Reads a spectrum from pkl file and returns it. - * - * @param lineReader A LineReader object points to the start of a spectrum. - * @return a spectrum object. 
- */ - public Spectrum readSpectrum(LineReader lineReader) { - Spectrum spec = null; - - boolean sorted = true; - float prevMass = 0; - - String buf; - while ((buf = lineReader.readLine()) != null) { - String[] token = buf.split("\\s+"); - if (token.length == 3) // start of a spectrum - { - float precursorMz = Float.parseFloat(token[0]); - float precursorIntensity = Float.parseFloat(token[1]); - int charge = Integer.parseInt(token[2]); - spec = new Spectrum(precursorMz, charge, precursorIntensity); - } else if (token.length == 2) // a peak - { - assert (spec != null); - float mass = Float.parseFloat(token[0]); - if (sorted && mass < prevMass) - sorted = false; - else - prevMass = mass; -// if(token[1].endsWith("null")) -// token[1] = token[1].substring(0, token[1].lastIndexOf("null")); - float intensity = Float.parseFloat(token[1]); - spec.add(new Peak(mass, intensity, 1)); - } else // end of a spectrum - { - if (spec != null) { - if (!sorted) - Collections.sort(spec); - return spec; - } - } - } - return spec; - } - - /** - * Read the entire pkl file and generates a map from spectrum indexes to file positions of spectra. - * - * @param lineReader A reader points to the start of the spectrum. - * @return A Hashtable object maps a spectrum index into a file position. 
- */ - public Map getSpecMetaInfoMap( - BufferedRandomAccessLineReader lineReader) { - Hashtable specIndexMap = new Hashtable(); - String buf; - long offset = 0; - int specIndex = 0; - while ((buf = lineReader.readLine()) != null) { - String[] token = buf.split("\\s+"); - if (token.length == 3) // start of a spectrum - { -// specIndexMap.put(++specIndex, offset); - ++specIndex; - float precursorMz = Float.parseFloat(token[0]); - SpectrumMetaInfo metaInfo = new SpectrumMetaInfo(); - metaInfo.setID("index=" + (specIndex - 1)); - metaInfo.setPrecursorMz(precursorMz); - metaInfo.setPosition(offset); - specIndexMap.put(specIndex, metaInfo); - } - - offset = lineReader.getPosition(); - } - return specIndexMap; - } - - public static void test() throws Exception { - String fileName = System.getProperty("user.home") + "/Research/ToolDistribution/RefTest/SpecFormatTest/TestSpectra.pkl"; - SpectraIterator iterator = new SpectraIterator(fileName, new PklSpectrumParser()); - int numSpecs = 0; - while (iterator.hasNext()) { - Spectrum spec = iterator.next(); - numSpecs++; - System.out.println(spec.getPrecursorPeak().getMz() + " " + spec.getCharge() + " " + spec.getSpecIndex() + " " + spec.getScanNum()); - } - System.out.println("NumSpectra: " + numSpecs); - - numSpecs = 0; - SpectraMap map = new SpectraMap(fileName, new PklSpectrumParser()); - for (int specIndex : map.getSpecIndexList()) { - Spectrum spec = map.getSpectrumBySpecIndex(specIndex); - numSpecs++; - System.out.println(spec.getPrecursorPeak().getMz() + " " + spec.getCharge() + " " + spec.getSpecIndex() + " " + spec.getScanNum()); - } - System.out.println("NumSpectra: " + numSpecs); - } - - public static void main(String argv[]) throws Exception { - test(); - } -} diff --git a/src/main/java/edu/ucsd/msjava/parser/SPTxtParser.java b/src/main/java/edu/ucsd/msjava/parser/SPTxtParser.java deleted file mode 100644 index ecd13ae1..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/SPTxtParser.java +++ /dev/null @@ -1,140 
+0,0 @@ -package edu.ucsd.msjava.parser; - -import edu.ucsd.msjava.msutil.*; - -import java.util.Hashtable; -import java.util.Map; - -public class SPTxtParser implements SpectrumParserWithTitle { - - /** - * Number of scans where we could not determine the scan number - * This method is required by interface SpectrumParser - * However, this class does not keep track of spectra without a scan number - * @return - */ - public long getScanMissingWarningCount() - { - return 0; - } - - public Spectrum readSpectrum(LineReader lineReader) { - Spectrum spec = null; - - String buf; - - buf = lineReader.readLine(); // Name: n[43]GAAA....MAR/1 - String[] nameToken = buf.split("\\s+"); - String name = nameToken[1]; - Pair namePair = parseSPTXTName(name); - - String pepSeq = namePair.getFirst(); - int precursorCharge = namePair.getSecond(); - - spec = new Spectrum(); - Peptide pep = new Peptide(pepSeq, AminoAcidSet.getStandardAminoAcidSet()); - spec.setAnnotation(pep); - spec.setTitle(namePair.getFirst() + ":" + namePair.getSecond()); - - float precursorMz = 0; - boolean parse = false; - while ((buf = lineReader.readLine()) != null) { - if (buf.startsWith("NumPeaks:")) { - parse = true; - } else if (buf.startsWith("PrecursorMZ")) { - String[] token = buf.split("\\s+"); - precursorMz = Float.parseFloat(token[1]); - } else if (buf.trim().length() == 0) { - assert (spec != null); - spec.setPrecursor(new Peak(precursorMz, 0, precursorCharge)); - return spec; - } else if (parse && Character.isDigit(buf.charAt(0))) { - String[] token = buf.split("\\s+"); - if (token.length < 2) - continue; - float mass = Float.parseFloat(token[0]); - float intensity = Float.parseFloat(token[1]); - spec.add(new Peak(mass, intensity, 1)); - } - } - return null; - } - - public Map getSpecMetaInfoMap(BufferedRandomAccessLineReader lineReader) { - Hashtable specIndexMap = new Hashtable(); - String buf; - long offset = 0; - int specIndex = 0; - SpectrumMetaInfo metaInfo = null; - while ((buf = 
lineReader.readLine()) != null) { - if (buf.startsWith("Name:")) { - specIndex++; - metaInfo = new SpectrumMetaInfo(); - metaInfo.setID("index=" + (specIndex - 1)); - metaInfo.setPosition(offset); - specIndexMap.put(specIndex, metaInfo); - } else if (buf.startsWith("PrecursorMZ")) { - String[] token = buf.split("\\s+"); - float precursorMz = Float.parseFloat(token[1]); - metaInfo.setPrecursorMz(precursorMz); - } - offset = lineReader.getPosition(); - } - return specIndexMap; - } - - public Hashtable getTitleToSpecIndexMap(BufferedRandomAccessLineReader lineReader) { - Hashtable titleToSpecIndexMap = new Hashtable(); - String buf; - int specIndex = 0; - while ((buf = lineReader.readLine()) != null) { - if (buf.startsWith("Name:")) { - specIndex++; - Pair pair = parseSPTXTName(buf.split("\\s+")[1]); - titleToSpecIndexMap.put(pair.getFirst() + ":" + pair.getSecond(), specIndex); - } - } - return titleToSpecIndexMap; - } - - public static Pair parseSPTXTName(String name) { - String annotationStr = name.substring(0, name.lastIndexOf('/')); - StringBuffer pepBuf = new StringBuffer(); - int startIndex = 0; - if (annotationStr.startsWith("n[43]")) { - pepBuf.append("+42"); - startIndex = 5; - } - char prevAA = '\0'; - for (int i = startIndex; i < annotationStr.length(); i++) { - char c = annotationStr.charAt(i); - if (Character.isUpperCase(c)) - pepBuf.append(c); - else if (c == '[') { - StringBuffer massBuf = new StringBuffer(); - while (annotationStr.charAt(++i) != ']') - massBuf.append(annotationStr.charAt(i)); - int mass = Integer.parseInt(massBuf.toString()); - int residueMass = AminoAcidSet.getStandardAminoAcidSet().getAminoAcid(prevAA).getNominalMass(); - int delMass = mass - residueMass; - if (delMass > 0) - pepBuf.append("+"); - pepBuf.append(delMass); - } - prevAA = c; - } - - int charge = Integer.parseInt(name.substring(name.lastIndexOf('/') + 1)); - - return new Pair(pepBuf.toString(), charge); - } - - public static void main(String argv[]) throws Exception { - 
String fileName = "/home/sangtaekim/Research/Data/NISTLib/human_targetdecoy_spectrast.sptxt"; - SpectraMapByTitle map = new SpectraMapByTitle(fileName, new SPTxtParser()); - System.out.println("Parsing complete."); - Spectrum spec = map.getSpectrumByTitle("+42AAAAAAGAGPEM+16VRGQVFDVGPR:3"); - System.out.println(spec.getSpecIndex() + "\t" + spec.size()); - - } -} diff --git a/src/main/java/edu/ucsd/msjava/parser/SpectrumParserWithTitle.java b/src/main/java/edu/ucsd/msjava/parser/SpectrumParserWithTitle.java deleted file mode 100644 index 2be75af0..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/SpectrumParserWithTitle.java +++ /dev/null @@ -1,8 +0,0 @@ -package edu.ucsd.msjava.parser; - -import java.util.Hashtable; - - -public interface SpectrumParserWithTitle extends SpectrumParser { - Hashtable getTitleToSpecIndexMap(BufferedRandomAccessLineReader lineReader); // title -> specIndex -} diff --git a/src/main/java/edu/ucsd/msjava/parser/TSVParser.java b/src/main/java/edu/ucsd/msjava/parser/TSVParser.java deleted file mode 100644 index 6d661ad8..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/TSVParser.java +++ /dev/null @@ -1,44 +0,0 @@ -package edu.ucsd.msjava.parser; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; - - -public class TSVParser { - public TSVParser() { - - } - - private HashMap> map = new HashMap>(); - - public ArrayList getList(String label) { - return map.get(label); - } - - public void parse(String fileName) { - BufferedLineReader in = null; - try { - in = new BufferedLineReader(fileName); - } catch (IOException e) { - e.printStackTrace(); - } - - String labelRow = in.readLine(); - String[] labelArr = labelRow.split("\t"); - for (String label : labelArr) - map.put(label, new ArrayList()); - - String s; - while ((s = in.readLine()) != null) { - if (s.startsWith("#")) - continue; - String[] token = s.split("\t"); - if (token.length != labelArr.length) - continue; - 
for (int i = 0; i < labelArr.length; i++) - map.get(labelArr[i]).add(token[i]); - } - } -} diff --git a/src/main/java/edu/ucsd/msjava/parser/TSVResultParser.java b/src/main/java/edu/ucsd/msjava/parser/TSVResultParser.java deleted file mode 100644 index 3cc650cc..00000000 --- a/src/main/java/edu/ucsd/msjava/parser/TSVResultParser.java +++ /dev/null @@ -1,133 +0,0 @@ -package edu.ucsd.msjava.parser; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -public class TSVResultParser { - private File tsvFile; - private Set pepSet; - private Set scanSet; - private Set idSet; - private Map idToSpecEValue; - - public TSVResultParser(File tsvFile) { - this.tsvFile = tsvFile; - } - - public Set getPepSet() { - return pepSet; - } - - public Set getScanSet() { - return scanSet; - } - - public Set getIdSet() { - return idSet; - } - - public Float getSpecEValue(String id) { - return idToSpecEValue.get(id); - } - - public String parse(float fdrThreshold) { - BufferedLineReader in = null; - try { - in = new BufferedLineReader(tsvFile.getPath()); - } catch (IOException e) { - e.printStackTrace(); - } - String header = in.readLine(); - if (!header.startsWith("#") && !header.startsWith("Result")) - return "No header!"; - - String[] headerToken = header.split("\t"); - int specQValueColNum = -1; - int pepQValueColNum = -1; - int pepColNum = -1; - int scanNumCol = -1; - int idCol = -1; - int specEValueCol = -1; - for (int i = 0; i < headerToken.length; i++) { - if (headerToken[i].equalsIgnoreCase("FDR") || headerToken[i].equalsIgnoreCase("QValue") || headerToken[i].equalsIgnoreCase("q-value")) - specQValueColNum = i; - if (headerToken[i].equalsIgnoreCase("PepFDR") || headerToken[i].equalsIgnoreCase("PepQValue")) - pepQValueColNum = i; - if (headerToken[i].equalsIgnoreCase("Peptide") || headerToken[i].equalsIgnoreCase("Annotation")) - pepColNum = i; 
- if (headerToken[i].equalsIgnoreCase("ScanNum") || headerToken[i].equalsIgnoreCase("Scan#") || headerToken[i].equalsIgnoreCase("Scan")) - scanNumCol = i; - if (headerToken[i].equalsIgnoreCase("SpecID")) - idCol = i; - if (headerToken[i].equalsIgnoreCase("SpecEValue") || headerToken[i].equalsIgnoreCase("SpecProb")) - specEValueCol = i; - } - if (specQValueColNum < 0) - return "QValue column is missing!"; - if (pepQValueColNum < 0) - return "PepQValue column is missing!"; - if (pepColNum < 0) - return "Annotation column is missing!"; - if (scanNumCol < 0) - return "Scan column is missing!"; - if (idCol < 0) - return "SpecID column is missing!"; - if (specEValueCol < 0) - return "SpecEValue column is missing!"; - - String s; - pepSet = new HashSet(); - scanSet = new HashSet(); - idSet = new HashSet(); - idToSpecEValue = new HashMap(); - while ((s = in.readLine()) != null) { - if (s.startsWith("#")) - continue; - String[] token = s.split("\t"); - if (token.length <= specQValueColNum || token.length <= pepQValueColNum || token.length <= pepColNum - || token.length <= idCol || token.length <= specEValueCol) - continue; - double specQValue = Double.parseDouble(token[specQValueColNum]); - double pepQValue = Double.parseDouble(token[pepQValueColNum]); - float specEValue = Float.parseFloat(token[specEValueCol]); -// if(token[scanNumCol].equals("6804")) -// System.out.println("Debug"); - idToSpecEValue.put(token[idCol], specEValue); - - if (specQValue <= fdrThreshold) { - scanSet.add(token[scanNumCol]); - idSet.add(token[idCol]); - } - if (pepQValue <= fdrThreshold) { - String annotation = token[pepColNum]; - - String pepStr; - - if (annotation.matches("[A-Z\\-_]?\\..+\\.[A-Z\\-_]?")) - pepStr = annotation.substring(annotation.indexOf('.') + 1, annotation.lastIndexOf('.')); - else - pepStr = annotation; - - StringBuffer unmodStr = new StringBuffer(); - for (int i = 0; i < pepStr.length(); i++) - if (Character.isLetter(pepStr.charAt(i))) - unmodStr.append(pepStr.charAt(i)); - 
- pepSet.add(unmodStr.toString()); - } - } - - try { - in.close(); - } catch (IOException e) { - e.printStackTrace(); - } - - return null; - } -} diff --git a/src/test/java/msgfplus/TestMisc.java b/src/test/java/msgfplus/TestMisc.java index 7af58d30..467af263 100644 --- a/src/test/java/msgfplus/TestMisc.java +++ b/src/test/java/msgfplus/TestMisc.java @@ -23,7 +23,6 @@ import edu.ucsd.msjava.msutil.Protocol; import edu.ucsd.msjava.msutil.SpectraAccessor; import edu.ucsd.msjava.msutil.Spectrum; -import edu.ucsd.msjava.parser.TSVParser; public class TestMisc { @@ -145,79 +144,6 @@ public void generateTRexPRMSpectrum() System.out.println("END IONS"); } - @Test - @Ignore - public void generateTRexPRMSpectra() - { - File outputFile = new File("D:\\Research\\Data\\TRex\\MaxCharge4\\TRex48216_Vectors.txt"); - PrintStream out = null; - try { - out = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFile))); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - - AminoAcidSet aaSet = AminoAcidSet.getStandardAminoAcidSet(); - File idFile = new File("D:\\Research\\Data\\TRex\\MaxCharge4\\NoDecoy.tsv"); - HashMap titleToNominalMass = new HashMap(); - TSVParser parser = new TSVParser(); - parser.parse(idFile.getPath()); - ArrayList titleList = parser.getList("Title"); - ArrayList peptideList = parser.getList("Peptide"); - ArrayList specEValueList = parser.getList("SpecEValue"); - for(int i=0; i 1E-10) continue; - Peptide peptide = new Peptide(peptideList.get(i), aaSet); - int nominalMass = peptide.getNominalMass(); - String title = titleList.get(i); - titleToNominalMass.put(title, nominalMass); - } - - NewRankScorer scorer = NewScorerFactory.get(ActivationMethod.CID, InstrumentType.LOW_RESOLUTION_LTQ, Enzyme.TRYPSIN, Protocol.STANDARD); - scorer.doNotUseError(); - - File specFile = new File("D:\\Research\\Data\\TRex\\TRex48216.mgf"); - SpectraAccessor accessor = new SpectraAccessor(specFile); - Iterator itr = accessor.getSpecItr(); - 
while(itr.hasNext()) - { - Spectrum spec = accessor.getSpecItr().next(); - String title = spec.getTitle(); - int nominalMass; - if(titleToNominalMass.containsKey(title)) nominalMass = titleToNominalMass.get(title); - else nominalMass = NominalMass.toNominalMass(spec.getPrecursorMass()) - 18; - - NewScoredSpectrum scoredSpec = scorer.getScoredSpectrum(spec); - - // PRM spectrum - //out.println("BEGIN IONS"); - out.println("SCAN="+spec.getScanNum()); -// if(spec.getTitle() != null) -// out.println(" " + spec.getTitle()); -// else -// out.println(); -// if(spec.getAnnotation() != null) -// out.println("SEQ=" + spec.getAnnotationStr()); -// out.println("PEPMASS=" + spec.getPrecursorPeak().getMz()); - out.println("PEPTIDE_MASS=" + nominalMass); -// out.println("SCANS=" + spec.getScanNum()); -// out.println("CHARGE="+spec.getCharge()+"+"); - -// int peptideNominalMass = 1272; - for(int m=1; m Date: Mon, 27 Apr 2026 06:47:40 +0100 Subject: [PATCH 21/34] refactor: rename parser/ package to mgf/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure file-move + package/import update; no behaviour change. 
- git mv the 6 remaining files under parser/ into mgf/ (BufferedLineReader, BufferedRandomAccessLineReader, LineReader, MgfSpectrumParser, SpectrumParser, UnicodeBOMInputStream) - Update package declaration in each moved file from edu.ucsd.msjava.parser → edu.ucsd.msjava.mgf - Update import edu.ucsd.msjava.parser.* → edu.ucsd.msjava.mgf.* across 13 callers in fdr/, mzml/, msutil/, msscorer/, msdbsearch/ --- src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java | 2 +- .../edu/ucsd/msjava/{parser => mgf}/BufferedLineReader.java | 2 +- .../{parser => mgf}/BufferedRandomAccessLineReader.java | 2 +- .../java/edu/ucsd/msjava/{parser => mgf}/LineReader.java | 2 +- .../edu/ucsd/msjava/{parser => mgf}/MgfSpectrumParser.java | 2 +- .../edu/ucsd/msjava/{parser => mgf}/SpectrumParser.java | 2 +- .../ucsd/msjava/{parser => mgf}/UnicodeBOMInputStream.java | 2 +- src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java | 2 +- .../java/edu/ucsd/msjava/msdbsearch/LibraryScanner.java | 2 +- .../edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java | 2 +- .../msscorer/ScoringParameterGeneratorWithErrors.java | 2 +- src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java | 2 +- src/main/java/edu/ucsd/msjava/msutil/SpecKey.java | 2 +- src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java | 4 ++-- src/main/java/edu/ucsd/msjava/msutil/SpectraContainer.java | 2 +- src/main/java/edu/ucsd/msjava/msutil/SpectraIterator.java | 6 +++--- src/main/java/edu/ucsd/msjava/msutil/SpectraMap.java | 4 ++-- src/main/java/edu/ucsd/msjava/msutil/UserParam.java | 2 +- .../java/edu/ucsd/msjava/mzml/StaxMzMLSpectraIterator.java | 2 +- 19 files changed, 23 insertions(+), 23 deletions(-) rename src/main/java/edu/ucsd/msjava/{parser => mgf}/BufferedLineReader.java (94%) rename src/main/java/edu/ucsd/msjava/{parser => mgf}/BufferedRandomAccessLineReader.java (99%) rename src/main/java/edu/ucsd/msjava/{parser => mgf}/LineReader.java (63%) rename src/main/java/edu/ucsd/msjava/{parser => 
mgf}/MgfSpectrumParser.java (99%) rename src/main/java/edu/ucsd/msjava/{parser => mgf}/SpectrumParser.java (94%) rename src/main/java/edu/ucsd/msjava/{parser => mgf}/UnicodeBOMInputStream.java (99%) diff --git a/src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java b/src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java index 28196b04..d136b894 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java +++ b/src/main/java/edu/ucsd/msjava/fdr/ComputeQValue.java @@ -1,6 +1,6 @@ package edu.ucsd.msjava.fdr; -import edu.ucsd.msjava.parser.BufferedLineReader; +import edu.ucsd.msjava.mgf.BufferedLineReader; import edu.ucsd.msjava.cli.MSGFPlus; import java.io.File; diff --git a/src/main/java/edu/ucsd/msjava/parser/BufferedLineReader.java b/src/main/java/edu/ucsd/msjava/mgf/BufferedLineReader.java similarity index 94% rename from src/main/java/edu/ucsd/msjava/parser/BufferedLineReader.java rename to src/main/java/edu/ucsd/msjava/mgf/BufferedLineReader.java index 3eddae90..d068aed6 100644 --- a/src/main/java/edu/ucsd/msjava/parser/BufferedLineReader.java +++ b/src/main/java/edu/ucsd/msjava/mgf/BufferedLineReader.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.parser; +package edu.ucsd.msjava.mgf; import java.io.*; diff --git a/src/main/java/edu/ucsd/msjava/parser/BufferedRandomAccessLineReader.java b/src/main/java/edu/ucsd/msjava/mgf/BufferedRandomAccessLineReader.java similarity index 99% rename from src/main/java/edu/ucsd/msjava/parser/BufferedRandomAccessLineReader.java rename to src/main/java/edu/ucsd/msjava/mgf/BufferedRandomAccessLineReader.java index 3216e238..a3422380 100644 --- a/src/main/java/edu/ucsd/msjava/parser/BufferedRandomAccessLineReader.java +++ b/src/main/java/edu/ucsd/msjava/mgf/BufferedRandomAccessLineReader.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.parser; +package edu.ucsd.msjava.mgf; import org.apache.commons.lang3.tuple.Pair; diff --git a/src/main/java/edu/ucsd/msjava/parser/LineReader.java b/src/main/java/edu/ucsd/msjava/mgf/LineReader.java 
similarity index 63% rename from src/main/java/edu/ucsd/msjava/parser/LineReader.java rename to src/main/java/edu/ucsd/msjava/mgf/LineReader.java index c0f31e74..f0217a4a 100644 --- a/src/main/java/edu/ucsd/msjava/parser/LineReader.java +++ b/src/main/java/edu/ucsd/msjava/mgf/LineReader.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.parser; +package edu.ucsd.msjava.mgf; public interface LineReader { String readLine(); diff --git a/src/main/java/edu/ucsd/msjava/parser/MgfSpectrumParser.java b/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java similarity index 99% rename from src/main/java/edu/ucsd/msjava/parser/MgfSpectrumParser.java rename to src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java index e805a781..e8ed1e80 100644 --- a/src/main/java/edu/ucsd/msjava/parser/MgfSpectrumParser.java +++ b/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.parser; +package edu.ucsd.msjava.mgf; import edu.ucsd.msjava.msutil.*; diff --git a/src/main/java/edu/ucsd/msjava/parser/SpectrumParser.java b/src/main/java/edu/ucsd/msjava/mgf/SpectrumParser.java similarity index 94% rename from src/main/java/edu/ucsd/msjava/parser/SpectrumParser.java rename to src/main/java/edu/ucsd/msjava/mgf/SpectrumParser.java index f659b055..86856b18 100644 --- a/src/main/java/edu/ucsd/msjava/parser/SpectrumParser.java +++ b/src/main/java/edu/ucsd/msjava/mgf/SpectrumParser.java @@ -1,4 +1,4 @@ -package edu.ucsd.msjava.parser; +package edu.ucsd.msjava.mgf; import edu.ucsd.msjava.msutil.Spectrum; import edu.ucsd.msjava.msutil.SpectrumMetaInfo; diff --git a/src/main/java/edu/ucsd/msjava/parser/UnicodeBOMInputStream.java b/src/main/java/edu/ucsd/msjava/mgf/UnicodeBOMInputStream.java similarity index 99% rename from src/main/java/edu/ucsd/msjava/parser/UnicodeBOMInputStream.java rename to src/main/java/edu/ucsd/msjava/mgf/UnicodeBOMInputStream.java index 87dce4d1..67a70b53 100644 --- a/src/main/java/edu/ucsd/msjava/parser/UnicodeBOMInputStream.java 
+++ b/src/main/java/edu/ucsd/msjava/mgf/UnicodeBOMInputStream.java @@ -1,6 +1,6 @@ // (‑●‑●)> released under the WTFPL v2 license, by Gregory Pakosz (@gpakosz) -package edu.ucsd.msjava.parser; +package edu.ucsd.msjava.mgf; import java.io.IOException; import java.io.InputStream; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java b/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java index d16d4524..2ac946af 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java @@ -6,7 +6,7 @@ import edu.ucsd.msjava.msscorer.SimpleDBSearchScorer; import edu.ucsd.msjava.msutil.*; import edu.ucsd.msjava.msutil.Modification.Location; -import edu.ucsd.msjava.parser.BufferedLineReader; +import edu.ucsd.msjava.mgf.BufferedLineReader; import edu.ucsd.msjava.sequences.Constants; import java.io.*; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/LibraryScanner.java b/src/main/java/edu/ucsd/msjava/msdbsearch/LibraryScanner.java index 5f6821a7..5f7fb7a8 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/LibraryScanner.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/LibraryScanner.java @@ -4,7 +4,7 @@ import edu.ucsd.msjava.msscorer.SimpleDBSearchScorer; import edu.ucsd.msjava.msutil.*; import edu.ucsd.msjava.msutil.Modification.Location; -import edu.ucsd.msjava.parser.BufferedLineReader; +import edu.ucsd.msjava.mgf.BufferedLineReader; import java.io.FileNotFoundException; import java.io.IOException; diff --git a/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java b/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java index 1c570c37..62fee4b4 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java @@ -5,7 +5,7 @@ import edu.ucsd.msjava.msgf.Tolerance; import edu.ucsd.msjava.msscorer.NewScorerFactory.SpecDataType; import edu.ucsd.msjava.msutil.*; 
-import edu.ucsd.msjava.parser.MgfSpectrumParser; +import edu.ucsd.msjava.mgf.MgfSpectrumParser; import java.io.File; import java.util.*; diff --git a/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGeneratorWithErrors.java b/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGeneratorWithErrors.java index d2891aed..8cedf8e6 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGeneratorWithErrors.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGeneratorWithErrors.java @@ -7,7 +7,7 @@ import edu.ucsd.msjava.msscorer.NewScorerFactory.SpecDataType; import edu.ucsd.msjava.msutil.*; import edu.ucsd.msjava.msutil.IonType.PrefixIon; -import edu.ucsd.msjava.parser.MgfSpectrumParser; +import edu.ucsd.msjava.mgf.MgfSpectrumParser; import java.io.File; import java.util.*; diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java index ee05daa9..102961bf 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java @@ -3,7 +3,7 @@ import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.msdbsearch.SearchParams; import edu.ucsd.msjava.msutil.Modification.Location; -import edu.ucsd.msjava.parser.BufferedLineReader; +import edu.ucsd.msjava.mgf.BufferedLineReader; import java.io.File; import java.io.IOException; diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java b/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java index b222c8be..c03a0ee0 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java @@ -1,6 +1,6 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.parser.SpectrumParser; +import edu.ucsd.msjava.mgf.SpectrumParser; import java.util.ArrayList; import java.util.Collections; diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java index 
56b51ef3..223644b4 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java @@ -3,8 +3,8 @@ import edu.ucsd.msjava.mzml.StaxMzMLParser; import edu.ucsd.msjava.mzml.StaxMzMLSpectraIterator; import edu.ucsd.msjava.mzml.StaxMzMLSpectraMap; -import edu.ucsd.msjava.parser.MgfSpectrumParser; -import edu.ucsd.msjava.parser.SpectrumParser; +import edu.ucsd.msjava.mgf.MgfSpectrumParser; +import edu.ucsd.msjava.mgf.SpectrumParser; import java.io.File; import java.io.IOException; diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraContainer.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraContainer.java index 0435d1eb..b0cad8be 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectraContainer.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpectraContainer.java @@ -1,6 +1,6 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.parser.SpectrumParser; +import edu.ucsd.msjava.mgf.SpectrumParser; import java.io.*; import java.util.ArrayList; diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraIterator.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraIterator.java index 2962bdc9..7def0abe 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectraIterator.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpectraIterator.java @@ -1,8 +1,8 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.parser.BufferedLineReader; -import edu.ucsd.msjava.parser.LineReader; -import edu.ucsd.msjava.parser.SpectrumParser; +import edu.ucsd.msjava.mgf.BufferedLineReader; +import edu.ucsd.msjava.mgf.LineReader; +import edu.ucsd.msjava.mgf.SpectrumParser; import java.io.FileNotFoundException; import java.io.IOException; diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraMap.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraMap.java index f82780f1..974c5360 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectraMap.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpectraMap.java @@ -1,7 +1,7 @@ 
package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.parser.BufferedRandomAccessLineReader; -import edu.ucsd.msjava.parser.SpectrumParser; +import edu.ucsd.msjava.mgf.BufferedRandomAccessLineReader; +import edu.ucsd.msjava.mgf.SpectrumParser; import java.util.*; import java.util.Map.Entry; diff --git a/src/main/java/edu/ucsd/msjava/msutil/UserParam.java b/src/main/java/edu/ucsd/msjava/msutil/UserParam.java index 97903fbc..f286fc48 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/UserParam.java +++ b/src/main/java/edu/ucsd/msjava/msutil/UserParam.java @@ -1,6 +1,6 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.parser.BufferedLineReader; +import edu.ucsd.msjava.mgf.BufferedLineReader; import java.io.FileNotFoundException; import java.io.IOException; diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLSpectraIterator.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLSpectraIterator.java index 22dbe1e5..d92ecfb3 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLSpectraIterator.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLSpectraIterator.java @@ -1,7 +1,7 @@ package edu.ucsd.msjava.mzml; import edu.ucsd.msjava.msutil.Spectrum; -import edu.ucsd.msjava.parser.SpectrumParser; +import edu.ucsd.msjava.mgf.SpectrumParser; import java.util.Iterator; import java.util.NoSuchElementException; From 85d0afe24908af05670a8ea7caea2b132bdb4c31 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 06:59:14 +0100 Subject: [PATCH 22/34] fix(cli): CustomAA= config-file crash + 3 picocli polish issues The Phase 4c review surfaced four issues that all bottomed out in small details of the new typed-options path. Bundling the fixes plus a regression test: 1. 
**CustomAA= crash (critical, was code review issue #1).** AminoAcidSet.getAminoAcidSetFromModEntries was prepending "CustomAA=" to each entry before handing it to parseConfigEntry, but parseConfigEntry only strips the "nummods=" prefix -- every other line is split on commas and modInfo[0] is parsed as a mass or empirical formula. With the prefix attached, modInfo[0] became "CustomAA=C3H5NO" which fails Double.parseDouble + Composition.getMass and triggers the System.exit(-1) in the caller. Any -conf file with a CustomAA= line crashed the process. MSGFPlusOptions.applyConfigEntry already strips the "Key=" prefix when populating opts.customAAs; the fix is to drop the literal at AminoAcidSet:826 so the bare value reaches parseConfigEntry, matching how staticMods/dynamicMods are passed at line 831. New regression test MSGFPlusOptionsConfigFileTest.configFileWithCustomAAParsesWithoutCrashing pins this with a tiny synthetic config file. 2. **-decoy default doc.** @Option description now says "Default: XXX" (the actual value returned by effectiveDecoyPrefix and the same constant the legacy code used via MSGFPlus.DEFAULT_DECOY_PROTEIN_PREFIX). The "Default: DECOY_" string was wrong since Phase 1a. 3. **Single-file spectrum-format null check.** The single-file branch in SearchParams.parse used to silently store a null SpecFileFormat into DBSearchIOFiles when the user supplied -s file.bogus, which later NPE'd at MSGFPlus:305 (specFormat.getPSIName()). It now short-circuits with the same message the directory branch's isSupportedSpectrumFormat filter implies: "Spectrum file extension does not match a supported format (*.mzML, *.mgf): ". 4. **Unrecognized config-key URL hint.** The legacy parseConfigParamFile tracked an invalid-parameter counter and, after closing the file, printed the example-params URL hint exactly once if the count was non-zero. 
MSGFPlusOptions.applyConfigFile now restores that behaviour with a private unrecognizedConfigEntries counter incremented inside the default branch of applyConfigEntry, plus the same end-of-file hint. Scoped tests pass (68 tests, 0 failures, 0 errors). --- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 14 ++++- .../ucsd/msjava/msdbsearch/SearchParams.java | 3 + .../edu/ucsd/msjava/msutil/AminoAcidSet.java | 5 +- .../cli/MSGFPlusOptionsConfigFileTest.java | 60 +++++++++++++++++++ 4 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index e6416223..7a877754 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -60,7 +60,7 @@ public final class MSGFPlusOptions { public File outputFile; @Option(names = "-decoy", paramLabel = "Prefix", - description = "Decoy protein prefix; Default: DECOY_") + description = "Decoy protein prefix; Default: XXX") public String decoyPrefix; // ---------- precursor mass tolerance ---------- @@ -331,6 +331,7 @@ public Protocol effectiveProtocol() { * @return null on success, error string otherwise. 
*/ public String applyConfigFile(File file) { + int unrecognizedCount = 0; try (BufferedReader reader = new BufferedReader(new FileReader(file))) { String line; int lineNum = 0; @@ -343,17 +344,27 @@ public String applyConfigFile(File file) { String rawKey = trimmed.substring(0, eq).trim(); String value = trimmed.substring(eq + 1).trim(); String key = canonicalConfigKey(rawKey); + int before = unrecognizedConfigEntries; String err = applyConfigEntry(key, value, file.getName()); if (err != null) { return "Error parsing line " + lineNum + " of " + file.getName() + ": " + err; } + if (unrecognizedConfigEntries > before) unrecognizedCount++; } } catch (IOException e) { return "Error reading config file " + file.getPath() + ": " + e.getMessage(); } + if (unrecognizedCount > 0) { + System.out.println("Valid parameters are described in the example parameter file at " + + "https://github.com/MSGFPlus/msgfplus/blob/master/docs/examples/MSGFPlus_Params.txt"); + } return null; } + /** Counter incremented inside {@link #applyConfigEntry} whenever an unknown + * config-file key is seen; surfaced via the end-of-file URL hint. */ + private int unrecognizedConfigEntries; + private String applyConfigEntry(String key, String value, String fileName) { // Repeated entries: collect into lists. "none" is treated as no entry. 
if (key.equalsIgnoreCase("DynamicMod")) { @@ -415,6 +426,7 @@ private String applyConfigEntry(String key, String value, String fileName) { default: if (!key.toLowerCase().startsWith("enzymedef")) { System.out.println("Warning, unrecognized parameter '" + key + "=" + value + "' in config file " + fileName); + unrecognizedConfigEntries++; } return null; } diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index 2c66a799..9897f010 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -351,6 +351,9 @@ public String parse(MSGFPlusOptions opts) { if (!specPath.isDirectory()) { SpecFileFormat specFormat = SpecFileFormat.getSpecFileFormat(specPath.getName()); + if (!isSupportedSpectrumFormat(specFormat)) { + return "Spectrum file extension does not match a supported format (*.mzML, *.mgf): " + specPath.getName(); + } File outputFile = opts.outputFile; if (outputFile == null) { String outputFilePath = specPath.getPath().substring(0, specPath.getPath().lastIndexOf('.')) + defaultExt; diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java index 102961bf..96bda071 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java @@ -823,7 +823,10 @@ public static AminoAcidSet getAminoAcidSetFromModEntries( ModificationMetadata modMetadata = new ModificationMetadata(opts.effectiveMaxNumMods()); for (int i = 0; i < customAAEntries.size(); i++) { - if (!parseConfigEntry(configName, i + 1, "CustomAA=" + customAAEntries.get(i), mods, customAA, modMetadata)) { + // parseConfigEntry expects bare comma-separated mod definitions, not + // a "Key=value" line. MSGFPlusOptions.applyConfigEntry already strips + // the "CustomAA=" prefix when populating opts.customAAs. 
+ if (!parseConfigEntry(configName, i + 1, customAAEntries.get(i), mods, customAA, modMetadata)) { System.exit(-1); } } diff --git a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java new file mode 100644 index 00000000..c900ff01 --- /dev/null +++ b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java @@ -0,0 +1,60 @@ +package edu.ucsd.msjava.cli; + +import edu.ucsd.msjava.msdbsearch.SearchParams; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Regression tests for {@link MSGFPlusOptions#applyConfigFile} and the + * downstream {@link SearchParams#parse} path. + * + * Pins the {@code CustomAA=} crash that was caught in code review: the + * legacy hashtable-based config-file reader passed bare values to + * {@code AminoAcidSet.parseConfigEntry}, but the modernized adapter + * briefly re-prepended {@code "CustomAA="} which {@code parseConfigEntry} + * does not strip — every {@code -conf} invocation containing a + * {@code CustomAA=} line crashed via {@code System.exit(-1)}. + */ +public class MSGFPlusOptionsConfigFileTest { + + @Test + public void configFileWithCustomAAParsesWithoutCrashing() throws IOException, URISyntaxException { + // Build a minimal config file with the documented CustomAA= form. 
+ Path tmpDir = Files.createTempDirectory("msgfplus-customaa-"); + Path conf = tmpDir.resolve("with_custom_aa.txt"); + Files.write(conf, ("# Regression for the CustomAA= prefix bug\n" + + "CustomAA=C3H5NO, U, custom, U, Selenocysteine\n" + + "MinPepLength=7\n").getBytes(StandardCharsets.UTF_8)); + + URI specUri = MSGFPlusOptionsConfigFileTest.class.getClassLoader() + .getResource("test.mgf").toURI(); + URI dbUri = MSGFPlusOptionsConfigFileTest.class.getClassLoader() + .getResource("Tryp_Pig_Bov.fasta").toURI(); + + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.configFile = conf.toFile(); + opts.spectrumFile = new File(specUri); + opts.databaseFile = new File(dbUri); + + SearchParams params = new SearchParams(); + String err = params.parse(opts); + Assert.assertNull("SearchParams.parse must not crash on a config file with CustomAA= entries: " + err, err); + + // The custom AA list should reach opts.customAAs and be honored downstream. + Assert.assertEquals(1, opts.customAAs.size()); + Assert.assertEquals("config-file MinPepLength=7 should win over the default of 6", + 7, opts.effectiveMinPeptideLength()); + + // Cleanup. + Files.deleteIfExists(conf); + Files.deleteIfExists(tmpDir); + } +} From 8fc6e2b493c0a1741fa5c65f721dfc025afec94d Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 08:08:44 +0100 Subject: [PATCH 23/34] fix(cli): restore -m 4 = UVPD activation method The Phase 4c MSGFPlusOptions.effectiveActivationMethod() switch hardcoded indices 0..3 (ASWRITTEN/CID/ETD/HCD) and threw IllegalArgumentException for index 4. The legacy addFragMethodParam(ActivationMethod.ASWRITTEN, doNotAddMergeMode=true) hid the registry's FUSION (slot 4), so the user-facing menu was 0..3 + UVPD at index 4. The Phase 4c rewrite silently dropped UVPD support; this commit restores it. - Add case 4: return ActivationMethod.UVPD; to the switch. - Update the @Option description to enumerate 4=UVPD. 
- New unit test (MSGFPlusOptionsActivationMethodTest) pins the full 0..4 mapping plus the default and the out-of-range guard. docs/msgfplus.md already documents -m 4 = UVPD; this brings the code in line with the doc. --- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 10 +++-- .../MSGFPlusOptionsActivationMethodTest.java | 43 +++++++++++++++++++ 2 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsActivationMethodTest.java diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index 7a877754..b010ab27 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -105,7 +105,7 @@ public final class MSGFPlusOptions { public Integer tdaStrategy; @Option(names = "-m", paramLabel = "ID", - description = "Fragmentation method ID: 0=as written/CID (Default), 1=CID, 2=ETD, 3=HCD") + description = "Fragmentation method ID: 0=as written/CID (Default), 1=CID, 2=ETD, 3=HCD, 4=UVPD") public Integer fragMethodId; @Option(names = "-inst", paramLabel = "ID", @@ -285,8 +285,11 @@ public IntRange effectiveSpecIndexRange() { } /** Resolves {@code -m} index to {@link ActivationMethod}. MSGFPlus exposes - * 0=ASWRITTEN, 1=CID, 2=ETD, 3=HCD (FUSION is excluded by - * {@code addFragMethodParam(..., doNotAddMergeMode=true)}). */ + * 0=ASWRITTEN, 1=CID, 2=ETD, 3=HCD, 4=UVPD. The registry also defines + * FUSION (merge-mode synthetic method) and PQD, but neither is exposed + * as a user-selectable index by MSGFPlus -- FUSION was hidden by the + * legacy {@code addFragMethodParam(..., doNotAddMergeMode=true)}, which + * shifted UVPD from registry slot 5 down to user-facing index 4. */ public ActivationMethod effectiveActivationMethod() { int idx = fragMethodId != null ? 
fragMethodId : 0; switch (idx) { @@ -294,6 +297,7 @@ public ActivationMethod effectiveActivationMethod() { case 1: return ActivationMethod.CID; case 2: return ActivationMethod.ETD; case 3: return ActivationMethod.HCD; + case 4: return ActivationMethod.UVPD; default: throw new IllegalArgumentException("invalid -m index: " + idx); } } diff --git a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsActivationMethodTest.java b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsActivationMethodTest.java new file mode 100644 index 00000000..6df6723d --- /dev/null +++ b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsActivationMethodTest.java @@ -0,0 +1,43 @@ +package edu.ucsd.msjava.cli; + +import edu.ucsd.msjava.msutil.ActivationMethod; +import org.junit.Assert; +import org.junit.Test; + +/** + * Pins the {@code -m} ID -> {@link ActivationMethod} mapping. The legacy + * dispatch went through the registry order (ASWRITTEN, CID, ETD, HCD, FUSION, + * UVPD) with {@code FUSION} hidden by {@code addFragMethodParam(..., + * doNotAddMergeMode=true)}, which shifted {@code UVPD} from registry slot 5 + * to the user-facing index 4. The Phase 4c rewrite originally hardcoded only + * 0..3 and silently dropped UVPD; this test guards against regressing it + * again. 
+ */ +public class MSGFPlusOptionsActivationMethodTest { + + @Test + public void defaultIsAsWritten() { + MSGFPlusOptions opts = new MSGFPlusOptions(); + Assert.assertSame(ActivationMethod.ASWRITTEN, opts.effectiveActivationMethod()); + } + + @Test + public void mapsAllSupportedIndices() { + Assert.assertSame(ActivationMethod.ASWRITTEN, withFragMethodId(0).effectiveActivationMethod()); + Assert.assertSame(ActivationMethod.CID, withFragMethodId(1).effectiveActivationMethod()); + Assert.assertSame(ActivationMethod.ETD, withFragMethodId(2).effectiveActivationMethod()); + Assert.assertSame(ActivationMethod.HCD, withFragMethodId(3).effectiveActivationMethod()); + Assert.assertSame(ActivationMethod.UVPD, withFragMethodId(4).effectiveActivationMethod()); + } + + @Test(expected = IllegalArgumentException.class) + public void rejectsOutOfRangeIndex() { + withFragMethodId(5).effectiveActivationMethod(); + } + + private static MSGFPlusOptions withFragMethodId(int id) { + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.fragMethodId = id; + return opts; + } +} From 05e664afb6b4ff8b0686235fb96e08e9799929d8 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 08:09:07 +0100 Subject: [PATCH 24/34] docs: refresh README + module docs after PR #25 cleanup Remove stale references that no longer match the modernized code: - README.md: Quick-Start examples now write .pin (default) or use -outputFormat tsv; the deleted MzIDToTsv conversion step is gone. The "What is MS-GF+?" paragraph and "What is different in this fork?" bullets describe the actual current state (mzIdentML output removed; spectrum input narrowed to mzML + mgf only; picocli-based CLI). The required-input table now shows *.mzML / *.mgf only and the -o default is [input].pin. - docs/msgfplus.md: -s synopsis trimmed to "*.mzML or *.mgf" in three places; -allowDenseCentroidedPeaks no longer mentions mzXML; the duplicated "mzML, mzXML, mzML" typo is fixed. 
- docs/examples/MSGFPlus_Params.txt: SpectrumFile comment trimmed to *.mzML / *.mgf. - docs/readme.md: Input/Output summary now matches the fork's actual format support; obsolete ms-gfdb.md link removed (the doc and entry point were deleted in 5a2ec4e). - docs/troubleshooting.md: FASTA-split workaround now describes concatenating .pin / .tsv outputs (mzIdentML and MzidMerger no longer apply). The OpenMS TOPPAS workaround now feeds the .pin via PercolatorAdapter instead of importing a non-existent .mzid. - src/test/resources/MSGFDB_Param.txt: removed showDecoy=1 and uniformAAProb=auto (both ParamNameEnum entries were dropped in 5a2ec4e and now produce "unrecognized parameter" warnings on every test run). Normalized ParentMassTolerance -> PrecursorMassTolerance and IsotopeError -> IsotopeErrorRange to use the canonical keys; the canonicalConfigKey aliases keep the old names working but the test fixture should be self-documenting. No code changes; the existing scoped test sweep continues to pass. --- README.md | 29 ++++++++++++++++------------- docs/examples/MSGFPlus_Params.txt | 2 +- docs/msgfplus.md | 14 +++++++------- docs/readme.md | 5 ++--- docs/troubleshooting.md | 4 ++-- src/test/resources/MSGFDB_Param.txt | 19 +++---------------- 6 files changed, 31 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 14d0e2fa..ac2748d6 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,8 @@ MS-GF+ (aka MSGF+ or MSGFPlus) performs peptide identification by scoring MS/MS spectra against peptides derived from a protein sequence database. -It supports the HUPO PSI standard input file (mzML) and additional legacy spectrum inputs, and saves results in -the mzIdentML format, though results can easily be transformed to TSV. +It supports the HUPO PSI standard input file (mzML) plus MGF, and writes +Percolator `.pin` (default) or TSV output. ProteomeXchange supports Complete data submissions using MS-GF+ search results. 
MS-GF+ is developed by Sangtae Kim and the PNNL Proteomics team at the @@ -22,10 +22,11 @@ Center for Computational Mass Spectrometry, University of California, San Diego. ## What is different in this fork? - **Streaming mzML parser** -- replaces the in-memory preload with a single-pass StAX parser, significantly reducing memory usage for large files -- **Primary maintained formats: mzML and MGF** -- mzXML is not available in this fork +- **Spectrum input narrowed to mzML and MGF** -- mzXML, MS2, PKL, and `_dta.txt` are not supported in this fork +- **mzIdentML output removed** -- output is Percolator `.pin` (default) or TSV; feed `.pin` straight into Percolator for rescoring +- **Picocli-based CLI** -- declarative typed flags with auto-generated `-h/--help` - **Java 17 minimum** -- updated from Java 8 - **CI/CD** -- GitHub Actions for automated testing and releases -- **Direct TSV output** -- optional TSV output alongside mzIdentML ## Requirements @@ -39,13 +40,13 @@ Download the latest release from the [Releases page](https://github.com/bigbio/m ## Quick Start ```bash -# Basic search +# Basic search (writes results.pin in Percolator format) java -Xmx4G -jar MSGFPlus.jar \ -s spectra.mzML \ -d database.fasta \ - -o results.mzid + -o results.pin -# TMT search with target-decoy analysis +# TMT search with target-decoy analysis, Percolator-ready output java -Xmx8G -jar MSGFPlus.jar \ -s spectra.mzML \ -d database.fasta \ @@ -56,11 +57,13 @@ java -Xmx8G -jar MSGFPlus.jar \ -e 1 \ -protocol 4 \ -mod mods.txt \ - -o results.mzid + -o results.pin -# Convert mzid output to TSV -java -cp MSGFPlus.jar edu.ucsd.msjava.ui.MzIDToTsv \ - -i results.mzid \ +# Direct TSV output (skip Percolator) +java -Xmx4G -jar MSGFPlus.jar \ + -s spectra.mzML \ + -d database.fasta \ + -outputFormat tsv \ -o results.tsv ``` @@ -70,14 +73,14 @@ java -cp MSGFPlus.jar edu.ucsd.msjava.ui.MzIDToTsv \ | Flag | Name | Description | |------|------|-------------| -| `-s` | SpectrumFile | Input spectrum 
file (`*.mzML`, `*.mgf`, `*.ms2`, `*.pkl`, `*_dta.txt`). Spectra should be centroided. | +| `-s` | SpectrumFile | Input spectrum file (`*.mzML`, `*.mgf`). Spectra should be centroided. | | `-d` | DatabaseFile | Protein sequence database (`*.fasta`, `*.fa`, `*.faa`). | ### Core Search Parameters | Flag | Name | Default | Description | |------|------|---------|-------------| -| `-o` | OutputFile | `[input].mzid` | Output file path (`.mzid` format). | +| `-o` | OutputFile | `[input].pin` | Output file path (`.pin` Percolator format, default; `.tsv` if `-outputFormat tsv`). | | `-conf` | ConfigurationFile | — | Configuration file; command-line options override config file settings. | | `-t` | PrecursorMassTolerance | `20ppm` | Precursor mass tolerance (e.g., `2.5Da`, `20ppm`, or `0.5Da,2.5Da` for asymmetric). | | `-ti` | IsotopeErrorRange | `0,1` | Range of allowed isotope peak errors (e.g., `-1,2`). | diff --git a/docs/examples/MSGFPlus_Params.txt b/docs/examples/MSGFPlus_Params.txt index c1ef196e..8a7e0d16 100644 --- a/docs/examples/MSGFPlus_Params.txt +++ b/docs/examples/MSGFPlus_Params.txt @@ -1,5 +1,5 @@ # SpectrumFile -# *.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt +# *.mzML or *.mgf # Spectra should be centroided (see below for MSConvert example). Profile spectra will be ignored. # Use of -s at the command line will override this filename #SpectrumFile=InstrumentFile.mzML diff --git a/docs/msgfplus.md b/docs/msgfplus.md index 19117e5b..d3a8b3aa 100644 --- a/docs/msgfplus.md +++ b/docs/msgfplus.md @@ -10,7 +10,7 @@ Usage: java -Xmx3500M -jar MSGFPlus.jar An example parameter file is at https://github.com/MSGFPlus/msgfplus/blob/master/docs/examples/MSGFPlus_Params.txt Additional parameter files are at https://github.com/MSGFPlus/msgfplus/tree/master/docs/parameterfiles -[-s SpectrumFile] (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt) +[-s SpectrumFile] (*.mzML or *.mgf) Spectra should be centroided (see below for MSConvert example). 
Profile spectra will be ignored. [-d DatabaseFile] (*.fasta or *.fa or *.faa) @@ -123,9 +123,9 @@ Usage: java -Xmx3500M -jar MSGFPlus.jar [-numMods Count] (Maximum number of dynamic (variable) modifications per peptide; Default: 3) -[-allowDenseCentroidedPeaks 0/1] (Default: 0 (disabled); 1: (for mzML/mzXML input only) allows inclusion of spectra with high-density centroid data in the search) - MS-GF+ checks the distance between consecutive peaks in the spectrum, and if the median distance is less than 50 ppm, they are considered profile spectra regardless of the value provided in mzML and mzXML files. - This parameter allows overriding this check when the mzML/mzXML file says the spectrum is centroided. +[-allowDenseCentroidedPeaks 0/1] (Default: 0 (disabled); 1: (for mzML input only) allows inclusion of spectra with high-density centroid data in the search) + MS-GF+ checks the distance between consecutive peaks in the spectrum, and if the median distance is less than 50 ppm, they are considered profile spectra regardless of the value provided in the mzML file. + This parameter allows overriding this check when the mzML file says the spectrum is centroided. ``` @@ -146,10 +146,10 @@ Example command (low-precision spectra): ### Parameters: -- **-s SpectrumFile** (.mzML\*, \*.mzXML, \*.mgf, \*.ms2, \*.pkl or \*\_dta.txt) - Required +- **-s SpectrumFile** (\*.mzML or \*.mgf) - Required - - Spectrum file name. Currently, MS-GF+ supports the following file formats: mzML, mzXML, mzML, mgf, ms2, pkl and \_dta.txt. - - We recommend to use mzML, whenever possible. + - Spectrum file name. This fork supports two spectrum file formats: `mzML` and `mgf`. Legacy formats (`mzXML`, `ms2`, `pkl`, `_dta.txt`) are not supported. + - We recommend `mzML` whenever possible. - For Thermo .raw files, obtain a centroided .mzML using MSConvert, which is part of [ProteoWizard](http://proteowizard.sourceforge.net/). 
`MSConvert.exe --mzML --32 --filter "peakPicking true 1-" DatasetName.raw` diff --git a/docs/readme.md b/docs/readme.md index 3f58ab68..14fc1ecc 100644 --- a/docs/readme.md +++ b/docs/readme.md @@ -10,8 +10,8 @@ Static HTML under `docs/` was replaced with these Markdown pages so they read we ### Summary - MS-GF+ is an MS/MS database search tool that is sensitive (it identifies more peptides than other database search tools and as many peptides as spectral library search tools) and universal (works well for diverse types of spectra, different configurations of MS instruments and different experimental protocols). -- Input: HUPO PSI standard mzML (also mzXML / MGF / MS2 / PKL). -- Output: Percolator `.pin` (default, for rescoring) or TSV. **mzIdentML (`.mzid`) output has been removed as of the next release** — MS-GF+ now feeds downstream Percolator pipelines directly via `.pin`. See [Changelog](changelog.md) for migration notes. +- Input: HUPO PSI standard mzML and MGF only (mzXML, MS2, PKL, and `_dta.txt` are not supported in this fork). +- Output: Percolator `.pin` (default, for rescoring) or TSV. mzIdentML (`.mzid`) output has been removed — MS-GF+ now feeds downstream Percolator pipelines directly via `.pin`. See [Changelog](changelog.md) for migration notes. ### Usage and help @@ -21,7 +21,6 @@ Static HTML under `docs/` was replaced with these Markdown pages so they read we - [Suffix array builder (BuildSA)](buildsa.md) - [Isobaric labelling: TMT / TMTpro / iTRAQ recipes](isobariclabeling.md) - [Troubleshooting & common errors](troubleshooting.md) -- [MS-GFDB (obsolete)](ms-gfdb.md) ### Publications diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index fa6ce803..1d499daa 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -50,7 +50,7 @@ MS-GF+ currently uses `int`-indexed suffix-array and byte-array structures for t Affected workflows: metaproteomics, proteogenomics, antibody-repertoire searches, and pan-microbial databases. 
-**Workaround today** — split the FASTA into chunks ≤ 250 MB, run one MS-GF+ search per chunk, and merge the resulting mzIdentML files. [MzidMerger](https://github.com/PNNL-Comp-Mass-Spec/Mzid-Merger) is the standard tool for the merge step. +**Workaround today** — split the FASTA into chunks ≤ 250 MB, run one MS-GF+ search per chunk, and concatenate the resulting `.pin` (or `.tsv`) files. For `.pin` outputs the header line repeats per chunk; drop duplicate header rows after the first, then feed the merged file to Percolator. **Planned fix** — 64-bit indexed FASTA storage is tracked as Priority 1 in the `bigbio/msgfplus` performance roadmap. See the investigation note in `.claude/investigations/` (not shipped). @@ -107,7 +107,7 @@ Related issue: [#52](https://github.com/MSGFPlus/msgfplus/issues/52). Reported in [OpenMS #1764](https://github.com/OpenMS/OpenMS/issues/1764). The command line works; TOPPAS fails because of how it passes environment and quoted arguments. -**Workaround** — run MS-GF+ directly from the command line and import the resulting mzIdentML into OpenMS. +**Workaround** — run MS-GF+ directly from the command line and feed the resulting `.pin` (or `.tsv`) into OpenMS via `PercolatorAdapter` / `MSGFPlusAdapter`. --- diff --git a/src/test/resources/MSGFDB_Param.txt b/src/test/resources/MSGFDB_Param.txt index c8d71699..8db2b7b9 100644 --- a/src/test/resources/MSGFDB_Param.txt +++ b/src/test/resources/MSGFDB_Param.txt @@ -1,8 +1,7 @@ -#Parent mass tolerance +#Precursor mass tolerance # Examples: 2.5Da or 30ppm # Use comma to set asymmetric values, for example "0.5Da,2.5Da" will set 0.5Da to the left (expMasstheoMass) -#PMTolerance=20ppm -ParentMassTolerance=20ppm +PrecursorMassTolerance=20ppm #Max Number of Modifications per peptide # If this value is large, the search will be slow @@ -45,7 +44,7 @@ EnzymeID=1 # Ignored if the parent mass tolerance is > 0.5Da or 500ppm # The combination of -t and -ti determins the precursor mass tolerance. # e.g. 
"-t 20ppm -ti -1,2" tests abs(exp-calc-n*1.00335Da)<20ppm for n=-1, 0, 1, 2. -IsotopeError=-1,2 +IsotopeErrorRange=-1,2 #Number of tryptic termini # The number of peptide termini that must have been cleaved by the enzyme (default 1) @@ -57,11 +56,6 @@ NTT=2 # 1 means search decoy database to compute FDR (source FASTA file must be forward-only proteins) TDA=1 -#Include decoy peptides (only applicable when TDA=1) -# 0 means to not include decoy (reverse) peptides -# 1 means to include decoy PSMs after all target (forward) PSMs -showDecoy=1 - #Number of Threads (by default, uses all available cores) #In DMS, a settings file entry for MSGFDBThreads will override this value NumThreads=All @@ -82,13 +76,6 @@ maxCharge=5 #If this value is greater than 1 then the FDR values computed by MS-GF+ will be skewed by high-scoring 2nd and 3rd hits NumMatchesPerSpec=2 -#Amino Acid Frequencies -# If 0, compute amino acid frequencies from the input database and use them as amino acid probabilities -# If 1, use uniform amino acid probability (preferable when the database size is small) -# If auto, then Analysis Manager will auto-set this to 0 if the .Fasta file is more than 20 KB in size -uniformAAProb=auto - - #Amino Acid Modification Examples # Specify static modifications using one or more StaticMod= entries # Specify dynamic modifications using one or more DynamicMod= entries From 7a19f833356f8f7110b45ca2f0ca6785748c9cd2 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 09:55:41 +0100 Subject: [PATCH 25/34] fix(cli): three Phase 4c regressions + polish on MSGFPlusOptions The user code-reviewed the unpushed branch tip and surfaced three behaviour regressions that all date back to the Phase 4c ParamManager retire (commit 03f32c1) plus a few small cleanups. P1 -- config-file keys are now matched case-insensitively, restoring the legacy ParamManager.parseConfigParamFile semantics. 
The Phase 4c switch was exact-case so test fixtures using lowercase first-letter keys (e.g. "minCharge=2", "maxCharge=5" in MSGFDB_Param.txt) were silently dropped to defaults instead of overriding them. The fix lowercases canonicalConfigKey output and makes every applyConfigEntry case label lowercase. New regression test configFileKeysAreMatchedCaseInsensitively pins the contract with a mix of canonical, lowercased-first-letter, and ALLCAPS forms. P2 -- invalid enum-like CLI indices (-m 99, -inst 99, -e 99, -protocol 99) and out-of-range numerics now produce a clean user-facing error from SearchParams.parse instead of an IllegalArgumentException stack trace from the resolver. validate() is now invoked in place of validateRequired() and runs the bounds checks up-front. P2 -- restored the legacy IntParameter.minValue/maxValue range checks for: -thread, -tasks, -minSpectraPerThread, -minLength, -maxLength, -minCharge, -maxCharge, -n, -ntt, -tda, -verbose, -addFeatures, -allowDenseCentroidedPeaks, -edgeScore, -ignoreMetCleavage, -maxMissedCleavages, -numMods, -ccm, -u, -m, -inst, -e, -protocol, plus the hidden flags. New unit test validateRejectsOutOfRangeFlags pins a representative set. Polish: - Drop the dead Phase 1 / MSGFPlusOptionsAdapter / ParamManager rollout narrative from the MSGFPlusOptions class header (both the adapter class and ParamManager were deleted in earlier commits; the comment was stale). - Collapse the unrecognizedConfigEntries field + local probe counter in applyConfigFile into a single counter reset at start and read at end -- one piece of state, simpler control flow. - Strip the CRLF (\r before \n) on docs/examples/MSGFPlus_Params.txt and src/test/resources/MSGFDB_Param.txt, which git diff --check was flagging as trailing whitespace. The rest of the codebase is LF-only. Verified: - mvn -B -o test on the scoped sweep (77 tests, 0 failures, 0 errors, 3 skipped). 
- SearchParamsTest no longer warns "unrecognized parameter 'minCharge=2'" / "'maxCharge=5'" when loading MSGFDB_Param.txt. - git diff --check on this commit is clean. --- docs/examples/MSGFPlus_Params.txt | 278 +++++++++--------- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 247 ++++++++++------ .../ucsd/msjava/msdbsearch/SearchParams.java | 7 +- .../cli/MSGFPlusOptionsConfigFileTest.java | 64 ++++ src/test/resources/MSGFDB_Param.txt | 194 ++++++------ 5 files changed, 460 insertions(+), 330 deletions(-) diff --git a/docs/examples/MSGFPlus_Params.txt b/docs/examples/MSGFPlus_Params.txt index 8a7e0d16..66282805 100644 --- a/docs/examples/MSGFPlus_Params.txt +++ b/docs/examples/MSGFPlus_Params.txt @@ -1,139 +1,139 @@ -# SpectrumFile -# *.mzML or *.mgf -# Spectra should be centroided (see below for MSConvert example). Profile spectra will be ignored. -# Use of -s at the command line will override this filename -#SpectrumFile=InstrumentFile.mzML - -# FASTA file -# "*.fasta or *.fa or *.faa -# Use of -d at the command line will override this filename -#DatabaseFile=Proteins.fasta - -# Prefix for decoy proteins in the FASTA file -#DecoyPrefix=XXX - -# Precursor mass tolerance -# Examples: 2.5Da or 30ppm -# Use comma to set asymmetric values, for example "0.5Da,2.5Da" will set 0.5Da to the left (expMass<theoMass) and 2.5Da to the right (expMass>theoMass) -PrecursorMassTolerance=20ppm - -# Max Number of Dynamic (Variable) Modifications per peptide -# Default: 3 -# If this value is large, the search will be slow -NumMods=3 - -# Modifications (see below for examples) -StaticMod=C2H3N1O1, C, fix, any, Carbamidomethyl # Fixed Carbamidomethyl C (alkylation) -StaticMod=229.1629, *, fix, N-term, TMT6plex -StaticMod=229.1629, K, fix, any, TMT6plex - -DynamicMod=O1, M, opt, any, Oxidation # Oxidized methionine -DynamicMod=-187.152366, K, opt, any, AcNoTMT # Residue tagged by MSGF+ with static TMT6, but is actually acetylated and does not have TMT - -# Custom AA specification -#CustomAA=C3H5NO, U, custom, U, Selenocysteine
# Custom amino acids can only have C, H, N, O, and S -#CustomAA=C6H11NO, X, custom, X, Leu_Ile # Leucine or Isoleucine - -# Fragmentation Method -# 0 means as written in the spectrum or CID if no info (Default) -# 1 means CID -# 2 means ETD -# 3 means HCD -FragmentationMethodID=0 - -# Instrument ID -# 0 means Low-res LCQ/LTQ (Default for CID and ETD); use InstrumentID=0 if analyzing a dataset with low-res CID and high-res HCD spectra -# 1 means High-res LTQ (Default for HCD; also appropriate for high res CID); use InstrumentID=1 for Orbitrap, Lumos, and QEHFX instruments -# 2 means TOF -# 3 means Q-Exactive -InstrumentID=1 - -# Enzyme ID -# 0 means unspecific cleavage (cleave after any residue) -# 1 means Trypsin (Default); optionally use this along with NTT=0 for a no-enzyme-specificity search of a tryptically digested sample -# 2: Chymotrypsin, 3: Lys-C, 4: Lys-N, 5: Glu-C, 6: Arg-C, 7: Asp-N, 8: alphaLP, 9: No Cleavage (for peptidomics), 10: TrypPlusC (cleave after K, R, or C) -EnzymeID=1 - -# Isotope error range -# Takes into account of the error introduced by choosing non-monoisotopic peak for fragmentation. -# Useful for accurate precursor ion masses -# Ignored if the parent mass tolerance is > 0.5Da or 500ppm -# The combination of -t and -ti determins the precursor mass tolerance. -# e.g. "-t 20ppm -ti -1,2" tests abs(exp-calc-n*1.00335Da)<20ppm for n=-1, 0, 1, 2. 
-IsotopeErrorRange=-1,2 - -# Number of tolerable termini -# The number of peptide termini that must have been cleaved by the enzyme (default 1) -# For trypsin, 2 means fully tryptic only, 1 means partially tryptic, and 0 means no-enzyme search -NTT=2 - -# Control N-terminal methionine cleavage -# 0 means to consider protein N-term Met cleavage (Default) -# 1 means to ignore protein N-term Met cleavage -IgnoreMetCleavage=0 - -# Target/Decoy search mode -# 0 means don't search decoy database (default) -# 1 means search decoy database to compute FDR (source FASTA file must be forward-only proteins) -TDA=1 - -# Number of concurrent threads to be executed -# Default: Number of available cores -# To use three threads use NumThreads=3 -NumThreads=All - -# Minimum peptide length to consider -# Default: 6 -MinPepLength=6 - -# Maximum peptide length to consider -# Default: 40 -MaxPepLength=50 - -# Minimum precursor charge to consider (if not specified in the spectrum file) -# Default: 2 -MinCharge=2 - -# Maximum precursor charge to consider (if not specified in the spectrum file) -# Default: 3 -MaxCharge=5 - -# Number of matches per spectrum to be reported -# If this value is greater than 1, the FDR values computed by MS-GF+ will be skewed by high-scoring 2nd and 3rd hits -NumMatchesPerSpec=1 - -# Mass of charge carrier -# Default: mass of proton -#ChargeCarrierMass=1.00727649 - -# Maximum missed cleavages -# Exclude peptides with more than this number of missed cleavages from the search, Default: -1 (no limit) -#MaxMissedCleavages=-1 - -# Minimum number of peaks per spectrum, Default: -# Default: 10 -#MinNumPeaksPerSpectrum=10 - -# Number of isoforms to consider per peptide -# Default: 128 -#NumIsoforms=128 - -# Amino Acid Modification Examples -# Specify static modifications using one or more StaticMod= entries -# Specify dynamic modifications using one or more DynamicMod= entries -# Modification format is: -# Mass or CompositionString, Residues, ModType, Position, Name 
(all five fields are required). -# CompositionString can only contain a limited set of elements, primarily C H N O S or P -# -# Examples: -# C2H3N1O1, C, fix, any, Carbamidomethyl # Fixed Carbamidomethyl C (alkylation) -# O1, M, opt, any, Oxidation # Oxidation M -# 15.994915, M, opt, any, Oxidation # Oxidation M (mass is used instead of CompositionString) -# H-1N-1O1, NQ, opt, any, Deamidated # Negative numbers are allowed. -# CH2, K, opt, any, Methyl # Methylation K -# C2H2O1, K, opt, any, Acetyl # Acetylation K -# HO3P, STY,opt, any, Phospho # Phosphorylation STY -# C2H3NO, *, opt, N-term, Carbamidomethyl # Variable Carbamidomethyl N-term -# H-2O-1, E, opt, N-term, Glu->pyro-Glu # Pyro-glu from E -# H-3N-1, Q, opt, N-term, Gln->pyro-Glu # Pyro-glu from Q -# C2H2O, *, opt, Prot-N-term, Acetyl # Acetylation Protein N-term +# SpectrumFile +# *.mzML or *.mgf +# Spectra should be centroided (see below for MSConvert example). Profile spectra will be ignored. +# Use of -s at the command line will override this filename +#SpectrumFile=InstrumentFile.mzML + +# FASTA file +# "*.fasta or *.fa or *.faa +# Use of -d at the command line will override this filename +#DatabaseFile=Proteins.fasta + +# Prefix for decoy proteins in the FASTA file +#DecoyPrefix=XXX + +# Precursor mass tolerance +# Examples: 2.5Da or 30ppm +# Use comma to set asymmetric values, for example "0.5Da,2.5Da" will set 0.5Da to the left (expMass<theoMass) and 2.5Da to the right (expMass>theoMass) +PrecursorMassTolerance=20ppm + +# Max Number of Dynamic (Variable) Modifications per peptide +# Default: 3 +# If this value is large, the search will be slow +NumMods=3 + +# Modifications (see below for examples) +StaticMod=C2H3N1O1, C, fix, any, Carbamidomethyl # Fixed Carbamidomethyl C (alkylation) +StaticMod=229.1629, *, fix, N-term, TMT6plex +StaticMod=229.1629, K, fix, any, TMT6plex + +DynamicMod=O1, M, opt, any, Oxidation # Oxidized methionine +DynamicMod=-187.152366, K, opt, any, AcNoTMT # Residue tagged by MSGF+ with static TMT6, but is actually
acetylated and does not have TMT + +# Custom AA specification +#CustomAA=C3H5NO, U, custom, U, Selenocysteine # Custom amino acids can only have C, H, N, O, and S +#CustomAA=C6H11NO, X, custom, X, Leu_Ile # Leucine or Isoleucine + +# Fragmentation Method +# 0 means as written in the spectrum or CID if no info (Default) +# 1 means CID +# 2 means ETD +# 3 means HCD +FragmentationMethodID=0 + +# Instrument ID +# 0 means Low-res LCQ/LTQ (Default for CID and ETD); use InstrumentID=0 if analyzing a dataset with low-res CID and high-res HCD spectra +# 1 means High-res LTQ (Default for HCD; also appropriate for high res CID); use InstrumentID=1 for Orbitrap, Lumos, and QEHFX instruments +# 2 means TOF +# 3 means Q-Exactive +InstrumentID=1 + +# Enzyme ID +# 0 means unspecific cleavage (cleave after any residue) +# 1 means Trypsin (Default); optionally use this along with NTT=0 for a no-enzyme-specificity search of a tryptically digested sample +# 2: Chymotrypsin, 3: Lys-C, 4: Lys-N, 5: Glu-C, 6: Arg-C, 7: Asp-N, 8: alphaLP, 9: No Cleavage (for peptidomics), 10: TrypPlusC (cleave after K, R, or C) +EnzymeID=1 + +# Isotope error range +# Takes into account of the error introduced by choosing non-monoisotopic peak for fragmentation. +# Useful for accurate precursor ion masses +# Ignored if the parent mass tolerance is > 0.5Da or 500ppm +# The combination of -t and -ti determins the precursor mass tolerance. +# e.g. "-t 20ppm -ti -1,2" tests abs(exp-calc-n*1.00335Da)<20ppm for n=-1, 0, 1, 2. 
+IsotopeErrorRange=-1,2 + +# Number of tolerable termini +# The number of peptide termini that must have been cleaved by the enzyme (default 1) +# For trypsin, 2 means fully tryptic only, 1 means partially tryptic, and 0 means no-enzyme search +NTT=2 + +# Control N-terminal methionine cleavage +# 0 means to consider protein N-term Met cleavage (Default) +# 1 means to ignore protein N-term Met cleavage +IgnoreMetCleavage=0 + +# Target/Decoy search mode +# 0 means don't search decoy database (default) +# 1 means search decoy database to compute FDR (source FASTA file must be forward-only proteins) +TDA=1 + +# Number of concurrent threads to be executed +# Default: Number of available cores +# To use three threads use NumThreads=3 +NumThreads=All + +# Minimum peptide length to consider +# Default: 6 +MinPepLength=6 + +# Maximum peptide length to consider +# Default: 40 +MaxPepLength=50 + +# Minimum precursor charge to consider (if not specified in the spectrum file) +# Default: 2 +MinCharge=2 + +# Maximum precursor charge to consider (if not specified in the spectrum file) +# Default: 3 +MaxCharge=5 + +# Number of matches per spectrum to be reported +# If this value is greater than 1, the FDR values computed by MS-GF+ will be skewed by high-scoring 2nd and 3rd hits +NumMatchesPerSpec=1 + +# Mass of charge carrier +# Default: mass of proton +#ChargeCarrierMass=1.00727649 + +# Maximum missed cleavages +# Exclude peptides with more than this number of missed cleavages from the search, Default: -1 (no limit) +#MaxMissedCleavages=-1 + +# Minimum number of peaks per spectrum, Default: +# Default: 10 +#MinNumPeaksPerSpectrum=10 + +# Number of isoforms to consider per peptide +# Default: 128 +#NumIsoforms=128 + +# Amino Acid Modification Examples +# Specify static modifications using one or more StaticMod= entries +# Specify dynamic modifications using one or more DynamicMod= entries +# Modification format is: +# Mass or CompositionString, Residues, ModType, Position, Name 
(all five fields are required). +# CompositionString can only contain a limited set of elements, primarily C H N O S or P +# +# Examples: +# C2H3N1O1, C, fix, any, Carbamidomethyl # Fixed Carbamidomethyl C (alkylation) +# O1, M, opt, any, Oxidation # Oxidation M +# 15.994915, M, opt, any, Oxidation # Oxidation M (mass is used instead of CompositionString) +# H-1N-1O1, NQ, opt, any, Deamidated # Negative numbers are allowed. +# CH2, K, opt, any, Methyl # Methylation K +# C2H2O1, K, opt, any, Acetyl # Acetylation K +# HO3P, STY,opt, any, Phospho # Phosphorylation STY +# C2H3NO, *, opt, N-term, Carbamidomethyl # Variable Carbamidomethyl N-term +# H-2O-1, E, opt, N-term, Glu->pyro-Glu # Pyro-glu from E +# H-3N-1, Q, opt, N-term, Gln->pyro-Glu # Pyro-glu from Q +# C2H2O, *, opt, Prot-N-term, Acetyl # Acetylation Protein N-term diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index b010ab27..0991add6 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -15,18 +15,12 @@ import java.util.List; /** - * Typed command-line options for MS-GF+. Replaces the imperative - * {@code addParameter()} calls in {@code ParamManager.addMSGFPlusParams()} - * with declarative picocli annotations. - * - * Phase 1 scope: every flag from {@link ParamNameEnum} that - * {@code addMSGFPlusParams()} registers, parsed into typed fields. - * Complex domain types (Tolerance, IntRange, dynamic enums) are - * captured here as raw strings; the adapter at - * {@code MSGFPlusOptionsAdapter} round-trips them through the existing - * {@code params.Parameter#parse(String)} hierarchy to populate a - * {@code ParamManager} that {@code SearchParams.parse(ParamManager)} - * can consume unchanged. Phase 3 collapses that round-trip away. + * Typed command-line options for MS-GF+. 
Picocli reads {@code argv} into + * the {@code @Option}-annotated fields below; {@link #applyConfigFile} + * fills in any field the CLI did not set from a {@code -conf} file + * (CLI takes precedence). {@link #validate} enforces required-input + * and numeric/enum range invariants. Each {@code effectiveXxx()} accessor + * returns the user-supplied value or the legacy default. * * Flag inventory: see {@code .claude/plans/parameter-modernization-flag-inventory.md}. */ @@ -335,7 +329,7 @@ public Protocol effectiveProtocol() { * @return null on success, error string otherwise. */ public String applyConfigFile(File file) { - int unrecognizedCount = 0; + unrecognizedConfigEntries = 0; try (BufferedReader reader = new BufferedReader(new FileReader(file))) { String line; int lineNum = 0; @@ -348,17 +342,15 @@ public String applyConfigFile(File file) { String rawKey = trimmed.substring(0, eq).trim(); String value = trimmed.substring(eq + 1).trim(); String key = canonicalConfigKey(rawKey); - int before = unrecognizedConfigEntries; String err = applyConfigEntry(key, value, file.getName()); if (err != null) { return "Error parsing line " + lineNum + " of " + file.getName() + ": " + err; } - if (unrecognizedConfigEntries > before) unrecognizedCount++; } } catch (IOException e) { return "Error reading config file " + file.getPath() + ": " + e.getMessage(); } - if (unrecognizedCount > 0) { + if (unrecognizedConfigEntries > 0) { System.out.println("Valid parameters are described in the example parameter file at " + "https://github.com/MSGFPlus/msgfplus/blob/master/docs/examples/MSGFPlus_Params.txt"); } @@ -366,69 +358,66 @@ public String applyConfigFile(File file) { } /** Counter incremented inside {@link #applyConfigEntry} whenever an unknown - * config-file key is seen; surfaced via the end-of-file URL hint. */ + * config-file key is seen; surfaced via the end-of-file URL hint and + * reset at the start of each {@link #applyConfigFile} call. 
*/ private int unrecognizedConfigEntries; private String applyConfigEntry(String key, String value, String fileName) { - // Repeated entries: collect into lists. "none" is treated as no entry. - if (key.equalsIgnoreCase("DynamicMod")) { - if (!value.equalsIgnoreCase("none")) dynamicMods.add(value); - return null; - } - if (key.equalsIgnoreCase("StaticMod")) { - if (!value.equalsIgnoreCase("none")) staticMods.add(value); - return null; - } - if (key.equalsIgnoreCase("CustomAA")) { - if (!value.equalsIgnoreCase("none")) customAAs.add(value); - return null; + // Config-file matching is case-insensitive. canonicalConfigKey() + // already returns lowercase canonical names, so the switch labels + // are lowercase too. Repeated mod entries are matched first since + // they accumulate rather than overwrite. + switch (key) { + case "dynamicmod": if (!value.equalsIgnoreCase("none")) dynamicMods.add(value); return null; + case "staticmod": if (!value.equalsIgnoreCase("none")) staticMods.add(value); return null; + case "customaa": if (!value.equalsIgnoreCase("none")) customAAs.add(value); return null; + default: break; } // Single-valued entries: only fill in if CLI did not set the field. 
try { switch (key) { - case "SpectrumFile": if (spectrumFile == null) spectrumFile = new File(value); return null; - case "DatabaseFile": if (databaseFile == null) databaseFile = new File(value); return null; - case "OutputFile": if (outputFile == null) outputFile = new File(value); return null; - case "ModificationFileName": - case "ModificationFile": if (modificationFile == null) modificationFile = new File(value); return null; - case "DBIndexDir": if (dbIndexDir == null) dbIndexDir = new File(value); return null; - case "DecoyPrefix": if (decoyPrefix == null) decoyPrefix = value; return null; - case "PrecursorMassTolerance": if (precursorTolerance == null) precursorTolerance = PrecursorTolerance.parse(value); return null; - case "PrecursorMassToleranceUnits": - if (precursorToleranceUnits == null) precursorToleranceUnits = Integer.parseInt(value); return null; - case "IsotopeErrorRange": if (isotopeErrorRange == null) isotopeErrorRange = IntRange.parse(value); return null; - case "FragmentationMethodID": if (fragMethodId == null) fragMethodId = Integer.parseInt(value); return null; - case "InstrumentID": if (instrumentTypeId == null) instrumentTypeId = Integer.parseInt(value); return null; - case "EnzymeID": if (enzymeId == null) enzymeId = Integer.parseInt(value); return null; - case "ProtocolID": if (protocolId == null) protocolId = Integer.parseInt(value); return null; - case "NTT": if (numTolerableTermini == null) numTolerableTermini = Integer.parseInt(value); return null; - case "MinPepLength": if (minPeptideLength == null) minPeptideLength = Integer.parseInt(value); return null; - case "MaxPepLength": if (maxPeptideLength == null) maxPeptideLength = Integer.parseInt(value); return null; - case "MinCharge": if (minCharge == null) minCharge = Integer.parseInt(value); return null; - case "MaxCharge": if (maxCharge == null) maxCharge = Integer.parseInt(value); return null; - case "NumMatchesPerSpec": if (numMatchesPerSpec == null) numMatchesPerSpec = 
Integer.parseInt(value); return null; - case "NumThreads": if (numThreads == null) { if (!value.equalsIgnoreCase("all")) numThreads = Integer.parseInt(value); } return null; - case "NumTasks": if (numTasks == null) numTasks = Integer.parseInt(value); return null; - case "MinSpectraPerThread": if (minSpectraPerThread == null) minSpectraPerThread = Integer.parseInt(value); return null; - case "Verbose": if (verbose == null) verbose = Integer.parseInt(value); return null; - case "TDA": if (tdaStrategy == null) tdaStrategy = Integer.parseInt(value); return null; - case "AddFeatures": if (addFeatures == null) addFeatures = Integer.parseInt(value); return null; - case "OutputFormat": if (outputFormat == null) outputFormat = value; return null; - case "PrecursorCal": if (precursorCalMode == null) precursorCalMode = value; return null; - case "ChargeCarrierMass": if (chargeCarrierMass == null) chargeCarrierMass = Double.parseDouble(value); return null; - case "MaxMissedCleavages": if (maxMissedCleavages == null) maxMissedCleavages = Integer.parseInt(value); return null; - case "NumMods": if (maxNumMods == null) configMaxNumMods = Integer.parseInt(value); return null; - case "AllowDenseCentroidedPeaks": - if (allowDenseCentroidedPeaks == null) allowDenseCentroidedPeaks = Integer.parseInt(value); return null; - case "MSLevel": if (msLevel == null) msLevel = IntRange.parse(value); return null; - case "SpecIndex": if (specIndexRange == null) specIndexRange = IntRange.parse(value); return null; - case "EdgeScore": if (edgeScore == null) edgeScore = Integer.parseInt(value); return null; - case "MinNumPeaksPerSpectrum": if (minNumPeaks == null) minNumPeaks = Integer.parseInt(value); return null; - case "NumIsoforms": if (numIsoforms == null) numIsoforms = Integer.parseInt(value); return null; - case "IgnoreMetCleavage": if (ignoreMetCleavage == null) ignoreMetCleavage = Integer.parseInt(value); return null; - case "MinDeNovoScore": if (minDeNovoScore == null) minDeNovoScore = 
Integer.parseInt(value); return null; + case "spectrumfile": if (spectrumFile == null) spectrumFile = new File(value); return null; + case "databasefile": if (databaseFile == null) databaseFile = new File(value); return null; + case "outputfile": if (outputFile == null) outputFile = new File(value); return null; + case "modificationfilename": + case "modificationfile": if (modificationFile == null) modificationFile = new File(value); return null; + case "dbindexdir": if (dbIndexDir == null) dbIndexDir = new File(value); return null; + case "decoyprefix": if (decoyPrefix == null) decoyPrefix = value; return null; + case "precursormasstolerance": if (precursorTolerance == null) precursorTolerance = PrecursorTolerance.parse(value); return null; + case "precursormasstoleranceunits":if (precursorToleranceUnits == null) precursorToleranceUnits = Integer.parseInt(value); return null; + case "isotopeerrorrange": if (isotopeErrorRange == null) isotopeErrorRange = IntRange.parse(value); return null; + case "fragmentationmethodid": if (fragMethodId == null) fragMethodId = Integer.parseInt(value); return null; + case "instrumentid": if (instrumentTypeId == null) instrumentTypeId = Integer.parseInt(value); return null; + case "enzymeid": if (enzymeId == null) enzymeId = Integer.parseInt(value); return null; + case "protocolid": if (protocolId == null) protocolId = Integer.parseInt(value); return null; + case "ntt": if (numTolerableTermini == null) numTolerableTermini = Integer.parseInt(value); return null; + case "minpeplength": if (minPeptideLength == null) minPeptideLength = Integer.parseInt(value); return null; + case "maxpeplength": if (maxPeptideLength == null) maxPeptideLength = Integer.parseInt(value); return null; + case "mincharge": if (minCharge == null) minCharge = Integer.parseInt(value); return null; + case "maxcharge": if (maxCharge == null) maxCharge = Integer.parseInt(value); return null; + case "nummatchesperspec": if (numMatchesPerSpec == null) 
numMatchesPerSpec = Integer.parseInt(value); return null; + case "numthreads": if (numThreads == null && !value.equalsIgnoreCase("all")) + numThreads = Integer.parseInt(value); return null; + case "numtasks": if (numTasks == null) numTasks = Integer.parseInt(value); return null; + case "minspectraperthread": if (minSpectraPerThread == null) minSpectraPerThread = Integer.parseInt(value); return null; + case "verbose": if (verbose == null) verbose = Integer.parseInt(value); return null; + case "tda": if (tdaStrategy == null) tdaStrategy = Integer.parseInt(value); return null; + case "addfeatures": if (addFeatures == null) addFeatures = Integer.parseInt(value); return null; + case "outputformat": if (outputFormat == null) outputFormat = value; return null; + case "precursorcal": if (precursorCalMode == null) precursorCalMode = value; return null; + case "chargecarriermass": if (chargeCarrierMass == null) chargeCarrierMass = Double.parseDouble(value); return null; + case "maxmissedcleavages": if (maxMissedCleavages == null) maxMissedCleavages = Integer.parseInt(value); return null; + case "nummods": if (maxNumMods == null) configMaxNumMods = Integer.parseInt(value); return null; + case "allowdensecentroidedpeaks": if (allowDenseCentroidedPeaks == null) allowDenseCentroidedPeaks = Integer.parseInt(value); return null; + case "mslevel": if (msLevel == null) msLevel = IntRange.parse(value); return null; + case "specindex": if (specIndexRange == null) specIndexRange = IntRange.parse(value); return null; + case "edgescore": if (edgeScore == null) edgeScore = Integer.parseInt(value); return null; + case "minnumpeaksperspectrum": if (minNumPeaks == null) minNumPeaks = Integer.parseInt(value); return null; + case "numisoforms": if (numIsoforms == null) numIsoforms = Integer.parseInt(value); return null; + case "ignoremetcleavage": if (ignoreMetCleavage == null) ignoreMetCleavage = Integer.parseInt(value); return null; + case "mindenovoscore": if (minDeNovoScore == null) 
minDeNovoScore = Integer.parseInt(value); return null; default: - if (!key.toLowerCase().startsWith("enzymedef")) { + if (!key.startsWith("enzymedef")) { System.out.println("Warning, unrecognized parameter '" + key + "=" + value + "' in config file " + fileName); unrecognizedConfigEntries++; } @@ -445,35 +434,109 @@ private static String stripComment(String line) { } /** Normalize legacy / alternate config-file keys to canonical form. - * Mirrors the rewrites previously in {@code ParamNameEnum.getParamNameFromLine}. */ + * Returns lowercase so {@link #applyConfigEntry} can match + * case-insensitively (the legacy {@code ParamManager.parseConfigParamFile} + * matched names with {@code equalsIgnoreCase}). Mirrors the alias + * rewrites previously in {@code ParamNameEnum.getParamNameFromLine}. */ private static String canonicalConfigKey(String key) { - if (key.equalsIgnoreCase("IsotopeError")) return "IsotopeErrorRange"; - if (key.equalsIgnoreCase("TargetDecoyAnalysis")) return "TDA"; - if (key.equalsIgnoreCase("FragmentationMethod")) return "FragmentationMethodID"; - if (key.equalsIgnoreCase("Instrument")) return "InstrumentID"; - if (key.equalsIgnoreCase("Enzyme")) return "EnzymeID"; - if (key.equalsIgnoreCase("Protocol")) return "ProtocolID"; - if (key.equalsIgnoreCase("NumTolerableTermini")) return "NTT"; - if (key.equalsIgnoreCase("MinNumPeaks")) return "MinNumPeaksPerSpectrum"; - if (key.equalsIgnoreCase("MaxNumMods")) return "NumMods"; - if (key.equalsIgnoreCase("MaxNumModsPerPeptide")) return "NumMods"; - if (key.equalsIgnoreCase("minLength")) return "MinPepLength"; - if (key.equalsIgnoreCase("MinPeptideLength")) return "MinPepLength"; - if (key.equalsIgnoreCase("maxLength")) return "MaxPepLength"; - if (key.equalsIgnoreCase("MaxPeptideLength")) return "MaxPepLength"; - if (key.equalsIgnoreCase("PMTolerance")) return "PrecursorMassTolerance"; - if (key.equalsIgnoreCase("ParentMassTolerance")) return "PrecursorMassTolerance"; - return key; + String norm = 
key.toLowerCase(java.util.Locale.ROOT); + switch (norm) { + case "isotopeerror": return "isotopeerrorrange"; + case "targetdecoyanalysis": return "tda"; + case "fragmentationmethod": return "fragmentationmethodid"; + case "instrument": return "instrumentid"; + case "enzyme": return "enzymeid"; + case "protocol": return "protocolid"; + case "numtolerabletermini": return "ntt"; + case "minnumpeaks": return "minnumpeaksperspectrum"; + case "maxnummods": return "nummods"; + case "maxnummodsperpeptide": return "nummods"; + case "minlength": return "minpeplength"; + case "minpeptidelength": return "minpeplength"; + case "maxlength": return "maxpeplength"; + case "maxpeptidelength": return "maxpeplength"; + case "pmtolerance": return "precursormasstolerance"; + case "parentmasstolerance": return "precursormasstolerance"; + default: return norm; + } } - /** Validates required-input invariants that the CLI alone can't enforce - * (since {@code -s}/{@code -d} may come from {@code -conf}). */ - public String validateRequired() { + /** Validates required-input invariants and the numeric/enum range + * constraints the legacy {@code IntParameter.minValue}/{@code maxValue} + * and {@code EnumParameter} machinery used to enforce. Returns + * {@code null} on success or a user-facing error string otherwise. + * + *

Required: {@code -s} and {@code -d} (either via CLI or {@code -conf}). + * Numeric flags must satisfy their original lower bounds; enum-shaped + * flags must fall in their defined index range. */ + public String validate() { if (spectrumFile == null) return "Spectrum file is not defined; use -s at the command line or SpectrumFile in a config file"; if (databaseFile == null) return "Database file is not defined; use -d at the command line or DatabaseFile in a config file"; + + String err; + if ((err = checkMin("-thread", numThreads, 1)) != null) return err; + if ((err = checkMin("-tasks", numTasks, -10)) != null) return err; + if ((err = checkMin("-minSpectraPerThread", minSpectraPerThread, 1)) != null) return err; + if ((err = checkMin("-minLength", minPeptideLength, 1)) != null) return err; + if ((err = checkMin("-maxLength", maxPeptideLength, 1)) != null) return err; + if ((err = checkMin("-minCharge", minCharge, 1)) != null) return err; + if ((err = checkMin("-maxCharge", maxCharge, 1)) != null) return err; + if ((err = checkMin("-n", numMatchesPerSpec, 1)) != null) return err; + if ((err = checkMin("-maxMissedCleavages", maxMissedCleavages, -1)) != null) return err; + if ((err = checkMin("-numMods", maxNumMods, 0)) != null) return err; + if ((err = checkMin("-minNumPeaks", minNumPeaks, 0)) != null) return err; + if ((err = checkMin("-iso", numIsoforms, 0)) != null) return err; + if ((err = checkMin("-minDeNovoScore", minDeNovoScore, Integer.MIN_VALUE)) != null) return err; + + if ((err = checkRange("-ntt", numTolerableTermini, 0, 2)) != null) return err; + if ((err = checkRange("-tda", tdaStrategy, 0, 1)) != null) return err; + if ((err = checkRange("-verbose", verbose, 0, 1)) != null) return err; + if ((err = checkRange("-addFeatures", addFeatures, 0, 1)) != null) return err; + if ((err = checkRange("-allowDenseCentroidedPeaks", allowDenseCentroidedPeaks, 0, 1)) != null) return err; + if ((err = checkRange("-edgeScore", edgeScore, 0, 1)) != null) return err; + 
if ((err = checkRange("-ignoreMetCleavage", ignoreMetCleavage, 0, 1)) != null) return err; + if ((err = checkRange("-u", precursorToleranceUnits, 0, 2)) != null) return err; + + if (chargeCarrierMass != null && chargeCarrierMass <= 0.1) { + return "Invalid value for parameter -ccm: " + chargeCarrierMass + " (must be > 0.1)"; + } + + if (fragMethodId != null && (fragMethodId < 0 || fragMethodId > 4)) { + return "Invalid value for parameter -m: " + fragMethodId + " (valid: 0..4)"; + } + int instMax = ActivationMethodAvailability.instCount() - 1; + if (instrumentTypeId != null && (instrumentTypeId < 0 || instrumentTypeId > instMax)) { + return "Invalid value for parameter -inst: " + instrumentTypeId + " (valid: 0.." + instMax + ")"; + } + int enzMax = Enzyme.getAllRegisteredEnzymes().length - 1; + if (enzymeId != null && (enzymeId < 0 || enzymeId > enzMax)) { + return "Invalid value for parameter -e: " + enzymeId + " (valid: 0.." + enzMax + ")"; + } + int protMax = Protocol.getAllRegisteredProtocols().length - 1; + if (protocolId != null && (protocolId < 0 || protocolId > protMax)) { + return "Invalid value for parameter -protocol: " + protocolId + " (valid: 0.." + protMax + ")"; + } + return null; + } + + private static String checkMin(String flag, Integer value, int min) { + if (value == null) return null; + if (value < min) return "Invalid value for parameter " + flag + ": " + value + " (must be >= " + min + ")"; return null; } + private static String checkRange(String flag, Integer value, int min, int max) { + if (value == null) return null; + if (value < min || value > max) return "Invalid value for parameter " + flag + ": " + value + " (valid: " + min + ".." + max + ")"; + return null; + } + + /** Helper that hides the {@link InstrumentType#getAllRegisteredInstrumentTypes} + * call from {@code validate()} so the import block stays minimal. 
*/ + private static final class ActivationMethodAvailability { + static int instCount() { return InstrumentType.getAllRegisteredInstrumentTypes().length; } + } + /** Mutator used by {@code AminoAcidSet} when the parsed mod metadata * changes the effective max-num-mods (the AA set is authoritative once * loaded). Mirrors the legacy {@code ParamManager.setMaxNumMods}. */ diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index 9897f010..55647240 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -330,8 +330,11 @@ public String parse(MSGFPlusOptions opts) { if (err != null) return err; } - // Required-input check now that CLI + config-file have both run. - String requiredErr = opts.validateRequired(); + // Required-input + numeric/enum range check now that CLI + + // config-file have both run. Catches things like -m 99 with a + // user-facing error instead of the IllegalArgumentException + // the resolver would otherwise raise during search setup. + String requiredErr = opts.validate(); if (requiredErr != null) return requiredErr; chargeCarrierMass = opts.effectiveChargeCarrierMass(); diff --git a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java index c900ff01..d62867eb 100644 --- a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java +++ b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java @@ -57,4 +57,68 @@ public void configFileWithCustomAAParsesWithoutCrashing() throws IOException, UR Files.deleteIfExists(conf); Files.deleteIfExists(tmpDir); } + + /** + * Regression for the case-insensitive config-key match. 
The legacy + * {@code ParamManager.parseConfigParamFile} matched names with + * {@code equalsIgnoreCase}; the Phase 4c switch was exact-case so + * {@code minCharge=} / {@code maxCharge=} from the test fixture + * silently fell back to defaults instead of overriding them. + */ + @Test + public void configFileKeysAreMatchedCaseInsensitively() throws IOException { + Path tmpDir = Files.createTempDirectory("msgfplus-caseinsens-"); + Path conf = tmpDir.resolve("mixed_case.txt"); + // Mix of canonical, lowercased-first-letter, and ALLCAPS forms. + Files.write(conf, ("MinPepLength=8\n" + + "maxpepLength=42\n" + + "MINCHARGE=3\n" + + "maxcharge=7\n" + + "TDA=1\n").getBytes(StandardCharsets.UTF_8)); + + MSGFPlusOptions opts = new MSGFPlusOptions(); + Assert.assertNull(opts.applyConfigFile(conf.toFile())); + + Assert.assertEquals(8, opts.effectiveMinPeptideLength()); + Assert.assertEquals(42, opts.effectiveMaxPeptideLength()); + Assert.assertEquals(3, opts.effectiveMinCharge()); + Assert.assertEquals(7, opts.effectiveMaxCharge()); + Assert.assertEquals(1, opts.effectiveTdaStrategy()); + + Files.deleteIfExists(conf); + Files.deleteIfExists(tmpDir); + } + + /** + * Pin the numeric/enum range validation that the legacy + * {@code IntParameter.minValue}/{@code maxValue} machinery used to + * enforce. After Phase 4c those checks initially disappeared; restoring + * them ensures invalid CLI input produces a clean error string instead + * of a stack trace from a downstream resolver. 
+ */ + @Test + public void validateRejectsOutOfRangeFlags() { + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.spectrumFile = new File("anything.mgf"); + opts.databaseFile = new File("anything.fasta"); + + opts.numThreads = 0; + Assert.assertNotNull("numThreads=0 must be rejected", opts.validate()); + opts.numThreads = null; + + opts.fragMethodId = 99; + Assert.assertNotNull("-m 99 must be rejected with a user-facing error", opts.validate()); + opts.fragMethodId = null; + + opts.numTolerableTermini = 5; + Assert.assertNotNull("-ntt 5 must be rejected (valid 0..2)", opts.validate()); + opts.numTolerableTermini = null; + + opts.tdaStrategy = 2; + Assert.assertNotNull("-tda 2 must be rejected (valid 0..1)", opts.validate()); + opts.tdaStrategy = null; + + // A clean invocation passes. + Assert.assertNull(opts.validate()); + } } diff --git a/src/test/resources/MSGFDB_Param.txt b/src/test/resources/MSGFDB_Param.txt index 8db2b7b9..6d33882a 100644 --- a/src/test/resources/MSGFDB_Param.txt +++ b/src/test/resources/MSGFDB_Param.txt @@ -1,97 +1,97 @@ -#Precursor mass tolerance -# Examples: 2.5Da or 30ppm -# Use comma to set asymmetric values, for example "0.5Da,2.5Da" will set 0.5Da to the left (expMass<theoMass) and 2.5Da to the right (expMass>theoMass) -PrecursorMassTolerance=20ppm - -#Max Number of Modifications per peptide -# If this value is large, the search will be slow -NumMods=4 - -#Modifications (see below for examples) -StaticMod=C2H3N1O1, C, fix, any, Carbamidomethyl # Fixed Carbamidomethyl C (alkylation) -StaticMod=229.1629, *, fix, N-term, TMT6plex -StaticMod=229.1629, K, fix, any, TMT6plex - -DynamicMod=O1, M, opt, any, Oxidation # Oxidized methionine -DynamicMod=-187.152366, K, opt, any, AcNoTMT # Residue tagged by MSGF+ with static TMT6, but is actually acetylated and does not have TMT - -#Fragmentation Method -# 0 means as written in the spectrum or CID if no info (Default) -# 1 means CID -# 2 means ETD -# 3 means HCD -# 4 means Merge spectra from the same precursor (e.g. 
CID/ETD pairs, CID/HCD/ETD triplets) -# (note that for Thermo instruments this is always set to 0 by the Analysis Manager since a _ScanType.txt file is created with this information on a per-scan basis) -FragmentationMethodID=0 - -#Instrument ID -# 0 means Low-res LCQ/LTQ (Default for CID and ETD); use InstrumentID=0 if analyzing a dataset with low-res CID and high-res HCD spectra -# 1 means High-res LTQ (Default for HCD; also appropriate for high res CID). Do not merge spectra (FragMethod=4) when InstrumentID is 1; scores will degrade -# 2 means TOF -# 3 means Q-Exactive -# (note that this is automatically updated by the Analysis Manager based on the instrument type and MSn scan types present for a given dataset) -InstrumentID=1 - -#Enzyme ID -# 0 means No enzyme used -# 1 means Trypsin (Default); use this along with NTT=0 for a no-enzyme search of a tryptically digested sample -# 2: Chymotrypsin, 3: Lys-C, 4: Lys-N, 5: Glu-C, 6: Arg-C, 7: Asp-N, 8: alphaLP, 9: No Enzyme (for peptidomics) -EnzymeID=1 - -#Isotope error range -# Takes into account of the error introduced by choosing non-monoisotopic peak for fragmentation. -# Useful for accurate precursor ion masses -# Ignored if the parent mass tolerance is > 0.5Da or 500ppm -# The combination of -t and -ti determins the precursor mass tolerance. -# e.g. "-t 20ppm -ti -1,2" tests abs(exp-calc-n*1.00335Da)<20ppm for n=-1, 0, 1, 2. 
-IsotopeErrorRange=-1,2 - -#Number of tryptic termini -# The number of peptide termini that must have been cleaved by the enzyme (default 1) -# For trypsin, 2 means fully tryptic only, 1 means partially tryptic, and 0 means no-enzyme search -NTT=2 - -#Target/Decoy search mode -# 0 means don't search decoy database (default) -# 1 means search decoy database to compute FDR (source FASTA file must be forward-only proteins) -TDA=1 - -#Number of Threads (by default, uses all available cores) -#In DMS, a settings file entry for MSGFDBThreads will override this value -NumThreads=All - -#Minimum peptide length to consider -MinPepLength=9 - -#Maximum peptide length to consider -MaxPepLength=50 - -#Minimum precursor charge to consider (if not specified in the spectrum) -minCharge=2 - -#Maximum precursor charge to consider (if not specified in the spectrum) -maxCharge=5 - -#Number of matches per spectrum to be reported -#If this value is greater than 1 then the FDR values computed by MS-GF+ will be skewed by high-scoring 2nd and 3rd hits -NumMatchesPerSpec=2 - -#Amino Acid Modification Examples -# Specify static modifications using one or more StaticMod= entries -# Specify dynamic modifications using one or more DynamicMod= entries -# Modification format is: -# Mass or CompositionString, Residues, ModType, Position, Name (all the five fields are required). -# CompositionString can only contain a limited set of elements, primarily C H N O S or P -# -# Examples: -# C2H3N1O1, C, fix, any, Carbamidomethyl # Fixed Carbamidomethyl C (alkylation) -# O1, M, opt, any, Oxidation # Oxidation M -# 15.994915, M, opt, any, Oxidation # Oxidation M (mass is used instead of CompositionStr) -# H-1N-1O1, NQ, opt, any, Deamidated # Negative numbers are allowed. 
-# CH2, K, opt, any, Methyl # Methylation K -# C2H2O1, K, opt, any, Acetyl # Acetylation K -# HO3P, STY,opt, any, Phospho # Phosphorylation STY -# C2H3NO, *, opt, N-term, Carbamidomethyl # Variable Carbamidomethyl N-term -# H-2O-1, E, opt, N-term, Glu->pyro-Glu # Pyro-glu from E -# H-3N-1, Q, opt, N-term, Gln->pyro-Glu # Pyro-glu from Q -# C2H2O, *, opt, Prot-N-term, Acetyl # Acetylation Protein N-term +#Precursor mass tolerance +# Examples: 2.5Da or 30ppm +# Use comma to set asymmetric values, for example "0.5Da,2.5Da" will set 0.5Da to the left (expMass<theoMass) and 2.5Da to the right (expMass>theoMass) +PrecursorMassTolerance=20ppm + +#Max Number of Modifications per peptide +# If this value is large, the search will be slow +NumMods=4 + +#Modifications (see below for examples) +StaticMod=C2H3N1O1, C, fix, any, Carbamidomethyl # Fixed Carbamidomethyl C (alkylation) +StaticMod=229.1629, *, fix, N-term, TMT6plex +StaticMod=229.1629, K, fix, any, TMT6plex + +DynamicMod=O1, M, opt, any, Oxidation # Oxidized methionine +DynamicMod=-187.152366, K, opt, any, AcNoTMT # Residue tagged by MSGF+ with static TMT6, but is actually acetylated and does not have TMT + +#Fragmentation Method +# 0 means as written in the spectrum or CID if no info (Default) +# 1 means CID +# 2 means ETD +# 3 means HCD +# 4 means Merge spectra from the same precursor (e.g. CID/ETD pairs, CID/HCD/ETD triplets) +# (note that for Thermo instruments this is always set to 0 by the Analysis Manager since a _ScanType.txt file is created with this information on a per-scan basis) +FragmentationMethodID=0 + +#Instrument ID +# 0 means Low-res LCQ/LTQ (Default for CID and ETD); use InstrumentID=0 if analyzing a dataset with low-res CID and high-res HCD spectra +# 1 means High-res LTQ (Default for HCD; also appropriate for high res CID). 
Do not merge spectra (FragMethod=4) when InstrumentID is 1; scores will degrade +# 2 means TOF +# 3 means Q-Exactive +# (note that this is automatically updated by the Analysis Manager based on the instrument type and MSn scan types present for a given dataset) +InstrumentID=1 + +#Enzyme ID +# 0 means No enzyme used +# 1 means Trypsin (Default); use this along with NTT=0 for a no-enzyme search of a tryptically digested sample +# 2: Chymotrypsin, 3: Lys-C, 4: Lys-N, 5: Glu-C, 6: Arg-C, 7: Asp-N, 8: alphaLP, 9: No Enzyme (for peptidomics) +EnzymeID=1 + +#Isotope error range +# Takes into account of the error introduced by choosing non-monoisotopic peak for fragmentation. +# Useful for accurate precursor ion masses +# Ignored if the parent mass tolerance is > 0.5Da or 500ppm +# The combination of -t and -ti determins the precursor mass tolerance. +# e.g. "-t 20ppm -ti -1,2" tests abs(exp-calc-n*1.00335Da)<20ppm for n=-1, 0, 1, 2. +IsotopeErrorRange=-1,2 + +#Number of tryptic termini +# The number of peptide termini that must have been cleaved by the enzyme (default 1) +# For trypsin, 2 means fully tryptic only, 1 means partially tryptic, and 0 means no-enzyme search +NTT=2 + +#Target/Decoy search mode +# 0 means don't search decoy database (default) +# 1 means search decoy database to compute FDR (source FASTA file must be forward-only proteins) +TDA=1 + +#Number of Threads (by default, uses all available cores) +#In DMS, a settings file entry for MSGFDBThreads will override this value +NumThreads=All + +#Minimum peptide length to consider +MinPepLength=9 + +#Maximum peptide length to consider +MaxPepLength=50 + +#Minimum precursor charge to consider (if not specified in the spectrum) +minCharge=2 + +#Maximum precursor charge to consider (if not specified in the spectrum) +maxCharge=5 + +#Number of matches per spectrum to be reported +#If this value is greater than 1 then the FDR values computed by MS-GF+ will be skewed by high-scoring 2nd and 3rd hits 
+NumMatchesPerSpec=2 + +#Amino Acid Modification Examples +# Specify static modifications using one or more StaticMod= entries +# Specify dynamic modifications using one or more DynamicMod= entries +# Modification format is: +# Mass or CompositionString, Residues, ModType, Position, Name (all the five fields are required). +# CompositionString can only contain a limited set of elements, primarily C H N O S or P +# +# Examples: +# C2H3N1O1, C, fix, any, Carbamidomethyl # Fixed Carbamidomethyl C (alkylation) +# O1, M, opt, any, Oxidation # Oxidation M +# 15.994915, M, opt, any, Oxidation # Oxidation M (mass is used instead of CompositionStr) +# H-1N-1O1, NQ, opt, any, Deamidated # Negative numbers are allowed. +# CH2, K, opt, any, Methyl # Methylation K +# C2H2O1, K, opt, any, Acetyl # Acetylation K +# HO3P, STY,opt, any, Phospho # Phosphorylation STY +# C2H3NO, *, opt, N-term, Carbamidomethyl # Variable Carbamidomethyl N-term +# H-2O-1, E, opt, N-term, Glu->pyro-Glu # Pyro-glu from E +# H-3N-1, Q, opt, N-term, Gln->pyro-Glu # Pyro-glu from Q +# C2H2O, *, opt, Prot-N-term, Acetyl # Acetylation Protein N-term From 8330bc3473e3750da2112a47972918e2d7b21395 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 11:02:25 +0100 Subject: [PATCH 26/34] refactor(cli): typed enums for -outputFormat and -precursorCal Drop the lingering String + numeric backcompat for the two enum-shaped flags whose values are real names rather than IDs (per user direction: 'unless are options like 1 2 3 we should do only string'). After this commit: - -outputFormat accepts only `pin` / `tsv` (case-insensitive). The legacy numeric forms `0` / `1` are no longer recognised; users on those invocations should switch to the named values. This is a deliberate breaking change called out in the parameter-modernization cleanup -- consistency over backcompat in this corner. 
- -precursorCal continues to accept only `auto` / `on` / `off` (case-insensitive), but now via picocli's typed enum matcher rather than a String + fromString fallback. Invalid values fail fast at parse time instead of silently mapping to AUTO. Numeric-ID flags (-m, -inst, -e, -protocol) and 0/1 boolean-style flags (-tda, -verbose, -addFeatures, -allowDenseCentroidedPeaks, -edgeScore, -ignoreMetCleavage, -u, -ntt) keep their integer types -- those values are IDs, not names. Implementation: - New cli.OutputFormat enum (PIN, TSV). - MSGFPlusOptions.outputFormat: String -> OutputFormat. - MSGFPlusOptions.precursorCalMode: String -> SearchParams.PrecursorCalMode. - effectiveOutputFormat() now returns OutputFormat (was int 0/1). - effectivePrecursorCalRaw() collapsed into effectivePrecursorCal() returning the typed enum. - applyConfigEntry parses both flags via Enum.valueOf so config-file values like `OutputFormat=pin` and `PrecursorCal=auto` flow through the same case-insensitive contract as the CLI. - SearchParams.outputFormat field: int -> OutputFormat. writePin() / writeTsv() helpers retained (callers in MSGFPlus.runMSGFPlus). - SearchParams.PrecursorCalMode.fromString() deleted -- no callers after the resolver returns the typed enum directly. - New static factory MSGFPlusOptions.commandLine(opts) returns a CommandLine with caseInsensitiveEnumValuesAllowed(true). All call sites (MSGFPlus.main + 5 test files) routed through it so enum case-insensitivity is uniform. - docs/output.md updated to show `-outputFormat pin` / `tsv` and notes the numeric forms are no longer accepted. Tests: TestDirectPinWriter.outputFormatAcceptsOnlyPinAndTsv pins the new contract (numeric/legacy values rejected, named values accepted case-insensitively). TestPrecursorCalScaffolding migrated to enum constants and to a picocli rejection check for invalid values. The old fromString-fallback test is replaced by the rejection test. Scoped sweep: 78 tests, 0 failures, 0 errors, 4 skipped. 
--- docs/output.md | 6 ++- .../java/edu/ucsd/msjava/cli/MSGFPlus.java | 2 +- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 28 ++++++------- .../edu/ucsd/msjava/cli/OutputFormat.java | 17 ++++++++ .../ucsd/msjava/msdbsearch/SearchParams.java | 37 ++++-------------- src/test/java/msgfplus/TestCollaboration.java | 2 +- .../java/msgfplus/TestDirectPinWriter.java | 39 +++++++++++-------- src/test/java/msgfplus/TestIPRG.java | 2 +- .../msgfplus/TestMinSpectraPerThread.java | 4 +- src/test/java/msgfplus/TestPercolator.java | 2 +- .../msgfplus/TestPrecursorCalIntegration.java | 7 ++-- .../msgfplus/TestPrecursorCalScaffolding.java | 30 +++++++------- 12 files changed, 94 insertions(+), 82 deletions(-) create mode 100644 src/main/java/edu/ucsd/msjava/cli/OutputFormat.java diff --git a/docs/output.md b/docs/output.md index bb840273..f091479c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,8 +8,10 @@ Select the format with `-outputFormat`: | Flag | Format | Extension | Typical use | |---|---|---|---| -| `-outputFormat 0` (default) | Percolator `.pin` | `.pin` | Feed to Percolator / MS²Rescore / Mokapot for FDR-calibrated rescoring | -| `-outputFormat 1` | Tab-separated values | `.tsv` | Direct inspection / downstream tools that consume TSV | +| `-outputFormat pin` (default) | Percolator `.pin` | `.pin` | Feed to Percolator / MS²Rescore / Mokapot for FDR-calibrated rescoring | +| `-outputFormat tsv` | Tab-separated values | `.tsv` | Direct inspection / downstream tools that consume TSV | + +`-outputFormat` accepts the named values `pin` and `tsv` (case-insensitive). Numeric forms (`0`, `1`) accepted by older releases are no longer recognised — pass the named value instead. The output path (`-o`) must use the matching extension. If `-o` is omitted, MS-GF+ writes `.pin` (or `.tsv`) in the spectrum file's directory. 
diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java index bdc330cb..a75de448 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java @@ -53,7 +53,7 @@ public static void main(String argv[]) { argvSnapshot = argv == null ? new String[0] : argv.clone(); MSGFPlusOptions opts = new MSGFPlusOptions(); - CommandLine cl = new CommandLine(opts); + CommandLine cl = MSGFPlusOptions.commandLine(opts); if (argv.length == 0) { printToolInfo(); diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index 0991add6..453b0d87 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -1,9 +1,11 @@ package edu.ucsd.msjava.cli; +import edu.ucsd.msjava.msdbsearch.SearchParams.PrecursorCalMode; import edu.ucsd.msjava.msutil.ActivationMethod; import edu.ucsd.msjava.msutil.Enzyme; import edu.ucsd.msjava.msutil.InstrumentType; import edu.ucsd.msjava.msutil.Protocol; +import picocli.CommandLine; import picocli.CommandLine.Command; import picocli.CommandLine.Option; @@ -31,6 +33,13 @@ description = "MS-GF+: peptide identification by database search of mass spectra.") public final class MSGFPlusOptions { + /** Build a {@link CommandLine} configured for MS-GF+: enums match + * case-insensitively (so {@code -outputFormat pin} and {@code -outputFormat PIN} + * both work) and the parser uses the standard MS-GF+ usage layout. 
*/ + public static CommandLine commandLine(MSGFPlusOptions opts) { + return new CommandLine(opts).setCaseInsensitiveEnumValuesAllowed(true); + } + // ---------- input (required at runtime, but may be provided via -conf) ---------- @Option(names = "-s", paramLabel = "SpectrumFile", @@ -154,11 +163,11 @@ public final class MSGFPlusOptions { @Option(names = "-outputFormat", paramLabel = "Format", description = "Output format: pin (Default) or tsv") - public String outputFormat; + public OutputFormat outputFormat; @Option(names = "-precursorCal", paramLabel = "Mode", description = "Precursor calibration mode: auto (Default), on, off") - public String precursorCalMode; + public PrecursorCalMode precursorCalMode; @Option(names = "-ccm", paramLabel = "Mass", description = "Charge carrier mass; Default: 1.00727649 (proton)") @@ -252,15 +261,8 @@ public final class MSGFPlusOptions { public double effectiveChargeCarrierMass() { return chargeCarrierMass != null ? chargeCarrierMass : 1.00727649; } public String effectiveDecoyPrefix() { return decoyPrefix != null ? decoyPrefix : "XXX"; } - public String effectivePrecursorCalRaw() { return precursorCalMode != null ? precursorCalMode : "auto"; } - - /** 0 = pin (default), 1 = tsv. */ - public int effectiveOutputFormat() { - if (outputFormat == null) return 0; - String n = outputFormat.trim().toLowerCase(); - if (n.equals("tsv") || n.equals("1")) return 1; - return 0; - } + public PrecursorCalMode effectivePrecursorCal() { return precursorCalMode != null ? precursorCalMode : PrecursorCalMode.AUTO; } + public OutputFormat effectiveOutputFormat() { return outputFormat != null ? outputFormat : OutputFormat.PIN; } public PrecursorTolerance effectivePrecursorTolerance() { return precursorTolerance != null ? 
precursorTolerance : PrecursorTolerance.parse("20ppm"); @@ -403,8 +405,8 @@ private String applyConfigEntry(String key, String value, String fileName) { case "verbose": if (verbose == null) verbose = Integer.parseInt(value); return null; case "tda": if (tdaStrategy == null) tdaStrategy = Integer.parseInt(value); return null; case "addfeatures": if (addFeatures == null) addFeatures = Integer.parseInt(value); return null; - case "outputformat": if (outputFormat == null) outputFormat = value; return null; - case "precursorcal": if (precursorCalMode == null) precursorCalMode = value; return null; + case "outputformat": if (outputFormat == null) outputFormat = OutputFormat.valueOf(value.trim().toUpperCase(java.util.Locale.ROOT)); return null; + case "precursorcal": if (precursorCalMode == null) precursorCalMode = PrecursorCalMode.valueOf(value.trim().toUpperCase(java.util.Locale.ROOT)); return null; case "chargecarriermass": if (chargeCarrierMass == null) chargeCarrierMass = Double.parseDouble(value); return null; case "maxmissedcleavages": if (maxMissedCleavages == null) maxMissedCleavages = Integer.parseInt(value); return null; case "nummods": if (maxNumMods == null) configMaxNumMods = Integer.parseInt(value); return null; diff --git a/src/main/java/edu/ucsd/msjava/cli/OutputFormat.java b/src/main/java/edu/ucsd/msjava/cli/OutputFormat.java new file mode 100644 index 00000000..2e570882 --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/cli/OutputFormat.java @@ -0,0 +1,17 @@ +package edu.ucsd.msjava.cli; + +/** + * Search output format selected by {@code -outputFormat}. Picocli matches + * incoming values case-insensitively because {@code MSGFPlusOptions.commandLine} + * calls {@code CommandLine.setCaseInsensitiveEnumValuesAllowed(true)}. + * + * <p>Numeric forms ({@code 0} / {@code 1}) accepted by older releases are + * intentionally not supported. Users on legacy invocations should switch + * to the named values. + */ +public enum OutputFormat { + /** Percolator {@code .pin} (default). */ + PIN, + /** Tab-separated values, direct inspection / downstream tools. */ + TSV +} diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index 55647240..1bcdda6c 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -2,6 +2,7 @@ import edu.ucsd.msjava.cli.IntRange; import edu.ucsd.msjava.cli.MSGFPlusOptions; +import edu.ucsd.msjava.cli.OutputFormat; import edu.ucsd.msjava.cli.PrecursorTolerance; import edu.ucsd.msjava.msgf.Tolerance; import edu.ucsd.msjava.msutil.*; @@ -32,28 +33,7 @@ public class SearchParams { public enum PrecursorCalMode { AUTO, ON, - OFF; - - /** - * Case-insensitive string to enum conversion. Unknown values fall - * back to {@link #AUTO} so that downstream code never crashes if a - * typo slips past CLI parsing. - */ - public static PrecursorCalMode fromString(String s) { - if (s == null) return AUTO; - String normalized = s.trim().toLowerCase(); - switch (normalized) { - case "on": - return ON; - case "off": - return OFF; - case "auto": - case "": - return AUTO; - default: - return AUTO; - } - } + OFF } private List dbSearchIOList; @@ -94,7 +74,7 @@ public static PrecursorCalMode fromString(String s) { private boolean allowDenseCentroidedPeaks; private int minMSLevel; private int maxMSLevel; - private int outputFormat; // 0=pin (default), 1=tsv — mzid output removed + private OutputFormat outputFormat; private PrecursorCalMode precursorCalMode = PrecursorCalMode.AUTO; public SearchParams() { @@ -287,17 +267,16 @@ public int getMaxMSLevel() { return maxMSLevel; } - /** 0=pin (default), 1=tsv. 
*/ - public int getOutputFormat() { + public OutputFormat getOutputFormat() { return outputFormat; } public boolean writeTsv() { - return outputFormat == 1; + return outputFormat == OutputFormat.TSV; } public boolean writePin() { - return outputFormat == 0; + return outputFormat == OutputFormat.PIN; } /** @@ -350,7 +329,7 @@ public String parse(MSGFPlusOptions opts) { } dbSearchIOList = new ArrayList<>(); - String defaultExt = outputFormat == 1 ? ".tsv" : ".pin"; + String defaultExt = outputFormat == OutputFormat.TSV ? ".tsv" : ".pin"; if (!specPath.isDirectory()) { SpecFileFormat specFormat = SpecFileFormat.getSpecFileFormat(specPath.getName()); @@ -487,7 +466,7 @@ public String parse(MSGFPlusOptions opts) { } allowDenseCentroidedPeaks = opts.effectiveAllowDenseCentroidedPeaks() == 1; - precursorCalMode = PrecursorCalMode.fromString(opts.effectivePrecursorCalRaw()); + precursorCalMode = opts.effectivePrecursorCal(); IntRange ms = opts.effectiveMSLevel(); minMSLevel = ms.min; diff --git a/src/test/java/msgfplus/TestCollaboration.java b/src/test/java/msgfplus/TestCollaboration.java index 246ac30b..7edb50db 100644 --- a/src/test/java/msgfplus/TestCollaboration.java +++ b/src/test/java/msgfplus/TestCollaboration.java @@ -28,7 +28,7 @@ public void testSujunLiIndiana() MSGFPlusOptions paramManager = new MSGFPlusOptions(); - String msg = null; new CommandLine(paramManager).parseArgs(argv); + String msg = null; MSGFPlusOptions.commandLine(paramManager).parseArgs(argv); if(msg != null) System.out.println(msg); assertTrue(msg == null); diff --git a/src/test/java/msgfplus/TestDirectPinWriter.java b/src/test/java/msgfplus/TestDirectPinWriter.java index 09b72600..14b9d76c 100644 --- a/src/test/java/msgfplus/TestDirectPinWriter.java +++ b/src/test/java/msgfplus/TestDirectPinWriter.java @@ -1,12 +1,14 @@ package msgfplus; import edu.ucsd.msjava.cli.MSGFPlusOptions; +import edu.ucsd.msjava.cli.OutputFormat; import edu.ucsd.msjava.msdbsearch.DatabaseMatch; import 
edu.ucsd.msjava.msdbsearch.SearchParams; import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.ActivationMethod; import edu.ucsd.msjava.msutil.Enzyme; import edu.ucsd.msjava.output.DirectPinWriter; +import picocli.CommandLine; import org.junit.Assert; import org.junit.Test; @@ -42,14 +44,14 @@ private MSGFPlusOptions buildOpts() throws URISyntaxException { @Test public void pinOutputFormatFlagIsAccepted() throws URISyntaxException { MSGFPlusOptions opts = buildOpts(); - opts.outputFormat = "pin"; - Assert.assertEquals(0, opts.effectiveOutputFormat()); + opts.outputFormat = OutputFormat.PIN; + Assert.assertEquals(OutputFormat.PIN, opts.effectiveOutputFormat()); } @Test public void writePinGetterReflectsOutputFormat() throws URISyntaxException { MSGFPlusOptions opts = buildOpts(); - opts.outputFormat = "pin"; + opts.outputFormat = OutputFormat.PIN; SearchParams params = new SearchParams(); Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); @@ -59,26 +61,31 @@ public void writePinGetterReflectsOutputFormat() throws URISyntaxException { } @Test - public void allOutputFormatEnumIndicesAreAccepted() throws URISyntaxException { - // Valid outputFormat values after mzid removal: pin (default) and tsv. - for (String value : new String[]{"pin", "tsv", "0", "1"}) { - MSGFPlusOptions opts = buildOpts(); - opts.outputFormat = value; - int eff = opts.effectiveOutputFormat(); - Assert.assertTrue("'" + value + "' should map to 0 or 1 but got " + eff, eff == 0 || eff == 1); + public void outputFormatAcceptsOnlyPinAndTsv() throws URISyntaxException { + // Picocli matches enum values case-insensitively because MSGFPlusOptions.commandLine enables setCaseInsensitiveEnumValuesAllowed(true). 
+ for (String value : new String[]{"pin", "PIN", "Pin", "tsv", "TSV", "Tsv"}) { + MSGFPlusOptions opts = new MSGFPlusOptions(); + MSGFPlusOptions.commandLine(opts).parseArgs("-outputFormat", value); + Assert.assertNotNull("'" + value + "' should parse to a valid OutputFormat", opts.outputFormat); } - // Regression gate: old "mzid" and "both" (2, 3) collapse to pin. - for (String value : new String[]{"mzid", "both", "2", "3"}) { - MSGFPlusOptions opts = buildOpts(); - opts.outputFormat = value; - Assert.assertEquals("Removed format '" + value + "' must collapse to pin (0)", 0, opts.effectiveOutputFormat()); + // Numeric forms (0/1) and removed legacy values (mzid, both, 2, 3) are + // intentionally rejected -- the typed enum is part of the consistency + // sweep called out in the parameter-modernization cleanup. + for (String value : new String[]{"0", "1", "2", "3", "mzid", "both", ""}) { + MSGFPlusOptions opts = new MSGFPlusOptions(); + try { + MSGFPlusOptions.commandLine(opts).parseArgs("-outputFormat", value); + Assert.fail("'" + value + "' should be rejected by picocli enum matching"); + } catch (CommandLine.ParameterException expected) { + // ok + } } } @Test public void pinHeaderColumnsIncludeRequiredPercolatorFields() throws Exception { MSGFPlusOptions opts = buildOpts(); - opts.outputFormat = "pin"; + opts.outputFormat = OutputFormat.PIN; SearchParams params = new SearchParams(); Assert.assertNull(params.parse(opts)); diff --git a/src/test/java/msgfplus/TestIPRG.java b/src/test/java/msgfplus/TestIPRG.java index 8180ead0..51b46496 100644 --- a/src/test/java/msgfplus/TestIPRG.java +++ b/src/test/java/msgfplus/TestIPRG.java @@ -34,7 +34,7 @@ public void countProteins() MSGFPlusOptions paramManager = new MSGFPlusOptions(); - String msg = null; new CommandLine(paramManager).parseArgs(argv); + String msg = null; MSGFPlusOptions.commandLine(paramManager).parseArgs(argv); if(msg != null) System.err.println("Error: " + msg); assertTrue(msg == null); diff --git 
a/src/test/java/msgfplus/TestMinSpectraPerThread.java b/src/test/java/msgfplus/TestMinSpectraPerThread.java index 42863ed4..eea5074e 100644 --- a/src/test/java/msgfplus/TestMinSpectraPerThread.java +++ b/src/test/java/msgfplus/TestMinSpectraPerThread.java @@ -16,7 +16,7 @@ public void defaultIs250() { @Test public void overrideAppliesThroughGetter() { MSGFPlusOptions opts = new MSGFPlusOptions(); - new CommandLine(opts).parseArgs("-minSpectraPerThread", "50"); + MSGFPlusOptions.commandLine(opts).parseArgs("-minSpectraPerThread", "50"); Assert.assertEquals(50, opts.effectiveMinSpectraPerThread()); } @@ -26,7 +26,7 @@ public void parsesZero() { // so '0' is parseable here. Range checks moved to SearchParams.parse // (which would reject zero earlier in the search-engine flow if needed). MSGFPlusOptions opts = new MSGFPlusOptions(); - new CommandLine(opts).parseArgs("-minSpectraPerThread", "0"); + MSGFPlusOptions.commandLine(opts).parseArgs("-minSpectraPerThread", "0"); Assert.assertEquals(0, opts.effectiveMinSpectraPerThread()); } } diff --git a/src/test/java/msgfplus/TestPercolator.java b/src/test/java/msgfplus/TestPercolator.java index 2ab91cd3..b61d23e7 100644 --- a/src/test/java/msgfplus/TestPercolator.java +++ b/src/test/java/msgfplus/TestPercolator.java @@ -22,7 +22,7 @@ public void testAddFeatures() throws URISyntaxException { String[] argv = {"-s", specFile.getPath(), "-d", dbFile.getPath(), "-addFeatures", "1", "-m", "3"}; MSGFPlusOptions opts = new MSGFPlusOptions(); - new CommandLine(opts).parseArgs(argv); + MSGFPlusOptions.commandLine(opts).parseArgs(argv); assertTrue(MSGFPlus.runMSGFPlus(opts) == null); } diff --git a/src/test/java/msgfplus/TestPrecursorCalIntegration.java b/src/test/java/msgfplus/TestPrecursorCalIntegration.java index d20e34ed..fb85c668 100644 --- a/src/test/java/msgfplus/TestPrecursorCalIntegration.java +++ b/src/test/java/msgfplus/TestPrecursorCalIntegration.java @@ -2,6 +2,7 @@ import edu.ucsd.msjava.cli.MSGFPlus; import 
edu.ucsd.msjava.cli.MSGFPlusOptions; +import edu.ucsd.msjava.msdbsearch.SearchParams.PrecursorCalMode; import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.DBSearchIOFiles; import edu.ucsd.msjava.msutil.SpecFileFormat; @@ -63,7 +64,7 @@ public void precursorCalOffMatchesBaseline() throws Exception { File baselineOut = new File(workDir.toFile(), "baseline.pin"); MSGFPlusOptions offManager = buildOpts(offOut); - offManager.precursorCalMode = "off"; + offManager.precursorCalMode = PrecursorCalMode.OFF; String offErr = MSGFPlus.runMSGFPlus(offManager); Assert.assertNull("runMSGFPlus(off) failed: " + offErr, offErr); Assert.assertTrue("off.pin must exist", offOut.exists()); @@ -106,11 +107,11 @@ public void precursorCalOffIsDeterministic() throws Exception { File secondOut = new File(workDir.toFile(), "second.pin"); MSGFPlusOptions firstManager = buildOpts(firstOut); - firstManager.precursorCalMode = "off"; + firstManager.precursorCalMode = PrecursorCalMode.OFF; Assert.assertNull(MSGFPlus.runMSGFPlus(firstManager)); MSGFPlusOptions secondManager = buildOpts(secondOut); - secondManager.precursorCalMode = "off"; + secondManager.precursorCalMode = PrecursorCalMode.OFF; Assert.assertNull(MSGFPlus.runMSGFPlus(secondManager)); List firstPsms = extractPsmItems(firstOut); diff --git a/src/test/java/msgfplus/TestPrecursorCalScaffolding.java b/src/test/java/msgfplus/TestPrecursorCalScaffolding.java index 102f3b0b..8f1c5e80 100644 --- a/src/test/java/msgfplus/TestPrecursorCalScaffolding.java +++ b/src/test/java/msgfplus/TestPrecursorCalScaffolding.java @@ -50,7 +50,7 @@ public void precursorCalDefaultIsAuto() throws URISyntaxException { @Test public void precursorCalOnIsParsed() throws URISyntaxException { MSGFPlusOptions opts = buildOpts(); - opts.precursorCalMode = "on"; + opts.precursorCalMode = PrecursorCalMode.ON; SearchParams params = new SearchParams(); Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); 
Assert.assertEquals(PrecursorCalMode.ON, params.getPrecursorCalMode()); @@ -59,7 +59,7 @@ public void precursorCalOnIsParsed() throws URISyntaxException { @Test public void precursorCalOffIsParsed() throws URISyntaxException { MSGFPlusOptions opts = buildOpts(); - opts.precursorCalMode = "off"; + opts.precursorCalMode = PrecursorCalMode.OFF; SearchParams params = new SearchParams(); Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); Assert.assertEquals(PrecursorCalMode.OFF, params.getPrecursorCalMode()); @@ -67,20 +67,24 @@ public void precursorCalOffIsParsed() throws URISyntaxException { @Test public void precursorCalIsCaseInsensitive() throws URISyntaxException { - MSGFPlusOptions opts = buildOpts(); - opts.precursorCalMode = "OFF"; - SearchParams params = new SearchParams(); - Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); - Assert.assertEquals(PrecursorCalMode.OFF, params.getPrecursorCalMode()); + // Picocli's enum matcher honours @Command(caseInsensitiveEnumValuesAllowed = true). + MSGFPlusOptions opts = new MSGFPlusOptions(); + MSGFPlusOptions.commandLine(opts).parseArgs("-precursorCal", "OFF"); + Assert.assertEquals(PrecursorCalMode.OFF, opts.precursorCalMode); } @Test - public void unknownPrecursorCalValueFallsBackToAuto() { - // Unit-level contract: unknown strings must not crash the search path; - // instead they silently fall back to AUTO. - Assert.assertEquals(PrecursorCalMode.AUTO, PrecursorCalMode.fromString("bogus")); - Assert.assertEquals(PrecursorCalMode.AUTO, PrecursorCalMode.fromString(null)); - Assert.assertEquals(PrecursorCalMode.AUTO, PrecursorCalMode.fromString("")); + public void unknownPrecursorCalValueIsRejected() { + // The typed enum replaces the previous String + fromString fallback; + // invalid values are now rejected by picocli at parse time instead + // of silently mapping to AUTO. 
+ MSGFPlusOptions opts = new MSGFPlusOptions(); + try { + MSGFPlusOptions.commandLine(opts).parseArgs("-precursorCal", "bogus"); + Assert.fail("'bogus' should not parse as a PrecursorCalMode"); + } catch (picocli.CommandLine.ParameterException expected) { + // ok + } } @Test From b7dce4cab2023b59814198def29896a7676af104 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 11:07:45 +0100 Subject: [PATCH 27/34] docs(changelog): document parameter-modernization sweep in vNEXT Expand the vNEXT entry to cover the full PR #25 modernization stack: - CLI parser modernisation: picocli-driven MSGFPlusOptions; -conf flow with case-insensitive keys + 13 legacy aliases preserved; numeric/enum range validation surface restored. - Breaking changes: -outputFormat numeric forms (0/1) removed in favour of named pin/tsv (typed enum); -precursorCal switched to typed enum (rejects unknown values instead of silently mapping to AUTO); spectrum input narrowed to mzML + mgf only (mzXML, MS2, PKL, _dta.txt parsers deleted); deprecated MSGFDB entry point and its dead MSGF/MSGFLib siblings removed. - Internal refactor: edu.ucsd.msjava.params package deleted (~2,100 LOC across 18 classes); package reorg (ui/ -> cli/, mzid/ -> output/, parser/ -> mgf/, net.pempek.unicode -> mgf); new typed value classes (MSGFPlusOptions, PrecursorTolerance, IntRange, OutputFormat); picocli 4.7.6 dep added. - Bench gate: prior Astral 3-arm run confirmed bit-identical PSM target/decoy counts (89,479 / 46,792) between baseline and new branch in -precursorCal off mode. Table embedded in the entry. - Earlier in cycle: precursor calibration + Percolator pin output bullets retained. No code change; pure docs. 
--- docs/changelog.md | 139 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 130 insertions(+), 9 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 29d94d53..713e0bd5 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,15 +2,136 @@ [MS-GF+ Documentation home](readme.md) -**vNEXT — Unreleased (breaking change)** - -- **BREAKING:** mzIdentML (`.mzid`) support fully removed — no backward-compatibility shim. MS-GF+ now writes only Percolator `.pin` (default) or TSV, and every `.mzid`-related utility has been deleted: - - **Output:** `MZIdentMLGen`, `AnalysisProtocolCollectionGen`, `MzIDTest` — deleted. `-o foo.mzid` is rejected at argument-parse time (no extension rewrite). `Unimod` and `UnimodComposition` are retained for future PTM-aware enhancements to `DirectPinWriter` — they carry the modification-accession + mass tables that a richer pin output would need to populate the `Peptide` column with proper Unimod references. - - **Input / legacy tools:** `MzIDToTsv` (CLI `edu.ucsd.msjava.ui.MzIDToTsv`), `MzIDParser`, `AnnotatedSpectra`, `ScoringParamGen` — deleted. Users who need to post-process legacy `.mzid` files must use MS-GF+ v2026.03.25 or earlier, or an external mzid converter. - - `-outputFormat` now accepts only `pin` (default) and `tsv`. Integer aliases: `0=pin, 1=tsv`. Previous `0=mzid, 2=both, 3=pin` layout is rejected. - - `-o OutputFile` must end in `.pin` or `.tsv`. `.mzid` paths are rejected. -- Added precursor mass calibration: `-precursorCal auto|on|off` (default `auto`). Merged via PR #22. -- Added Percolator `.pin` output with OpenMS-parity features (`enzN`, `enzC`, `enzInt`, `mass`, `lnDeltaSpecEValue`, `matchedIonRatio`, `longest_b`, `longest_y`, `longest_y_pct`) and lowercase renames (`peplen`, `charge2/3/4`, `dm`, `absdm`, `isotope_error`) for OpenMS `PercolatorAdapter` interoperability. Merged via PR #22 + this PR. 
+**vNEXT — Unreleased (multiple breaking changes)** + +This release modernises the CLI surface and trims a large amount of +legacy code. Net change: roughly **−2,400 LOC** vs the previous +release, with the CLI flag contract preserved for normal users and a +few deliberate breaking changes called out below. + +### CLI parser modernisation + +- The CLI is now driven by [picocli](https://picocli.info) via + `edu.ucsd.msjava.cli.MSGFPlusOptions`. All flags are declared once + with typed Java fields; help (`-h`/`--help`) and version (`-V`) are + auto-generated. +- `-conf` config-file inputs flow through the same path: any field + the CLI did not set is filled in from the config file (CLI takes + precedence). Legacy aliases continue to be recognised, including + `IsotopeError` → `IsotopeErrorRange`, `FragmentationMethod` → + `FragmentationMethodID`, `Instrument` → `InstrumentID`, `Enzyme` + → `EnzymeID`, `Protocol` → `ProtocolID`, `NumTolerableTermini` → + `NTT`, `MinNumPeaks` → `MinNumPeaksPerSpectrum`, `MaxNumMods` / + `MaxNumModsPerPeptide` → `NumMods`, `MinPeptideLength` / + `minLength` → `MinPepLength`, `MaxPeptideLength` / `maxLength` → + `MaxPepLength`, `PMTolerance` / `ParentMassTolerance` → + `PrecursorMassTolerance`. Config-file keys are matched + case-insensitively (so `minCharge=`, `MinCharge=`, and `MINCHARGE=` + all work). +- `DynamicMod=`, `StaticMod=`, and `CustomAA=` config-file entries + continue to be repeatable; each line is collected into the AA set. +- Validation surface restored: invalid numeric values (e.g. + `-thread 0`, `-ntt 5`, `-tda 2`) and out-of-range enum-like IDs + (e.g. `-m 99`, `-inst 99`) now produce a clean user-facing error + string instead of a stack trace. + +### Breaking changes + +- **`-outputFormat` accepts only named values.** `pin` (default) and + `tsv` are the supported forms (case-insensitive). The legacy + numeric aliases `0` and `1` are no longer accepted; users on those + invocations should switch to the named values. 
+- **`-precursorCal` is now a typed enum.** `auto` (default), `on`, + and `off` are still the only valid values; invalid values now fail + fast at parse time instead of silently mapping to `auto`. +- **Spectrum input narrowed to `*.mzML` and `*.mgf`.** Support for + `*.mzXML`, `*.ms2`, `*.pkl`, and `*_dta.txt` has been removed + along with their parsers. `MgfSpectrumParser`, `BufferedLineReader`, + `BufferedRandomAccessLineReader`, and the shared `LineReader` / + `SpectrumParser` interfaces moved from `edu.ucsd.msjava.parser` to + `edu.ucsd.msjava.mgf` to reflect the trimmed scope. +- **Deprecated `MSGFDB` entry point removed.** `cli.MSGFDB` (legacy + v8091, "08/06/2012") and `docs/ms-gfdb.md` have been deleted, along + with `ParamManager.addMSGFDBParams` / `addMSGFParams` / + `addMSGFLibParams` (the latter two were dead — no entry points + existed). The MSGFDB-only `ParamNameEnum` entries `C13`, `NNET`, + `UNIFORM_AA_PROBABILITY`, and `OUTPUT_FILE` are gone, as are the + `showFDR`, `showDecoy`, and `replicate` config-file keys. +- mzIdentML (`.mzid`) support remains fully removed (introduced in a + prior commit on this branch). MS-GF+ writes only `.pin` (default) + or `.tsv`. Every `.mzid`-related utility has been deleted: + - **Output:** `MZIdentMLGen`, `AnalysisProtocolCollectionGen`, + `MzIDTest`. `Unimod` and `UnimodComposition` are retained for + future PTM-aware enhancements to `DirectPinWriter` — they carry + the modification-accession + mass tables a richer pin output + would need. + - **Input / legacy tools:** `MzIDToTsv` (CLI + `edu.ucsd.msjava.ui.MzIDToTsv`), `MzIDParser`, + `AnnotatedSpectra`, `ScoringParamGen`. Users who need to + post-process legacy `.mzid` files must use MS-GF+ v2026.03.25 + or earlier, or an external mzid converter. 
+ +### Internal refactor + +- The entire `edu.ucsd.msjava.params` package has been deleted + (~2,100 LOC across 18 classes including `ParamManager`, the + `Parameter` / `IntParameter` / `FloatParameter` / `IntRangeParameter` + / `ToleranceParameter` / `EnumParameter` / `FileParameter` / + `StringParameter` hierarchy, and `ParamParser`). Two small helpers + (`ParamObject`, `UserParam`) moved to `edu.ucsd.msjava.msutil` + where their `ActivationMethod` / `Enzyme` / `InstrumentType` / + `Protocol` consumers already live. +- Top-level package reorganisation: + - `edu.ucsd.msjava.ui.MSGFPlus` → `edu.ucsd.msjava.cli.MSGFPlus`. + - `edu.ucsd.msjava.mzid.{DirectPinWriter,DirectTSVWriter,Unimod,UnimodComposition}` + → `edu.ucsd.msjava.output.*`. + - `edu.ucsd.msjava.parser.*` → `edu.ucsd.msjava.mgf.*` (after + dropping the legacy-format parsers). + - `net.pempek.unicode.UnicodeBOMInputStream` → + `edu.ucsd.msjava.mgf.UnicodeBOMInputStream`. + - `edu.ucsd.msjava.mslibsearch.ProcessedSpectrum` deleted (no + references). +- New typed value classes in `cli/`: + - `MSGFPlusOptions` — picocli `@Command` with all MSGFPlus flags. + - `PrecursorTolerance` — symmetric or asymmetric tolerance with + matching-unit + non-negative validation. + - `IntRange` — inclusive integer range used by `-ti`, `-msLevel`, + `-index`. + - `OutputFormat` — enum (`PIN`, `TSV`). +- `picocli` 4.7.6 added as a runtime dependency. +- New regression tests covering the `CustomAA=` config-file path, + the `-m 4 = UVPD` mapping, case-insensitive config keys, and + out-of-range flag rejection. The full scoped test sweep includes + 78 tests. 
+ +### Bench gate + +The Astral 3-arm correctness gate (`benchmark/run_astral_3arm.sh`, +ProteoBench Module 8) on the prior modernisation pass confirmed +**bit-identical PSM target/decoy counts** to the pre-PR#22 baseline +JAR when `-precursorCal off` is supplied: + +| Arm | JAR | -precursorCal | targets | decoys | +|---|---|---|---|---| +| A | baseline (pre-PR #22) | n/a | 89,479 | 46,792 | +| B | new branch | off | **89,479** | **46,792** | +| C | new branch | auto | 89,360 | 46,913 | + +Arm C's small delta is the calibrator's expected effect when AUTO +collects ≥200 confident PSMs. The CLI rewrite does not touch the +search hot path, so this gate continues to apply for the additional +fixes layered on top. + +### Earlier in this release cycle + +- Added precursor mass calibration: `-precursorCal auto|on|off` + (default `auto`). Merged via PR #22. +- Added Percolator `.pin` output with OpenMS-parity features + (`enzN`, `enzC`, `enzInt`, `mass`, `lnDeltaSpecEValue`, + `matchedIonRatio`, `longest_b`, `longest_y`, `longest_y_pct`) and + lowercase column renames (`peplen`, `charge2/3/4`, `dm`, `absdm`, + `isotope_error`) for OpenMS `PercolatorAdapter` interoperability. + Merged via PR #22 + this PR. **v2026.03.25** From 657cc5e2ebc603ea8b8acdf1a3088ae156aa1ef7 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 11:27:23 +0100 Subject: [PATCH 28/34] refactor: drop ~2,074 LOC of dead/redundant code (audit pass) Combined sweep of cuts surfaced by the post-modernization LOC audit. Net change: 13 files, +57 / -2,131 = -2,074 LOC. ### Deleted dead classes (-2,051 LOC, 6 files) Verified zero callers across src/main and src/test before removal: - msscorer/ScoringParameterGenerator (732 LOC) and ScoringParameterGeneratorWithErrors (880 LOC) -- standalone main() scoring-param tools from the pre-modernization era. Pre-built scoring model .tsv files are committed in resources; these generators are not invoked at search time and have no remaining consumers. 
- output/Unimod (85 LOC) and output/UnimodComposition (133 LOC) -- residue from the deleted mzIdentML write side. Atom.java's "Unimod mod bricks" comment is the only remaining reference and refers to upstream data, not the deleted classes. - msgf/ToolLauncher (154 LOC) -- abstract launcher with no concrete implementations. - msutil/ScoredString (67 LOC) -- duplicate of fdr/ScoredString with no live callers (all usages resolve to the fdr.* version). ### Inlined effective*() resolvers + helper collapse (-23 LOC net) - ~20 of the trivial `field != null ? field : default` resolvers in MSGFPlusOptions are inlined at the SearchParams.parse call sites. The non-trivial registry-resolving ones (effectiveActivationMethod, effectiveInstrumentType, effectiveEnzyme, effectiveProtocol) and a handful of frequent ones (effectiveOutputFormat, effectiveMin/MaxPeptideLength etc.) stay since their length pays for itself. - ActivationMethodAvailability nested class collapsed -- it only hid one InstrumentType.getAllRegisteredInstrumentTypes().length call that is now inline in validate(). - SearchParams.getOutputFormat() and SearchParams.writePin() removed -- writePin() had two callers in MSGFPlus.java which now use !writeTsv() instead; getOutputFormat() had zero callers. ### stripComment dedup (-7 LOC) The two implementations of "split-on-#-and-trim" (SearchParams.getConfigLineWithoutComment and MSGFPlusOptions.stripComment) collapsed: stripComment is the canonical version (widened from private to public), SearchParams.getConfigLineWithoutComment delegates to it, and AminoAcidSet.parseConfigEntry calls stripComment directly. ### Validation surface expanded MSGFPlusOptions.validate() now also rejects a -mod / ModificationFile= path that does not exist, returning a user-facing error string. New regression test (validateRejectsMissingModificationFile) pins both the CLI path and the config-file path.
Verified: scoped sweep (TestDirectPinWriter, TestMSUtils, TestSA, TestMisc, TestRunManifestWriter, SearchParamsTest, TestPercolator, TestMinSpectraPerThread, TestPrecursorCalScaffolding, TestCandidatePeptideGrid + ConsideringMetCleavage, MSGFPlusOptionsConfigFileTest, MSGFPlusOptionsActivationMethodTest): 78 tests, 0 failures, 0 errors, 3 skipped. --- .../java/edu/ucsd/msjava/cli/MSGFPlus.java | 2 +- .../edu/ucsd/msjava/cli/MSGFPlusOptions.java | 46 +- .../ucsd/msjava/msdbsearch/SearchParams.java | 54 +- .../edu/ucsd/msjava/msgf/ToolLauncher.java | 154 --- .../msscorer/ScoringParameterGenerator.java | 733 --------------- .../ScoringParameterGeneratorWithErrors.java | 880 ------------------ .../edu/ucsd/msjava/msutil/AminoAcidSet.java | 2 +- .../edu/ucsd/msjava/msutil/ScoredString.java | 67 -- .../java/edu/ucsd/msjava/output/Unimod.java | 85 -- .../ucsd/msjava/output/UnimodComposition.java | 133 --- .../cli/MSGFPlusOptionsConfigFileTest.java | 27 + .../msjava/msdbsearch/SearchParamsTest.java | 4 +- .../java/msgfplus/TestDirectPinWriter.java | 1 - 13 files changed, 57 insertions(+), 2131 deletions(-) delete mode 100644 src/main/java/edu/ucsd/msjava/msgf/ToolLauncher.java delete mode 100644 src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java delete mode 100644 src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGeneratorWithErrors.java delete mode 100644 src/main/java/edu/ucsd/msjava/msutil/ScoredString.java delete mode 100644 src/main/java/edu/ucsd/msjava/output/Unimod.java delete mode 100644 src/main/java/edu/ucsd/msjava/output/UnimodComposition.java diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java index a75de448..31b7188e 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlus.java @@ -594,7 +594,7 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o System.out.println("TSV file: " + 
outputFile.getPath()); } - if (params.writePin()) { + if (!params.writeTsv()) { DirectPinWriter pinWriter = new DirectPinWriter(params, aaSet, sa, specAcc, ioIndex); try { pinWriter.writeResults(resultList, outputFile); diff --git a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java index 453b0d87..e02fe1d6 100644 --- a/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java +++ b/src/main/java/edu/ucsd/msjava/cli/MSGFPlusOptions.java @@ -241,45 +241,12 @@ public static CommandLine commandLine(MSGFPlusOptions opts) { public int effectiveMaxPeptideLength() { return maxPeptideLength != null ? maxPeptideLength : 40; } public int effectiveMinCharge() { return minCharge != null ? minCharge : 2; } public int effectiveMaxCharge() { return maxCharge != null ? maxCharge : 3; } - public int effectiveNumMatchesPerSpec() { return numMatchesPerSpec != null ? numMatchesPerSpec : 1; } - public int effectiveNumThreads() { return numThreads != null ? numThreads : Runtime.getRuntime().availableProcessors(); } - public int effectiveNumTasks() { return numTasks != null ? numTasks : 0; } public int effectiveMinSpectraPerThread() { return minSpectraPerThread != null ? minSpectraPerThread : 250; } public int effectiveVerbose() { return verbose != null ? verbose : 0; } public int effectiveTdaStrategy() { return tdaStrategy != null ? tdaStrategy : 0; } - public int effectiveAddFeatures() { return addFeatures != null ? addFeatures : 0; } - public int effectiveMaxMissedCleavages() { return maxMissedCleavages != null ? maxMissedCleavages : -1; } public int effectiveMaxNumMods() { return maxNumMods != null ? maxNumMods : (configMaxNumMods != null ? configMaxNumMods : 3); } - public int effectiveAllowDenseCentroidedPeaks() { return allowDenseCentroidedPeaks != null ? allowDenseCentroidedPeaks : 0; } - public int effectiveNumTolerableTermini() { return numTolerableTermini != null ? 
numTolerableTermini : 2; } - public int effectiveEdgeScore() { return edgeScore != null ? edgeScore : 0; } - public int effectiveIgnoreMetCleavage() { return ignoreMetCleavage != null ? ignoreMetCleavage : 0; } - public int effectiveMinNumPeaks() { return minNumPeaks != null ? minNumPeaks : edu.ucsd.msjava.sequences.Constants.MIN_NUM_PEAKS_PER_SPECTRUM; } - public int effectiveNumIsoforms() { return numIsoforms != null ? numIsoforms : edu.ucsd.msjava.sequences.Constants.NUM_VARIANTS_PER_PEPTIDE; } - public int effectiveMinDeNovoScore() { return minDeNovoScore != null ? minDeNovoScore : edu.ucsd.msjava.sequences.Constants.MIN_DE_NOVO_SCORE; } - public int effectiveToleranceUnits() { return precursorToleranceUnits != null ? precursorToleranceUnits : 2; } - public double effectiveChargeCarrierMass() { return chargeCarrierMass != null ? chargeCarrierMass : 1.00727649; } - - public String effectiveDecoyPrefix() { return decoyPrefix != null ? decoyPrefix : "XXX"; } - public PrecursorCalMode effectivePrecursorCal() { return precursorCalMode != null ? precursorCalMode : PrecursorCalMode.AUTO; } public OutputFormat effectiveOutputFormat() { return outputFormat != null ? outputFormat : OutputFormat.PIN; } - public PrecursorTolerance effectivePrecursorTolerance() { - return precursorTolerance != null ? precursorTolerance : PrecursorTolerance.parse("20ppm"); - } - - public IntRange effectiveIsotopeErrorRange() { - return isotopeErrorRange != null ? isotopeErrorRange : new IntRange(0, 1); - } - - public IntRange effectiveMSLevel() { - return msLevel != null ? msLevel : new IntRange(2, 2); - } - - public IntRange effectiveSpecIndexRange() { - return specIndexRange != null ? specIndexRange : new IntRange(1, Integer.MAX_VALUE - 1); - } - /** Resolves {@code -m} index to {@link ActivationMethod}. MSGFPlus exposes * 0=ASWRITTEN, 1=CID, 2=ETD, 3=HCD, 4=UVPD. 
The registry also defines * FUSION (merge-mode synthetic method) and PQD, but neither is exposed @@ -430,7 +397,7 @@ private String applyConfigEntry(String key, String value, String fileName) { } } - private static String stripComment(String line) { + public static String stripComment(String line) { int hash = line.indexOf('#'); return (hash >= 0 ? line.substring(0, hash) : line).trim(); } @@ -474,6 +441,9 @@ private static String canonicalConfigKey(String key) { public String validate() { if (spectrumFile == null) return "Spectrum file is not defined; use -s at the command line or SpectrumFile in a config file"; if (databaseFile == null) return "Database file is not defined; use -d at the command line or DatabaseFile in a config file"; + if (modificationFile != null && !modificationFile.exists()) { + return "Modification file not found: " + modificationFile.getPath(); + } String err; if ((err = checkMin("-thread", numThreads, 1)) != null) return err; @@ -506,7 +476,7 @@ public String validate() { if (fragMethodId != null && (fragMethodId < 0 || fragMethodId > 4)) { return "Invalid value for parameter -m: " + fragMethodId + " (valid: 0..4)"; } - int instMax = ActivationMethodAvailability.instCount() - 1; + int instMax = InstrumentType.getAllRegisteredInstrumentTypes().length - 1; if (instrumentTypeId != null && (instrumentTypeId < 0 || instrumentTypeId > instMax)) { return "Invalid value for parameter -inst: " + instrumentTypeId + " (valid: 0.." + instMax + ")"; } @@ -533,12 +503,6 @@ private static String checkRange(String flag, Integer value, int min, int max) { return null; } - /** Helper that hides the {@link InstrumentType#getAllRegisteredInstrumentTypes} - * call from {@code validate()} so the import block stays minimal. 
*/ - private static final class ActivationMethodAvailability { - static int instCount() { return InstrumentType.getAllRegisteredInstrumentTypes().length; } - } - /** Mutator used by {@code AminoAcidSet} when the parsed mod metadata * changes the effective max-num-mods (the AA set is authoritative once * loaded). Mirrors the legacy {@code ParamManager.setMaxNumMods}. */ diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index 1bcdda6c..81edd496 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -267,18 +267,10 @@ public int getMaxMSLevel() { return maxMSLevel; } - public OutputFormat getOutputFormat() { - return outputFormat; - } - public boolean writeTsv() { return outputFormat == OutputFormat.TSV; } - public boolean writePin() { - return outputFormat == OutputFormat.PIN; - } - /** * Look for # in dataLine * If present, remove that character and any comment after it @@ -287,11 +279,7 @@ public boolean writePin() { * @return dataLine without the comment */ public static String getConfigLineWithoutComment(String dataLine) { - String[] tokenArray = dataLine.split("#"); - if (tokenArray.length == 0) - return ""; - - return tokenArray[0].trim(); + return MSGFPlusOptions.stripComment(dataLine); } /** @@ -316,7 +304,7 @@ public String parse(MSGFPlusOptions opts) { String requiredErr = opts.validate(); if (requiredErr != null) return requiredErr; - chargeCarrierMass = opts.effectiveChargeCarrierMass(); + chargeCarrierMass = opts.chargeCarrierMass != null ? 
opts.chargeCarrierMass : 1.00727649; Composition.setChargeCarrierMass(chargeCarrierMass); // Read outputFormat up-front so the default-output-file extension logic @@ -354,20 +342,20 @@ public String parse(MSGFPlusOptions opts) { } databaseFile = opts.databaseFile; - decoyProteinPrefix = opts.effectiveDecoyPrefix(); + decoyProteinPrefix = opts.decoyPrefix != null ? opts.decoyPrefix : "XXX"; - PrecursorTolerance tol = opts.effectivePrecursorTolerance(); + PrecursorTolerance tol = opts.precursorTolerance != null ? opts.precursorTolerance : PrecursorTolerance.parse("20ppm"); leftPrecursorMassTolerance = tol.left; rightPrecursorMassTolerance = tol.right; - int toleranceUnit = opts.effectiveToleranceUnits(); + int toleranceUnit = opts.precursorToleranceUnits != null ? opts.precursorToleranceUnits : 2; if (toleranceUnit != 2) { boolean isTolerancePPM = toleranceUnit != 0; leftPrecursorMassTolerance = new Tolerance(leftPrecursorMassTolerance.getValue(), isTolerancePPM); rightPrecursorMassTolerance = new Tolerance(rightPrecursorMassTolerance.getValue(), isTolerancePPM); } - IntRange isotope = opts.effectiveIsotopeErrorRange(); + IntRange isotope = opts.isotopeErrorRange != null ? opts.isotopeErrorRange : new IntRange(0, 1); this.minIsotopeError = isotope.min; this.maxIsotopeError = isotope.max; @@ -377,7 +365,7 @@ public String parse(MSGFPlusOptions opts) { } enzyme = opts.effectiveEnzyme(); - numTolerableTermini = opts.effectiveNumTolerableTermini(); + numTolerableTermini = opts.numTolerableTermini != null ? opts.numTolerableTermini : 2; activationMethod = opts.effectiveActivationMethod(); instType = opts.effectiveInstrumentType(); if (activationMethod == ActivationMethod.HCD @@ -424,19 +412,19 @@ public String parse(MSGFPlusOptions opts) { } } - numMatchesPerSpec = opts.effectiveNumMatchesPerSpec(); + numMatchesPerSpec = opts.numMatchesPerSpec != null ? 
opts.numMatchesPerSpec : 1; - IntRange specIdx = opts.effectiveSpecIndexRange(); + IntRange specIdx = opts.specIndexRange != null ? opts.specIndexRange : new IntRange(1, Integer.MAX_VALUE - 1); startSpecIndex = specIdx.min; endSpecIndex = specIdx.max; useTDA = opts.effectiveTdaStrategy() == 1; - ignoreMetCleavage = opts.effectiveIgnoreMetCleavage() == 1; - outputAdditionalFeatures = opts.effectiveAddFeatures() == 1; + ignoreMetCleavage = (opts.ignoreMetCleavage != null ? opts.ignoreMetCleavage : 0) == 1; + outputAdditionalFeatures = (opts.addFeatures != null ? opts.addFeatures : 0) == 1; minPeptideLength = opts.effectiveMinPeptideLength(); maxPeptideLength = opts.effectiveMaxPeptideLength(); - maxNumVariantsPerPeptide = opts.effectiveNumIsoforms(); + maxNumVariantsPerPeptide = opts.numIsoforms != null ? opts.numIsoforms : edu.ucsd.msjava.sequences.Constants.NUM_VARIANTS_PER_PEPTIDE; if (minPeptideLength > maxPeptideLength) { return "MinPepLength must not be larger than MaxPepLength"; @@ -448,27 +436,27 @@ public String parse(MSGFPlusOptions opts) { return "MinCharge must not be larger than MaxCharge"; } - numThreads = opts.effectiveNumThreads(); - numTasks = opts.effectiveNumTasks(); + numThreads = opts.numThreads != null ? opts.numThreads : Runtime.getRuntime().availableProcessors(); + numTasks = opts.numTasks != null ? opts.numTasks : 0; minSpectraPerThread = opts.effectiveMinSpectraPerThread(); verbose = opts.effectiveVerbose() == 1; - doNotUseEdgeScore = opts.effectiveEdgeScore() == 1; + doNotUseEdgeScore = (opts.edgeScore != null ? opts.edgeScore : 0) == 1; dbIndexDir = opts.dbIndexDir; - minNumPeaksPerSpectrum = opts.effectiveMinNumPeaks(); - minDeNovoScore = opts.effectiveMinDeNovoScore(); + minNumPeaksPerSpectrum = opts.minNumPeaks != null ? opts.minNumPeaks : edu.ucsd.msjava.sequences.Constants.MIN_NUM_PEAKS_PER_SPECTRUM; + minDeNovoScore = opts.minDeNovoScore != null ? 
opts.minDeNovoScore : edu.ucsd.msjava.sequences.Constants.MIN_DE_NOVO_SCORE; - maxMissedCleavages = opts.effectiveMaxMissedCleavages(); + maxMissedCleavages = opts.maxMissedCleavages != null ? opts.maxMissedCleavages : -1; if (maxMissedCleavages > -1 && enzyme.getName().equals("UnspecificCleavage")) { return "Cannot specify a MaxMissedCleavages when using unspecific cleavage enzyme"; } else if (maxMissedCleavages > -1 && enzyme.getName().equals("NoCleavage")) { return "Cannot specify a MaxMissedCleavages when using no cleavage enzyme"; } - allowDenseCentroidedPeaks = opts.effectiveAllowDenseCentroidedPeaks() == 1; - precursorCalMode = opts.effectivePrecursorCal(); + allowDenseCentroidedPeaks = (opts.allowDenseCentroidedPeaks != null ? opts.allowDenseCentroidedPeaks : 0) == 1; + precursorCalMode = opts.precursorCalMode != null ? opts.precursorCalMode : PrecursorCalMode.AUTO; - IntRange ms = opts.effectiveMSLevel(); + IntRange ms = opts.msLevel != null ? opts.msLevel : new IntRange(2, 2); minMSLevel = ms.min; maxMSLevel = ms.max; diff --git a/src/main/java/edu/ucsd/msjava/msgf/ToolLauncher.java b/src/main/java/edu/ucsd/msjava/msgf/ToolLauncher.java deleted file mode 100644 index 01d57a74..00000000 --- a/src/main/java/edu/ucsd/msjava/msgf/ToolLauncher.java +++ /dev/null @@ -1,154 +0,0 @@ -package edu.ucsd.msjava.msgf; - -import edu.ucsd.msjava.msscorer.NewAdditiveScorer; -import edu.ucsd.msjava.msutil.AminoAcidSet; -import edu.ucsd.msjava.msutil.Spectrum; - -import java.io.BufferedOutputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.Iterator; - -public abstract class ToolLauncher { - // Essential parameters, set by the constructor - protected final Iterator specIterator; - protected final NewAdditiveScorer scorer; - - // Optional parameters set by builders. 
- - protected float specProb = 1e-9f; - - protected boolean trypticOnly = true; - - // Tolerance - protected Tolerance pmTolerance = new Tolerance(30, true); - protected Tolerance fragTolerance = new Tolerance(30, true); - - protected float minParentMass = 400; - protected float maxParentMass = 2000; - protected int msgfScoreThreshold = 0; - - // Amino acid set, default: standard + Carbamidomethyl C - protected AminoAcidSet aaSet; - - // output - protected PrintStream out; - - /** - * A constructor specifies spectral file name and database file name. Database must be "fasta" format. - * - * @param specIterator spectra iterator. - * @param scorer a scorer object. - */ - protected ToolLauncher(Iterator specIterator, NewAdditiveScorer scorer) { - this.specIterator = specIterator; - this.scorer = scorer; - this.out = System.out; - this.aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys(); - } - - /** - * A builder method to set spectral probability. - * - * @param specProb spectral probability - * @return this object. - */ - public ToolLauncher specProb(float specProb) { - this.specProb = specProb; - return this; - } - - /** - * If this method is called, non-tryptic peptides are generated. - * Otherwise, only peptides ends with 'K' or 'R' are generated. - * - * @return this object. - */ - public ToolLauncher allowNonTryptic() { - this.trypticOnly = false; - return this; - } - - - /** - * Set parent mass tolerance. - * - * @param tolerance tolerance. - * @return this object. - */ - public ToolLauncher pmTolerance(Tolerance pmTolerance) { - this.pmTolerance = pmTolerance; - return this; - } - - /** - * Set fragment mass tolerance. - * - * @param tolerance tolerance. - * @return this object. - */ - public ToolLauncher fragTolerance(Tolerance fragTolerance) { - this.fragTolerance = fragTolerance; - return this; - } - - /** - * Set minimum parent mass. - * - * @param minParentMass minimum parent mass. - * @return this object. 
- */ - public ToolLauncher minParentMass(float minParentMass) { - this.minParentMass = minParentMass; - return this; - } - - /** - * Set maximum parent mass. - * - * @param maxParentMass maximum parent mass. - * @return this object. - */ - public ToolLauncher maxParentMass(float maxParentMass) { - this.maxParentMass = maxParentMass; - return this; - } - - /** - * Set max MSGF score threshold. Ignore all spectra whose best de novo scores are below thresholdScore. - * - * @param thresholdScore max MS-GF score threshold. - * @return this object. - */ - public ToolLauncher msgfScoreThreshold(int thresholdScore) { - this.msgfScoreThreshold = thresholdScore; - return this; - } - - /** - * Set the amino acid set. - * - * @param aaSet amino acid set. - * @return this object. - */ - public ToolLauncher aminoAcidSet(AminoAcidSet aaSet) { - this.aaSet = aaSet; - return this; - } - - /** - * Set the output. - * - * @param outputFileName output file name. - * @return this object. - */ - public ToolLauncher outputFileName(String outputFileName) { - try { - out = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFileName))); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - return this; - } -} diff --git a/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java b/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java deleted file mode 100644 index 62fee4b4..00000000 --- a/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGenerator.java +++ /dev/null @@ -1,733 +0,0 @@ -package edu.ucsd.msjava.msscorer; - -import edu.ucsd.msjava.msgf.Histogram; -import edu.ucsd.msjava.msgf.NominalMass; -import edu.ucsd.msjava.msgf.Tolerance; -import edu.ucsd.msjava.msscorer.NewScorerFactory.SpecDataType; -import edu.ucsd.msjava.msutil.*; -import edu.ucsd.msjava.mgf.MgfSpectrumParser; - -import java.io.File; -import java.util.*; - -/** - * This only supports low accuracy fragment ions. 
- * - * @author sangtaekim - */ -public class ScoringParameterGenerator extends NewRankScorer { - private static final float MIN_OFFSET_MASS = -120; // for ion types - private static final float MAX_OFFSET_MASS = 38; - private static final float MIN_PRECURSOR_OFFSET = -300; // for precursors - private static final float MAX_PRECURSOR_OFFSET = 30; - private static final int MIN_NUM_SPECTRA_PER_PARTITION = 400; // 400 - private static final int MIN_NUM_SPECTRA_FOR_PRECURSOR_OFF = 150; - - private static final float MIN_PRECURSOR_OFFSET_PROBABILITY = 0.15f; // 0.15 - private static final float MIN_ION_OFFSET_PROBABILITY = 0.15f; // 0.15, for ion types - private static final int MAX_RANK = 150; - private static final int NUM_SEGMENTS_PER_SPECTRUM = 2; // 2 - - - private static final int[] smoothingRanks = {3, 5, 10, 20, 50, Integer.MAX_VALUE}; //Ranks around which smoothing occurs - private static final int[] smoothingWindowSize = {0, 1, 2, 3, 4, 5}; //Smoothing windows for each smoothing rank - - private static final int NUM_NOISE_IONS = 10; - protected static final int MAX_CHARGE = 20; - - public static void main(String argv[]) { - File specFile = null; - File outputFile = null; - boolean isText = false; - AminoAcidSet aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys(); - int numSpecsPerPeptide = 1; - int errorScalingFactor = 10; - - // Fragmentation method - ActivationMethod activationMethod = null; - InstrumentType instType = null; - Enzyme enzyme = null; - - for (int i = 0; i < argv.length; i += 2) { - if (!argv[i].startsWith("-") || i + 1 >= argv.length) - printUsageAndExit("Invalid parameter!"); - if (argv[i].equalsIgnoreCase("-i")) { - specFile = new File(argv[i + 1]); - if (!specFile.exists()) { - printUsageAndExit(argv[i + 1] + " doesn't exist."); - } - int posDot = specFile.getName().lastIndexOf('.'); - if (posDot >= 0) { - String extension = specFile.getName().substring(posDot); - if (!extension.equalsIgnoreCase(".mgf")) - 
printUsageAndExit("Invalid spectrum format: " + argv[i + 1]); - } else - printUsageAndExit("Invalid spectrum format: " + argv[i + 1]); - } else if (argv[i].equalsIgnoreCase("-o")) { - outputFile = new File(argv[i + 1]); - } else if (argv[i].equalsIgnoreCase("-t")) { - outputFile = new File(argv[i + 1]); - isText = true; - } else if (argv[i].equalsIgnoreCase("-fixMod")) { - // 0: No mod, 1: Carbamidomethyl C, 2: Carboxymethyl C - if (argv[i + 1].equalsIgnoreCase("0")) - aaSet = AminoAcidSet.getStandardAminoAcidSet(); - else if (argv[i + 1].equalsIgnoreCase("1")) - aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys(); - else if (argv[i + 1].equalsIgnoreCase("2")) - aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarboxymethylatedCys(); - else - printUsageAndExit("Invalid -fixMod parameter: " + argv[i + 1]); - } else if (argv[i].equalsIgnoreCase("-pep")) { - numSpecsPerPeptide = Integer.parseInt(argv[i + 1]); - } else if (argv[i].equalsIgnoreCase("-err")) { - errorScalingFactor = Integer.parseInt(argv[i + 1]); - } else if (argv[i].equalsIgnoreCase("-m")) // Fragmentation method - { - // (0: written in the spectrum, 1: CID , 2: ETD, 3: HCD, 4: UVPD) - if (argv[i + 1].equalsIgnoreCase("1")) { - activationMethod = ActivationMethod.CID; - } else if (argv[i + 1].equalsIgnoreCase("2")) { - activationMethod = ActivationMethod.ETD; - } else if (argv[i + 1].equalsIgnoreCase("3")) { - activationMethod = ActivationMethod.HCD; - } else if (argv[i + 1].equalsIgnoreCase("4")) { - activationMethod = ActivationMethod.UVPD; - } else { - printUsageAndExit("Invalid activation method: " + argv[i + 1]); - } - } else if (argv[i].equalsIgnoreCase("-inst")) // Instrument type - { - if (argv[i + 1].equalsIgnoreCase("0")) { - instType = InstrumentType.LOW_RESOLUTION_LTQ; - } else if (argv[i + 1].equalsIgnoreCase("1")) { - instType = InstrumentType.TOF; - } else if (argv[i + 1].equalsIgnoreCase("2")) { - instType = InstrumentType.HIGH_RESOLUTION_LTQ; - } else { - 
printUsageAndExit("Invalid instrument type: " + argv[i + 1]); - } - } else if (argv[i].equalsIgnoreCase("-e")) // Enzyme - { - // 0: No enzyme, 1: Trypsin, 2: Chymotrypsin, 3: LysC, 4: LysN, 5: GluC, 6: ArgC, 7: AspN - if (argv[i + 1].equalsIgnoreCase("0")) - enzyme = null; - else if (argv[i + 1].equalsIgnoreCase("1")) - enzyme = Enzyme.TRYPSIN; - else if (argv[i + 1].equalsIgnoreCase("2")) - enzyme = Enzyme.CHYMOTRYPSIN; - else if (argv[i + 1].equalsIgnoreCase("3")) - enzyme = Enzyme.LysC; - else if (argv[i + 1].equalsIgnoreCase("4")) - enzyme = Enzyme.LysN; - else if (argv[i + 1].equalsIgnoreCase("5")) - enzyme = Enzyme.GluC; - else if (argv[i + 1].equalsIgnoreCase("6")) - enzyme = Enzyme.ArgC; - else if (argv[i + 1].equalsIgnoreCase("7")) - enzyme = Enzyme.AspN; - else - printUsageAndExit("Invalid enzyme: " + argv[i + 1]); - } else - printUsageAndExit("Invalid parameters!"); - } - if (specFile == null) - printUsageAndExit("missing annotatedMgfFileName!"); - if (outputFile == null) - printUsageAndExit("missing outputFileName!"); - if (activationMethod == null) - printUsageAndExit("missing activationMethod!"); - if (instType == null) - printUsageAndExit("missing instrumentType!"); - - generateParameters(specFile, activationMethod, instType, enzyme, Protocol.AUTOMATIC, numSpecsPerPeptide, errorScalingFactor, outputFile, aaSet, isText, false); - } - - public static void printUsageAndExit(String message) { - System.err.println(message); - System.out.println("usage: java -Xmx2000M -cp MSGF.jar msscorer.ScoringParameterGenerator\n" + - "\t-i annotatedMgfFileName (*.mgf)\n" + - "\t-o outputFileName (e.g. 
CID_Tryp.param)\n" + - "\t-m FragmentationMethodID (1: CID, 2: ETD, 3: HCD, 4: UVPD)\n" + - "\t-inst InstrumentID (0: Low-res LCQ/LTQ, 1: TOF , 2: High-res LTQ)\n" + - "\t-e EnzymeID (0: No enzyme, 1: Trypsin (Default), 2: Chymotrypsin, 3: Lys-C, 4: Lys-N, 5: Glu-C, 6: Arg-C, 7: Asp-N)\n" + - "\t[-fixMod 0/1/2] (0: NoCysteineProtection, 1: CarbamidomethyC (default), 2: CarboxymethylC)\n" + - "\t[-pep numPeptidesPerSpec] (default: 1)\n" + - "\t[-err errorScalingFactor] (default: 10)" - ); - System.exit(0); - } - - public static void generateParameters( - File specFile, - ActivationMethod activationMethod, - InstrumentType instType, - Enzyme enzyme, - Protocol protocol, - int numSpecsPerPeptide, - int errorScalingFactor, - File outputFile, - AminoAcidSet aaSet, - boolean isText, - boolean verbose) { - SpectraContainer container = new SpectraContainer(specFile.getPath(), new MgfSpectrumParser().aaSet(aaSet)); - - // multiple spectra with the same peptide -> one spec per peptide - HashMap> pepSpecMap = new HashMap>(); - SpectraContainer specContOnePerPep = new SpectraContainer(); - for (Spectrum spec : container) { - String pep = spec.getAnnotationStr() + ":" + spec.getCharge(); - if (pep != null && pep.length() > 0) { - ArrayList specList = pepSpecMap.get(pep); - if (specList == null) { - specList = new ArrayList(); - pepSpecMap.put(pep, specList); - } - if (specList.size() < numSpecsPerPeptide) - specList.add(spec); - } - } - for (ArrayList specList : pepSpecMap.values()) - for (Spectrum spec : specList) - specContOnePerPep.add(spec); - - SpecDataType dataType = new SpecDataType(activationMethod, instType, enzyme, protocol); - ScoringParameterGenerator gen = new ScoringParameterGenerator(specContOnePerPep, dataType); - - // set up the tolerance - gen.tolerance(new Tolerance(1 / Constants.INTEGER_MASS_SCALER / 2)); - - // Step 1: partition spectra - gen.partition(NUM_SEGMENTS_PER_SPECTRUM); - if (verbose) - System.out.println("Partition: " + gen.partitionSet.size()); 
- - // Step 2: compute offset frequency functions of precursor peaks and their neutral losses - gen.precursorOFF(MIN_PRECURSOR_OFFSET_PROBABILITY); - if (verbose) - System.out.println("PrecursorOFF Done."); - - // Step 3: filter out "significant" precursor offsets - gen.filterPrecursorPeaks(); - if (verbose) - System.out.println("Filtering Done."); - - // Step 4: compute offset frequency fnction of fragment peaks and determine ion types to be considered for scoring - gen.selectIonTypes(MIN_ION_OFFSET_PROBABILITY); - if (verbose) - System.out.println("Ion types selected."); - - // Step 5: compute rank distributions - gen.generateRankDist(MAX_RANK); - if (verbose) - System.out.println("Rank distribution computed."); - - // Step 6 (optional): generate error distribution, currently not in use - - // Step 7: smoothing parameters - gen.smoothing(); - if (verbose) - System.out.println("Smoothing complete."); - - // output - if (!isText) - gen.writeParameters(outputFile); - else - gen.writeParametersPlainText(outputFile); - - if (verbose) - System.out.println("Writing Done."); - } - - // Required - private SpectraContainer specContainer; - - public ScoringParameterGenerator(SpectraContainer specContainer, SpecDataType dataType) { - this.specContainer = specContainer; - super.dataType = dataType; - } - - public void partition(int numSegments) { - super.numSegments = numSegments; - chargeHist = new Histogram(); - partitionSet = new TreeSet(); - - HashMap> parentMassMap = new HashMap>(); - for (Spectrum spec : specContainer) { - int charge = spec.getCharge(); - if (charge <= 0) - continue; - chargeHist.add(charge); - if (spec.getAnnotation() != null) { - ArrayList precursorList = parentMassMap.get(charge); - if (precursorList == null) { - precursorList = new ArrayList(); - parentMassMap.put(charge, precursorList); - } - precursorList.add(spec.getPrecursorMass()); - } - } - - for (int c = chargeHist.minKey(); c <= chargeHist.maxKey(); c++) { - ArrayList parentMassList = 
parentMassMap.get(c); - if (parentMassList == null) - continue; - - int numSpec = parentMassList.size(); - if (numSpec < Math.round(MIN_NUM_SPECTRA_PER_PARTITION * 0.9f)) // to few spectra - continue; - - Collections.sort(parentMassList); - int bestSetSize = 0; - int smallestRemainder = MIN_NUM_SPECTRA_PER_PARTITION; - for (int i = Math.round(MIN_NUM_SPECTRA_PER_PARTITION * 0.9f); i <= Math.round(MIN_NUM_SPECTRA_PER_PARTITION * 1.1f); i++) { - int remainder = numSpec % i; - if (i - remainder < remainder) - remainder = i - remainder; - if (remainder < smallestRemainder || (remainder == smallestRemainder && Math.abs(MIN_NUM_SPECTRA_PER_PARTITION - i) < Math.abs(MIN_NUM_SPECTRA_PER_PARTITION - bestSetSize))) { - bestSetSize = i; - smallestRemainder = remainder; - } - } - int num = 0; - for (int i = 0; i == 0 || i < Math.round(numSpec / (float) bestSetSize); i++) { - if (num != 0) { - for (int seg = 0; seg < numSegments; seg++) - partitionSet.add(new Partition(c, parentMassList.get(num), seg)); - } else { - for (int seg = 0; seg < numSegments; seg++) - partitionSet.add(new Partition(c, 0f, seg)); - } - num += bestSetSize; - } - } - } - - private void precursorOFF(float minProbThreshold) { - if (chargeHist == null) { - assert (false) : "partition() must have been called before"; - return; - } - precursorOFFMap = new TreeMap>(); - numPrecurOFF = 0; - - for (int charge = chargeHist.minKey(); charge <= chargeHist.maxKey(); charge++) { - if (chargeHist.get(charge) < MIN_NUM_SPECTRA_FOR_PRECURSOR_OFF) - continue; - ArrayList precursorOffsetList = new ArrayList(); - int numSpecs = 0; - HashMap> histList = new HashMap>(); - for (int c = charge; c >= 2; c--) - histList.put(c, new Histogram()); - - for (Spectrum spec : specContainer) { - if (spec.getAnnotation() == null) - continue; - if (spec.getCharge() != charge) - continue; - numSpecs++; - spec = filter.apply(spec); - float precursorNeutralMass = spec.getPrecursorMass(); - for (int c = charge; c >= 2; c--) { - float 
precursorMz = (precursorNeutralMass + c * (float) Composition.ChargeCarrierMass()) / c; - ArrayList peakList = spec.getPeakListByMassRange( - precursorMz + MIN_PRECURSOR_OFFSET / (float) c - mme.getToleranceAsDa(precursorMz + MIN_PRECURSOR_OFFSET / (float) c) / 2, - precursorMz + MAX_PRECURSOR_OFFSET / (float) c + mme.getToleranceAsDa(precursorMz + MAX_PRECURSOR_OFFSET / (float) c) / 2); - - int prevMassIndexDiff = Integer.MIN_VALUE; - for (Peak p : peakList) { - float peakMass = p.getMz(); - int massIndexDiff = NominalMass.toNominalMass(peakMass - precursorMz); - if (massIndexDiff > prevMassIndexDiff) { - histList.get(c).add(massIndexDiff); - prevMassIndexDiff = massIndexDiff; - } - } - } - } - - for (int c = charge; c >= 2; c--) { - ArrayList keyList = new ArrayList(histList.get(c).keySet()); - Collections.sort(keyList); - for (Integer key : keyList) { - float prob = (histList.get(c).get(key)) / (float) numSpecs; - if (prob > minProbThreshold) { - precursorOffsetList.add(new PrecursorOffsetFrequency((charge - c), NominalMass.getMassFromNominalMass(key), prob)); - } - } - } - precursorOFFMap.put(charge, precursorOffsetList); - numPrecurOFF += precursorOffsetList.size(); - } - } - - private void filterPrecursorPeaks() { - if (this.precursorOFFMap == null) - return; - for (Spectrum spec : specContainer) { - for (PrecursorOffsetFrequency off : this.getPrecursorOFF(spec.getCharge())) - spec.filterPrecursorPeaks(mme, off.getReducedCharge(), off.getOffset()); - } - } - - private Pair getPrecursorMassRange(Partition partition) { - float minParentMass = partition.getParentMass(); - float maxParentMass = Float.MAX_VALUE; - Partition higherPartition = partitionSet.higher(partition); - if (higherPartition != null) { - if (higherPartition.getCharge() == partition.getCharge() && higherPartition.getSegNum() == partition.getSegNum()) { - maxParentMass = higherPartition.getParentMass(); - } - } - return new Pair(minParentMass, maxParentMass); - } - - private void 
selectIonTypes(float minProbThreshold) { - if (partitionSet == null) { - assert (false) : "partition() must have been called before!"; - return; - } - - fragOFFTable = new HashMap>(); - insignificantFragOFFTable = new HashMap>(); - - for (Partition partition : partitionSet) { - int charge = partition.getCharge(); - // parent mass range check - Pair parentMassRange = getPrecursorMassRange(partition); - int seg = partition.getSegNum(); - - int numSpec = 0; - HashMap> prefixIonFreq = new HashMap>(); - HashMap> suffixIonFreq = new HashMap>(); - for (int c = 1; c <= charge; c++) { - prefixIonFreq.put(c, new Histogram()); - suffixIonFreq.put(c, new Histogram()); - } - - int numCleavages = 0; - for (Spectrum spec : specContainer) { - if (spec.getAnnotation() == null) - continue; - if (spec.getCharge() != charge) - continue; - - float curParentMass = spec.getPrecursorMass(); - if (curParentMass < parentMassRange.getFirst() || curParentMass >= parentMassRange.getSecond()) - continue; - - Peptide annotation = spec.getAnnotation(); - numCleavages += annotation.size() - 1; - numSpec++; - spec = filter.apply(spec); - - for (int c = 1; c <= charge; c++) { - for (int direction = 0; direction < 2; direction++) { - double accurateMass = 0; - HashMap> ionFreq = null; - for (int i = 0; i < annotation.size() - 1; i++) { - if (direction == 0) { - accurateMass += annotation.get(i).getAccurateMass(); - ionFreq = prefixIonFreq; - } else if (direction == 1) { - accurateMass += annotation.get(annotation.size() - 1 - i).getAccurateMass(); - ionFreq = suffixIonFreq; - } - float mass = (float) (accurateMass / c); - ArrayList peakList = spec.getPeakListByMassRange( - mass + MIN_OFFSET_MASS / (float) c - mme.getToleranceAsDa(mass), - mass + MAX_OFFSET_MASS / (float) c + mme.getToleranceAsDa(mass)); - int prevIntOffset = Integer.MIN_VALUE; - for (Peak p : peakList) { - float peakMz = p.getMz(); - int segNum = getSegmentNum(peakMz, curParentMass); - if (segNum != seg) - continue; - float offset = 
peakMz - mass; - int intOffset = NominalMass.toNominalMass(offset); - if (intOffset > prevIntOffset) { - ionFreq.get(c).add(intOffset); - prevIntOffset = intOffset; - } - } - } - } - } - } - - float maxProb = 0; - int maxCharge = 0; - int maxDirection = 0; - float maxOffset = 0; - - ArrayList fragmentOffsetFrequencyList = new ArrayList(); - ArrayList insignificantFragmentOffsetFrequencyList = new ArrayList(); - for (int c = 1; c <= charge; c++) { - for (int direction = 0; direction < 2; direction++) { - ArrayList keyList; - if (direction == 0) - keyList = new ArrayList(prefixIonFreq.get(c).keySet()); - else - keyList = new ArrayList(suffixIonFreq.get(c).keySet()); - - Collections.sort(keyList); - for (Integer key : keyList) { - float offset = NominalMass.getMassFromNominalMass(key); - int freq; - if (direction == 0) - freq = prefixIonFreq.get(c).get(key); - else - freq = suffixIonFreq.get(c).get(key); - float prob = freq / (float) numCleavages * numSegments; - if (prob > maxProb) { - maxProb = prob; - maxCharge = c; - maxDirection = direction; - maxOffset = offset; - } - if (prob > minProbThreshold) { - if (direction == 0) - fragmentOffsetFrequencyList.add(new FragmentOffsetFrequency(new IonType.PrefixIon(c, offset), prob)); - else - fragmentOffsetFrequencyList.add(new FragmentOffsetFrequency(new IonType.SuffixIon(c, offset), prob)); - } else { - if (direction == 0) - insignificantFragmentOffsetFrequencyList.add(new FragmentOffsetFrequency(new IonType.PrefixIon(c, offset), prob)); - else - insignificantFragmentOffsetFrequencyList.add(new FragmentOffsetFrequency(new IonType.SuffixIon(c, offset), prob)); - } - } - } - } - - if (fragmentOffsetFrequencyList.size() == 0) { - if (maxDirection == 0) - fragmentOffsetFrequencyList.add(new FragmentOffsetFrequency(new IonType.PrefixIon(maxCharge, maxOffset), maxProb)); - else - fragmentOffsetFrequencyList.add(new FragmentOffsetFrequency(new IonType.SuffixIon(maxCharge, maxOffset), maxProb)); - } - - 
Collections.sort(insignificantFragmentOffsetFrequencyList); - ArrayList noiseOffsetFrequencyList = new ArrayList(NUM_NOISE_IONS); - - int numNoise = 0; - for (FragmentOffsetFrequency off : insignificantFragmentOffsetFrequencyList) { - if (off.getIonType().getCharge() == 1) - noiseOffsetFrequencyList.add(off); - if (++numNoise >= NUM_NOISE_IONS) - break; - } - Collections.sort(fragmentOffsetFrequencyList, Collections.reverseOrder()); - fragOFFTable.put(partition, fragmentOffsetFrequencyList); - insignificantFragOFFTable.put(partition, noiseOffsetFrequencyList); - } - } - - private void generateRankDist(int maxRank) { - if (partitionSet == null) { - assert (false) : "partition() must have been called!"; - return; - } - - rankDistTable = new HashMap>(); - this.maxRank = maxRank; - - for (Partition partition : partitionSet) { - int charge = partition.getCharge(); - IonType[] ionTypes = getIonTypes(partition); - if (ionTypes == null || ionTypes.length == 0) - continue; - Pair parentMassRange = getPrecursorMassRange(partition); - int seg = partition.getSegNum(); - - int numSpec = 0; - HashMap> rankDist = new HashMap>(); - HashMap rankDistMaxRank = new HashMap(); - HashMap rankDistUnexplained = new HashMap(); - - for (IonType ion : ionTypes) { - rankDist.put(ion, new Histogram()); - rankDistMaxRank.put(ion, 0f); - rankDistUnexplained.put(ion, 0f); - } - rankDist.put(IonType.NOISE, new Histogram()); - - float[] noiseDist = new float[maxRank + 2]; - int numMaxRankPeaks = 0; - int totalCleavageSites = 0; - - for (Spectrum spec : specContainer) { - int numExplainedPeaks = 0; - if (spec.getAnnotation() == null) - continue; - if (spec.getCharge() != charge) - continue; - float curParentMass = spec.getPrecursorMass(); - if (curParentMass < parentMassRange.getFirst() || curParentMass >= parentMassRange.getSecond()) - continue; - - Peptide annotation = spec.getAnnotation(); - spec.setRanksOfPeaks(); - numSpec++; - numMaxRankPeaks += spec.size() - maxRank + 1; - totalCleavageSites 
+= annotation.size() - 1; - int prmMassIndex = 0; - int srmMassIndex = 0; - - HashSet explainedPeakSet = new HashSet(); - HashMap numExplainedMaxRankPeaks = new HashMap(); - for (IonType ion : ionTypes) { - numExplainedMaxRankPeaks.put(ion, 0); - } - - int numSignalBinsAtThisSegment = 0; - for (int i = 0; i < annotation.size() - 1; i++) { - prmMassIndex += NominalMass.toNominalMass(annotation.get(i).getMass()); - srmMassIndex += NominalMass.toNominalMass(annotation.get(annotation.size() - 1 - i).getMass()); - - float prm = NominalMass.getMassFromNominalMass(prmMassIndex); - float srm = NominalMass.getMassFromNominalMass(srmMassIndex); - for (IonType ion : ionTypes) { - float theoMass; - if (ion instanceof IonType.PrefixIon) - theoMass = ion.getMz(prm); - else - theoMass = ion.getMz(srm); - - int segNum = super.getSegmentNum(theoMass, curParentMass); - if (segNum == seg) { - numSignalBinsAtThisSegment++; - Peak p = spec.getPeakByMass(theoMass, mme); - if (p != null) { - numExplainedPeaks++; - int rank = p.getRank(); - if (rank >= maxRank) { - rank = maxRank; - numExplainedMaxRankPeaks.put(ion, numExplainedMaxRankPeaks.get(ion) + 1); - } - explainedPeakSet.add(p); - rankDist.get(ion).add(rank); - } else { - rankDist.get(ion).add(maxRank + 1); // maxRank+1: missing ion - } - } - } - } - - ArrayList unexplainedPeaksAtThisSegment = new ArrayList(); - int numPeaksAtThisSegment = 0; - int numMaxRankPeaksAtThisSegment = 0; - for (Peak p : spec) { - if (super.getSegmentNum(p.getMz(), curParentMass) == seg) { - numPeaksAtThisSegment++; - if (p.getRank() >= maxRank) - numMaxRankPeaksAtThisSegment++; - if (!explainedPeakSet.contains(p)) - unexplainedPeaksAtThisSegment.add(p); - } - } - - float midMassThisSegment = (1f / numSegments * seg + 1f / numSegments / 2) * annotation.getParentMass(); - float numBinsAtThisSegment = annotation.getParentMass() / numSegments / mme.getToleranceAsDa(midMassThisSegment) / 2; - - for (Peak p : unexplainedPeaksAtThisSegment) { - int rank = 
p.getRank(); -// float noiseFreq = (float)(annotation.size()-1)/(annotation.getParentMass()/(mme.getToleranceAsDa(midMassThisSegment)*2)); - float noiseFreq = (annotation.size() - 1) / numSegments / numBinsAtThisSegment; - if (rank >= maxRank) - noiseDist[maxRank] += noiseFreq / numMaxRankPeaksAtThisSegment; - else - noiseDist[rank] += noiseFreq; - } - - for (IonType ion : ionTypes) { - if (numMaxRankPeaksAtThisSegment > 0) { - Float prevSumFreq = rankDistMaxRank.get(ion); - float curFreq = numExplainedMaxRankPeaks.get(ion) / (float) numMaxRankPeaksAtThisSegment; - rankDistMaxRank.put(ion, prevSumFreq + curFreq); - } - } - - noiseDist[maxRank + 1] += (numBinsAtThisSegment - numPeaksAtThisSegment) * (annotation.size() - 1) / numSegments / numBinsAtThisSegment; - } - - HashMap freqDist = new HashMap(); - for (IonType ion : ionTypes) { - Float[] dist = new Float[maxRank + 1]; - Histogram hist = rankDist.get(ion); - for (int i = 1; i <= maxRank - 1; i++) { - Integer num = hist.get(i); - dist[i - 1] = (num / (float) numSpec); - } - dist[maxRank - 1] = rankDistMaxRank.get(ion) / numSpec; - dist[maxRank] = hist.get(maxRank + 1) / (float) numSpec; - freqDist.put(ion, dist); - } - - // noise - Float[] dist = new Float[maxRank + 1]; - for (int i = 1; i <= maxRank + 1; i++) - dist[i - 1] = noiseDist[i] / numSpec; - freqDist.put(IonType.NOISE, dist); - - rankDistTable.put(partition, freqDist); - } - } - - protected void smoothing() { - smoothingRankDistTable(); - } - - protected void smoothingRankDistTable() { - if (rankDistTable == null) - return; - assert (smoothingRanks.length == smoothingWindowSize.length); - for (Partition partition : rankDistTable.keySet()) { - HashMap table = this.rankDistTable.get(partition); - for (IonType ion : table.keySet()) { - Float[] freq = table.get(ion); - Float[] smoothedFreq = new Float[freq.length]; - int smoothingIndex = 0; - for (int i = 0; i < freq.length - 2; i++) // last 2 columns: maxRank, unexplained - { - if (smoothingIndex < 
smoothingRanks.length - 1 && - i == smoothingRanks[smoothingIndex]) - smoothingIndex++; - int windowSize = smoothingWindowSize[smoothingIndex]; - float sumFrequencies = 0; - int numIndicesSummed = 0; - for (int d = -windowSize; d <= windowSize; d++) { - int index = i + d; - if (index < 0 || index > freq.length - 3) - continue; - sumFrequencies += freq[index]; - numIndicesSummed++; - } - while (sumFrequencies == 0 && windowSize < freq.length - 4) { - windowSize++; - int index = i - windowSize; - if (index >= 0) { - sumFrequencies += freq[index]; - numIndicesSummed++; - } - index = i + windowSize; - if (index <= freq.length - 3) { - sumFrequencies += freq[index]; - numIndicesSummed++; - } - } - if (sumFrequencies != 0) - smoothedFreq[i] = sumFrequencies / numIndicesSummed; - else - assert (false); - } - for (int i = 0; i < freq.length - 2; i++) - freq[i] = smoothedFreq[i]; - if (freq[freq.length - 1] == 0) - freq[freq.length - 1] = Float.MIN_VALUE; - if (freq[freq.length - 2] == 0) - freq[freq.length - 2] = freq[freq.length - 3]; - } - } - } -} \ No newline at end of file diff --git a/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGeneratorWithErrors.java b/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGeneratorWithErrors.java deleted file mode 100644 index 8cedf8e6..00000000 --- a/src/main/java/edu/ucsd/msjava/msscorer/ScoringParameterGeneratorWithErrors.java +++ /dev/null @@ -1,880 +0,0 @@ -package edu.ucsd.msjava.msscorer; - -import edu.ucsd.msjava.msgf.Histogram; -import edu.ucsd.msjava.msgf.IntHistogram; -import edu.ucsd.msjava.msgf.NominalMass; -import edu.ucsd.msjava.msgf.Tolerance; -import edu.ucsd.msjava.msscorer.NewScorerFactory.SpecDataType; -import edu.ucsd.msjava.msutil.*; -import edu.ucsd.msjava.msutil.IonType.PrefixIon; -import edu.ucsd.msjava.mgf.MgfSpectrumParser; - -import java.io.File; -import java.util.*; - -/** - * This only supports low accuracy fragment ions. 
- * - * @author sangtaekim - */ -public class ScoringParameterGeneratorWithErrors extends NewRankScorer { - private static final float MIN_PRECURSOR_OFFSET = -300; // for precursors - private static final float MAX_PRECURSOR_OFFSET = 30; - private static final int MIN_NUM_SPECTRA_PER_PARTITION = 400; // 400 - private static final int MIN_NUM_SPECTRA_FOR_PRECURSOR_OFF = 150; - private static final int MAX_NUM_PARTITIONS_PER_CHARGE = 30; // 30 - - private static final float MIN_PRECURSOR_OFFSET_PROBABILITY = 0.15f; // 0.15 - private static final float MIN_ION_OFFSET_PROBABILITY = 0.15f; // 0.15, for ion types - private static final float MIN_MAIN_ION_OFFSET_PROBABILITY = 0.01f; // ions with probabilities below this number will be ignored - - private static final int MAX_RANK = 150; - private static final int NUM_SEGMENTS_PER_SPECTRUM = 2; // 2 - - private static final int[] smoothingRanks = {3, 5, 10, 20, 50, Integer.MAX_VALUE}; //Ranks around which smoothing occurs - private static final int[] smoothingWindowSize = {0, 1, 2, 3, 4, 5}; //Smoothing windows for each smoothing rank - - private static final float DECONVOLUTION_MASS_TOLERANCE = 0.02f; - protected static final int MAX_CHARGE = 20; - - public static void generateParameters( - File specFile, - SpecDataType dataType, - AminoAcidSet aaSet, - File outputDir, - boolean isText, - boolean verbose, - boolean singlePartition - ) { - SpectraContainer container = new SpectraContainer(specFile.getPath(), new MgfSpectrumParser().aaSet(aaSet)); - generateParameters(container, dataType, aaSet, outputDir, isText, verbose, singlePartition); - } - - public static void generateParameters( - SpectraContainer container, - SpecDataType dataType, - AminoAcidSet aaSet, - File outputDir, - boolean isText, - boolean verbose) { - generateParameters(container, dataType, aaSet, outputDir, isText, verbose, false); - } - - public static void generateParameters( - SpectraContainer container, - SpecDataType dataType, - AminoAcidSet aaSet, 
- File outputDir, - boolean isText, - boolean verbose, - boolean singlePartition) { - if (verbose) - System.out.println("Number of annotated PSMs: " + container.size()); - - String paramFileName = dataType.toString() + ".param"; - - File outputFile; - if (outputDir != null) - outputFile = new File(outputDir, paramFileName); - else - outputFile = new File(paramFileName); - - if (verbose) - System.out.println("Output file name: " + outputFile.getAbsolutePath()); - int errorScalingFactor = 0; - boolean applyDeconvolution = false; - - if (dataType.getInstrumentType() == InstrumentType.HIGH_RESOLUTION_LTQ - || dataType.getInstrumentType() == InstrumentType.TOF - || dataType.getInstrumentType().isHighResolution()) { - errorScalingFactor = 100; - applyDeconvolution = true; - if (verbose) - System.out.println("High-precision MS/MS data: " + - "errorScalingFactor(" + errorScalingFactor + ") " + - "chargeDeconvolution(" + applyDeconvolution + ")"); - } - - boolean considerPhosLoss = false; - if (dataType.getProtocol().getName().equals("Phosphorylation")) { - considerPhosLoss = true; - if (verbose) - System.out.println("Consider H3PO4 loss."); - } - - boolean consideriTRAQLoss = false; - if (dataType.getProtocol().getName().equals("iTRAQ")) { - consideriTRAQLoss = true; - if (verbose) - System.out.println("Consider iTRAQ loss."); - } - - boolean considerTMTLoss = false; - if (dataType.getProtocol().getName().equals("TMT")) { - considerTMTLoss = true; - if (verbose) - System.out.println("Consider TMT loss."); - } - - if (dataType.getProtocol().getName().equals("iTRAQPhospho")) { - considerPhosLoss = true; - consideriTRAQLoss = true; - if (verbose) - System.out.println("Consider iTRAQ and H3PO4 loss."); - } - - HashSet pepSet = new HashSet(); - for (Spectrum spec : container) - pepSet.add(spec.getAnnotationStr()); - - if (verbose) - System.out.println("Number of unique peptides: " + pepSet.size()); - int numSpecsPerPeptide; - if (pepSet.size() < 2000) { - numSpecsPerPeptide = 
3; - } else { - numSpecsPerPeptide = 1; - } - if (verbose) - System.out.println("Consider " + numSpecsPerPeptide + " per spectrum."); - - // multiple spectra with the same peptide -> one spec per peptide - HashMap> pepSpecMap = new HashMap>(); - for (Spectrum spec : container) { - if (spec.getAnnotationStr() == null) - continue; - String pep = spec.getAnnotationStr() + ":" + spec.getCharge(); - if (pep != null && pep.length() > 0) { - ArrayList specList = pepSpecMap.get(pep); - if (specList == null) { - specList = new ArrayList(); - pepSpecMap.put(pep, specList); - } - if (specList.size() < numSpecsPerPeptide) - specList.add(spec); - } - } - - SpectraContainer specContOnePerPep = new SpectraContainer(); - for (ArrayList specList : pepSpecMap.values()) { - for (Spectrum spec : specList) { - specContOnePerPep.add(spec); - } - } - - ScoringParameterGeneratorWithErrors gen = new ScoringParameterGeneratorWithErrors( - specContOnePerPep, - dataType, - considerPhosLoss, - consideriTRAQLoss, - considerTMTLoss, - applyDeconvolution); - - // set up the tolerance - gen.tolerance(new Tolerance(0.5f)); - - // Step 1: partition spectra - if (singlePartition) - gen.partition(2, true); - else - gen.partition(NUM_SEGMENTS_PER_SPECTRUM, false); - if (verbose) - System.out.println("Partition: " + gen.partitionSet.size()); - - // Step 2: compute offset frequency functions of precursor peaks and their neutral losses - gen.precursorOFF(MIN_PRECURSOR_OFFSET_PROBABILITY); - if (verbose) - System.out.println("PrecursorOFF Done."); - - // Step 3: filter out "significant" precursor offsets - gen.filterPrecursorPeaks(); - if (verbose) - System.out.println("Filtering Done."); - - if (applyDeconvolution) { - gen.deconvoluteSpectra(); - if (verbose) - System.out.println("Deconvolution Done."); - } - - // Step 4: compute offset frequency function of fragment peaks and determine ion types to be considered for scoring - gen.selectIonTypes(); - if (verbose) - System.out.println("Ion types 
selected."); - - // Step 5: compute rank distributions - gen.generateRankDist(MAX_RANK); - if (verbose) - System.out.println("Rank distribution computed."); - - // Step 6 (optional): generate error distribution - gen.generateErrorDist(errorScalingFactor); - if (verbose) - System.out.println("Error disbribution computed"); - - // Step 7: smoothing parameters - gen.smoothing(); - if (verbose) - System.out.println("Smoothing complete."); - - // output - - gen.writeParameters(outputFile); - gen.writeParametersPlainText(new File(outputFile.getPath()+".txt")); - //if (!isText) - // gen.writeParameters(outputFile); - //else - // gen.writeParametersPlainText(outputFile); - - if (verbose) - System.out.println("Writing Done."); - } - - // Required - private SpectraContainer specContainer; - private final boolean considerPhosLoss; - private final boolean consideriTRAQLoss; - private final boolean considerTMTLoss; - - public ScoringParameterGeneratorWithErrors(SpectraContainer specContainer, SpecDataType dataType, boolean considerPhosLoss, boolean consideriTRAQLoss, boolean considerTMTLoss, boolean applyDeconvolution) { - this.specContainer = specContainer; - this.considerPhosLoss = considerPhosLoss; - this.consideriTRAQLoss = consideriTRAQLoss; - this.considerTMTLoss = considerTMTLoss; - super.dataType = dataType; - super.applyDeconvolution = applyDeconvolution; - super.deconvolutionErrorTolerance = DECONVOLUTION_MASS_TOLERANCE; - } - - public void partition(int numSegments, boolean singlePartition) { - super.numSegments = numSegments; - chargeHist = new Histogram(); - partitionSet = new TreeSet(); - - - HashMap> parentMassMap = new HashMap>(); - for (Spectrum spec : specContainer) { - int charge = spec.getCharge(); - if (charge <= 0) - continue; - chargeHist.add(charge); - if (spec.getAnnotation() != null) { - ArrayList precursorList = parentMassMap.get(charge); - if (precursorList == null) { - precursorList = new ArrayList(); - parentMassMap.put(charge, precursorList); - } 
- precursorList.add(spec.getPrecursorMass()); - } - } - - for (int c = chargeHist.minKey(); c <= chargeHist.maxKey(); c++) { - - ArrayList parentMassList = parentMassMap.get(c); - if (parentMassList == null) - continue; - - int numSpec = parentMassList.size(); - if (numSpec < Math.round(MIN_NUM_SPECTRA_PER_PARTITION * 0.9f)) // to few spectra - continue; - - int partitionSize = Math.max(numSpec / MAX_NUM_PARTITIONS_PER_CHARGE, MIN_NUM_SPECTRA_PER_PARTITION); - - Collections.sort(parentMassList); - int bestSetSize = 0; - - if (singlePartition) - bestSetSize = numSpec; - else { - int smallestRemainder = partitionSize; - for (int i = Math.round(partitionSize * 0.9f); i <= Math.round(partitionSize * 1.1f); i++) { - int remainder = numSpec % i; - if (i - remainder < remainder) - remainder = i - remainder; - if (remainder < smallestRemainder || (remainder == smallestRemainder && Math.abs(partitionSize - i) < Math.abs(partitionSize - bestSetSize))) { - bestSetSize = i; - smallestRemainder = remainder; - } - } - } - int num = 0; - for (int i = 0; i == 0 || i < Math.round(numSpec / (float) bestSetSize); i++) { - if (num != 0) { - for (int seg = 0; seg < numSegments; seg++) - partitionSet.add(new Partition(c, parentMassList.get(num), seg)); - } else { - for (int seg = 0; seg < numSegments; seg++) - partitionSet.add(new Partition(c, 0f, seg)); - } - num += bestSetSize; - } - } - } - - private void precursorOFF(float minProbThreshold) { - if (chargeHist == null) { - assert (false) : "partition() must have been called before"; - return; - } - precursorOFFMap = new TreeMap>(); - numPrecurOFF = 0; - - for (int charge = chargeHist.minKey(); charge <= chargeHist.maxKey(); charge++) { - if (chargeHist.get(charge) < MIN_NUM_SPECTRA_FOR_PRECURSOR_OFF) - continue; - ArrayList precursorOffsetList = new ArrayList(); - int numSpecs = 0; - HashMap> histList = new HashMap>(); - for (int c = charge; c >= 2; c--) - histList.put(c, new Histogram()); - - for (Spectrum spec : specContainer) { - 
if (spec.getAnnotation() == null) - continue; - if (spec.getCharge() != charge) - continue; - numSpecs++; - spec = filter.apply(spec); - float precursorNeutralMass = spec.getPrecursorMass(); - for (int c = charge; c >= 2; c--) { - float precursorMz = (precursorNeutralMass + c * (float) Composition.ChargeCarrierMass()) / c; - ArrayList peakList = spec.getPeakListByMassRange( - precursorMz + MIN_PRECURSOR_OFFSET / (float) c - mme.getToleranceAsDa(precursorMz + MIN_PRECURSOR_OFFSET / (float) c) / 2, - precursorMz + MAX_PRECURSOR_OFFSET / (float) c + mme.getToleranceAsDa(precursorMz + MAX_PRECURSOR_OFFSET / (float) c) / 2); - - int prevMassIndexDiff = Integer.MIN_VALUE; - for (Peak p : peakList) { - float peakMass = p.getMz(); - int massIndexDiff = NominalMass.toNominalMass(peakMass - precursorMz); - if (massIndexDiff > prevMassIndexDiff) { - histList.get(c).add(massIndexDiff); - prevMassIndexDiff = massIndexDiff; - } - } - } - } - - for (int c = charge; c >= 2; c--) { - ArrayList keyList = new ArrayList(histList.get(c).keySet()); - Collections.sort(keyList); - for (Integer key : keyList) { - float prob = (histList.get(c).get(key)) / (float) numSpecs; - if (prob > minProbThreshold) { - precursorOffsetList.add(new PrecursorOffsetFrequency((charge - c), NominalMass.getMassFromNominalMass(key), prob)); - } - } - } - precursorOFFMap.put(charge, precursorOffsetList); - numPrecurOFF += precursorOffsetList.size(); - } - } - - private void filterPrecursorPeaks() { - if (this.precursorOFFMap == null) - return; - for (Spectrum spec : specContainer) { - for (PrecursorOffsetFrequency off : this.getPrecursorOFF(spec.getCharge())) - spec.filterPrecursorPeaks(mme, off.getReducedCharge(), off.getOffset()); - } - } - - private void deconvoluteSpectra() { - SpectraContainer newSpecContainer = new SpectraContainer(); - for (Spectrum spec : specContainer) { - newSpecContainer.add(spec.getDeconvolutedSpectrum(DECONVOLUTION_MASS_TOLERANCE)); - } - specContainer = newSpecContainer; - } - - 
private Pair getPrecursorMassRange(Partition partition) { - float minParentMass = partition.getParentMass(); - float maxParentMass = Float.MAX_VALUE; - Partition higherPartition = partitionSet.higher(partition); - if (higherPartition != null) { - if (higherPartition.getCharge() == partition.getCharge() && higherPartition.getSegNum() == partition.getSegNum()) { - maxParentMass = higherPartition.getParentMass(); - } - } - return new Pair(minParentMass, maxParentMass); - } - - private void selectIonTypes() { - if (partitionSet == null) { - assert (false) : "partition() must have been called before!"; - return; - } - - fragOFFTable = new HashMap>(); - - for (Partition partition : partitionSet) { - int charge = partition.getCharge(); - // parent mass range check - Pair parentMassRange = getPrecursorMassRange(partition); - - SpectraContainer curPartContainer = new SpectraContainer(); - for (Spectrum spec : specContainer) { - if (spec.getAnnotation() == null) - continue; - if (spec.getCharge() != charge) - continue; - - float curParentMass = spec.getPrecursorMass(); - if (curParentMass < parentMassRange.getFirst() || curParentMass >= parentMassRange.getSecond()) - continue; - - curPartContainer.add(spec); - } - - ArrayList signalFragmentOffsetFrequencyList = new ArrayList(); - - int seg = partition.getSegNum(); - IonType[] allIonTypes = IonType.getAllKnownIonTypes(Math.min(charge, 3), true, considerPhosLoss, consideriTRAQLoss, considerTMTLoss).toArray(new IonType[0]); - - IonProbability probGen = new IonProbability( - curPartContainer.iterator(), - allIonTypes, - mme) - .filter(filter) - .segment(seg, numSegments); - -// if(partition.getCharge() == 2 && partition.getSegNum() == 1 && partition.getParentMass() >= 1008 && partition.getParentMass() < 1009) -// { -// System.out.println("Debug"); -// } - float[] ionProb = probGen.getIonProb(); - - float signalThreshold = MIN_ION_OFFSET_PROBABILITY; - for (int i = 0; i < allIonTypes.length; i++) { - if (ionProb[i] >= 
signalThreshold) - signalFragmentOffsetFrequencyList.add(new FragmentOffsetFrequency(allIonTypes[i], ionProb[i])); - } - - if (signalFragmentOffsetFrequencyList.size() == 0) { - int maxIndex = -1; - float maxIonProb = Float.MIN_VALUE; - for (int i = 0; i < allIonTypes.length; i++) { - if (ionProb[i] > MIN_MAIN_ION_OFFSET_PROBABILITY && ionProb[i] > maxIonProb) { - maxIndex = i; - maxIonProb = ionProb[i]; - } - } - if (maxIndex >= 0) - signalFragmentOffsetFrequencyList.add(new FragmentOffsetFrequency(allIonTypes[maxIndex], maxIonProb)); - } - - Collections.sort(signalFragmentOffsetFrequencyList, Collections.reverseOrder()); - fragOFFTable.put(partition, signalFragmentOffsetFrequencyList); - } - super.determineIonTypes(); - } - - private void generateRankDist(int maxRank) { - if (partitionSet == null) { - assert (false) : "partition() must have been called!"; - return; - } - - rankDistTable = new HashMap>(); - this.maxRank = maxRank; - - for (Partition partition : partitionSet) { - int charge = partition.getCharge(); - IonType[] ionTypes = getIonTypes(partition); - if (ionTypes == null || ionTypes.length == 0) - continue; - - Pair parentMassRange = getPrecursorMassRange(partition); - int seg = partition.getSegNum(); - - int numSpec = 0; - HashMap> rankDist = new HashMap>(); - HashMap rankDistMaxRank = new HashMap(); - HashMap rankDistUnexplained = new HashMap(); - - for (IonType ion : ionTypes) { - rankDist.put(ion, new Histogram()); - rankDistMaxRank.put(ion, 0f); - rankDistUnexplained.put(ion, 0f); - } - rankDist.put(IonType.NOISE, new Histogram()); - - float[] noiseDist = new float[maxRank + 2]; - int numMaxRankPeaks = 0; - int totalCleavageSites = 0; - - for (Spectrum spec : specContainer) { - int numExplainedPeaks = 0; - if (spec.getAnnotation() == null) - continue; - if (spec.getCharge() != charge) - continue; - float curParentMass = spec.getPrecursorMass(); - if (curParentMass < parentMassRange.getFirst() || curParentMass >= parentMassRange.getSecond()) - 
continue; - - Peptide annotation = spec.getAnnotation(); - spec.setRanksOfPeaks(); - numSpec++; - numMaxRankPeaks += spec.size() - maxRank + 1; - totalCleavageSites += annotation.size() - 1; - int prmMassIndex = 0; - int srmMassIndex = 0; - - HashSet explainedPeakSet = new HashSet(); - HashMap numExplainedMaxRankPeaks = new HashMap(); - for (IonType ion : ionTypes) { - numExplainedMaxRankPeaks.put(ion, 0); - } - - int numSignalBinsAtThisSegment = 0; - for (int i = 0; i < annotation.size() - 1; i++) { - prmMassIndex += NominalMass.toNominalMass(annotation.get(i).getMass()); - srmMassIndex += NominalMass.toNominalMass(annotation.get(annotation.size() - 1 - i).getMass()); - - float prm = NominalMass.getMassFromNominalMass(prmMassIndex); - float srm = NominalMass.getMassFromNominalMass(srmMassIndex); - for (IonType ion : ionTypes) { - float theoMass; - if (ion instanceof IonType.PrefixIon) - theoMass = ion.getMz(prm); - else - theoMass = ion.getMz(srm); - -// if(ion.getName().equals("z-H-TMT")) -// { -// System.out.println("Debug"); -// } - - int segNum = super.getSegmentNum(theoMass, curParentMass); - if (segNum == seg) { - numSignalBinsAtThisSegment++; - Peak p = spec.getPeakByMass(theoMass, mme); - if (p != null) { - numExplainedPeaks++; - int rank = p.getRank(); - if (rank >= maxRank) { - rank = maxRank; - numExplainedMaxRankPeaks.put(ion, numExplainedMaxRankPeaks.get(ion) + 1); - } - explainedPeakSet.add(p); - rankDist.get(ion).add(rank); - } else { - rankDist.get(ion).add(maxRank + 1); // maxRank+1: missing ion - } - } - } - } - - ArrayList unexplainedPeaksAtThisSegment = new ArrayList(); - int numPeaksAtThisSegment = 0; - int numMaxRankPeaksAtThisSegment = 0; - for (Peak p : spec) { - if (super.getSegmentNum(p.getMz(), curParentMass) == seg) { - numPeaksAtThisSegment++; - if (p.getRank() >= maxRank) - numMaxRankPeaksAtThisSegment++; - if (!explainedPeakSet.contains(p)) - unexplainedPeaksAtThisSegment.add(p); - } - } - - float midMassThisSegment = (1f / 
numSegments * seg + 1f / numSegments / 2) * annotation.getParentMass(); - float numBinsAtThisSegment = annotation.getParentMass() / numSegments / mme.getToleranceAsDa(midMassThisSegment) / 2; - - for (Peak p : unexplainedPeaksAtThisSegment) { - int rank = p.getRank(); - // float noiseFreq = (float)(annotation.size()-1)/(annotation.getParentMass()/(mme.getToleranceAsDa(midMassThisSegment)*2)); - float noiseFreq = (annotation.size() - 1) / numSegments / numBinsAtThisSegment; - if (rank >= maxRank) - noiseDist[maxRank] += noiseFreq / numMaxRankPeaksAtThisSegment; - else - noiseDist[rank] += noiseFreq; - } - - for (IonType ion : ionTypes) { - if (numMaxRankPeaksAtThisSegment > 0) { - Float prevSumFreq = rankDistMaxRank.get(ion); - float curFreq = numExplainedMaxRankPeaks.get(ion) / (float) numMaxRankPeaksAtThisSegment; - rankDistMaxRank.put(ion, prevSumFreq + curFreq); - } - } - - noiseDist[maxRank + 1] += (numBinsAtThisSegment - numPeaksAtThisSegment) * (annotation.size() - 1) / numSegments / numBinsAtThisSegment; - } - - HashMap freqDist = new HashMap(); - for (IonType ion : ionTypes) { - Float[] dist = new Float[maxRank + 1]; - Histogram hist = rankDist.get(ion); - for (int i = 1; i <= maxRank - 1; i++) { - Integer num = hist.get(i); - dist[i - 1] = (num / (float) numSpec); - } - dist[maxRank - 1] = rankDistMaxRank.get(ion) / numSpec; - dist[maxRank] = hist.get(maxRank + 1) / (float) numSpec; - freqDist.put(ion, dist); - } - - // noise - Float[] dist = new Float[maxRank + 1]; - for (int i = 1; i <= maxRank + 1; i++) - dist[i - 1] = noiseDist[i] / numSpec; - freqDist.put(IonType.NOISE, dist); - - rankDistTable.put(partition, freqDist); - } - } - - private void generateErrorDist(int errorScalingFactor) { - this.errorScalingFactor = errorScalingFactor; - if (errorScalingFactor > 0) { - generateIonErrorDist(); - generateNoiseErrorDist(); - } - } - - private void generateIonErrorDist() { - ionErrDistTable = new HashMap(); - ionExistenceTable = new HashMap(); - for 
(Partition partition : partitionSet) { - int charge = partition.getCharge(); - Pair parentMassRange = getPrecursorMassRange(partition); - int seg = partition.getSegNum(); - if (seg != super.getNumSegments() - 1) - continue; - IonType mainIon = this.getMainIonType(partition); - IntHistogram errHist = new IntHistogram(); - int[] edgeCount = new int[4]; - int numSpecs = 0; - for (Spectrum spec : specContainer) { - if (spec.getAnnotation() == null) - continue; - if (spec.getCharge() != charge) - continue; - - float curParentMass = spec.getPrecursorMass(); - if (curParentMass < parentMassRange.getFirst() || curParentMass >= parentMassRange.getSecond()) - continue; - - numSpecs++; - Peptide peptide; - - peptide = spec.getAnnotation(); - - int intResidueMass = 0; - float[] obsMass = new float[peptide.size() + 1]; - - obsMass[0] = mainIon.getOffset(); - for (int i = 0; i < peptide.size() - 1; i++) { - if (mainIon instanceof PrefixIon) - intResidueMass += peptide.get(i).getNominalMass(); - else - intResidueMass += peptide.get(peptide.size() - 1 - i).getNominalMass(); - - float theoMass = mainIon.getMz(NominalMass.getMassFromNominalMass(intResidueMass)); - Peak p = spec.getPeakByMass(theoMass, mme); - if (p != null) - obsMass[i + 1] = p.getMz(); - else - obsMass[i + 1] = -1; - } - - obsMass[peptide.size()] = mainIon.getMz(peptide.getMass()); - for (int i = 1; i <= peptide.size(); i++) { - if (obsMass[i] >= 0) { - if (obsMass[i - 1] >= 0) // yy - { - AminoAcid aa; - if (mainIon instanceof PrefixIon) - aa = peptide.get(i - 1); - else - aa = peptide.get(peptide.size() - i); - - float expMass = obsMass[i] - obsMass[i - 1]; - float theoMass = aa.getMass() / mainIon.getCharge(); - float diff = expMass - theoMass; - int diffIndex = Math.round(diff * errorScalingFactor); - if (diffIndex > errorScalingFactor) - diffIndex = errorScalingFactor; - else if (diffIndex < -errorScalingFactor) - diffIndex = -errorScalingFactor; - errHist.add(diffIndex); - edgeCount[3]++; - } else // ny - 
edgeCount[1]++; - } else { - if (obsMass[i - 1] >= 0) // yn - edgeCount[2]++; - else // nn - edgeCount[0]++; - } - } - } - - Float[] ionErrHist = new Float[2 * errorScalingFactor + 1]; - // smoothing - float[] smoothedHist = errHist.getSmoothedHist(errorScalingFactor); - for (int i = -errorScalingFactor; i <= errorScalingFactor; i++) - ionErrHist[i + errorScalingFactor] = smoothedHist[i + errorScalingFactor] / (float) errHist.totalCount(); - - Float[] ionExistence = new Float[edgeCount.length]; - int sumEdgeCount = 0; - for (int i = 0; i < edgeCount.length; i++) - sumEdgeCount += edgeCount[i]; - for (int i = 0; i < edgeCount.length; i++) - ionExistence[i] = edgeCount[i] / (float) sumEdgeCount; - - for (int i = 0; i < this.numSegments; i++) { - Partition part = new Partition(partition.getCharge(), partition.getParentMass(), i); - if (partitionSet.contains(part)) { - ionErrDistTable.put(part, ionErrHist); - ionExistenceTable.put(part, ionExistence); - } - } - // if(partition.getCharge() == 2 && partition.getParentMass() > 1000 && partition.getParentMass() < 1110) - // { - // System.out.println("Partition\t"+partition.getCharge()+"\t"+partition.getParentMass()); - // System.out.println("ErrorHist:"); - // for(int i=0; i(); - AminoAcidSet aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys(); - AminoAcid aaK = aaSet.getAminoAcid('K'); - AminoAcid aaQ = aaSet.getAminoAcid('Q'); - int heaviestAANominalMass = aaSet.getMaxNominalMass(); - float[] nominalMass = new float[heaviestAANominalMass + 1]; - for (AminoAcid aa : aaSet) - nominalMass[aa.getNominalMass()] = aa.getMass(); - - for (Partition partition : partitionSet) { - int charge = partition.getCharge(); - Pair parentMassRange = getPrecursorMassRange(partition); - int seg = partition.getSegNum(); - if (seg != super.getNumSegments() - 1) - continue; - - IntHistogram errHist = new IntHistogram(); - int numSpecs = 0; - for (Spectrum spec : specContainer) { - if (spec.getAnnotation() == null) - 
continue; - if (spec.getCharge() != charge) - continue; - - float curParentMass = spec.getPrecursorMass(); - if (curParentMass < parentMassRange.getFirst() || curParentMass >= parentMassRange.getSecond()) - continue; - - Spectrum noiseSpec = (Spectrum) spec.clone(); - - numSpecs++; - - for (int i = 0; i < noiseSpec.size() - 1; i++) { - Peak p1 = noiseSpec.get(i); - float p1Mass = p1.getMz(); - int nominalP1 = NominalMass.toNominalMass(p1.getMz()); - for (int j = i + 1; j < noiseSpec.size(); j++) { - Peak p2 = noiseSpec.get(j); - float p2Mass = p2.getMz(); - int nominalP2 = NominalMass.toNominalMass(p2.getMz()); - int nominalDiff = nominalP2 - nominalP1; - if (nominalDiff > heaviestAANominalMass) - break; - if (nominalMass[nominalDiff] == 0) - continue; - - float diff = p2Mass - p1Mass; - float aaMass = nominalMass[nominalDiff]; - if (nominalDiff == 128) // K or Q - { - if (Math.abs(diff - aaQ.getMass()) > Math.abs(diff - aaK.getMass())) - aaMass = aaK.getMass(); - else - aaMass = aaQ.getMass(); - } - float err = diff - aaMass; - errHist.add(Math.round(err * errorScalingFactor)); - } - } - } - Float[] noiseErrHist = new Float[2 * errorScalingFactor + 1]; - // smoothing - float[] smoothedHist = errHist.getSmoothedHist(errorScalingFactor); - for (int i = -errorScalingFactor; i <= errorScalingFactor; i++) - noiseErrHist[i + errorScalingFactor] = smoothedHist[i + errorScalingFactor] / (float) errHist.totalCount(); - - for (int i = 0; i < this.numSegments; i++) { - Partition part = new Partition(partition.getCharge(), partition.getParentMass(), i); - if (partitionSet.contains(part)) { - noiseErrDistTable.put(part, noiseErrHist); - } - } - } - } - - protected void smoothing() { - smoothingRankDistTable(); - } - - protected void smoothingRankDistTable() { - if (rankDistTable == null) - return; - assert (smoothingRanks.length == smoothingWindowSize.length); - for (Partition partition : rankDistTable.keySet()) { - HashMap table = this.rankDistTable.get(partition); - for 
(IonType ion : table.keySet()) { - Float[] freq = table.get(ion); - Float[] smoothedFreq = new Float[freq.length]; - int smoothingIndex = 0; - for (int i = 0; i < freq.length - 2; i++) // last 2 columns: maxRank, unexplained - { - if (smoothingIndex < smoothingRanks.length - 1 && - i == smoothingRanks[smoothingIndex]) - smoothingIndex++; - int windowSize = smoothingWindowSize[smoothingIndex]; - float sumFrequencies = 0; - int numIndicesSummed = 0; - for (int d = -windowSize; d <= windowSize; d++) { - int index = i + d; - if (index < 0 || index > freq.length - 3) - continue; - sumFrequencies += freq[index]; - numIndicesSummed++; - } - while (sumFrequencies == 0 && windowSize < freq.length - 4) { - windowSize++; - int index = i - windowSize; - if (index >= 0) { - sumFrequencies += freq[index]; - numIndicesSummed++; - } - index = i + windowSize; - if (index <= freq.length - 3) { - sumFrequencies += freq[index]; - numIndicesSummed++; - } - } - if (sumFrequencies != 0) - smoothedFreq[i] = sumFrequencies / numIndicesSummed; - else - assert (false); - } - for (int i = 0; i < freq.length - 2; i++) - freq[i] = smoothedFreq[i]; - if (freq[freq.length - 1] == 0) - freq[freq.length - 1] = Float.MIN_VALUE; - if (freq[freq.length - 2] == 0) - freq[freq.length - 2] = freq[freq.length - 3]; - } - } - } -} - diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java index 96bda071..ef199c04 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java @@ -864,7 +864,7 @@ private static boolean parseConfigEntry( ArrayList customAA, ModificationMetadata modMetadata) { - String modSetting = SearchParams.getConfigLineWithoutComment(dataLine); + String modSetting = MSGFPlusOptions.stripComment(dataLine); if (modSetting.length() == 0) { return true; } diff --git a/src/main/java/edu/ucsd/msjava/msutil/ScoredString.java 
b/src/main/java/edu/ucsd/msjava/msutil/ScoredString.java deleted file mode 100644 index 7cf895aa..00000000 --- a/src/main/java/edu/ucsd/msjava/msutil/ScoredString.java +++ /dev/null @@ -1,67 +0,0 @@ -/*************************************************************************** - * Title: - * Author: Sangtae Kim - * Last modified: - * - * Copyright (c) 2008-2009 The Regents of the University of California - * All Rights Reserved - * See file LICENSE for details. - ***************************************************************************/ -package edu.ucsd.msjava.msutil; - -/** - * The Class ScoredString. - */ -public class ScoredString extends Pair implements Comparable> { - - /** - * Instantiates a new scored string. - * - * @param peptide the peptide - * @param score the score - */ - public ScoredString(String peptide, Float score) { - super(peptide, score); - } - - /** - * Instantiates a new scored string, using an integer score. - * - * @param score - * @param peptide - */ - public ScoredString(String peptide, int score) { - super(peptide, (float) score); - } - - /* (non-Javadoc) - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - public int compareTo(Pair o) { - int scoreComp = getSecond().compareTo(o.getSecond()); - if (scoreComp != 0) - return scoreComp; - else - return getFirst().compareTo(o.getFirst()); - } - - /** - * Gets the str. - * - * @return the str - */ - public String getStr() { - return super.getFirst(); - } - - /** - * Gets the score. 
- * - * @return the score - */ - public float getScore() { - return super.getSecond(); - } - -} - diff --git a/src/main/java/edu/ucsd/msjava/output/Unimod.java b/src/main/java/edu/ucsd/msjava/output/Unimod.java deleted file mode 100644 index d3368a88..00000000 --- a/src/main/java/edu/ucsd/msjava/output/Unimod.java +++ /dev/null @@ -1,85 +0,0 @@ -package edu.ucsd.msjava.output; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.HashMap; -import java.util.Map; - -public class Unimod { - private static final String UNIMOD_RESOURCE_PATH = "unimod.obo"; - - public static Unimod getUnimod() { - return unimod; - } - - public String getRecordID(String name) { - return recordIDMap.get(name); - } - - public String getDeltaComposition(String id) { - return idToDeltaCompositionMap.get(id); - } - - private Map recordIDMap; // name -> record id - private Map idToDeltaCompositionMap; // id -> delta_composition - - private Unimod() { - readUnimodOBOFile(); - } - - private void readUnimodOBOFile() { - InputStream is = Unimod.class.getClassLoader().getResourceAsStream(UNIMOD_RESOURCE_PATH); - if (is == null) { - System.err.println("Unable to access \"unimod.obo\"."); - System.exit(-1); - } - BufferedReader in = new BufferedReader(new InputStreamReader(is)); - - recordIDMap = new HashMap(); - idToDeltaCompositionMap = new HashMap(); - String s; - String curID = null; - String deltaMass = null; - try { - while ((s = in.readLine()) != null) { - if (s.startsWith("id:")) { - String id = s.split("\\s+")[1].trim(); - String nameLine = in.readLine(); - assert (nameLine.startsWith("name:")); - String name = nameLine.split("\\s+")[1].trim(); - recordIDMap.put(name, id); - curID = id; - } - if (s.startsWith("xref: delta_composition")) { - String deltaComposition = s.substring(s.indexOf('"') + 1, s.lastIndexOf('"')); - idToDeltaCompositionMap.put(curID, deltaComposition); -// Double mass = 
UnimodComposition.getMass(deltaComposition); -// if(mass == null) -// { -// System.out.println(deltaComposition); -// } - if (deltaMass != null) { - Double mass = UnimodComposition.getMass(deltaComposition); - Double mass2 = Double.parseDouble(deltaMass); - if (Math.abs(mass - mass2) > 0.001) { - System.out.println("Error: " + deltaComposition + " " + mass + " " + mass2); - } - } - } - if (s.startsWith("xref: delta_mono_mass")) { - deltaMass = s.substring(s.indexOf('"') + 1, s.lastIndexOf('"')); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - private static Unimod unimod; - - static { - unimod = new Unimod(); - } -} diff --git a/src/main/java/edu/ucsd/msjava/output/UnimodComposition.java b/src/main/java/edu/ucsd/msjava/output/UnimodComposition.java deleted file mode 100644 index 474f67fc..00000000 --- a/src/main/java/edu/ucsd/msjava/output/UnimodComposition.java +++ /dev/null @@ -1,133 +0,0 @@ -package edu.ucsd.msjava.output; - -import edu.ucsd.msjava.msutil.Atom; -import edu.ucsd.msjava.msutil.Composition; - -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Map.Entry; - -public class UnimodComposition { - - - public UnimodComposition() { - compMap = new LinkedHashMap(); - compMap.put("H", 0); - compMap.put("C", 0); - compMap.put("N", 0); - compMap.put("O", 0); - compMap.put("P", 0); - compMap.put("S", 0); - } - - public void add(Composition comp) { - add("C", comp.getC()); - add("H", comp.getH()); - add("N", comp.getN()); - add("O", comp.getO()); - add("S", comp.getS()); - } - - public void add(String deltaComposition) { - String[] token = deltaComposition.split("\\s+"); - for (String e : token) { - if (e.matches("\\d*?[a-zA-Z]+(\\(-?\\d+\\))?")) { - String element; - int num; - if (e.matches("\\d*?[a-zA-Z]+")) { - element = e; - num = 1; - } else { - element = e.substring(0, e.indexOf('(')); - num = Integer.parseInt(e.substring(e.indexOf('(') + 1, e.lastIndexOf(')'))); - } - add(element, 
num); - } else if (e.matches("\\d+\\.?\\d*")) { - double mass = Double.parseDouble(e); - add(mass); - } else { - System.err.println("Wrong Unimod delta_composition: " + deltaComposition); - System.exit(-1); - } - } - } - - public void add(String element, int number) { - Integer num = compMap.get(element); - if (num == null) - compMap.put(element, number); - else - compMap.put(element, num + number); - } - - public void add(double deltaMass) { - if (this.deltaMass == null) - this.deltaMass = deltaMass; - else - this.deltaMass += deltaMass; - } - - public Double getMass() { - double mass = 0; - Iterator> itr = compMap.entrySet().iterator(); - while (itr.hasNext()) { - Entry entry = itr.next(); - String element = entry.getKey(); - int num = entry.getValue(); - if (num == 0) - continue; - Atom atom = Atom.get(element); - if (atom == null) { - System.out.println("Error: Could not parse element/molecule \"" + element + "\""); - return null; - } - mass += atom.getMass() * num; - } - - if (deltaMass != null) - mass += deltaMass; - return mass; - } - - public static Double getMass(String unimodCompositionStr) { - UnimodComposition comp = new UnimodComposition(); - comp.add(unimodCompositionStr); - return comp.getMass(); - } - - @Override - public String toString() { - StringBuffer buf = new StringBuffer(); - Iterator> itr = compMap.entrySet().iterator(); - boolean first = true; - while (itr.hasNext()) { - Entry entry = itr.next(); - String element = entry.getKey(); - int num = entry.getValue(); - if (num == 0) - continue; - else if (num == 1) { - if (!first) - buf.append(" "); - else - first = false; - buf.append(element); - } else { - if (!first) - buf.append(" "); - else - first = false; - buf.append(element + "(" + num + ")"); - } - } - - if (deltaMass != null) - buf.append(" " + deltaMass); - return buf.toString(); - } - - private Map compMap; - private Double deltaMass = null; - -} diff --git a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java 
b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java index d62867eb..d4710f3e 100644 --- a/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java +++ b/src/test/java/edu/ucsd/msjava/cli/MSGFPlusOptionsConfigFileTest.java @@ -121,4 +121,31 @@ public void validateRejectsOutOfRangeFlags() { // A clean invocation passes. Assert.assertNull(opts.validate()); } + + @Test + public void validateRejectsMissingModificationFile() throws IOException { + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.spectrumFile = new File("anything.mgf"); + opts.databaseFile = new File("anything.fasta"); + + opts.modificationFile = new File("does-not-exist.mods"); + Assert.assertEquals("Modification file not found: does-not-exist.mods", opts.validate()); + + Path tmpDir = Files.createTempDirectory("msgfplus-missing-mod-"); + Path conf = tmpDir.resolve("missing_mod.txt"); + Files.write(conf, "ModificationFile=does-not-exist-from-conf.mods\n".getBytes(StandardCharsets.UTF_8)); + + MSGFPlusOptions confOpts = new MSGFPlusOptions(); + confOpts.spectrumFile = new File("anything.mgf"); + confOpts.databaseFile = new File("anything.fasta"); + confOpts.configFile = conf.toFile(); + + SearchParams params = new SearchParams(); + Assert.assertEquals( + "Modification file not found: does-not-exist-from-conf.mods", + params.parse(confOpts)); + + Files.deleteIfExists(conf); + Files.deleteIfExists(tmpDir); + } } diff --git a/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java b/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java index 5d9987fe..f0320354 100644 --- a/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java +++ b/src/test/java/edu/ucsd/msjava/msdbsearch/SearchParamsTest.java @@ -28,7 +28,7 @@ public void parse() throws URISyntaxException { Assert.assertNull("SearchParams.parse returned: " + err, err); Assert.assertEquals("HighRes", opts.effectiveInstrumentType().getName()); - Assert.assertEquals("20.0 ppm", 
opts.effectivePrecursorTolerance().left.toString()); - Assert.assertEquals("20.0 ppm", opts.effectivePrecursorTolerance().right.toString()); + Assert.assertEquals("20.0 ppm", params.getLeftPrecursorMassTolerance().toString()); + Assert.assertEquals("20.0 ppm", params.getRightPrecursorMassTolerance().toString()); } } diff --git a/src/test/java/msgfplus/TestDirectPinWriter.java b/src/test/java/msgfplus/TestDirectPinWriter.java index 14b9d76c..e50e9b5b 100644 --- a/src/test/java/msgfplus/TestDirectPinWriter.java +++ b/src/test/java/msgfplus/TestDirectPinWriter.java @@ -56,7 +56,6 @@ public void writePinGetterReflectsOutputFormat() throws URISyntaxException { SearchParams params = new SearchParams(); Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); - Assert.assertTrue("writePin() should be true when outputFormat=pin", params.writePin()); Assert.assertFalse("writeTsv() should be false when outputFormat=pin", params.writeTsv()); } From 4e2ad5032dc91b6242f06840d60e95bf9efd6a7b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 11:37:15 +0100 Subject: [PATCH 29/34] refactor: trim deps + dead methods across fdr/msgf/msscorer/msutil/sequences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Net change: 21 files, +24 / -409 = -385 LOC. Combines the audit's medium-value pass with a parallel sweep across the codebase for methods and fields with no live callers. ### Dependencies dropped from pom.xml - commons-text: zero direct usages. - commons-io: NullOutputStream replaced by Java 11's OutputStream.nullOutputStream() in ConcurrentMSGFPlus; FilenameUtils.removeExtension replaced by an inline lastIndexOf('.') + substring in BuildSA. Removing commons-text also drops its transitive commons-lang3, which was used by mgf/BufferedRandomAccessLineReader for Pair. 
Replaced with a small local record BomStripResult (text, bomLength) — single-purpose, no API consumers, simpler at the use sites (result.text() / result.bomLength() vs result.getKey() / result.getValue()). ### Dead code removed across packages Verified zero remaining callers for each: - fdr/TargetDecoyAnalysis: ~38 lines of commented-out legacy constructors that referenced a non-existent TargetDecoyPSMSet class. Pure comment cleanup. - fdr/TSVPSMSet: 14 LOC of dead methods. - mgf/MgfSpectrumParser: 35 LOC of dead methods. - msgf/AAFrequencyCounter: 44 LOC of dead helpers. - msgf/MassListComparator: 15 LOC. - msscorer/NewRankScorer + NewScorerFactory: 31 LOC of dead helpers (kept all live scoring-path code per CLAUDE.md invariants). - msutil/AminoAcidSet, Composition, CompositionFactory, IonType, Peptide: ~110 LOC of dead helpers and fields. - sequences/FastaSequence + FastaSequences + ProteinFastaSequence + ProteinFastaSequences: ~76 LOC of dead methods. ### Other cleanup - SearchParams.toString: drop 6 lines of commented-out spectrum-list output and switch StringBuffer (synchronized, single-threaded caller) to StringBuilder. Verified: scoped sweep (TestDirectPinWriter, TestMSUtils, TestSA, TestMisc, TestRunManifestWriter, SearchParamsTest, TestPercolator, TestMinSpectraPerThread, TestPrecursorCalScaffolding, TestCandidatePeptideGrid + ConsideringMetCleavage, MSGFPlusOptionsConfigFileTest, MSGFPlusOptionsActivationMethodTest): 78 tests, 0 failures, 0 errors, 3 skipped. 
--- pom.xml | 14 +--- .../java/edu/ucsd/msjava/fdr/TSVPSMSet.java | 14 ---- .../ucsd/msjava/fdr/TargetDecoyAnalysis.java | 41 ----------- .../mgf/BufferedRandomAccessLineReader.java | 42 +++-------- .../ucsd/msjava/mgf/MgfSpectrumParser.java | 35 ---------- .../edu/ucsd/msjava/msdbsearch/BuildSA.java | 5 +- .../msjava/msdbsearch/ConcurrentMSGFPlus.java | 5 +- .../ucsd/msjava/msdbsearch/SearchParams.java | 9 +-- .../ucsd/msjava/msgf/AAFrequencyCounter.java | 44 ------------ .../ucsd/msjava/msgf/MassListComparator.java | 15 ---- .../ucsd/msjava/msscorer/NewRankScorer.java | 7 -- .../msjava/msscorer/NewScorerFactory.java | 24 ------- .../edu/ucsd/msjava/msutil/AminoAcidSet.java | 9 --- .../edu/ucsd/msjava/msutil/Composition.java | 69 +------------------ .../msjava/msutil/CompositionFactory.java | 4 +- .../java/edu/ucsd/msjava/msutil/IonType.java | 5 -- .../java/edu/ucsd/msjava/msutil/Peptide.java | 15 ---- .../ucsd/msjava/sequences/FastaSequence.java | 39 +---------- .../ucsd/msjava/sequences/FastaSequences.java | 15 ---- .../sequences/ProteinFastaSequence.java | 7 +- .../sequences/ProteinFastaSequences.java | 15 ---- 21 files changed, 24 insertions(+), 409 deletions(-) diff --git a/pom.xml b/pom.xml index 0e26ba33..0256882d 100644 --- a/pom.xml +++ b/pom.xml @@ -113,13 +113,10 @@ test jar - - org.apache.commons - commons-text - 1.11.0 - + + it.unimi.dsi fastutil @@ -135,11 +132,6 @@ logback-classic 1.2.12 - - commons-io - commons-io - 2.15.1 - info.picocli picocli @@ -164,4 +156,4 @@ https://proteomics.ucsd.edu MSGF+ - \ No newline at end of file + diff --git a/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java b/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java index 326a9ca4..d3cc769b 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java +++ b/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java @@ -235,18 +235,4 @@ public static String getPeptideFromAnnotation(String annotation) { return pep; } - public static void main(String argv[]) throws Exception { - File 
file = new File("/home/sangtaekim/Research/ToolDistribution/Test/inspect.out"); - ArrayList>> reqStrList = new ArrayList>>(); - ArrayList charges = new ArrayList(); - charges.add("1"); - charges.add("3"); - ArrayList peps = new ArrayList(); - peps.add("EE"); - reqStrList.add(new Pair>(2, peps)); - reqStrList.add(new Pair>(4, charges)); - TSVPSMSet psmSet = new TSVPSMSet(file, "\t", true, 14, true, 0, 1, 2, reqStrList); - psmSet.read(); - psmSet.printPeptideScoreTable(); - } } diff --git a/src/main/java/edu/ucsd/msjava/fdr/TargetDecoyAnalysis.java b/src/main/java/edu/ucsd/msjava/fdr/TargetDecoyAnalysis.java index ec8abc11..2c5c938e 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/TargetDecoyAnalysis.java +++ b/src/main/java/edu/ucsd/msjava/fdr/TargetDecoyAnalysis.java @@ -28,47 +28,6 @@ public TargetDecoyAnalysis(PSMSet target, PSMSet decoy, float pit) { pepLevelFDRMap = getFDRMap(target.getPepScores(), decoy.getPepScores(), isGreaterBetter, pit); } -// public TargetDecoyPSMSet( -// File concatenatedFile, -// String delimiter, -// boolean hasHeader, -// int scoreCol, -// boolean isGreaterBetter, -// int specFileCol, -// int specIndexCol, -// int pepCol, -// ArrayList>> reqStrList, -// int dbCol, String decoyPrefix) -// { -// target = new TSVPSMSet(concatenatedFile, delimiter, hasHeader, scoreCol, isGreaterBetter, specFileCol, specIndexCol, pepCol, reqStrList).decoy(dbCol, decoyPrefix, true).read(); -// decoy = new TSVPSMSet(concatenatedFile, delimiter, hasHeader, scoreCol, isGreaterBetter, specFileCol, specIndexCol, pepCol, reqStrList).decoy(dbCol, decoyPrefix, false).read(); -// this.isGreaterBetter = isGreaterBetter; -// isConcatenated = true; -// psmLevelFDRMap = getFDRMap(target.getPSMScores(), decoy.getPSMScores(), isGreaterBetter, isConcatenated, 1); -// pepLevelFDRMap = getFDRMap(target.getPepScores(), decoy.getPepScores(), isGreaterBetter, isConcatenated, 1); -// } -// -// public TargetDecoyPSMSet( -// File targetFile, -// File decoyFile, -// String delimiter, 
-// boolean hasHeader, -// int scoreCol, -// boolean isGreaterBetter, -// int specFileCol, -// int specIndexCol, -// int pepCol, -// ArrayList>> reqStrListPSMSet, -// float pit -// ) -// { -// target = new TSVPSMSet(targetFile, delimiter, hasHeader, scoreCol, isGreaterBetter, specFileCol, specIndexCol, pepCol, reqStrListPSMSet).read(); -// decoy = new TSVPSMSet(decoyFile, delimiter, hasHeader, scoreCol, isGreaterBetter, specFileCol, specIndexCol, pepCol, reqStrListPSMSet).read(); -// isConcatenated = false; -// psmLevelFDRMap = getFDRMap(target.getPSMScores(), decoy.getPSMScores(), isGreaterBetter, isConcatenated, pit); -// pepLevelFDRMap = getFDRMap(target.getPepScores(), decoy.getPepScores(), isGreaterBetter, isConcatenated, pit); -// } - public PSMSet getTargetPSMSet() { return target; } diff --git a/src/main/java/edu/ucsd/msjava/mgf/BufferedRandomAccessLineReader.java b/src/main/java/edu/ucsd/msjava/mgf/BufferedRandomAccessLineReader.java index a3422380..cb60076f 100644 --- a/src/main/java/edu/ucsd/msjava/mgf/BufferedRandomAccessLineReader.java +++ b/src/main/java/edu/ucsd/msjava/mgf/BufferedRandomAccessLineReader.java @@ -1,6 +1,5 @@ package edu.ucsd.msjava.mgf; -import org.apache.commons.lang3.tuple.Pair; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -78,17 +77,17 @@ private static boolean bytesMatchBOM(byte[] buf, UnicodeBOMInputStream.BOM bomTy * @return */ public static String stripBOM(String str) { - Pair result = stripBOMAndGetLength(str); - return result.getKey(); + return stripBOMAndGetLength(str).text(); } + /** Result of a BOM-strip: the updated string plus the BOM byte length. 
*/ + public record BomStripResult(String text, int bomLength) {} + /** - * Check for a byte order mark at the start of str - * If found, remove it - * @param str - * @return Key/value pair where the key is the updated string and the value is the byte order mark length + * Check for a byte order mark at the start of {@code str}; if found, + * remove it. Returns the updated string and the BOM byte length. */ - public static Pair stripBOMAndGetLength(String str) { + public static BomStripResult stripBOMAndGetLength(String str) { // Check for byte order marks byte[] buf = str.getBytes(); int copyOffset = 0; @@ -119,7 +118,7 @@ public static Pair stripBOMAndGetLength(String str) { str = new String(java.util.Arrays.copyOfRange(buf, copyOffset, buf.length)); } - return Pair.of(str, copyOffset); + return new BomStripResult(str, copyOffset); } private int fillBuffer() { @@ -153,11 +152,11 @@ public String readLine() { if (startOfFile) { // Check for a byte order mark - Pair result = stripBOMAndGetLength(str); + BomStripResult result = stripBOMAndGetLength(str); - bomLength = result.getValue(); + bomLength = result.bomLength(); if (bomLength > 0) { - str = result.getKey(); + str = result.text(); } } @@ -243,23 +242,4 @@ public void close() throws IOException { in.close(); } - public static void main(String argv[]) throws Exception { - long time = System.currentTimeMillis(); - String fileName = "/home/sangtaekim/Research/Data/ABRF/2011/UniProt.Yeast.NFISnr.contamsS48.fasta"; - BufferedRandomAccessLineReader in = new BufferedRandomAccessLineReader(fileName, 1 << 16); -// BufferedReader in = new BufferedReader(new FileReader(fileName)); -// RandomAccessFile in = new RandomAccessFile(fileName, "r"); - String s; - int lineNum = 0; - long pos = 0; - while ((s = in.readLine()) != null) { - lineNum++; - if (lineNum == 48232) - System.out.println(lineNum + " " + s + " " + (pos = in.getPosition())); - } - in.seek(pos); - System.out.println(in.readLine()); - System.out.println("Time: " 
+ (System.currentTimeMillis() - time)); - } - } diff --git a/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java b/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java index e8ed1e80..451f6dc2 100644 --- a/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java +++ b/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java @@ -410,39 +410,4 @@ private boolean extractScanNumFromTitleKeyValue(Spectrum spec, String title) { } // test code - public static void main(String argv[]) throws Exception { - long time = System.currentTimeMillis(); - String mgfFile = "/Users/sangtaekim/Research/Data/PNNL/IPYS_TD_Scere010_Orbitrap_001a.mgf"; -// String mgfFile = "/Users/sangtaekim/Research/Data/AgilentQTOF/notAnnotatedAgilentQTOF.mgf"; - - /* - // SpectraIterator test - MgfSpectrumParser parser = new MgfSpectrumParser(); - SpectraIterator itr = new SpectraIterator(mgfFile, parser); - int size = 0; - while(itr.hasNext()) - { - Spectrum spec = itr.next(); - size++; - System.out.println(spec.getScanNum()+" "+spec.getPrecursorPeak()); - } - System.out.println("Size: " + size); - */ - // SpectraMap test - - /* SpectraMap test - SpectraMap map = new SpectraMap(mgfFile, new MgfSpectrumParser()); - Spectrum spec = map.getSpectrumByScanNum(1585); - System.out.println(spec.getScanNum() + " " + spec.getPrecursorPeak()); - */ - -// SpectraContainer container = new SpectraContainer(mgfFile, new MgfSpectrumParser()); -// for(Spectrum spec : container) -// System.out.println(spec.getScanNum() + " " + spec.getPrecursorPeak()); - ArrayList specContainer = new ArrayList(); - SpectraIterator iterator = new SpectraIterator(mgfFile, new MgfSpectrumParser()); - while (iterator.hasNext()) - specContainer.add(iterator.next()); - System.out.println("Time: " + (System.currentTimeMillis() - time)); - } } diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/BuildSA.java b/src/main/java/edu/ucsd/msjava/msdbsearch/BuildSA.java index 5db5b57d..6e5c5195 100644 --- 
a/src/main/java/edu/ucsd/msjava/msdbsearch/BuildSA.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/BuildSA.java @@ -1,7 +1,6 @@ package edu.ucsd.msjava.msdbsearch; import edu.ucsd.msjava.cli.MSGFPlus; -import org.apache.commons.io.FilenameUtils; import java.io.BufferedWriter; import java.io.File; @@ -156,7 +155,9 @@ public static void buildSAFiles(File databaseFile, File outputDir, int mode, Str if (databaseFile.getName().toLowerCase().endsWith(".revCat.fasta".toLowerCase())) { System.err.println("Delete " + databaseFile.getName() + " and run MS-GF+ (or BuildSA) again."); } else { - String baseName = FilenameUtils.removeExtension(databaseFile.getName()); + String fileName = databaseFile.getName(); + int dot = fileName.lastIndexOf('.'); + String baseName = dot >= 0 ? fileName.substring(0, dot) : fileName; System.err.println("Delete files starting with " + baseName + " (but keep " + databaseFile.getName() + ") and run MS-GF+ (or BuildSA) again."); } diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java index 8943afc8..abee64da 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java @@ -3,15 +3,14 @@ import edu.ucsd.msjava.misc.ProgressData; import edu.ucsd.msjava.misc.ProgressReporter; +import java.io.OutputStream; import java.io.PrintStream; import java.util.ArrayList; import java.util.List; import java.util.function.Supplier; -import org.apache.commons.io.output.NullOutputStream; - public class ConcurrentMSGFPlus { - private static final PrintStream NULL_PRINT_STREAM = new PrintStream(new NullOutputStream()); + private static final PrintStream NULL_PRINT_STREAM = new PrintStream(OutputStream.nullOutputStream()); /** Per-task wall stats in milliseconds. {@code null} if the task didn't * complete (interrupted). 
*/ diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index 81edd496..93fb9d55 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -482,14 +482,7 @@ private static boolean isSupportedSpectrumFormat(SpecFileFormat fmt) { @Override public String toString() { - StringBuffer buf = new StringBuffer(); - -// buf.append("Spectrum File(s):\n"); -// for(DBSearchIOFiles ioFile : this.dbSearchIOList) -// { -// buf.append("\t"+ioFile.getSpecFile().getAbsolutePath()+"\n"); -// } -// buf.append("Database File: " + this.databaseFile.getAbsolutePath() + "\n"); + StringBuilder buf = new StringBuilder(); buf.append("\tPrecursorMassTolerance: "); if (leftPrecursorMassTolerance.equals(rightPrecursorMassTolerance)) { diff --git a/src/main/java/edu/ucsd/msjava/msgf/AAFrequencyCounter.java b/src/main/java/edu/ucsd/msjava/msgf/AAFrequencyCounter.java index bb04dc82..50b2bb29 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/AAFrequencyCounter.java +++ b/src/main/java/edu/ucsd/msjava/msgf/AAFrequencyCounter.java @@ -109,48 +109,4 @@ public int getOccurrence(String str) { return occ; } - public static void main(String argv[]) { - System.out.println(getRandomFrequency("AAA")); -// generate(3); - /* - AAFrequencyCounter counter = new AAFrequencyCounter(); - counter.readFromFreqFile("/home/sangtaekim/Research/Data/AAFrequency/SProt_2mer.txt"); - counter.frequencyTable.printSorted(); - */ - } - - public static void generate(int nMer) { - AAFrequencyCounter counter = new AAFrequencyCounter(); - counter.setNMer(nMer); -// counter.readFromFasta("/home/sangtaekim/Research/Data/SProt/uniprot_sprot.fasta"); - counter.readFromFasta("/home/sangtaekim/Research/Data/EColiDB/Ecol_protein_formatted.fasta"); - - System.out.println("n\t" + nMer); - System.out.println("size\t" + counter.sizeNMer); - String allAA = "GASPVTCLINDQKEMHFRYW"; - - if 
(nMer == 1) { - for (int i = 0; i < allAA.length(); i++) { - char c = allAA.charAt(i); - System.out.println(c + "\t" + counter.getOccurrence(String.valueOf(c))); - } - - } else if (nMer == 2) { - for (int i = 0; i < allAA.length(); i++) { - for (int j = 0; j < allAA.length(); j++) { - String s = "" + allAA.charAt(i) + allAA.charAt(j); - System.out.println(s + "\t" + counter.getOccurrence(s)); - } - } - } else if (nMer == 3) { - for (int i = 0; i < allAA.length(); i++) { - for (int j = 0; j < allAA.length(); j++) { - for (int k = 0; k < allAA.length(); k++) { - String s = "" + allAA.charAt(i) + allAA.charAt(j) + allAA.charAt(k); - System.out.println(s + "\t" + counter.getOccurrence(s)); - } - } - } - } - } } diff --git a/src/main/java/edu/ucsd/msjava/msgf/MassListComparator.java b/src/main/java/edu/ucsd/msjava/msgf/MassListComparator.java index 529fb4e9..b74c3872 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/MassListComparator.java +++ b/src/main/java/edu/ucsd/msjava/msgf/MassListComparator.java @@ -67,19 +67,4 @@ public T getMass2() { } } - public static void main(String argv[]) { - float[] data1 = {40, 40.1f, 40.2f, 50}; - float[] data2 = {39.7f, 40.05f, 40.6f}; - ArrayList list1 = new ArrayList(); - ArrayList list2 = new ArrayList(); - - for (float f : data1) - list1.add(new Mass(f)); - for (float f : data2) - list2.add(new Mass(f)); - - MassListComparator comparator = new MassListComparator(list1, list2); - for (MatchedPair pair : comparator.getMatchedList(new Tolerance(0.5f))) - System.out.println(pair.m1.getMass() + "\t" + pair.m2.getMass()); - } } diff --git a/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java b/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java index 99dd0378..faa84e2b 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java @@ -946,11 +946,4 @@ public void writeParametersPlainText(File outputFile) { out.close(); } - public static void main(String 
argv[]) throws Exception { - readWriteTest(); -// paramTest(); - } - - public static void readWriteTest() throws Exception { - } } diff --git a/src/main/java/edu/ucsd/msjava/msscorer/NewScorerFactory.java b/src/main/java/edu/ucsd/msjava/msscorer/NewScorerFactory.java index 38ce7ba4..094fc60c 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/NewScorerFactory.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/NewScorerFactory.java @@ -171,28 +171,4 @@ else if (!method.isElectronBased() && enzyme.isNTerm()) return scorer; } - public static void main(String argv[]) { - for (ActivationMethod method : ActivationMethod.getAllRegisteredActivationMethods()) { - if (method == ActivationMethod.FUSION || method == ActivationMethod.ASWRITTEN) - continue; - for (InstrumentType inst : InstrumentType.getAllRegisteredInstrumentTypes()) { - for (Enzyme enzyme : Enzyme.getAllRegisteredEnzymes()) { - for (Protocol protocol : Protocol.getAllRegisteredProtocols()) { -// if(method == ActivationMethod.HCD && inst == InstrumentType.QEXACTIVE && enzyme == Enzyme.UnspecificCleavage && protocol == Protocol.NOPROTOCOL) -// { -// System.out.println("Debug"); -// } - NewRankScorer scorer = NewScorerFactory.get(method, inst, enzyme, protocol); - System.out.print(method.getName() + "_" + inst.getName() + "_" + enzyme.getName() + "_" + protocol.getName() + " -> "); - if (scorer != null) { - System.out.println(scorer.getSpecDataType()); - } else { - System.err.println("Null!"); - System.exit(-1); - } - } - } - } - } - } } diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java index ef199c04..162ee034 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java @@ -7,8 +7,6 @@ import java.io.File; import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; import java.text.DecimalFormat; import java.util.*; @@ -1711,13 +1709,6 @@ private 
void updateAAListMapWithFixedModAA( aaListMap.put(loc, new ArrayList<>(newAAList)); } - public static void main(String argv[]) { - MSGFPlusOptions opts = new MSGFPlusOptions(); - Path modFilePath = Paths.get(System.getProperty("user.home") + "Research", "Data", "Debug", "mods.txt"); - AminoAcidSet aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath.toString(), opts); - aaSet.printAASet(); - } - private static class ModificationMetadata { public ModificationMetadata(int maxNumModsPerPeptide) { this.maxNumModsPerPeptide = maxNumModsPerPeptide; diff --git a/src/main/java/edu/ucsd/msjava/msutil/Composition.java b/src/main/java/edu/ucsd/msjava/msutil/Composition.java index 3a370429..21865f50 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Composition.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Composition.java @@ -382,47 +382,6 @@ public static int compareCompositions(int comp1, int comp2) { } - /** - * Check compositions sums that are equal to another standard composition. - */ - private static void checkEquality() { - // check which compositions are the sum of any two standard composition - AminoAcid[] stdAa = AminoAcid.getStandardAminoAcids(); - Composition[] stdComp = new Composition[stdAa.length]; - for (int i = 0; i < stdAa.length; i++) { - stdComp[i] = stdAa[i].getComposition(); - } - - System.out.println("Composition equalities: "); - for (int i = 0; i < stdAa.length; i++) { - for (int j = i; j < stdAa.length; j++) { - Composition sum = stdComp[i].getAddition(stdComp[j]); - for (int k = 0; k < stdAa.length; k++) { - if (sum.equals(stdComp[k])) { - System.out.println(stdAa[i].toString() + " plus " + stdAa[j].toString() + " equals " + stdAa[k].toString()); - } - } - } - } - - int[] singleMasses = new int[stdAa.length]; - for (int i = 0; i < stdAa.length; i++) { - singleMasses[i] = stdAa[i].getNominalMass(); - } - System.out.println("Integer equalities: "); - for (int i = 0; i < stdAa.length; i++) { - for (int j = i; j < stdAa.length; j++) { - int sum 
= stdComp[i].getNominalMass() + stdComp[j].getNominalMass(); - for (int k = 0; k < stdAa.length; k++) { - if (sum == stdComp[k].getNominalMass()) { - System.out.println(stdAa[i].toString() + " plus " + stdAa[j].toString() + " equals " + stdAa[k].toString()); - } - } - } - } - - } - /** * Remove spaces and tab characters anywhere in the text * @param text @@ -432,30 +391,4 @@ public static String removeWhitespace(String text) { return text.replaceAll("[ \\t]", "").trim(); } - public static void main(String argv[]) { - /* - Composition[] aa = { - new Composition(2,3,1,1,0), - new Composition(3,5,1,1,0), - new Composition(3,5,1,2,0), - new Composition(5,7,1,1,0), - new Composition(5,9,1,1,0), - new Composition(4,7,1,2,0), - new Composition(3,5,1,1,1), - new Composition(6,11,1,1,0), - new Composition(4,6,2,2,0), - new Composition(4,5,1,3,0), - new Composition(5,8,2,2,0), - new Composition(6,12,2,1,0), - new Composition(5,7,1,3,0), - new Composition(5,9,1,1,1),Serializable, - new Composition(6,7,3,1,0), - new Composition(9,9,1,1,0), - new Composition(6,12,4,1,0), - new Composition(9,9,1,2,0), - new Composition(11,10,2,1,0), - }; - */ - checkEquality(); - } -} \ No newline at end of file +} diff --git a/src/main/java/edu/ucsd/msjava/msutil/CompositionFactory.java b/src/main/java/edu/ucsd/msjava/msutil/CompositionFactory.java index 2c6b24f1..d003cb3d 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/CompositionFactory.java +++ b/src/main/java/edu/ucsd/msjava/msutil/CompositionFactory.java @@ -254,6 +254,4 @@ protected void makeAllPossibleMasses() { finalizeCompositionSet(); } - public static void main(String[] argv) { - } -} \ No newline at end of file +} diff --git a/src/main/java/edu/ucsd/msjava/msutil/IonType.java b/src/main/java/edu/ucsd/msjava/msutil/IonType.java index fde14b61..092ca973 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/IonType.java +++ b/src/main/java/edu/ucsd/msjava/msutil/IonType.java @@ -368,9 +368,4 @@ public static ArrayList 
getAllKnownIonTypes(int maxCharge, boolean remo compositionOffsetTable.put("TMT", 229.162932f); } - public static void main(String[] args) { - ArrayList allIons = IonType.getAllKnownIonTypes(3, true, true, false, true); - for (IonType ion : allIons) - System.out.println(ion.getName() + "\t" + ion.getOffset()); - } } diff --git a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java index cdcd91db..6b493c90 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java @@ -5,10 +5,6 @@ import edu.ucsd.msjava.msgf.MassListComparator; import edu.ucsd.msjava.msgf.Tolerance; import edu.ucsd.msjava.msutil.Modification.Location; -import edu.ucsd.msjava.cli.MSGFPlus; - -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -884,15 +880,4 @@ public ArrayList getModifications() return modList; } */ - public static void main(String[] a) { - edu.ucsd.msjava.cli.MSGFPlusOptions opts = new edu.ucsd.msjava.cli.MSGFPlusOptions(); - Path modFilePath = Paths.get(System.getProperty("user.home") + "Research", "ToolDistribution", "mods.txt"); - AminoAcidSet aaSet = AminoAcidSet.getAminoAcidSetFromModFile(modFilePath.toString(), opts); - Peptide p = new Peptide("+42.011+15.995MDNKTPVTLAK", aaSet); - System.out.println(p); - for (AminoAcid aa : p) - System.out.println(aa.getResidueStr() + " " + aa.getMass()); - System.out.println(p.getMass()); - } - } diff --git a/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java b/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java index 5a5328cb..2222b8a5 100644 --- a/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java +++ b/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java @@ -622,46 +622,9 @@ public Collection getValidAlphabetSetAsChars() { } */ - /* - public static void main(String[] args) { - debug(); - } - - - private static void 
debug() { - long time = System.currentTimeMillis(); - String fastaFile; - String userHome = System.getProperty("user.home"); - - //fastaFile = userHome+"/Data/Databases/uniprot_sprot.fasta"; - //fastaFile = userHome+"/Data/Databases/small.fasta"; - fastaFile = userHome+"/Data/Databases/tiny.fasta"; - - System.out.println("File name: "+fastaFile); - Adapter fr = new FastaSequence(fastaFile, Constants.AMINO_ACIDS_18); - //Adapter fr = new FastaSequence(fastaFile); - - System.out.println(fr.getByteAt(2)); - System.out.println("Total number of characters: " + fr.getSize()); - System.out.println("Alphabet size: " + fr.getAlphabetSize()); - System.out.println("Time to complete: " + (System.currentTimeMillis() - time)/1000.0 + "s"); - } - */ - /** * @author kyowon - will be erased soon */ //public int getMatchingEntryStartPosition(long position){ return annotations.floorKey((int)position)+1; } //public int getMatchingEntryEndPosition(long position){ return annotations.higherKey((int)position); } - public static void main(String[] args) { - String userHome = System.getProperty("user.home"); - //String filename = userHome+"/Data/Databases/Sone/pro/SOne_uniprot_plus_contaminants.fasta"; - //String filename = userHome+"/Data/Databases/human/ipi.HUMAN.v3.72.fasta"; - String filename = userHome + "/Data/Databases/uniprot_sprot.fasta"; - FastaSequence fs = new FastaSequence(filename); - System.out.println("Total number of bases: " + fs.getSize()); - - } - - -} \ No newline at end of file +} diff --git a/src/main/java/edu/ucsd/msjava/sequences/FastaSequences.java b/src/main/java/edu/ucsd/msjava/sequences/FastaSequences.java index 5614658c..75260e20 100644 --- a/src/main/java/edu/ucsd/msjava/sequences/FastaSequences.java +++ b/src/main/java/edu/ucsd/msjava/sequences/FastaSequences.java @@ -220,21 +220,6 @@ public char getCharAt(long position) { return getSequence((int) (pair >>> 32)).getCharAt((int) pair); } - public static void main(String[] args) { - String userHome = 
System.getProperty("user.home"); - String directory = userHome + "/Data/Databases/Scerv/gen"; - FastaSequences pfs = new FastaSequences(directory, false); - - System.out.println("Total number of bases: " + pfs.getSize()); - for (long start = 0; start < pfs.getSize(); start++) { - if (start % 1000000 == 0) { - if (pfs.isTerminator(start)) - System.out.println(pfs.getAnnotation(start)); - } - pfs.getByteAt(start); - } - } - public byte[] getBytes(int start, int end) { long pair1 = translate(start); long pair2 = translate(end); diff --git a/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequence.java b/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequence.java index 353254f5..c85bd412 100644 --- a/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequence.java +++ b/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequence.java @@ -93,9 +93,4 @@ public boolean hasMass(long position) { /***** Main method to test the size of a sequence *****/ - public static void main(String[] args) { - ProteinFastaSequence s = new ProteinFastaSequence(System.getProperty("user.home") + "/Data/Databases/ShewDB/SOne_uniprot_plus_contaminants.fasta"); - //ProteinFastaSequence s = new ProteinFastaSequence(System.getProperty("user.home")+"/Data/Databases/Asp/pro/translated.fasta"); - System.out.println("Size of database: " + s.getSize()); - } -} \ No newline at end of file +} diff --git a/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequences.java b/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequences.java index a0950163..e97cec7f 100644 --- a/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequences.java +++ b/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequences.java @@ -317,19 +317,4 @@ public long getStartPosition(long position) { /***** Main method to get the size of the database *****/ - public static void main(String[] args) { - String userHome = System.getProperty("user.home"); - String directory = userHome + 
"/Data/Databases/Hsapiens/translated"; - ProteinFastaSequences pfs = new ProteinFastaSequences(directory, false); - - System.out.println("Total number of bases: " + pfs.getSize()); - for (long start = 0; start < pfs.getSize(); start++) { - if (start % 1000000 == 0) { - if (pfs.isTerminator(start)) - System.out.println(pfs.getAnnotation(start)); - } - pfs.getByteAt(start); - } - } - } From f89d6ed5c21dd5e020eb948a0402249524da5d59 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 11:43:56 +0100 Subject: [PATCH 30/34] test: consolidate fixture builders into SearchTestFixtures Four test files (TestDirectPinWriter, TestPrecursorCalScaffolding, TestPrecursorCalIntegration, TestRunManifestWriter) had nearly identical buildOpts()/parsedSearchParams() helpers loading the standard MSGFDB_Param.txt + test.mgf + human-uniprot-contaminants.fasta fixture set. Collapse into a single shared SearchTestFixtures.standardOpts() helper in src/test/java/edu/ucsd/msjava/cli; the two tests that need extra fields (output file, maxMissedCleavages) call standardOpts() and tweak the result. Net change: -25 LOC, plus tests now stale-import-free. Verified: scoped sweep (42 tests across the 4 affected files + MSGFPlusOptionsConfigFileTest + MSGFPlusOptionsActivationMethodTest + SearchParamsTest): 0 failures, 0 errors. 
--- .../ucsd/msjava/cli/SearchTestFixtures.java | 26 +++++++++++++++++++ .../java/msgfplus/TestDirectPinWriter.java | 16 +++--------- .../msgfplus/TestPrecursorCalIntegration.java | 11 +++----- .../msgfplus/TestPrecursorCalScaffolding.java | 14 +++------- .../java/msgfplus/TestRunManifestWriter.java | 8 ++---- 5 files changed, 40 insertions(+), 35 deletions(-) create mode 100644 src/test/java/edu/ucsd/msjava/cli/SearchTestFixtures.java diff --git a/src/test/java/edu/ucsd/msjava/cli/SearchTestFixtures.java b/src/test/java/edu/ucsd/msjava/cli/SearchTestFixtures.java new file mode 100644 index 00000000..e7c50024 --- /dev/null +++ b/src/test/java/edu/ucsd/msjava/cli/SearchTestFixtures.java @@ -0,0 +1,26 @@ +package edu.ucsd.msjava.cli; + +import java.io.File; +import java.net.URISyntaxException; + +/** Shared test helpers for the standard search fixture set + * ({@code MSGFDB_Param.txt} + {@code test.mgf} + {@code human-uniprot-contaminants.fasta}). */ +public final class SearchTestFixtures { + + private SearchTestFixtures() {} + + /** Build an {@link MSGFPlusOptions} pointing at the bundled + * {@code MSGFDB_Param.txt} config, {@code test.mgf} spectra, and + * {@code human-uniprot-contaminants.fasta} database. 
*/ + public static MSGFPlusOptions standardOpts() throws URISyntaxException { + MSGFPlusOptions opts = new MSGFPlusOptions(); + opts.configFile = resource("MSGFDB_Param.txt"); + opts.spectrumFile = resource("test.mgf"); + opts.databaseFile = resource("human-uniprot-contaminants.fasta"); + return opts; + } + + private static File resource(String name) throws URISyntaxException { + return new File(SearchTestFixtures.class.getClassLoader().getResource(name).toURI()); + } +} diff --git a/src/test/java/msgfplus/TestDirectPinWriter.java b/src/test/java/msgfplus/TestDirectPinWriter.java index e50e9b5b..bda31155 100644 --- a/src/test/java/msgfplus/TestDirectPinWriter.java +++ b/src/test/java/msgfplus/TestDirectPinWriter.java @@ -2,9 +2,9 @@ import edu.ucsd.msjava.cli.MSGFPlusOptions; import edu.ucsd.msjava.cli.OutputFormat; +import edu.ucsd.msjava.cli.SearchTestFixtures; import edu.ucsd.msjava.msdbsearch.DatabaseMatch; import edu.ucsd.msjava.msdbsearch.SearchParams; -import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.ActivationMethod; import edu.ucsd.msjava.msutil.Enzyme; import edu.ucsd.msjava.output.DirectPinWriter; @@ -33,24 +33,16 @@ */ public class TestDirectPinWriter { - private MSGFPlusOptions buildOpts() throws URISyntaxException { - MSGFPlusOptions opts = new MSGFPlusOptions(); - opts.configFile = new File(SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI()); - opts.spectrumFile = new File(SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI()); - opts.databaseFile = new File(SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); - return opts; - } - @Test public void pinOutputFormatFlagIsAccepted() throws URISyntaxException { - MSGFPlusOptions opts = buildOpts(); + MSGFPlusOptions opts = SearchTestFixtures.standardOpts(); opts.outputFormat = OutputFormat.PIN; Assert.assertEquals(OutputFormat.PIN, opts.effectiveOutputFormat()); } @Test public void 
writePinGetterReflectsOutputFormat() throws URISyntaxException { - MSGFPlusOptions opts = buildOpts(); + MSGFPlusOptions opts = SearchTestFixtures.standardOpts(); opts.outputFormat = OutputFormat.PIN; SearchParams params = new SearchParams(); @@ -83,7 +75,7 @@ public void outputFormatAcceptsOnlyPinAndTsv() throws URISyntaxException { @Test public void pinHeaderColumnsIncludeRequiredPercolatorFields() throws Exception { - MSGFPlusOptions opts = buildOpts(); + MSGFPlusOptions opts = SearchTestFixtures.standardOpts(); opts.outputFormat = OutputFormat.PIN; SearchParams params = new SearchParams(); diff --git a/src/test/java/msgfplus/TestPrecursorCalIntegration.java b/src/test/java/msgfplus/TestPrecursorCalIntegration.java index fb85c668..bcc239f4 100644 --- a/src/test/java/msgfplus/TestPrecursorCalIntegration.java +++ b/src/test/java/msgfplus/TestPrecursorCalIntegration.java @@ -2,8 +2,8 @@ import edu.ucsd.msjava.cli.MSGFPlus; import edu.ucsd.msjava.cli.MSGFPlusOptions; +import edu.ucsd.msjava.cli.SearchTestFixtures; import edu.ucsd.msjava.msdbsearch.SearchParams.PrecursorCalMode; -import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.DBSearchIOFiles; import edu.ucsd.msjava.msutil.SpecFileFormat; import org.junit.Assert; @@ -39,12 +39,9 @@ */ public class TestPrecursorCalIntegration { - private MSGFPlusOptions buildOpts(File outputFile) throws URISyntaxException { - MSGFPlusOptions opts = new MSGFPlusOptions(); - opts.configFile = new File(SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI()); - opts.spectrumFile = new File(SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI()); - opts.databaseFile = new File(SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); - opts.outputFile = outputFile; + private static MSGFPlusOptions buildOpts(File outputFile) throws URISyntaxException { + MSGFPlusOptions opts = SearchTestFixtures.standardOpts(); + 
opts.outputFile = outputFile; return opts; } diff --git a/src/test/java/msgfplus/TestPrecursorCalScaffolding.java b/src/test/java/msgfplus/TestPrecursorCalScaffolding.java index 8f1c5e80..d66dfa4d 100644 --- a/src/test/java/msgfplus/TestPrecursorCalScaffolding.java +++ b/src/test/java/msgfplus/TestPrecursorCalScaffolding.java @@ -1,6 +1,7 @@ package msgfplus; import edu.ucsd.msjava.cli.MSGFPlusOptions; +import edu.ucsd.msjava.cli.SearchTestFixtures; import edu.ucsd.msjava.msdbsearch.SearchParams; import edu.ucsd.msjava.msdbsearch.SearchParams.PrecursorCalMode; import edu.ucsd.msjava.msdbsearch.SearchParamsTest; @@ -30,17 +31,10 @@ */ public class TestPrecursorCalScaffolding { - private MSGFPlusOptions buildOpts() throws URISyntaxException { - MSGFPlusOptions opts = new MSGFPlusOptions(); - opts.configFile = new File(SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI()); - opts.spectrumFile = new File(SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI()); - opts.databaseFile = new File(SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); - return opts; - } @Test public void precursorCalDefaultIsAuto() throws URISyntaxException { - MSGFPlusOptions opts = buildOpts(); + MSGFPlusOptions opts = SearchTestFixtures.standardOpts(); SearchParams params = new SearchParams(); Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); Assert.assertEquals("Default -precursorCal should be AUTO", @@ -49,7 +43,7 @@ public void precursorCalDefaultIsAuto() throws URISyntaxException { @Test public void precursorCalOnIsParsed() throws URISyntaxException { - MSGFPlusOptions opts = buildOpts(); + MSGFPlusOptions opts = SearchTestFixtures.standardOpts(); opts.precursorCalMode = PrecursorCalMode.ON; SearchParams params = new SearchParams(); Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); @@ -58,7 +52,7 @@ public void precursorCalOnIsParsed() throws 
URISyntaxException { @Test public void precursorCalOffIsParsed() throws URISyntaxException { - MSGFPlusOptions opts = buildOpts(); + MSGFPlusOptions opts = SearchTestFixtures.standardOpts(); opts.precursorCalMode = PrecursorCalMode.OFF; SearchParams params = new SearchParams(); Assert.assertNull("SearchParams.parse should succeed", params.parse(opts)); diff --git a/src/test/java/msgfplus/TestRunManifestWriter.java b/src/test/java/msgfplus/TestRunManifestWriter.java index 39018b92..bc641460 100644 --- a/src/test/java/msgfplus/TestRunManifestWriter.java +++ b/src/test/java/msgfplus/TestRunManifestWriter.java @@ -2,9 +2,9 @@ import edu.ucsd.msjava.cli.MSGFPlus; import edu.ucsd.msjava.cli.MSGFPlusOptions; +import edu.ucsd.msjava.cli.SearchTestFixtures; import edu.ucsd.msjava.misc.RunManifestWriter; import edu.ucsd.msjava.msdbsearch.SearchParams; -import edu.ucsd.msjava.msdbsearch.SearchParamsTest; import edu.ucsd.msjava.msutil.DBSearchIOFiles; import org.junit.Assert; import org.junit.Test; @@ -25,12 +25,8 @@ public class TestRunManifestWriter { private SearchParams parsedSearchParams() throws URISyntaxException { - MSGFPlusOptions opts = new MSGFPlusOptions(); - opts.configFile = new File(SearchParamsTest.class.getClassLoader().getResource("MSGFDB_Param.txt").toURI()); - opts.spectrumFile = new File(SearchParamsTest.class.getClassLoader().getResource("test.mgf").toURI()); - opts.databaseFile = new File(SearchParamsTest.class.getClassLoader().getResource("human-uniprot-contaminants.fasta").toURI()); + MSGFPlusOptions opts = SearchTestFixtures.standardOpts(); opts.maxMissedCleavages = 2; - SearchParams params = new SearchParams(); String err = params.parse(opts); Assert.assertNull("SearchParams.parse should succeed: " + err, err); From 6d7f8b7d8dc9167816a2bf05e1ec67c1705f40d0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 13:12:44 +0100 Subject: [PATCH 31/34] chore: drop trivial comments that restate signatures MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scrub 31 Java source files of five comment categories: - single-line javadoc restating the method/field name - block javadoc containing only @param/@return lines - /* (non-Javadoc) */ markers - section headers (GETTERS / SETTERS / CONSTRUCTORS / etc.) - commented-out code blocks (dead debug printlns, old algorithms, inactive fields, alternative scorer variants) Non-obvious WHY comments, license headers, performance invariants, CLI contract javadoc, and Picocli @Option descriptions are preserved unchanged. LOC delta: -2262 lines, +42 lines across 31 files. Scoped test sweep: 78 tests run, 0 failures, 0 errors, 3 skipped (TestMSUtils×1, TestSA×2 — pre-existing skips unrelated to this change). --- src/main/java/edu/ucsd/msjava/fdr/Pair.java | 57 +-- .../edu/ucsd/msjava/fdr/ScoredString.java | 25 -- .../ucsd/msjava/mgf/MgfSpectrumParser.java | 67 --- .../msdbsearch/CandidatePeptideGrid.java | 5 - .../msjava/msdbsearch/ConcurrentMSGFPlus.java | 1 - .../edu/ucsd/msjava/msdbsearch/DBScanner.java | 13 +- .../msjava/msdbsearch/ScoredSpectraMap.java | 16 - .../ucsd/msjava/msdbsearch/SearchParams.java | 31 -- .../ucsd/msjava/msgf/GeneratingFunction.java | 9 - .../java/edu/ucsd/msjava/msgf/ScoreDist.java | 18 - .../java/edu/ucsd/msjava/msgf/Tolerance.java | 11 +- .../ucsd/msjava/msscorer/DBScanScorer.java | 3 - .../edu/ucsd/msjava/msscorer/FastScorer.java | 7 - .../ucsd/msjava/msscorer/NewRankScorer.java | 12 - .../msjava/msscorer/NewScoredSpectrum.java | 32 +- .../ucsd/msjava/msutil/ActivationMethod.java | 3 - .../edu/ucsd/msjava/msutil/AminoAcid.java | 181 +------- .../edu/ucsd/msjava/msutil/AminoAcidSet.java | 120 +---- .../java/edu/ucsd/msjava/msutil/Enzyme.java | 201 +-------- .../ucsd/msjava/msutil/InstrumentType.java | 3 - .../java/edu/ucsd/msjava/msutil/Matter.java | 33 +- .../edu/ucsd/msjava/msutil/Modification.java | 69 +-- .../java/edu/ucsd/msjava/msutil/Pair.java | 64 +-- 
.../java/edu/ucsd/msjava/msutil/Peak.java | 188 +------- .../java/edu/ucsd/msjava/msutil/Peptide.java | 389 +---------------- .../java/edu/ucsd/msjava/msutil/Sequence.java | 145 +----- .../java/edu/ucsd/msjava/msutil/SpecKey.java | 6 +- .../ucsd/msjava/msutil/SpectraAccessor.java | 16 - .../java/edu/ucsd/msjava/msutil/Spectrum.java | 412 +----------------- .../edu/ucsd/msjava/mzml/StaxMzMLParser.java | 8 - .../ucsd/msjava/sequences/FastaSequence.java | 159 +------ 31 files changed, 42 insertions(+), 2262 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/fdr/Pair.java b/src/main/java/edu/ucsd/msjava/fdr/Pair.java index b1e14bb1..fd179bd7 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/Pair.java +++ b/src/main/java/edu/ucsd/msjava/fdr/Pair.java @@ -2,40 +2,18 @@ import java.util.Comparator; -/** - * This class represents a pair of two objects. - * - * @param the first object - * @param the second object - * @author sangtaekim - */ +/** Generic ordered pair. */ public class Pair { - /** - * The first. - */ private A first; - - /** - * The second. - */ private B second; - /** - * Instantiates a new pair. - * - * @param first the first - * @param second the second - */ public Pair(A first, B second) { super(); this.first = first; this.second = second; } - /* (non-Javadoc) - * @see java.lang.Object#hashCode() - */ public int hashCode() { int hashFirst = first != null ? first.hashCode() : 0; int hashSecond = second != null ? second.hashCode() : 0; @@ -43,9 +21,6 @@ public int hashCode() { return (hashFirst + hashSecond) * hashSecond + hashFirst; } - /* (non-Javadoc) - * @see java.lang.Object#equals(java.lang.Object) - */ public boolean equals(Object other) { if (other instanceof Pair) { Pair otherPair = (Pair) other; @@ -61,45 +36,22 @@ public boolean equals(Object other) { return false; } - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ public String toString() { return "(" + first + ", " + second + ")"; } - /** - * Gets the first. 
- * - * @return the first - */ public A getFirst() { return first; } - /** - * Sets the first. - * - * @param first the new first - */ public void setFirst(A first) { this.first = first; } - /** - * Gets the second. - * - * @return the second - */ public B getSecond() { return second; } - /** - * Sets the second. - * - * @param second the new second - */ public void setSecond(B second) { this.second = second; } @@ -115,13 +67,6 @@ public PairComparator(boolean useSecondForComprison) { this.useSecondForComprison = useSecondForComprison; } - /** - * Determines the order of Pair objects. If useSecondForComparison is set, use B for comparison, otherwise A is used. - * - * @param p1 the first element. - * @param p2 the second element. - * @return 1 if p1 > p2, -1 if p2 > p1 and 0 otherwise. - */ public int compare(Pair p1, Pair p2) { if (!useSecondForComprison) return p1.getFirst().compareTo(p2.getFirst()); diff --git a/src/main/java/edu/ucsd/msjava/fdr/ScoredString.java b/src/main/java/edu/ucsd/msjava/fdr/ScoredString.java index dca5ce14..06bc6636 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/ScoredString.java +++ b/src/main/java/edu/ucsd/msjava/fdr/ScoredString.java @@ -9,27 +9,12 @@ ***************************************************************************/ package edu.ucsd.msjava.fdr; -/** - * The Class ScoredString. - */ public class ScoredString extends Pair implements Comparable> { - /** - * Instantiates a new scored string. - * - * @param peptide the peptide - * @param score the score - */ public ScoredString(String peptide, Float score) { super(peptide, score); } - /** - * Instantiates a new scored string, using an integer score. - * - * @param score - * @param peptide - */ public ScoredString(String peptide, int score) { super(peptide, (float) score); } @@ -42,20 +27,10 @@ public int compareTo(Pair o) { return getFirst().compareTo(o.getFirst()); } - /** - * Gets the str. 
- * - * @return the str - */ public String getStr() { return super.getFirst(); } - /** - * Gets the score. - * - * @return the score - */ public float getScore() { return super.getSecond(); } diff --git a/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java b/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java index 451f6dc2..093e63ee 100644 --- a/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java +++ b/src/main/java/edu/ucsd/msjava/mgf/MgfSpectrumParser.java @@ -11,11 +11,6 @@ import static edu.ucsd.msjava.misc.TextParsingUtils.isInteger; -/** - * This class enables to parse spectrum file with mgf format. - * - * @author sangtaekim - */ public class MgfSpectrumParser implements SpectrumParser { private static final Pattern TITLE_SCAN_KEY_VALUE_PATTERN = Pattern.compile("(?i)(?:^|[\\s;])(?:scan|scans|spectrum)=(\\d+)(?:\\b|$)"); @@ -26,27 +21,13 @@ public class MgfSpectrumParser implements SpectrumParser { private long scanMissingWarningCount; - /** - * Number of scans where we could not determine the scan number - * This method is required by interface SpectrumParser - * @return - */ public long getScanMissingWarningCount() { return scanMissingWarningCount; } - /** - * Amino acid set to be used to parse "SEQ=" - */ private AminoAcidSet aaSet = AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys(); - /** - * Specify amino acid set to be used to parse "SEQ=" field. - * - * @param aaSet amino acid set. - * @return this object. - */ public MgfSpectrumParser aaSet(AminoAcidSet aaSet) { this.aaSet = aaSet; linesRead = 0; @@ -55,14 +36,6 @@ public MgfSpectrumParser aaSet(AminoAcidSet aaSet) { return this; } - /** - * Implementation of readSpectrum method. Implicitly lineReader points to the start of a spectrum. - * Reads mgf file line by line until the spectrum ends, generate a Spectrum object and returns it. - * If it cannot read a spectrum, it returns null. 
- * - * @param lineReader a LineReader object points to the start of a spectrum - * @return a spectrum object. null if no spectrum can be generated. - */ public Spectrum readSpectrum(LineReader lineReader) { Spectrum spec = null; String title = null; @@ -72,8 +45,6 @@ public Spectrum readSpectrum(LineReader lineReader) { int precursorCharge = 0; ActivationMethod activation = null; float elutionTimeSeconds = 0; -// Float toleranceVal = null; -// Tolerance.Unit toleranceUnit = null; String buf; boolean parse = false; // parse only after the BEGIN IONS @@ -112,7 +83,6 @@ public Spectrum readSpectrum(LineReader lineReader) { } else if (buf.startsWith("TITLE")) { title = buf.substring(buf.indexOf('=') + 1); spec.setTitle(title); -// spec.setID(title); } else if (buf.startsWith("CHARGE")) { // Charge state, e.g. CHARGE=2+ // Extract the text after the equals sign @@ -206,23 +176,6 @@ public Spectrum readSpectrum(LineReader lineReader) { else elutionTimeSeconds = Float.valueOf(token[0]); } -// else if(buf.startsWith("TOL=")) -// { -// String tolStr = buf.substring(buf.indexOf("=")+1); -// float toleranceValue = Float.parseFloat(tolStr); -// if(toleranceValue > 0) -// { -// toleranceVal = toleranceValue; -// } -// } -// else if(buf.startsWith("TOLU=")) -// { -// String tolUnitStr = buf.substring(buf.indexOf("=")+1); -// if(tolUnitStr.equalsIgnoreCase("ppm")) -// toleranceUnit = Tolerance.Unit.PPM; -// else if(tolUnitStr.equalsIgnoreCase("Da")) -// toleranceUnit = Tolerance.Unit.Da; -// } else if (buf.startsWith("END IONS")) { assert (spec != null); if (spec.getScanNum() < 0 && title != null) { @@ -260,11 +213,6 @@ else if (buf.startsWith("END IONS")) { spec.setRt(elutionTimeSeconds); spec.setRtIsSeconds(true); } -// if(toleranceVal != null && toleranceUnit != null) -// { -// Tolerance precursorTolerance = new Tolerance(toleranceVal, toleranceUnit); -// spec.setPrecursorTolerance(precursorTolerance); -// } if (!sorted) Collections.sort(spec); @@ -275,13 +223,6 @@ else if 
(buf.startsWith("END IONS")) { return null; } - /** - * Extract start and end scan from the title if it is of the form: - * DatasetName.ScanStart.ScanEnd.Charge - * - * @param spec Spectrum - * @param title Title line - */ private void extractScanRangeFromTitle(Spectrum spec, String title) { // Split on periods String[] token = title.split("\\."); @@ -323,13 +264,6 @@ private void extractScanRangeFromTitle(Spectrum spec, String title) { } } - /** - * Implementation of getSpecIndexMap object. Reads the entire spectrum file and - * generates a map from a spectrum index to the file position of the spectrum. - * - * @param lineReader a LineReader object that points to the start of a file. - * @return A map from spectrum indexes to the spectrum meta information. - */ public Map getSpecMetaInfoMap(BufferedRandomAccessLineReader lineReader) { Hashtable specIndexMap = new Hashtable(); String buf; @@ -409,5 +343,4 @@ private boolean extractScanNumFromTitleKeyValue(Spectrum spec, String title) { return true; } - // test code } diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGrid.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGrid.java index ddf3af10..b0667f6d 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGrid.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGrid.java @@ -64,11 +64,6 @@ public class CandidatePeptideGrid { private int length; private int[] size; -// public CandidatePeptideGrid(AminoAcidSet aaSet, int maxPeptideLength) -// { -// this(aaSet, maxPeptideLength, Constants.NUM_VARIANTS_PER_PEPTIDE); -// } - public CandidatePeptideGrid(AminoAcidSet aaSet, Enzyme enzyme, int maxPeptideLength, int maxNumVariantsPerPeptide, int maxMissedCleavages) { this.numMaxMods = aaSet.getMaxNumberOfVariableModificationsPerPeptide(); this.maxPeptideLength = maxPeptideLength; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java 
b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java index abee64da..1a82f7d1 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java @@ -191,7 +191,6 @@ public void run() { scanner.addResultsToList(resultList); progress.report(100.0); -// gen.addSpectrumIdentificationResults(scanner.getSpecIndexDBMatchMap()); long totalMs = (System.nanoTime() - taskStartNs) / 1_000_000L; wallStats = new TaskWallStats(taskNum, preprocessMs, dbSearchMs, computeEvalueMs, totalMs); scanner = null; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java b/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java index 2ac946af..7b7eea4e 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java @@ -251,10 +251,6 @@ class MatchList extends ArrayList { if (bufferIndex == 0) lcp = 0; - // For debugging -// System.out.println(index+": " +sequence.getSubsequence(index, sequence.getSize())); -// if(index == 4) -// System.out.println("Debug"); // skip redundant peptides if (Thread.currentThread().isInterrupted()) { @@ -415,19 +411,12 @@ else if (lcp == 0) // preceding aa is changed if (peptideLengthIndex < minPeptideLength) continue; -// System.out.println(sequence.getSubsequence(index+1, index+i+1)); -// if(sequence.getSubsequence(index+1, index+i+1).equalsIgnoreCase("KYPCRYCEK")) -// { -// System.out.println("DebugSequence: " + sequence.getSubsequence(index, index+i+1)); -// } - int cTermCleavageScore = 0; if (enzyme != null) { char cTermNeighboringResidue = sequence.getCharAt(index + peptideLengthIndex + 1); isProteinCTerm = (cTermNeighboringResidue == Constants.TERMINATOR_CHAR); if (enzyme.isCTerm()) { -// if(isProteinCTerm || enzyme.isCleavable(residue)) // || cTermNeighboringResidue == Constants.INVALID_CHAR) - if (enzyme.isCleavable(residue)) // || cTermNeighboringResidue == Constants.INVALID_CHAR) 
// changed by Sangtae to avoid SpecProb=0 + if (enzyme.isCleavable(residue)) // changed by Sangtae to avoid SpecProb=0 cTermCleavageScore = peptideCleavageCredit; else { cTermCleavageScore = peptideCleavagePenalty; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ScoredSpectraMap.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ScoredSpectraMap.java index 821baa7f..8dea0dfa 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ScoredSpectraMap.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ScoredSpectraMap.java @@ -33,8 +33,6 @@ public class ScoredSpectraMap { private Map specKeyRankScorerMap; -// private Map specKeyToleranceMap; - private boolean turnOffEdgeScoring = false; private ProgressData progress; @@ -158,7 +156,6 @@ public Tolerance getRightPrecursorMassTolerance() { return rightPrecursorMassTolerance; } - // public int getNumAllowedC13() { return numAllowedC13; } public int getMaxIsotopeError() { return maxIsotopeError; } @@ -182,14 +179,6 @@ public NewRankScorer getRankScorer(SpecKey specKey) { return this.specKeyRankScorerMap.get(specKey); } -// public Tolerance getSpectrumSpecificPrecursorTolerance(SpecKey specKey) -// { -// if(specKeyToleranceMap == null) -// return null; -// else -// return specKeyToleranceMap.get(specKey); -// } - public ScoredSpectraMap makePepMassSpecKeyMap() { for (SpecKey specKey : specKeyList) { int specIndex = specKey.getSpecIndex(); @@ -207,9 +196,6 @@ public ScoredSpectraMap makePepMassSpecKeyMap() { } specIndexChargeToSpecKeyMap.put(new Pair(specIndex, specKey.getCharge()), specKey); -// if(specKeyToleranceMap != null && spec.getPrecursorTolerance() != null) -// specKeyToleranceMap.put(specKey, spec.getPrecursorTolerance()); - } else { // Skip since precursor m/z is zero } @@ -269,7 +255,6 @@ private void preProcessIndividualSpectra(int fromIndex, int toIndex) { int charge = specKey.getCharge(); spec.setCharge(charge); - // System.out.println("GetScoredSpectrum for " + specKey.toString()); NewScoredSpectrum 
scoredSpec = scorer.getScoredSpectrum(spec); float peptideMass = spec.getPrecursorMass() - (float) Composition.H2O; @@ -366,7 +351,6 @@ private void preProcessFusedSpectra(int fromIndex, int toIndex) { float tolDaLeft = leftPrecursorMassTolerance.getToleranceAsDa(peptideMass); int maxNominalPeptideMass = NominalMass.toNominalMass(peptideMass) + Math.round(tolDaLeft - 0.4999f) + 1; if (supportEdgeScore) -// specKeyScorerMap.put(specKey, new DBScanScorerSum(scoredSpecList, maxNominalPeptideMass)); specKeyScorerMap.put(specKey, new FastScorer(scoredSpec, maxNominalPeptideMass)); else specKeyScorerMap.put(specKey, new FastScorer(scoredSpec, maxNominalPeptideMass)); diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index 93fb9d55..e1b913cc 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -88,42 +88,34 @@ public PrecursorCalMode getPrecursorCalMode() { return precursorCalMode; } - // Used by MS-GF+ public List getDBSearchIOList() { return dbSearchIOList; } - // Used by MS-GF+ public File getDatabaseFile() { return databaseFile; } - // Used by MS-GF+ public String getDecoyProteinPrefix() { return decoyProteinPrefix; } - // Used by MS-GF+ public Tolerance getLeftPrecursorMassTolerance() { return leftPrecursorMassTolerance; } - // Used by MS-GF+ public Tolerance getRightPrecursorMassTolerance() { return rightPrecursorMassTolerance; } - // Used by MS-GF+ public int getMinIsotopeError() { return minIsotopeError; } - // Used by MS-GF+ public int getMaxIsotopeError() { return maxIsotopeError; } - // Used by MS-GF+ public Enzyme getEnzyme() { return enzyme; } @@ -132,77 +124,62 @@ public int getNumTolerableTermini() { return numTolerableTermini; } - // Used by MS-GF+ public ActivationMethod getActivationMethod() { return activationMethod; } - // Used by MS-GF+ public InstrumentType getInstType() { return 
instType; } - // Used by MS-GF+ public Protocol getProtocol() { return protocol; } - // Used by MS-GF+ public AminoAcidSet getAASet() { return aaSet; } - // Used by MS-GF+ public int getNumMatchesPerSpec() { return numMatchesPerSpec; } - // Used by MS-GF+ public int getStartSpecIndex() { return startSpecIndex; } - // Used by MS-GF+ public int getEndSpecIndex() { return endSpecIndex; } - // Used by MS-GF+ public boolean useTDA() { return useTDA; } - // Used by MS-GF+ public boolean ignoreMetCleavage() { return ignoreMetCleavage; } - // Used by MS-GF+ public int getMinPeptideLength() { return minPeptideLength; } - // Used by MS-GF+ public int getMaxPeptideLength() { return maxPeptideLength; } - // Used by MS-GF+ public int getMaxNumVariantsPerPeptide() { return maxNumVariantsPerPeptide; } - // Used by MS-GF+ public int getMinCharge() { return minCharge; } - // Used by MS-GF+ public int getMaxCharge() { return maxCharge; } - // Used by MS-GF+ public int getNumThreads() { return numThreads; } @@ -219,12 +196,10 @@ public boolean getVerbose() { return verbose; } - // Used by MS-GF+ public boolean doNotUseEdgeScore() { return doNotUseEdgeScore; } - // Used by MS-GF+ public File getDBIndexDir() { return dbIndexDir; } @@ -233,12 +208,10 @@ public boolean outputAdditionalFeatures() { return outputAdditionalFeatures; } - // Used by MS-GF+ public int getMinNumPeaksPerSpectrum() { return minNumPeaksPerSpectrum; } - // Used by MS-GF+ public int getMinDeNovoScore() { return minDeNovoScore; } @@ -247,22 +220,18 @@ public double getChargeCarrierMass() { return chargeCarrierMass; } - // Used by MS-GF+ public int getMaxMissedCleavages() { return maxMissedCleavages; } - // Used by MS-GF+ public boolean getAllowDenseCentroidedPeaks() { return allowDenseCentroidedPeaks; } - // Used by MS-GF+ public int getMinMSLevel() { return minMSLevel; } - // Used by MS-GF+ public int getMaxMSLevel() { return maxMSLevel; } diff --git a/src/main/java/edu/ucsd/msjava/msgf/GeneratingFunction.java 
b/src/main/java/edu/ucsd/msjava/msgf/GeneratingFunction.java index 367fdf04..d7455754 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/GeneratingFunction.java +++ b/src/main/java/edu/ucsd/msjava/msgf/GeneratingFunction.java @@ -19,7 +19,6 @@ public class GeneratingFunction implements GF { private boolean calcProb = true; private Enzyme enzyme = Enzyme.TRYPSIN; - // private int numScoreBinsPerNode = 1000; private int gfTableCapacity; private ScoreDist distribution = null; @@ -27,9 +26,6 @@ public class GeneratingFunction implements GF { private class GFTable extends LinkedHashMap { - /** - * - */ private static final long serialVersionUID = 1L; private final int capacity; @@ -49,14 +45,11 @@ protected boolean removeEldestEntry(Map.Entry eldest) { private boolean isGFComputed = false; -// private HashMap srmScore = null; - public GeneratingFunction(DeNovoGraph graph) { this.graph = graph; this.gfTableCapacity = 1 + graph.intermediateNodes.size() + graph.sinkNodes.size(); } - // Builder public GeneratingFunction doNotBacktrack() { this.backtrack = false; return this; @@ -77,7 +70,6 @@ public GeneratingFunction enzyme(Enzyme enzyme) { return this; } - // public GeneratingFunction numScoreBinsPerNode(int numBins) { this.numScoreBinsPerNode = numBins; return this; } public GeneratingFunction gfTableCapacity(int gfTableCapacity) { this.gfTableCapacity = gfTableCapacity; return this; @@ -99,7 +91,6 @@ public Enzyme getEnzyme() { return enzyme; } - // public int getNumScoreBinsPerNode() { return numScoreBinsPerNode; } public boolean isGFComputed() { return this.isGFComputed; } diff --git a/src/main/java/edu/ucsd/msjava/msgf/ScoreDist.java b/src/main/java/edu/ucsd/msjava/msgf/ScoreDist.java index 46fbd97a..4effc750 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/ScoreDist.java +++ b/src/main/java/edu/ucsd/msjava/msgf/ScoreDist.java @@ -50,7 +50,6 @@ public double getSpectralProbability(int score) { double specProb = 0; int minIndex = (score >= minScore) ? 
score - minScore : 0; for (int t = minIndex; t < probDistribution.length; t++) { -// System.out.println("***********\t"+(t+minScore)+"\t"+probDistribution[t]); specProb += probDistribution[t]; } if (specProb > 1.) @@ -112,21 +111,4 @@ public ScoreBound getPercentileRange(float percentile) { return null; } -// // added by kyowon. Get a new ScoreDist instance. it has the same value as the original one from newMinScore to max score of the original ScoreDist -// static public ScoreDist getTruncatedScoreDist(ScoreDist original, int newMinScore){ -// ScoreDistFactory factory = new ScoreDistFactory(original.isNumSet(), original.isProbSet()); -// ScoreDist newDist = factory.getInstance(Math.max(newMinScore, original.getMinScore()), original.getMaxScore()); -// -// for(int score = newDist.getMinScore(); score= nodeMass.length || prevNominalMass >= nodeMass.length || curNominalMass < 0 || prevNominalMass < 0) return 0; int ionExistenceIndex = 0; diff --git a/src/main/java/edu/ucsd/msjava/msscorer/FastScorer.java b/src/main/java/edu/ucsd/msjava/msscorer/FastScorer.java index da9380f3..6cc969e4 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/FastScorer.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/FastScorer.java @@ -62,10 +62,6 @@ public int getScore(double[] prefixMassArr, int[] nominalPrefixMassArr, int from for (int i = fromIndex; i < toIndex - 1; i++) { int prefixMass = nominalPrefixMassArr[i]; int suffixMass = peptideMass - prefixMass; -// if(prefixMass >= prefixScore.length || suffixMass >= suffixScore.length) -// { -// System.out.println("Debug"); -// } int curScore; try { curScore = Math.round(prefixScore[prefixMass] + suffixScore[suffixMass]); @@ -80,9 +76,6 @@ public int getScore(double[] prefixMassArr, int[] nominalPrefixMassArr, int from } public int getNodeScore(NominalMass prefixMass, NominalMass suffixMass) { -// if(prefixMass.getNominalMass() >= prefixScore.length || -// suffixMass.getNominalMass() >= suffixScore.length) -// 
System.out.println("Debug"); int preNormMass = prefixMass.getNominalMass(); int sufNormMass = suffixMass.getNominalMass(); if (preNormMass >= prefixScore.length || sufNormMass >= suffixScore.length || preNormMass < 0 || sufNormMass < 0) diff --git a/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java b/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java index faa84e2b..0ea1db20 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java @@ -197,14 +197,7 @@ protected void readFromFile(File paramFile, boolean verbose) { private void readFromInputStream(InputStream is, boolean verbose) { DataInputStream in = new DataInputStream(is); - // Read the date try { -// int year = in.readInt(); // version information -// int month = in.readInt(); -// int date = in.readInt(); -// if(verbose) -// System.out.println("CreationDate: " + year + "/" + (month+1) + "/" + date); - int version = in.readInt(); if (verbose) System.out.println("Version: " + version); @@ -245,11 +238,6 @@ private void readFromInputStream(InputStream is, boolean verbose) { for (byte i = 0; i < lenProtocol; i++) bufProtocol.append(in.readChar()); protocol = Protocol.get(bufProtocol.toString()); -// if(protocol == null) -// { -// System.out.println(bufProtocol.toString()); -// System.exit(-1); -// } } else protocol = Protocol.AUTOMATIC; diff --git a/src/main/java/edu/ucsd/msjava/msscorer/NewScoredSpectrum.java b/src/main/java/edu/ucsd/msjava/msscorer/NewScoredSpectrum.java index 195122c3..56c1a653 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/NewScoredSpectrum.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/NewScoredSpectrum.java @@ -28,7 +28,6 @@ public NewScoredSpectrum(Spectrum spec, NewRankScorer scorer) { this.mme = scorer.mme; this.precursor = spec.getPrecursorPeak().clone(); this.activationMethodArr = new ActivationMethod[1]; -// activationMethodArr[0] = scorer.getActivationMethod(); if 
(spec.getActivationMethod() != null) activationMethodArr[0] = spec.getActivationMethod(); else @@ -93,16 +92,6 @@ public int getEdgeScore(T curNode, T prevNode, float theoMass) { float edgeScore = scorer.getIonExistenceScore(partition, ionExistenceIndex, probPeak); if (ionExistenceIndex == 3) edgeScore += scorer.getErrorScore(partition, curNodeMass - prevNodeMass - theoMass); - -// // debug -// if(edgeScore < -1000 || edgeScore > 1000) -// { -// System.out.println("Error! EdgeScore = " + edgeScore); -// System.out.println("Spectrum ScanNum: " + spec.getScanNum()); -// System.out.println("Partition: " + partition.getCharge() + " " + partition.getSegNum() + " " + partition.getParentMass()); -// System.out.println("IonExistence: " + scorer.getIonExistenceScore(partition, ionExistenceIndex, probPeak)); -// System.out.println("Error: " + scorer.getErrorScore(partition, curNodeMass-prevNodeMass-theoMass)); -// } return Math.round(edgeScore); } @@ -126,12 +115,7 @@ public boolean getMainIonDirection() { return mainIon.isPrefixIon(); } - /** - * returns the corrected mass of the node based on the peak observed in the spectrum - * - * @param node - * @return corrected mass of the node if peak exists, null -1 - */ + /** Returns the corrected m/z from the observed peak, or -1 if no peak was found. 
*/ public float getNodeMass(T node) { if (node.getNominalMass() == 0) return 0; @@ -215,9 +199,6 @@ public float getExplainedIonCurrent(float residueMass, boolean isPrefix, Toleran public Pair getMassErrorWithIntensity(float residueMass, boolean isPrefix, Tolerance fragmentTolerance) { Float error = null; float maxIntensity = 0; -// IonType bestIon = null; -// Peak bestPeak = null; -// float bestTheoMass = 0; for (int segIndex = 0; segIndex < scorer.getNumSegments(); segIndex++) { for (IonType ion : ionTypes[segIndex]) { @@ -246,18 +227,10 @@ public Pair getMassErrorWithIntensity(float residueMass, boolean i if (p != null) // peak exists { float err = (p.getMz() - theoMass) / theoMass * 1e6f; -// float err = p.getMz() - theoMass; -// if(err < 0) -// err = -err; float intensity = p.getIntensity(); - // Debug -// System.out.println(residueMass + " " + ion.getName() + " " + err + " " + intensity); if (intensity > maxIntensity) { error = err; maxIntensity = intensity; -// bestIon = ion; -// bestPeak = p; -// bestTheoMass = theoMass; } } } @@ -265,9 +238,6 @@ public Pair getMassErrorWithIntensity(float residueMass, boolean i if (error == null) return null; else { -// // Debug -// System.out.println("*\t" + residueMass + "\t" + bestIon.getName() + "\t" + error + "\t" + bestPeak.getRank() -// + "\t" + bestPeak.getMz() + "\t" + bestPeak.getIntensity() + "\t" + bestTheoMass); return new Pair(error, maxIntensity); } } diff --git a/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java b/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java index eb050444..4639b667 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java +++ b/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java @@ -113,12 +113,10 @@ private static void add(ActivationMethod actMethod) { registeredActMethods.add(actMethod); } - // add to the HashMap only private static void addAlias(String name, ActivationMethod actMethod) { table.put(name, actMethod); } - // add to the list only 
private static void addToList(ActivationMethod actMethod) { registeredActMethods.add(actMethod); } @@ -148,7 +146,6 @@ private static void addToList(ActivationMethod actMethod) { // Parse activation methods defined by a user File actMethodFile = Paths.get("params", "activationMethods.txt").toFile(); if (actMethodFile.exists()) { -// System.out.println("Loading " + actMethodFile.getAbsolutePath()); ArrayList paramLines = UserParam.parseFromFile(actMethodFile.getPath(), 2); for (String paramLine : paramLines) { String[] token = paramLine.split(","); diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcid.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcid.java index 61081dbc..3f5a64f8 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcid.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcid.java @@ -20,13 +20,6 @@ public class AminoAcid extends Matter { private float probability = 0.05f; private Composition composition; - /** - * Constructor. - * - * @param residue single letter identifier. - * @param name full name of the amino acid. - * @param composition CHNOS composition object. - */ protected AminoAcid(char residue, String name, Composition composition) { this.mass = composition.getAccurateMass(); this.nominalMass = composition.getNominalMass(); @@ -35,13 +28,6 @@ protected AminoAcid(char residue, String name, Composition composition) { this.composition = composition; } - /** - * Constructor. Generates a custom amino acid. - * - * @param residue Single letter identifier. - * @param name Full name of the amino acid. - * @param mass Mass - */ protected AminoAcid(char residue, String name, double mass) { this.mass = mass; this.nominalMass = Math.round(Constants.INTEGER_MASS_SCALER * (float) mass); @@ -49,109 +35,54 @@ protected AminoAcid(char residue, String name, double mass) { this.name = name; } - /** - * Builder. Set probability and returns this object. - * - * @return this object. 
- */ public AminoAcid setProbability(float probability) { this.probability = probability; return this; } - /** - * Standard string representation of this object. Output the single letter - * representation. - * - * @return the single letter code for this amino acid. - */ public String toString() { return String.valueOf(residue) + ": " + String.format("%.2f", mass); } - /** - * Quick way to tell whether this object is modified. - * - * @return false if this is not modified. - */ + /** Returns false; overridden by {@code ModifiedAminoAcid}. */ public boolean isModified() { return false; } - /** - * Quick way to tell the number of variable modifications applied to this amino acid. - * - * @return the number of variable modifications applied to this amino acid. - */ + /** Returns 0; overridden by {@code ModifiedAminoAcid}. */ public int getNumVariableMods() { return 0; } - /** - * Tell whether this object is associated with a terminal-specific modification - * - * @return false if this is not associated with terminal-specific modification - */ + /** Returns false; overridden by {@code ModifiedAminoAcid}. */ public boolean hasTerminalVariableMod() { return false; } - /** - * Tell whether this object is associated with a residue-specific modification - * - * @return false if this is not associated with residue-specific modification - */ + /** Returns false; overridden by {@code ModifiedAminoAcid}. */ public boolean hasResidueSpecificVariableMod() { return false; } - // accessor methods - - /** - * Gets the mass of this amino acid. This is the mono isotopic mass. - * - * @return the mass of this amino acid - */ @Override public float getMass() { return (float) mass; } - /** - * Gets the mass of this amino acid as double precision. This is the mono isotopic mass. - * - * @return the mass of this amino acid (double precision) - */ @Override public double getAccurateMass() { return mass; } - /** - * Gets the nominal mass of this object. 
- * - * @return nominal mass of this object. - */ @Override public int getNominalMass() { return nominalMass; } - /** - * Gets the probability of this amino acid. Currently set as 1/20, uniformly. - * - * @return the probability of this amino acid. - */ public float getProbability() { return probability; } - // // prohibited - // @Override - // public void add(AminoAcid other) { - // assert(false); - // } - @Override public boolean equals(Object obj) { if (!(obj instanceof AminoAcid)) @@ -160,52 +91,27 @@ public boolean equals(Object obj) { return this == aa; } - /** - * Gets the representation of the residue as string. - * - * @return the string representing this amino acid. - */ public String getResidueStr() { return String.valueOf(residue); } - /** - * Gets the single letter amino acid representation. - * - * @return the single letter amino acid character. - */ public char getResidue() { return residue; } - /** - * Gets the single letter amino acid representation of the unmodified version of this amino acid. - * - * @return the single letter amino acid character. - */ + /** Returns the unmodified residue letter; overridden by ModifiedAminoAcid. */ public char getUnmodResidue() { return residue; } - /** - * Gets the full string. - * - * @return the full name/description of the amino acid. - */ public String getName() { return name; } - /** - * Gets the composition object for this amino acid. - * - * @return the composition object for this amino acid. - */ public Composition getComposition() { return composition; } - // static members public static AminoAcid getStandardAminoAcid(char residue) { return residueMap.get(residue); } @@ -214,12 +120,6 @@ public static AminoAcid[] getStandardAminoAcids() { return standardAATable; } - /** - * Returns a modified version of this amino acid (fixed modification). - * - * @param mod a modification. - * @return a modified amino acid object. 
- */ public AminoAcid getAAWithFixedModification(Modification mod) { String name = mod.getName() + " " + this.getName(); AminoAcid modAA; @@ -230,13 +130,6 @@ public AminoAcid getAAWithFixedModification(Modification mod) { return modAA; } - /** - * Get an amino acid with a customized mass - * @param residue - * @param name - * @param mass - * @return - */ public static AminoAcid getCustomAminoAcid(char residue, String name, double mass) { AminoAcid standardAA = AminoAcid.getStandardAminoAcid(residue); if (standardAA != null && Math.abs(mass - standardAA.getMass()) < 0.001f) @@ -262,20 +155,8 @@ public int hashCode() { return (int) residue; } -// @Override -// public boolean equals(Object obj) -// { -// if(!(obj instanceof AminoAcid)) -// return false; -// else -// { -// AminoAcid otherAA = (AminoAcid)obj; -// return this.getResidue() == otherAA.getResidue(); -// } -// } - private static Hashtable residueMap; - // Static table containing Predefined Amino Acids, sorted by increasing mass + // Standard amino acids sorted by increasing nominal mass private static final AminoAcid[] standardAATable = { // C H N O S @@ -303,65 +184,17 @@ public int hashCode() { new AminoAcid('W', "Tryptophan", new Composition(11, 10, 2, 1, 0)), // 186.0793 }; -// public static final AminoAcid N_TERN = new AminoAcid('[', "N-terminus", new Composition(0,0,0,0,0)); -// public static final AminoAcid C_TERM = new AminoAcid(']', "C-terminus", new Composition(0,0,0,0,0)); -// public static final AminoAcid PROTEIN_N_TERN = new AminoAcid('{', "Protein N-terminus", new Composition(0,0,0,0,0)); -// public static final AminoAcid PROTEIN_C_TERM = new AminoAcid('}', "Protein C-terminus", new Composition(0,0,0,0,0)); -// public static final AminoAcid ANY = new AminoAcid('*', "C-terminus", new Composition(0,0,0,0,0)); - static { residueMap = new Hashtable(); for (AminoAcid aa : standardAATable) residueMap.put(aa.getResidue(), aa); } - /* - public static Color getColor(AminoAcid aa) - { - int index = 
aa.getIndex(); - switch(index) - { - case 0: return new Color(200,200,200); - case 1: return new Color(140,255,140); - case 2: return new Color(255,112,66); - case 3: return new Color(82,82,82); - case 4: return new Color(255,140,255); - case 5: return new Color(184,76,0); - case 6: return new Color(69,94,69); - case 7: return new Color(0,76,0); - case 8: return new Color(255,124,112); - case 9: return new Color(160,0,66); - case 10: return new Color(102,0,0); - case 11: return new Color(71,71,184); - case 12: return new Color(102,0,100); - case 13: return new Color(184,160,66); - case 14: return new Color(112,112,255); - case 15: return new Color(83,76,82); - case 16: return new Color(100,100,224); - case 17: return Color.orange; - case 18: return new Color(140,112,76); - case 19: return new Color(79,70,0); - default: return null; - } - } - */ - - /** - * Get the amino acid of the given integer mass. - * - * @param mass the integer mass - * @return the list of amino acids with the mass. 
- */ + public static ArrayList getAminoAcids(int mass) { if (mass2aa.containsKey(mass)) return mass2aa.get(mass); return new ArrayList(); } - /** - * Checks whether the character is an standard amino acid - * - * @param c the character input - * @return true if it is part of the standard amino acid set, false otherwise - */ public static boolean isStdAminoAcid(char c) { return residueMap.containsKey(c); } diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java index 162ee034..cb443c0c 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcidSet.java @@ -16,23 +16,13 @@ * @author sangtaekim */ public class AminoAcidSet implements Iterable { - /** - * - */ private static final AminoAcid[] EMPTY_AA_ARRAY = new AminoAcid[0]; private HashMap> aaListMap; - /** - * Mapping from Location Enum name to places where the location applies - */ private static HashMap locMap; - /** - * This tracks any default mods that the user has defined - * Keys are mod names and values are the mod mass that the user defined for this modification - * This list is used to warn users of non-standard mod masses for default mods - */ + // maps mod name -> user-supplied mass; used to warn on non-standard masses for built-in mods private static Hashtable defaultModUsage = new Hashtable<>(); static { @@ -44,7 +34,6 @@ public class AminoAcidSet implements Iterable { locMap.put(Location.Protein_C_Term, new Location[]{Location.Protein_C_Term}); } - // for fast indexing private HashMap residueMap; // residue -> aa (residue must be unique) private HashMap aa2index; // aa -> index private HashMap> standardResidueAAArrayMap; // std residue -> array of amino acids @@ -63,8 +52,6 @@ public class AminoAcidSet implements Iterable { private HashSet modResidueSet = new HashSet<>(); // set of symbols used for residues private char nextResidue; - // for enzyme -// private ArrayList 
enzymeAAList; private int neighboringAACleavageCredit = 0; private int neighboringAACleavagePenalty = 0; private int peptideCleavageCredit = 0; @@ -73,9 +60,6 @@ public class AminoAcidSet implements Iterable { AminoAcid lightestAA, heaviestAA; - /** - * This tracks user-friendly descriptions of the modifications in use - */ private ArrayList modificationsInUse = new ArrayList<>(); private AminoAcidSet() // prevents instantiation @@ -88,11 +72,6 @@ private AminoAcidSet() // prevents instantiation nextResidue = 128; } - /** - * Returns the list of amino acids specific to the position. - * - * @return list of intermediate amino acids. - */ public ArrayList getAAList(Location location) { return aaListMap.get(location); } @@ -117,39 +96,18 @@ public ArrayList getModificationsInUse() { return modificationsInUse; } - /** - * Returns the iterator of anywhere amino acids - */ public Iterator iterator() { return aaListMap.get(Location.Anywhere).iterator(); } - /** - * Returns the size of amino acid depending on the location. - * - * @param location amino acid location - * @return - */ public int size(Location location) { return aaListMap.get(location).size(); } - /** - * Returns the size of anywhere amino acids - * - * @return the size of anywhere amino acids - */ public int size() { return aaListMap.get(Location.Anywhere).size(); } - /** - * Retrieve an array of amino acids given the specific standard residue. - * - * @param location amino acid location - * @param standardAAResidue the standard residue to look up - * @return the array of amino acids or an empty array otherwise - */ public AminoAcid[] getAminoAcids(Location location, char standardAAResidue) { AminoAcid[] matches = standardResidueAAArrayMap.get(location).get(standardAAResidue); if (matches != null) @@ -158,44 +116,20 @@ public AminoAcid[] getAminoAcids(Location location, char standardAAResidue) { return EMPTY_AA_ARRAY; } - /** - * Retrieve an array of amino acids given the specific nominal mass. 
- * - * @param location amino acid location - * @param nominalMass nominal mass to look up - * @return the array of amino acids or an empty list otherwise - */ public AminoAcid[] getAminoAcids(Location location, int nominalMass) { AminoAcid[] matches = nominalMass2aa.get(location).get(nominalMass); if (matches != null) return matches; return EMPTY_AA_ARRAY; } - /** - * Retrieve an array of amino acids given the specific nominal mass. - * - * @param nominalMass the mass to look up - * @return the array of amino acids or an empty list otherwise - */ public AminoAcid[] getAminoAcids(int nominalMass) { return getAminoAcids(Location.Anywhere, nominalMass); } - /** - * Checks whether a residue belongs to this amino acid set - * - * @param residue a residue - * @return true if residue belongs to the amino acid set - */ public boolean contains(char residue) { return residueMap.containsKey(residue); } - /** - * Returns a list of all residues without mods - * - * @return - */ public ArrayList getResidueListWithoutMods() { ArrayList residues = new ArrayList<>(); for (Map.Entry aa : residueMap.entrySet()) { @@ -207,22 +141,10 @@ public ArrayList getResidueListWithoutMods() { return residues; } - /** - * Returns a list of all residues, including modified residues - * - * @return - */ public ArrayList getResidueList() { return new ArrayList<>(residueMap.keySet()); } - /** - * Get the amino acid mass of the residue. - * - * @param residue the amino acid mass. Use uppercase for standard aa (convention). - * this method is case sensitive. - * @return the amino acid object. null if no aa corresponding to the residue - */ public AminoAcid getAminoAcid(Location location, char residue) { AminoAcid[] aaArr = getAminoAcids(location, residue); for (AminoAcid aa : aaArr) @@ -231,60 +153,26 @@ public AminoAcid getAminoAcid(Location location, char residue) { return null; } - /** - * Get the amino acid mass of the residue. - * - * @param residue the amino acid mass. 
Use uppercase for standard aa (convention). - * this method is case sensitive. - * @return the amino acid object. null if no aa corresponding to the residue - */ public AminoAcid getAminoAcid(char residue) { return residueMap.get(residue); } - /** - * Set the number of allowable variable modifications per peptide - * - * @param maxNumberOfVariableModificationsPerPeptide the number of allowable variable modifications per peptide - */ public void setMaxNumberOfVariableModificationsPerPeptide(int maxNumberOfVariableModificationsPerPeptide) { this.maxNumberOfVariableModificationsPerPeptide = maxNumberOfVariableModificationsPerPeptide; } - /** - * Get the number of allowable variable modifications per peptide - * - * @return the number of allowable variable modifications per peptide - */ public int getMaxNumberOfVariableModificationsPerPeptide() { return this.maxNumberOfVariableModificationsPerPeptide; } - /** - * Get all amino acids for all locations. - * - * @return an array of all amino acids. - */ public AminoAcid[] getAllAminoAcidArr() { return this.allAminoAcidArr; } - /** - * Get the amino acid corresponding to the index - * - * @param index amino acid index - * @return amino acid object - */ public AminoAcid getAminoAcid(int index) { return allAminoAcidArr[index]; } - /** - * Get the index of the aa - * - * @param aa amino acid - * @return the index of aa. null if aa does not belong to this amino acid set - */ public int getIndex(AminoAcid aa) { Integer index = aa2index.get(aa); if (index == null) @@ -292,12 +180,6 @@ public int getIndex(AminoAcid aa) { return index; } - /** - * Get the peptide corresponding to the string sequence. - * - * @param sequence sequence of the peptide. 
- * @return peptide object of the sequence - */ public Peptide getPeptide(String sequence) { boolean isModified = false; ArrayList aaArray = new ArrayList<>(); diff --git a/src/main/java/edu/ucsd/msjava/msutil/Enzyme.java b/src/main/java/edu/ucsd/msjava/msutil/Enzyme.java index 9dd9f26e..aa5b842d 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Enzyme.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Enzyme.java @@ -15,38 +15,14 @@ import java.util.ArrayList; import java.util.HashMap; -/** - * This class represents an enzyme. - * - * @author sangtaekim - */ public class Enzyme implements ParamObject { - /** - * True if the enzyme cleaves n-terminal of the residue, otherwise false - */ private boolean isNTerm; - - /** - * Name of the enzyme. - */ private String name; - - /** - * Description - */ private String description; - - /** - * Amino acid residues cleaved by the enzyme. - */ private char[] residues; - /** - * Tracks whether a residue is cleavable - * Residue symbols as chars are converted to their ASCII value when updating this array - * For example, the cleavability of residue K is tracked at isResidueCleavable[82] - */ + // residue symbols as chars are converted to ASCII value: isResidueCleavable['K'] == isResidueCleavable[75] private boolean[] isResidueCleavable; // the probability that a peptide generated by this enzyme follows the cleavage rule @@ -59,13 +35,6 @@ public class Enzyme implements ParamObject { private String psiCvAccession; - /** - * Instantiates a new enzyme. 
- * - * @param name the name - * @param residues the residues cleaved by the enzyme (String) - * @param isNTerm N term or C term (true if it cleaves N-term) - */ private Enzyme(String name, String residues, boolean isNTerm, String description, String psiCvAccession) { this.name = name; this.description = description; @@ -102,9 +71,6 @@ public static void loadCustomEnzymeFile(File enzymeFile) { customEnzymeFilePath = enzymeFile.getAbsolutePath(); - // Uncomment to debug - // System.out.println("Loading user-defined enzyme file: " + customEnzymeFilePath); - int tokenLength = 4; ArrayList paramLines = UserParam.parseFromFile(enzymeFile.getPath(), tokenLength); for (String paramLine : paramLines) { @@ -145,106 +111,48 @@ else if (token[2].equals("N")) } } - /** - * Sets the neighboring amino acid efficiency as the probability that a neighboring amino acid follows the enzyme rule - * - * @param neighboringAACleavageEfficiency neighboring amino acid efficiency - * @return this object - */ private void setNeighboringAAEfficiency(float neighboringAACleavageEfficiency) { this.neighboringAACleavageEfficiency = neighboringAACleavageEfficiency; } - /** - * Gets the neighboring amino acid efficiency - * Deprecated, use getNeighboringAACleavageEfficiency - * - * @return neighboring amino acid efficiency - */ + /** @deprecated use getNeighboringAACleavageEfficiency */ @Deprecated() public float getNeighboringAACleavageEffiency() { return getNeighboringAACleavageEfficiency(); } - /** - * Gets the neighboring amino acid efficiency - * - * @return neighboring amino acid efficiency - */ public float getNeighboringAACleavageEfficiency() { return neighboringAACleavageEfficiency; } - /** - * Sets the peptide cleavage efficiency as the probability that a peptide generated by this enzyme follows the cleavage rule - * - * @param peptideCleavageEfficiency peptide cleavage efficiency - * @return this object - */ private void setPeptideCleavageEfficiency(float peptideCleavageEfficiency) { 
this.peptideCleavageEfficiency = peptideCleavageEfficiency; } - /** - * Gets the peptide efficiency. - * - * @return peptide efficiency - */ public float getPeptideCleavageEfficiency() { return peptideCleavageEfficiency; } - /** - * Returns the name of the enzyme. - * - * @return the name of the enzyme. - */ public String getName() { return name; } - /** - * Returns the description of the enzyme. - * - * @return the description of the enzyme. - */ public String getDescription() { return description; } - /** - * Returns the description of the enzyme when it is showed in the usage info. - * - * @return the description of the enzyme when it is showed in the usage info. - */ public String getParamDescription() { return description; } - /** - * Checks if this enzyme cleaves N term. - * - * @return true, if it cleaves N term. - */ public boolean isNTerm() { return isNTerm; } - /** - * Checks if this enzyme cleaves C term. - * - * @return true, if it cleaves C term. - */ public boolean isCTerm() { return !isNTerm; } - /** - * Checks if the amino acid is cleavable. - * - * @param aa the amino acid - * @return true, if aa is cleavable - */ public boolean isCleavable(AminoAcid aa) { if (this.residues == null) return true; @@ -254,25 +162,13 @@ public boolean isCleavable(AminoAcid aa) { return false; } - /** - * Checks if the amino acid is cleavable. - * - * @param residue amino acid residue - * @return true, if residue is cleavable - */ public boolean isCleavable(char residue) { if (isResidueCleavable == null) return true; return isResidueCleavable[residue]; } - /** - * Checks if the peptide is cleaved by the enzyme. - * Does not check for exception residues (meaning K.P or K.P is considered cleavable for trypsin) - * - * @param p peptide - * @return true if p is cleaved, false otherwise. - */ + /** Does not check for exception residues (K.P is considered cleavable for trypsin). 
*/ public boolean isCleaved(Peptide p) { AminoAcid aa; if (isNTerm) @@ -282,21 +178,11 @@ public boolean isCleaved(Peptide p) { return isCleavable(aa.getResidue()); } - /** - * Returns PSI CV accession. - * - * @return HUPO PSI CV accession of this enzyme. null if unknown. - */ + /** Returns HUPO PSI CV accession of this enzyme, or null if unknown. */ public String getPSICvAccession() { return this.psiCvAccession; } - /** - * Returns the number of cleavable terminii - * - * @param annotation annotation (e.g. K.DLFGEK.I) - * @return the number of cleavable terminii - */ public int getNumCleavedTermini(String annotation, AminoAcidSet aaSet) { int nCT = 0; String pepStr = annotation.substring(annotation.indexOf('.') + 1, annotation.lastIndexOf('.')); @@ -326,99 +212,36 @@ public int hashCode() { return name.hashCode(); } - /** - * Gets the residues. - * - * @return the residues - */ public char[] getResidues() { return residues; } - /** - * Unspecific cleavage enzyme (can cleavage after any residue) - */ public static final Enzyme UnspecificCleavage; - - /** - * TRYPSIN enzyme (cleave after K or R) - */ public static final Enzyme TRYPSIN; - - /** - * CHYMOTRYPSIN enzyme (cleave after FYWL) - */ public static final Enzyme CHYMOTRYPSIN; - - /** - * LysC enzyme (cleave after K) - */ public static final Enzyme LysC; - - /** - * LysN enzyme (cleave before K) - */ public static final Enzyme LysN; - - /** - * GluC enzyme (cleave after E) - */ public static final Enzyme GluC; - - /** - * ArgC enzyme (cleave after R) - */ public static final Enzyme ArgC; - - /** - * AspN enzyme (cleave before D) - */ public static final Enzyme AspN; - - /** - * ALP enzyme - */ public static final Enzyme ALP; - - /** - * Endogenous peptides (do not cleave after any residue, i.e. no internal cleavage) - */ + /** No internal cleavage — for endogenous peptides. 
*/ public static final Enzyme NoCleavage; - - /** - * Trypsin plus C (cleave after K, R, or C) - */ public static final Enzyme TrypsinPlusC; - /** - * Custom enzyme file path - * @return - */ public static String getCustomEnzymeFilePath() { return customEnzymeFilePath; } - /** - * Messages associated with enzymes loaded from the custom enzyme file - * @return - */ public static ArrayList getCustomEnzymeMessages() { return customEnzymeMessages; } - /** - * Get an Enzyme by enzyme name - */ public static Enzyme getEnzymeByName(String name) { return enzymeTable.get(name); } - /** - * Get all registered enzymes - */ public static Enzyme[] getAllRegisteredEnzymes() { return registeredEnzymeList.toArray(new Enzyme[0]); } - /** - * Obsolete method; does nothing - */ + /** @deprecated Does nothing. */ @Deprecated public static Enzyme register(String name, String residues, boolean isNTerm, String description) { return null; @@ -474,21 +297,12 @@ private static void register(String name, Enzyme enzyme, boolean notifyNewEnzyme static { UnspecificCleavage = new Enzyme("UnspecificCleavage", null, false, "unspecific cleavage", "MS:1001956"); TRYPSIN = new Enzyme("Tryp", "KR", false, "Trypsin", "MS:1001251"); -// TRYPSIN.setNeighboringAAEfficiency(0.9148273f); -// TRYPSIN.setPeptideCleavageEfficiency(0.98173124f); - -// TRYPSIN.setNeighboringAAEfficiency(0.9523f); -// TRYPSIN.setPeptideCleavageEfficiency(0.9742f); - - // Modified by Sangtae to boost the performance TRYPSIN.setNeighboringAAEfficiency(0.99999f); TRYPSIN.setPeptideCleavageEfficiency(0.99999f); CHYMOTRYPSIN = new Enzyme("Chymotrypsin", "FYWL", false, "Chymotrypsin", "MS:1001306"); LysC = new Enzyme("LysC", "K", false, "Lys-C", "MS:1001309"); -// LysC.setNeighboringAAEfficiency(0.79f); -// LysC.setPeptideCleavageEfficiency(0.89f); LysC.setNeighboringAAEfficiency(0.999f); LysC.setPeptideCleavageEfficiency(0.999f); @@ -534,9 +348,6 @@ private static void register(String name, Enzyme enzyme, boolean notifyNewEnzyme // 
look for file enzymes.txt in the params directory below the working directory File enzymeFile = Paths.get("params", "enzymes.txt").toFile(); - // Uncomment to debug - // System.out.println("Looking for user-defined enzyme file at " + enzymeFile.getAbsolutePath()); - if (enzymeFile.exists()) { loadCustomEnzymeFile(enzymeFile); } diff --git a/src/main/java/edu/ucsd/msjava/msutil/InstrumentType.java b/src/main/java/edu/ucsd/msjava/msutil/InstrumentType.java index 513ced47..6cfd365e 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/InstrumentType.java +++ b/src/main/java/edu/ucsd/msjava/msutil/InstrumentType.java @@ -64,7 +64,6 @@ public static InstrumentType get(String name) { public static final InstrumentType TOF; public static final InstrumentType HIGH_RESOLUTION_LTQ; public static final InstrumentType QEXACTIVE; -// public static final InstrumentType TRIPLETOF; public static InstrumentType[] getAllRegisteredInstrumentTypes() { return table.values().toArray(new InstrumentType[0]); @@ -75,13 +74,11 @@ public static InstrumentType[] getAllRegisteredInstrumentTypes() { HIGH_RESOLUTION_LTQ = new InstrumentType("HighRes", "Orbitrap/FTICR/Lumos", true); TOF = new InstrumentType("TOF", "TOF", true); QEXACTIVE = new InstrumentType("QExactive", "Q-Exactive", true); -// TRIPLETOF = new InstrumentType("TripleTOF", "TripleTOF 5600", true); table.put(LOW_RESOLUTION_LTQ.getName(), LOW_RESOLUTION_LTQ); table.put(HIGH_RESOLUTION_LTQ.getName(), HIGH_RESOLUTION_LTQ); table.put(TOF.getName(), TOF); table.put(QEXACTIVE.getName(), QEXACTIVE); -// table.put(TRIPLETOF.getName(), TRIPLETOF); } } diff --git a/src/main/java/edu/ucsd/msjava/msutil/Matter.java b/src/main/java/edu/ucsd/msjava/msutil/Matter.java index dce04196..57672f47 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Matter.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Matter.java @@ -1,54 +1,23 @@ package edu.ucsd.msjava.msutil; -/** - * This is root class for anything that has a mass. 
- * - * @author jung - */ +/** Root class for anything that has a mass. */ public abstract class Matter implements Comparable { - /** - * Get the mass of this object. - * - * @return the mass in Daltons of this object. - */ public abstract float getMass(); - /** - * Get the accurate (double-precision) mass of this object - * - * @return - */ public double getAccurateMass() { return getMass(); } - /** - * Get the nominal (integer) mass of this object. - * - * @return the nominal mass in Daltons of this object - */ public abstract int getNominalMass(); - /** - * Defines the ordering of amino acids. Order by mass. - * - * @param other massive to compared it to. - * @return 1 if this is greater than the other, -1 if the other is greater - * than this and 0 if they are equal. - */ public int compareTo(Matter other) { if (this.getMass() > other.getMass()) return 1; if (other.getMass() > this.getMass()) return -1; return 0; } - /** - * Standard string representation of this object. - * - * @return mass in string with 2 significant figures. - */ public String toString() { return String.format("[%.2f]", getMass()); } diff --git a/src/main/java/edu/ucsd/msjava/msutil/Modification.java b/src/main/java/edu/ucsd/msjava/msutil/Modification.java index a0dbadc1..00b4949f 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Modification.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Modification.java @@ -5,15 +5,8 @@ import java.util.Comparator; import java.util.HashMap; -/** - * A class representing a modification. - * - * @author sangtaekim - */ public class Modification { - /** - * Threshold to use when determining if two modifications have the same mass - */ + /** Tolerance for treating two modification masses as equivalent (Da). 
*/ public static final double MOD_MASS_COMPARISON_THRESHOLD = 0.01; private final String name; @@ -40,47 +33,23 @@ private Modification(String name, double mass) { this.nominalMass = NominalMass.toNominalMass((float) mass); } - /** - * Modification name - * - * @return - */ public String getName() { return name; } - /** - * Modification mass (as a float) - * - * @return - */ public float getMass() { return (float) mass; } - /** - * Modification mass (as a double) - * - * @return - */ public double getAccurateMass() { return mass; } - /** - * Modification mass (as an integer) - * - * @return - */ public int getNominalMass() { return nominalMass; } - /** - * Modification identifier (used in mzid output) - * - * @return - */ + /** Unique short identifier used in mzid output (e.g. "+57", "-18#1"). */ public String getModId() { return modId; } @@ -96,11 +65,6 @@ public Composition getComposition() { return composition; } - /** - * List of default modifications - * - * @return - */ public static Modification[] getDefaultModList() { return defaultModList; } @@ -157,13 +121,6 @@ public static boolean isModConflict(String name, Composition composition, double return isModConflict(name, composition.getAccurateMass(), massTolerance); } - /** - * Register a modification by name and mass - * - * @param modName Modification name (though getAminoAcidSetFromXMLFile uses 'residueStr + " " + modMass') - * @param mass Monoisotopic mass - * @return - */ public static Modification register(String modName, double mass) { Modification mod = new Modification(modName, mass); setModIdentifier(mod); @@ -171,13 +128,6 @@ public static Modification register(String modName, double mass) { return mod; } - /** - * Register a modification by name and composition - * - * @param name Modification name - * @param composition Modification empirical formula - * @return - */ public static Modification register(String name, Composition composition) { Modification mod = new Modification(name, 
composition); setModIdentifier(mod); @@ -197,11 +147,6 @@ public static void setModIdentifiers() { } } - /** - * Generate a unique identifier for the modification to be used in mzid output (in peptide and peptideEvidence IDs) - * - * @param mod - */ private static void setModIdentifier(Modification mod) { double mass = mod.getAccurateMass(); String baseId = ""; @@ -256,7 +201,6 @@ public static Modification getModByName(String name) { public static final Modification Acetyl = new Modification("Acetyl", new Composition(2, 2, 0, 1, 0)); public static final Modification PyroCarbamidomethyl = new Modification("Pyro-carbamidomethyl", Composition.getMass("H-3N-1")); - // static member private static final Modification[] defaultModList = { Carbamidomethyl, @@ -272,10 +216,6 @@ public static Modification getModByName(String name) { PyroCarbamidomethyl }; - /** - * Keys are modification names - * Values are modification details - */ private static final HashMap modTable; static { @@ -293,11 +233,6 @@ public enum Location { Protein_C_Term, } - /** - * A class representing the modification instance. - * - * @author sangtaekim - */ public static class Instance { private final Modification mod; private final char residue; // if null, no amino acid specificity diff --git a/src/main/java/edu/ucsd/msjava/msutil/Pair.java b/src/main/java/edu/ucsd/msjava/msutil/Pair.java index e0288167..d34953bc 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Pair.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Pair.java @@ -2,40 +2,18 @@ import java.util.Comparator; -/** - * This class represents a pair of two objects. - * - * @param the first object - * @param the second object - * @author sangtaekim - */ +/** Generic ordered pair. */ public class Pair { - /** - * The first. - */ private A first; - - /** - * The second. - */ private B second; - /** - * Instantiates a new pair. 
- * - * @param first the first - * @param second the second - */ public Pair(A first, B second) { super(); this.first = first; this.second = second; } - /* (non-Javadoc) - * @see java.lang.Object#hashCode() - */ public int hashCode() { int hashFirst = first != null ? first.hashCode() : 0; int hashSecond = second != null ? second.hashCode() : 0; @@ -43,9 +21,6 @@ public int hashCode() { return (hashFirst + hashSecond) * hashSecond + hashFirst; } - /* (non-Javadoc) - * @see java.lang.Object#equals(java.lang.Object) - */ public boolean equals(Object other) { if (other instanceof Pair) { Pair otherPair = (Pair) other; @@ -61,45 +36,22 @@ public boolean equals(Object other) { return false; } - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ public String toString() { return "(" + first + ", " + second + ")"; } - /** - * Gets the first. - * - * @return the first - */ public A getFirst() { return first; } - /** - * Sets the first. - * - * @param first the new first - */ public void setFirst(A first) { this.first = first; } - /** - * Gets the second. - * - * @return the second - */ public B getSecond() { return second; } - /** - * Sets the second. - * - * @param second the new second - */ public void setSecond(B second) { this.second = second; } @@ -115,13 +67,6 @@ public PairComparator(boolean useSecondForComprison) { this.useSecondForComprison = useSecondForComprison; } - /** - * Determines the order of Pair objects. If useSecondForComparison is set, use B for comparison, otherwise A is used. - * - * @param o1 the first element. - * @param o2 the second element. - * @return 1 if p1 > p2, -1 if p2 > p1 and 0 otherwise. - */ public int compare(Pair p1, Pair p2) { if (!useSecondForComprison) return p1.getFirst().compareTo(p2.getFirst()); @@ -141,13 +86,6 @@ public PairReverseComparator(boolean useSecondForComprison) { this.useSecondForComprison = useSecondForComprison; } - /** - * Determines the order of Pair objects. 
If useSecondForComparison is set, use B for comparison, otherwise A is used. - * - * @param o1 the first element. - * @param o2 the second element. - * @return 1 if p1 > p2, -1 if p2 > p1 and 0 otherwise. - */ public int compare(Pair p1, Pair p2) { if (!useSecondForComprison) return p2.getFirst().compareTo(p1.getFirst()); diff --git a/src/main/java/edu/ucsd/msjava/msutil/Peak.java b/src/main/java/edu/ucsd/msjava/msutil/Peak.java index 44a2350b..3dbfef52 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Peak.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Peak.java @@ -9,56 +9,28 @@ */ public class Peak implements Comparable { - // required fields private int charge = 1; private float mz; private float intensity; - // optional fields private int index = -1; private int rank = 151; - - /** - * Constructor. - * - * @param mz the m/z. - * @param intensity the absolute intensity of this peak. - * @param charge the charge of this peak - */ public Peak(float mz, float intensity, int charge) { this.mz = mz; this.intensity = intensity; this.charge = charge; -// if (charge!=0) this.charge = charge; // commented out by Sangtae } - - /** - * Gets the index of this peak. - * - * @return the index of this peak of -1 if not initialized. - */ public int getIndex() { return index; } - - /** - * Gets the mz of this peak (as read from the file). - * - * @return the mass in Daltons. - */ public float getMz() { return mz; } - - /** - * Return the de-charged mass. - * - * @return (m/z - H) * charge. - */ + /** Returns (m/z - H) * charge: the de-charged monoisotopic mass. */ public float getMass() { Float monoMass = (mz - (float)Composition.ChargeCarrierMass()) * (float)charge; if (monoMass > 0) @@ -68,31 +40,14 @@ public float getMass() { } - /** - * Gets the intensity of this peak. - * - * @return the intensity of this peak. - */ public float getIntensity() { return intensity; } - - /** - * Gets the charge of the peak. - * - * @return the charge of the peak. 
- */ public int getCharge() { return this.charge; } - /** - * Gets a new peak with different mass - * - * @param mz m/z - * @return a new peak with mass - */ public Peak getShiftedPeak(float mz) { Peak newPeak = new Peak(mz, this.intensity, this.charge); newPeak.rank = this.rank; @@ -100,20 +55,10 @@ public Peak getShiftedPeak(float mz) { return newPeak; } - /** - * Sets the rank of this peak. - * - * @param rank the rank of this peak. - */ public void setRank(int rank) { this.rank = rank; } - /** - * Gets the rank of this peak. - * - * @return the rank of this peaks or -1 if not initialized. - */ public int getRank() { return rank; } @@ -130,71 +75,26 @@ public float getComplementMass(float parentMass) { } - /** - * Sets the intensity of this peak. - * - * @param intensity - */ public void setIntensity(float intensity) { this.intensity = intensity; } - - /** - * Sets the index of this peak. - * - * @param index the index to set for this peak. - */ public void setIndex(int index) { this.index = index; } - - /** - * Sets the mass of this peak to the given float. - * - * @param mz the mass to set this peak to. - */ public void setMz(float mz) { this.mz = mz; } - - /** - * Sets the charge of this peak. - * - * @param charge the integer charge - */ public void setCharge(int charge) { this.charge = charge; } - - /** - * Given ppm tolerance convert it to unit mass tolerance. - * - * @param ppmTolerance the tolerance in ppm value - * @return - */ public float toUnitTolerance(float ppmTolerance) { return getMass() * ppmTolerance / Constants.MILLION; } - - /** - * - * @param shiftMass - * @return - */ - /* - public Peak getShiftedPeak(float shiftMass) { - Peak p = this.clone(); - p.mass += shiftMass; - return p; - } - */ - - /** * Compares this peak to another peak by mass. If the masses are equal, * compare by intensity. @@ -215,12 +115,6 @@ public int hashCode() { return (int) (mz + intensity + charge); } - /** - * Checks the equality of this peak with another object. 
- * - * @param obj the other object. - * @return true if the intensities and masses are equal, false, otherwise. - */ @Override public boolean equals(Object obj) { if (obj instanceof Peak) @@ -228,46 +122,21 @@ public boolean equals(Object obj) { return false; } - /** - * Checks the equality of this peak with another peak. - * - * @param p the other peak. - * @return true if the intensities and masses are equal, false, otherwise. - */ public boolean equals(Peak p) { // this might not be a good idea for floats return mz == p.mz && intensity == p.intensity && charge == p.charge; } - /** - * Calculates the absolute mass difference between 2 peaks. The m/z values - * are used for this method. - * - * @param p1 the peak to subtract the mass from. - * @param p2 the peak to subtract the mass by. - * @return the mass difference in Daltons. - */ public static float getAbsoluteMassDiff(Peak p1, Peak p2) { return Math.abs(p1.mz - p2.mz); } - - /** - * String representation of this peak. This is simply the mass followed by - * its intensity. - * - * @return mass, space, intensity string representation of this peak. - */ + @Override public String toString() { return mz + " " + intensity; } - /** - * Make a deep copy of this peak. - * - * @return a deep copy of this peak - */ public Peak clone() { Peak p = new Peak(mz, intensity, charge); p.index = index; @@ -276,22 +145,8 @@ public Peak clone() { } - /** - * Comparator to sort peaks by intensity. If the intensities are equal, sort - * by mass - * - * @author Sangtae Kim - */ public static class IntensityComparator implements Comparator { - - /** - * Dictates ordering of peaks by intensity - * - * @param p1 first peak to compare. - * @param p2 second peak to compare. - * @return 1 if p1 > p2, -1 if p2 > p1 and 0 if they are equal. 
- */ public int compare(Peak p1, Peak p2) { if (p1.intensity > p2.intensity) return 1; if (p2.intensity > p1.intensity) return -1; @@ -302,61 +157,24 @@ public int compare(Peak p1, Peak p2) { return 0; } - - /** - * Dictates equality of two peaks. - * - * @param p1 first peak to compare. - * @param p2 second peak to compare. - * @return true, if both peaks have the same mass and intensity; false - * otherwise. - */ public boolean equals(Peak p1, Peak p2) { - // this might not be a good idea because of float errors + // float exact equality intentional: these are cached values, not computed return p1.mz == p2.mz && p1.intensity == p2.intensity; } } - /** - * Dictates the order of peaks by mass. - * - * @author Sangtae Kim - */ public static class MassComparator implements Comparator { - /** - * Comparison function by mass. - * - * @param p1 first peak. - * @param p2 second peak. - * @return 1 if p1 > p2, -1 if p2 > p1 and 0 if they are equal. - */ public int compare(Peak p1, Peak p2) { return p1.compareTo(p2); } - /** - * Equality method. - * - * @param p1 first peak. - * @param p2 second peak. - * @return true if their masses and intensities are equal, false otherwise. - */ public boolean equals(Peak p1, Peak p2) { return p1.equals(p2); } } - - /** - * Creates a new peak with the same parameters as the current peak, but with - * a mass offset given. - * - * @param offset the offset to add - * @return a peak object such that the getMass methods this return this.getMass()+offset - * as mass. 
- */ public Peak duplicate(float offset) { float mzOffset = offset / this.charge; return new Peak(mz + mzOffset, this.intensity, this.charge); diff --git a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java index 6b493c90..a81a8135 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Peptide.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Peptide.java @@ -9,9 +9,6 @@ import java.util.HashSet; import java.util.List; -/** - * @author Sangtae Kim - */ public class Peptide extends Sequence implements Comparable { //this is recommended for Serializable objects @@ -27,16 +24,7 @@ public class Peptide extends Sequence implements Comparable // true if this peptide contains invalid amino acid private boolean isInvalid = false; - // true if there's n-term modification -// private boolean hasNTermMod = false; -// private float nTermModMass = 0f; - - /** - * Constructor. Parses sequence string and check for modifications. Not fully implemented!! - * Examples: QWSYL -17QSVL QSV+2.12QLK-3 - * - * @params sequence the sequence in string representation. - */ + /** Parses a sequence string, supporting N-term mods (e.g. +42ACDEFGR) and inline mods (e.g. QSV+2.12QLK). Not fully implemented for all edge cases. */ public Peptide(String sequence, AminoAcidSet aaSet) { isModified = false; int seqLen = sequence.length(); @@ -129,21 +117,10 @@ public Peptide(String sequence, AminoAcidSet aaSet) { } } - /** - * Constructor. Parses sequence string and check for modifications. Not fully implemented!! - * Examples: QWSYL Q-17SVL QSV+2.12QLK-3 - * - * @params sequence the sequence in string representation. - */ public Peptide(String sequence) { this(sequence, AminoAcidSet.getStandardAminoAcidSetWithFixedCarbamidomethylatedCys()); } - /** - * Constructor from an ArrayList of AminoAcids. - * - * @params aaArray the array of amino acids. 
- */ public Peptide(ArrayList aaArray) { for (AminoAcid aa : aaArray) { assert (aa != null) : "Null amino acid"; @@ -151,13 +128,6 @@ public Peptide(ArrayList aaArray) { } } - - /** - * Constructor from an ArrayList of AminoAcids. - * - * @params aaArray the array of amino acids. - * added by Kyowon - */ public Peptide(List aaArray) { for (AminoAcid aa : aaArray) { assert (aa != null) : "Null amino acid"; @@ -165,52 +135,26 @@ public Peptide(List aaArray) { } } - /** - * Constructor from an array of AminoAcids. - * - * @param aaArray the array of aminoacids. - */ public Peptide(AminoAcid[] aaArray) { for (AminoAcid aa : aaArray) this.add(aa); } - /** - * Returns a subpeptide of the specified range (half open intervals). - * - * @param fromIndex the index of the starting subsequence (inclusive). - * @param toIndex the end index of the subsequence (exclusive) - * @return a subsequence of specified range - */ public Peptide subPeptide(int fromIndex, int toIndex) { return (Peptide) super.subSequence(fromIndex, toIndex); } - /** - * Set isModified as true and returns this object - */ public Peptide setModified() { isModified = true; return this; } - /** - * Set isModified and returns this object - * - * @param isModified the flag indicating whether this peptide is modified - * @return - */ public Peptide setModified(boolean isModified) { this.isModified = isModified; return this; } - /** - * Construct an boolean array representing the spectrum. All positions - * with a theoretical peak will be true. - * - * @return - */ + /** Returns boolean array indexed by nominal mass; true at each prefix-mass position. */ public boolean[] getBooleanPeptide() { boolean[] boolPeptide = new boolean[this.getNominalMass() + 1]; int mass = 0; @@ -231,70 +175,21 @@ public boolean isGappedPeptideTrue(ArrayList gp) { return isTrue; } -// /** -// * Returns whether this has N-term mod. 
-// * @return true if this peptide has N-term modification, false otherwise.; -// */ -// public boolean hasNTermMod() -// { -// return this.hasNTermMod; -// } - -// /** -// * Returns n-terminal modification mass; -// * @return n-terminal modification mass; -// */ -// public float getNTermModMass() -// { -// return this.nTermModMass; -// } - -// /** -// * Returns n-terminal nominal modification mass; -// * @return n-terminal nominal modification mass; -// */ -// public int getNominalNTermModMass() -// { -// return NominalMass.toNominalMass(this.nTermModMass); -// } - - /** - * Returns whether this peptide contains invalid amino acids. - * - * @return true if this peptide is invalid. - */ public boolean isInvalid() { return this.isInvalid; } - /** - * Checks whether the last amino acid is modified. - * - * @return false if not modifies, true otherwise. - */ public boolean isCTermModified() { return get(this.size() - 1).isModified(); } - /** - * Checks whether the C terminus is triptic. - * - * @return true if the last amino acid is triptic and not modified, false - * otherwise. - */ public boolean hasTrypticCTerm() { AminoAcid cTerm = this.get(this.size() - 1); return !isCTermModified() && (cTerm == AminoAcid.getStandardAminoAcid('K') || cTerm == AminoAcid.getStandardAminoAcid('R')); } - /** - * Checks whether this peptide has a cleavage site. - * - * @param enzyme the enzyme used - * @return true if this has a cleavage site and false otherwise. - */ public boolean hasCleavageSite(Enzyme enzyme) { AminoAcid target; if (enzyme.isCTerm()) @@ -304,12 +199,6 @@ public boolean hasCleavageSite(Enzyme enzyme) { return enzyme.isCleavable(target); } - /** - * Gets the amino acid at the given index. - * - * @param i the index of the amino acid in the array to retrieve. - * @return the amino acid at the given index. 
- */ public AminoAcid get(int i) { if (i <= -1) // N-terminal return null; @@ -319,16 +208,6 @@ else if (i >= this.size()) // C-terminal } - /** - * Defines the order of different Sequence objects. The order is determined - * by the first mass which is greater than the corresponding mass - * in other (same index) in the other sequence. If there is a tie, the - * longer Sequence is greater. This is like lexographical ordering. - * - * @param other the other Sequence to compare. - * @return 1 if this Sequence is greater than the other Sequence, -1 if this - * is smaller, 0 otherwise. - */ public int compareTo(Peptide other) { // funky ordering int minSize = java.lang.Math.min(this.size(), other.size()); @@ -349,12 +228,6 @@ public int compareTo(Peptide other) { return 0; } - /** - * Matches this peptide against other peptide. If differ only by "I" and "L", returns true, returns false otherwise. - * - * @param pep peptide matched to - * @return true if equals ignoring I/L difference. false otherwise. - */ public boolean equalsIgnoreIL(Peptide pep) { if (this.size() != pep.size()) return false; @@ -367,11 +240,6 @@ public boolean equalsIgnoreIL(Peptide pep) { return true; } - /** - * Returns a string represention of this peptide. - * - * @return peptide string. - */ public String toString() { StringBuffer output = new StringBuffer(); for (AminoAcid aa : this) { @@ -380,13 +248,6 @@ public String toString() { return output.toString(); } - /** - * Converts this into a Sequence of compositions. - * - * @param isPrefix whether to convert into prefixes or suffixes - * @return sequence of compositions - */ - public Sequence toCumulativeCompositionSequence(boolean isPrefix, Composition offset) { Sequence seq = new Sequence(); Composition c = offset; @@ -402,11 +263,6 @@ public Sequence toCumulativeCompositionSequence(boolean isPrefix, C return seq; } - /** - * Convert this peptide into a sequence of compositions. Each amino acid maps to a corresponding composition. 
- * - * @return sequence of compositions. - */ public Sequence toCompositionSequence() { Sequence seq = new Sequence(); for (AminoAcid aa : this) @@ -414,11 +270,6 @@ public Sequence toCompositionSequence() { return seq; } - /** - * Convert this peptide into a sequence of compositions in the reversed order. Each amino acid maps to a corresponding composition. - * - * @return sequence of compositions. - */ public Sequence toReverseCompositionSequence() { Sequence seq = new Sequence(); for (int i = this.size() - 1; i >= 0; i--) @@ -426,12 +277,6 @@ public Sequence toReverseCompositionSequence() { return seq; } - /** - * Convert this peptide into a sequence of integer masses. - * - * @param factory - * @return sequence of integer masses. - */ public Sequence toPrefixIntMassSequence(IntMassFactory factory) { Sequence seq = new Sequence(); for (int i = 0; i < this.size(); i++) @@ -454,12 +299,6 @@ public Sequence toCumulativeIntMassSequence(boolean isPrefix, IntMassFa return seq; } - /** - * Convert this peptide into a sequence of integer masses in the reversed order. - * - * @param factory - * @return sequence of integer masses in the reversed order. - */ public Sequence toSuffixIntMassSequence(IntMassFactory factory) { Sequence seq = new Sequence(); for (int i = this.size() - 1; i >= 0; i--) @@ -467,21 +306,11 @@ public Sequence toSuffixIntMassSequence(IntMassFactory factory) { return seq; } - /** - * Sums up the masses of the amino acids plus the mass of a water molecule. - * - * @return the mass in Daltons of the monoisotopic masses plus water. - */ + /** Sum of residue masses plus H2O (neutral monoisotopic peptide mass). 
*/ public float getParentMass() { return getMass() + (float) Composition.H2O; } - /** - * Computes the number of symmetric b/y pairs - * - * @param tolerance tolerance - * @return the number of symmetric b/y pairs - */ public int getNumSymmetricPeaks(Tolerance tolerance) { ArrayList bIons = toCumulativeCompositionSequence(true, new Composition(0, 1, 0, 0, 0)); ArrayList yIons = toCumulativeCompositionSequence(false, new Composition(0, 3, 0, 1, 0)); @@ -490,11 +319,7 @@ public int getNumSymmetricPeaks(Tolerance tolerance) { return comparator.getMatchedList(tolerance).length; } - /** - * Computes the number of symmetric b/y pairs. Use nominal masses - * - * @return the number of symmetric b/y pairs - */ + /** Uses nominal masses. */ public int getNumSymmetricPeaks() { int numSymmPeaks = 0; HashSet bIons = new HashSet(); @@ -512,40 +337,6 @@ public int getNumSymmetricPeaks() { return numSymmPeaks; } - /* - public float getAvgMass() - { - float sum = 0.f; - if(modMass != null) - for(int i=0; i aaList) - { - int num = 0; - for(AminoAcid aa : this) - if(aaList.contains(aa)) - num++; - return num; - } - */ - public float getProbability() { float prob = 1; for (int i = 0; i < this.size(); i++) { @@ -610,14 +385,6 @@ public float getNumber() { } - /** - * Returns an slice of the current sequence with the given coordinates. - * - * @param from the starting index (inclusive). - * @param to the ending index (exclusive). - * @return a new Sequence object after the slice operation, null if the - * ranges yield no sequence. - */ public Peptide slice(int from, int to) { from = java.lang.Math.max(0, from); to = java.lang.Math.min(this.size(), to); @@ -632,12 +399,6 @@ public Peptide slice(int from, int to) { } - /** - * Factory function that creates a sequence from a String. - * - * @param seq the String representing a standard sequence. - * @return the Sequence object. 
- */ public static Peptide getSequence(String seq) { ArrayList aaList = new ArrayList(); int seqLen = seq.length(); @@ -648,12 +409,6 @@ public static Peptide getSequence(String seq) { } - /** - * This function checks that the peptide agrees with the given set of masses - * - * @param masses the masses - * @return true if the correct, false otherwise - */ public boolean isCorrect(ArrayList masses) { int cumMass = 0; int massIndex = 0; @@ -681,15 +436,6 @@ public boolean isCorrect(ArrayList masses) { } - /** - * This function checks that a given peptide sequence agrees with the mass - * list. The mass list can expand more than one amino acid. - * - * @param sequence the amino acid letters - * @param masses the masses - * @param aaSet the amino acid alphabet - * @return true if the condition is true, false otherwise - */ public static boolean isCorrect(String sequence, ArrayList masses, AminoAcidSet aaSet) { int cumMass = 0; int massIndex = 0; @@ -717,14 +463,6 @@ public static boolean isCorrect(String sequence, ArrayList masses, Amin } - /** - * This function checks that a given peptide sequence agrees with the mass - * list. The mass list can expand more than one amino acid. - * - * @param sequence the amino acid letters - * @param masses the masses - * @return true if the condition is true, false otherwise - */ public static boolean isCorrect(String sequence, ArrayList masses) { return isCorrect(sequence, masses, AminoAcidSet.getStandardAminoAcidSet()); } @@ -751,12 +489,6 @@ public boolean isModified() { } - /** - * Given a string of standard amino acids, return the mass of this string. - * - * @param peptide the string of standard amino acids, one amino acid per character. - * @return the mass of this peptide in Daltons. 
- */ public static float getMassFromString(String peptide) { float cumMass = 0; for (int i = peptide.length(), j = 0; i > 0; i--, j++) { @@ -767,117 +499,4 @@ public static float getMassFromString(String peptide) { } - /* - public ArrayList getTheoSpec(boolean isPrefix, int offset) - { - return getTheoSpec(isPrefix, offset, PeakProperty.NORMAL); - } - - public ArrayList getTheoSpec(boolean isPrefix, int offset, PeakProperty property) - { - return getTheoSpec(isPrefix, offset, property, 1); - } - - - public ArrayList getCharge2TheoSpec(boolean isPrefix, int offset, PeakProperty property) - { - return getTheoSpec(isPrefix, offset, property, 2); - } - - public ArrayList getTheoSpec(boolean isPrefix, int offset, PeakProperty property, int charge) - { - ArrayList theoSpec = new ArrayList(); - - float mass = offset; - - for(int i=0; i bPeaks = new ArrayList(); - ArrayList yPeaks = new ArrayList(); - - int bMass = 1, yMass = 19; - for(int i=0; i (this.getIntMonoMass()+18)/2) - break; - for(int j=0; j bPeaks = getTheoSpec(true, 1); - ArrayList yPeaks = getTheoSpec(false, 19); - - return new PeakListComparator(bPeaks, yPeaks).getSharedPeakCount(); - } - - public int[] getIntPRM() - { - int[] intMass = new int[this.size()+1]; - int mass = 0; - intMass[0] = mass; - for(int i=0; i prm) - break; - else if(mass == prm) - return true; - } - return false; - } - public ArrayList getModifications() - { - if(!isModified) - return null; - ArrayList modList = new ArrayList(); - for(int i=0; i extends ArrayList { static final private long serialVersionUID = 1L; - /** - * Sums up the masses of this Sequence. - * - * @return the mass in Daltons of the mono isotopic masses. - */ public float getMass() { return getMass(0, this.size()); } - /** - * Sums up the masses of this Sequence (double-precision). - * - * @return the mass in Daltons of the mono isotopic masses (double-precision). 
- */ public double getAccurateMass() { return getMass(0, this.size()); } - /** - * Sums up the masses of the specified range of masses (half open - * intervals). - * - * @param from the index of the starting mass (inclusive). - * @param to the end index of the mass (exclusive). - * @return the mass in Daltons. - */ + /** Sum of masses in [from, to), clamped to [0, size). */ public float getMass(int from, int to) { from = java.lang.Math.max(from, 0); to = java.lang.Math.min(to, this.size()); @@ -54,13 +37,6 @@ public float getMass(int from, int to) { return sum; } - /** - * Similar to getMass(), but returns double. - * - * @param from the index of the starting mass (inclusive). - * @param to the end index of the mass (exclusive). - * @return the mass in Daltons (double). - */ public double getAccurateMass(int from, int to) { from = java.lang.Math.max(from, 0); to = java.lang.Math.min(to, this.size()); @@ -70,22 +46,10 @@ public double getAccurateMass(int from, int to) { return sum; } - /** - * Returns a subsequence of the specified range (half open intervals). - * - * @param fromIndex the index of the starting subsequence (inclusive). - * @param toIndex the end index of the subsequence (exclusive) - * @return a subsequence of specified range - */ public Sequence subSequence(int fromIndex, int toIndex) { return (Sequence) super.subList(fromIndex, toIndex); } - /** - * String representation of this sequence. - * - * @return the String representing the amino acid letters in this sequence. - */ public String toString() { StringBuffer output = new StringBuffer(); for (T matter : this) { @@ -94,13 +58,6 @@ public String toString() { return output.toString(); } - /** - * Returns the union of two input sequences. 
- * - * @param seq1 the first sequence - * @param seq2 the second sequence - * @return the union of seq1 and seq2 - */ public static Sequence getIntersection(Sequence seq1, Sequence seq2) { Sequence union = new Sequence(); HashSet set = new HashSet(); @@ -112,13 +69,6 @@ public static Sequence getIntersection(Sequence seq1, S return union; } - /** - * Checks if this sequence matches to the specified peptide within the input tolerance - * - * @param peptide Peptide. - * @param tolerance Tolerance. - * @return True if matches, false otherwise. - */ public boolean isMatchedTo(Peptide peptide, Tolerance tolerance, boolean isPrefix) { ArrayList pepMassList = new ArrayList(); float mass = 0; @@ -137,13 +87,6 @@ public boolean isMatchedTo(Peptide peptide, Tolerance tolerance, boolean isPrefi return (matchSize == this.size()); } - /** - * Checks if this sequence matches to the specified peptide. Use nominal masses. - * - * @param peptide Peptide. - * @param isTolerancePPM Tolerance is interpreted as ppm is true. If false, Tolerance is interpreted as Da. - * @return True if matches, false otherwise. - */ public boolean isMatchedToNominalMasses(Peptide peptide, boolean isPrefix) { HashSet massList = new HashSet(); int mass = 0; @@ -161,11 +104,6 @@ public boolean isMatchedToNominalMasses(Peptide peptide, boolean isPrefix) { return true; } - /** - * Converts this sequence into an array of masses. - * - * @return a mass array of this object. 
null if - */ public float[] toMassArray() { float[] massArr = new float[this.size()]; int index = 0; @@ -173,85 +111,4 @@ public float[] toMassArray() { massArr[index++] = m.getMass(); return massArr; } - - /* - public static Sequence getIntMassGappedPeptide(ArrayList dictionary, float minProbability, boolean prefix) - { - GappedPeptide gp = new GappedPeptide(); - - Hashtable hist = new Hashtable(); - - for(Peptide peptide : dictionary) - { - IntMass mass = new IntMass(0); - for(int i=0; i> itr = hist.entrySet().iterator(); - while(itr.hasNext()) - { - Entry entry = itr.next(); - if(entry.getValue() > dictionary.size()*minProbability) - gp.add(entry.getKey()); - } - Collections.sort(gp); - return gp; - } - */ - /* - * Returns the gapped peptide as an array of compositions/ - * The spectral profile of dictionary is generated and prm compositions exceeding minProbability are selected. - * @param dictionary the spectral dictionary - * @param minProbability the threshold of profile probability - * @param prefix gapped peptide is a set of prefixes if true, suffixes if false - * @returns the gapped peptide of compositions - public static GappedPeptide getCompositionGappedPeptide(ArrayList dictionary, float minProbability, boolean prefix) - { - GappedPeptide gp = new GappedPeptide(); - - Hashtable hist = new Hashtable(); - - for(Peptide peptide : dictionary) - { - Composition composition = new Composition(0); - for(int i=0; i> itr = hist.entrySet().iterator(); - while(itr.hasNext()) - { - Entry entry = itr.next(); - if(entry.getValue() > dictionary.size()*minProbability) - gp.add(entry.getKey()); - } - Collections.sort(gp); - return gp; - } - */ } diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java b/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java index c03a0ee0..c87ea5ca 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java @@ -45,7 +45,6 @@ public static SpecKey getSpecKey(String 
specKeyString) { public void addSpecIndex(int scanNum) { if (specIndexList == null) { specIndexList = new ArrayList(); -// specIndexList.add(super.getFirst()); } specIndexList.add(scanNum); } @@ -142,9 +141,7 @@ public static ArrayList getSpecKeyList( ActivationMethod specActivationMethod = spec.getActivationMethod(); if (activationMethod == ActivationMethod.ASWRITTEN) { - // System.out.println( - // "Use spectrum " + spec.getID() + - // " since assumed activationMethod is " + activationMethod.toString()); + // no-op: accept all activation methods when user specified ASWRITTEN } else if (specActivationMethod != null) { // If specActivationMethod is null, we use whatever was specified // - some supported spectra input types do not allow/require activation method @@ -214,7 +211,6 @@ public static ArrayList getSpecKeyList( } if (spec.size() < minNumPeaksPerSpectrum) { -// System.out.println("Spectrum " + spec.getScanNum() + " has too few peaks (#Peaks: " + spec.size()+"): ignored."); numSpectraWithTooFewPeaks++; continue; } diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java index 223644b4..fec29adf 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java @@ -24,22 +24,10 @@ public class SpectraAccessor { SpectrumAccessorBySpecIndex specMap = null; Iterator specItr = null; - /** - * Constructor that accepts a file - * Determines the file format based on the file extension - * - * @param specFile - */ public SpectraAccessor(File specFile) { this(specFile, SpecFileFormat.getSpecFileFormat(specFile.getName())); } - /** - * Constructor that accepts a file and a file format - * - * @param specFile - * @param specFormat - */ public SpectraAccessor(File specFile, SpecFileFormat specFormat) { if (specFormat == null) { throw new IllegalArgumentException("Unsupported spectrum file format: " + specFile.getName()); @@ -123,10 
+111,6 @@ public Spectrum getSpectrumById(String specId) { return getSpecMap().getSpectrumById(specId); } - /** - * Get the current spectrum parser, or null if no parser - * @return - */ public SpectrumParser getSpectrumParser() { return spectrumParser; } diff --git a/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java b/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java index 473f622e..c244807a 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java @@ -48,96 +48,43 @@ public enum Polarity { private Boolean isCentroidedWithDensePeaks = false; private boolean isHighPrecision = false; -// private Tolerance precursorTolerance = null; -// private Integer minIsotopeError = null; -// private Integer maxIsotopeError = null; -// private Float fractionIonCurrent = null; -// private Integer numMS1Features = null; private ArrayList addlCvParams; private Float isolationWindowTargetMz = null; - /***** CONSTRUCTORS *****/ - /** - * Empty constructor. - */ public Spectrum() { } - /** - * Constructor from a precursor peak. - * - * @param precursorPeak the precursor peak. - */ public Spectrum(Peak precursorPeak) { this.precursor = precursorPeak; } - /** - * Constructor. - * - * @param precursorMz m/z value read from the file. - * @param charge charge read from the file. - * @param precursorIntensity intensity of the precursor peak. - */ public Spectrum(float precursorMz, int charge, float precursorIntensity) { this.precursor = new Peak(precursorMz, precursorIntensity, charge); } - - /***** GETTERS ****/ - /** - * Gets the unique identifier of this spectrum. - * - * @return the unique identifier of this spectrum. - */ public String getID() { return id; } - /** - * Gets the Peptide object of the annotation. - * - * @return the Peptide object representing the annotation or null if not annotated. - */ public Peptide getAnnotation() { return annotation; } - /** - * Gets the annotation as a String object. 
- * - * @return the String object of the annotation. null if there is no annotation. - */ public String getAnnotationStr() { if (annotation != null) return annotation.toString(); return null; } - /** - * Gets the list of seq fi as String objects. - * - * @return the ArrayList of String objects of the annotations. null if there is no annotation. - */ public ArrayList getSeqList() { return seqList; } - /** - * Gets the charge of this spectrum. - * - * @return the integer charge. - */ public int getCharge() { return precursor.getCharge(); } - /** - * Gets the end scan number of this spectrum. - * - * @return the end scan for this spectrum if present. -1 otherwise. - */ public int getEndScanNum() { return endScanNum; } @@ -169,91 +116,43 @@ public float getPeptideMass() { return 0; } - /** - * Gets the precursor peak member. - * - * @return the peak. - */ public Peak getPrecursorPeak() { return precursor; } - /** - * Gets the scan number of this spectrum. - * - * @return the start scan number of this spectrum if present. -1 otherwise. - */ public int getScanNum() { return getStartScanNum(); } - /** - * Gets the spectrum index of this spectrum. Spectrum index is a 1-based sequential number of this spectrum in the file. - * - * @return the spectrum index of this spectrum if present. 0 otherwise. - */ public int getSpecIndex() { return specIndex; } - /** - * Gets the start scan number of the spectrum. - * - * @return the start scan for this spectrum if present. -1 otherwise. - */ public int getStartScanNum() { return startScanNum; } - /** - * Gets the title of this spectrum. - * - * @return the title of this spectrum or null if it was not specified. - */ public String getTitle() { return title; } - /** - * Returns the retention time; see getRtIsSeconds for the units. - * - * @return the retention time if set, negative value otherwise. 
- */ public float getRt() { return this.rt; } - /** - * Returns the retention time units - true if in seconds, false if in minutes - * - * @return true if retention time is in seconds, false if in minutes - */ + /** Returns true if retention time is in seconds, false if in minutes. */ public boolean getRtIsSeconds() { return this.rtIsSeconds; } - /** - * Returns the activation method. - * - * @return the activation method or null if not specified. - */ public ActivationMethod getActivationMethod() { return this.activationMethod; } - /** - * Gets the polarity of this scan - * - */ public Polarity getScanPolarity() { return this.scanPolarity; } - /** - * Returns whether this spectrum is centroided. - * - * @return true if this spectrum is centroided and false otherwise. - */ public boolean isCentroided() { return this.isCentroided; } @@ -267,229 +166,89 @@ public boolean isCentroidedWithDensePeaks() { return this.isCentroidedWithDensePeaks; } - /** - * Returns whether this spectrum peaks are measured with high-precision. - * - * @return true if this spectrum is centroided and false otherwise. - */ public boolean isHighPrecision() { return this.isHighPrecision; } -// /** -// * Returns the precursor tolerance. -// * @return precursor tolerance -// */ -// public Tolerance getPrecursorTolerance() { return this.precursorTolerance; } -// -// /** -// * Returns the minimum isotope error. -// * @return minimum isotope error -// */ -// public Integer getMinIsotopeError() { return this.minIsotopeError; } -// -// /** -// * Returns the maximum isotope error. -// * @return maximum isotope error -// */ -// public Integer getMaxIsotopeError() { return this.maxIsotopeError; } - - /** - * Returns the ms level. - * - * @return the ms level. 
- */ public int getMSLevel() { return this.msLevel; } - /** - * Gets additional cvParams to output as cvParams under the mzIdentML SpectrumIdentificationResult - * @return cvParam necessary info - */ + /** Returns additional cvParams to output under the mzIdentML SpectrumIdentificationResult. */ public ArrayList getAddlCvParams() { return this.addlCvParams; } - /***** SETTERS *****/ - - /** - * Sets the unique String id of this spectrum. - * - * @param id unique string identifier. - */ public void setID(String id) { this.id = id; } - /** - * Sets the annotation with a Peptide object. - * - * @param annotation annotation object - */ public void setAnnotation(Peptide annotation) { this.annotation = annotation; } - /** - * Sets the annotation with a Sequence string. - * - * @param seq annotation object. - */ public void addSEQ(String seq) { if (seqList == null) seqList = new ArrayList(); this.seqList.add(seq); } - /** - * Set the precursor mass. - * - * @param precursor - */ public void setPrecursor(Peak precursor) { this.precursor = precursor; } - /** - * Sets the starting scan number of this spectrum. - * - * @param startScanNum starting scan number from the file. - */ public void setStartScanNum(int startScanNum) { this.startScanNum = startScanNum; } - /** - * Sets the ending scan number of this spectrum. - * - * @param endScanNum ending scan number from the file. - */ public void setEndScanNum(int endScanNum) { this.endScanNum = endScanNum; } - /** - * Sets the scan number of this spectrum. - * - * @param scanNum scan number from the file. - */ public void setScanNum(int scanNum) { this.startScanNum = scanNum; } - /** - * Sets the spectrum index of this spectrum. - * - * @param specIndex scan spectrum index. - */ public void setSpecIndex(int specIndex) { this.specIndex = specIndex; } - /** - * Sets title property. - * - * @param title title from the file. 
- */ public void setTitle(String title) { this.title = title; } - /** - * Sets the retention time of this Spectrum. - * - * @param rt the retention time. See setRtIsSeconds to specify the units correctly to minutes or seconds - */ + /** @param rt retention time; see {@link #setRtIsSeconds} for units. */ public void setRt(float rt) { this.rt = rt; } - /** - * Sets the units of the retention time - true for seconds, false for minutes - * - * @param isSeconds - */ + /** Sets retention time units: true = seconds, false = minutes. */ public void setRtIsSeconds(boolean isSeconds) { this.rtIsSeconds = isSeconds; } - /** - * Sets the fragmentation method of this Spectrum. - * - * @param fragMethod the fragmentation method. - */ public void setActivationMethod(ActivationMethod fragMethod) { this.activationMethod = fragMethod; } - /** - * Sets the ms level of this Spectrum. - * - * @param msLevel the ms level. - */ public void setMsLevel(int msLevel) { this.msLevel = msLevel; } - /** - * Sets the polarity of this scan (positive or negative) - * - * @param scanPolarity the scan polarity - */ public void setScanPolarity(Polarity scanPolarity) { this.scanPolarity = scanPolarity; } - /** - * Sets isCentroided. - * - * @param isCentroided whether this spectrum is centroided. - */ public void setIsCentroided(boolean isCentroided) { this.isCentroided = isCentroided; - // function is used for mzML and mzXML files, track that isCentroided was set outside of this class + // track that isCentroided was set from external reader (mzML/mzXML) this.externalSetIsCentroided = true; } - /** - * Sets isHighPrecision. - * - * @param isHighPrecision whether this spectrum (fragment peaks) is high-precision. - */ public void setIsHighPrecision(boolean isHighPrecision) { this.isHighPrecision = isHighPrecision; } -// /** -// * Sets precursorTolerance. -// * @param precursorTolerance the precursor tolerance. 
-// */ -// public void setPrecursorTolerance(Tolerance precursorTolerance) -// { -// this.precursorTolerance = precursorTolerance; -// } -// -// /** -// * Sets the isotope error range. -// * @param minIsotopeError minimum isotope error. -// * @param maxIsotopeError maximum isotope error. -// */ -// public void setIsotopeError(int minIsotopeError, int maxIsotopeError) -// { -// this.minIsotopeError = minIsotopeError; -// this.maxIsotopeError = maxIsotopeError; -// } -// -// /** -// * Sets fraction of ion current within the selection window explained by the MS1 feature -// * @param fractionIonCurrent the precursor tolerance. -// */ -// public void setFractionIonCurrent(float fractionIonCurrent) -// { -// this.fractionIonCurrent = fractionIonCurrent; -// } - public void setIsolationWindowTargetMz(Float isolationWindowTargetMz) { this.isolationWindowTargetMz = isolationWindowTargetMz; } @@ -498,26 +257,9 @@ public Float getIsolationWindowTargetMz() { return isolationWindowTargetMz; } - /** - * Sets isCentroided by a simple testing. 
- */ public void determineIsCentroided() { boolean centroidedCheckPass = true; -// if(this.size() > 100) -// { -// float[] diff = new float[100]; -// float prevMz = this.get(0).getMz(); -// for(int i=this.size()-100; i 0) { ArrayList diff = new ArrayList(); float prevMz = this.get(0).getMz(); @@ -575,17 +317,11 @@ public void addAddlCvParam(CvParamInfo cvParam) { addlCvParams.add(cvParam); } - /****** FUNCTIONS *****/ @Override public String toString() { return "Spectrum - mz: " + getPrecursorPeak().getMz() + ", peaks: " + size(); } - /** - * Gets a clone of this spectrum with no peaks - * - * @return a new spectrum without peaks - */ public Spectrum getCloneWithoutPeakList() { Spectrum newSpec = new Spectrum(); newSpec.precursor = this.precursor.clone(); @@ -610,11 +346,6 @@ public Spectrum getDeconvolutedSpectrum(float toleranceBetweenIsotopes) { if (ignore[i]) continue; Peak p = this.get(i); -// if(p.getMz() < 300) -// { -// deconvSpec.add(p); -// continue; -// } float pMz = p.getMz(); for (int ionCharge = 2; ionCharge < charge && ionCharge < 4; ionCharge++) { boolean isDeconvoluted = false; @@ -652,19 +383,11 @@ public Spectrum getDeconvolutedSpectrum(float toleranceBetweenIsotopes) { return deconvSpec; } - /** - * Append a peak to the end of this spectrum. - * - * @param peak peak object to be added. - */ public void addPeak(Peak peak) { this.add(peak); } - /** - * Correct parent mass according to annotation. Do nothing if annotation == null. - */ public void correctParentMass() { if (this.annotation == null || this.getCharge() <= 0) return; @@ -672,18 +395,10 @@ public void correctParentMass() { this.precursor.setMz((annotation.getParentMass() + precursor.getCharge() * (float) Composition.ChargeCarrierMass()) / precursor.getCharge()); } - /** - * Correct parent mass according to parentMass. 
- * - * @param parentMass - */ public void correctParentMass(float parentMass) { this.precursor.setMz((parentMass + precursor.getCharge() * (float) Composition.ChargeCarrierMass()) / precursor.getCharge()); } - /** - * Correct parent mass according to input peptide. - */ public void correctParentMass(Peptide pep) { if (this.getCharge() <= 0) return; @@ -691,21 +406,10 @@ public void correctParentMass(Peptide pep) { this.precursor.setMz((pep.getParentMass() + precursor.getCharge() * (float) Composition.ChargeCarrierMass()) / precursor.getCharge()); } - /** - * Correct charge of the precursor and set the charge of the peaks of this - * spectrum to max(1, charge - 1). - * - * @param charge charge - */ public void setCharge(int charge) { this.precursor.setCharge(charge); } - /** - * Set the charge of the precursor peak - * - * @param charge the new charge to use - */ public void setPrecursorCharge(int charge) { this.precursor.setCharge(charge); } @@ -771,10 +475,7 @@ public ArrayList getPeakListByMassRange(float minMass, float maxMass) { return matchList; } - /** - * Goes through the peaks and set their rank by intensity. - * Rank 1: highest intensity peak - */ + /** Ranks peaks by intensity descending; rank 1 = highest intensity. */ public void setRanksOfPeaks() { ArrayList intensitySorted = new ArrayList(this); Collections.sort(intensitySorted, Collections.reverseOrder(new IntensityComparator())); @@ -823,9 +524,6 @@ public void filterPrecursorPeaksAroundPM() { } - /** - * The order of the a spectrum is determined by their parent masses. 
- */ public int compareTo(Spectrum s) { if (getPrecursorMass() > s.getPrecursorMass()) return 1; @@ -834,47 +532,6 @@ else if (getPrecursorMass() < s.getPrecursorMass()) return 0; } - /* - public void outputDtaFile(String fileName) - { - PrintStream out = null; - try { - out = new PrintStream(new File(fileName)); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - out.println(getPrecursorMass()+Composition.H + " " + getCharge()); - for(Peak p : this) - { - out.println(p.getMass() + " " + p.getIntensity()); - } - out.close(); - } - - public void outputPklFile(String fileName) - { - PrintStream out = null; - try { - out = new PrintStream(new File(fileName)); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - out.println(precursor.getMass() + " " + precursor.getIntensity() + " " + getCharge()); - for(Peak p : this) - { - out.println(p.getMass() + " " + p.getIntensity()); - } - out.close(); - } - - public void outputPkl(PrintStream out) - { - out.println(precursor.getMass() + " " + precursor.getIntensity() + " " + getCharge()); - for(Peak p : this) - out.println(p.getMass() + "\t" + p.getIntensity()); - } - */ - /** * Output this spectrum to the input PrintStream as the mgf format. * It needs to be changed later. @@ -946,19 +603,8 @@ public String toDta() { return sb.toString(); } - /** - * Inner class for intensity sorting. - */ class IntensityComparator implements Comparator { - - /** - * Determines the order of peak objects, when sorted by intensity. - * - * @param o1 the first peak object. - * @param o2 the second peak object. - * @return 1 if o1 > o2, -1 if o2 > o1 and 0 otherwise. - */ public int compare(Peak o1, Peak o2) { if (o1.getIntensity() > o2.getIntensity()) return 1; if (o2.getIntensity() > o1.getIntensity()) return -1; @@ -967,58 +613,12 @@ public int compare(Peak o1, Peak o2) { return 0; } - - /** - * Determines equality according to intensities. - * - * @param o1 the first peak object. 
- * @param o2 the second peak object. - * @return true if they compare to 0, false otherwise. - */ public boolean equals(Peak o1, Peak o2) { return compare(o1, o2) == 0; } } - /* - * Returns the most intense peak that is within tolerance of the target mass. - * The current implementation takes linear time. - * @param mass target mass. - * @param toleranceDa tolerance in Daltons. - * @return a Peak object if there is match or null otherwise. - @Deprecated - public Peak getPeakByMass(float mass, float toleranceDa) - { - ArrayList matchList = getPeakListByMassDa(mass, toleranceDa); - if(matchList == null || matchList.size() == 0) - return null; - else - return Collections.max(matchList, new IntensityComparator()); - } - */ - - /* - * @param mass - * @param tolerancePPM - * @return - @Deprecated - public Peak getPeakByMass(float mass, int tolerancePPM) - { - ArrayList matchList = getPeakListByMassDa(mass, tolerancePPM); - if(matchList == null || matchList.size() == 0) - return null; - else - return Collections.max(matchList, new IntensityComparator()); - } - */ - - /** - * Take a spectrum file name, infers the spectrum format by recognizing the extension and returns the spectrum file format. - * - * @param specFileName the spectrum file name. - * @return SpecFileFormat object corresponding to specFileName - */ public static SpecFileFormat getSpectrumFileFormat(String specFileName) { SpecFileFormat specFormat = null; diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java index 12b15475..75a4d0ef 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java @@ -112,19 +112,16 @@ public StaxMzMLParser(File specFile, int minMSLevel, int maxMSLevel) throws IOEx // Public API // ----------------------------------------------------------------------- - /** Number of spectra in the file. 
*/ public int getSpectrumCount() { return indexList.size(); } - /** All spectrum indices (1-based), ordered. */ public ArrayList getSpecIndexList() { ArrayList list = new ArrayList<>(indexList.size()); for (SpectrumIndex si : indexList) list.add(si.specIndex); return list; } - /** All spectrum indices filtered by MS level range (both inclusive). */ public ArrayList getSpecIndexList(int minMSLevel, int maxMSLevel) { ArrayList list = new ArrayList<>(); for (SpectrumIndex si : indexList) { @@ -134,18 +131,15 @@ public ArrayList getSpecIndexList(int minMSLevel, int maxMSLevel) { return list; } - /** Get spectrum index metadata (without parsing peaks). */ public SpectrumIndex getSpectrumIndex(int specIndex) { return indexBySpecIdx.get(specIndex); } - /** Get spectrum ID for a given 1-based index. */ public String getID(int specIndex) { SpectrumIndex si = indexBySpecIdx.get(specIndex); return si != null ? si.id : null; } - /** Get precursor m/z for a given 1-based index (from index, no parsing). */ public Float getPrecursorMz(int specIndex) { SpectrumIndex si = indexBySpecIdx.get(specIndex); if (si == null) return null; @@ -289,7 +283,6 @@ private static boolean looksLikeBomOrPrologIssue(String msg) { || m.contains("content is not allowed"); } - /** Parse and return the full spectrum by its string ID. */ public Spectrum getSpectrumById(String specId) { SpectrumIndex si = indexById.get(specId); if (si == null) return null; @@ -305,7 +298,6 @@ public Iterator iterator(int minMSLevel, int maxMSLevel) { return new StaxSequentialIterator(minMSLevel, maxMSLevel); } - /** Get the list of SpectrumIndex entries (for iteration without full parse). 
*/ public List getIndexList() { return Collections.unmodifiableList(indexList); } diff --git a/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java b/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java index 2222b8a5..75edbc50 100644 --- a/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java +++ b/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java @@ -5,12 +5,7 @@ import java.util.*; import java.util.Map.Entry; -/** - * An implementation of the Sequence class allowing a fasta file to be used as - * the database. - * - * @author jung - */ +/** Sequence implementation backed by a FASTA file. */ public class FastaSequence implements Sequence { //this is the file in which the sequence was generated @@ -48,8 +43,7 @@ public class FastaSequence implements Sequence { private int id; - /***** HELPER METHODS *****/ - // helper method, initialize the alphabet with given colon separated string + // initialize alphabet from a colon-separated string private void initializeAlphabet(String s) { String[] tokens = s.split(":"); this.alpha2byte = new HashMap(); @@ -63,7 +57,6 @@ private void initializeAlphabet(String s) { } } - // the other helper method when the hashmap is not known before hand private void createObjectFromRawFile(String filepath) { // a rough estimate of the space required to hold everything @@ -130,7 +123,6 @@ private void createObjectFromRawFile(String filepath) { writeSequence(original, sequence, size, id); } - // helper method to read and write the processed files given the alphabet private void createObjectFromRawFile(String filepath, String alphabet) { // estimate the length of the buffer @@ -199,7 +191,6 @@ private void createObjectFromRawFile(String filepath, String alphabet) { writeSequence(original, sequence, size, id); } - // helper method that writes the metainformation into a file in text format. 
private void writeMetaInfo(HashMap annotations, String alphabet, int size, int id) { String filepath = this.baseFilepath + this.seqExtension + "anno"; try { @@ -218,7 +209,6 @@ private void writeMetaInfo(HashMap annotations, String alphabet } } - // read the metainformation file private int readMetaInfo() { String filepath = this.baseFilepath + this.seqExtension + "anno"; try { @@ -240,7 +230,6 @@ private int readMetaInfo() { return 0; } - // helper method to write the sequence in bynary format private void writeSequence(StringBuffer original, ByteBuffer sequence, int size, int id) { String filepath = this.baseFilepath + this.seqExtension; try { @@ -259,29 +248,18 @@ private void writeSequence(StringBuffer original, ByteBuffer sequence, int size, } } - // read the sequence in binary private int readSequence() { String filepath = this.baseFilepath + this.seqExtension; try { - // read the first integer which encodes for the size of the file DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(filepath))); int size = in.readInt(); int id = in.readInt(); - - // Modified by Sangtae -// FileChannel fc = new FileInputStream(filepath).getChannel(); byte[] sequenceArr = new byte[size]; in.read(sequenceArr); sequence = ByteBuffer.wrap(sequenceArr).asReadOnlyBuffer(); - -// this.original = fc.map(FileChannel.MapMode.READ_ONLY, 2*Integer.SIZE/Byte.SIZE + size, size); byte[] originalArr = new byte[size]; in.read(originalArr); original = ByteBuffer.wrap(originalArr).asReadOnlyBuffer(); - - //this.original = new char[size]; - //ByteBuffer originalChars = fc.map(FileChannel.MapMode.READ_ONLY, 2*Integer.SIZE/Byte.SIZE + size, size); - //for (int index=0; index getAlphabetAsBytes() { return this.byte2alpha.keySet(); } @@ -497,41 +452,20 @@ public String getMatchingEntry(String name) { return this.getSubsequence(start + 1, end); } - /** - * Setter method. - * - * @param baseFilepath set the baseFilepath for this object. 
The baseFilepath - * has no extension. - */ public void setBaseFilepath(String baseFilepath) { this.baseFilepath = baseFilepath; } - /** - * Getter method. - * - * @return the baseFilename with properties described in the setter method. - */ public String getBaseFilepath() { return this.baseFilepath; } - /** - * This method allows modification of this sequence - * - * @param start the index to modify - * @param c the character to put in there - */ public void set(long start, char c) { this.sequence.put((int) start, this.alpha2byte.get(c)); this.original.put((int) start, (byte) c); - //this.original[(int)start] = c; } - /** - * This method make the buffers writeable. This must be called before - * the set method is usable - */ + /** Must be called before set() — read-only ByteBuffers do not support put(). */ public void makeModifiable() { ByteBuffer sequenceCopy = ByteBuffer.allocateDirect(this.size); ByteBuffer originalCopy = ByteBuffer.allocateDirect(this.size); @@ -541,90 +475,7 @@ public void makeModifiable() { this.original = originalCopy; } - /** - * This method returns List of annotations. - * - * @return All annotations as a list of Strings - */ public List getAnnotations() { return new ArrayList(annotations.values()); } - - - /***** Methods doomed to deprecation *****/ - /** - * Returns the substring specified by the position and extension parameters of - * the concatenated original fasta sequence. If the coordinates cross a sequence - * boundary, the terminator will be represented by "_". If the coordinates - * specify a substring out of range, the out-of-range portion will be ignored. - * If position is negative, it will be rounded up to 0. - * @param position the starting position. - * @param extension how many characters to extend. - * @return the substring specified by the coordinates. 
- */ - /* - public String getMatch(long position, int extension) { - char[] ba = new char[extension]; - position = Math.max(0, position); - for(long i = position, limit = Math.min(position+extension, this.getSize()); i < limit; i++) - ba[(int)(i-position)] = this.getCharAt(i); - return new String(ba); - } - */ - - /** - * Get the letter at a given position. This is the same as calling getMatch(position, 1). - * @param position the starting position. - * @return the letter specified by the coordinate. - */ - /* - public Character getMatch(long position) { - if(position >= this.getSize() || position < 0) return null; - return this.getCharAt(position); - }*/ - - /** - * This function will check whether the sequence can be encoded into bytes. - * @param sequence the sequence to test. - * @return true if all letters are in the alphabet, false otherwise. - */ - /* - public boolean isEncodable(String sequence) { - for(int i = 0; i < sequence.length(); i++) { - if(!alpha2byte.containsKey(sequence.charAt(i))) return false; - } - return true; - } - */ - - /** - * Return the set of bytes that are valid for sequence. This is the alphabet - * set in the form of bytes (including the terminator character, but excluding - * un-encodable characters). - * @return the byte alphabet set - */ - /* - public Set getAlphabetSetAsBytes() { - return this.byte2alpha.keySet(); - } - */ - - /** - * Return the alphabet set of this sequence as a Set of characters. 
- * @return the set of characters representing the alphabet - */ - /* - public Collection getValidAlphabetSetAsChars() { - ArrayList results = new ArrayList(); - for (char c : this.byte2alpha.values()) - if (c!='_') results.add(c); - return results; - } - */ - - /** - * @author kyowon - will be erased soon - */ - //public int getMatchingEntryStartPosition(long position){ return annotations.floorKey((int)position)+1; } - //public int getMatchingEntryEndPosition(long position){ return annotations.higherKey((int)position); } } From fff7b8221fb108b24b7cf61c6db96e4122339b8b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 14:19:15 +0100 Subject: [PATCH 32/34] chore: remove commented-out code blocks repo-wide Continues the comment cleanup pass with a focused removal of disabled-but-kept Java code masquerading as comments. Net change: 32 files, +1 / -323 = -322 LOC. What was removed: - fdr/TargetDecoyAnalysis: dead 10-line if/else block at the tail of getFDRMap() that re-set fdrMap edges no longer needed after the q-value conversion. - fdr/ComputeFDR: 2-line commented-out specFileCol guard. - msdbsearch/{CandidatePeptideGrid,CandidatePeptideGridConsideringMetCleavage, CompactFastaSequence, CompactSuffixArray, DBScanner, MassErrorStat, PSMFeatureFinder} and msgf/{ProfileGF, FlexAminoAcidGraph, ScoredSpectrumSum}: scattered // foo() / /* alt impl */ blocks documenting old algorithm variants and disabled debug printlns. - msutil/{AminoAcid, Composition, Constants, IonType, VolatileAminoAcid}: ~90 LOC of commented-out static initializers, alternate composition tables, and debug-only branches. - sequences/{FastaSequence, FastaSequences, ProteinFastaSequences} and suffixarray/SuffixArray: ~37 LOC of dead disabled stubs. Genuine prose comments (TODOs, performance invariants, why-explanations, URLs to upstream issues, the `applyShift` bit-identical-path guard) are preserved. 
Verified: scoped sweep (TestDirectPinWriter, TestMSUtils, TestSA, TestMisc, TestRunManifestWriter, SearchParamsTest, TestPercolator, TestMinSpectraPerThread, TestPrecursorCalScaffolding, TestCandidatePeptideGrid + ConsideringMetCleavage, MSGFPlusOptionsConfigFileTest, MSGFPlusOptionsActivationMethodTest): 78 tests, 0 failures, 0 errors, 3 skipped. --- .../java/edu/ucsd/msjava/fdr/ComputeFDR.java | 2 - src/main/java/edu/ucsd/msjava/fdr/PSMSet.java | 2 - .../java/edu/ucsd/msjava/fdr/TSVPSMSet.java | 7 --- .../ucsd/msjava/fdr/TargetDecoyAnalysis.java | 12 ----- .../msdbsearch/CandidatePeptideGrid.java | 5 --- ...datePeptideGridConsideringMetCleavage.java | 5 --- .../msdbsearch/CompactFastaSequence.java | 2 - .../msjava/msdbsearch/CompactSuffixArray.java | 15 +------ .../edu/ucsd/msjava/msdbsearch/DBScanner.java | 22 --------- .../ucsd/msjava/msdbsearch/MassErrorStat.java | 24 ---------- .../msjava/msdbsearch/PSMFeatureFinder.java | 7 --- .../edu/ucsd/msjava/msgf/BacktrackTable.java | 18 -------- .../edu/ucsd/msjava/msgf/DeNovoGraph.java | 1 - .../ucsd/msjava/msgf/FlexAminoAcidGraph.java | 28 ------------ .../ucsd/msjava/msgf/GeneratingFunction.java | 4 -- .../java/edu/ucsd/msjava/msgf/Histogram.java | 2 - .../msjava/msgf/MSGFDBResultGenerator.java | 11 ----- .../ucsd/msjava/msgf/NominalMassFactory.java | 1 - .../java/edu/ucsd/msjava/msgf/ProfileGF.java | 14 ------ .../ucsd/msjava/msgf/ScoredSpectrumSum.java | 1 - .../ucsd/msjava/msscorer/IonProbability.java | 2 - .../ucsd/msjava/msscorer/NewRankScorer.java | 9 ---- .../msscorer/PrecursorOffsetFrequency.java | 2 - .../edu/ucsd/msjava/msutil/AminoAcid.java | 1 - .../edu/ucsd/msjava/msutil/Composition.java | 45 ------------------- .../edu/ucsd/msjava/msutil/Constants.java | 41 ----------------- .../java/edu/ucsd/msjava/msutil/IonType.java | 2 - .../ucsd/msjava/msutil/VolatileAminoAcid.java | 1 - .../ucsd/msjava/sequences/FastaSequence.java | 11 ----- .../ucsd/msjava/sequences/FastaSequences.java | 1 - 
.../sequences/ProteinFastaSequences.java | 1 - .../ucsd/msjava/suffixarray/SuffixArray.java | 25 ----------- 32 files changed, 1 insertion(+), 323 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/fdr/ComputeFDR.java b/src/main/java/edu/ucsd/msjava/fdr/ComputeFDR.java index 15004902..72a5257f 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/ComputeFDR.java +++ b/src/main/java/edu/ucsd/msjava/fdr/ComputeFDR.java @@ -165,8 +165,6 @@ else if (!decoyFile.isFile()) if (targetFile == null) printUsageAndExit("Target is missing!"); -// if(specFileCol < 0) -// printUsageAndExit("specFileCol is missing or invalid!"); if (scoreCol < 0) printUsageAndExit("scoreCol is missing or invalid!"); if (pepCol < 0) diff --git a/src/main/java/edu/ucsd/msjava/fdr/PSMSet.java b/src/main/java/edu/ucsd/msjava/fdr/PSMSet.java index fd8f4397..15a553f2 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/PSMSet.java +++ b/src/main/java/edu/ucsd/msjava/fdr/PSMSet.java @@ -58,7 +58,5 @@ public ArrayList getPepScores() { return pepScores; } - // -// public abstract void writeResults(TargetDecoyAnalysis tda, PrintStream out, float fdrThreshold, float pepFDRThreshold, float scoreThreshold); public abstract void read(); } diff --git a/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java b/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java index d3cc769b..f0454945 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java +++ b/src/main/java/edu/ucsd/msjava/fdr/TSVPSMSet.java @@ -224,13 +224,6 @@ public static String getPeptideFromAnnotation(String annotation) { else pep = annotation; - // if there are flanking amino acids (e.g. 
R.ACDEFK.G), remove them -// int firstDotIndex = annotation.indexOf('.'); -// int lastDotIndex = annotation.lastIndexOf('.'); -// if(firstDotIndex < lastDotIndex) -// pep = annotation.substring(firstDotIndex+1, lastDotIndex); -// else -// pep = annotation; pep = pep.toUpperCase(); return pep; } diff --git a/src/main/java/edu/ucsd/msjava/fdr/TargetDecoyAnalysis.java b/src/main/java/edu/ucsd/msjava/fdr/TargetDecoyAnalysis.java index 2c5c938e..87142d59 100644 --- a/src/main/java/edu/ucsd/msjava/fdr/TargetDecoyAnalysis.java +++ b/src/main/java/edu/ucsd/msjava/fdr/TargetDecoyAnalysis.java @@ -106,7 +106,6 @@ public float getThresholdScore(float fdrThreshold, boolean isPeptideLevel) { threshold = Float.MIN_VALUE; for (Entry entry : map.entrySet()) { -// System.out.println(entry.getKey()+"\t"+entry.getValue()); if (entry.getValue() > fdrThreshold) break; else @@ -177,7 +176,6 @@ public static TreeMap getFDRMap(ArrayList target, ArrayList fdrMap.put(decoyScore, fdr); if (fdr >= 1) break; -// System.out.println("1: " + decoyScore + ":" + fdr); } } @@ -206,16 +204,6 @@ public static TreeMap getFDRMap(ArrayList target, ArrayList finalFDRMap.put(entry.getKey(), fdr); } -// if(isGreaterBetter) -// { -// finalFDRMap.put(Float.POSITIVE_INFINITY, 0f); -// finalFDRMap.put(Float.NEGATIVE_INFINITY, 1f); -// } -// else -// { -// finalFDRMap.put(Float.POSITIVE_INFINITY, 1f); -// finalFDRMap.put(Float.NEGATIVE_INFINITY, 0f); -// } return finalFDRMap; } diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGrid.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGrid.java index b0667f6d..8e1c0670 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGrid.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGrid.java @@ -155,11 +155,6 @@ public int getNumMods(int index) { return numMods[index][length]; } -// public boolean addResidue(char residue) -// { -// return addResidue(length+1, residue); -// } - /** * Add a residue to the 
candidate peptide grid * @param length diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGridConsideringMetCleavage.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGridConsideringMetCleavage.java index bfd38472..d2e27fcb 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGridConsideringMetCleavage.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CandidatePeptideGridConsideringMetCleavage.java @@ -8,11 +8,6 @@ public class CandidatePeptideGridConsideringMetCleavage extends CandidatePeptide private final CandidatePeptideGrid candidatePepGridMetCleaved; // For peptides with Met cleaved boolean isProteinNTermWithHeadingMet = false; -// public CandidatePeptideGridConsideringMetCleavage(AminoAcidSet aaSet, int maxPeptideLength) -// { -// this(aaSet, maxPeptideLength, Constants.NUM_VARIANTS_PER_PEPTIDE); -// } - public CandidatePeptideGridConsideringMetCleavage(AminoAcidSet aaSet, Enzyme enzyme, int maxPeptideLength, int maxNumVariantsPerPeptide, int maxNumMissedCleavages) { super(aaSet, enzyme, maxPeptideLength, maxNumVariantsPerPeptide, maxNumMissedCleavages); candidatePepGridMetCleaved = new CandidatePeptideGrid(aaSet, enzyme, maxPeptideLength, maxNumVariantsPerPeptide, maxNumMissedCleavages); diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java index d922e9e7..0e2200b3 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java @@ -256,8 +256,6 @@ public long getSize() { public byte getByteAt(long position) { // forget boundary check for faster access -// if(position >= this.size) return Constants.TERMINATOR; -// return this.sequence.get((int)position); return this.sequence[(int) position]; } diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java 
b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java index 033b443b..9f3bdcb5 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java @@ -190,9 +190,6 @@ private boolean isCompactSuffixArrayValid(long lastModified) { } } - //System.out.println("LastModified times in the existing csarr and cnlcp files " + - // "match the LastModified time of the sequence file (" + lastModified + ")"); - return true; } @@ -730,17 +727,7 @@ private static byte computeLcpByte(CompactFastaSequence sequence, int idxA, int @Override public String toString() { - String retVal = "Size of the suffix array: " + this.size + "\n"; -// int rank = 0; -// while(indices.hasRemaining()) { -// int index = indices.get(); -// int lcp = this.neighboringLcps.get(rank); -// retVal += rank + "\t" + index + "\t" + lcp + "\t" + sequence.toString(factory.makeSuffix(index).getSequence()) + "\n"; -// rank++; -// } -// indices.rewind(); // reset marks after iteration -// neighboringLcps.rewind(); - return retVal; + return "Size of the suffix array: " + this.size + "\n"; } public void measureNominalMassError(AminoAcidSet aaSet) throws Exception { diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java b/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java index 7b7eea4e..4f94d64e 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/DBScanner.java @@ -474,13 +474,6 @@ else if (lcp == 0) // preceding aa is changed double leftThr = (double) (theoPeptideMass - tolDaLeft); double rightThr = (double) (theoPeptideMass + tolDaRight); -// float tolDaLeft = specScanner.getLeftPrecursorMassTolerance().getToleranceAsDa(peptideMass); -// float tolDaRight = specScanner.getRightPrecursorMassTolerance().getToleranceAsDa(peptideMass); -// int maxPeptideMassIndex, minPeptideMassIndex; -// -// maxPeptideMassIndex = maxNominalPeptideMass + 
Math.round(tolDaLeft-0.4999f); -// minPeptideMassIndex = minNominalPeptideMass - Math.round(tolDaRight-0.4999f); - if (leftThr < 1 || rightThr < 1) { // Either or both of the thresholds is less than 1 (and probably negative) // This can happen when a dynamic mod with a large negative mass is defined and is applied to a small peptide @@ -494,11 +487,7 @@ else if (lcp == 0) // preceding aa is changed Collection matchedSpecKeyList = specScanner.getPepMassSpecKeyMap().subMap(leftThr, rightThr).values(); if (matchedSpecKeyList.size() > 0) { - ////// -// System.out.println("\tMatch: " + sequence.getCharAt(index)+"."+sequence.getSubsequence(index+1, index+i+1)+"."+sequence.getCharAt(index+i+1)); - /////// boolean isNTermMetCleaved = candidatePepGrid.isNTermMetCleaved(j); -// int pepLength = i; int pepLength; if (!isNTermMetCleaved) pepLength = peptideLengthIndex; @@ -675,15 +664,6 @@ public void generateSpecIndexDBMatchMap() { Map pepSeqMap = new HashMap(); for (DatabaseMatch m : matchQueue) { String pepSeq = m.getPepSeq(); -// int index = m.getIndex(); -// char pre = sa.getSequence().getCharAt(index); -// char post; -// if(m.isNTermMetCleaved()) -// post = sa.getSequence().getCharAt(index+m.getLength()); -// else -// post = sa.getSequence().getCharAt(index+m.getLength()-1); -// String key = pre+pepSeq+post; - String key = pepSeq + m.getScore(); DatabaseMatch existingMatch = pepSeqMap.get(key); if (existingMatch == null) @@ -819,7 +799,6 @@ public void addDBSearchResults(List gen, String s } float expMass = scorer.getPrecursorPeak().getMass(); -// float theoMass = pep.getParentMass(); float peptideMass = match.getPeptideMass(); float pmError = Float.MAX_VALUE; float theoMass = peptideMass + (float) Composition.H2O; @@ -830,7 +809,6 @@ public void addDBSearchResults(List gen, String s pmError = error; } } -// if(pmError > ) if (specScanner.getRightPrecursorMassTolerance().isTolerancePPM()) pmError = pmError / theoMass * 1e6f; diff --git 
a/src/main/java/edu/ucsd/msjava/msdbsearch/MassErrorStat.java b/src/main/java/edu/ucsd/msjava/msdbsearch/MassErrorStat.java index f6e6b06a..bdeba08e 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/MassErrorStat.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/MassErrorStat.java @@ -10,27 +10,19 @@ public class MassErrorStat { private List> errorList; // (error, intensity) // for all peaks (absolute) -// private float sum; private float mean; - // private float median; private float sd; // for top 7 peaks (absolute) -// private float sum7; private float mean7; - // private float median7; private float sd7; // for all peaks (absolute) -// private float rSum; private float rMean; - // private float rMedian; private float rSd; // for top 7 peaks (absolute) -// private float rSum7; private float rMean7; - // private float rMedian7; private float rSd7; public MassErrorStat() { @@ -80,10 +72,6 @@ public int size() { return errorList.size(); } -// public float getSum() { -// return sum; -// } - public float getMean() { return mean; } @@ -92,10 +80,6 @@ public float getRMean() { return rMean; } -// public float getMedian() { -// return median; -// } - public float getSd() { return sd; } @@ -104,14 +88,6 @@ public float getRSd() { return rSd; } -// public float getSum7() { -// return sum7; -// } - -// public float getRSum7() { -// return rSum7; -// } - public float getMean7() { return mean7; } diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/PSMFeatureFinder.java b/src/main/java/edu/ucsd/msjava/msdbsearch/PSMFeatureFinder.java index a21d4d1b..69fa6e4d 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/PSMFeatureFinder.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/PSMFeatureFinder.java @@ -15,7 +15,6 @@ public class PSMFeatureFinder { private final Spectrum spec; // MS/MS spectrum - // private final Spectrum precursorSpec; private final Peptide peptide; private final NewScoredSpectrum scoredSpec; @@ -43,15 +42,11 @@ public class PSMFeatureFinder { 
private int longestB = 0; private int longestY = 0; - // private Float ms1IonCurrent; -// private Float isolationWindowEfficiency; private Tolerance mme; public PSMFeatureFinder(Spectrum spec, Spectrum precursorSpec, Peptide peptide, NewRankScorer scorer) { this.spec = spec; this.peptide = peptide; -// this.precursorSpec = precursorSpec; - scoredSpec = scorer.getScoredSpectrum(spec); if (scorer.getSpecDataType().getInstrumentType().isHighResolution()) mme = new Tolerance(20f, true); // for high-precision MS/MS, set tolerance as 20ppm @@ -208,12 +203,10 @@ public Float getMS2IonCurrent() { } public Float getMS1IonCurrent() { -// return ms1IonCurrent; return null; } public Float getIsolationWindowEfficiency() { -// return isolationWindowEfficiency; return null; } } diff --git a/src/main/java/edu/ucsd/msjava/msgf/BacktrackTable.java b/src/main/java/edu/ucsd/msjava/msgf/BacktrackTable.java index 2015ad6c..a8db230a 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/BacktrackTable.java +++ b/src/main/java/edu/ucsd/msjava/msgf/BacktrackTable.java @@ -36,18 +36,6 @@ public void getReconstructions(T curNode, int score, String prefix, ArrayList edge : graph.getEdges(curNode)) { int edgeIndex = edge.getEdgeIndex(); -// String residue; -// if(edgeIndex >= 0) -// residue = String.valueOf(graph.getAASet().getAminoAcid(edgeIndex).getResidue()); -// else -// { -// if(edgeIndex == -2) -// residue="K."; -// else if(edgeIndex == -3) -// residue = "G."; -// else -// residue = ""; -// } if (pointer.isSet(score, edgeIndex)) getReconstructions(edge.getPrevNode(), score - (edge.getEdgeScore() + pointer.getNodeScore()), prefix + graph.getAASet().getAminoAcid(edgeIndex).getResidueStr(), reconstructions, sa); } @@ -64,12 +52,6 @@ public String getOneReconstruction(T curNode, int score, String prefix) { { return prefix; } -// for(T prevNode : graph.getPreviousNodes(curNode)) -// { -// int edgeIndex = graph.getEdgeIndex(curNode, prevNode); -// if(pointer.isSet(score, edgeIndex)) -// 
getOneReconstruction(prevNode, score-pointer.getCurScore(), prefix+aaSet.getAminoAcid(edgeIndex).getResidue()); -// } for (DeNovoGraph.Edge edge : graph.getEdges(curNode)) { int edgeIndex = edge.getEdgeIndex(); if (pointer.isSet(score, edgeIndex)) diff --git a/src/main/java/edu/ucsd/msjava/msgf/DeNovoGraph.java b/src/main/java/edu/ucsd/msjava/msgf/DeNovoGraph.java index 9028eac0..da66ecb0 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/DeNovoGraph.java +++ b/src/main/java/edu/ucsd/msjava/msgf/DeNovoGraph.java @@ -37,7 +37,6 @@ public ArrayList getIntermediateNodeList() { public abstract int getNodeScore(T node); - // public abstract int getEdgeScore(T curNode, T prevNode); public abstract ArrayList> getEdges(T curNode); public abstract T getComplementNode(T node); diff --git a/src/main/java/edu/ucsd/msjava/msgf/FlexAminoAcidGraph.java b/src/main/java/edu/ucsd/msjava/msgf/FlexAminoAcidGraph.java index 18f86291..eb98d5ed 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/FlexAminoAcidGraph.java +++ b/src/main/java/edu/ucsd/msjava/msgf/FlexAminoAcidGraph.java @@ -276,11 +276,6 @@ private void setBackwardEdgesFromSink() { ArrayList aaList = aaSet.getAAList(location); -// if(enzymaticCleavageOnly && direction != enzyme.isNTerm()) -// aaList = aaSet.getEnzymeAAList(); -// else -// aaList = aaSet.getAAList(location); - int peptideNominalMass = pmNode.getNominalMass(); ArrayList> edges = new ArrayList>(); for (AminoAcid aa : aaList) { @@ -321,17 +316,9 @@ private void makeForwardEdges(NominalMass curNode, ArrayList aaList, aa.getProbability(), aaSet.getIndex(aa), aa.getMass()); -// if(curNode.getNominalMass() == 57 && nextNode.getNominalMass() == 114) -// System.out.println("Debug"); int errorScore = scoredSpec.getEdgeScore(nextNode, curNode, aa.getMass()); -// if(aa.isModified()) -// errorScore += MODIFIED_EDGE_PENALTY; if (errorScore < -100 || errorScore > 100) { System.err.println("Warning, invalid ErrorScore: " + errorScore); - - // Could abort the search - // 
System.exit(-1); - // Instead, use a score of -4 errorScore = -4; } @@ -347,19 +334,4 @@ private void makeForwardEdges(NominalMass curNode, ArrayList aaList, } } -// private void computeEdgeScores() -// { -// for(NominalMass curNode : intermediateNodes) -// { -// ArrayList> edges = edgeMap.get(curNode); -// for(DeNovoGraph.Edge edge : edges) -// { -// NominalMass prevNode = edge.getPrevNode(); -// int errorScore = scoredSpec.getEdgeScore(curNode, prevNode, edge.getEdgeMass()); -// assert(errorScore == edge.getErrorScore()); -// edge.setErrorScore(errorScore); -// } -// edgeMap.put(curNode, edges); -// } -// } } diff --git a/src/main/java/edu/ucsd/msjava/msgf/GeneratingFunction.java b/src/main/java/edu/ucsd/msjava/msgf/GeneratingFunction.java index d7455754..0d0774d9 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/GeneratingFunction.java +++ b/src/main/java/edu/ucsd/msjava/msgf/GeneratingFunction.java @@ -408,8 +408,6 @@ private void setCurNode(T curNode, ScoreDistFactory scoreDistFactory) { System.err.println("Warning, MinScore is abnormally low; " + "MinScore: " + curMinScore + ", MaxScore: " + curMaxScore + ", " + "CurNode: " + curNode.getNominalMass() + ", CurNodeScore: " + curNodeScore); - // Could abort processing - // System.exit(-1); // Instead, skip this node return; } @@ -418,8 +416,6 @@ private void setCurNode(T curNode, ScoreDistFactory scoreDistFactory) { System.err.println("Warning, MaxScore is abnormally high; " + "MinScore: " + curMinScore + ", MaxScore: " + curMaxScore + ", " + "CurNode: " + curNode.getNominalMass() + ", CurNodeScore: " + curNodeScore); - // Could abort processing - // System.exit(-1); // Instead, skip this node return; } diff --git a/src/main/java/edu/ucsd/msjava/msgf/Histogram.java b/src/main/java/edu/ucsd/msjava/msgf/Histogram.java index 3b2c9046..09d65785 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/Histogram.java +++ b/src/main/java/edu/ucsd/msjava/msgf/Histogram.java @@ -68,8 +68,6 @@ public void printSortedRatio() { 
Collections.sort(keyList); for (T key : keyList) { System.out.println(key + "\t" + this.get(key) + "\t" + this.get(key) / (float) totalCount); -// System.out.print(key+"\t"+this.get(key)+"\t"); -// System.out.format("%.3f\n", this.get(key)/(float)totalCount); } } } diff --git a/src/main/java/edu/ucsd/msjava/msgf/MSGFDBResultGenerator.java b/src/main/java/edu/ucsd/msjava/msgf/MSGFDBResultGenerator.java index 18854ec0..992b3ecd 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/MSGFDBResultGenerator.java +++ b/src/main/java/edu/ucsd/msjava/msgf/MSGFDBResultGenerator.java @@ -127,17 +127,6 @@ public double getEDD(double specProbThreshold) { // returns cumulative probability <= specProbThreshold public double getSpectralProbability(double specProbThreshold) { -// int index = Arrays.binarySearch(cumScoreDist, specProbThreshold); -// if(index >= 0) -// return cumScoreDist[index]; -// else -// { -// index = -index-1; -// if(index > 0) -// return cumScoreDist[index-1]; -// else -// return 0; -// } while (curIndex < cumScoreDist.length - 1 && cumScoreDist[curIndex + 1] <= specProbThreshold) ++curIndex; diff --git a/src/main/java/edu/ucsd/msjava/msgf/NominalMassFactory.java b/src/main/java/edu/ucsd/msjava/msgf/NominalMassFactory.java index 073dd70c..adb395f3 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/NominalMassFactory.java +++ b/src/main/java/edu/ucsd/msjava/msgf/NominalMassFactory.java @@ -111,7 +111,6 @@ public boolean contains(NominalMass node) { return factory[index] != null; } - // // static methods private static NominalMassFactory defaultNominalMassFactory = new NominalMassFactory(50); public static NominalMass getInstanceFor(float mass) { diff --git a/src/main/java/edu/ucsd/msjava/msgf/ProfileGF.java b/src/main/java/edu/ucsd/msjava/msgf/ProfileGF.java index b26bc2e0..5e4c0a23 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/ProfileGF.java +++ b/src/main/java/edu/ucsd/msjava/msgf/ProfileGF.java @@ -98,8 +98,6 @@ public ProfileGF computeProfile(float specProb) { int 
thresholdScore = gf.getThresholdScore(specProb) + 1; if (thresholdScore >= gf.getMaxScore()) thresholdScore = gf.getMaxScore() - 1; -// else if(thresholdScore < gf.getMaxScore()-gf.getNumScoreBinsPerNode()) -// thresholdScore = gf.getMaxScore()-gf.getNumScoreBinsPerNode(); return computeProfile(thresholdScore); } @@ -182,18 +180,6 @@ private void setBackwardNodes(T curNode, HashMap bwdTable) { if (prevBwdDist != null) prevBwdDist.addNumber(score - curNodeScore, numberRecs); } -// for(int aaIndex : pointer.getBacktrackAAIndexList(score)) -// { -// if((bits & (1 << aaIndex)) == 0){ -// bits |= (1 << aaIndex); -// T prevNode = gf.getGraph().getPreviousNode(curNode, gf.getGraph().getAASet().getAminoAcid(aaIndex)); -// prevBwdDists[aaIndex] = bwdTable.get(prevNode); -// } -// T prevNode = gf.getGraph().getPreviousNode(curNode, gf.getAASet().getAminoAcid(aaIndex)); -// ScoreDist prevBwdDist = prevBwdDists[aaIndex]; -// if(prevBwdDist != null) -// prevBwdDist.addNumber(score-curNodeScore, numberRecs); -// } } } } diff --git a/src/main/java/edu/ucsd/msjava/msgf/ScoredSpectrumSum.java b/src/main/java/edu/ucsd/msjava/msgf/ScoredSpectrumSum.java index b6d506bb..af7ca94a 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/ScoredSpectrumSum.java +++ b/src/main/java/edu/ucsd/msjava/msgf/ScoredSpectrumSum.java @@ -42,7 +42,6 @@ public int getEdgeScore(T curNode, T prevNode, float theoMass) { } public boolean getMainIonDirection() { -// assert(false): "Not supported!"; return false; } diff --git a/src/main/java/edu/ucsd/msjava/msscorer/IonProbability.java b/src/main/java/edu/ucsd/msjava/msscorer/IonProbability.java index 0ab02834..5f442d34 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/IonProbability.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/IonProbability.java @@ -100,8 +100,6 @@ public float[] getIonProb() { } if (spec.getPeakByMass(mz, tol) != null) { numObservedPeaks[index]++; -// if(ion.getName().equals("y2-H3PO4")) -// System.out.println("Debug"); } else 
numMissingPeaks[index]++; } diff --git a/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java b/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java index 0ea1db20..290c70d6 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/NewRankScorer.java @@ -408,8 +408,6 @@ private void readFromInputStream(InputStream is, boolean verbose) { for (int i = 0; i < ionExTable.length; i++) { ionExTable[i] = in.readFloat(); if (ionExTable[i] == 0) { -// System.out.println("IonExTable: " + partition.getCharge() + " " + partition.getSegNum() -// + " " + partition.getParentMass() + " " + ionExTable[i]); ionExTable[i] = 0.001f; } assert (ionExTable[i] > 0); @@ -764,9 +762,6 @@ public void writeParameters(File outputFile) { // Rank distributions out.writeInt(maxRank); for (Partition partition : partitionSet) { -// if(partition.getParentMass() > 4100 && partition.getCharge() == 5 && partition.getSegNum() == 1) -// System.out.println("Debug"); - HashMap rankDistTable = getRankDistTable(partition); if (rankDistTable == null) continue; @@ -786,10 +781,6 @@ public void writeParameters(File outputFile) { } // Error distribution -// protected int errorScalingFactor = 0; // if 0, don't user errors, 10 for low accuracy, 100 for high accuracy -// protected Hashtable ionErrDistTable = null; -// protected Hashtable noiseErrDistTable = null; -// protected Hashtable ionExistenceTable = null; out.writeInt(errorScalingFactor); if (errorScalingFactor > 0) { for (Partition partition : partitionSet) { diff --git a/src/main/java/edu/ucsd/msjava/msscorer/PrecursorOffsetFrequency.java b/src/main/java/edu/ucsd/msjava/msscorer/PrecursorOffsetFrequency.java index 51633a14..7d55c708 100644 --- a/src/main/java/edu/ucsd/msjava/msscorer/PrecursorOffsetFrequency.java +++ b/src/main/java/edu/ucsd/msjava/msscorer/PrecursorOffsetFrequency.java @@ -84,8 +84,6 @@ else if (offList.size() == 0) float tolDa = granularity / 2 * (offList.size() - 
clusterStartIndex); clusteredOFF.add(new PrecursorOffsetFrequency(reducedCharge, offset, clusterFreq).tolerance(new Tolerance(tolDa))); -// for(PrecursorOffsetFrequency off : clusteredOFF) -// System.out.println(off.getReducedCharge()+"\t"+off.getOffset()+"\t"+off.getFrequency()+"\t"+off.getTolerance().toString()); return clusteredOFF; } } diff --git a/src/main/java/edu/ucsd/msjava/msutil/AminoAcid.java b/src/main/java/edu/ucsd/msjava/msutil/AminoAcid.java index 3f5a64f8..688ce7b4 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/AminoAcid.java +++ b/src/main/java/edu/ucsd/msjava/msutil/AminoAcid.java @@ -167,7 +167,6 @@ public int hashCode() { new AminoAcid('V', "Valine", new Composition(5, 9, 1, 1, 0)), // 99.0684 new AminoAcid('T', "Threonine", new Composition(4, 7, 1, 2, 0)), // 101.0477 new AminoAcid('C', "Cystine", new Composition(3, 5, 1, 1, 1)), // 103.0092 - // new AminoAcid('O', "Hydroxyproline", new Composition(5, 7, 1, 2, 0)), // 113.0477; note that O could be Hydroxyproline, Ornithine, or Pyrrolysine new AminoAcid('L', "Leucine", new Composition(6, 11, 1, 1, 0)), // 113.0841 new AminoAcid('I', "Isoleucine", new Composition(6, 11, 1, 1, 0)), // 113.0841 new AminoAcid('N', "Asparagine", new Composition(4, 6, 2, 2, 0)), // 114.0429 diff --git a/src/main/java/edu/ucsd/msjava/msutil/Composition.java b/src/main/java/edu/ucsd/msjava/msutil/Composition.java index 21865f50..31dc0fd1 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Composition.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Composition.java @@ -48,10 +48,6 @@ public class Composition extends Matter { */ int number; - // Unused: - // static final double[] monoMass = new double[]{C, H, N, O, S}; - // static final float[] avgMass = new float[]{12.011f, 1.00794f, 14.00674f, 15.9994f, 32.066f}; - public static final double OffsetY() { return offsetY; } @@ -71,8 +67,6 @@ public static final void setChargeCarrierMass(double mass) { } - // private Composition() {} - public Composition(int C, int H, int 
N, int O, int S) { number = C * 0x01000000 + H * 0x00010000 + N * 0x00000400 + O * 0x00000010 + S; } @@ -129,15 +123,6 @@ public Composition(String compositionStr) { compTable.get('S')).number; } - // public static final Composition getInstance(int C, int H, int N, int O, int S) - // { - // int number = C*0x01000000 + H*0x00010000 + N*0x00000400 + O*0x00000010 + S; - // } - - // public static final Composition getInstance(int number) - // { - // } - public int getC() { return (number & 0xFF000000) >>> 24; } @@ -161,8 +146,6 @@ public int getS() { public int getNumber() { return number; } - // public int getIndex() { return number; } - @Override public int hashCode() { return number; @@ -177,16 +160,6 @@ public static float getMonoMass(int number) { (number & 0x0000000F) * Composition.S); } - // Unused: - // public static float getAvgMass(int number) { - // return - // ((number & 0xFF000000) >>> 24) * avgMass[0] + - // ((number & 0x00FF0000) >> 16) * avgMass[1] + - // ((number & 0x0000FC00) >> 10) * avgMass[2] + - // ((number & 0x000003F0) >> 4) * avgMass[3] + - // (number & 0x0000000F) * avgMass[4]; - // } - @Override public float getMass() { return (float)getAccurateMass(); @@ -205,11 +178,6 @@ public int getNominalMass() { return getC() * 12 + getH() * 1 + getN() * 14 + getO() * 16 + getS() * 32; } - // Unused: - //public float getAvgMass() { - // return getC() * avgMass[0] + getH() * avgMass[1] + getN() * avgMass[2] + getO() * avgMass[3] + getS() * avgMass[4]; - //} - public String toString() { return new String(getC() + " " + getH() + " " + getN() + " " + getO() + " " + getS()); } @@ -331,19 +299,6 @@ public static Double getMass(String compositionStr) { return modMass; } - /* - public int compareTo(Composition c) - { - float diff = getMass() - c.getMass(); - if(diff == 0) - return this.number - c.number; - else if(diff > 0) - return 1; - else - return -1; - } - */ - public static class CompositionComparator implements Comparator { public int compare(Integer 
c1, Integer c2) { double mass1 = Composition.getMonoMass(c1); diff --git a/src/main/java/edu/ucsd/msjava/msutil/Constants.java b/src/main/java/edu/ucsd/msjava/msutil/Constants.java index 2b6a510d..a9642b84 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Constants.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Constants.java @@ -73,8 +73,6 @@ public class Constants { public static String SPECTRUM_FILE_NAME = ""; -// public static SpectraFileType SPECTRA_FILE_TYPE = SpectraFileType.DTA; - public static String INSTRUMENTS_NAME = "QTOF"; public static String PTM_FILE_NAME = "PTMDB.xml"; @@ -101,9 +99,6 @@ public class Constants { public static final int maxPTMSizePerGap = 5; - // public static final int maxPTMOccurrence = 5; - - public static final String SPECTRUM_EXTENSION = ".unidta"; public static final String ANALYSIS_EXTENSION = ".unidrawing"; @@ -120,9 +115,6 @@ public class Constants { public static final float MAXIMIM_PRECURSOR_MASS_ERROR = 1.5f; - // public static final boolean IS_TRYPTIC = true; // not used currently - - // if true, write unidrawing only tag chains whose all gaps are annotated public static final boolean writeAnnotatedTagChainOnly = false; @@ -181,39 +173,6 @@ public static String getString(float value) public static float PTM_ADD_PENALTY = 0.2f; - // Can use Wysocki paper results? 
- /* - public static float getMissingPenaltyWeight(PeakProperty property) - - { - - if(property == PeakProperty.Y_ION) - - return 0.4f; - - else if(property == PeakProperty.Y_MINUS_NH3_ION) - - return 0; - - else if(property == PeakProperty.Y_MINUS_H2O_ION) - - return 0; - - else if(property == PeakProperty.B_ION) - - return 0; - - else if(property == PeakProperty.A_ION) - - return 0; - - else - - return 0; - - } - */ - public static float getNotExplainedPenaltyWeight() { diff --git a/src/main/java/edu/ucsd/msjava/msutil/IonType.java b/src/main/java/edu/ucsd/msjava/msutil/IonType.java index 092ca973..2ecfa388 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/IonType.java +++ b/src/main/java/edu/ucsd/msjava/msutil/IonType.java @@ -273,9 +273,7 @@ public static ArrayList getAllKnownIonTypes(int maxCharge, boolean remo } } -// System.out.println(maxCharge+"\t"+removeRedundancy+"\t"+nlString); Collections.sort(ionList, new IonTypeComparator()); -// System.out.println("Sorting Done"); if (!removeRedundancy) return ionList; diff --git a/src/main/java/edu/ucsd/msjava/msutil/VolatileAminoAcid.java b/src/main/java/edu/ucsd/msjava/msutil/VolatileAminoAcid.java index 6e8d48f8..4485b644 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/VolatileAminoAcid.java +++ b/src/main/java/edu/ucsd/msjava/msutil/VolatileAminoAcid.java @@ -21,7 +21,6 @@ public boolean isModified() { public static AminoAcid getVolatileAminoAcid(float mass) { AminoAcid aa = table.get(mass); if (aa == null) { -// System.out.println("Register " + mass); aa = new VolatileAminoAcid(mass); table.put(mass, aa); } diff --git a/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java b/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java index 75edbc50..826ea55e 100644 --- a/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java +++ b/src/main/java/edu/ucsd/msjava/sequences/FastaSequence.java @@ -25,7 +25,6 @@ public class FastaSequence implements Sequence { // the original serialized fasta file private 
ByteBuffer original; - //private char[] original; // the number of characters in the buffer private int size; @@ -362,20 +361,11 @@ public String getSubsequence(long start, long end) { char[] seq = new char[(int) (end - start)]; for (long i = start; i < end; i++) { seq[(int) (i - start)] = (char) this.original.get((int) i); - //seq[(int)(i-start)] = this.original[(int)i]; } return new String(seq); } - /* - public String toString(long start, int ext) { - return toString(start, start+ext); - } - */ - public char getCharAt(long position) { - //return toChar(getByteAt(position)); - //return this.original[(int)position]; return (char) this.original.get((int) position); } @@ -425,7 +415,6 @@ public String getAnnotation(long position) { public long getStartPosition(long position) { Integer startPos = annotations.floorKey((int) position); if (startPos == null) { - //System.err.println("There is no start for position " + position); return 0; } return startPos; diff --git a/src/main/java/edu/ucsd/msjava/sequences/FastaSequences.java b/src/main/java/edu/ucsd/msjava/sequences/FastaSequences.java index 75260e20..46bb4ba7 100644 --- a/src/main/java/edu/ucsd/msjava/sequences/FastaSequences.java +++ b/src/main/java/edu/ucsd/msjava/sequences/FastaSequences.java @@ -177,7 +177,6 @@ public int getAlphabetSize() { public String getAnnotation(long position) { long pair = translate(position); - //System.out.println(this.files.get((int)(pair>>>32)) + " "); return getSequence((int) (pair >>> 32)).getAnnotation((int) pair); } diff --git a/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequences.java b/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequences.java index e97cec7f..022128dc 100644 --- a/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequences.java +++ b/src/main/java/edu/ucsd/msjava/sequences/ProteinFastaSequences.java @@ -209,7 +209,6 @@ public int getAlphabetSize() { public String getAnnotation(long position) { long pair = translate(position); - 
//System.out.println(this.files.get((int)(pair>>>32)) + " "); return getSequence((int) (pair >>> 32)).getAnnotation((int) pair); } diff --git a/src/main/java/edu/ucsd/msjava/suffixarray/SuffixArray.java b/src/main/java/edu/ucsd/msjava/suffixarray/SuffixArray.java index 1fb312a9..3e52e3ca 100644 --- a/src/main/java/edu/ucsd/msjava/suffixarray/SuffixArray.java +++ b/src/main/java/edu/ucsd/msjava/suffixarray/SuffixArray.java @@ -53,9 +53,6 @@ private static void queryAllSubstrings(SuffixArray sa, SuffixArraySequence seque if (pos >= 0) { String match = sequence.getSubsequence(sa.getPosition(pos), sa.getPosition(pos) + length); if (match.equals(query)) { - //System.out.println("We found correctly " + query + " at " + pos); - //System.out.println(sequence.toString(sa.getPosition(pos), length)); - //System.exit(-1); tp++; } else { fn++; @@ -101,15 +98,7 @@ private static void debug() { String userHome = System.getProperty("user.home"); int iterations = 1000000; - fastaFile = userHome + "/Data/Databases/tiny.fasta"; - //fastaFile = userHome+"/Data/Databases/small.fasta"; - //fastaFile = userHome+"/Data/Databases/single.fasta"; - //fastaFile = userHome+"/Data/Databases/uniprot_sprot.fasta"; fastaFile = userHome + "/Data/Databases/yeast_nr050706.fasta"; - //fastaFile = "/home/sangtaekim/Research/Data/EColiDB/Ecol_protein_formatted.fasta"; - //fastaFile = "/home/sangtaekim/Research/Data/SProt/uniprot_sprot.fasta"; - //fastaFile = userHome+"/Desktop/test.fasta"; - //fastaFile = "/home/sangtaekim/Research/Data/HumanGenome/translated/HSRM.NCBI36.54.translation.0.fasta"; long time = System.currentTimeMillis(); SuffixArraySequence sequence = new SuffixArraySequence(fastaFile); @@ -119,15 +108,6 @@ private static void debug() { SuffixArray sa = new SuffixArray(sequence); System.out.println("-- Loading SuffixArray file time: " + (System.currentTimeMillis() - time) / 1000.0 + "s"); - //MatchSet match = sa.findAll("PKVPFDPKFKEKLYDSYLDKAAKTK"); - - //System.out.println("Translating _ 
" + sequence.toByte('_')); - - // print out the matches - //for (int i=0; i< match.getSize(); i++) { - // System.out.println(sequence.toString(match.getStart(i), 10)); - //} - time = System.currentTimeMillis(); queryAllSubstrings(sa, sequence, iterations); @@ -777,9 +757,6 @@ public int search(ByteSequence pattern) { } - // To-do (sangtae): search suffix array using partial matches - // public MatchSet search(MatchSet partialMatch, byte b) - /** * Treat the parameter as the source of input. One line per query. * @@ -790,8 +767,6 @@ public void searchWithFile(BufferedReader in) { } public void printAllPeptides(AminoAcidSet aaSet, int minLength, int maxLength) { - // ArrayList> pepList = new ArrayList>(); - double[] aaMass = new double[128]; for (int i = 0; i < aaMass.length; i++) aaMass[i] = -1; From a3994defbcce08320e7bea4346480428921e2e48 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 16:45:45 +0100 Subject: [PATCH 33/34] fix(mgf): strip UTF-8 BOM in BufferedLineReader + drop dead MSGFResult MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups from the PR #25 review thread: 1. BOM strip in BufferedLineReader (Copilot finding). The constructor wrapped the FileInputStream in UnicodeBOMInputStream but never called skipBOM() -- BOM bytes leaked into line 1, breaking config / mod / FASTA files saved by editors that prepend a UTF-8 BOM (Windows Notepad, some VS Code configurations, etc.). Fix: chain .skipBOM() in the super(...) call. The BOM-stripping path is per-instance; the wrapper still detects the BOM in its constructor, we just now consume it. New test BufferedLineReaderTest pins the contract: a file with a UTF-8 BOM followed by `ParentMassTolerance=20ppm` returns the bare key=value as line 1 (not `ParentMassTolerance=20ppm`), and a no-BOM file is unchanged. 2. Delete unused MSGFResult.java. Verified zero callers in src/main and src/test before removal. 
The reviewer flagged this as a small follow-up to land alongside the modernization sweep. Verified: scoped sweep (BufferedLineReaderTest, MSGFPlusOptionsConfigFileTest, MSGFPlusOptionsActivationMethodTest, SearchParamsTest, TestPrecursorCalScaffolding, TestRunManifestWriter, TestDirectPinWriter, TestMSUtils, TestMisc, TestMinSpectraPerThread): 55 tests, 0 failures, 0 errors, 2 skipped. --- .../ucsd/msjava/mgf/BufferedLineReader.java | 9 ++- .../java/edu/ucsd/msjava/msgf/MSGFResult.java | 33 ----------- .../msjava/mgf/BufferedLineReaderTest.java | 56 +++++++++++++++++++ 3 files changed, 62 insertions(+), 36 deletions(-) delete mode 100644 src/main/java/edu/ucsd/msjava/msgf/MSGFResult.java create mode 100644 src/test/java/edu/ucsd/msjava/mgf/BufferedLineReaderTest.java diff --git a/src/main/java/edu/ucsd/msjava/mgf/BufferedLineReader.java b/src/main/java/edu/ucsd/msjava/mgf/BufferedLineReader.java index d068aed6..e7135ecc 100644 --- a/src/main/java/edu/ucsd/msjava/mgf/BufferedLineReader.java +++ b/src/main/java/edu/ucsd/msjava/mgf/BufferedLineReader.java @@ -3,13 +3,16 @@ import java.io.*; /** - * Buffered line reader class - * Uses UnicodeBOMInputStream to properly detect files that start with a byte order mark + * Buffered line reader. Wraps the file in {@link UnicodeBOMInputStream} + * and consumes the BOM via {@code skipBOM()} so the first line returned by + * {@link #readLine()} never contains the BOM glyph -- this matters for + * config / mod / FASTA files saved by Windows editors that prepend a UTF-8 + * BOM. 
*/ public class BufferedLineReader extends BufferedReader implements LineReader { public BufferedLineReader(String fileName) throws IOException { - super(new InputStreamReader(new UnicodeBOMInputStream(new FileInputStream(fileName)))); + super(new InputStreamReader(new UnicodeBOMInputStream(new FileInputStream(fileName)).skipBOM())); } @Override diff --git a/src/main/java/edu/ucsd/msjava/msgf/MSGFResult.java b/src/main/java/edu/ucsd/msjava/msgf/MSGFResult.java deleted file mode 100644 index a4ac6aa3..00000000 --- a/src/main/java/edu/ucsd/msjava/msgf/MSGFResult.java +++ /dev/null @@ -1,33 +0,0 @@ -package edu.ucsd.msjava.msgf; - -import edu.ucsd.msjava.msutil.Peptide; -import edu.ucsd.msjava.msutil.Spectrum; - -public class MSGFResult { - public MSGFResult(Spectrum spec, Peptide annotation, GeneratingFunction gf) { - this.spec = spec; - this.annotation = annotation; - this.gf = gf; - } - - public Spectrum getSpec() { - return spec; - } - - public Peptide getAnnotation() { - return annotation; - } - - public GeneratingFunction getGf() { - return gf; - } - - public ProfileGF getProfGF() { - return profGF; - } - - private Spectrum spec; - private Peptide annotation; - private GeneratingFunction gf; - private ProfileGF profGF; -} diff --git a/src/test/java/edu/ucsd/msjava/mgf/BufferedLineReaderTest.java b/src/test/java/edu/ucsd/msjava/mgf/BufferedLineReaderTest.java new file mode 100644 index 00000000..f8af4d39 --- /dev/null +++ b/src/test/java/edu/ucsd/msjava/mgf/BufferedLineReaderTest.java @@ -0,0 +1,56 @@ +package edu.ucsd.msjava.mgf; + +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Regression test for the BOM-strip fix on {@link BufferedLineReader}: the + * constructor must invoke {@link UnicodeBOMInputStream#skipBOM()} so the + * leading byte-order-mark bytes are consumed before the first + * {@link 
BufferedLineReader#readLine()} call. Caught by the Copilot review on + * PR #25. + */ +public class BufferedLineReaderTest { + + private static final byte[] UTF8_BOM = new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}; + + @Test + public void firstLineDoesNotContainUtf8Bom() throws IOException { + Path tmp = Files.createTempFile("msgfplus-bom-", ".txt"); + try { + byte[] payload = ("ParentMassTolerance=20ppm\n").getBytes(StandardCharsets.UTF_8); + byte[] withBom = new byte[UTF8_BOM.length + payload.length]; + System.arraycopy(UTF8_BOM, 0, withBom, 0, UTF8_BOM.length); + System.arraycopy(payload, 0, withBom, UTF8_BOM.length, payload.length); + Files.write(tmp, withBom); + + try (BufferedLineReader reader = new BufferedLineReader(tmp.toString())) { + String first = reader.readLine(); + Assert.assertEquals("BOM bytes must not appear in line 1", "ParentMassTolerance=20ppm", first); + Assert.assertNull("only one line in fixture", reader.readLine()); + } + } finally { + Files.deleteIfExists(tmp); + } + } + + @Test + public void firstLineUnchangedWhenNoBomPresent() throws IOException { + Path tmp = Files.createTempFile("msgfplus-no-bom-", ".txt"); + try { + Files.writeString(tmp, "Header\nbody\n"); + try (BufferedLineReader reader = new BufferedLineReader(tmp.toString())) { + Assert.assertEquals("Header", reader.readLine()); + Assert.assertEquals("body", reader.readLine()); + Assert.assertNull(reader.readLine()); + } + } finally { + Files.deleteIfExists(tmp); + } + } +} From 38b02edb0f1eaced1bbbc0b42af44d4403bd4bae Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 27 Apr 2026 16:57:39 +0100 Subject: [PATCH 34/34] refactor: first-wave record migration (8 types) Convert eight value-shaped types to Java records, per the audit: small immutable carriers with no inheritance constraints, where the record's auto-generated equals/hashCode/toString and accessor methods replace hand-rolled boilerplate. No behavioral change; net trim of ~80 LOC plus a clearer surface. 
Converted: - cli.IntRange -- record IntRange(int min, int max). Compact constructor keeps the min<=max validation. parse() and Converter (picocli ITypeConverter) preserved. - cli.PrecursorTolerance -- record PrecursorTolerance(Tolerance left, Tolerance right). Compact constructor enforces the matching-unit and non-negative invariants in-place; parse() / Converter retained. - msutil.CvParamInfo -- record CvParamInfo(String accession, String name, String value, String unitAccession, String unitName). The stored hasUnit field is gone; hasUnit() is now derived from unitAccession != null. Compatibility getters (getAccession(), getValue(), getUnitAccession(), getUnitName(), getHasUnit()) keep existing call sites untouched. - msutil.Annotation -- record Annotation(AminoAcid prevAA, Peptide peptide, AminoAcid nextAA). Setters (which had no live callers) removed; the parsing constructor delegates via this(...). isProteinNTerm/isProteinCTerm and the dotted toString are kept. - msgf.ProfilePeak -- record. Setters were unused; Comparable<ProfilePeak<T>> is preserved. - msutil.Atom -- record Atom(String code, double mass, int nominalMass, String name). Static atomArr table + atomMap + static initializer block all kept verbatim. getCode() / getName() / getMass() / getNominalMass() retained as compatibility wrappers. - msdbsearch.CompactSuffixArray.RangeMetadata -- record (4 file/int fields). Three call sites updated from md.numEntries to md.numEntries() etc. - msgf.MassListComparator.MatchedPair -- record. getMass1/getMass2 retained as compatibility wrappers. Call-site updates in SearchParams.parse: tol.left/.right, isotope.min/.max, specIdx.min/.max, ms.min/.max all switched to record accessor methods (.left()/.right()/.min()/.max()). Verified: scoped sweep (incl.
BufferedLineReaderTest, MSGFPlusOptionsConfigFileTest, MSGFPlusOptionsActivationMethodTest, SearchParamsTest, TestPrecursorCalScaffolding, TestRunManifestWriter, TestDirectPinWriter, TestMinSpectraPerThread, TestMSUtils, TestSA, TestMisc, TestCandidatePeptideGrid + ConsideringMetCleavage): 80 tests, 0 failures, 0 errors, 3 skipped. --- .../java/edu/ucsd/msjava/cli/IntRange.java | 9 +- .../ucsd/msjava/cli/PrecursorTolerance.java | 24 +++--- .../msjava/msdbsearch/CompactSuffixArray.java | 28 ++----- .../ucsd/msjava/msdbsearch/SearchParams.java | 16 ++-- .../ucsd/msjava/msgf/MassListComparator.java | 18 +--- .../edu/ucsd/msjava/msgf/ProfilePeak.java | 29 ++----- .../edu/ucsd/msjava/msutil/Annotation.java | 82 +++++-------------- .../java/edu/ucsd/msjava/msutil/Atom.java | 46 ++--------- .../edu/ucsd/msjava/msutil/CvParamInfo.java | 60 ++++---------- 9 files changed, 80 insertions(+), 232 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/cli/IntRange.java b/src/main/java/edu/ucsd/msjava/cli/IntRange.java index fd792fe1..7a8cd369 100644 --- a/src/main/java/edu/ucsd/msjava/cli/IntRange.java +++ b/src/main/java/edu/ucsd/msjava/cli/IntRange.java @@ -8,17 +8,12 @@ * {@code "min,max"} or single value {@code "n"} (interpreted as * {@code n,n}). Used by {@code -ti}, {@code -msLevel}, {@code -index}. 
*/ -public final class IntRange { +public record IntRange(int min, int max) { - public final int min; - public final int max; - - public IntRange(int min, int max) { + public IntRange { if (min > max) { throw new IllegalArgumentException("min (" + min + ") > max (" + max + ")"); } - this.min = min; - this.max = max; } public static IntRange parse(String value) { diff --git a/src/main/java/edu/ucsd/msjava/cli/PrecursorTolerance.java b/src/main/java/edu/ucsd/msjava/cli/PrecursorTolerance.java index f55d0db5..b214ef01 100644 --- a/src/main/java/edu/ucsd/msjava/cli/PrecursorTolerance.java +++ b/src/main/java/edu/ucsd/msjava/cli/PrecursorTolerance.java @@ -10,14 +10,18 @@ * asymmetric form ({@code "0.5Da,2.5Da"}). Both sides must use the * same unit and be non-negative. */ -public final class PrecursorTolerance { +public record PrecursorTolerance(Tolerance left, Tolerance right) { - public final Tolerance left; - public final Tolerance right; - - private PrecursorTolerance(Tolerance left, Tolerance right) { - this.left = left; - this.right = right; + public PrecursorTolerance { + if (left == null || right == null) { + throw new IllegalArgumentException("left and right tolerances must be non-null"); + } + if (left.isTolerancePPM() != right.isTolerancePPM()) { + throw new IllegalArgumentException("left and right tolerance units must be the same"); + } + if (left.getValue() < 0 || right.getValue() < 0) { + throw new IllegalArgumentException("parent mass tolerance must not be negative"); + } } public static PrecursorTolerance parse(String value) { @@ -34,12 +38,6 @@ public static PrecursorTolerance parse(String value) { if (l == null || r == null) { throw new IllegalArgumentException("invalid tolerance value: " + value); } - if (l.isTolerancePPM() != r.isTolerancePPM()) { - throw new IllegalArgumentException("left and right tolerance units must be the same"); - } - if (l.getValue() < 0 || r.getValue() < 0) { - throw new IllegalArgumentException("parent mass tolerance must 
not be negative"); - } return new PrecursorTolerance(l, r); } diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java index 9f3bdcb5..2f8083ef 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java @@ -430,14 +430,14 @@ private static void sortAndWriteBuckets(CompactFastaSequence sequence, int prevRangeLastBucketFirst = -1; for (RangeMetadata md : rangeMetadatas) { - if (md.numEntries == 0) continue; + if (md.numEntries() == 0) continue; mergeRangeIntoOutput(sequence, md, prevRangeLastBucketFirst, indexOut, nlcpOut); - prevRangeLastBucketFirst = md.lastBucketFirstSuffix; + prevRangeLastBucketFirst = md.lastBucketFirstSuffix(); } } finally { for (RangeMetadata md : rangeMetadatas) { - deleteQuietly(md.tempIndicesFile); - deleteQuietly(md.tempLcpsFile); + deleteQuietly(md.tempIndicesFile()); + deleteQuietly(md.tempLcpsFile()); } // Sweep debris from workers that died before returning a RangeMetadata. 
File[] orphans = parentDir.listFiles((dir, name) -> name.startsWith(tempBasename)); @@ -464,8 +464,8 @@ private static void mergeRangeIntoOutput(CompactFastaSequence sequence, int prevRangeLastBucketFirst, DataOutputStream indexOut, DataOutputStream nlcpOut) throws IOException { - try (DataInputStream idxIn = new DataInputStream(new BufferedInputStream(new FileInputStream(md.tempIndicesFile))); - DataInputStream lcpIn = new DataInputStream(new BufferedInputStream(new FileInputStream(md.tempLcpsFile)))) { + try (DataInputStream idxIn = new DataInputStream(new BufferedInputStream(new FileInputStream(md.tempIndicesFile()))); + DataInputStream lcpIn = new DataInputStream(new BufferedInputStream(new FileInputStream(md.tempLcpsFile())))) { int firstIndex = idxIn.readInt(); byte firstLcp = lcpIn.readByte(); if (prevRangeLastBucketFirst >= 0) { @@ -474,7 +474,7 @@ private static void mergeRangeIntoOutput(CompactFastaSequence sequence, indexOut.writeInt(firstIndex); nlcpOut.writeByte(firstLcp); - for (int i = 1; i < md.numEntries; i++) { + for (int i = 1; i < md.numEntries(); i++) { indexOut.writeInt(idxIn.readInt()); nlcpOut.writeByte(lcpIn.readByte()); } @@ -645,19 +645,7 @@ private static void writeBucketsDirect(CompactFastaSequence sequence, /** Per-worker sort+LCP output handle. Indices/LCPs live on disk; this carries * the small metadata the merge step needs. Empty ranges return {@code null} * file paths. 
*/ - static final class RangeMetadata { - final File tempIndicesFile; - final File tempLcpsFile; - final int numEntries; - final int lastBucketFirstSuffix; - - RangeMetadata(File tempIndicesFile, File tempLcpsFile, int numEntries, int lastBucketFirstSuffix) { - this.tempIndicesFile = tempIndicesFile; - this.tempLcpsFile = tempLcpsFile; - this.numEntries = numEntries; - this.lastBucketFirstSuffix = lastBucketFirstSuffix; - } - } + record RangeMetadata(File tempIndicesFile, File tempLcpsFile, int numEntries, int lastBucketFirstSuffix) {} /** Growable {@code int[]} bucket of suffix indices. Shared between the * bucketing phase (sequential {@link #add}) and the per-range worker diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index e1b913cc..58794855 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -314,8 +314,8 @@ public String parse(MSGFPlusOptions opts) { decoyProteinPrefix = opts.decoyPrefix != null ? opts.decoyPrefix : "XXX"; PrecursorTolerance tol = opts.precursorTolerance != null ? opts.precursorTolerance : PrecursorTolerance.parse("20ppm"); - leftPrecursorMassTolerance = tol.left; - rightPrecursorMassTolerance = tol.right; + leftPrecursorMassTolerance = tol.left(); + rightPrecursorMassTolerance = tol.right(); int toleranceUnit = opts.precursorToleranceUnits != null ? opts.precursorToleranceUnits : 2; if (toleranceUnit != 2) { @@ -325,8 +325,8 @@ public String parse(MSGFPlusOptions opts) { } IntRange isotope = opts.isotopeErrorRange != null ? 
opts.isotopeErrorRange : new IntRange(0, 1); - this.minIsotopeError = isotope.min; - this.maxIsotopeError = isotope.max; + this.minIsotopeError = isotope.min(); + this.maxIsotopeError = isotope.max(); if (rightPrecursorMassTolerance.getToleranceAsDa(1000, 2) >= 0.5f || leftPrecursorMassTolerance.getToleranceAsDa(1000, 2) >= 0.5f) { @@ -384,8 +384,8 @@ public String parse(MSGFPlusOptions opts) { numMatchesPerSpec = opts.numMatchesPerSpec != null ? opts.numMatchesPerSpec : 1; IntRange specIdx = opts.specIndexRange != null ? opts.specIndexRange : new IntRange(1, Integer.MAX_VALUE - 1); - startSpecIndex = specIdx.min; - endSpecIndex = specIdx.max; + startSpecIndex = specIdx.min(); + endSpecIndex = specIdx.max(); useTDA = opts.effectiveTdaStrategy() == 1; ignoreMetCleavage = (opts.ignoreMetCleavage != null ? opts.ignoreMetCleavage : 0) == 1; @@ -426,8 +426,8 @@ public String parse(MSGFPlusOptions opts) { precursorCalMode = opts.precursorCalMode != null ? opts.precursorCalMode : PrecursorCalMode.AUTO; IntRange ms = opts.msLevel != null ? 
opts.msLevel : new IntRange(2, 2); - minMSLevel = ms.min; - maxMSLevel = ms.max; + minMSLevel = ms.min(); + maxMSLevel = ms.max(); maxNumMods = opts.effectiveMaxNumMods(); int maxNumModsCompare = aaSet.getMaxNumberOfVariableModificationsPerPeptide(); diff --git a/src/main/java/edu/ucsd/msjava/msgf/MassListComparator.java b/src/main/java/edu/ucsd/msjava/msgf/MassListComparator.java index b74c3872..f5e558bb 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/MassListComparator.java +++ b/src/main/java/edu/ucsd/msjava/msgf/MassListComparator.java @@ -50,21 +50,9 @@ else if (i2 == massList2.size() - 1) } - public static class MatchedPair { - T m1, m2; - - public MatchedPair(T m1, T m2) { - this.m1 = m1; - this.m2 = m2; - } - - public T getMass1() { - return m1; - } - - public T getMass2() { - return m2; - } + public record MatchedPair(T m1, T m2) { + public T getMass1() { return m1; } + public T getMass2() { return m2; } } } diff --git a/src/main/java/edu/ucsd/msjava/msgf/ProfilePeak.java b/src/main/java/edu/ucsd/msjava/msgf/ProfilePeak.java index 0048ef3e..bf4e4a76 100644 --- a/src/main/java/edu/ucsd/msjava/msgf/ProfilePeak.java +++ b/src/main/java/edu/ucsd/msjava/msgf/ProfilePeak.java @@ -2,32 +2,13 @@ import edu.ucsd.msjava.msutil.Matter; -public class ProfilePeak implements Comparable> { - T node; - float probability; +public record ProfilePeak(T node, float probability) implements Comparable> { - public ProfilePeak(T node, float probability) { - this.node = node; - this.probability = probability; - } - - public T getNode() { - return node; - } - - public void setNode(T node) { - this.node = node; - } - - public float getProbability() { - return probability; - } - - public void setProbability(float probability) { - this.probability = probability; - } + public T getNode() { return node; } + public float getProbability() { return probability; } + @Override public int compareTo(ProfilePeak p) { return node.compareTo(p.node); } -} \ No newline at end of file +} diff --git 
a/src/main/java/edu/ucsd/msjava/msutil/Annotation.java b/src/main/java/edu/ucsd/msjava/msutil/Annotation.java index 2a5874cb..2e60e689 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Annotation.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Annotation.java @@ -1,68 +1,28 @@ package edu.ucsd.msjava.msutil; -public class Annotation { - private AminoAcid prevAA; - private Peptide peptide; - private AminoAcid nextAA; - - public Annotation(AminoAcid prevAA, Peptide peptide, AminoAcid nextAA) { - this.prevAA = prevAA; - this.peptide = peptide; - this.nextAA = nextAA; - } +public record Annotation(AminoAcid prevAA, Peptide peptide, AminoAcid nextAA) { public Annotation(String annotationStr, AminoAcidSet aaSet) { - String pepStr = annotationStr.substring(annotationStr.indexOf('.') + 1, annotationStr.lastIndexOf('.')); - char prevAAResidue = annotationStr.charAt(0); - char nextAAResidue = annotationStr.charAt(annotationStr.length() - 1); - - prevAA = aaSet.getAminoAcid(prevAAResidue); - peptide = aaSet.getPeptide(pepStr); - nextAA = aaSet.getAminoAcid(nextAAResidue); - } - - public boolean isProteinNTerm() { - return prevAA == null; - } - - public boolean isProteinCTerm() { - return nextAA == null; - } - - public AminoAcid getPrevAA() { - return prevAA; - } - - public void setPrevAA(AminoAcid prevAA) { - this.prevAA = prevAA; - } - - public Peptide getPeptide() { - return peptide; - } - - public void setPeptide(Peptide peptide) { - this.peptide = peptide; - } - - public AminoAcid getNextAA() { - return nextAA; - } - - public void setNextAA(AminoAcid nextAA) { - this.nextAA = nextAA; - } - - @Override - public String toString() { - if (peptide == null) - return null; - StringBuffer output = new StringBuffer(); - if (prevAA != null) - output.append(prevAA.getResidueStr()); - output.append("." 
+ peptide.toString() + "."); - if (nextAA != null) - output.append(nextAA.getResidueStr()); + this( + aaSet.getAminoAcid(annotationStr.charAt(0)), + aaSet.getPeptide(annotationStr.substring(annotationStr.indexOf('.') + 1, annotationStr.lastIndexOf('.'))), + aaSet.getAminoAcid(annotationStr.charAt(annotationStr.length() - 1)) + ); + } + + public boolean isProteinNTerm() { return prevAA == null; } + public boolean isProteinCTerm() { return nextAA == null; } + + public AminoAcid getPrevAA() { return prevAA; } + public Peptide getPeptide() { return peptide; } + public AminoAcid getNextAA() { return nextAA; } + + @Override public String toString() { + if (peptide == null) return null; + StringBuilder output = new StringBuilder(); + if (prevAA != null) output.append(prevAA.getResidueStr()); + output.append('.').append(peptide).append('.'); + if (nextAA != null) output.append(nextAA.getResidueStr()); return output.toString(); } } diff --git a/src/main/java/edu/ucsd/msjava/msutil/Atom.java b/src/main/java/edu/ucsd/msjava/msutil/Atom.java index c5213ca0..c5b149c2 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Atom.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Atom.java @@ -2,46 +2,16 @@ import java.util.HashMap; -public class Atom { - public Atom(String code, double mass, int nominalMass, String name) { - this.code = code; - this.mass = mass; - this.nominalMass = nominalMass; - this.name = name; - } - - public String getCode() { - return code; - } - - public String getName() { - return name; - } - - public double getMass() { - return mass; - } +public record Atom(String code, double mass, int nominalMass, String name) { - public int getNominalMass() { - return nominalMass; - } - - public static Atom[] getAtomarr() { - return atomArr; - } - - public static HashMap getAtomMap() { - return atomMap; - } - - public static Atom get(String code) { - return atomMap.get(code); - } + public String getCode() { return code; } + public String getName() { return name; } + public 
double getMass() { return mass; } + public int getNominalMass() { return nominalMass; } - private final String code; - private final String name; - private final double mass; - private final int nominalMass; + public static Atom[] getAtomarr() { return atomArr; } + public static HashMap getAtomMap() { return atomMap; } + public static Atom get(String code) { return atomMap.get(code); } private static final Atom[] atomArr = { diff --git a/src/main/java/edu/ucsd/msjava/msutil/CvParamInfo.java b/src/main/java/edu/ucsd/msjava/msutil/CvParamInfo.java index 86a83109..32b620be 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/CvParamInfo.java +++ b/src/main/java/edu/ucsd/msjava/msutil/CvParamInfo.java @@ -6,53 +6,21 @@ * * @author Bryson Gibbons */ -public class CvParamInfo { - private final String accession; - private final String name; - private final String value; - private final String unitAccession; - private final String unitName; - private final Boolean hasUnit; +public record CvParamInfo(String accession, String name, String value, + String unitAccession, String unitName) { public CvParamInfo(String accession, String name, String value) { - this.accession = accession; - this.name = name; - this.value = value; - this.unitAccession = null; - this.unitName = null; - this.hasUnit = false; + this(accession, name, value, null, null); } - public CvParamInfo(String accession, String name, String value, String unitAccession, String unitName) { - this.accession = accession; - this.name = name; - this.value = value; - this.hasUnit = true; - this.unitAccession = unitAccession; - this.unitName = unitName; - } - - public String getAccession() { - return this.accession; - } - - public String getName() { - return this.name; - } - - public String getValue() { - return this.value; - } - - public Boolean getHasUnit() { - return this.hasUnit; - } - - public String getUnitAccession() { - return this.unitAccession; - } - - public String getUnitName() { - return this.unitName; - } -} + 
public boolean hasUnit() { + return unitAccession != null; + } + + public String getAccession() { return accession; } + public String getName() { return name; } + public String getValue() { return value; } + public Boolean getHasUnit() { return hasUnit(); } + public String getUnitAccession() { return unitAccession; } + public String getUnitName() { return unitName; } +}