diff --git a/Makefile.am b/Makefile.am index 7ebf1d31..068ffad6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -236,6 +236,7 @@ dnmtools_SOURCES += src/amrfinder/amrtester.cpp dnmtools_SOURCES += src/radmeth/dmr.cpp dnmtools_SOURCES += src/radmeth/methdiff.cpp dnmtools_SOURCES += src/radmeth/radmeth_optimize.cpp +dnmtools_SOURCES += src/radmeth/radmeth_model.cpp dnmtools_SOURCES += src/radmeth/radmeth.cpp dnmtools_SOURCES += src/radmeth/radmeth-adjust.cpp dnmtools_SOURCES += src/radmeth/radmeth-merge.cpp diff --git a/docs/content/radmeth.md b/docs/content/radmeth.md index 54d03292..d46dc108 100644 --- a/docs/content/radmeth.md +++ b/docs/content/radmeth.md @@ -1,62 +1,65 @@ ## radmeth - Differential CpG methylation ## Synopsis + ```console -$ dnmtools radmeth -factor case design_matrix.txt proportion_table.txt >output.bed +dnmtools radmeth -factor case -o output.txt design_matrix.txt proportion_table.txt ``` ## Description -The DM detection method described in this section is based on the -beta-binomial regression. We recommend using this method when more -than three replicates are available in each group. For rapid -differential methylation analysis the regression-based method should -be run on a computing cluster with a few hundred available nodes, in -which case it takes approximately a few hours to process a dataset -consisting of 30-50 WGBS samples. The analysis can be also performed -on a personal workstation, but it will take substantially longer. Note -that the actual processing time depends on the coverage of each -methylome, the number of sites analyzed, and the number of methylomes -in the dataset. +The differential-methylation detection method described in this section is +based on the beta-binomial regression. We recommend using this method when +more than three replicates are available in each group. For rapid differential +methylation analysis the regression-based method should be run on a large +machine with many cores. 
Note that the actual processing time depends on the +coverage of each methylome, the number of sites analyzed, and the number of +methylomes in the dataset. ### Generating proportion tables -The first step in the differential methylation analysis is to assemble -a proportion table containing read proportions for all target -methylomes. Dnmtools includes a program [merge](../merge) to generate -a proportion table from the given collection of methylomes. Suppose -that we want to compare methylomes named `control-a.meth`, -`control-b.meth`, `control-c.meth` to the methylomes `case-a.meth`, -`case-b.meth`, `case-c.meth`. The proportion table can be created with -the following command: +The first step in the differential methylation analysis is to assemble a +proportion table containing read proportions for all target +methylomes. Dnmtools includes a program [merge](../merge) to generate a +proportion table from the given collection of methylomes. Suppose that we want +to compare methylomes named `control-a.meth`, `control-b.meth`, +`control-c.meth` to the methylomes `case-a.meth`, `case-b.meth`, +`case-c.meth`. The proportion table can be created with the following +command: + ```console $ dnmtools merge -t -radmeth \ control-a.meth control-b.meth control-c.meth \ case-a.meth case-b.meth case-c.meth > proportion-table.txt ``` + This is what `proportion-table.txt` looks like: + ```txt - control_a control_b control_c case_a case_b case_c +control_a control_b control_c case_a case_b case_c chr1:108:+:CpG 9 6 10 8 1 1 2 2 2 1 14 1 chr1:114:+:CpG 17 7 10 0 14 3 5 1 9 1 7 1 chr1:160:+:CpG 12 8 10 5 17 4 15 14 13 6 4 4 chr1:309:+:CpG 1 1 1 0 17 12 12 8 2 1 19 8 ``` -As indicated in the header, this proportion table contains information -about 6 methylomes: 3 controls and 3 cases. Each row of the table -contains information about a CpG site and a proportion of reads -mapping over this site in each methylome. 
For example, the first row -describes a cytosine within a CpG site located on chromosome 1 at -position 108. This site is present in 9 reads in the methylome control -a and is methylated in 6 of them. Note that the dnmtools `merge` -command adds methylomes into the proportion table in the order in -which they are listed on the command line. + +As indicated in the header, this proportion table contains information about 6 +methylomes: 3 controls and 3 cases. Each row of the table contains +information about a CpG site and a proportion of reads mapping over this site +in each methylome. For example, the first row describes a cytosine within a +CpG site located on chromosome 1 at position 108. This site is present in 9 +reads in the methylome control a and is methylated in 6 of them. Note that the +dnmtools `merge` command adds methylomes into the proportion table in the +order in which they are listed on the command line. + +Note: the header conforms to a traditional "data frame" from R, which does not +include a column name for the row names, which are the first column. ### Design matrix -The next step is to specify the design matrix, which describes the -structure of the experiment. For our running example, the design -matrix looks like this: +The next step is to specify the design matrix, which describes the structure +of the experiment. For our running example, the design matrix looks like this: + ```txt base case control_a 1 0 @@ -66,59 +69,67 @@ case_a 1 1 case_b 1 1 case_c 1 1 ``` -The design matrix shows that samples in this dataset are associated -with two factors: base and case. The first column corresponds to the -base factor and will always be present in the design matrix. Think of -it as stating that all samples have the same baseline mean methylation -level. To distinguish cases from controls we add another factor case -(second column). The `1`s in this column correspond to the samples -which belong to the cases group. 
You can use this design matrix as a -template to create design matrices for two-group comparisons involving -arbitrary many methylomes. - -After creating the proportion table and the design matrix, we are now -ready to start the differential methylation analysis. It consists of -(1) regression, (2) combining significance, and (3) multiple testing -adjustment steps. + +The design matrix shows that samples in this dataset are associated with two +factors: base and case. The first column corresponds to the base factor and +will always be present in the design matrix. Think of it as stating that all +samples have the same baseline mean methylation level. To distinguish cases +from controls we add another factor case (second column). The `1`s in this +column correspond to the samples which belong to the cases group. You can use +this design matrix as a template to create design matrices for two-group +comparisons involving arbitrary many methylomes. + +After creating the proportion table and the design matrix, we are now ready to +start the differential methylation analysis. It consists of (1) regression, +(2) combining significance, and (3) multiple testing adjustment steps. ### Regression Suppose that the `proportion-table.txt` and `design-matrix.txt` are as described above. The regression step is run with the command + ```console -$ dnmtools radmeth -factor case design-matrix.txt proportion-table.txt > output.bed +$ dnmtools radmeth -factor case -o output.txt design-matrix.txt proportion-table.txt ``` -The `-factor` parameter specifies the factor with respect to which we -want to test for differential methylation. The test factor is case, -meaning that we are testing for differential methylation between cases -and controls. In the output file `output.bed`, the last four columns -correspond to the total read counts and methylated read counts of the -case group and control group, respectively. The p-value (5-th column) -is given by the log-likelihood ratio test. 
- -Depending on the distribution of methylated and unmethylated CpGs, -some p-values may not be calculated. If there are no reads that cover -a given CpG, the value `NA_LOW_COV` will be printed instead of a -p-value. If all CpGs are fully methylated and fully methylated in both -case and control, we say the CpG has an "extreme distribution". In -these cases, we cannot reject the null and output `NA_EXTREME_CNT`. -Finally, if there are numerical errors in the log-likelihood ratio -test, we simply print `NA`. You can opt out of these additional NA -informations by not using the `-n` flag. - -We do not recommend using the individual p-values of CpGs from -radmeth, as they often do not contain sufficient coverage for -significant biological interpretations. Instead, we recommend merging -p-values of neighboring CpGs using [radadjust](../radadjust). + +The `-factor` parameter specifies the factor with respect to which we want to +test for differential methylation. The test factor is 'case', meaning that we +are testing for differential methylation between cases and controls. In the +output file `output.txt`, the last four columns correspond to the total read +counts and methylated read counts of the case group and control group, +respectively. The p-value (5-th column) is given by the log-likelihood ratio +test. + +Depending on the distribution of methylated and unmethylated CpGs, some +p-values may not be calculated. If there are no reads that cover a given CpG, +the value `NA_LOW_COV` will be printed instead of a p-value. If all CpGs are +fully methylated or fully unmethylated in both case and control, we say the CpG +has an "extreme distribution". In these cases, we cannot reject the null and +output `NA_EXTREME_CNT`. Finally, if there are numerical errors in the +log-likelihood ratio test, we simply print `NA`. You can opt out of these +additional NA information by not using the `-n` flag. 
+ +We do not recommend relying too much on the individual p-values of CpGs from +radmeth, as they often do not contain sufficient coverage for significant +biological interpretations. Instead, we recommend merging p-values of +neighboring CpGs using [radadjust](../radadjust). ## Tutorial -This tutorial aims to answer questions users often have about the -assumptions of radmeth and how to interepret the results. We will -assume a factor of interest, with levels A and B. We will also assume -a potentially confounding factor, sex, with levels M and F. Further, -we have 8 total samples, 2 samples for each of the 4 combinations of -{A,B}x{M,F}. The design matrix should look like this: +The data used here is available at: + +```txt +https://github.com/andrewdavidsmith/radmeth-demo-simulated-data +``` + +And the results below were generated with v1.5.1 on Linux. + +This tutorial should answer questions users often have about the assumptions +of radmeth and how to interpret the results. We will assume a factor of +interest, with levels A and B. We will also assume a potentially confounding +factor, sex, with levels M and F. Further, we have 8 total samples, 2 samples +for each of the 4 combinations of {A,B}x{M,F}. The design matrix should look +like this: ```txt base sex factor @@ -131,28 +142,28 @@ sample_MA2 1 0 1 sample_MB1 1 0 0 sample_MB2 1 0 0 ``` -Note that the 3 headings correspond with the three binary columns, but -need not be aligned over them. The first sample in the design matrix, -`sample_FA1`, has an indicator in the other columns for both the "F" -and the "A", both encoded with a "1" in their respective columns. But -the replicate number 1 in the `_FA1` is just a replicate, and if it -had meaning we would consider encoding it using some factor. -Designating it as a replicate with no meaning is a choice here, -aligning with the idea that the replicates are statistically -identical. 
If all replicates identified with a "2" were done on "day -2", for example, then we might reconsider. - -For our example we will use a genome file `genome.fa` that is very -small and not a real genome. We will also use a file `features.bed` -that will be the starting point for generating data that fit with our -hypotheses. We will be generating data that *should* show us the -differences we want to see -- assuming we use the tools properly. I -will refer to each of the intervals in `features.bed` as a "feature" -but just think of it as a genomic interval with a chromosome name, a -start and an end. We need to first obtain some features that are -associated specifically with each of the levels of our factors: A, B, + +Note that the 3 headings correspond with the three binary columns, but need +not be aligned over them. The first sample in the design matrix, `sample_FA1`, +has an indicator in the other columns for both the "F" and the "A", both +encoded with a "1" in their respective columns. But the replicate number 1 in +the `_FA1` is just a replicate, and if it had meaning we would consider +encoding it using some factor. Designating it as a replicate with no meaning +is a choice here, aligning with the idea that the replicates are statistically +identical. If all replicates identified with a "2" were done on "day 2", for +example, then we might reconsider. + +For our example we will use a genome file `genome.fa` that is very small and +not a real genome. We will also use a file `features.bed` that will be the +starting point for generating data that fit with our hypotheses. We will be +generating data that *should* show us the differences we want to see -- +assuming we use the tools properly. I will refer to each of the intervals in +`features.bed` as a "feature" but just think of it as a genomic interval with +a chromosome name, a start and an end. 
We need to first obtain some features +that are associated specifically with each of the levels of our factors: A, B, M and F. We can do this starting with our features file and using the `bedtools` shuffle command: + ```console $ bedtools shuffle -i features.bed -g genome.chrom.sizes > sim/features_M.bed; $ cat sim/features_*.bed | sort -k 1,1 -k 2,2g -o excl.bed; @@ -162,10 +173,11 @@ $ bedtools shuffle -excl excl.bed -i features.bed -g genome.chrom.sizes > sim/fe $ cat sim/features_*.bed | sort -k 1,1 -k 2,2g -o excl.bed; $ bedtools shuffle -excl excl.bed -i features.bed -g genome.chrom.sizes > sim/features_B.bed; ``` -This ensures none of the "features" are overlapping between A, B, M or -F, and all are similar in number and size distribution. Using each of -these, I then made files of features corresponding to the -*combinations* of factors: + +This ensures none of the "features" are overlapping between A, B, M or F, and +all are similar in number and size distribution. Using each of these, I then +made files of features corresponding to the *combinations* of factors: + ```shell for i in M F; do for j in A B; do @@ -174,28 +186,30 @@ for i in M F; do done; done; ``` + Now I have files named: + ```txt sim/features_MA.bed sim/features_MB.bed sim/features_FA.bed sim/features_FB.bed ``` -The files named with an "M" include all intervals in -`sim/features_M.bed`, and similarly the files named with a "B" include -all intervals in `sim/features_B.bed`. So the file -`sim/features_FA.bed` is all the features for "A" and all the features -for "F". It will differ from `sim/features_FB.bed` less than it will -from `sim/features_MB.bed`. - -Using these files, I simulated methylation levels at every CpG site in -the fake genome `genome.fa` with CpGs outside the intervals receiving -a methylation level of 0.8 and CpGs inside the intervals receiving a -methylation level of 0.15. 
At each CpG site, a number of reads was -sampled (Poisson with mean 10) and then the methylation levels were -sampled with a coin flip weighed as either 0.8 or 0.15, depending on -the CpG site. I did this twice for each combination of {M,F}x{A,B}. -The result is 8 files named as follows: + +The files named with an "M" include all intervals in `sim/features_M.bed`, and +similarly the files named with a "B" include all intervals in +`sim/features_B.bed`. So the file `sim/features_FA.bed` is all the features +for "A" and all the features for "F". It will differ from +`sim/features_FB.bed` less than it will from `sim/features_MB.bed`. + +Using these files, I simulated methylation levels at every CpG site in the +fake genome `genome.fa` with CpGs outside the intervals receiving a +methylation level of 0.8 and CpGs inside the intervals receiving a methylation +level of 0.15. At each CpG site, a number of reads was sampled (Poisson with +mean 10) and then the methylation levels were sampled with a coin flip weighed +as either 0.8 or 0.15, depending on the CpG site. I did this twice for each +combination of {M,F}x{A,B}. The result is 8 files named as follows: + ```console $ ls -1 sim/*.counts sim/sample_FA1.counts @@ -207,12 +221,14 @@ sim/sample_MA2.counts sim/sample_MB1.counts sim/sample_MB2.counts ``` -The program I used for the simulation above is not currently -available. In this simulation, the only difference between replicates -(e.g., `sim/sample_FA1.counts` vs. `sim/sample_FA2.counts`) is random -noise due to sampling. + +The program I used for the simulation above is not currently available. In +this simulation, the only difference between replicates (e.g., +`sim/sample_FA1.counts` vs. `sim/sample_FA2.counts`) is random noise due to +sampling. 
Here is a look inside one of these files: + ```console $ head sim/sample_FA1.counts chr1 163 + CG 0.857143 7 @@ -226,12 +242,16 @@ chr1 324 + CG 0.555556 9 chr1 350 + CG 0.933333 15 chr1 356 + CG 0.923077 13 ``` + Then I used the `merge` command from dnmtools to make a data matrix as follows: + ```console $ dnmtools merge -t -radmeth -v -remove .counts -o sim/table.txt sim/sample_*.counts ``` + And this is what it gave me: + ```console $ head sim/table.txt sample_FA1 sample_FA2 sample_FB1 sample_FB2 sample_MA1 sample_MA2 sample_MB1 sample_MB2 @@ -245,67 +265,75 @@ chr1:322:+:CG 6 5 6 6 9 8 19 14 13 11 13 11 9 6 10 7 chr1:324:+:CG 9 5 10 9 8 5 10 9 7 6 14 9 7 6 6 6 chr1:350:+:CG 15 14 8 6 6 5 9 6 7 6 6 4 9 8 11 10 ``` -With this simulated data, in which each column has the influence of -two different factors, we can use `radmeth` with the design matrix -shown above (in a file named `desgin.txt`) to get p-values for -differential methylation between levels A and B of our factor of -interest -- while controling for the effect of M/F: + +With this simulated data, in which each column has the influence of two +different factors, we can use `radmeth` with the design matrix shown above (in +a file named `desgin.txt`) to get p-values for differential methylation +between levels A and B of our factor of interest -- while controling for the +effect of M/F: + ```console $ dnmtools radmeth -o sim/samples.radmeth -f factor design.txt sim/table.txt ``` -As explained already, the 5th column of the output gives the p-values -for differential methylation between levels for our factor of interest -(named "factor" in the design matrix). Here is a look at part of the -output: + +As explained already, the 5th column of the output gives the p-values for +differential methylation between levels for our factor of interest (named +"factor" in the design matrix). 
Here is a look at part of the output: + ```console $ head sim/samples.radmeth -chr1 163 + CG 0.0864953 35 22 45 36 -chr1 206 + CG 0.444024 37 30 43 38 -chr1 232 + CG 0.756591 28 20 35 26 -chr1 278 + CG 0.780385 43 35 37 31 -chr1 296 + CG 0.64704 33 29 38 32 -chr1 310 + CG 0.640826 37 28 31 22 -chr1 322 + CG 0.0971364 38 33 47 35 -chr1 324 + CG 0.257851 40 29 31 26 -chr1 350 + CG 0.893476 36 30 35 29 -chr1 356 + CG 0.0527032 37 35 42 33 +chr1 163 + CG 0.0877822 35 22 45 36 +chr1 206 + CG 0.442745 37 30 43 38 +chr1 232 + CG 0.756925 28 20 35 26 +chr1 278 + CG 0.794626 43 35 37 31 +chr1 296 + CG 0.635986 33 29 38 32 +chr1 310 + CG 0.641986 37 28 31 22 +chr1 322 + CG 0.0973903 38 33 47 35 +chr1 324 + CG 0.260703 40 29 31 26 +chr1 350 + CG 0.888242 36 30 35 29 +chr1 356 + CG 0.0535286 37 35 42 33 ``` -Focusing on the 5th column, we see values between 0 and 1, but none of -them are very extreme. This particular output file has 17903 rows. We -can use the dnmtools command `selectsites` to pull out the rows in -this file that correspond to each of the original feature sets. Here -is how we would do it for all 4 different original sets of features: + +Focusing on the 5th column, we see values between 0 and 1, but none of them +are very extreme. This particular output file has 17903 rows. We can use the +dnmtools command `selectsites` to pull out the rows in this file that +correspond to each of the original feature sets. 
Here is how we would do it +for all 4 different original sets of features: + ```shell for i in M F A B; do - dnmtools selectsites -o sim/features_${i}.txt sim/features_${i}.bed sim/samples.radmeth; + dnmtools selectsites -relaxed -o sim/features_${i}.txt sim/features_${i}.bed sim/samples.radmeth; done ``` -The result allows us to verify that `radmeth` with the design matrix -given above identifies sites as significant when they differ between -levels A and B of the factor of interest: + +The result allows us to verify that `radmeth` with the design matrix given +above identifies sites as significant when they differ between levels A and B +of the factor of interest: + ```console -$ for i in sim/features_*.counts; do echo $i `awk 'BEGIN{k=0;c=0}{k+=$5;c+=1}END{print k,c,k/c}' $i`; done | column -t -sim/features_A.counts 0.255448 430 0.000594064 -sim/features_B.counts 0.114256 372 0.00030714 -sim/features_F.counts 196.093 400 0.490232 -sim/features_M.counts 295.234 588 0.502099 +$ for i in sim/features_[ABFM].txt; do echo $i `awk 'BEGIN{k=0;c=0}{k+=$5;c+=1}END{print k,c,k/c}' $i`; done | column -t +sim/features_A.txt 0.256072 430 0.000595517 +sim/features_B.txt 0.114786 372 0.000308564 +sim/features_F.txt 197.332 400 0.493329 +sim/features_M.txt 297.728 588 0.50634 ``` -The p-values in features associated with either M or F are averaging -around 0.5, which is what we want if we are trying to control this -possible confounding variable. In the features associated either with -A or B, the p-values are very small on average. Of course this is -literally by design. Knowing that we simulated just as much difference -between M and F as between A and B, if we had used `-f sex` then -`radmeth` would have correctly assigned low p-values to sites in the -intervals of `features_M.bed` and `features_F.bed`. - -We can use the same simulated data to verify that we do need the -intercept in our design. 
In other words, the column "base" in -`design.txt` needs to be present (you can give it any name you want; -make sure it's all 1s). - -We change the design matrix to make the following, which has the -intercept column removed: + ```console $ cat design_noint.txt sex factor @@ -318,44 +346,56 @@ sample_MA2 0 1 sample_MB1 0 0 sample_MB2 0 0 ``` -Then we re-run all the above commands (I won't exaplain each) to get -results analogous but with this modified design matrix: + +Then we re-run all the above commands (I won't explain each) to get results +analogous but with this modified design matrix: + ```console $ dnmtools radmeth -o sim/samples_noint.radmeth -f factor design_noint.txt sim/table.txt -$ for i in M F A B; do dnmtools selectsites -o sim/features_noint_${i}.txt sim/features_${i}.bed sim/samples_noint.radmeth; done +$ for i in M F A B; do dnmtools selectsites -relaxed -o sim/features_noint_${i}.txt sim/features_${i}.bed sim/samples_noint.radmeth; done $ for i in sim/*_noint_*.txt; do echo $i `awk 'BEGIN{k=0;c=0}{k+=$5;c+=1}END{print k,c,k/c}' $i`; done | column -t -sim/features_noint_A.txt 8.27788 430 0.0192509 
-sim/features_noint_B.txt 15.2391 372 0.0409653 -sim/features_noint_F.txt 77.2438 400 0.193109 -sim/features_noint_M.txt 93.7181 588 0.159385 +sim/features_noint_A.txt 8.27931 430 0.0192542 +sim/features_noint_B.txt 15.2397 372 0.040967 +sim/features_noint_F.txt 77.2656 400 0.193164 +sim/features_noint_M.txt 93.7312 588 0.159407 ``` -Although the sites within features corresponding to A and B are much -lower on average than those for M or F, the distinction has been -reduced. More importantly, we know that the sites in -`sim/features_M.bed` and `sim/features_F.bed` are not influenced by -the level A/B of our factor of interest. The p-values for sites in the -last two rows above should have mean of 0.5. But without the intercept -included in the design matrix, the outcomes to not match our -statistical assumptions. I cannot predict what will happen if you use -the wrong design matrix. + +Although the sites within features corresponding to A and B are much lower on +average than those for M or F, the distinction has been reduced. More +importantly, we know that the sites in `sim/features_M.bed` and +`sim/features_F.bed` are not influenced by the level A/B of our factor of +interest. The p-values for sites in the last two rows above should have mean +of 0.5. But without the intercept included in the design matrix, the outcomes +do not match our statistical assumptions. I cannot predict what will happen if +you use the wrong design matrix. 
In analyzing bisulfite sequencing data for +differential methylation, we become aware of uncertainty through the counts, +and we leverage this to try and be more accurate. So unlike other regression +settings, if you try to normalize the data first, in order to avoid having to +deal with an intercept, you will not arrive at an equivalent procedure -- if +you want to normalize the data yourself (i.e. subtract mu and divide by sigma) +then you are probably better off using a general regression package. ## Options ```txt -o, -out ``` + The name of the output file (default: stdout). +```txt + -t, -threads +``` + +Use multiple threads. This gives very good speedup as long as you do not +exceed the number of available physical cores. Most CPUs support 2 hardware +threads per physical core via SMT/Hyper-Threading, so check your number of +cores if you are unsure. + ```txt -n -na-info ``` + If a p-value is not calculated, print NAs in more detail: low coverage (`NA_LOW_COV`), extreme values (`NA_EXTREME`) or numerical errors in likelihood ratios (`NA`). @@ -363,10 +403,34 @@ likelihood ratios (`NA`). ```txt -v, -verbose ``` + Print more information while the command is running. +```txt + -progress +``` + +Show progress while radmeth runs. + ```txt -f, -factor ``` + The name of the factor on which to test differences. This must be the name of one of the columns in file design matrix. This is required. + +```txt + -tolerance +``` + +The numerical tolerance for convergence when estimating model parameters. +This cutoff is applied to the norm of the gradient of the log-likelihood +function for MLE parameter estimation, and is multiplied by both the square +root of the number of parameters and by the number of methylomes in the +analysis. + +```txt + -max-iter +``` + +The maximum iterations to allow when estimating model parameters. 
diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index d8886c5b..6eb3ba53 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -760,7 +760,6 @@ consistent_existing_targets( return true; } -int counter{}; struct read_processor { static constexpr auto default_expected_basecall_model = "dna_r10.4.1_e8.2_400bps_hac@v4.3.0"; diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 815e599a..0fa8fcbe 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -16,15 +16,17 @@ * General Public License for more details. */ -#include // GSL header for chisqrd distribution +#include // GSL header for chisqrd distribution #include #include +#include #include #include #include #include #include +#include #include // smithlab headers @@ -37,175 +39,44 @@ #include "radmeth_optimize.hpp" using std::begin; -using std::cerr; using std::cout; using std::distance; using std::end; -using std::endl; -using std::istringstream; -using std::runtime_error; -using std::sort; -using std::string; -using std::vector; - -static std::istream & -operator>>(std::istream &is, Design &design) { - string header_encoding; - getline(is, header_encoding); - - istringstream header_is(header_encoding); - string header_name; - while (header_is >> header_name) design.factor_names.push_back(header_name); - - string row; - while (getline(is, row)) { - if (row.empty()) continue; - - istringstream row_is(row); - string token; - row_is >> token; - design.sample_names.push_back(token); - - vector matrix_row; - while (row_is >> token) { - if (token.length() == 1 && (token == "0" || token == "1")) - matrix_row.push_back(token == "1"); - else - throw runtime_error("only binary factor levels are allowed:\n" + row); - } - - if (matrix_row.size() != design.n_factors()) - throw runtime_error( - "each row must have as many columns as factors:\n" + - row); - design.matrix.push_back(vector()); - swap(design.matrix.back(), matrix_row); +struct file_progress { + 
double one_hundred_over_filesize{}; + std::size_t prev_offset{}; + explicit file_progress(const std::string &filename) : + one_hundred_over_filesize{100.0 / std::filesystem::file_size(filename)} {} + void + operator()(std::ifstream &in) { + const std::size_t curr_offset = in.tellg() * one_hundred_over_filesize; + if (curr_offset <= prev_offset) + return; + std::cerr << "[progress: " << std::setw(3) << curr_offset + << (curr_offset == 100 ? "%]\n" : "%]\r"); + prev_offset = (curr_offset == 100) ? std::numeric_limits::max() + : curr_offset; } - return is; -} - -static std::ostream & -operator<<(std::ostream &out, const Design &design) { - for (size_t factor = 0; factor < design.factor_names.size(); ++factor) { - out << design.factor_names[factor]; - if (factor + 1 != design.factor_names.size()) out << "\t"; - } - out << endl; - - for (size_t i = 0; i < design.n_samples(); ++i) { - out << design.sample_names[i]; - for (size_t factor = 0; factor < design.factor_names.size(); ++factor) - out << "\t" << design.matrix[i][factor]; - out << "\n"; - } - return out; -} +}; static void -remove_factor(Design &design, const size_t factor) { - design.factor_names.erase(begin(design.factor_names) + factor); - for (size_t i = 0; i < design.n_samples(); ++i) - design.matrix[i].erase(begin(design.matrix[i]) + factor); -} - -// Parses a natural number from its string representation. Throws exception if -// the string does not encode one. 
-static size_t -parse_natural_number(string encoding) { - istringstream iss(encoding); - size_t number; - iss >> number; - if (!iss) - throw runtime_error("The token \"" + encoding + - "\" does not encode a natural number"); - return number; -} - -static std::istream & -operator>>(std::istream &table_encoding, SiteProportions &props) { - props.chrom.clear(); - props.position = 0; - props.strand.clear(); - props.context.clear(); - props.meth.clear(); - props.total.clear(); - - string row_encoding; - getline(table_encoding, row_encoding); - - // Skip lines contining only the newline character (e.g. the last line of the - // proportion table). - if (row_encoding.empty()) return table_encoding; - - // Get the row name (which must be specified like this: "chr:position") and - // parse it. - istringstream row_stream(row_encoding); - string row_name_encoding; - row_stream >> row_name_encoding; - - // Every row must start an identifier consisiting of genomic loci of the - // corresponding site. Here we check this identifier has the correct number - // of colons. - const size_t num_colon = - count(row_name_encoding.begin(), row_name_encoding.end(), ':'); - - if (num_colon != 3) - throw runtime_error( - "Each row in the count table must start with " - "a line chromosome:position:strand:context. Got \"" + - row_name_encoding + "\" instead."); - - // First parse the row identifier. - istringstream name_stream(row_name_encoding); - getline(name_stream, props.chrom, ':'); - - if (props.chrom.empty()) - throw runtime_error("Error parsing " + row_name_encoding + - ": chromosome name is missing."); - - string position_encoding; - - getline(name_stream, position_encoding, ':'); - props.position = parse_natural_number(position_encoding); - getline(name_stream, props.strand, ':'); - getline(name_stream, props.context, ':'); - - // After parsing the row identifier, parse count proportions. 
- size_t total_count, meth_count; - - while (row_stream >> total_count >> meth_count) { - props.total.push_back(total_count); - props.meth.push_back(meth_count); - } - - if (!row_stream.eof()) - throw runtime_error("Some row entries are not natural numbers: " + - row_stream.str()); - - if (props.total.size() != props.meth.size()) - throw runtime_error( - "This row does not encode proportions" - "correctly:\n" + - row_encoding); - return table_encoding; -} - -static bool -fit_regression_model(Regression &r) { - vector initial_params; - return fit_regression_model(r, initial_params); +remove_factor(Design &design, const std::size_t factor_idx) { + design.factor_names.erase(std::begin(design.factor_names) + factor_idx); + for (std::size_t i = 0; i < design.n_samples(); ++i) + design.matrix[i].erase(std::begin(design.matrix[i]) + factor_idx); } /***************** RADMETH ALGORITHM *****************/ static bool -consistent_sample_names(const Regression ®, const string &header) { - istringstream iss(header); - auto nm_itr(begin(reg.design.sample_names)); - const auto nm_end(end(reg.design.sample_names)); - string token; +consistent_sample_names(const Regression ®, const std::string &header) { + std::istringstream iss(header); + auto nm_itr(std::begin(reg.design.sample_names)); + const auto nm_end(std::end(reg.design.sample_names)); + std::string token; while (iss >> token && nm_itr != nm_end) - if (token != *nm_itr++) return false; + if (token != *nm_itr++) + return false; return true; } @@ -213,13 +84,13 @@ consistent_sample_names(const Regression ®, const string &header) { // function outputs the p-value of the log-likelihood ratio. *Note* that it is // assumed that the reduced model has one fewer factor than the reduced model. static double -loglikratio_test(double null_loglik, double full_loglik) { +llr_test(double null_loglik, double full_loglik) { // The log-likelihood ratio statistic. 
const double log_lik_stat = -2 * (null_loglik - full_loglik); // It is assumed that null model has one fewer factor than the full // model. Hence the number of degrees of freedom is 1. - const size_t degrees_of_freedom = 1; + const std::size_t degrees_of_freedom = 1; // Log-likelihood ratio statistic has a chi-sqare distribution. const double chisq_p = gsl_cdf_chisq_P(log_lik_stat, degrees_of_freedom); @@ -229,46 +100,273 @@ loglikratio_test(double null_loglik, double full_loglik) { } static bool -has_low_coverage(const Regression ®, const size_t test_fac) { +has_low_coverage(const Regression ®, const std::size_t test_fac) { bool cvrd_in_test_fact_smpls = false; - for (size_t i = 0; i < reg.n_samples() && !cvrd_in_test_fact_smpls; ++i) + for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_test_fact_smpls; ++i) cvrd_in_test_fact_smpls = - (reg.design.matrix[i][test_fac] == 1 && reg.props.total[i] != 0); + (reg.design.matrix[i][test_fac] == 1 && reg.props.mc[i].n_reads != 0); bool cvrd_in_other_smpls = false; - for (size_t i = 0; i < reg.n_samples() && !cvrd_in_other_smpls; ++i) + for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_other_smpls; ++i) cvrd_in_other_smpls = - (reg.design.matrix[i][test_fac] != 1 && reg.props.total[i] != 0); + (reg.design.matrix[i][test_fac] != 1 && reg.props.mc[i].n_reads != 0); return !cvrd_in_test_fact_smpls || !cvrd_in_other_smpls; } -static bool +[[nodiscard]] static bool has_extreme_counts(const Regression ®) { - bool is_maximally_methylated = true; - for (size_t i = 0; i < reg.n_samples() && is_maximally_methylated; ++i) - is_maximally_methylated = (reg.props.total[i] == reg.props.meth[i]); + const auto &mc = reg.props.mc; + + bool full_meth = true; + for (auto i = std::cbegin(mc); i != std::cend(mc) && full_meth; ++i) + full_meth = (i->n_reads == i->n_meth); - bool is_unmethylated = true; - for (size_t i = 0; i < reg.n_samples() && is_unmethylated; ++i) - is_unmethylated = (reg.props.meth[i] == 0.0); + bool no_meth = 
true; + for (auto i = std::cbegin(mc); i != std::cend(mc) && no_meth; ++i) + no_meth = (i->n_meth == 0.0); - return is_maximally_methylated || is_unmethylated; + return full_meth || no_meth; } +[[nodiscard]] static bool +has_two_values(const Regression ®, const std::size_t test_factor) { + const auto &v = reg.design.tmatrix[test_factor]; + for (auto i = std::cbegin(v); i != std::cend(v); ++i) + if (*i != v[0]) + return true; + return false; +} -static bool -verify_multiple_levels(const Regression &full_regression, - const size_t test_factor) { - const size_t n_samples = full_regression.design.n_samples(); - const auto first_level = full_regression.design.matrix[0][test_factor]; - bool test_fact_mult_levels = false; - for (size_t i = 1; i < n_samples; ++i) - if (full_regression.design.matrix[i][test_factor] != first_level) - test_fact_mult_levels = true; - return test_fact_mult_levels; +[[nodiscard]] static std::uint32_t +get_test_factor_idx(const Regression &model, const std::string &test_factor) { + const auto &factors = model.design.factor_names; + const auto itr = + std::find(std::cbegin(factors), std::cend(factors), test_factor); + + if (itr == std::cend(factors)) + throw std::runtime_error("Factor not part of design: " + test_factor); + + return std::distance(std::cbegin(factors), itr); } +static void +read_design(const bool verbose, const std::string &design_filename, + Design &design) { + if (verbose) + std::cerr << "design table filename: " << design_filename << std::endl; + std::ifstream design_file(design_filename); + if (!design_file) + throw std::runtime_error("could not open file: " + design_filename); + + // initialize full design matrix from file + design_file >> design; + + if (verbose) + std::cerr << design << std::endl; +} + +enum class row_status : std::uint8_t { + ok, + na, + na_low_cov, + na_extreme_cnt, +}; + +[[nodiscard]] static std::vector +drop_idx(const std::vector &v, const std::size_t to_drop) { + std::vector u; + u.reserve(std::size(v) 
- 1); + for (auto i = 0u; i < std::size(v); ++i) + if (i != to_drop) + u.push_back(v[i]); + return u; +} + +static inline void +get_chunk_bounds(const std::uint32_t n_elements, const std::uint32_t n_chunks, + std::vector> &chunks) { + const std::uint32_t q = n_elements / n_chunks; + const std::uint32_t r = n_elements - q * n_chunks; + std::uint32_t block_start{}; + for (std::size_t i = 0; i < n_chunks; ++i) { + const auto sz = (i < r) ? q + 1 : q; + const auto block_end = block_start + sz; + chunks[i] = {block_start, block_end}; + block_start = block_end; + } +} + +static void +radmeth(const bool show_progress, const bool more_na_info, + const std::uint32_t n_threads, const std::string &table_filename, + const std::string &outfile, const Regression &alt_model, + const Regression &null_model, const std::uint32_t test_factor_idx) { + static constexpr auto prefix_fmt = "%s\t%ld\t%c\t%s\t"; + static constexpr auto suffix_fmt = "\t%ld\t%ld\t%ld\t%ld\n"; + static constexpr auto buf_size = 1024; + static constexpr auto max_lines = 16384; + + // ADS: open the data table file + std::ifstream table_file(table_filename); + if (!table_file) + throw std::runtime_error("could not open file: " + table_filename); + + // Make sure that the first line of the proportion table file contains + // names of the samples. Throw an exception if the names or their order + // in the proportion table does not match those in the full design matrix. + std::string sample_names_header; + std::getline(table_file, sample_names_header); + + if (!consistent_sample_names(alt_model, sample_names_header)) + throw std::runtime_error( + "header:\n" + sample_names_header + "\n" + + "does not match factor names or their order in the\n" + "design matrix. 
Check that the design matrix and\n" + "the proportion table are correctly formatted."); + + const std::size_t n_samples = alt_model.design.n_samples(); + + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open output file: " + outfile); + + file_progress progress{table_filename}; + + std::vector> bufs(max_lines, + std::vector(buf_size, 0)); + std::vector n_bytes(max_lines, 0); + std::vector lines(max_lines); + + std::vector alt_models(n_threads, alt_model); + std::vector null_models(n_threads, null_model); + + std::vector> chunks(n_threads); + + // Iterate over rows in the file and do llr test on proportions from + // each. Do this in sets of rows to avoid having to spawn too many threads. + while (true) { + std::uint32_t n_lines = 0; + while (n_lines < max_lines && std::getline(table_file, lines[n_lines])) + ++n_lines; + if (n_lines == 0) + break; + + get_chunk_bounds(n_lines, n_threads, chunks); + + std::vector threads; + for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { + threads.emplace_back([&, thread_id] { + const auto &[chunk_beg, chunk_end] = chunks[thread_id]; + auto &t_alt_model = alt_models[thread_id]; + auto &t_null_model = null_models[thread_id]; + for (auto b = chunk_beg; b != chunk_end; ++b) { + t_alt_model.props.parse(lines[b]); + if (t_alt_model.props_size() != n_samples) + throw std::runtime_error("found row with wrong number of columns"); + + const auto [p_val, status] = [&]() -> std::tuple { + // Skip the test if (1) no coverage in all cases or in all controls, + // or (2) the site is completely methylated or completely + // unmethylated across all samples. 
+ if (has_low_coverage(t_alt_model, test_factor_idx)) + return std::tuple{1.0, row_status::na_low_cov}; + + if (has_extreme_counts(t_alt_model)) + return std::tuple{1.0, row_status::na_extreme_cnt}; + + std::vector alternate_params; + fit_regression_model(t_alt_model, alternate_params); + t_null_model.props = t_alt_model.props; + + auto null_params = drop_idx(alternate_params, test_factor_idx); + + fit_regression_model(t_null_model, null_params); + const double p_value = + llr_test(t_null_model.max_loglik, t_alt_model.max_loglik); + + return (p_value != p_value) ? std::tuple{1.0, row_status::na} + : std::tuple{p_value, row_status::ok}; + }(); + + std::size_t n_reads_factor = 0; + std::size_t n_reads_others = 0; + std::size_t n_meth_factor = 0; + std::size_t n_meth_others = 0; + + const auto &mc = t_alt_model.props.mc; + const auto &vec = t_alt_model.design.tmatrix[test_factor_idx]; + for (std::size_t s = 0; s < n_samples; ++s) + if (vec[s] != 0) { + n_reads_factor += mc[s].n_reads; + n_meth_factor += mc[s].n_meth; + } + else { + n_reads_others += mc[s].n_reads; + n_meth_others += mc[s].n_meth; + } + + n_bytes[b] = [&] { + // clang-format off + const int n_prefix_bytes = std::sprintf(bufs[b].data(), prefix_fmt, + t_alt_model.props.chrom.data(), + t_alt_model.props.position, + t_alt_model.props.strand, + t_alt_model.props.context.data()); + // clang-format on + if (n_prefix_bytes < 0) + return n_prefix_bytes; + + auto cursor = bufs[b].data() + n_prefix_bytes; + + const int n_pval_bytes = [&] { + if (status == row_status::ok) + return std::sprintf(cursor, "%.6g", p_val); + if (!more_na_info || status == row_status::na) + return std::sprintf(cursor, "NA"); + if (status == row_status::na_extreme_cnt) + return std::sprintf(cursor, "NA_EXTREME_CNT"); + // if (status == row_status::na_low_cov) + return std::sprintf(cursor, "NA_LOW_COV"); + }(); + + if (n_pval_bytes < 0) + return n_pval_bytes; + + cursor += n_pval_bytes; + + // clang-format off + const int n_suffix_bytes = 
std::sprintf(cursor, suffix_fmt, + n_reads_factor, + n_meth_factor, + n_reads_others, + n_meth_others); + // clang-format on + if (n_suffix_bytes < 0) + return n_suffix_bytes; + + return n_prefix_bytes + n_pval_bytes + n_suffix_bytes; + }(); + } + }); + } + + for (auto &thread : threads) + thread.join(); + + for (auto i = 0u; i < n_lines; ++i) { + if (n_bytes[i] < 0) + throw std::runtime_error("failed to write to output buffer"); + out.write(bufs[i].data(), n_bytes[i]); + } + + if (show_progress) + progress(table_file); + + if (n_lines < n_threads) + break; + } +} /*********************************************************************** * Run beta-binoimial regression using the specified table with @@ -277,171 +375,88 @@ verify_multiple_levels(const Regression &full_regression, int main_radmeth(int argc, char *argv[]) { try { - static const string description = + static const std::string description = "calculate differential methylation scores"; - string outfile; - string test_factor_name; - bool VERBOSE = false; - bool more_na_info = false; + std::string outfile; + std::string test_factor; + bool verbose{false}; + bool show_progress{false}; + bool more_na_info{false}; + std::uint32_t n_threads{1}; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(strip_path(argv[0]), description, " "); opt_parse.set_show_defaults(); - opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, - outfile); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + opt_parse.add_opt("out", 'o', "output file", true, outfile); + opt_parse.add_opt("threads", 't', "number of threads to use", false, + n_threads); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); + opt_parse.add_opt("progress", '\0', "show progress", false, show_progress); opt_parse.add_opt( "na-info", 'n', "if a p-value is not calculated, print NAs in more detail: " "low count (NA_LOW_COV) extreme values (NA_EXTREME_CNT) or " 
"numerical errors in likelihood ratios (NA)", false, more_na_info); - opt_parse.add_opt("factor", 'f', "a factor to test", true, - test_factor_name); - - vector leftover_args; + opt_parse.add_opt("factor", 'f', "a factor to test", true, test_factor); + opt_parse.add_opt("tolerance", '\0', + "numerical tolerance to test convergence", false, + Regression::tolerance); + opt_parse.add_opt("max-iter", '\0', + "max iterations when estimating parameters", false, + Regression::max_iter); + + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << std::endl + << opt_parse.about_message() << std::endl; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << std::endl; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << std::endl; return EXIT_SUCCESS; } if (leftover_args.size() != 2) { - cerr << opt_parse.help_message() << endl; + std::cerr << opt_parse.help_message() << std::endl; return EXIT_SUCCESS; } - const string design_filename(leftover_args.front()); - const string table_filename(leftover_args.back()); + const std::string design_filename(leftover_args.front()); + const std::string table_filename(leftover_args.back()); /****************** END COMMAND LINE OPTIONS *****************/ - if (VERBOSE) cerr << "design table filename: " << design_filename << endl; - std::ifstream design_file(design_filename); - if (!design_file) - throw runtime_error("could not open file: " + design_filename); - // initialize full design matrix from file - Regression full_regression; - design_file >> full_regression.design; - - if (VERBOSE) cerr << full_regression.design << endl; + Regression orig_alt_model; 
+ read_design(verbose, design_filename, orig_alt_model.design); // Check that provided test factor name exists and find its index. - // Identify test factors with their indexes to simplify naming - auto test_fact_it = - find(begin(full_regression.design.factor_names), - end(full_regression.design.factor_names), test_factor_name); - - if (test_fact_it == end(full_regression.design.factor_names)) - throw runtime_error(test_factor_name + - " factor not part of design specification"); - - const size_t test_factor = - distance(begin(full_regression.design.factor_names), test_fact_it); + const auto test_factor_idx = + get_test_factor_idx(orig_alt_model, test_factor); // verify that the design includes more than one level for the // test factor - if (!verify_multiple_levels(full_regression, test_factor)) { - const auto first_level = full_regression.design.matrix[0][test_factor]; - throw runtime_error("test factor only one level: " + - test_factor_name + ", " + - std::to_string(first_level)); + if (!has_two_values(orig_alt_model, test_factor_idx)) { + const auto first_level = orig_alt_model.design.matrix[0][test_factor_idx]; + throw std::runtime_error("test factor only one level: " + test_factor + + ", " + std::to_string(first_level)); } - Regression null_regression; - null_regression.design = full_regression.design; - remove_factor(null_regression.design, test_factor); - // ADS: done setup for the model - - // ADS: open the data table file - std::ifstream table_file(table_filename); - if (!table_file) - throw runtime_error("could not open file: " + table_filename); - - // Make sure that the first line of the proportion table file contains - // names of the samples. Throw an exception if the names or their order - // in the proportion table does not match those in the full design matrix. 
- string sample_names_header; - getline(table_file, sample_names_header); - - if (!consistent_sample_names(full_regression, sample_names_header)) - throw runtime_error("header:\n" + sample_names_header + "\n" + - "does not match factor names or their order in the\n" - "design matrix. Check that the design matrix and\n" - "the proportion table are correctly formatted."); - - const size_t n_samples = full_regression.design.n_samples(); - - // ADS: now open the output file because each row is processed - // sequentially - std::ofstream of; - if (!outfile.empty()) of.open(outfile); - std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); - - // Performing the log-likelihood ratio test on proportions from each row - // of the proportion table. - while (table_file >> full_regression.props) { - if (full_regression.props.total.size() != n_samples) - throw runtime_error("found row with wrong number of columns"); - - size_t coverage_factor = 0, coverage_rest = 0, meth_factor = 0, - meth_rest = 0; - - for (size_t s = 0; s < n_samples; ++s) { - if (full_regression.design.matrix[s][test_factor] != 0) { - coverage_factor += full_regression.props.total[s]; - meth_factor += full_regression.props.meth[s]; - } - else { - coverage_rest += full_regression.props.total[s]; - meth_rest += full_regression.props.meth[s]; - } - } - - out << full_regression.props.chrom << "\t" - << full_regression.props.position << "\t" - << full_regression.props.strand << "\t" - << full_regression.props.context << "\t"; - - // Do not perform the test if there's no coverage in either all - // case or all control samples. Also do not test if the site is - // completely methylated or completely unmethylated across all - // samples. - if (has_low_coverage(full_regression, test_factor)) { - out << ((more_na_info) ? "NA_LOW_COV" : "NA"); - } - else if (has_extreme_counts(full_regression)) { - out << ((more_na_info) ? 
"NA_EXTREME_CNT" : "NA"); - } - else { - fit_regression_model(full_regression); - null_regression.props = full_regression.props; - fit_regression_model(null_regression); - const double p_value = loglikratio_test(null_regression.max_loglik, - full_regression.max_loglik); - - // If error occured in fitting (p-val = nan or -nan). - if (p_value != p_value) - out << "NA"; - else - out << p_value; - } - out << "\t" << coverage_factor << "\t" << meth_factor << "\t" - << coverage_rest << "\t" << meth_rest << endl; - } + Regression orig_null_model; + orig_null_model.design = orig_alt_model.design; + remove_factor(orig_null_model.design, test_factor_idx); + + radmeth(show_progress, more_na_info, n_threads, table_filename, outfile, + orig_alt_model, orig_null_model, test_factor_idx); } catch (const std::exception &e) { - cerr << "ERROR: " << e.what() << endl; - exit(EXIT_FAILURE); + std::cerr << "ERROR: " << e.what() << std::endl; + return EXIT_FAILURE; } return EXIT_SUCCESS; } diff --git a/src/radmeth/radmeth_model.cpp b/src/radmeth/radmeth_model.cpp new file mode 100644 index 00000000..faa4a8bb --- /dev/null +++ b/src/radmeth/radmeth_model.cpp @@ -0,0 +1,175 @@ +/* Copyright (C) 2025 Andrew D Smith + * + * Authors: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include "radmeth_model.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +double Regression::tolerance = 1e-3; +double Regression::stepsize = 0.001; +std::uint32_t Regression::max_iter = 250; + +void +SiteProportions::parse(const std::string &line) { + const auto first_ws = line.find_first_of(" \t"); + + // Parse the row name (must be like: "chr:position:strand:context") + bool failed = false; + + const auto label_end = line.data() + first_ws; + auto field_s = line.data(); + auto field_e = std::find(field_s + 1, label_end, ':'); + if (field_e == label_end) + failed = true; + + { + const std::uint32_t d = std::distance(field_s, field_e); + chrom = std::string{field_s, d}; + } + + field_s = field_e + 1; + field_e = std::find(field_s + 1, label_end, ':'); + failed = failed || (field_e == label_end); + + { + const auto [ptr, ec] = std::from_chars(field_s, field_e, position); + failed = failed || (ptr == field_s); + } + + field_s = field_e + 1; + field_e = std::find(field_s + 1, label_end, ':'); + failed = failed || (field_e != field_s + 1 || field_e == label_end); + + strand = *field_s; + failed = failed || (strand != '-' && strand != '+'); + + field_s = field_e + 1; + field_e = std::find_if(field_s + 1, label_end, + [](const auto x) { return x == ' ' || x == '\t'; }); + failed = failed || (field_e != label_end); + + { + const std::uint32_t d = std::distance(field_s, field_e); + context = std::string{field_s, d}; + } + + if (failed) + throw std::runtime_error("failed to parse label from:\n" + line); + + // Parse the counts of total reads and methylated reads + const auto is_sep = [](const auto x) { return std::isspace(x); }; + const auto not_sep = [](const auto x) { return std::isdigit(x); }; + + const auto line_end = line.data() + std::size(line); + mcounts mc1{}; + mc.clear(); + while (field_e != line_end) { + // get the total count + field_s = std::find_if(field_e + 1, line_end, not_sep); + field_e = 
std::find_if(field_s + 1, line_end, is_sep); + { + const auto [ptr, ec] = std::from_chars(field_s, field_e, mc1.n_reads); + failed = failed || (ptr != field_e); + } + + field_s = std::find_if(field_e + 1, line_end, not_sep); + field_e = std::find_if(field_s + 1, line_end, is_sep); + { + const auto [ptr, ec] = std::from_chars(field_s, field_e, mc1.n_meth); + failed = failed || (ptr != field_e); + } + + mc.push_back(mc1); + } + + if (failed) + throw std::runtime_error("failed to parse counts from:\n" + line); +} + +template +static void +transpose(const std::vector> &mat, + std::vector> &tmat) { + const auto n_row = std::size(mat); + const auto n_col = std::size(mat.front()); + tmat.resize(n_col, std::vector(n_row, 0.0)); + for (auto row_idx = 0u; row_idx < n_row; ++row_idx) + for (auto col_idx = 0u; col_idx < n_col; ++col_idx) + tmat[col_idx][row_idx] = mat[row_idx][col_idx]; +} + +std::istream & +operator>>(std::istream &is, Design &design) { + std::string header_encoding; + std::getline(is, header_encoding); + + std::istringstream header_is(header_encoding); + std::string header_name; + while (header_is >> header_name) + design.factor_names.push_back(header_name); + + std::string row; + while (std::getline(is, row)) { + if (row.empty()) + continue; + + std::istringstream row_is(row); + std::string token; + row_is >> token; + design.sample_names.push_back(token); + + std::vector matrix_row; + while (row_is >> token) { + if (std::size(token) != 1 || (token != "0" && token != "1")) + throw std::runtime_error("Must use binary factor levels:\n" + row); + matrix_row.push_back(token == "1"); + } + + if (std::size(matrix_row) != design.n_factors()) + throw std::runtime_error( + "each row must have as many columns as factors:\n" + row); + + design.matrix.push_back(matrix_row); + } + + transpose(design.matrix, design.tmatrix); + + return is; +} + +std::ostream & +operator<<(std::ostream &out, const Design &design) { + for (std::size_t factor = 0; factor < 
design.factor_names.size(); ++factor) { + if (factor != 0) + out << '\t'; + out << design.factor_names[factor]; + } + out << '\n'; + + for (std::size_t i = 0; i < design.n_samples(); ++i) { + out << design.sample_names[i]; + for (std::size_t j = 0; j < design.n_factors(); ++j) + out << "\t" << design.matrix[i][j]; + out << "\n"; + } + return out; +} diff --git a/src/radmeth/radmeth_model.hpp b/src/radmeth/radmeth_model.hpp index 0402a269..d7a01d63 100644 --- a/src/radmeth/radmeth_model.hpp +++ b/src/radmeth/radmeth_model.hpp @@ -19,32 +19,78 @@ #ifndef RADMETH_MODEL_HPP #define RADMETH_MODEL_HPP +#include +#include +#include #include #include struct Design { std::vector factor_names; std::vector sample_names; - std::vector > matrix; - size_t n_factors() const {return factor_names.size();} - size_t n_samples() const {return sample_names.size();} + std::vector> matrix; + std::vector> tmatrix; + std::size_t + n_factors() const { + return factor_names.size(); + } + std::size_t + n_samples() const { + return sample_names.size(); + } }; +std::istream & +operator>>(std::istream &is, Design &design); + +std::ostream & +operator<<(std::ostream &os, const Design &design); + +struct mcounts { + std::uint32_t n_reads{}; + std::uint32_t n_meth{}; +}; + +[[nodiscard]] inline std::istream & +operator>>(std::istream &in, mcounts &rm) { + return in >> rm.n_reads >> rm.n_meth; +} + struct SiteProportions { std::string chrom; - size_t position; - std::string strand; + std::size_t position{}; + char strand{}; std::string context; - std::vector total; - std::vector meth; + std::vector mc; + + void + parse(const std::string &line); }; struct Regression { + static double tolerance; // 1e-4; + static double stepsize; // 0.001; + static std::uint32_t max_iter; // 700; + Design design; SiteProportions props; - double max_loglik; - size_t n_factors() const {return design.n_factors();} - size_t n_samples() const {return design.n_samples();} + double max_loglik{}; + std::vector p_v; // scratch 
space + + [[nodiscard]] std::size_t + n_factors() const { + return design.n_factors(); + } + + [[nodiscard]] std::size_t + props_size() const { + return std::size(props.mc); + } + + [[nodiscard]] std::size_t + n_samples() const { + return design.n_samples(); + } }; #endif diff --git a/src/radmeth/radmeth_optimize.cpp b/src/radmeth/radmeth_optimize.cpp index b1942328..8affe87e 100644 --- a/src/radmeth/radmeth_optimize.cpp +++ b/src/radmeth/radmeth_optimize.cpp @@ -16,184 +16,221 @@ * General Public License for more details. */ +#include "radmeth_optimize.hpp" + #include "radmeth_model.hpp" #include #include +#include +#include +#include #include #include -using std::vector; -using std::runtime_error; - -static double -pi(Regression *reg, size_t sample, const gsl_vector *params) { +template +[[nodiscard]] static double +pi(const std::vector &v, const gsl_vector *params) { // ADS: this function doesn't have a very helpful name - double dot_prod = 0; - for (size_t fact = 0; fact < reg->design.n_factors(); ++fact) - dot_prod += reg->design.matrix[sample][fact]*gsl_vector_get(params, fact); - - double p = exp(dot_prod)/(1 + exp(dot_prod)); - - return p; + const auto a = v.data(); + const double dot = std::inner_product(a, a + std::size(v), params->data, 0.0); + // const double p = std::exp(dot) / (1.0 + std::exp(dot)); + return 1.0 / (1.0 / std::exp(dot) + 1.0); } - -static double +[[nodiscard]] static double neg_loglik(const gsl_vector *params, void *object) { Regression *reg = (Regression *)(object); - const size_t n_params = reg->design.factor_names.size() + 1; - - double log_lik = 0; // ADS: the dispersion parameter phi is the last element of // parameter vector - const double disp_param = gsl_vector_get(params, n_params - 1); - const double phi = exp(disp_param)/(1.0 + exp(disp_param)); + const double disp_param = gsl_vector_get(params, reg->design.n_factors()); + // const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); + const double phi = 1.0 / 
(1.0 / std::exp(disp_param) + 1.0); + const double one_minus_phi = 1.0 - phi; - for(size_t s = 0; s < reg->design.n_samples(); ++s) { - const double n_s = reg->props.total[s]; - const double y_s = reg->props.meth[s]; - const double p_s = pi(reg, s, params); + const auto &mc = reg->props.mc; + const auto &mat = reg->design.matrix; - for (int k = 0; k < y_s; ++k) - log_lik += log((1 - phi)*p_s + phi*k); - - for (int k = 0; k < n_s - y_s; ++k) - log_lik += log((1 - phi)*(1 - p_s) + phi*k); - - for (int k = 0; k < n_s; ++k) - log_lik -= log(1.0 + phi*(k - 1)); + double log_lik = 0; + const std::size_t n_samples = reg->design.n_samples(); + for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { + const auto n = mc[col_idx].n_reads; + const auto y = mc[col_idx].n_meth; + const double p = pi(mat[col_idx], params); + const double one_minus_p = 1.0 - p; + + const double term1 = one_minus_phi * p; + for (auto k = 0u; k < y; ++k) + log_lik += std::log(term1 + phi * k); + + const double term2 = one_minus_phi * one_minus_p; + for (auto k = 0u; k < n - y; ++k) + log_lik += std::log(term2 + phi * k); + + for (auto k = 0u; k < n; ++k) + // log_lik -= std::log(1.0 + phi * (k - 1.0)); + log_lik -= std::log1p(phi * (k - 1.0)); } - return -log_lik; } - static void -neg_gradient(const gsl_vector *params, void *object, - gsl_vector *output) { - +neg_gradient(const gsl_vector *params, void *object, gsl_vector *output) { Regression *reg = (Regression *)(object); - const size_t n_params = reg->design.n_factors() + 1; + const std::size_t n_samples = reg->design.n_samples(); + const std::size_t n_factors = reg->design.n_factors(); - const double disp_param = gsl_vector_get(params, n_params - 1); + /// ADS: using scratch space held in reg instead of allocating here + // std::vector p_v(n_samples, 0.0); + auto &p_v = reg->p_v; + for (auto col_idx = 0u; col_idx < n_samples; ++col_idx) + p_v[col_idx] = pi(reg->design.matrix[col_idx], params); - const double phi = exp(disp_param)/(1 + 
exp(disp_param)); + const double disp_param = gsl_vector_get(params, n_factors); - for(size_t f = 0; f < n_params; ++f) { + const auto &tmat = reg->design.tmatrix; + // const double phi = std::exp(disp_param) / (1.0 + std::exp(disp_param)); + const double phi = 1.0 / (1.0 / std::exp(disp_param) + 1.0); + const double one_minus_phi = 1.0 - phi; - double deriv = 0; + const auto &mc = reg->props.mc; - for(size_t s = 0; s < reg->design.n_samples(); ++s) { - int n_s = reg->props.total[s]; - int y_s = reg->props.meth[s]; - double p_s = pi(reg, s, params); + for (std::size_t factor_idx = 0; factor_idx < n_factors; ++factor_idx) { + double deriv = 0; + const auto &vec = tmat[factor_idx]; + for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { + const auto n = mc[col_idx].n_reads; + const auto y = mc[col_idx].n_meth; + const double p = p_v[col_idx]; + const double one_minus_p = 1.0 - p; + + const double denom_term1 = one_minus_phi * p; + const double factor = denom_term1 * one_minus_p * vec[col_idx]; + if (factor == 0) + continue; double term = 0; - //a parameter linked to p - if(f < reg->design.factor_names.size()) { - double factor = (1 - phi)*p_s*(1 - p_s)*reg->design.matrix[s][f]; - if (factor == 0) continue; - - for(int k = 0; k < y_s; ++k) - term += 1/((1 - phi)*p_s + phi*k); - - for(int k = 0; k < n_s - y_s; ++k) - term -= 1/((1 - phi)*(1 - p_s) + phi*k); + const auto accum_term = [&](auto k, const auto lim, + const double denom_base) { + for (; k < lim; ++k) + term += 1.0 / (denom_base + phi * k); + }; - deriv += term*factor; - } else { // the parameter linked to phi - for(int k = 0; k < y_s; ++k) - term += (k - p_s)/((1 - phi)*p_s + phi*k); + const auto accum_subtract_term = [&](auto k, const auto lim, + const double denom_base) { + for (; k < lim; ++k) + term -= 1.0 / (denom_base + phi * k); + }; - for(int k = 0; k < n_s - y_s; ++k) - term += (k - (1 - p_s))/((1 - phi)*(1 - p_s) + phi*k); + accum_term(0u, y, denom_term1); + accum_subtract_term(0u, n - y, 
one_minus_phi * one_minus_p); - for(int k = 0; k < n_s; ++k) { - term -= (k - 1)/(1 + phi*(k - 1)); - } - - deriv += term * phi * (1 - phi); - } + deriv += term * factor; } - - gsl_vector_set(output, f, deriv); + gsl_vector_set(output, factor_idx, deriv); } + double deriv = 0; + for (std::size_t col_idx = 0; col_idx < n_samples; ++col_idx) { + const auto n = mc[col_idx].n_reads; + const auto y = mc[col_idx].n_meth; + const double p = p_v[col_idx]; + const double one_minus_p = 1.0 - p; + + double term = 0; + const auto accum_term = [&](auto k, const auto lim, const double num_shift, + const double denom_base) { + for (; k < lim; ++k) + term += (k - num_shift) / (denom_base + phi * k); + }; + const auto accum_subtract_term = [&](auto k, const auto lim) { + for (; k < lim; ++k) + term -= (k - 1.0) / (1.0 + phi * (k - 1.0)); + }; + + accum_term(0u, y, p, one_minus_phi * p); + accum_term(0u, n - y, one_minus_p, one_minus_phi * one_minus_p); + accum_subtract_term(0u, n); + + deriv += term * phi * one_minus_phi; + } + gsl_vector_set(output, n_factors, deriv); gsl_vector_scale(output, -1.0); } static void -neg_loglik_and_grad(const gsl_vector *params, - void *object, - double *loglik_val, +neg_loglik_and_grad(const gsl_vector *params, void *object, double *loglik_val, gsl_vector *d_loglik_val) { - *loglik_val = neg_loglik(params, object); neg_gradient(params, object, d_loglik_val); } - bool -fit_regression_model(Regression &r, vector &initial_params) { +fit_regression_model(Regression &r, std::vector ¶ms_init) { + const auto stepsize = Regression::stepsize; + const auto max_iter = Regression::max_iter; // one more than the number of factors - const size_t n_params = r.n_factors() + 1; - if (initial_params.empty()) { - initial_params.resize(n_params, 0.0); - initial_params.back() = -2.5; + const std::size_t n_params = r.n_factors() + 1; + if (params_init.empty()) { + params_init.resize(n_params, 0.0); + params_init.back() = -2.5; } - if (initial_params.size() != n_params) - 
throw runtime_error("Wrong number of initial parameters."); + r.p_v.resize(r.n_samples()); - int status = 0; + const double tolerance = + std::sqrt(n_params) * r.n_samples() * Regression::tolerance; - size_t iter = 0; + if (params_init.size() != n_params) + throw std::runtime_error("Wrong number of initial parameters."); - { - gsl_multimin_function_fdf loglik_bundle; - loglik_bundle.f = &neg_loglik; - loglik_bundle.df = &neg_gradient; - loglik_bundle.fdf = &neg_loglik_and_grad; - loglik_bundle.n = n_params; - loglik_bundle.params = (void *)&r; + // clang-format off + gsl_multimin_function_fdf loglik_bundle = { + &neg_loglik, // objective function + &neg_gradient, // gradient + &neg_loglik_and_grad, // combined obj and grad + n_params, // number of model params + static_cast<void *>(&r) // parameters for objective and gradient functions + }; + // clang-format on - gsl_vector *params = gsl_vector_alloc(n_params); + // Alternatives: + // - gsl_multimin_fdfminimizer_conjugate_pr + // - gsl_multimin_fdfminimizer_conjugate_fr + // - gsl_multimin_fdfminimizer_vector_bfgs2 + // - gsl_multimin_fdfminimizer_steepest_descent + const gsl_multimin_fdfminimizer_type *minimizer = + gsl_multimin_fdfminimizer_conjugate_fr; - for (size_t param = 0; param < initial_params.size(); ++param) - gsl_vector_set(params, param, initial_params[param]); + gsl_multimin_fdfminimizer *s = + gsl_multimin_fdfminimizer_alloc(minimizer, n_params); - //can also try gsl_multimin_fdfminimizer_conjugate_pr; - const gsl_multimin_fdfminimizer_type *T = gsl_multimin_fdfminimizer_conjugate_fr; + gsl_vector *params = gsl_vector_alloc(n_params); + for (std::size_t i = 0; i < n_params; ++i) + gsl_vector_set(params, i, params_init[i]); - gsl_multimin_fdfminimizer *s = gsl_multimin_fdfminimizer_alloc (T, n_params); + gsl_multimin_fdfminimizer_set(s, &loglik_bundle, params, stepsize, tolerance); - // ADS: 0.001 is the step size and 1e-4 is the tolerance - // get the minimizer - gsl_multimin_fdfminimizer_set(s, 
&loglik_bundle, params, 0.001, 1e-4); + int status = 0; + std::size_t iter = 0; - do { - iter++; - // do one iteration and get the status - status = gsl_multimin_fdfminimizer_iterate(s); - if (status) - break; - // check the status based on gradient - status = gsl_multimin_test_gradient (s->gradient, 1e-4); - } - while (status == GSL_CONTINUE && iter < 700); - // It it reasonable to reduce the number of iterations to 500? - // ADS: 700 vs. 500? what's the difference? + do { + status = gsl_multimin_fdfminimizer_iterate(s); // one iter and get status + if (status) + break; + // check status from gradient + status = gsl_multimin_test_gradient(s->gradient, tolerance); + } while (status == GSL_CONTINUE && ++iter < max_iter); + // ADS: What is a reasonable number of iterations? - r.max_loglik = (-1)*neg_loglik(s->x, &r); + r.max_loglik = -1.0 * neg_loglik(s->x, &r); - gsl_multimin_fdfminimizer_free(s); - gsl_vector_free(params); - } + gsl_multimin_fdfminimizer_free(s); + gsl_vector_free(params); return status == GSL_SUCCESS; } diff --git a/src/radmeth/radmeth_optimize.hpp b/src/radmeth/radmeth_optimize.hpp index 2bf115de..16f4a061 100644 --- a/src/radmeth/radmeth_optimize.hpp +++ b/src/radmeth/radmeth_optimize.hpp @@ -19,10 +19,11 @@ #ifndef RADMETH_OPTIMIZE_HPP #define RADMETH_OPTIMIZE_HPP +#include <vector> + struct Regression; bool fit_regression_model(Regression &r, std::vector<double> &initial_params); - #endif diff --git a/src/utils/selectsites.cpp b/src/utils/selectsites.cpp index 43b288a7..b0389ac4 100644 --- a/src/utils/selectsites.cpp +++ b/src/utils/selectsites.cpp @@ -294,6 +294,7 @@ main_selectsites(int argc, char *argv[]) { bool VERBOSE = false; bool keep_file_on_disk = false; bool compress_output = false; + bool allow_extra_fields = false; std::string outfile("-"); std::string summary_file; @@ -315,6 +316,8 @@ main_selectsites(int argc, char *argv[]) { opt_parse.add_opt("summary", 'S', "write summary to this file", false, summary_file); opt_parse.add_opt("zip", 'z', 
"output gzip format", false, compress_output); + opt_parse.add_opt("relaxed", '\0', "input has extra fields", false, + allow_extra_fields); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); opt_parse.set_show_defaults(); std::vector leftover_args; @@ -340,6 +343,8 @@ main_selectsites(int argc, char *argv[]) { const std::string sites_file = leftover_args.back(); /****************** END COMMAND LINE OPTIONS *****************/ + MSite::no_extra_fields = (allow_extra_fields == false); + selectsites_summary summary; summary.command_line = get_command_line(argc, argv);