diff --git a/chipathlon/conf.py b/chipathlon/conf.py index 31363ea6284461cf82516a19aa6c4b930445d33b..e327dd234255129f3e9fb09cfb4c6ee1d7bf42e7 100644 --- a/chipathlon/conf.py +++ b/chipathlon/conf.py @@ -40,6 +40,31 @@ peak_tools = [ "music" ] +executables = [ + "bedtools", + "samtools", + "idr", + "picard", + "bwa", + "bowtie2", + "macs2", + "gem", + "peakranger", + "MUSIC", + "CCAT", + "run_spp_nodups", + "chip-job-cat-peak", + "chip-job-ccat-format-bed", + "chip-job-chr-convert", + "chip-job-save-result", + "chip-job-download-encode", + "chip-job-download-gridfs", + "chip-job-music", + "chip-job-peakranger-format", + "chip-job-sort-peak", + "chip-job-zcat-peak" +] + # Peak_type validation peak_types = { "spp": ["narrow", "broad"], @@ -152,3 +177,12 @@ genomes = { "additional_files": file_extensions["bowtie2_genome"] } } + +config_file = { + "required_keys": [ + "chipathlon_bin", "idr_bin", "pegasus_home", "email" + ], + "optional_keys": [ + "arch", "os" + ] +} diff --git a/chipathlon/jobs/params/cat_awk_sort_peaks.yaml b/chipathlon/jobs/params/cat_awk_sort_peaks.yaml index 5d2f965e8e4a82275bc2a809c3ccc1b48f711b5f..b61934924c5167b86903a3d802c6fdacfc31b66c 100644 --- a/chipathlon/jobs/params/cat_awk_sort_peaks.yaml +++ b/chipathlon/jobs/params/cat_awk_sort_peaks.yaml @@ -8,7 +8,7 @@ cat_awk_sort_peaks: - name: sorted_result type: file file_type: bed - command: cat_spp.sh + command: chip-job-cat-peak arguments: - "$inputs.0": type: file diff --git a/chipathlon/jobs/params/ccat_callpeak.yaml b/chipathlon/jobs/params/ccat_callpeak.yaml index 8c8a39ffbe5fa0a5de2da3df3e3b36a5b62456dc..6bb20b42975d7af5b6b3254b198216df49f0607a 100644 --- a/chipathlon/jobs/params/ccat_callpeak.yaml +++ b/chipathlon/jobs/params/ccat_callpeak.yaml @@ -35,7 +35,7 @@ ccat_callpeak: - name: ccat_log type: stdout file_type: log - command: ccat + command: CCAT arguments: - "$inputs.0": type: file diff --git a/chipathlon/jobs/params/ccat_format_bed.yaml b/chipathlon/jobs/params/ccat_format_bed.yaml index 925249a75dbe648ba5cede73af4c39ec1a83920a..7a461c36229f72a306f4804543a589ffc549b2ca 100644 --- a/chipathlon/jobs/params/ccat_format_bed.yaml +++ b/chipathlon/jobs/params/ccat_format_bed.yaml @@ -8,15 +8,15 @@ ccat_format_bed: - name: sorted_peaks type: file file_type: bed - command: ccat_format_bed.pl + command: chip-job-ccat-format-bed arguments: - - "-c": + - "--input": type: file changeable: false required: true has_value: true default: $inputs.0 - - "-b": + - "--output": type: file changeable: false required: true diff --git a/chipathlon/jobs/params/chr_locus_convert.yaml b/chipathlon/jobs/params/chr_locus_convert.yaml index dc13307bdb2ff656571a71e5c5914636d392423f..45e5721583f5c24f00851047d3c16e049e42e417 100644 --- a/chipathlon/jobs/params/chr_locus_convert.yaml +++ b/chipathlon/jobs/params/chr_locus_convert.yaml @@ -8,7 +8,7 @@ chr_locus_convert: - name: chr_bed type: file file_type: bed - command: chr_locus_convert.py + command: chip-job-chr-convert arguments: - "-b": type: file diff --git a/chipathlon/jobs/params/db_save_result.yaml b/chipathlon/jobs/params/db_save_result.yaml index be166a7e501a5dad56f90ecf924d0f9ed1e21d40..54f36ed3f9d37145eef9df222cf4539b19160216 100644 --- a/chipathlon/jobs/params/db_save_result.yaml +++ b/chipathlon/jobs/params/db_save_result.yaml @@ -2,7 +2,7 @@ db_save_result: inputs: - name: username type: string - + - name: password type: string @@ -18,7 +18,7 @@ db_save_result: file_type: yaml additional_inputs: null outputs: null - command: db_save_result.py + command: chip-job-save-result arguments: - "-u": type: string diff --git a/chipathlon/jobs/params/download_from_encode.yaml b/chipathlon/jobs/params/download_from_encode.yaml index be8311a64e2f7c3d23c877cef611cf761a59f210..0f8dc6bed9543e912ac009633a6b99c48a4d8f9a 100644 --- a/chipathlon/jobs/params/download_from_encode.yaml +++ b/chipathlon/jobs/params/download_from_encode.yaml @@ -10,7 +10,7 @@ download_from_encode: - name: downloaded_file type: file file_type: any - command: download_from_encode.py + command: chip-job-download-encode arguments: - "-u": type: string diff --git a/chipathlon/jobs/params/download_from_gridfs.yaml b/chipathlon/jobs/params/download_from_gridfs.yaml index 1c25a44d27d0212e333735175ebf8bd2bdd5c22c..548a7209d6a151ffe193ab37bac1bbb2c8e8073e 100644 --- a/chipathlon/jobs/params/download_from_gridfs.yaml +++ b/chipathlon/jobs/params/download_from_gridfs.yaml @@ -16,7 +16,7 @@ download_from_gridfs: - name: downloaded_result type: file file_type: any - command: download_from_gridfs.py + command: chip-job-download-gridfs arguments: - "-H": type: string diff --git a/chipathlon/jobs/params/music_broad.yaml b/chipathlon/jobs/params/music_broad.yaml index b30eec6dab92ac12b3b7adc6c56985f229ae0133..9d041862f2d80cad44abc5e5e9726456f1f269cd 100644 --- a/chipathlon/jobs/params/music_broad.yaml +++ b/chipathlon/jobs/params/music_broad.yaml @@ -42,7 +42,7 @@ music_broad: - name: scale_16626.bed type: file file_type: bed - command: music + command: chip-job-music arguments: - "--prefix": type: string diff --git a/chipathlon/jobs/params/music_narrow.yaml b/chipathlon/jobs/params/music_narrow.yaml index de7b8cc7c32238c657b5400c75920b54b397e927..e61122fcc54f4b8b2852089c35e1ff0e427474cd 100644 --- a/chipathlon/jobs/params/music_narrow.yaml +++ b/chipathlon/jobs/params/music_narrow.yaml @@ -26,7 +26,7 @@ music_narrow: - name: scale_291.bed type: file file_type: bed - command: music + command: chip-job-music arguments: - "--prefix": type: string diff --git a/chipathlon/jobs/params/music_punctate.yaml b/chipathlon/jobs/params/music_punctate.yaml index b3a6cbcbd05f4f64c9857ae29e41427b64a5887c..7930802abb559b3b9e093252f3ca66c6bc25ea19 100644 --- a/chipathlon/jobs/params/music_punctate.yaml +++ b/chipathlon/jobs/params/music_punctate.yaml @@ -46,7 +46,7 @@ music_punctate: - name: scale_2216.bed type: file file_type: bed - command: music + command: MUSIC arguments: - "--prefix": type: string diff --git a/chipathlon/jobs/params/peakranger_format_bed.yaml b/chipathlon/jobs/params/peakranger_format_bed.yaml index 734afb9f1570777ddfe2ee1f186eb159fca54bd9..2c78772898eaefa4ccd450e160ebe20c58a14185 100644 --- a/chipathlon/jobs/params/peakranger_format_bed.yaml +++ b/chipathlon/jobs/params/peakranger_format_bed.yaml @@ -8,15 +8,15 @@ peakranger_format_bed: - name: sorted_peaks type: file file_type: bed - command: peakranger_format_bed.pl + command: chip-job-peakranger-format arguments: - - "-c": + - "--input": type: file changeable: false required: true has_value: true default: $inputs.0 - - "-b": + - "--output": type: file changeable: false required: true diff --git a/chipathlon/jobs/params/sort_awk_sort_peaks.yaml b/chipathlon/jobs/params/sort_awk_sort_peaks.yaml index 0b8608df86af1d07900186357a78b9df0ec09957..10f712040526b4bde16395d7dc407989793c0f3f 100644 --- a/chipathlon/jobs/params/sort_awk_sort_peaks.yaml +++ b/chipathlon/jobs/params/sort_awk_sort_peaks.yaml @@ -8,7 +8,7 @@ sort_awk_sort_peaks: - name: sorted_peaks type: file file_type: bed - command: sort_peak.sh + command: chip-job-sort-peak arguments: - "$inputs.0": type: file diff --git a/chipathlon/jobs/params/spp_nodups_broad.yaml b/chipathlon/jobs/params/spp_nodups_broad.yaml index ae2baa4b1712c8ba236da44d86c1190d32783d2d..b31b1c238b6a13d1b0475bc5441b8b77a2939c43 100644 --- a/chipathlon/jobs/params/spp_nodups_broad.yaml +++ b/chipathlon/jobs/params/spp_nodups_broad.yaml @@ -20,7 +20,7 @@ spp_nodups_broad: - name: score type: file file_type: ccscore - command: spp_nodups + command: run_spp_nodups arguments: - "-c=": type: file diff --git a/chipathlon/jobs/params/spp_nodups_narrow.yaml b/chipathlon/jobs/params/spp_nodups_narrow.yaml index 1e8e657e754a67a17d65abd72feb17835e38ca49..6808e5c2524e181f67934e153d65788676696968 100644 --- a/chipathlon/jobs/params/spp_nodups_narrow.yaml +++ b/chipathlon/jobs/params/spp_nodups_narrow.yaml @@ -20,7 +20,7 @@ spp_nodups_narrow: - name: score type: file file_type: ccscore - command: spp_nodups + command: run_spp_nodups arguments: - "-c=": type: file diff --git a/chipathlon/jobs/params/zcat_awk_sort_peaks.yaml b/chipathlon/jobs/params/zcat_awk_sort_peaks.yaml index 925353fb9c0cb52f20697e921d4138c5261bf6eb..11d77867a9164cd9d4f7ae84f4afeda7380f3c5a 100644 --- a/chipathlon/jobs/params/zcat_awk_sort_peaks.yaml +++ b/chipathlon/jobs/params/zcat_awk_sort_peaks.yaml @@ -8,7 +8,7 @@ zcat_awk_sort_peaks: - name: sorted_peaks type: file file_type: bed - command: zcat_spp.sh + command: chip-job-zcat-peak arguments: - "$inputs.0": type: file diff --git a/chipathlon/jobs/scripts/V3_csaw_rscript_yaml.R b/chipathlon/jobs/scripts/V3_csaw_rscript_yaml.R deleted file mode 100755 index 468e355f34c875de3b514ee85c2e437f5cbe57bd..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/scripts/V3_csaw_rscript_yaml.R +++ /dev/null @@ -1,200 +0,0 @@ -#args <- getArgs(defaults=list(workdir=NULL, exper.bams=NULL,control.bams="", -# analysisID=NULL, exper.dir="./", control.dir="./", -# tf=NULL,cond1=NULL,cond2=NULL, -# orgn=NULL, samples=NULL, contrast=NULL, -# bin.win=10000, blacklist.regions="", -# FDR=0.05, frag.len=10, min.quality=30, paired=FALSE, -# spacing=50, tol=100, win.width=10)) - -#print(args) -commandArgs(trailingOnly = FALSE) -yamlfile <- '~/Documents/R/csaw_test.yaml' -library(yaml) -cat("yamlfile: ", yamlfile) -pars <- yaml.load_file(yamlfile) -names(pars) -# partly based on https://www.bioconductor.org/help/course-materials/2015/BioC2015/csaw_lab.html -# an example call: -# where experiments came from GSE25532: -# A. K. Tiwari, M. B. Stadler, C. Wirbelauer, R. Paro, D. Schubeler, and C. Beisel. -# A chromatin-modifying function of JNK during stem cell dfferentiation. -# Nat. Genet., 44(1): 94-100, Jan 2012 - -#yaml.load_file( foo.yml ) - -outfileStem <- paste0(c(pars$analysisID, pars$tf, pars$cond1.cell, pars$cond2.cell, pars$orgn), collapse="_") -outfileSimes<-paste0(c(outfileStem,'Simes.txt'), collapse="_") -outfileBestTest<-paste0(c(outfileStem,'bestTest.txt'), collapse="_") -cat( "Simes outfile: ", outfileSimes, " bestTestoutfile: ", outfileBestTest, "\n" ) - -source("https://bioconductor.org/biocLite.R") -setwd(pars$workdir) - -bam.files <- file.path(pars$exper.dir, pars$exper.bams) -cat("bam.files:\n") -bam.files - -if ( length(pars$control.bams) > 1){ - control.bams <- file.path(pars$control.dir, pars$control.bams) - cat("Control (input) bam(s): ", control.bams, "\n" ) -} -samples <- pars$samples -cat("Samples: ", samples, "\n" ) -contrasts<-pars$contrast -cat("Contrasts: ", contrasts, "\n" ) -# the blacklisted genomic regions -library(GenomicRanges) -if ( length(pars$blacklist.regions) > 1){ - blacklist.regions <- pars$blacklist.regions -} else { - if ( pars$orgn == 'human'){ - - } else { - if ( pars$orgn == 'mouse'){ - - } else { - cat("Organism: ", pars$orgn, "not implemented yet, only human and mouse\n") - quit() - } - } -} - -library(csaw) -library(DESeq2) -require(edgeR) -param <- readParam(minq=pars$min.quality) -#Reads are directionally extended to the average fragment length, -#to represent the original fragments that were present during immunoprecipitation. -#Windows of constant size (10 bp) are tiled across the genome at regular intervals (50 bp). -if (pars$paired==TRUE){ - pe.param <- readParam(max.frag=400, pe="both") - if ( length(pars$control.bams) > 1){ - d1 <- windowCounts( c(bam.files, control.bams), bin=TRUE, width=10000, - ext=pars$frag.len, width=pars$win.width, param=pe.param) - } else { # no input bams - d1 <- windowCounts( bam.files, ext=pars$frag.len, width=pars$win.width, param=pe.param) - } -} else { #single reads - if ( length(pars$control.bams) > 1){ - d1 <- windowCounts( c(bam.files, control.bams), bin=TRUE, width=10000, - ext=pars$frag.len, width=pars$win.width, param=param) - } else { # no input bams - d1 <- windowCounts(bam.files, ext=pars$frag.len, width=pars$win.width, param=param) - } -} -if ( length(pars$control.bams) > 1){ - chip <- d1[,1:length(bam.files)] - control <- d1[,((length(bam.files) + 1):(length(bam.files) + length(control.bams)))] - filter.stat <- filterWindows(chip, control, type="control", prior.count=5, - norm.fac=list(chip, control)) - data <-filter.stat$filter > log2(3) -} else { # no input bams - abundances <- aveLogCPM(asDGEList(d1)) - print("aveLogCPM abundances") - summary(abundances) - keep.simple <- abundances > aveLogCPM(5, lib.size=mean(d1$totals)) - data <- d1[keep.simple, ] - summary(keep.simple) -} -# here implement the blacklist filter?? - -# In edgeR, the log-transformed overall NB mean is referred to as the average abundance. -# This is computed with the aveLogCPM function, as shown below for each region - -data$totals - -#max.delay <- 500 -dedup.on <- readParam(dedup=TRUE, minq=pars$min.quality) -#x <- correlateReads(bam.files, $pars.max.delay, param=dedup.on) -#plot(0:$pars.max.delay, x, type="l", ylab="CCF", xlab="Delay (bp)") - -normfacs <- normalize(data) -normfacs -y <- asDGEList(data, norm.factors=normfacs) -cat("nrow(data: ") -nrow(data) -grouping <- factor(samples) -design <- model.matrix(~0 + grouping) -colnames(design) <- levels(grouping) -y <- asDGEList(data, norm.factors=normfacs) -y <- estimateDisp(y, design) -# biocLite("statmod") -require(statmod) -#install.packages("dplyr") -library(dplyr) -#install.packages("gridGraphics") -#library(gridGraphics) -fit <- glmQLFit(y, design, robust=TRUE) -#contrast <- makeContrasts(es - tn, levels=design) -cstring <- paste0(contrasts, collapse=' - ') -contrast <- makeContrasts(cstring, levels=design) - -#QL F-test [Lund et al., 2012] -results <- glmQLFTest(fit, contrast=contrast) - -merged <- mergeWindows(rowRanges(data), tol=pars$tol, max.width=pars$bin.win) - -# We demand replicates! -#norep.fit <- glmFit(y, design, dispersion=0.05) -#norep.results <- glmLRT(norep.fit, contrast=contrast) - -#bin.adjc <- cpm(asDGEList(binned.2), log=TRUE) -#plotMDS(bin.adjc, labels=grouping) -#plotBCV(y) -#plotQLDisp(fit) - -clustered <- mergeWindows(rowRanges(data), tol=1000) -clustered$region -cat("entries in clustered: ") -length(unique(clustered$id)) - -# first method: Simes method -# to cluster adjacent windows into regions. A combined p-value can then be computed -# for each cluster, based on the p-values of the constituent windows [Simes, 1986]. -# This tests the joint null hypothesis for each cluster, i.e., that -# no enrichment is observed across any sites within the corresponding region. -# The combined p-values are then adjusted using the BH method to control the region-level FDR. - -tabcom <- combineTests(clustered$id, results$table) -print("Simes method") -cat("entries in tabcom: ") -length(tabcom$nWindows) -head(tabcom) -#is.sig.enrich <- select(tabcom, logFC.up > logFC.down & FDR<= pars$FDR) -#cat("Significantly enriched regions: ", summary(is.sig.enrich)) -#is.sig.depleted <- select(tabcom, logFC.up < logFC.down & FDR<= pars$FDR) -#cat("Significantly depleted regions: ", summary(is.sig.depleted)) -#summary(is.sig.depleted) - -tabcom$chr <- seqnames(clustered$region)[as.numeric(row.names(tabcom))] -tabcom$start <- start(clustered$region)[as.numeric(row.names(tabcom))] -tabcom$stop <- end(clustered$region)[as.numeric(row.names(tabcom))] -#tabcom$start <- start(rowRanges(data))[tabcom$best] -#tabcom$stop <- end(rowRanges(data))[tabcom$best] -tabCombyPadj <- tabcom[with(tabcom, order(FDR)), ] -cat("outfileSimes: ", outfileSimes,"\n") -write.table(tabCombyPadj, file=outfileSimes, row.names=FALSE, quote=FALSE, sep="\t") - -# Alternative to Simes: getBestTest -# choose a single window to represent each cluster/region. -# For example, the window with the highest average abundance in each cluster can be used. -# This is sensible for analyses involving sharp binding events, where each cluster is expected -# to be small and contain no more than one binding site. Thus, a single window -# (and p-value) can reasonably be used as a representative of the entire region. -# The BH method can then be applied to the corresponding p-values -# of the representative windows from all clusters. - -clustered2 <- mergeWindows(rowRanges(data), tol=100, max.width=5000) -tab.best <- getBestTest(clustered$id, results$table) - -tab.best$chr <- seqnames(rowRanges(data))[tab.best$best] -tab.best$start <- start(rowRanges(data))[tab.best$best] -tab.best$stop <- end(rowRanges(data))[tab.best$best] - -resultsbypadj <- tab.best[with(tab.best, order(FDR)), ] -cat("outfileBestTest: ", outfileBestTest, "\n") -cat("entries in resultsbypadj: ") -length(resultsbypadj$best) -write.table(resultsbypadj, file=outfileBestTest, row.names=FALSE, quote=FALSE, sep="\t") - -quit() diff --git a/chipathlon/jobs/scripts/ccat_format_bed.pl b/chipathlon/jobs/scripts/ccat_format_bed.pl deleted file mode 100755 index e4cb1b17c5fbb263f6d41d7540dfc0a0d0fc6a21..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/scripts/ccat_format_bed.pl +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/perl -w - -=NAME - -=VERSION - -=head1 SYNOPSYS -From CCAT output format generates a decent, sorted BED file with no spaces -Author: Istvan (Steve) Ladunga, University of Nebraska-Lincoln, sladunga@unl.edu -=cut - -my $usage = << "EOF"; - -Usage: $0 - -c input CCAT result file (significant.peak) - -b output BEDfile - -EOF - -use lib qw ( /home/ladunga/sladunga/bin/ /home/ladunga/sladunga/LIB/ /home/ladunga/perllib/ ); -use NoStatUtils; -use strict; -use Carp; -use Getopt::Long; - -my %par = ( - ); -&GetOptions("c=s" => \$par{infile}, - "b=s" => \$par{outfile}); - - -($par{infile} && $par{outfile}) or die $usage; - -&go (\%par ); - -print STDERR "$0 done\n"; - -###################################################################### - -sub go { - my ( $par ) = @_; - my $n = 0; - open (IN, "<$par->{infile}") or die "Cannot open infile $par->{infile} $!\n"; - my $tmpFile = '/tmp/ccatFormat2Bed.'.$$; - open (OF, ">$tmpFile") or die "Cannot open outfile $tmpFile $!\n"; - - while(<IN>){ - chomp; - next if /^\#/; - next unless /\w/; -#chr1 16523865 16522790 16526550 184 14 73.063558 0.001000 - s/ +//g; # get rid of silly spaces - my @w = split; - my $name = 'ccat_'.$n; - my $start = ( $w[1] < $w[2] ) ? $w[1] : $w[2]; - my $end = ( $w[1] < $w[2] ) ? $w[2] : $w[1]; - my $s = join("\t", ($w[0], $start, $end, $name, $w[4], '.', $w[6], $w[7], 0.001 )); #, '.', $w[6])); - print OF "$s\n"; - $n++; - } - close (IN); - close (OF); - `sort -k1,1V -k2,2n -k3,3n $tmpFile > $par->{outfile}`; -} - -################################################################### - diff --git a/chipathlon/jobs/scripts/csaw_rscript.R b/chipathlon/jobs/scripts/csaw_rscript.R deleted file mode 100755 index 7807843baa73891d79c4b022366f262edb781856..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/scripts/csaw_rscript.R +++ /dev/null @@ -1,209 +0,0 @@ -library(argparse) - -parser <- ArgumentParser(description = "CSAW wrapper R script.") -parser$add_argument("--workdir", dest="workdir", required=TRUE, help="Working directory.") -parser$add_argument("--frag_len", dest="frag_len", default=20, help="Fragment Length.") -parser$add_argument("--window_width", dest="window_width", default=20, help="Default window width.") -parser$add_argument("--exp_bams", dest="exp_bams", nargs="+", required=TRUE, help="List of experiment bam files to use.") -parser$add_argument("--exp_dir", dest="exp_dir", required=TRUE, help="Path to experiment directory.") -parser$add_argument("--control_bams", dest="control_bams", nargs="+", required=TRUE, help="List of control bam files to use.") -parser$add_argument("--control_dir", dest="control_dir", required=TRUE, help="List of control dirs to use.") -parser$add_argument("--organism", dest="organism", required=TRUE, help="Organism name.") -parser$add_argument("--tf", dest="tf", required=TRUE, help="Transcription factor.") -parser$add_argument("--cond1", dest="cond1", required=TRUE, help="First condition.") -parser$add_argument("--cond2", dest="cond2", required=TRUE, help="Second condition.") -parser$add_argument("--samples", dest="samples", required=TRUE, nargs="+", help="?") -parser$add_argument("--contrast", dest="contrast", required=TRUE, nargs="+", help="?") -parser$add_argument("--id", dest="id", required=TRUE, help="?") -parser$add_argument("--fdr", dest="fdr", default=0.05, help="False Discovery Rate.") -parser$add_argument("--min_q", dest="min_q", default=30, help="Minimum quality score.") -parser$add_argument("--paired", dest="paired", default=FALSE, action="store_true", help="Paired end reads.") -parser$add_argument("--spacing", dest="spacing", default=50, help="?") -parser$add_argument("--tol", dest="tol", default=100, help="?") -args <- parser$parse_args() - -# partly based on https://www.bioconductor.org/help/course-materials/2015/BioC2015/csaw_lab.html -# an example call: -# where experiments came from GSE25532: -# A. K. Tiwari, M. B. Stadler, C. Wirbelauer, R. Paro, D. Schubeler, and C. Beisel. -# A chromatin-modifying function of JNK during stem cell dfferentiation. -# Nat. Genet., 44(1): 94-100, Jan 2012 - -#yaml.load_file( foo.yml ) - -outfileStem <- paste0(c(args$analysisID, args$tf, args$cond1.cell, args$cond2.cell, args$orgn), collapse="_") -outfileSimes<-paste0(c(outfileStem,'Simes.txt'), collapse="_") -outfileBestTest<-paste0(c(outfileStem,'bestTest.txt'), collapse="_") -cat( "Simes outfile: ", outfileSimes, " bestTestoutfile: ", outfileBestTest, "\n" ) - -source("https://bioconductor.org/biocLite.R") -setwd(args$workdir) - -bam.files <- file.path(args$exper.dir, args$exper.bams) -cat("bam.files:\n") -bam.files - -if ( length(args$control.bams) > 1){ - control.bams <- file.path(args$control.dir, args$control.bams) - cat("Control (input) bam(s): ", control.bams, "\n" ) -} -samples <- args$samples -cat("Samples: ", samples, "\n" ) -contrasts<-args$contrast -cat("Contrasts: ", contrasts, "\n" ) -# the blacklisted genomic regions -library(GenomicRanges) -if ( length(args$blacklist.regions) > 1){ - blacklist.regions <- args$blacklist.regions -} else { - if ( args$orgn == 'human'){ - - } else { - if ( args$orgn == 'mouse'){ - - } else { - cat("Organism: ", args$orgn, "not implemented yet, only human and mouse\n") - quit() - } - } -} - -library(csaw) -library(DESeq2) -require(edgeR) -param <- readParam(minq=args$min.quality) -#Reads are directionally extended to the average fragment length, -#to represent the original fragments that were present during immunoprecipitation. -#Windows of constant size (10 bp) are tiled across the genome at regular intervals (50 bp). -if (args$paired==TRUE){ - pe.param <- readParam(max.frag=400, pe="both") - if ( length(args$control.bams) > 1){ - d1 <- windowCounts( c(bam.files, control.bams), bin=TRUE, width=10000, - ext=args$frag.len, width=args$win.width, param=pe.param) - } else { # no input bams - d1 <- windowCounts( bam.files, ext=args$frag.len, width=args$win.width, param=pe.param) - } -} else { #single reads - if ( length(args$control.bams) > 1){ - d1 <- windowCounts( c(bam.files, control.bams), bin=TRUE, width=10000, - ext=args$frag.len, width=args$win.width, param=param) - } else { # no input bams - d1 <- windowCounts(bam.files, ext=args$frag.len, width=args$win.width, param=param) - } -} -if ( length(args$control.bams) > 1){ - chip <- d1[,1:length(bam.files)] - control <- d1[,((length(bam.files) + 1):(length(bam.files) + length(control.bams)))] - filter.stat <- filterWindows(chip, control, type="control", prior.count=5, - norm.fac=list(chip, control)) - data <-filter.stat$filter > log2(3) -} else { # no input bams - abundances <- aveLogCPM(asDGEList(d1)) - print("aveLogCPM abundances") - summary(abundances) - keep.simple <- abundances > aveLogCPM(5, lib.size=mean(d1$totals)) - data <- d1[keep.simple, ] - summary(keep.simple) -} -# here implement the blacklist filter?? - -# In edgeR, the log-transformed overall NB mean is referred to as the average abundance. -# This is computed with the aveLogCPM function, as shown below for each region - -data$totals - -#max.delay <- 500 -dedup.on <- readParam(dedup=TRUE, minq=args$min.quality) -#x <- correlateReads(bam.files, $args.max.delay, param=dedup.on) -#plot(0:$args.max.delay, x, type="l", ylab="CCF", xlab="Delay (bp)") - -normfacs <- normalize(data) -normfacs -y <- asDGEList(data, norm.factors=normfacs) -cat("nrow(data: ") -nrow(data) -grouping <- factor(samples) -design <- model.matrix(~0 + grouping) -colnames(design) <- levels(grouping) -y <- asDGEList(data, norm.factors=normfacs) -y <- estimateDisp(y, design) -# biocLite("statmod") -require(statmod) -#install.packages("dplyr") -library(dplyr) -#install.packages("gridGraphics") -#library(gridGraphics) -fit <- glmQLFit(y, design, robust=TRUE) -#contrast <- makeContrasts(es - tn, levels=design) -cstring <- paste0(contrasts, collapse=' - ') -contrast <- makeContrasts(cstring, levels=design) - -#QL F-test [Lund et al., 2012] -results <- glmQLFTest(fit, contrast=contrast) - -merged <- mergeWindows(rowRanges(data), tol=args$tol, max.width=args$bin.win) - -# We demand replicates! -#norep.fit <- glmFit(y, design, dispersion=0.05) -#norep.results <- glmLRT(norep.fit, contrast=contrast) - -#bin.adjc <- cpm(asDGEList(binned.2), log=TRUE) -#plotMDS(bin.adjc, labels=grouping) -#plotBCV(y) -#plotQLDisp(fit) - -clustered <- mergeWindows(rowRanges(data), tol=1000) -clustered$region -cat("entries in clustered: ") -length(unique(clustered$id)) - -# first method: Simes method -# to cluster adjacent windows into regions. A combined p-value can then be computed -# for each cluster, based on the p-values of the constituent windows [Simes, 1986]. -# This tests the joint null hypothesis for each cluster, i.e., that -# no enrichment is observed across any sites within the corresponding region. -# The combined p-values are then adjusted using the BH method to control the region-level FDR. - -tabcom <- combineTests(clustered$id, results$table) -print("Simes method") -cat("entries in tabcom: ") -length(tabcom$nWindows) -head(tabcom) -#is.sig.enrich <- select(tabcom, logFC.up > logFC.down & FDR<= args$FDR) -#cat("Significantly enriched regions: ", summary(is.sig.enrich)) -#is.sig.depleted <- select(tabcom, logFC.up < logFC.down & FDR<= args$FDR) -#cat("Significantly depleted regions: ", summary(is.sig.depleted)) -#summary(is.sig.depleted) - -tabcom$chr <- seqnames(clustered$region)[as.numeric(row.names(tabcom))] -tabcom$start <- start(clustered$region)[as.numeric(row.names(tabcom))] -tabcom$stop <- end(clustered$region)[as.numeric(row.names(tabcom))] -#tabcom$start <- start(rowRanges(data))[tabcom$best] -#tabcom$stop <- end(rowRanges(data))[tabcom$best] -tabCombyPadj <- tabcom[with(tabcom, order(FDR)), ] -cat("outfileSimes: ", outfileSimes,"\n") -write.table(tabCombyPadj, file=outfileSimes, row.names=FALSE, quote=FALSE, sep="\t") - -# Alternative to Simes: getBestTest -# choose a single window to represent each cluster/region. -# For example, the window with the highest average abundance in each cluster can be used. -# This is sensible for analyses involving sharp binding events, where each cluster is expected -# to be small and contain no more than one binding site. Thus, a single window -# (and p-value) can reasonably be used as a representative of the entire region. -# The BH method can then be applied to the corresponding p-values -# of the representative windows from all clusters. - -clustered2 <- mergeWindows(rowRanges(data), tol=100, max.width=5000) -tab.best <- getBestTest(clustered$id, results$table) - -tab.best$chr <- seqnames(rowRanges(data))[tab.best$best] -tab.best$start <- start(rowRanges(data))[tab.best$best] -tab.best$stop <- end(rowRanges(data))[tab.best$best] - -resultsbypadj <- tab.best[with(tab.best, order(FDR)), ] -cat("outfileBestTest: ", outfileBestTest, "\n") -cat("entries in resultsbypadj: ") -length(resultsbypadj$best) -write.table(resultsbypadj, file=outfileBestTest, row.names=FALSE, quote=FALSE, sep="\t") - -quit() diff --git a/chipathlon/jobs/scripts/genome_chr_locus_convert.py b/chipathlon/jobs/scripts/genome_chr_locus_convert.py deleted file mode 100755 index c419dc5a025ac9359deb8f398e3581e46205b854..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/scripts/genome_chr_locus_convert.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python - -import pymysql -import argparse -import os - -parser = argparse.ArgumentParser(description="Replace chromosome locus with number.") -parser.add_argument("-g", "--genome", dest="genome", required=True, help="Input genome fasta file.") -parser.add_argument("-o", "--out", dest="out", required=True, help="Output file.") -parser.add_argument("-c", "--c", dest="chr", default=False, action="store_true", help="If specified convert from name to chr number, otherwise convert from chr number to name.") -parser.add_argument("-d", "--db", dest="db", default="hg38", help="Database to load conversion from.") -parser.add_argument("-s", "--server", dest="server", default="genome-mysql.cse.ucsc.edu", help="Location of mysql server.") -parser.add_argument("-u", "--user", dest="user", default="genome", help="Username for db login.") -parser.add_argument("-p", "--password", dest="password", default="", help="Password for db login.") -parser.add_argument("-t", "--table", dest="table", default="ucscToINSDC", help="Table to retrieve locus -> number info from.") -args = parser.parse_args() - -if os.path.isfile(args.bed): - - # Setup connection, retrieve all info - conn = pymysql.connect(host=args.server, user=args.user, password=args.password, db=args.db, cursorclass=pymysql.cursors.DictCursor) - with conn.cursor() as cursor: - cursor.execute("SELECT * FROM %s" % (args.table,)) - conn.commit() - results = cursor.fetchall() - - # Results are a list of dictionaries -> - # [{"name": "CM000663.2", "chrom": "chr1"}, ...] - # Convert to a single dictionary with comprehension - # depending on directionality of the conversion - chr_map = dict([ - (row["name"], row["chrom"]) if args.chr else (row["chrom"], row["name"]) - for row in results - ]) - - with open(args.genome, "r") as rh: - with open(args.out, "w") as wh: - for line in rh: - if line.startswith(">"): - info = line.split() - accession = info[0][1:] - if accession in chr_map: - info[0] = ">" + chr_map[accession] - wh.write(" ".join(info) + "\n") -else: - print "Input file %s does not exist." % (args.bed,) diff --git a/chipathlon/jobs/scripts/peakranger_format_bed.pl b/chipathlon/jobs/scripts/peakranger_format_bed.pl deleted file mode 100755 index 6c223c323326f21693547682a94dad21817631dc..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/scripts/peakranger_format_bed.pl +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/perl -w - -=NAME - -=VERSION - -=head1 SYNOPSYS -From CCAT output format generates a decent, sorted BED file with no spaces -Author: Istvan (Steve) Ladunga, University of Nebraska-Lincoln, sladunga@unl.edu -=cut - -my $usage = << "EOF"; - -Usage: $0 - -c input CCAT result file (significant.peak) - -b output BEDfile - -EOF - -use lib qw ( /home/ladunga/sladunga/bin/ /home/ladunga/sladunga/LIB/ /home/ladunga/perllib/ ); -use NoStatUtils; -use strict; -use Carp; -use Getopt::Long; - -my %par = ( - ); -&GetOptions("c=s" => \$par{infile}, - "b=s" => \$par{outfile}); - - -($par{infile} && $par{outfile}) or die $usage; - -&go (\%par ); - -print STDERR "$0 done\n"; - -###################################################################### - -sub go { - my ( $par ) = @_; - my $n = 0; - my $log10 = log(10); - open (IN, "<$par->{infile}") or die "Cannot open infile $par->{infile} $!\n"; - my $tmpFile = '/tmp/peakrangerFormat2Bed.'.$$; - open (OF, ">$tmpFile") or die "Cannot open outfile $tmpFile $!\n"; - - while(<IN>){ - chomp; - next if /^\#/; - next unless /\w/; - -#chr1 174147470 174147471 ranger_region_174147298_174147692_pval_1.58715e-186_fdrPassed_12 1.90214e-184 + - s/ +//g; # get rid of silly spaces - my @w = split; - my $name = 'ranger_'.$n; - my $start = ( $w[1] < $w[2] ) ? $w[1] : $w[2]; - my $end = ( $w[1] < $w[2] ) ? $w[2] : $w[1]; - my $tmp = $w[4] + 1E-300; - my $score = int(-1 * log($tmp)/ $log10); - my $s = join("\t", ($w[0], $start, $end, $name, $score, $w[5], 111.1, 111.1, $w[4] )); #, $w[6], $w[7], 0.001 )); #, '.', $w[6])); - print OF "$s\n"; - $n++; - } - close (IN); - close (OF); - `sort -k1,1V -k2,2n -k3,3n $tmpFile > $par->{outfile}`; -} - -################################################################### - diff --git a/chipathlon/jobs/scripts/prep_jamm_inputs.py b/chipathlon/jobs/scripts/prep_jamm_inputs.py deleted file mode 100755 index e85e33e1fd8379e453dcbc7c756b2208cb853489..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/scripts/prep_jamm_inputs.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/env python - -import argparse -import os -import shutil - -parser = argparse.ArgumentParser(description="Move jamm inputs into uniquely identified sample/control folders.") -parser.add_argument("-e", "--exp", dest="exp", required=True, help="Input experiment bed file.") -parser.add_argument("-c", "--control", dest="control", required=True, help="Input control bed file.") -parser.add_argument("-d", "--dir", dest="dir", required=True, help="Directory to move files into.") -args = parser.parse_args() - -if os.path.isfile(args.exp) and os.path.isfile(args.control): - if not os.path.isdir(args.dir): - os.makedirs(args.dir) - os.makedirs(args.dir + "/sample") - os.makedirs(args.dir + "/control") - shutil.copyfile(args.control, args.dir + "/control/" + os.path.basename(args.control)) - shutil.copyfile(args.exp, args.dir + "/sample/" + os.path.basename(args.exp)) - else: - print "Directory %s already exists." % (args.dir,) -else: - print "Not all inputs [%s, %s] exist." % (args.exp, args.control) diff --git a/chipathlon/jobs/scripts/spp_nodups_broad.R b/chipathlon/jobs/scripts/spp_nodups_broad.R deleted file mode 100755 index 019feb6a3c81b6af8f6a42a824668c0c28f65ad3..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/scripts/spp_nodups_broad.R +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/env Rscript -library(argparse) -library(spp) -library(snow) - -# With lots of help from: http://compbio.med.harvard.edu/Supplements/ChIP-seq/tutorial.html -parser <- ArgumentParser(description = "CSAW wrapper R script.") -parser$add_argument("--signal", dest="signal", required=TRUE, help="Path to signal chip input file.") -parser$add_argument("--control", dest="control", required=TRUE, help="Path to control input file.") -parser$add_argument("--output", dest="output", required=TRUE, help="Path to output file.") - -parser$add_argument("--tag_length", dest="tag_length", type="integer", default=32, help="Tag length.") -parser$add_argument("--rmin", dest="rmin", type="integer", default=50, help="Protected region minimum size.") -parser$add_argument("--rmax", dest="rmax", type="integer", default=500, help="Protected region maximum size.") -parser$add_argument("--bin", dest="bin", type="integer", default=5, help="Bin tags with specified number of basepairs.") -parser$add_argument("--window", dest="window", type="integer", default=1000, help="Window size") -parser$add_argument("--thr", dest="thr", type="integer", default=3, help="????") -args <- parser$parse_args() - -cluster <- makeCluster(8) - -chip.data <- read.tagalign.tags(args$signal); -tmpDataRows <- read.table(args$signal, nrows=500) -chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2)) - -input.data <- read.tagalign.tags(args$control); -tmpDataRows <- read.table(args$control,nrows=500) -input.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2)) - -# get binding info from cross-correlation profile -# srange gives the possible range for the size of the protected region; -# srange should be higher than tag length; making the upper boundary too high will increase calculation time -# -# bin - bin tags within the specified number of basepairs to speed up calculation; -# increasing bin size decreases the accuracy of the determined parameters -binding.characteristics <- get.binding.characteristics(chip.data, srange=c(args$rmin, args$rmax), bin=args$bin, cluster=cluster); - -chip.data <- select.informative.tags(chip.data, binding.characteristics); -input.data <- select.informative.tags(input.data, binding.characteristics); -chip.data <- remove.local.tag.anomalies(chip.data); -input.data <- remove.local.tag.anomalies(input.data); - -broad.clusters <- get.broad.enrichment.clusters( - chip.data, - input.data, - window.size=args$window, - z.thr=args$thr, - tag.shift=round(binding.characteristics$peak$x / 2) # Should talk about this calculation -) - -# write out in broadPeak format -write.broadpeak.info(broad.clusters, args$output); diff --git a/chipathlon/jobs/wrappers/bedtools_wrapper.sh b/chipathlon/jobs/wrappers/bedtools_wrapper.sh deleted file mode 100755 index f1cbc0bad75c040e44d7a49526e11820c4b11001..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/bedtools_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -bedtools "$@" diff --git a/chipathlon/jobs/wrappers/bowtie2_wrapper.sh b/chipathlon/jobs/wrappers/bowtie2_wrapper.sh deleted file mode 100755 index f63962526836e7336f4167867b649a319c7f91b9..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/bowtie2_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -bowtie2 "$@" diff --git a/chipathlon/jobs/wrappers/bwa_wrapper.sh b/chipathlon/jobs/wrappers/bwa_wrapper.sh deleted file mode 100755 index 0c71b922f0f70cd90884d1e400070bdc04eb05fb..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/bwa_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -bwa "$@" diff --git a/chipathlon/jobs/wrappers/ccat_wrapper.sh b/chipathlon/jobs/wrappers/ccat_wrapper.sh deleted file mode 100755 index c4426a25b74d2add59a1f89763c6b21896ec1b95..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/ccat_wrapper.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - - -CCAT "$@" diff --git a/chipathlon/jobs/wrappers/gem_wrapper.sh b/chipathlon/jobs/wrappers/gem_wrapper.sh deleted file mode 100755 index 3f158b167c43b0004334b6a410cd43b592aa9aff..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/gem_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -gem -Xmx16G "$@" diff --git a/chipathlon/jobs/wrappers/idr_wrapper.sh b/chipathlon/jobs/wrappers/idr_wrapper.sh deleted file mode 100755 index c0e26d60496d6808d31d84c4fcf5fe878cfe58bd..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/idr_wrapper.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - - -idr "$@" diff --git a/chipathlon/jobs/wrappers/jamm_wrapper.sh b/chipathlon/jobs/wrappers/jamm_wrapper.sh deleted file mode 100755 index c5d7f0dceb77d395f4b2009cc39a4b390d9245dc..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/jamm_wrapper.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - - -JAMM.sh "$@" - -# Avert your eyes -while [[ $# -gt 1 ]] -do -key="$1" - -case $key in - -o) - JAMM_DIR="$2" - shift - ;; - *) - - ;; -esac -shift -done - -cp $JAMM_DIR/peaks/all.peaks.narrowPeak "${JAMM_DIR}_results_sorted.narrowPeak" diff --git a/chipathlon/jobs/wrappers/macs2_wrapper.sh b/chipathlon/jobs/wrappers/macs2_wrapper.sh deleted file mode 100755 index 576fdd02b9f5f358590932cffd4e13f584d8ff20..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/macs2_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -macs2 "$@" diff --git a/chipathlon/jobs/wrappers/music_wrapper.sh b/chipathlon/jobs/wrappers/music_wrapper.sh deleted file mode 100755 index fb61998fd9dc739df682262e7ea083f86b212916..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/music_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -run_music.py "$@" diff --git a/chipathlon/jobs/wrappers/peakranger_wrapper.sh b/chipathlon/jobs/wrappers/peakranger_wrapper.sh deleted file mode 100755 index 0ce2141c7f901acc575e6d0b4efe44c58d260b67..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/peakranger_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -peakranger "$@" diff --git a/chipathlon/jobs/wrappers/picard_wrapper.sh b/chipathlon/jobs/wrappers/picard_wrapper.sh deleted file mode 100755 index 3d0801f3dac9e4c872385d3824fa287588190400..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/picard_wrapper.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -#set -eu -o pipefail -#set -o pipefail - -picard "$@" diff --git a/chipathlon/jobs/wrappers/samtools_wrapper.sh b/chipathlon/jobs/wrappers/samtools_wrapper.sh deleted file mode 100755 index efcd1f38606fd292073342616e8a06bbdc929f30..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/samtools_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -samtools "$@" diff --git a/chipathlon/jobs/wrappers/spp_nodups_wrapper.sh b/chipathlon/jobs/wrappers/spp_nodups_wrapper.sh deleted file mode 100755 index e5a6d40baecd12aa1e5272ad9e5b889c912278d6..0000000000000000000000000000000000000000 --- a/chipathlon/jobs/wrappers/spp_nodups_wrapper.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -run_spp_nodups "$@" diff --git a/chipathlon/test/config_parsing/bridges_config.yaml b/chipathlon/test/config_parsing/bridges_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0643f0e19f8e7aae10edfdcfda3dca60efc89a0 --- /dev/null +++ b/chipathlon/test/config_parsing/bridges_config.yaml @@ -0,0 +1,4 @@ +chipathlon_bin: /home/aknecht/.conda/envs/chip/bin +idr_bin: /home/aknecht/.conda/envs/idr/bin +pegasus_home: "/home/centos/anaconda2/share/pegasus/" +email: "avi@kurtknecht.com" diff --git a/chipathlon/test/config_parsing/tusker_config.yaml b/chipathlon/test/config_parsing/tusker_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2406fc720cf7f2b2d75c285376b2477806412c9 --- /dev/null +++ b/chipathlon/test/config_parsing/tusker_config.yaml @@ -0,0 +1,25 @@ +install_paths: + - /home/swanson/aknecht/.conda/envs/chip/bin + - /home/swanson/aknecht/.conda/envs/idr/bin +notify: + pegasus_home: "/usr/share/pegasus" + email: "avi@kurtknecht.com" +sites: + local: + arch: x86_64 + os: LINUX + dirs: + shared-scratch: "%base_path%/work" + local-storage: "%base_path%/output" + profile: + pegasus: + style: "glite" + change.dir: true + transfer.threads: 4 + condor: + grid_resource: "pbs" + universe: "vanilla" + batch_queue: "batch" + env: + PATH: "/home/swanson/aknecht/.conda/envs/chip/bin:/bin/:/usr/bin/:/usr/local/bin/" + PEGASUS_HOME: "/usr/" diff --git a/chipathlon/workflow.py b/chipathlon/workflow.py index 42bb9bce146a9041532dfe4736062eb0a470fa88..8c36dc5d7adb6518c5a6d046ada38fdf64235a36 100644 --- a/chipathlon/workflow.py +++ b/chipathlon/workflow.py @@ -5,7 +5,6 @@ import sys import json import datetime import textwrap -import xml.dom.minidom import yaml import traceback import chipathlon @@ -17,14 +16,13 @@ from chipathlon.generators.align_generator import AlignGenerator from chipathlon.generators.remove_duplicates_generator import RemoveDuplicatesGenerator from chipathlon.generators.peak_call_generator import PeakCallGenerator from chipathlon.generators.idr_generator import IdrGenerator -import random from pprint import pprint from Pegasus.DAX3 import * class Workflow(object): """ - :param jobhome: The base directory to generate the workflow in. - :type jobhome: str + :param job_home: The base directory to generate the workflow in. + :type job_home: str :param run_file: The run yaml file defining controls, samples, alignment tool, peak calling tool, peak calling type, and which samples to run idr on. @@ -37,6 +35,12 @@ class Workflow(object): :type username: str :param password: The password to authenticate for MongoDB access. :type password: str + :param execute_site: A list of sites to submit jobs to. These sites should + be defined in the configuration file. + :type execute_site: list + :param output_site: The output site to transfer files to. This site should + be defined in the configuration file. + :type output_site: str :param rewrite: Whether or not to rewrite existing files. If true, it will ignore files in Mongo and recreate them. If false, it will download files based on the latest available completed job. @@ -50,9 +54,13 @@ class Workflow(object): executables. """ - def __init__(self, jobhome, run_file, param_file, config_file, host, username, password, rewrite=False, debug=False): + def __init__(self, job_home, run_file, param_file, config_file, + properties_file, host, username, password, execute_site="local", + output_site=["local"], rewrite=False, debug=False): # debug mode, print out additional information self.debug = debug + self.execute_site = execute_site + self.output_site = output_site self.username = username self.host = host @@ -60,15 +68,19 @@ class Workflow(object): # Initialize db connection self.mdb = chipathlon.db.MongoDB(host, username, password) # Jobname info & err - self.jobhome = os.path.abspath(jobhome) - self.basepath = self.jobhome + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S") - self.jobname = os.path.basename(os.path.dirname(self.jobhome + "/")) + self.job_home = os.path.abspath(job_home) + self.base_path = self.job_home + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S") + self.jobname = os.path.basename(os.path.dirname(self.job_home + "/")) self.errors = [] # add new results even if they exist self.rewrite = rewrite # Input file info self.run_file = run_file self.param_file = param_file + + self.properties_file = os.path.abspath(properties_file) + if not os.path.isfile(self.properties_file): + self.errors.append("Provided pegasus properties file '%s' does not exist." % (self.properties_file,)) self._load_config(config_file) # Dax specific info self.dax = ADAG(self.jobname) @@ -90,10 +102,7 @@ class Workflow(object): self._generate_jobs, # Create pegasus important stuff self._add_notify, - self._create_replica, - self._create_sites, - self._create_submit, - self._write + self._create_pegasus_files ] return @@ -151,21 +160,19 @@ class Workflow(object): try: with open(config_file, "r") as rh: self.config = yaml.load(rh) - except yaml.YAMLError as exc: - self.errors.append("Error parsing config template file '%s': %s.\n" % (config_file, exc)) - if "notify" in self.config: - if "pegasus_home" in self.config["notify"]: - if not os.path.isdir(self.config["notify"]["pegasus_home"]): - self.errors.append("Error parsing config template file '%s': pegasus_home path '%s' does not exist." % (config_file, self.config["notify"]["pegasus_home"])) - else: - self.errors.append("Error parsing config template file '%s': pegasus_home must be defined under 'notify'." % (config_file,)) - if not "email" in self.config["notify"]: - self.errors.append("Error parsing config template file '%s': email must be defined under 'notify'." % (config_file,)) - else: - self.errors.append("Error parsing config template file '%s': notify must be defined at root level of config." % (config_file,)) + for key in chipathlon.conf.config_file["required_keys"]: + if key not in self.config: + self.errors.append("Config file '%s' does not have required key '%s'." % (config_file, key)) + all_keys = chipathlon.conf.config_file["optional_keys"] + chipathlon.conf.config_file["required_keys"] + for key in self.config: + if key not in all_keys: + self.errors.append("Config file '%s' has invalid key '%s' specified, should be one of: %s." % (config_file, key, all_keys)) + except yaml.YAMLError as ye: + self.errors.append("Error reading config file '%s': %s" % (config_file, ye)) + return - def _add_executable(self, name, path, os_type="linux", arch="x86_64", site="local"): - self.executables[name] = Executable(name=name, os=os_type, arch=arch) + def _add_executable(self, name, path, os_type="linux", arch="x86_64", site="local", installed=True): + self.executables[name] = Executable(name=name, os=os_type.lower(), arch=arch, installed=installed) self.executables[name].addPFN(PFN(os.path.join("file://", path), site)) self.dax.addExecutable(self.executables[name]) if self.debug: @@ -173,20 +180,36 @@ class Workflow(object): return def _load_executables(self, os_type="linux", arch="x86_64"): - # Load wrapper scripts for commands that need to be loaded from module - for root, dirs, files in os.walk("%s/%s" % (os.path.dirname(os.path.realpath(__file__)), chipathlon.conf.job_wrappers)): - for f in files: - self._add_executable("_".join(f.split("_")[:-1]), os.path.join(root, f)) - break + # Load actual executables don't need no wrappers + for executable in chipathlon.conf.executables: + self._add_executable( + executable, + os.path.join(self.config["idr_bin"] if executable == "idr" else self.config["chipathlon_bin"], executable), + site=self.execute_site, + arch=self.config.get("arch", "x86_64"), + os_type=self.config.get("os", "linux") + ) # Load actual scripts for root, dirs, files in os.walk("%s/%s" % (os.path.dirname(os.path.realpath(__file__)), chipathlon.conf.job_scripts)): for f in files: if not os.path.splitext(f)[1] == '.pyc': - self._add_executable(f, os.path.join(root, f)) + self._add_executable( + f, + os.path.join(self.config["chipathlon_bin"], f), + site=self.execute_site, + arch=self.config.get("arch", "x86_64"), + os_type=self.config.get("os", "linux") + ) break # Handle necessary installed scripts for cmd in chipathlon.conf.system_commands: - self._add_executable(cmd, os.path.join(chipathlon.conf.system_path, cmd)) + self._add_executable( + cmd, + os.path.join(chipathlon.conf.system_path, cmd), + site=self.execute_site, + arch=self.config.get("arch", "x86_64"), + os_type=self.config.get("os", "linux") + ) return def _load_workflow_jobs(self): @@ -258,10 +281,10 @@ class Workflow(object): return def _load_generators(self): - self.align_gen = AlignGenerator(self.dax, self.jobs, self.files, self.mdb, self.modules["align"], self.workflow_jobs, self.basepath, debug=self.debug) - self.remove_dup_gen = RemoveDuplicatesGenerator(self.dax, self.jobs, self.files, self.mdb, self.modules["remove_duplicates"], self.workflow_jobs, self.basepath, debug=self.debug) - self.peak_call_gen = PeakCallGenerator(self.dax, self.jobs, self.files, self.mdb, self.modules["peak_call"], self.workflow_jobs, self.basepath, debug=self.debug) - self.idr_gen = IdrGenerator(self.dax, self.jobs, self.files, self.mdb, self.modules["idr"], self.workflow_jobs, self.basepath, debug=self.debug) + self.align_gen = AlignGenerator(self.dax, self.jobs, self.files, self.mdb, self.modules["align"], self.workflow_jobs, self.base_path, debug=self.debug) + self.remove_dup_gen = RemoveDuplicatesGenerator(self.dax, self.jobs, self.files, self.mdb, self.modules["remove_duplicates"], self.workflow_jobs, self.base_path, debug=self.debug) + self.peak_call_gen = PeakCallGenerator(self.dax, self.jobs, self.files, self.mdb, self.modules["peak_call"], self.workflow_jobs, self.base_path, debug=self.debug) + self.idr_gen = IdrGenerator(self.dax, self.jobs, self.files, self.mdb, self.modules["idr"], self.workflow_jobs, self.base_path, debug=self.debug) return def _generate_jobs(self): @@ -279,98 +302,51 @@ class Workflow(object): Creates the base structure for job submission. Everything is contained within a folder based on the current timestamp. """ - if not os.path.exists(self.basepath): - os.makedirs(self.basepath) + if not os.path.exists(self.base_path): + os.makedirs(self.base_path) for folder in ["input", "output", "input/db_meta"]: - if not os.path.exists(os.path.join(self.basepath, folder)): - os.makedirs(os.path.join(self.basepath, folder)) + if not os.path.exists(os.path.join(self.base_path, folder)): + os.makedirs(os.path.join(self.base_path, folder)) return def _add_notify(self): - # NEED TO DETERMINE HOW TO READ IN THESE VARS - notify_path = os.path.join(self.basepath, "input/notify.sh") + notify_path = os.path.join(self.base_path, "input/notify.sh") with open(notify_path, "w") as wh: notify = textwrap.dedent("""\ #!/bin/bash %s/notification/email -t %s --report=pegasus-analyzer - """ % (self.config["notify"]["pegasus_home"], self.config["notify"]["email"])) + """ % (self.config["pegasus_home"], self.config["email"])) wh.write(notify) os.chmod(notify_path, 0755) self.dax.invoke(When.AT_END, notify_path) return - def _create_replica(self): - with open(os.path.join(self.basepath, "input/conf.rc"), "w") as wh: - pegasusrc = textwrap.dedent("""\ - pegasus.catalog.site = XML - pegasus.catalog.site.file = %s/sites.xml - - pegasus.condor.logs.symlink = false - pegasus.transfer.links=true - pegasus.data.configuration = sharedfs - """ % (os.path.join(self.basepath, "input"),)) - wh.write(pegasusrc) - return - - def _create_sites(self): - with open(os.path.join(self.basepath, "input/sites.xml"), "w") as wh: - sites = """\ - <sitecatalog xmlns="http://pegasus.isi.edu/schema/sitecatalog" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pegasus.isi.edu/schema/sitecatalog http://pegasus.isi.edu/schema/sc-4.0.xsd" version="4.0"> - <site handle="local" arch="x86_64" os="LINUX"> - <directory type="shared-scratch" path="%s"> - <file-server operation="all" url="file://%s" /> - </directory> - <directory type="local-storage" path="%s"> - <file-server operation="all" url="file://%s" /> - </directory>""" % ( - os.path.join(self.basepath, "work"), - os.path.join(self.basepath, "work"), - os.path.join(self.basepath, "output"), - os.path.join(self.basepath, "output") - ) - # NEED TO DETERMINE HOW TO READ IN THIS INFO - # change.dir moves execute directory to the work directory created by pegasus - sites += """\n\t<profile key="change.dir" namespace="pegasus">true</profile>""" - # Increase the number of threads for transfer jobs. - sites += """<profile key="transfer.threads" namespace="pegasus">4</profile>""" - if "profile" in self.config: - for namespace in self.config["profile"]: - for key in self.config["profile"][namespace]: - value = self.config["profile"][namespace][key] - if key == "PATH": - value = value + ":%s/jobs/scripts/" % (os.path.dirname(os.path.abspath(__file__)),) - sites += """\n\t<profile namespace="%s" key="%s">%s</profile> """ % (namespace, key, value) - - sites += """</site> - <site handle="dummylocal" arch="x86_64" os="LINUX"> - <directory type="shared-scratch" path="%s"> - <file-server operation="all" url="file://%s" /> - </directory> - - """ % ( - os.path.join(self.basepath, "work"), - os.path.join(self.basepath, "work") - ) - sites += "</site></sitecatalog>" - sites = sites.replace("\n", "") - wh.write("\n".join([line for line in xml.dom.minidom.parseString(sites).toprettyxml().split('\n') if line.strip()])) + def _create_pegasus_files(self): + self._create_submit() + self._create_dax() return def _create_submit(self): """ Creates the pegasus submit script. submit.sh """ - with open(os.path.join(self.basepath, "input/submit.sh"), "w") as wh: + with open(os.path.join(self.base_path, "input/submit.sh"), "w") as wh: submit = textwrap.dedent("""\ #!/bin/bash plan=`pegasus-plan \\ - --conf "%s" \\ - --sites "%s" \\ - --dir "%s" \\ - --output-site local \\ - --dax "%s" \\ + --conf %s \\ + --sites %s \\ + --output-site %s \\ + --dir %s \\ + --dax %s \\ --randomdir \\ - """ % (self.basepath + "/input/conf.rc", "local", self.basepath + "/work/", self.basepath + "/input/chipathlon.dax")) + """ % ( + self.properties_file, + self.execute_site, + self.output_site, + os.path.join(self.base_path, "work"), + os.path.join(self.base_path, "input/chipathlon.dax") + )) submit += textwrap.dedent("""\ --submit` @@ -388,11 +364,11 @@ class Workflow(object): """) wh.write(submit) - os.chmod(self.basepath + "/input/submit.sh", 0755) + os.chmod(os.path.join(self.base_path, "input/submit.sh"), 0755) return - def _write(self): - with open(self.basepath + "/input/chipathlon.dax", "w") as wh: + def _create_dax(self): + with open(os.path.join(self.base_path, "input/chipathlon.dax"), "w") as wh: self.dax.writeXML(wh) return ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������� diff --git a/chipathlon/workflow_job.py b/chipathlon/workflow_job.py index 36df2d9e0747b5c6d7415a7c6893352c31282e8a..4a498e6ba140834f495a968deeff8da20ddea0cf 100644 --- a/chipathlon/workflow_job.py +++ b/chipathlon/workflow_job.py @@ -481,6 +481,8 @@ class WorkflowJob(object): "$outputs": outputs } arg_list = [] + if self.executable == "gem": + arg_list.append("-Xmx%sM" % (self.params.get("memory", self.job_data["memory"]))) has_args = re.compile("\$((in)|(out))puts\.[0-9]") # Loop through each argument, which are stored as a list of dicts for arg_dict in self.job_data["arguments"]: diff --git a/scripts/chip-gen b/scripts/chip-gen index 00c3a711608e97d31302a122e43f9f8e2d23557f..71f7dbf65621d1c6a1b22f744d25b28413fb39d3 100644 --- a/scripts/chip-gen +++ b/scripts/chip-gen @@ -12,6 +12,9 @@ parser.add_argument("-d", "--dir", dest="dir", required=True, help="Directory na parser.add_argument("--param", dest="param", required=True, help="Path to param file to load.") parser.add_argument("--conf", dest="config", required=True, help="Path to config file to load.") parser.add_argument("--run", dest="run", required=True, help="Path to run file to load.") +parser.add_argument("--properties", dest="properties", required=True, help="Path to pegasus properties file.") +parser.add_argument("--execute-site", dest="execute_site", required=True, default="local", help="Target execute site. Sites should be defined in configuration.") +parser.add_argument("--output-site", dest="output_site", required=True, default="local", help="Target output site. Site should be defined in configuration.") parser.add_argument("--rewrite", dest="rewrite", default=False, action="store_true", help="If specified, don't load from the database, rewrite files.") parser.add_argument("--debug", dest="debug", default=False, action="store_true", help="Print out more information while generating.") @@ -22,9 +25,12 @@ workflow = Workflow( args.run, args.param, args.config, + args.properties, args.host, args.username, args.password, + execute_site=args.execute_site, + output_site=args.output_site, rewrite=args.rewrite, debug=args.debug ) diff --git a/chipathlon/jobs/scripts/cat_spp.sh b/scripts/chip-job-cat-peak similarity index 100% rename from chipathlon/jobs/scripts/cat_spp.sh rename to scripts/chip-job-cat-peak diff --git a/scripts/chip-job-ccat-format-bed b/scripts/chip-job-ccat-format-bed new file mode 100755 index 0000000000000000000000000000000000000000..a6c7e3c537f9714e92e815dbd7c471905700fb02 --- /dev/null +++ b/scripts/chip-job-ccat-format-bed @@ -0,0 +1,37 @@ +#!/usr/bin/env python +import argparse +import os + +# The ccat output is *mostly* correct, however simply sorting is not enough +# as the fourth column labels are not done in correct chromosome order +# After sorting the output looks like this: +# chr1 3086860 3087035 ccat_131620 4 . 5.483551 0.577000 0.001 +# chr1 3318040 3318245 ccat_131610 4 . 5.483551 0.577000 0.001 +# chr1 3372210 3372465 ccat_87299 5 . 6.854439 0.462000 0.001 +# When it should look like this: +# chr1 3086860 3087035 ccat_0 4 . 5.483551 0.577000 0.001 +# chr1 3318040 3318245 ccat_1 4 . 5.483551 0.577000 0.001 +# chr1 3372210 3372465 ccat_2 5 . 6.854439 0.462000 0.001 + +parser = argparse.ArgumentParser(description = "Format ccat result files.") +parser.add_argument("--input", "-i", dest="input", required=True, help="Path to input ccat file.") +parser.add_argument("--output", "-o", dest="output", required=True, help="Output file to write formatted results.") +args = parser.parse_args() + +if not os.path.isfile(args.input): + sys.exit("Could not open input file '%s' for reading." % (args.input,)) + +bed_data = [] +# To sort we unfortunately need to load the whole file +with open(args.input, "r") as rh: + for line in rh: + bed_data.append(line.strip().split()) + +# Equivalent to sort -k1,1V -k2,2n -k3,3n +sorted_data = sorted(bed_data, key=lambda line: (line[0], int(line[1]), int(line[2]))) + +with open(args.output, "w") as wh: + for i, line in enumerate(sorted_data): + # Fix the peak numbers + line[3] = "ccat_%s" % (i,) + wh.write("\t".join(line) + "\n") diff --git a/chipathlon/jobs/scripts/chr_locus_convert.py b/scripts/chip-job-chr-convert similarity index 100% rename from chipathlon/jobs/scripts/chr_locus_convert.py rename to scripts/chip-job-chr-convert diff --git a/chipathlon/jobs/scripts/download_from_encode.py b/scripts/chip-job-download-encode similarity index 100% rename from chipathlon/jobs/scripts/download_from_encode.py rename to scripts/chip-job-download-encode diff --git a/chipathlon/jobs/scripts/download_from_gridfs.py b/scripts/chip-job-download-gridfs similarity index 100% rename from chipathlon/jobs/scripts/download_from_gridfs.py rename to scripts/chip-job-download-gridfs diff --git a/chipathlon/jobs/scripts/run_music.py b/scripts/chip-job-music similarity index 100% rename from chipathlon/jobs/scripts/run_music.py rename to scripts/chip-job-music diff --git a/scripts/chip-job-peakranger-format b/scripts/chip-job-peakranger-format new file mode 100755 index 0000000000000000000000000000000000000000..bd2202fbbcaf967da3aacb6235606aea9bc3f1d0 --- /dev/null +++ b/scripts/chip-job-peakranger-format @@ -0,0 +1,42 @@ +#!/usr/bin/env python +import argparse +import math +import sys +import os + +# Peakranger has very different output than expected: +# chr1 180330013 180330834 ranger_fdrPassed_0_pval_2.00647e-193_fdr_2.79703e-190 2.79703e-190 + +# chr1 106321435 106322114 ranger_fdrPassed_1_pval_5.1557e-147_fdr_3.59352e-144 3.59352e-144 + +# chr1 37474619 37475638 ranger_fdrPassed_2_pval_5.45776e-144_fdr_2.53604e-141 2.53604e-141 + +# There is no calculated score, the pval needs to be stripped from a different +# column, and the strand is on the end instead of the 6th column + +parser = argparse.ArgumentParser(description = "Format peakranger output files.") +parser.add_argument("--input", "-i", dest="input", required=True, help="Input result file from peakranger.") +parser.add_argument("--output", "-o", dest="output", required=True, help="Output file to write formatted results.") +args = parser.parse_args() + +if not os.path.isfile(args.input): + sys.exit("Could not open input file '%s' for reading." % (args.input,)) + +# We need to sort so again, load the entire file +bed_data = [] +with open(args.input, "r") as rh: + for line in rh: + # Skip comments + if line.strip() and not line.startswith("#"): + bed_data.append(line.strip().split()) + +# Equivalent to sort -k1,1V -k2,2n -k3,3n +sorted_data = sorted(bed_data, key=lambda line: (line[0], int(line[1]), int(line[2]))) + +with open(args.output, "w") as wh: + for i, line in enumerate(sorted_data): + # Things get a little trickier now: + peak_name = "ranger_%s" % (i,) + strand = line[5] + # This is actually faster than using regex + pval = float(line[3].split("pval_")[1].split("_")[0]) + qval = float(line[4]) + score = int(-1 * math.log(qval, 10)) + wh.write("\t".join(line[:3] + map(str, [peak_name, score, strand, ".", pval, qval])) + "\n") diff --git a/chipathlon/jobs/scripts/db_save_result.py b/scripts/chip-job-save-result similarity index 100% rename from chipathlon/jobs/scripts/db_save_result.py rename to scripts/chip-job-save-result diff --git a/chipathlon/jobs/scripts/sort_peak.sh b/scripts/chip-job-sort-peak similarity index 100% rename from chipathlon/jobs/scripts/sort_peak.sh rename to scripts/chip-job-sort-peak diff --git a/chipathlon/jobs/scripts/zcat_spp.sh b/scripts/chip-job-zcat-peak similarity index 100% rename from chipathlon/jobs/scripts/zcat_spp.sh rename to scripts/chip-job-zcat-peak diff --git a/setup.py b/setup.py index 4525dfce3c999b8d685e76cc666c598ced464de0..de5977557b4394d6d961e6bb3333cd9a3589072c 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,17 @@ setup( "scripts/chip-gen", "scripts/chip-create-run", "scripts/chip-meta-download", - "scripts/chip-meta-import" + "scripts/chip-meta-import", + "scripts/chip-job-cat-peak", + "scripts/chip-job-ccat-format-bed", + "scripts/chip-job-chr-convert", + "scripts/chip-job-download-encode", + "scripts/chip-job-download-gridfs", + "scripts/chip-job-music", + "scripts/chip-job-peakranger-format", + "scripts/chip-job-save-result", + "scripts/chip-job-sort-peak", + "scripts/chip-job-zcat-peak" ], install_requires=["pymongo","pyyaml"], zip_safe=False