Commit cb56d4fb authored by aknecht2's avatar aknecht2
Browse files

Merge branch 'bridges-compatability' into 'master'

Bridges compatability

Updates so things work on bridges as well as on local submit sites.

See merge request !32
parents bf8cd175 9273df65
......@@ -40,6 +40,31 @@ peak_tools = [
"music"
]
executables = [
"bedtools",
"samtools",
"idr",
"picard",
"bwa",
"bowtie2",
"macs2",
"gem",
"peakranger",
"MUSIC",
"CCAT",
"run_spp_nodups",
"chip-job-cat-peak",
"chip-job-ccat-format-bed",
"chip-job-chr-convert",
"chip-job-save-result",
"chip-job-download-encode",
"chip-job-download-gridfs",
"chip-job-music",
"chip-job-peakranger-format",
"chip-job-sort-peak",
"chip-job-zcat-peak"
]
# Peak_type validation
peak_types = {
"spp": ["narrow", "broad"],
......@@ -152,3 +177,12 @@ genomes = {
"additional_files": file_extensions["bowtie2_genome"]
}
}
config_file = {
"required_keys": [
"chipathlon_bin", "idr_bin", "pegasus_home", "email"
],
"optional_keys": [
"arch", "os"
]
}
......@@ -8,7 +8,7 @@ cat_awk_sort_peaks:
- name: sorted_result
type: file
file_type: bed
command: cat_spp.sh
command: chip-job-cat-peak
arguments:
- "$inputs.0":
type: file
......
......@@ -35,7 +35,7 @@ ccat_callpeak:
- name: ccat_log
type: stdout
file_type: log
command: ccat
command: CCAT
arguments:
- "$inputs.0":
type: file
......
......@@ -8,15 +8,15 @@ ccat_format_bed:
- name: sorted_peaks
type: file
file_type: bed
command: ccat_format_bed.pl
command: chip-job-ccat-format-bed
arguments:
- "-c":
- "--input":
type: file
changeable: false
required: true
has_value: true
default: $inputs.0
- "-b":
- "--output":
type: file
changeable: false
required: true
......
......@@ -8,7 +8,7 @@ chr_locus_convert:
- name: chr_bed
type: file
file_type: bed
command: chr_locus_convert.py
command: chip-job-chr-convert
arguments:
- "-b":
type: file
......
......@@ -2,7 +2,7 @@ db_save_result:
inputs:
- name: username
type: string
- name: password
type: string
......@@ -18,7 +18,7 @@ db_save_result:
file_type: yaml
additional_inputs: null
outputs: null
command: db_save_result.py
command: chip-job-save-result
arguments:
- "-u":
type: string
......
......@@ -10,7 +10,7 @@ download_from_encode:
- name: downloaded_file
type: file
file_type: any
command: download_from_encode.py
command: chip-job-download-encode
arguments:
- "-u":
type: string
......
......@@ -16,7 +16,7 @@ download_from_gridfs:
- name: downloaded_result
type: file
file_type: any
command: download_from_gridfs.py
command: chip-job-download-gridfs
arguments:
- "-H":
type: string
......
......@@ -42,7 +42,7 @@ music_broad:
- name: scale_16626.bed
type: file
file_type: bed
command: music
command: chip-job-music
arguments:
- "--prefix":
type: string
......
......@@ -26,7 +26,7 @@ music_narrow:
- name: scale_291.bed
type: file
file_type: bed
command: music
command: chip-job-music
arguments:
- "--prefix":
type: string
......
......@@ -46,7 +46,7 @@ music_punctate:
- name: scale_2216.bed
type: file
file_type: bed
command: music
command: MUSIC
arguments:
- "--prefix":
type: string
......
......@@ -8,15 +8,15 @@ peakranger_format_bed:
- name: sorted_peaks
type: file
file_type: bed
command: peakranger_format_bed.pl
command: chip-job-peakranger-format
arguments:
- "-c":
- "--input":
type: file
changeable: false
required: true
has_value: true
default: $inputs.0
- "-b":
- "--output":
type: file
changeable: false
required: true
......
......@@ -8,7 +8,7 @@ sort_awk_sort_peaks:
- name: sorted_peaks
type: file
file_type: bed
command: sort_peak.sh
command: chip-job-sort-peak
arguments:
- "$inputs.0":
type: file
......
......@@ -20,7 +20,7 @@ spp_nodups_broad:
- name: score
type: file
file_type: ccscore
command: spp_nodups
command: run_spp_nodups
arguments:
- "-c=":
type: file
......
......@@ -20,7 +20,7 @@ spp_nodups_narrow:
- name: score
type: file
file_type: ccscore
command: spp_nodups
command: run_spp_nodups
arguments:
- "-c=":
type: file
......
......@@ -8,7 +8,7 @@ zcat_awk_sort_peaks:
- name: sorted_peaks
type: file
file_type: bed
command: zcat_spp.sh
command: chip-job-zcat-peak
arguments:
- "$inputs.0":
type: file
......
#args <- getArgs(defaults=list(workdir=NULL, exper.bams=NULL,control.bams="",
# analysisID=NULL, exper.dir="./", control.dir="./",
# tf=NULL,cond1=NULL,cond2=NULL,
# orgn=NULL, samples=NULL, contrast=NULL,
# bin.win=10000, blacklist.regions="",
# FDR=0.05, frag.len=10, min.quality=30, paired=FALSE,
# spacing=50, tol=100, win.width=10))
#print(args)
commandArgs(trailingOnly = FALSE)
yamlfile <- '~/Documents/R/csaw_test.yaml'
library(yaml)
cat("yamlfile: ", yamlfile)
pars <- yaml.load_file(yamlfile)
names(pars)
# partly based on https://www.bioconductor.org/help/course-materials/2015/BioC2015/csaw_lab.html
# an example call:
# where experiments came from GSE25532:
# A. K. Tiwari, M. B. Stadler, C. Wirbelauer, R. Paro, D. Schubeler, and C. Beisel.
# A chromatin-modifying function of JNK during stem cell dfferentiation.
# Nat. Genet., 44(1): 94-100, Jan 2012
#yaml.load_file( foo.yml )
outfileStem <- paste0(c(pars$analysisID, pars$tf, pars$cond1.cell, pars$cond2.cell, pars$orgn), collapse="_")
outfileSimes<-paste0(c(outfileStem,'Simes.txt'), collapse="_")
outfileBestTest<-paste0(c(outfileStem,'bestTest.txt'), collapse="_")
cat( "Simes outfile: ", outfileSimes, " bestTestoutfile: ", outfileBestTest, "\n" )
source("https://bioconductor.org/biocLite.R")
setwd(pars$workdir)
bam.files <- file.path(pars$exper.dir, pars$exper.bams)
cat("bam.files:\n")
bam.files
if ( length(pars$control.bams) > 1){
control.bams <- file.path(pars$control.dir, pars$control.bams)
cat("Control (input) bam(s): ", control.bams, "\n" )
}
samples <- pars$samples
cat("Samples: ", samples, "\n" )
contrasts<-pars$contrast
cat("Contrasts: ", contrasts, "\n" )
# the blacklisted genomic regions
library(GenomicRanges)
if ( length(pars$blacklist.regions) > 1){
blacklist.regions <- pars$blacklist.regions
} else {
if ( pars$orgn == 'human'){
} else {
if ( pars$orgn == 'mouse'){
} else {
cat("Organism: ", pars$orgn, "not implemented yet, only human and mouse\n")
quit()
}
}
}
library(csaw)
library(DESeq2)
require(edgeR)
param <- readParam(minq=pars$min.quality)
#Reads are directionally extended to the average fragment length,
#to represent the original fragments that were present during immunoprecipitation.
#Windows of constant size (10 bp) are tiled across the genome at regular intervals (50 bp).
if (pars$paired==TRUE){
pe.param <- readParam(max.frag=400, pe="both")
if ( length(pars$control.bams) > 1){
d1 <- windowCounts( c(bam.files, control.bams), bin=TRUE, width=10000,
ext=pars$frag.len, width=pars$win.width, param=pe.param)
} else { # no input bams
d1 <- windowCounts( bam.files, ext=pars$frag.len, width=pars$win.width, param=pe.param)
}
} else { #single reads
if ( length(pars$control.bams) > 1){
d1 <- windowCounts( c(bam.files, control.bams), bin=TRUE, width=10000,
ext=pars$frag.len, width=pars$win.width, param=param)
} else { # no input bams
d1 <- windowCounts(bam.files, ext=pars$frag.len, width=pars$win.width, param=param)
}
}
if ( length(pars$control.bams) > 1){
chip <- d1[,1:length(bam.files)]
control <- d1[,((length(bam.files) + 1):(length(bam.files) + length(control.bams)))]
filter.stat <- filterWindows(chip, control, type="control", prior.count=5,
norm.fac=list(chip, control))
data <-filter.stat$filter > log2(3)
} else { # no input bams
abundances <- aveLogCPM(asDGEList(d1))
print("aveLogCPM abundances")
summary(abundances)
keep.simple <- abundances > aveLogCPM(5, lib.size=mean(d1$totals))
data <- d1[keep.simple, ]
summary(keep.simple)
}
# here implement the blacklist filter??
# In edgeR, the log-transformed overall NB mean is referred to as the average abundance.
# This is computed with the aveLogCPM function, as shown below for each region
data$totals
#max.delay <- 500
dedup.on <- readParam(dedup=TRUE, minq=pars$min.quality)
#x <- correlateReads(bam.files, $pars.max.delay, param=dedup.on)
#plot(0:$pars.max.delay, x, type="l", ylab="CCF", xlab="Delay (bp)")
normfacs <- normalize(data)
normfacs
y <- asDGEList(data, norm.factors=normfacs)
cat("nrow(data: ")
nrow(data)
grouping <- factor(samples)
design <- model.matrix(~0 + grouping)
colnames(design) <- levels(grouping)
y <- asDGEList(data, norm.factors=normfacs)
y <- estimateDisp(y, design)
# biocLite("statmod")
require(statmod)
#install.packages("dplyr")
library(dplyr)
#install.packages("gridGraphics")
#library(gridGraphics)
fit <- glmQLFit(y, design, robust=TRUE)
#contrast <- makeContrasts(es - tn, levels=design)
cstring <- paste0(contrasts, collapse=' - ')
contrast <- makeContrasts(cstring, levels=design)
#QL F-test [Lund et al., 2012]
results <- glmQLFTest(fit, contrast=contrast)
merged <- mergeWindows(rowRanges(data), tol=pars$tol, max.width=pars$bin.win)
# We demand replicates!
#norep.fit <- glmFit(y, design, dispersion=0.05)
#norep.results <- glmLRT(norep.fit, contrast=contrast)
#bin.adjc <- cpm(asDGEList(binned.2), log=TRUE)
#plotMDS(bin.adjc, labels=grouping)
#plotBCV(y)
#plotQLDisp(fit)
clustered <- mergeWindows(rowRanges(data), tol=1000)
clustered$region
cat("entries in clustered: ")
length(unique(clustered$id))
# first method: Simes method
# to cluster adjacent windows into regions. A combined p-value can then be computed
# for each cluster, based on the p-values of the constituent windows [Simes, 1986].
# This tests the joint null hypothesis for each cluster, i.e., that
# no enrichment is observed across any sites within the corresponding region.
# The combined p-values are then adjusted using the BH method to control the region-level FDR.
tabcom <- combineTests(clustered$id, results$table)
print("Simes method")
cat("entries in tabcom: ")
length(tabcom$nWindows)
head(tabcom)
#is.sig.enrich <- select(tabcom, logFC.up > logFC.down & FDR<= pars$FDR)
#cat("Significantly enriched regions: ", summary(is.sig.enrich))
#is.sig.depleted <- select(tabcom, logFC.up < logFC.down & FDR<= pars$FDR)
#cat("Significantly depleted regions: ", summary(is.sig.depleted))
#summary(is.sig.depleted)
tabcom$chr <- seqnames(clustered$region)[as.numeric(row.names(tabcom))]
tabcom$start <- start(clustered$region)[as.numeric(row.names(tabcom))]
tabcom$stop <- end(clustered$region)[as.numeric(row.names(tabcom))]
#tabcom$start <- start(rowRanges(data))[tabcom$best]
#tabcom$stop <- end(rowRanges(data))[tabcom$best]
tabCombyPadj <- tabcom[with(tabcom, order(FDR)), ]
cat("outfileSimes: ", outfileSimes,"\n")
write.table(tabCombyPadj, file=outfileSimes, row.names=FALSE, quote=FALSE, sep="\t")
# Alternative to Simes: getBestTest
# choose a single window to represent each cluster/region.
# For example, the window with the highest average abundance in each cluster can be used.
# This is sensible for analyses involving sharp binding events, where each cluster is expected
# to be small and contain no more than one binding site. Thus, a single window
# (and p-value) can reasonably be used as a representative of the entire region.
# The BH method can then be applied to the corresponding p-values
# of the representative windows from all clusters.
clustered2 <- mergeWindows(rowRanges(data), tol=100, max.width=5000)
tab.best <- getBestTest(clustered$id, results$table)
tab.best$chr <- seqnames(rowRanges(data))[tab.best$best]
tab.best$start <- start(rowRanges(data))[tab.best$best]
tab.best$stop <- end(rowRanges(data))[tab.best$best]
resultsbypadj <- tab.best[with(tab.best, order(FDR)), ]
cat("outfileBestTest: ", outfileBestTest, "\n")
cat("entries in resultsbypadj: ")
length(resultsbypadj$best)
write.table(resultsbypadj, file=outfileBestTest, row.names=FALSE, quote=FALSE, sep="\t")
quit()
#!/usr/bin/perl -w
=NAME
=VERSION
=head1 SYNOPSYS
From CCAT output format generates a decent, sorted BED file with no spaces
Author: Istvan (Steve) Ladunga, University of Nebraska-Lincoln, sladunga@unl.edu
=cut
my $usage = << "EOF";
Usage: $0
-c input CCAT result file (significant.peak)
-b output BEDfile
EOF
use lib qw ( /home/ladunga/sladunga/bin/ /home/ladunga/sladunga/LIB/ /home/ladunga/perllib/ );
use NoStatUtils;
use strict;
use Carp;
use Getopt::Long;
my %par = (
);
&GetOptions("c=s" => \$par{infile},
"b=s" => \$par{outfile});
($par{infile} && $par{outfile}) or die $usage;
&go (\%par );
print STDERR "$0 done\n";
######################################################################
sub go {
my ( $par ) = @_;
my $n = 0;
open (IN, "<$par->{infile}") or die "Cannot open infile $par->{infile} $!\n";
my $tmpFile = '/tmp/ccatFormat2Bed.'.$$;
open (OF, ">$tmpFile") or die "Cannot open outfile $tmpFile $!\n";
while(<IN>){
chomp;
next if /^\#/;
next unless /\w/;
#chr1 16523865 16522790 16526550 184 14 73.063558 0.001000
s/ +//g; # get rid of silly spaces
my @w = split;
my $name = 'ccat_'.$n;
my $start = ( $w[1] < $w[2] ) ? $w[1] : $w[2];
my $end = ( $w[1] < $w[2] ) ? $w[2] : $w[1];
my $s = join("\t", ($w[0], $start, $end, $name, $w[4], '.', $w[6], $w[7], 0.001 )); #, '.', $w[6]));
print OF "$s\n";
$n++;
}
close (IN);
close (OF);
`sort -k1,1V -k2,2n -k3,3n $tmpFile > $par->{outfile}`;
}
###################################################################
library(argparse)
parser <- ArgumentParser(description = "CSAW wrapper R script.")
parser$add_argument("--workdir", dest="workdir", required=TRUE, help="Working directory.")
parser$add_argument("--frag_len", dest="frag_len", default=20, help="Fragment Length.")
parser$add_argument("--window_width", dest="window_width", default=20, help="Default window width.")
parser$add_argument("--exp_bams", dest="exp_bams", nargs="+", required=TRUE, help="List of experiment bam files to use.")
parser$add_argument("--exp_dir", dest="exp_dir", required=TRUE, help="Path to experiment directory.")
parser$add_argument("--control_bams", dest="control_bams", nargs="+", required=TRUE, help="List of control bam files to use.")
parser$add_argument("--control_dir", dest="control_dir", required=TRUE, help="List of control dirs to use.")
parser$add_argument("--organism", dest="organism", required=TRUE, help="Organism name.")
parser$add_argument("--tf", dest="tf", required=TRUE, help="Transcription factor.")
parser$add_argument("--cond1", dest="cond1", required=TRUE, help="First condition.")
parser$add_argument("--cond2", dest="cond2", required=TRUE, help="Second condition.")
parser$add_argument("--samples", dest="samples", required=TRUE, nargs="+", help="?")
parser$add_argument("--contrast", dest="contrast", required=TRUE, nargs="+", help="?")
parser$add_argument("--id", dest="id", required=TRUE, help="?")
parser$add_argument("--fdr", dest="fdr", default=0.05, help="False Discovery Rate.")
parser$add_argument("--min_q", dest="min_q", default=30, help="Minimum quality score.")
parser$add_argument("--paired", dest="paired", default=FALSE, action="store_true", help="Paired end reads.")
parser$add_argument("--spacing", dest="spacing", default=50, help="?")
parser$add_argument("--tol", dest="tol", default=100, help="?")
args <- parser$parse_args()
# partly based on https://www.bioconductor.org/help/course-materials/2015/BioC2015/csaw_lab.html
# an example call:
# where experiments came from GSE25532:
# A. K. Tiwari, M. B. Stadler, C. Wirbelauer, R. Paro, D. Schubeler, and C. Beisel.
# A chromatin-modifying function of JNK during stem cell dfferentiation.
# Nat. Genet., 44(1): 94-100, Jan 2012
#yaml.load_file( foo.yml )
outfileStem <- paste0(c(args$analysisID, args$tf, args$cond1.cell, args$cond2.cell, args$orgn), collapse="_")
outfileSimes<-paste0(c(outfileStem,'Simes.txt'), collapse="_")
outfileBestTest<-paste0(c(outfileStem,'bestTest.txt'), collapse="_")
cat( "Simes outfile: ", outfileSimes, " bestTestoutfile: ", outfileBestTest, "\n" )
source("https://bioconductor.org/biocLite.R")
setwd(args$workdir)
bam.files <- file.path(args$exper.dir, args$exper.bams)
cat("bam.files:\n")
bam.files
if ( length(args$control.bams) > 1){
control.bams <- file.path(args$control.dir, args$control.bams)
cat("Control (input) bam(s): ", control.bams, "\n" )
}
samples <- args$samples
cat("Samples: ", samples, "\n" )
contrasts<-args$contrast
cat("Contrasts: ", contrasts, "\n" )
# the blacklisted genomic regions
library(GenomicRanges)
if ( length(args$blacklist.regions) > 1){
blacklist.regions <- args$blacklist.regions
} else {
if ( args$orgn == 'human'){
} else {
if ( args$orgn == 'mouse'){
} else {
cat("Organism: ", args$orgn, "not implemented yet, only human and mouse\n")
quit()
}
}
}
library(csaw)
library(DESeq2)
require(edgeR)
param <- readParam(minq=args$min.quality)
#Reads are directionally extended to the average fragment length,
#to represent the original fragments that were present during immunoprecipitation.
#Windows of constant size (10 bp) are tiled across the genome at regular intervals (50 bp).
if (args$paired==TRUE){
pe.param <- readParam(max.frag=400, pe="both")
if ( length(args$control.bams) > 1){
d1 <- windowCounts( c(bam.files, control.bams), bin=TRUE, width=10000,
ext=args$frag.len, width=args$win.width, param=pe.param)
} else { # no input bams
d1 <- windowCounts( bam.files, ext=args$frag.len, width=args$win.width, param=pe.param)
}
} else { #single reads
if ( length(args$control.bams) > 1){
d1 <- windowCounts( c(bam.files, control.bams), bin=TRUE, width=10000,
ext=args$frag.len, width=args$win.width, param=param)
} else { # no input bams
d1 <- windowCounts(bam.files, ext=args$frag.len, width=args$win.width, param=param)
}
}
if ( length(args$control.bams) > 1){
chip <- d1[,1:length(bam.files)]
control <- d1[,((length(bam.files) + 1):(length(bam.files) + length(control.bams)))]