Commit f85c8175 authored by aknecht2's avatar aknecht2
Browse files

Merge branch '26-zerone' into 'master'

Resolve "Zerone"

Closes #26 
Woo!

See merge request !34
parents ff84032d c8c9edee
......@@ -37,7 +37,10 @@ peak_tools = [
"gem",
"peakranger",
"ccat",
"music"
"music",
"zerone",
"hiddendomains",
"pepr"
]
executables = [
......@@ -52,6 +55,9 @@ executables = [
"peakranger",
"MUSIC",
"CCAT",
"PePr",
"hiddenDomains",
"zerone",
"run_spp_nodups",
"chip-job-cat-peak",
"chip-job-ccat-format-bed",
......@@ -62,7 +68,9 @@ executables = [
"chip-job-music",
"chip-job-peakranger-format",
"chip-job-sort-peak",
"chip-job-zcat-peak"
"chip-job-zcat-peak",
"chip-job-zerone-format",
"chip-job-hd-format"
]
# Java needs to have -Xmx specified...
......@@ -77,7 +85,10 @@ peak_types = {
"gem": ["narrow"],
"peakranger": ["narrow"],
"ccat": ["broad"],
"music": ["narrow", "punctate", "broad"]
"music": ["narrow", "punctate", "broad"],
"zerone": ["broad"],
"hiddendomains": ["broad"],
"pepr": ["broad", "sharp"],
}
# File extensions
......@@ -147,8 +158,8 @@ argument_types = {
# Defines information about arguments
argument_keys = {
"required": ["type", "changeable", "has_value", "required"],
"optional": ["default", "path", "file_type"]
"required": ["type", "changeable", "has_value"],
"optional": ["required", "default", "file_type", "path", "separator"]
}
# workflow_job keys
......
......@@ -18,7 +18,7 @@ def download_from_gridfs(host, gridfs_id, local_path, username=None, password=No
if not os.path.isfile(local_path) or overwrite:
for i in range(0, retries):
print "Attempt #%s, downloading file with ID '%s' to '%s'" % (i + 1, gridfs_id, local_path)
if mdb.fetch_from_gridfs(bson.objectid.ObjectId(gridfs_id), localpath, checkmd5):
if mdb.fetch_from_gridfs(bson.objectid.ObjectId(gridfs_id), local_path, checkmd5):
return True
else:
print "Download attempt #%s from GridFS failed, retrying..." % (i + 1)
......
from module_generator import ModuleGenerator
from chipathlon.result import Result
import collections
class IdrGenerator(ModuleGenerator):
"""
......@@ -36,14 +37,8 @@ class IdrGenerator(ModuleGenerator):
)
self.module_name = "idr"
self.result_dict = {}
self.output_files = {
"peakranger": {},
"ccat": {},
"gem": {},
"spp": {},
"macs2": {},
"music": {}
}
self.output_files = collections.defaultdict(dict)
self.output_files["peakranger"]["narrow"] = ["region_sorted.bed", "summit_sorted.bed"]
self.output_files["ccat"]["broad"] = ["region_sorted.bed", "peak_sorted.bed"]
self.output_files["gem"]["narrow"] = ["results_GEM_sorted.bed", "results_GPS_sorted.bed"]
......@@ -52,6 +47,10 @@ class IdrGenerator(ModuleGenerator):
self.output_files["music"]["narrow"] = ["sorted_scale_%s_all.bed" % (i,) for i in [129, 194, 291]]
self.output_files["music"]["punctate"] = ["sorted_scale_%s_all.bed" % (i,) for i in [129, 194, 291, 437, 656, 985, 1477, 2216]]
self.output_files["music"]["broad"] = ["sorted_scale_%s_all.bed" % (i,) for i in [1459, 2189, 3284, 4926, 7389, 11084, 16626]]
self.output_files["zerone"]["broad"] = ["results_sorted.bed"]
self.output_files["hiddendomains"]["broad"] = ["results_sorted.bed"]
self.output_files["pepr"]["broad"] = ["results_sorted.bed"]
self.output_files["pepr"]["sharp"] = ["results_sorted.bed"]
if debug:
print "[LOADING GENERATOR] IdrGenerator"
return
......
......@@ -41,7 +41,10 @@ class PeakCallGenerator(ModuleGenerator):
"macs2": self._macs2,
"ccat": self._ccat,
"peakranger": self._peakranger,
"music": self._music
"music": self._music,
"zerone": self._zerone,
"hiddendomains": self._hiddendomains,
"pepr": self._pepr
}
self.call_pairs = {}
if debug:
......@@ -146,6 +149,52 @@ class PeakCallGenerator(ModuleGenerator):
}
return (self.get_markers(run), inputs)
def _zerone(self, run, result):
    """
    Assemble the markers and input mapping for a zerone peak call.

    The pair stored in ``self.call_pairs`` under ``result.full_name`` is
    looked up, and the ``full_name`` of each member is mapped onto the
    bam input names of the zerone job.

    :param run: The run to generate jobs for.
    :type run: :py:class:`~chipathlon.run.Run`
    :param result: The result to generate jobs for.
    :type result: :py:class:`~chipathlon.result.Result`
    :returns: A tuple of (``self.get_markers(run)``, inputs dictionary).
    """
    pair = self.call_pairs[result.full_name]
    # Element 0 of the pair feeds the control input, element 1 the signal.
    job_inputs = {}
    job_inputs["control.bam"] = pair[0].full_name
    job_inputs["signal.bam"] = pair[1].full_name
    markers = self.get_markers(run)
    return (markers, job_inputs)
def _hiddendomains(self, run, result):
    """
    Assemble the markers and input mapping for a hiddendomains peak call.

    Besides the control/signal pair registered for ``result`` in
    ``self.call_pairs``, the mapping carries the ``"name"`` entry of the
    run genome's chrom sizes and the result's prefix.

    :param run: The run to generate jobs for.
    :type run: :py:class:`~chipathlon.run.Run`
    :param result: The result to generate jobs for.
    :type result: :py:class:`~chipathlon.result.Result`
    :returns: A tuple of (``self.get_markers(run)``, inputs dictionary).
    """
    pair = self.call_pairs[result.full_name]
    job_inputs = {}
    job_inputs["chrom.sizes"] = run.genome.get_chrom_sizes()["name"]
    # Element 0 of the pair feeds the control input, element 1 the signal.
    job_inputs["control.bed"] = pair[0].full_name
    job_inputs["signal.bed"] = pair[1].full_name
    job_inputs["prefix"] = result.prefix
    markers = self.get_markers(run)
    return (markers, job_inputs)
def _pepr(self, run, result):
    """
    Assemble the markers and input mapping for a pepr peak call.

    The mapping holds the control/signal pair registered for ``result``
    in ``self.call_pairs``, the result's prefix, and the peak type taken
    from the run.

    :param run: The run to generate jobs for.
    :type run: :py:class:`~chipathlon.run.Run`
    :param result: The result to generate jobs for.
    :type result: :py:class:`~chipathlon.result.Result`
    :returns: A tuple of (``self.get_markers(run)``, inputs dictionary).
    """
    pair = self.call_pairs[result.full_name]
    job_inputs = {}
    # Element 0 of the pair feeds the control input, element 1 the signal.
    job_inputs["control.bed"] = pair[0].full_name
    job_inputs["signal.bed"] = pair[1].full_name
    job_inputs["prefix"] = result.prefix
    job_inputs["peak_type"] = run.peak_type
    markers = self.get_markers(run)
    return (markers, job_inputs)
def _make_call_pairs(self, run, result_list):
"""
:param run: The run currently being processed.
......@@ -185,12 +234,23 @@ class PeakCallGenerator(ModuleGenerator):
:param run: The target run to generate jobs for.
:type run: :py:class:`~chipathlon.run.Run`
"""
remove_duplicates_results = run.get_results("remove_duplicates", "no_dups_chr.bed")
if run.peak == "zerone":
print "zerone"
if run.file_type == "fastq":
results = run.get_results("align", "align.bam")
elif run.file_type == "bam":
results = run.get_results("download", "encode.bam")
# results = run.get_results("align", "align.bam")
print results
else:
results = run.get_results("remove_duplicates", "no_dups_chr.bed")
module_markers = {"peak_call": self.get_markers(run)}
all_result_names = []
final_results = self.module.get_all_final_results(self.get_markers(run))
for paired_result in self._make_call_pairs(run, remove_duplicates_results):
for paired_result in self._make_call_pairs(run, results):
for i, final_result in enumerate(final_results):
final_result_name = final_result["file_name"]
if final_result_name not in all_result_names:
......@@ -218,11 +278,19 @@ class PeakCallGenerator(ModuleGenerator):
:param result: The target result to create jobs for.
:type result: :py:class:`~chipathlon.result.Result`
"""
remove_duplicate_results = run.get_results("remove_duplicates", "no_dups_chr.bed")
if run.peak == "zerone":
if run.file_type == "fastq":
results = run.get_results("align", "align.bam")
elif run.file_type == "bam":
results = run.get_results("download", "encode.bam")
print results
else:
results = run.get_results("remove_duplicates", "no_dups_chr.bed")
prev_results = []
control_accessions = result.get_accessions("control")
signal_accessions = result.get_accessions("signal")
for prev_result in remove_duplicate_results:
for prev_result in results:
if (set(prev_result.get_accessions("control")).issubset(control_accessions) and
set(prev_result.get_accessions("signal")).issubset(signal_accessions)):
prev_results.append(prev_result)
......
......@@ -307,3 +307,104 @@ peak_call:
results_sorted.bed:
param_name: sorted_peaks
final_result: true
- zerone[tool]:
- broad[peak_type]:
- zerone_callpeak:
inputs:
control.bam:
param_name: control.bam
signal.bam:
param_name: signal.bam
outputs:
peaks.bed:
param_name: result_peaks
- zerone_format:
inputs:
peaks.bed:
param_name: result_peaks
outputs:
results_sorted.bed:
param_name: full_result
final_result: true
- hiddendomains[tool]:
- broad[peak_type]:
- hiddendomains_callpeak:
inputs:
control.bed:
param_name: control.bed
signal.bed:
param_name: signal.bed
chrom.sizes:
param_name: chrom_sizes
prefix:
param_name: prefix
outputs:
analysis.bed:
param_name: result_peaks
vis.bed:
param_name: enriched_bins
domains.txt:
param_name: domains
control_bins.txt:
param_name: control_bins
treatment_bins.txt:
param_name: treatment_bins
- hiddendomains_format:
inputs:
analysis.bed:
param_name: result_peaks
outputs:
results_sorted.bed:
param_name: full_result
final_result: true
- pepr[tool]:
- sharp[peak_type]:
- cp:
inputs:
control.bed:
param_name: input_file
outputs:
control2.bed:
param_name: output_file
- broad[peak_type]:
- cp:
inputs:
control.bed:
param_name: input_file
outputs:
control2.bed:
param_name: output_file
- cp:
inputs:
signal.bed:
param_name: input_file
outputs:
signal2.bed:
param_name: output_file
- pepr_callpeak:
inputs:
control.bed:
param_name: control1.bed
control2.bed:
param_name: control2.bed
signal.bed:
param_name: signal1.bed
signal2.bed:
param_name: signal2.bed
prefix:
param_name: prefix
peak_type:
param_name: peak_type
outputs:
_PePr_peaks.bed:
param_name: result_peaks
parameters.txt:
param_name: pepr_params
- sort_awk_sort_peaks:
inputs:
_PePr_peaks.bed:
param_name: result_peaks
outputs:
results_sorted.bed:
param_name: sorted_peaks
final_result: true
# Job definition for the hiddenDomains peak-calling step (command: hiddenDomains).
# Inputs: control and signal bed files, a chrom_sizes file and an output prefix.
# Outputs: result_peaks and enriched_bins (bed) plus domains, control_bins and
# treatment_bins (txt).
hiddendomains_callpeak:
inputs:
control.bed:
type: file
file_type: bed
signal.bed:
type: file
file_type: bed
chrom_sizes:
type: file
file_type: chrom_sizes
prefix:
type: string
outputs:
result_peaks:
type: file
file_type: bed
enriched_bins:
type: file
file_type: bed
domains:
type: file
file_type: txt
control_bins:
type: file
file_type: txt
treatment_bins:
type: file
file_type: txt
command: hiddenDomains
# -B is a value-less flag; -c, -t, -g and -o are fixed (changeable: false) and
# default to the inputs declared above. -b, -p and -q are marked changeable
# and carry the numeric defaults listed below.
arguments:
- "-B":
type: string
changeable: false
required: true
has_value: false
- "-c":
type: file
changeable: false
required: true
has_value: true
default: "$control.bed"
- "-t":
type: file
changeable: false
required: true
has_value: true
default: "$signal.bed"
- "-g":
type: file
changeable: false
required: true
has_value: true
default: "$chrom_sizes"
- "-o":
type: string
changeable: false
required: true
has_value: true
default: "$prefix"
- "-b":
type: numeric
changeable: true
required: true
has_value: true
default: 1000
- "-p":
type: numeric
changeable: true
required: true
has_value: true
default: 0
- "-q":
type: numeric
changeable: true
required: true
has_value: true
default: 30
# NOTE(review): units for walltime/memory are not stated here - confirm before tuning.
walltime: 240
memory: 16000
cores: 1
nodes: 1
# Job definition that runs chip-job-hd-format on the hiddenDomains peak file.
# Takes one bed input (result_peaks) and declares one bed output (full_result).
hiddendomains_format:
inputs:
result_peaks:
type: file
file_type: bed
outputs:
full_result:
type: file
file_type: bed
command: chip-job-hd-format
# Both arguments are bare values (has_value: false): the input file first,
# then the output file.
arguments:
- "$result_peaks":
type: file
changeable: false
required: true
has_value: false
- "$full_result":
type: file
changeable: false
required: true
has_value: false
# NOTE(review): units for walltime/memory are not stated here - confirm before tuning.
walltime: 2000
memory: 2000
cores: 1
nodes: 1
# Job definition for the PePr peak-calling step (command: PePr).
# Inputs: two control bed files, two signal bed files, an output prefix and the
# peak type. Outputs: the peak bed file (result_peaks) and PePr's parameter
# text file (pepr_params).
pepr_callpeak:
  inputs:
    control1.bed:
      type: file
      file_type: bed
    control2.bed:
      type: file
      file_type: bed
    signal1.bed:
      type: file
      file_type: bed
    signal2.bed:
      type: file
      file_type: bed
    prefix:
      type: string
    peak_type:
      type: string
  outputs:
    result_peaks:
      type: file
      file_type: bed
    pepr_params:
      type: file
      file_type: txt
  command: PePr
  arguments:
    # PePr's -c/--chip1 option takes the ChIP (signal) replicates and
    # -i/--input1 takes the input (control) replicates, so the signal files
    # must go to -c and the control files to -i.
    - "-c":
        type: file_list
        separator: ","
        changeable: false
        required: true
        has_value: true
        default:
          - $signal1.bed
          - $signal2.bed
    - "-i":
        type: file_list
        separator: ","
        changeable: false
        required: true
        has_value: true
        default:
          - $control1.bed
          - $control2.bed
    - "-n":
        type: string
        changeable: true
        required: true
        has_value: true
        default: $prefix
    - "-f":
        type: string
        changeable: true
        required: true
        has_value: true
        default: "bed"
    # -s, -w and --keep-max-dup are optional and have no default here.
    - "-s":
        type: numeric
        changeable: true
        required: false
        has_value: true
    - "-w":
        type: numeric
        changeable: true
        required: false
        has_value: true
    - "--threshold":
        type: string
        changeable: true
        required: false
        has_value: true
        default: "1e-5"
    - "--peaktype":
        type: string
        changeable: true
        required: true
        has_value: true
        default: $peak_type
    - "--normalization":
        type: string
        changeable: true
        required: false
        has_value: true
        default: "intra-group"
    - "--keep-max-dup":
        type: numeric
        changeable: true
        required: false
        has_value: true
    - "--num-processors":
        type: numeric
        changeable: true
        required: false
        has_value: true
        default: 1
  # NOTE(review): units for walltime/memory are not stated here - confirm before tuning.
  walltime: 120
  memory: 16000
  cores: 1
  nodes: 1
# Job definition for the zerone peak-calling step (command: zerone).
# Inputs: a control bam and a signal bam. The single output, result_peaks, is
# declared with type stdout (presumably captured from the command's standard
# output - confirm against the job runner).
zerone_callpeak:
inputs:
control.bam:
type: file
file_type: bam
signal.bam:
type: file
file_type: bam
outputs:
result_peaks:
type: stdout
file_type: bed
command: zerone
# --mock receives the control bam and --chip the signal bam (both fixed);
# --window and --quality are marked changeable with the defaults below.
arguments:
- "--mock":
type: file
changeable: false
required: true
has_value: true
default: $control.bam
- "--chip":
type: file
changeable: false
required: true
has_value: true
default: $signal.bam
- "--window":
type: numeric
changeable: true
required: true
has_value: true
default: 300
- "--quality":
type: numeric
changeable: true
required: true
has_value: true
default: 20
# NOTE(review): units for walltime/memory are not stated here - confirm before tuning.
walltime: 120
memory: 16000
cores: 1
nodes: 1
# Job definition that runs chip-job-zerone-format on the zerone peak file.
# Takes one bed input (result_peaks) and declares one bed output (full_result).
zerone_format:
inputs:
result_peaks:
type: file
file_type: bed
outputs:
full_result:
type: file
file_type: bed
command: chip-job-zerone-format
# Both arguments are bare values (has_value: false): the input file first,
# then the output file.
arguments:
- "$result_peaks":
type: file
changeable: false
required: true
has_value: false
- "$full_result":
type: file
changeable: false
required: true
has_value: false
# NOTE(review): units for walltime/memory are not stated here - confirm before tuning.
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -35,11 +35,11 @@ class Workflow(object):
:type username: str
:param password: The password to authenticate for MongoDB access.
:type password: str
:param execute_site: A list of sites to submit jobs to. These sites should
be defined in the configuration file.
:type execute_site: list
:param execute_site: The target site to submit jobs to. This site should
be defined in the sites.xml file.
:type execute_site: str
:param output_site: The output site to transfer files to. This site should
be defined in the configuration file.
be defined in the sites.xml file.