Commit 8dbbdf44 authored by aknecht2

Merge branch '22-gem-output-files-bug' into 'master'

Resolve "Gem output files bug"

Closes #22

See merge request !20
parents 9c64dc4f c0f69d09
@@ -323,7 +323,7 @@ class MongoDB(object):
"$lookup": {
"from": "samples",
"localField": "uuid",
"foreignField": "experiment_accession",
"foreignField": "experiment_id",
"as": "samples"
}
},
@@ -334,7 +334,7 @@ class MongoDB(object):
"$lookup": {
"from": "samples",
"localField": "possible_controls.uuid",
"foreignField": "experiment_accession",
"foreignField": "experiment_id",
"as": "possible_controls.samples"
}
},
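For context, the corrected join can be sanity-checked directly against the metadata database. A minimal pymongo sketch, assuming a local connection and that the experiment documents live in a collection named "experiments" (the real host, database, and collection names come from the chipathlon configuration, not from this merge request):

from pymongo import MongoClient

# Hypothetical connection; adjust host/database to the actual deployment.
client = MongoClient("localhost", 27017)
db = client["chipathlon"]

pipeline = [
    {
        "$lookup": {
            "from": "samples",
            "localField": "uuid",
            # The fix: samples reference their experiment as "experiment_id",
            # not "experiment_accession".
            "foreignField": "experiment_id",
            "as": "samples"
        }
    }
]

for doc in db["experiments"].aggregate(pipeline):
    print("%s has %s joined samples" % (doc["uuid"], len(doc["samples"])))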
@@ -359,7 +359,7 @@ class MongoDB(object):
}
else:
valid = False
msg = "Experiment with id '%s' has %s possible control inputs, and %s possible experiment inputs.\n" % (experiment_accession, len(control_inputs), len(experiment_inputs))
msg = "Experiment with id '%s' has %s possible control inputs, and %s possible signal inputs.\n" % (experiment_accession, len(control_inputs), len(signal_inputs))
else:
valid = False
msg = "Experiment with id '%s' does not have possible_controls.\n" % (experiment_accession,)
@@ -97,10 +97,30 @@ peak_call:
- chr_fasta:
type: list
outputs:
- - results_sorted.bed:
+ - GEM_events.narrowPeak:
type: file
name_template: "$prefix$/$prefix$_GEM_events.narrowPeak"
- GPS_events.narrowPeak:
type: file
name_template: "$prefix$/$prefix$_GPS_events.narrowPeak"
- sort_awk_sort_peaks:
inputs:
- GEM_events.narrowPeak:
type: file
additional_inputs: null
outputs:
- results_GEM_sorted.bed:
type: file
- sort_awk_sort_peaks:
inputs:
- GPS_events.narrowPeak:
type: file
additional_inputs: null
outputs:
- results_GPS_sorted.bed:
type: file
- spp[tool]:
- - cp_bed_tagalign:
+ - cp:
inputs:
- exp.bed:
type: file
@@ -108,7 +128,7 @@ peak_call:
outputs:
- exp.tagAlign:
type: file
- - cp_bed_tagalign:
+ - cp:
inputs:
- control.bed:
type: file
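The name_template entries above are expanded by substituting the $prefix$ token with the computed result prefix (this is the Result._load_full_name change further down). A small sketch of that substitution, using an invented prefix value purely for illustration:

def expand_name_template(template, prefix):
    # Mirrors Result._load_full_name: every "$prefix$" token is replaced
    # with the result's prefix string.
    return template.replace("$prefix$", prefix)

prefix = "ENCSR000AAA_ENCSR000BBB_gem"  # hypothetical accessions + marker value
expand_name_template("$prefix$/$prefix$_GEM_events.narrowPeak", prefix)
# -> "ENCSR000AAA_ENCSR000BBB_gem/ENCSR000AAA_ENCSR000BBB_gem_GEM_events.narrowPeak"
expand_name_template("$prefix$/$prefix$_GPS_events.narrowPeak", prefix)
# -> "ENCSR000AAA_ENCSR000BBB_gem/ENCSR000AAA_ENCSR000BBB_gem_GPS_events.narrowPeak"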
@@ -26,7 +26,10 @@ gem_callpeak:
type: list
file_type: chr_fasta
outputs:
- - name: peak_result
+ - name: gem_peak_result
type: file
file_type: bed
+ - name: gps_peak_result
+ type: file
+ file_type: bed
command: gem
#!/bin/bash
- /bin/sort -k 8gr,8gr "$1" | awk 'BEGIN{OFS="\t"}{$4="PEAK_"NR; print $0;}' > "$2"
+ /bin/sort -k1,1V -k2,2n -k3,3n "$1" | awk 'BEGIN{OFS="\t"}{$4="PEAK_"NR; print $0;}' > "$2"
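Both versions of the script pipe through the same awk command, which renames column 4 to PEAK_<rank>; the change is the sort order. The old call sorted descending on column 8 (general numeric), while the new one sorts by chromosome (version order) and then numerically by start and end, producing a position-sorted bed. A rough Python equivalent of the new ordering, for illustration only (the workflow still runs the shell script):

import re

def chrom_key(chrom):
    # Approximates GNU sort's -V (version) ordering for names like chr2, chr10, chrX.
    parts = re.split(r"(\d+)", chrom)
    return [int(p) if p.isdigit() else p for p in parts]

def sort_peaks(lines):
    rows = [line.rstrip("\n").split("\t") for line in lines if line.strip()]
    # -k1,1V -k2,2n -k3,3n: chromosome, then start, then end.
    rows.sort(key=lambda r: (chrom_key(r[0]), int(r[1]), int(r[2])))
    # Mirror the awk step: rename column 4 to PEAK_<rank>.
    for i, row in enumerate(rows, start=1):
        row[3] = "PEAK_%s" % i
    return ["\t".join(row) for row in rows]

# usage sketch: sort_peaks(open("GEM_events.narrowPeak"))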
@@ -87,7 +87,8 @@ class ModuleGenerator(object):
all_markers,
all_jobs,
should_save=output_info[logical_name]["save_result"] if "save_result" in output_info[logical_name] else False,
prefix_join=output_info[logical_name].get("prefix_join")
prefix_join=output_info[logical_name].get("prefix_join"),
name_template=output_info[logical_name].get("name_template")
)
results.append(result)
run.add_result(self.module.name, result)
@@ -218,22 +218,27 @@ class PeakCallGenerator(ModuleGenerator):
remove_duplicates_results = run.get_results("remove_duplicates", "no_dups_chr.bed")
module_markers = {"peak_call": self.get_markers(run)}
module_jobs = [self.workflow_jobs[job_name] for job_name in self.module.get_job_names(module_markers["peak_call"])]
+ final_module_outputs = self.module.get_final_outputs(self.get_markers(run))
+ if len(final_module_outputs) == 1:
+ final_result_name = final_module_outputs[0]
+ else:
+ final_result_name = "results_sorted.bed"
for paired_result in self._make_call_pairs(run, remove_duplicates_results):
markers = dict(paired_result[0].all_markers, **module_markers)
prev_result_jobs = list(set(paired_result[0].all_jobs).union(paired_result[1].all_jobs))
result = Result(
"results_sorted.bed",
final_result_name,
paired_result[0].control_samples + paired_result[1].control_samples,
paired_result[0].signal_samples + paired_result[1].signal_samples,
markers,
prev_result_jobs + module_jobs,
- should_save = True
+ should_save=True
)
run.add_result("peak_call", result)
self.call_pairs[result.full_name] = paired_result
return run.get_results("peak_call", "results_sorted.bed")
return run.get_results("peak_call", final_result_name)
def find_prev_results(self, run, result):
"""
@@ -10,7 +10,7 @@ class Result(object):
run on the result file up to this point.
"""
- def __init__(self, logical_name, control_samples, signal_samples, all_markers, all_jobs, should_save=False, prefix_join=None):
+ def __init__(self, logical_name, control_samples, signal_samples, all_markers, all_jobs, should_save=False, prefix_join=None, name_template=None):
"""
:param logical_name: The unique name of the file as presented in the module yaml
:type logical_name: string
@@ -24,6 +24,10 @@
:type all_jobs: list
:param should_save: Whether or not the result should be saved to the database.
:type should_save: boolean
+ :param prefix_join: How to combine prefixes for the final result
+ :type prefix_join: str
+ :param name_template: A template to load the correct name of the result
+ :type name_template: str
The result class is for managing all intermediate output files.
It also helps manage checking if a result already exists for the
@@ -42,8 +46,9 @@
self.all_jobs = all_jobs
self.should_save = should_save
- self.prefix = self._get_prefix()
- self.full_name = self.prefix + ("_" if prefix_join is None else prefix_join) + self.logical_name
+ self._load_prefix()
+ self._load_full_name(name_template, prefix_join)
self.file_type = os.path.splitext(self.logical_name)[1][1:]
self.pegasus_file = File(self.full_name)
return
@@ -58,17 +63,24 @@
self.full_name
)
- def _get_prefix(self):
+ def _load_full_name(self, template=None, prefix_join=None):
+ """
+ Loads the full name
+ """
+ if template is None:
+ self.full_name = self.prefix + ("_" if prefix_join is None else prefix_join) + self.logical_name
+ else:
+ self.full_name = template.replace("$prefix$", self.prefix)
+ def _load_prefix(self):
"""
Computes the prefix based on the input information.
- We don't want to have extra underscores hence the
- ternary operator to check the sample length.
+ We don't want to have extra underscores afterwards
"""
prefix = "_".join(self.get_accessions("signal") + self.get_accessions("control"))
self.prefix = "_".join(self.get_accessions("signal") + self.get_accessions("control"))
for module_name, module_markers in self.all_markers.iteritems():
for marker_name, marker_val in module_markers.iteritems():
prefix += "_%s" % (marker_val,)
return prefix
self.prefix += "_%s" % (marker_val,)
def exists_in_db(self, mdb, genome):
"""
@@ -342,7 +342,7 @@ class WorkflowJob(object):
self.errors.append("File '%s' is not of type '%s'. Should match one of these extensions: '%s'.\n" \
% (param["name"], file_type, chipathlon.conf.file_extensions[file_type]))
else:
self.errors.append("Expected %s %s files. Provided %s instead.\n" % (len(self.job_data[param_type]), param_type[:-1], len(param_list)))
self.errors.append("Job Error (%s): Expected %s %s files. Provided %s instead.\n" % (self.job_name, len(self.job_data[param_type]), param_type[:-1], len(param_list)))
return self.is_valid()
def create_job(self, inputs, additional_inputs, outputs):