Commit ae6f874b authored by aknecht2

Removed unnecessary stuff.

parent 655770f9
from module_generator import ModuleGenerator
class DownloadGenerator(ModuleGenerator):
def __init__(self, master_files, workflow_module, run_data, debug = False):
"""
:param master_files: The dictionary mapping file name -> file object.
:type master_files: dict
:param workflow_module: The actual module being used.
:type workflow_module: chipathlon.workflow_module.WorkflowModule
:param run_data: Input sample data.
:type run_data: chipathlon.run_data.RunData
:param debug: If true, prints out params for each job & module.
:type debug: bool
"""
super(DownloadGenerator, self).__init__(master_files, workflow_module, run_data, debug)
self.module_name = "download"
return
def parse_run(self, run_index):
"""
:param run_index: The index of the run in the yaml file.
:type run_index: int
Generate necessary params for a single run.
"""
run = self.run_data.runs[run_index]
for experiment_id in run["experiments"]:
for treatment in ["experiment", "control"]:
for sample in run["samples"][experiment_id][treatment]:
inputs = {
"url": sample["hcc_url"] if "hcc_url" in sample else sample["url"],
"md5": sample["md5sum"]
}
additional_inputs = {}
file_data = [
[{
"file_name": "%s_%s.fastq.gz" % (experiment_id, sample["accession"]),
"save_result": False
}]
]
prefix = "%s_%s" % (experiment_id, sample["accession"])
control_sample_ids = []
experiment_sample_ids = []
if treatment == "control":
control_sample_ids = [sample["accession"]]
else:
experiment_sample_ids = [sample["accession"]]
outputs = self.construct_outputs(file_data, {}, {}, prefix, sample, experiment_sample_ids, control_sample_ids, [])
yield ({}, inputs, additional_inputs, outputs)
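# A hypothetical usage sketch (the master_files, workflow_module, and run_data
# objects are assumed to come from the surrounding workflow setup, not from this
# commit): iterate the per-sample download params generated for the first run.
#
#   generator = DownloadGenerator(master_files, workflow_module, run_data, debug=True)
#   for markers, inputs, additional_inputs, outputs in generator.parse_run(0):
#       # inputs holds the download url & expected md5; outputs the target fastq.gz
#       print(inputs["url"], inputs["md5"])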
import os
import chipathlon.conf
import yaml
class RunData(object):
def __init__(self, run_file, mdb):
"""
:param run_file: The path to the input run yaml file.
:type run_file: str
:param mdb: The MongoDB instance to load sample data from.
:type mdb: chipathlon.db.MongoDB
This class holds all genome & run data information loaded from
the run.yaml input file.
"""
self.run_file = run_file
self.mdb = mdb
self.file_list = []
self.err = ""
with open(self.run_file, "r") as rh:
try:
self.data = yaml.load(rh)
self._load_genomes()
self._load_runs()
except yaml.YAMLError as exc:
self.err += "Error parsing run template file '%s': %s.\n" % (self.run_file, exc)
return
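# A rough sketch of the run file this constructor expects, inferred from the
# loaders below (the concrete assembly, paths, and tool names are illustrative
# assumptions only):
#
#   genomes:
#     grch38p6:
#       chrom.sizes: /path/to/chrom.sizes
#       bwa: /path/to/genome.fna
#   runs:
#     - genome: grch38p6
#       align: bwa
#       peak: macs2
#       experiments: [ENCSR000EXAMPLE]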
def _add_file(self, name, path, site):
"""
:param name: The logical file name.
:type name: str
:param path: The path to the actual file.
:type path: str
:param site: The site to add the file to.
:type site: str
This functions similarly to the workflow _add_file, but this
class does not interact directly with the dax. Instead, it
keeps track of files that need to be added.
"""
self.file_list.append({"name": name, "path": path, "site": site})
return
def add_output_file(self, run_index, module_name, arg_name, output):
"""
:param run_index: The index of the run to add the output_file to.
:type run_index: int
:param module_name: The name of the module where the output file came from.
:type module_name: str
:param arg_name: The unique name of the file defined in the module.
:type arg_name: str
:param output: The actual data to keep track of.
:type output: dict
To help keep track of files between steps, output files can be added
to a specific run definition. Future steps can then loop through a
specific set of output files from a previous step, as well as have
all necessary metadata to run. The output data itself can contain
any data, so additional data can be added as necessary. Currently the
:py:meth:`~chipathlon.module_generator.ModuleGenerator.construct_outputs`
function sets the following output convention:
{
"file_name": Name of the file,
"sample": Sample data from db,
"markers": Markers used for the current module
"all_markers": Markers used for all previous modules & the current one,
"prefix": Prefix used for output files
"experiment_sample_ids": Sample id's of all experiment files,
"control_sample_ids": Sample id's of all control files,
"jobs": Jobs used for the current module,
"all_jobs": Jobs used for all previous modules & the current one
}
"""
run = self.runs[run_index]
if module_name not in run["outputs"]:
run["outputs"][module_name] = {}
if arg_name not in run["outputs"][module_name]:
run["outputs"][module_name][arg_name] = []
run["outputs"][module_name][arg_name].append(output)
return
def get_output_files(self, run_index, module_name, arg_name):
"""
:param run_index: The index of the run to get output files from.
:type run_index: int
:param module_name: The name of the module where the output file came from.
:type module_name: str
:param arg_name: The unique name of the file defined in the module.
:type arg_name: str
Returns a list of output files.
"""
return self.runs[run_index]["outputs"][module_name][arg_name]
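# A minimal illustration of the add/get pairing above (the module, argument,
# and file names are made up; real entries follow the convention documented
# in add_output_file):
#
#   run_data.add_output_file(0, "align", "align_output", {"file_name": "exp1_s1.bam", "prefix": "exp1_s1"})
#   run_data.get_output_files(0, "align", "align_output")
#   # -> [{"file_name": "exp1_s1.bam", "prefix": "exp1_s1"}]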
def get_genome_files(self, assembly, tool):
"""
:param assembly: The assembly used for the genome. (i.e. grch38p6)
:type assembly: str
:param tool: The tool used to create the genome. (i.e. bwa)
:type tool: str
Gets genome files from the run data. Calling get_genome_files is
clearer than doing the dictionary access directly.
"""
return self.genomes[assembly][tool]
def get_chrom_sizes(self, assembly):
"""
:param assembly: The assembly used for the genome. (i.e. grch38p6)
:type assembly: str
Gets the chrom.sizes file for the genome.
"""
return self.genomes[assembly]["chrom.sizes"]
def _validate_genomes(self):
"""
Makes sure the genome data is safe to load.
"""
for assembly in self.data["genomes"]:
for tool in [key for key in self.data["genomes"][assembly] if key != "chrom.sizes"]:
base_file = self.data["genomes"][assembly][tool]
if not os.path.isfile(base_file):
self.err += "Genome defined with tool '%s' and assembly '%s' is missing base file %s.\n" % (tool, assembly, base_file)
prefix = base_file if tool == "bwa" else os.path.splitext(base_file)[0]
missing = []
for ext in chipathlon.conf.genomes[tool]["additional_files"]:
if not os.path.isfile("%s.%s" % (prefix, ext)):
missing.append(ext)
if missing:
self.err += "Genome defined with assembly '%s' and tool '%s' is missing additional_files with extensions: %s.\n" % (assembly, tool, missing)
if "chrom.sizes" in self.data["genomes"][assembly]:
if not os.path.isfile(self.data["genomes"][assembly]["chrom.sizes"]):
self.err += "Genome defined with assembly '%s' is missing chrom.sizes file %s.\n" % (assembly, self.data["genomes"][assembly]["chrom.sizes"])
else:
self.err += "Genome defined with assembly '%s' does not have definition fro chrom.sizes.\n" % (assembly,)
return
def _load_genomes(self):
"""
Stores all genome files in the variable self.genomes. Files are
indexed in the following manner:
self.genomes = {
"grch38p6": {
"chrom.sizes": file_name,
"bwa": {
"base_file": file_name,
"additional_files": {
"ext1": file_name.ext1,
"ext2": file_name.ext2
}
},
...
},
...
}
"""
self._validate_genomes()
if not self.err:
self.genomes = {}
for assembly in self.data["genomes"]:
self.genomes[assembly] = {}
for tool in [key for key in self.data["genomes"][assembly] if key != "chrom.sizes"]:
self._load_genome_files(self.data["genomes"][assembly][tool], assembly, tool)
chrom_name = "%s_%s" % (assembly, os.path.basename(self.data["genomes"][assembly]["chrom.sizes"]))
self._add_file(chrom_name, self.data["genomes"][assembly]["chrom.sizes"], "local")
self.genomes[assembly]["chrom.sizes"] = chrom_name
self.genomes[assembly]["chr_fasta"] = []
# Load chromosome files here:
base_dir = os.path.dirname(os.path.dirname(self.data["genomes"][assembly]["chrom.sizes"] + "/"))
for root, dirs, files in os.walk(base_dir):
for f in files:
if f.startswith("chr"):
self._add_file(f, root + "/" + f, "local")
self.genomes[assembly]["chr_fasta"].append(f)
break
else:
print(self.err)
raise SystemExit(1)
return
def _load_genome_files(self, base_file_path, assembly, tool):
"""
:param base_file_path: Path to the main genome file. (i.e. gca000001405_21_grch38p6_genomic.fna)
:type base_file_path: str
:param assembly: The assembly used to create the genome. (i.e. grch38p6)
:type assembly: str
:param tool: The tool used to create the genome. (i.e. bwa)
:type tool: str
Loads all necessary genome files. For bwa this loads the main
.fna file, as well as .fna.pac, .fna.sa, .fna.amb, and .fna.bwt.
For bowtie2, this loads the main .fna file, as well as .rev.1.bt2,
.rev.2.bt2, .1.bt2, .2.bt2, .3.bt2, and .4.bt2.
"""
genome_files = self.genomes[assembly][tool] = {}
base_file_prefix, base_file_ext = os.path.splitext(base_file_path)
gen_prefix = "genome_%s_%s" % (assembly, tool)
# Remove the period
base_file_ext = base_file_ext[1:]
base_file_name = "%s.%s" % (gen_prefix, base_file_ext)
genome_files["base_file"] = base_file_name
self._add_file(base_file_name, base_file_path, "local")
genome_files["additional_files"] = {}
prefix = base_file_path if tool == "bwa" else os.path.splitext(base_file_path)[0]
# We need that .fna for bwa genomes
if tool == "bwa":
gen_prefix += "." + base_file_ext
for ext in chipathlon.conf.genomes[tool]["additional_files"]:
name = "%s.%s" % (gen_prefix, ext)
path = "%s.%s" % (prefix, ext)
self._add_file(name, path, "local")
genome_files["additional_files"][ext] = name
return
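# For example (paths are assumptions), a bwa genome defined at
# /data/grch38p6/genome.fna would be registered roughly as:
#
#   base_file:        genome_grch38p6_bwa.fna       -> /data/grch38p6/genome.fna
#   additional_files: genome_grch38p6_bwa.fna.<ext> -> /data/grch38p6/genome.fna.<ext>
#                     for each ext in chipathlon.conf.genomes["bwa"]["additional_files"]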
def _load_runs(self):
"""
Load run data and get all sample information.
"""
checked = []
self.runs = self.data["runs"]
for run in self.runs:
if "experiments" in run:
if run["align"] not in chipathlon.conf.align_tools:
self.err += "Error parsing run template file %s.\nAlignment tool '%s' not supported." % \
(self.run_file, run["align"])
break
if run["peak"] not in chipathlon.conf.peak_tools:
self.err += "Error parsing run template file %s.\nPeak calling tool '%s' not supported." % \
(self.run_file, run["peak"])
break
run["samples"] = {}
run["outputs"] = {}
for experiment_id in run["experiments"]:
valid, msg, samples = self.mdb.get_samples(experiment_id)
if valid:
run["samples"][experiment_id] = samples
else:
self.err += msg
check = (run["genome"], run["align"])
if check not in checked:
valid, msg = self._valid_genome(run)
if not valid:
self.err += msg
checked.append(check)
else:
self.err += "Error parsing run template file '%s'. Required key 'experiment' not defined.\n" % (self.run_file,)
if self.err:
print(self.err)
raise SystemExit(1)
return
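# After a successful load, each run dict gains two bookkeeping keys (sketch,
# values abbreviated):
#
#   run["samples"] = {experiment_id: {"experiment": [...], "control": [...]}}
#   run["outputs"] = {}   # later filled in by add_output_file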
def _valid_genome(self, run):
"""
:param run: The run to check.
:type run: dict
Check that the provided run has a genome that exists.
"""
valid = False
if run["genome"] in self.genomes:
if run["align"] in self.genomes[run["genome"]]:
valid = True
msg = "Genome is valid."
else:
msg = "Alignment tool '%s' not defined for genome '%s'." % (run["align"], run["genome"])
else:
msg = "Run genome '%s' not defined in genome data." % (run["genome"],)
return (valid, msg)
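# A hedged end-to-end sketch; only the RunData call signature comes from this
# file, the MongoDB wrapper construction is an assumption:
#
#   mdb = chipathlon.db.MongoDB(...)  # hypothetical constructor arguments
#   run_data = RunData("run.yaml", mdb)
#   genome_files = run_data.get_genome_files(run_data.runs[0]["genome"], run_data.runs[0]["align"])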