Commit 0119ee2a authored by aknecht2


Added genome validation.  Added module loading with minor validation.  Added required file detection in modules.
parent dc6a2061
@@ -15,7 +15,7 @@ file_extensions = {
"bam": ["bam"],
"bed": ["bed", "narrowPeak", "broadPeak"],
"bwa_genome": ["amb", "ann", "bwt", "pac", "sac"],
"bowtie2_genome": [".bt2"]
"bowtie2_genome": ["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"]
}
# param keys
@@ -26,3 +26,15 @@ param_keys = {
# workflow order
workflow = ["align", "remove_duplicates", "peak_calling"]
# genome info
genomes = {
"bwa": {
"base_file": file_extensions["genome_index"],
"additional_files": file_extensions["bwa_genome"]
},
"bowtie2": {
"base_file": file_extensions["genome_index"],
"additional_files": file_extensions["bowtie2_genome"]
}
}
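The genomes map ties each aligner to the index extensions that must exist alongside the base genome file. As a rough illustration, assuming a hypothetical bowtie2 index built from a base file hg19.fa, the validation code later in this commit expects these companions on disk:
# Sketch only: "hg19.fa" is a hypothetical base file; the prefix is everything before the first dot.
prefix = "hg19.fa".split(".", 1)[0]
expected = [prefix + "." + ext for ext in file_extensions["bowtie2_genome"]]
# -> ["hg19.1.bt2", "hg19.2.bt2", "hg19.3.bt2", "hg19.4.bt2", "hg19.rev.1.bt2", "hg19.rev.2.bt2"]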
@@ -50,6 +50,26 @@ class MongoDB(object):
has_samples += 1
return (has_samples, total)
def get_assembly(self, experiment_id):
valid = True
msg = ""
data = ""
cursor = self.db.experiments.find({
"target": {"$exists": True},
"revoked_files.0": {"$exists": False},
"assembly.0": {"$exists": True},
"assembly.1": {"$exists": False},
"@id": "/experiments/%s/" % (experiment_id,)
})
if cursor.count() == 1:
document = cursor.next()
data = document["assembly"][0]
msg = "Succesfully retrieved assembly for experiment with id '%s'." % (experiment_id,)
else:
valid = False
msg = "Experiment with id '%s' does not exist." % (experiment_id,)
return (valid, msg, data)
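get_assembly returns a (valid, msg, data) tuple instead of raising, so callers branch on the flag. A minimal usage sketch, assuming an instantiated MongoDB wrapper named mdb and a made-up experiment accession:
# Sketch only: "ENCSR000EUA" is a hypothetical accession.
valid, msg, assembly = mdb.get_assembly("ENCSR000EUA")
if valid:
    print "Assembly for experiment: %s" % (assembly,)
else:
    print msg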
def get_samples(self, experiment_id):
valid = True
msg = ""
......
peak:
macs2:
- run_macs2
- sort_macs2
spp:
- run_spp
- unzip_spp
duplicates:
- samtools_filter
- picard_sort
- picard_remove_duplicates
- bedtools_bam_to_bed
bwa_align_paired:
bowtie2_align_paired:
inputs:
- genome_index
- fastq
......
bwa_align_single:
bowtie2_align_single:
inputs:
- genome_index
- fastq
......
@@ -15,6 +15,14 @@ bwa_align_paired:
"-t": 1
walltime: 2000
memory: 2000
bowtie2_align_single:
arguments: null
walltime: 2000
memory: 2000
bowtie2_align_paired:
arguments: null
walltime: 2000
memory: 2000
samtools_sam_to_bam:
arguments: null
walltime: 2000
......
@@ -8,6 +8,9 @@ import xml.dom.minidom
import yaml
import traceback
import chipathlon
import chipathlon.yaml_job
import chipathlon.db
import chipathlon.workflow_module
from pprint import pprint
from Pegasus.DAX3 import *
@@ -42,14 +45,14 @@ class Workflow(object):
return
def generate(self):
self.load_modules()
self.load_runs()
self._load_yaml_jobs()
self._load_modules()
if not self.err:
self.load_yaml_jobs()
self._load_runs()
if not self.err:
print "Put the rest of the generating functions hurrrr."
self.load_executables()
# self.info()
self._load_executables()
self.info()
else:
print self.err
@@ -59,7 +62,7 @@ class Workflow(object):
sys.exit(1)
return
def load_executables(self, os_type="linux", arch="x86_64"):
def _load_executables(self, os_type="linux", arch="x86_64"):
# Load wrapper scripts for commands that need to be loaded from module
for root, dirs, files in os.walk("%s/%s" % (os.path.dirname(os.path.realpath(__file__)), chipathlon.conf.job_wrappers)):
for f in files:
@@ -74,23 +77,25 @@ class Workflow(object):
self.executables[f] = Executable(name=f, os=os_type, arch=arch)
self.executables[f].addPFN(PFN("file://%s/%s" % (root, f), "condorpool"))
self.dax.addExecutable(self.executables[f])
break
return
def load_modules(self):
for root, dirs, files in os.walk("%s/%s" % (os.path.dirname(os.path.realpath(__file__)), chipathlon.conf.job_modules)):
def _load_modules(self):
for root, dirs, files in os.walk(os.path.join(os.path.dirname(os.path.realpath(__file__)), chipathlon.conf.job_modules)):
for f in files:
with open("%s/%s" % (root, f), "r") as rh:
mdata = yaml.load(rh)
root_key = mdata.keys()[0]
self.modules[root_key] = mdata[root_key]
mod = chipathlon.workflow_module.WorkflowModule(os.path.join(root, f), self.yaml_jobs)
if not mod.err:
self.modules[mod.name] = mod
else:
self.err += mod.err
break
return
def load_runs(self):
def _load_runs(self):
try:
with open(self.run_file, "r") as rh:
self.run_data = yaml.load(rh)
for run in self.run_data:
for run in self.run_data["runs"]:
for module_name in self.modules:
if isinstance(self.modules[module_name], dict):
keys = self.modules[module_name].keys()
@@ -106,6 +111,7 @@ class Workflow(object):
run["data"] = data
else:
self.err += msg
self._check_genomes(run)
else:
self.err += "Error parsing run template file '%s'. Required key 'experiment' not defined.\n" % (self.run_file,)
except:
@@ -113,12 +119,41 @@ class Workflow(object):
self.err += traceback.format_exc()
return
def load_yaml_jobs(self):
def _check_genomes(self, run):
valid, msg, assembly = self.mdb.get_assembly(run["experiment"])
if valid:
if run["align"] in self.run_data["genomes"]:
if assembly in self.run_data["genomes"][run["align"]]:
base_file = self.run_data["genomes"][run["align"]][assembly]
if os.path.isfile(base_file):
if base_file.split(".", 1)[1] in chipathlon.conf.genomes[run["align"]]["base_file"]:
prefix = base_file.split(".", 1)[0]
missing = []
for ext in chipathlon.conf.genomes[run["align"]]["additional_files"]:
if not os.path.isfile(prefix + "." + ext):
missing.append(ext)
if len(missing) > 0:
self.err += "Genome defined with tool '%s' and assembly '%s' is missing additional_files with extensions %s.\n" % (run["align"], assembly, missing)
else:
self.err += "Genome defined with tool '%s' and assembly '%s', has invalid extension. Should be one of %s.\n" % (run["align"], assembly, chipathlon.conf.genomes[run["align"]]["base_file"])
else:
self.err += "Genome defined with tool '%s' and assembly '%s', has non-existant base file '%s'.\n" % (run["align"], assembly, base_file)
else:
self.err += "Alignment defined with tool '%s' for assembly '%s', no corresponding genome definition.\n" % (run["align"], assembly)
else:
self.err += "Alignment defined with tool '%s', no corresponding genome definition.\n" % (run["align"])
else:
self.err += "DB error: %s.\n" % (msg, )
return
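_check_genomes expects the run file to carry a per-run "align" tool plus a top-level "genomes" section keyed first by tool and then by assembly. A hedged sketch of run data that would pass this check, with the accession, assembly name, and path invented for illustration (and assuming "fa" is an accepted genome_index extension):
# Hypothetical contents of self.run_data after yaml.load(self.run_file):
run_data = {
    "runs": [
        {"experiment": "ENCSR000EUA", "align": "bowtie2"}
    ],
    "genomes": {
        "bowtie2": {
            "hg19": "/data/indexes/hg19.fa"  # base file; hg19.1.bt2 ... hg19.rev.2.bt2 must sit beside it
        }
    }
}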
def _load_yaml_jobs(self):
for root, dirs, files in os.walk("%s/%s" % (os.path.dirname(os.path.realpath(__file__)), chipathlon.conf.job_params)):
for f in files:
yj = chipathlon.yaml_job.YamlJob("%s/%s" % (root, f), self.param_file)
self.err += yj.err
yj = chipathlon.yaml_job.YamlJob(os.path.join(root, f), self.param_file)
if not yj.err:
self.yaml_jobs[yj.jobname] = yj
else:
self.err += yj.err
break
return
......
@@ -7,12 +7,12 @@ import textwrap
import xml.dom.minidom
import yaml
import traceback
import chipathlon
import re
from pprint import pprint
from Pegasus.DAX3 import *
class WorkflowModule(Object):
class WorkflowModule(object):
def __init__(self, module_yaml, yaml_jobs):
# module_yaml is the module yaml file
@@ -22,32 +22,61 @@ class WorkflowModule(Object):
with open(module_yaml, "r") as rh:
self.data = yaml.load(rh)
self.name = self.data.keys()[0]
self.input_files = []
self.additional_inputs = []
self.output_files = []
self.splits = {}
self.workflow = {}
self.markers = {}
self.yaml_jobs = yaml_jobs
self._construct()
self._load_markers(self.data[self.name])
except:
self.err += "Error parsing job template yaml file.\n"
self.err += traceback.format_exc()
return
def _construct(self):
# Here's where things get complicated.
# We need to maintain a complex structure
# of actual jobs, and where they split.
# From there, we will be able to
# specify exactly what inputs a module needs
# to build correctly.
for key in self.data[self.name]:
self.workflow[key] = self._construct_mod(self.data[self.name][key])
def _load_markers(self, data):
# Finds all potential splitting points and their values
for item in data:
for key in item.keys():
if key in self.yaml_jobs:
return
elif "[" in key and "]" in key:
val, marker, dummy = re.split("[\[\]]", key)
if marker not in self.markers:
self.markers[marker] = []
if val not in self.markers[marker]:
self.markers[marker].append(val)
self._load_markers(item[key])
return
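_load_markers treats any key of the form value[marker] as a potential split point and records the values seen for each marker. The parsing it relies on is a plain bracket split; the key below is illustrative only:
import re
# "single[read_end]" is a hypothetical module key.
val, marker, dummy = re.split("[\[\]]", "single[read_end]")
# val == "single", marker == "read_end", dummy == "" (the trailing empty string after "]")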
def _construct_mod(self, data):
# We always have a list!
def _check_input_markers(self, markers):
valid = True
msg = ""
for marker in markers:
if marker in self.markers:
if markers[marker] not in self.markers[marker]:
msg += "Marker '%s' does not have value '%s', should be one of %s.\n" % (marker, markers[marker], self.markers[marker])
valid = False
else:
msg += "Marker '%s' does not exist.\n" % (marker,)
valid = False
return (valid, msg)
def _load_required_files(self, data, markers, input_files = [], additional_files = [], output_files = []):
for item in data:
if item not in self.yaml_jobs:
pass
for key in item.keys():
if key in self.yaml_jobs:
for ftype, flist in zip(["inputs", "additional_inputs", "outputs"], [input_files, additional_files, output_files]):
if item[key][ftype] is not None:
for f in item[key][ftype]:
if f not in flist and f not in output_files:
flist.append(f)
elif "[" in key and "]" in key:
val, marker, dummy = re.split("[\[\]]", key)
if markers[marker] == val:
self._load_required_files(item[key], markers, input_files, additional_files, output_files)
return (input_files, additional_files, output_files)
def print_required_files(self, markers):
valid, msg = self._check_input_markers(markers)
if valid:
print self._load_required_files(self.data[self.name], markers)
else:
print msg
return
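print_required_files validates the caller's markers before resolving files, so exercising a module looks roughly like the sketch below; the module path and marker names are assumptions, and yaml_jobs is the dict built by Workflow._load_yaml_jobs:
# Sketch only: file path and marker values are hypothetical.
mod = chipathlon.workflow_module.WorkflowModule("jobs/modules/align.yaml", yaml_jobs)
if not mod.err:
    mod.print_required_files({"tool": "bowtie2", "read_end": "single"})
else:
    print mod.err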
@@ -21,7 +21,10 @@ class YamlJob(object):
try:
with open(param_file, "r") as rh:
params = yaml.load(rh)
if self.jobname in params:
self.params = params[self.jobname]
else:
self.err += "Params file does not contain definition for '%s'.\n" % (self.jobname,)
except:
self.err += "Error parsing params yaml file.\n"
self.err += traceback.format_exc()
......