run_parser.py 5.46 KB
Newer Older
1
from chipathlon.run import Run
2
3
import yaml
from chipathlon.genome import Genome
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

class RunParser(object):
    """
    Parses a run yaml file into Genome and Run objects, collecting any
    problems found along the way instead of raising them.
    """

    # Keys every entry under 'runs' must define before a Run is created.
    REQUIRED_RUN_KEYS = ("assembly", "align", "peak", "signal1", "control1", "file_type")

    def __init__(self, run_file, mdb):
        """
        :param run_file: The path to the input run yaml file.
        :type run_file: str
        :param mdb: The MongoDB instance to load sample data from.
        :type mdb: chipathlon.db.MongoDB

        This class parses an input file to return a list of
        properly instantiated Run classes.  YAML syntax errors are recorded
        in the error list (see is_valid / get_error_string) rather than
        raised.
        """
        self.run_file = run_file
        self.errors = []
        self.runs = []
        self.genomes = {}
        self.parse_error = False
        # Always defined, even when the yaml file cannot be parsed.
        self.yaml_data = None

        try:
            with open(self.run_file, "r") as rh:
                # NOTE(review): yaml.load without an explicit Loader can
                # construct arbitrary Python objects from tagged input, and
                # recent PyYAML releases require a Loader argument.  Consider
                # yaml.safe_load if run files never rely on Python-specific
                # tags -- left unchanged here to preserve behavior.
                self.yaml_data = yaml.load(rh)
        except yaml.YAMLError as e:
            # Record the failure so is_valid() reports it; nothing else can
            # be loaded from an unparsable file.
            self.errors.append("Error parsing run template file [%s]: %s." % (self.run_file, e))
            self.parse_error = True
        else:
            self._load_runs(mdb)

    def is_valid(self):
        """
        Checks if the run is valid.

        :returns: True when no errors were recorded and no genome / run
            failed its own validation.
        """
        return len(self.errors) == 0 and not self.parse_error

    def get_error_string(self):
        """
        Returns the errors as a newline separated string.

        The parser's own errors come first, followed by the error strings of
        every loaded genome and then of every loaded run.
        """
        return "\n".join([
            "\n".join(self.errors),
            "\n".join([genome.get_error_string() for genome in self.get_genomes()]),
            "\n".join([run.get_error_string() for run in self.runs])
        ])

    def get_run(self, index):
        """
        :param index: The ith spot in the yaml file
        :type index: int
        :returns: The run stored at that position, or None when the index is
            out of range (in either direction).
        """
        try:
            return self.runs[index]
        except IndexError:
            return None

    def get_runs(self):
        """
        :returns: The list of all loaded runs.
        """
        return self.runs

    def yield_runs(self):
        """
        Generator over all loaded runs, in load order.
        """
        for run in self.runs:
            yield run

    def get_genome(self, assembly, tool):
        """
        :param assembly: The assembly name the genome was defined under.
        :param tool: The alignment tool key the genome was defined under.
        :returns: The matching genome, or None if no such genome was loaded.
        """
        if assembly in self.genomes:
            if tool in self.genomes[assembly]:
                return self.genomes[assembly][tool]
        return None

    def get_genomes(self):
        """
        :returns: A flat list of every loaded genome across all assemblies
            and tools.
        """
        return [genome for gen_info in self.genomes.values() for genome in gen_info.values()]

    def _load_genomes(self):
        """
        Validate that the correct genomic information exists, we leave it up
        to the genome class to validate the content.
        """
        if "genomes" not in self.yaml_data:
            self.errors.append("Error parsing run file [%s]: No 'genomes' key defined." % (self.run_file,))
            return

        # .items() rather than .iteritems() so this works on Python 2 and 3.
        for assembly, gen_data in self.yaml_data["genomes"].items():
            if "chrom.sizes" not in gen_data:
                self.errors.append("Error parsing run file[%s]: Genome defined under assembly [%s] has no 'chrom.sizes' key." % (self.run_file, assembly))
                continue

            # Every key other than 'chrom.sizes' names an alignment tool.
            tools = [key for key in gen_data if key != "chrom.sizes"]
            if not tools:
                self.errors.append("Error parsing run file[%s]: Genome defined under assembly [%s] has no alignment tools defined." % (self.run_file, assembly))
                continue

            for tool in tools:
                genome = Genome(assembly, tool, gen_data[tool], gen_data["chrom.sizes"])
                if not genome.is_valid():
                    self.parse_error = True
                # Invalid genomes are still stored so their error strings
                # show up in get_error_string().
                self.genomes.setdefault(assembly, {})[tool] = genome

    def _load_runs(self, mdb):
        """
        Validate that the correct run information exists, we leave it up to
        the run class to validate the content.

        :param mdb: The MongoDB instance handed to each Run.
        """
        # An empty file parses to None and a bare list / scalar is not a
        # mapping either; report that instead of crashing on key lookups.
        if not isinstance(self.yaml_data, dict):
            self.errors.append("Error parsing run file [%s]: Expected a top-level mapping with 'genomes' and 'runs' keys." % (self.run_file,))
            self.parse_error = True
            return

        self._load_genomes()
        # Runs reference genomes, so there is no point loading them when a
        # genome already failed validation.
        if self.parse_error:
            return

        if "runs" not in self.yaml_data:
            self.errors.append("Error parsing run file [%s]: No 'runs' key defined." % (self.run_file,))
            return

        for i, yaml_run in enumerate(self.yaml_data["runs"]):
            missing_keys = [key for key in self.REQUIRED_RUN_KEYS if key not in yaml_run]
            if missing_keys:
                for req_key in missing_keys:
                    self.errors.append("Error parsing run file[%s]:  Run #%s is missing required key '%s'." % (self.run_file, i, req_key))
                self.parse_error = True
                continue

            genome = self.get_genome(yaml_run.get("assembly"), yaml_run.get("align"))
            if genome is None:
                self.errors.append("Error parsing run file[%s]: Run #%s references non-existant genome with tool=%s and assembly=%s" % (self.run_file, i, yaml_run.get("align"), yaml_run.get("assembly")))
                continue

            run = Run(
                mdb,
                genome,
                yaml_run.get("peak"),
                yaml_run.get("signal1"),
                yaml_run.get("control1"),
                signal2=yaml_run.get("signal2"),
                control2=yaml_run.get("control2"),
                file_type=yaml_run.get("file_type"),
                idr=yaml_run.get("idr")
            )
            if not run.is_valid():
                self.parse_error = True
            # Invalid runs are kept so their errors can be reported.
            self.runs.append(run)