db.py 6.94 KB
Newer Older
aknecht2's avatar
aknecht2 committed
1
from pymongo import MongoClient
2
3
4
import gridfs
import sys
import traceback
5
from pprint import pprint
aknecht2's avatar
aknecht2 committed
6

7
8
9
class MongoDB(object):
    """Wrapper around the ``chipseq`` MongoDB database.

    Holds the client, the database handle and a GridFS handle, and offers
    helpers to inspect experiments and collect their input samples.
    """

    # Filter shared by every query that requires complete experiment
    # metadata: a target, no revoked files, and exactly one assembly
    # (element 0 exists, element 1 does not).  Always copy before extending.
    _REQUIRED_METADATA = {
        "target": {"$exists": True},
        "revoked_files.0": {"$exists": False},
        "assembly.0": {"$exists": True},
        "assembly.1": {"$exists": False},
    }

    def __init__(self, host, username, password):
        """Connect to ``host`` and authenticate against the ``chipseq`` db.

        If authentication raises, an error message and the traceback are
        printed and the process exits with status 1.

        :param host: Host passed straight to ``MongoClient``.
        :param username: User to authenticate as.
        :param password: Password for ``username``.
        """
        self.client = MongoClient(host)
        self.db = self.client.chipseq
        try:
            self.db.authenticate(username, password, mechanism="SCRAM-SHA-1")
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit should propagate instead of being reported as an
            # authentication failure.
            print("Could not authenticate to db %s!" % (host,))
            # Parenthesised so the line parses on both Python 2 and 3.
            print(traceback.format_exc())
            sys.exit(1)
        self.gfs = gridfs.GridFS(self.db)

    def load_bed(self, collection, result_id, bed_file, attributes=None):
        """Placeholder for loading a bed file; currently does nothing.

        ``attributes`` defaults to ``None`` rather than a dict literal so
        that no mutable object is shared between calls once this method is
        implemented.

        :returns: ``None``.
        """
        return

    def check_valid_samples(self):
        """Count experiments with complete metadata that have samples.

        Experiments matching the required-metadata filter are joined with
        the ``samples`` collection on ``experiment_id``.

        :returns: ``(has_samples, total)`` where ``total`` is the number of
            matching experiments and ``has_samples`` is how many of those
            have at least one joined sample.
        """
        cursor = self.db.experiments.aggregate([
            {
                "$match": dict(self._REQUIRED_METADATA)
            },
            {
                "$lookup": {
                    "from": "samples",
                    "localField": "_id",
                    "foreignField": "experiment_id",
                    "as": "samples"
                }
            }
        ])
        total = 0
        has_samples = 0
        for document in cursor:
            total += 1
            if len(document["samples"]) > 0:
                has_samples += 1
        return (has_samples, total)

    @staticmethod
    def _is_fastq(sample):
        """Return True if the sample document has filetype ``fastq``."""
        return sample.get("filetype") == "fastq"

    def get_samples(self, experiment_id):
        """Collect the fastq inputs for an experiment and its controls.

        The experiment is validated in three steps (it exists, it has the
        required metadata, it lists at least one possible control) before
        the aggregation is run, so that the returned message can name the
        first requirement that failed.

        :param experiment_id: Id inserted into ``/experiments/<id>/`` to
            match the experiment's ``@id`` field.
        :returns: ``(valid, msg, data)``.  On success ``valid`` is True and
            ``data`` is ``{"control": [...], "experiment": [...]}``, each a
            list of sample documents whose ``filetype`` is ``"fastq"``.
            On failure ``valid`` is False, ``msg`` explains why and
            ``data`` is an empty dict.
        """
        experiments = self.db.experiments

        # First, check to make sure the target experiment exists.
        id_query = {"@id": "/experiments/%s/" % (experiment_id,)}
        if experiments.find(dict(id_query)).count() != 1:
            msg = "Experiment with id '%s' does not exist." % (experiment_id,)
            return (False, msg, {})

        # Next, check that all metadata is defined.
        metadata_query = dict(self._REQUIRED_METADATA)
        metadata_query.update(id_query)
        if experiments.find(dict(metadata_query)).count() != 1:
            msg = "Experiment with id '%s' does not have all required metadata (assembly, target, no revoked_files)." % (experiment_id,)
            return (False, msg, {})

        # Next, check that there is at least 1 possible control.
        full_query = dict(metadata_query)
        full_query["possible_controls.0"] = {"$exists": True}
        if experiments.find(dict(full_query)).count() != 1:
            msg = "Experiment with id '%s' does not have possible_controls." % (experiment_id,)
            return (False, msg, {})

        # Aggregation pipeline steps:
        # 1. Find the experiment that matches the given id
        # 2. Join samples into the collection by exp_id
        # 3. Iterate through possible_controls
        # 4. Join possible_control data into control_exps
        # 5. Iterate through control_exps
        # 6. Join samples into the control_exps by exp_id
        # 7. Re-aggregate all data into arrays
        cursor = experiments.aggregate([
            {
                "$match": dict(full_query)
            },
            {
                "$lookup": {
                    "from": "samples",
                    "localField": "_id",
                    "foreignField": "experiment_id",
                    "as": "samples"
                }
            },
            {
                "$unwind": "$possible_controls"
            },
            {
                "$lookup": {
                    "from": "experiments",
                    "localField": "possible_controls",
                    "foreignField": "@id",
                    "as": "control_exps"
                }
            },
            {
                "$unwind": "$control_exps"
            },
            {
                "$lookup": {
                    "from": "samples",
                    "localField": "control_exps._id",
                    "foreignField": "experiment_id",
                    "as": "control_exps.samples"
                }
            },
            {
                "$group": {
                    "_id": "$_id",
                    "possible_controls": {"$push": "$possible_controls"},
                    "control_exps": {"$push": "$control_exps"},
                    "samples": {"$push": "$samples"}
                }
            }
        ])

        # At most one document is expected.  There is none when no listed
        # control matches an experiment: unwinding the empty control_exps
        # array drops every row, so do not let StopIteration escape.
        document = next(cursor, None)
        if document is None:
            msg = "Experiment with id '%s' has possible_controls, but none of them match an experiment in the database." % (experiment_id,)
            return (False, msg, {})

        control_inputs = [
            sample
            for control in document["control_exps"]
            for sample in control["samples"]
            if self._is_fastq(sample)
        ]
        # "$push" of "$samples" repeats the experiment's sample list once
        # per unwound row; every copy is the same, so take the first.
        experiment_inputs = [
            sample for sample in document["samples"][0]
            if self._is_fastq(sample)
        ]

        if not (control_inputs and experiment_inputs):
            msg = "Experiment with id '%s' has '%s' possible control inputs, and '%s' possible experiment inputs." % (experiment_id, len(control_inputs), len(experiment_inputs))
            return (False, msg, {})

        msg = "Succesfully retrieved input files for experiment with id '%s'." % (experiment_id,)
        data = {
            "control": control_inputs,
            "experiment": experiment_inputs
        }
        return (True, msg, data)