db.py 6.47 KB
Newer Older
aknecht2's avatar
aknecht2 committed
1
from pymongo import MongoClient
2
3
4
import gridfs
import sys
import traceback
5
from pprint import pprint
aknecht2's avatar
aknecht2 committed
6

7
8
9
class MongoDB(object):
    """Helper around the ``chipseq`` MongoDB database.

    Holds an authenticated database handle (``self.db``) and a GridFS
    handle (``self.gfs``), and offers queries over the ``experiments``
    and ``samples`` collections.
    """

    # Conditions an experiment must satisfy before it is considered usable:
    # it has a target, has no revoked files, and has exactly one assembly.
    # Always copy this dict before extending it -- it is shared class state.
    _REQUIRED_METADATA = {
        "target": {"$exists": True},
        "revoked_files.0": {"$exists": False},
        "assembly.0": {"$exists": True},
        "assembly.1": {"$exists": False},
    }

    def __init__(self, host, username, password):
        """Connect to ``host`` and authenticate against the chipseq db.

        :param host: Host (or connection string) passed to MongoClient.
        :param username: User to authenticate as.
        :param password: Password for ``username``.

        If authentication fails the traceback is printed and the process
        exits with status 1.
        """
        self.client = MongoClient(host)
        self.db = self.client.chipseq
        try:
            self.db.authenticate(username, password, mechanism="SCRAM-SHA-1")
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not reported as authentication failures.
            print("Could not authenticate to db %s!" % (host,))
            print(traceback.format_exc())
            sys.exit(1)
        self.gfs = gridfs.GridFS(self.db)

    def load_bed(self, collection, result_id, bed_file, attributes=None):
        """Placeholder: currently does nothing and returns None.

        :param collection: Unused.
        :param result_id: Unused.
        :param bed_file: Unused.
        :param attributes: Unused; defaults to None rather than a shared
            mutable dict.
        """
        return

    def check_valid_samples(self):
        """Count how many experiments with complete metadata have samples.

        Experiments matching the required-metadata filter are joined with
        the ``samples`` collection (``experiments.uuid`` ==
        ``samples.experiment_id``).

        :returns: Tuple ``(has_samples, total)`` where ``total`` is the
            number of matching experiments and ``has_samples`` is how many
            of those have at least one joined sample.
        """
        cursor = self.db.experiments.aggregate([
            {"$match": dict(self._REQUIRED_METADATA)},
            {
                "$lookup": {
                    "from": "samples",
                    "localField": "uuid",
                    "foreignField": "experiment_id",
                    "as": "samples",
                }
            },
        ])
        total = 0
        has_samples = 0
        for document in cursor:
            total += 1
            if document["samples"]:
                has_samples += 1
        return (has_samples, total)

    @staticmethod
    def _fastq_only(samples):
        """Return the samples whose ``file_type`` is exactly ``"fastq"``."""
        return [s for s in samples if s.get("file_type") == "fastq"]

    def get_samples(self, experiment_id):
        """Fetch the fastq inputs for an experiment and its controls.

        :param experiment_id: Experiment accession; it is looked up by the
            ``@id`` value ``/experiments/<experiment_id>/``.
        :returns: Tuple ``(valid, msg, data)``.  On success ``valid`` is
            True and ``data`` is ``{"control": [...], "experiment": [...]}``
            holding the fastq sample documents.  On failure ``valid`` is
            False, ``msg`` explains why, and ``data`` is an empty dict.
        """
        experiments = self.db.experiments
        exp_ref = "/experiments/%s/" % (experiment_id,)

        # First, check to make sure the target experiment exists.
        if experiments.find({"@id": exp_ref}).count() != 1:
            msg = "Experiment with id '%s' does not exist." % (experiment_id,)
            return (False, msg, {})

        # Next, check that all required metadata is defined.
        metadata_filter = dict(self._REQUIRED_METADATA)
        metadata_filter["@id"] = exp_ref
        if experiments.find(metadata_filter).count() != 1:
            msg = "Experiment with id '%s' does not have all required metadata (assembly, target, no revoked_files)." % (experiment_id,)
            return (False, msg, {})

        # Next, check that there is at least 1 possible control.
        controls_filter = dict(metadata_filter)
        controls_filter["possible_controls.0"] = {"$exists": True}
        if experiments.find(controls_filter).count() != 1:
            msg = "Experiment with id '%s' does not have possible_controls." % (experiment_id,)
            return (False, msg, {})

        # Aggregation pipeline:
        # 1. Find the experiment that matches the given id.
        # 2. Join the experiment's samples in by experiment uuid.
        # 3. Unwind possible_controls into one document per control.
        # 4. Join each control's samples in by the control's uuid.
        # 5. Re-group by experiment, collecting controls and samples
        #    back into arrays.
        pipeline = [
            {"$match": controls_filter},
            {
                "$lookup": {
                    "from": "samples",
                    "localField": "uuid",
                    "foreignField": "experiment_id",
                    "as": "samples",
                }
            },
            {"$unwind": "$possible_controls"},
            {
                "$lookup": {
                    "from": "samples",
                    "localField": "possible_controls.uuid",
                    "foreignField": "experiment_id",
                    "as": "possible_controls.samples",
                }
            },
            {
                "$group": {
                    "_id": "$_id",
                    "possible_controls": {"$push": "$possible_controls"},
                    "samples": {"$push": "$samples"},
                }
            },
        ]
        cursor = experiments.aggregate(pipeline)
        # We should have only 1 document.
        document = next(cursor)

        control_inputs = [
            sample
            for control in document["possible_controls"]
            for sample in self._fastq_only(control["samples"])
        ]
        # $group pushed one copy of the experiment's sample list per
        # unwound control, so only the first copy is read.
        experiment_inputs = self._fastq_only(document["samples"][0])

        if not (control_inputs and experiment_inputs):
            msg = "Experiment with id '%s' has %s possible control inputs, and %s possible experiment inputs." % (experiment_id, len(control_inputs), len(experiment_inputs))
            return (False, msg, {})

        msg = "Succesfully retrieved input files for experiment with id '%s'." % (experiment_id,)
        data = {
            "control": control_inputs,
            "experiment": experiment_inputs,
        }
        return (True, msg, data)