#!/usr/bin/env python

3
4
5
6
from pymongo import MongoClient
import argparse
import json
import sys
7
from chipathlon.utils import progress
8
9
import os
import os.path
10

11
# Command-line interface: database connection settings, the input location,
# and the flags that select what gets (re)built.
parser = argparse.ArgumentParser(
    description="Read per-experiment JSON files and create experiment and samples collections."
)

# Database connection options.
parser.add_argument(
    "-H", "--host",
    dest="host",
    default="localhost",
    help="Database host. (default: %(default)s)",
)
parser.add_argument(
    "-u", "--username",
    dest="username",
    help="Database username (if required).",
)
parser.add_argument(
    "-p", "--password",
    dest="password",
    help="Database password (if required).",
)

# Input location and destructive-mode switch.
parser.add_argument(
    "-i", "--input-dir",
    dest="inputdir",
    default=os.getcwd(),
    help="Directory containing per-experiment JSON files.  (default: %(default)s)",
)
parser.add_argument(
    "-d", "--drop",
    dest="drop",
    default=False,
    action="store_true",
    help="Drop data if it exists. (default: %(default)s)",
)

# Restrict the run to the samples collection.
parser.add_argument(
    "-s", "--samples",
    dest="samples",
    default=False,
    action="store_true",
    help="Only recreate the samples collection.",
)

# Suppress progress output.
parser.add_argument(
    "-q", "--quiet",
    action="store_true",
    help="Quiet mode.  Do not print progress information. (default: false)",
)

# Parsed once at import time; the rest of the script reads options from `args`.
args = parser.parse_args()


# Connect to the MongoDB host given on the command line and select the
# "chipseq" database, which every later step reads from and writes to.
client = MongoClient(args.host)
db = client.chipseq

# Authenticate only when a username was supplied.
# NOTE(review): Database.authenticate() is no longer available in PyMongo 4.x;
# this call presumably targets a PyMongo 3.x install -- confirm the driver version.
if args.username:
    db.authenticate(args.username, args.password, mechanism="SCRAM-SHA-1")

# --drop clears existing data before importing.  When --samples is also given
# only the samples collection is rebuilt, so the experiments collection is
# left untouched.
if args.drop:
    if not args.samples:
        db.experiments.drop()
    db.samples.drop()
# Populate the experiments collection from the per-experiment JSON files,
# unless the run is restricted to rebuilding the samples collection.
if not args.samples:
    # Insert all experiment JSON files into DB
    json_filelist = os.listdir(args.inputdir)
    file_count = len(json_filelist)

    if not args.quiet:
        print("Creating experiments collection...")

    for i, json_file in enumerate(json_filelist):
        if json_file.endswith("json"):
            # Read through a context manager so the handle is closed right away
            # instead of lingering until garbage collection.
            with open(os.path.join(args.inputdir, json_file)) as exp_file:
                exp_file_contents = exp_file.read()
            # '$' is a reserved character (operator) in MongoDB.  Some experiments have keys that start
            # with $, which breaks the insert.  Replace with _$ everywhere as a quick-and-dirty workaround.
            exp_file_contents = exp_file_contents.replace('$', '_$')
            exp_json_data = json.loads(exp_file_contents)

            # Inserts stay best-effort (one bad document must not abort the
            # import), but failures are reported instead of silently dropped.
            try:
                db.experiments.insert_one(exp_json_data)
            except Exception as exc:
                sys.stderr.write(
                    "Warning: could not insert experiment file %s: %r\n" % (json_file, exc)
                )

        # Progress advances for every directory entry, JSON or not.
        if not args.quiet:
            progress(i, file_count)

# Maps a two-letter, lowercased assembly-name prefix to an organism label.
# NOTE(review): "gr" presumably targets GRCh* (human) assembly names; a GRCm*
# (mouse) name would share the same prefix and be labelled "human" -- confirm
# that no such assemblies occur in the input data.
organism = {
    "mm": "mouse",
    "hg": "human",
    "ce": "celegans",
    "gr": "human",
    "dm": "fruitfly",
}
# Find the experiments we want to create the samples collection from: only
# documents that carry a "target" field.
cursor = db.experiments.find({
    "target": {"$exists": True}
}, no_cursor_timeout=True)

# The cursor is opened with no_cursor_timeout=True, so it is closed explicitly
# in a finally clause; otherwise an exception in the loop would skip the close.
try:
    # NOTE(review): Cursor.count() is deprecated in later PyMongo 3.x releases
    # and absent from 4.x -- confirm against the driver version in use.
    total = cursor.count()

    # Create the samples collection using the 'files' value from each experiment
    if not args.quiet:
        print("\nCreating samples collection...")

    for i, document in enumerate(cursor):
        for f in document["files"]:
            # Prefer the file's own assembly; otherwise fall back to the first
            # assembly listed on the experiment.  Files with neither are skipped.
            if "assembly" in f:
                assembly = f["assembly"]
            elif "assembly" in document and len(document["assembly"]) > 0:
                assembly = document["assembly"][0]
            else:
                continue

            # Only files that have an accession become samples.
            if "accession" not in f:
                continue

            doc = {}
            doc["experiment_id"] = document["uuid"]
            # The organism is looked up from the first two letters of the assembly name.
            doc["organism"] = organism[assembly[:2].lower()]
            doc["genome"] = assembly
            doc["assembly"] = assembly

            # Third "/"-separated component of the target's @id.
            target_id = document["target"]["@id"]
            target_label = target_id.split("/")[2]
            if "/targets/H" in target_id:
                # NOTE(review): this test matches every target whose label
                # starts with "H", not only histone marks -- confirm intent.
                doc["histone_modification"] = target_label[:2]
                doc["transcription_factor"] = target_label[2:].split("-")[0]
            elif target_label.split("-")[0].lower() == "control":
                doc["control"] = True
            else:
                doc["transcription_factor"] = target_label.split("-")[0]

            doc["cell_type"] = document["biosample_term_name"]
            # href and the download URL are URL paths, so they are split and
            # joined on "/" explicitly rather than through os.path, whose
            # separator depends on the platform.
            doc["filename"] = f["href"].rsplit("/", 1)[-1]
            doc["url"] = "/".join(("encodeproject.org", "files", f["accession"], "@@download"))

            # Insert the derived fields, then copy every field of the file
            # record onto the new sample.  Failures stay non-fatal but are
            # reported instead of being silently discarded.
            try:
                result = db.samples.insert_one(doc)
                db.samples.update_one({'_id': result.inserted_id}, {"$set": f})
            except Exception as exc:
                sys.stderr.write(
                    "Warning: could not store sample %s: %r\n" % (f["accession"], exc)
                )

        # Progress advances once per experiment document.
        if not args.quiet:
            progress(i, total)

    # Emit a final line break after the progress output; skipped in quiet
    # mode, where nothing was printed.
    if not args.quiet:
        print("")
finally:
    cursor.close()