# import.py
# Import all test data into the database.
from pymongo import MongoClient
import argparse
import json
import sys
5
from chipathlon.utils import progress
# Command-line interface.
# NOTE(review): --password is given on the command line, so it can end up in
# shell history and process listings -- consider prompting for it instead.
parser = argparse.ArgumentParser(description = "Import all test data into the database.")
parser.add_argument("--password", dest="password", required=True, help="Database user password.")
# --username and --host carry defaults, so they must not also be required:
# argparse never falls back to the default of a required option, which made
# both defaults dead code.  Passing them explicitly still works as before.
parser.add_argument("--username", dest="username", default="aknecht", help="Database user.")
parser.add_argument("--host", dest="host", default="hcc-anvil-241-41.unl.edu", help="Database host.")
# Plain on/off flag, False unless given.
parser.add_argument("--drop", dest="drop", default=False, action="store_true", help="Drop data if it exists.")
args = parser.parse_args()

# Everything lives in a database named "chipseq".
# Set up the connection and authenticate.
# Credentials are handed to MongoClient rather than to db.authenticate():
# Database.authenticate() is deprecated in PyMongo 3.x and removed in 4.0.
# authSource keeps authentication bound to the chipseq database, as before.
# The client connects lazily, so bad credentials now surface on the first
# database operation instead of on this line.
client = MongoClient(
	args.host,
	username=args.username,
	password=args.password,
	authSource="chipseq",
	authMechanism="SCRAM-SHA-1"
)
db = client.chipseq

# With --drop, discard both collections before importing.
if args.drop:
	db.experiments.drop()
	db.samples.drop()

# Insert all experiment metadata, read from data/meta_clean.json.
with open("data/meta_clean.json", "r") as meta_file:
	data = json.load(meta_file)
# The file is fully parsed at this point; insert the whole list in one call.
exp_ids = db.experiments.insert_many(data)

# Maps the first two characters of an experiment's assembly name to the
# organism name stored on each sample.
organism = dict(mm="mouse", hg="human", ce="celegans")

# Select the experiments to turn into samples: they must have a target, no
# first revoked file, and an assembly with a first element but no second one.
query = {
	"target": {"$exists": True},
	"revoked_files.0": {"$exists": False},
	"assembly.0": {"$exists": True},
	"assembly.1": {"$exists": False}
}
cursor = db.experiments.find(query)

# Total for the progress display.  Cursor.count() is deprecated in PyMongo 3.7
# and removed in 4.0; count_documents() with the same filter replaces it.
total = db.experiments.count_documents(query)

def _sample_fields(document):
	"""Return the sample fields shared by every file of one experiment.

	Reads the experiment's "_id", first "assembly" entry, "target" and
	"biosample_term_name"; raises KeyError if the assembly prefix is not a
	key of ``organism`` or one of those fields is missing.
	"""
	assembly = document["assembly"][0]
	target = document["target"]
	# Third "/"-separated piece of the target path, and its part before the
	# first "-".  Presumably targets look like "/targets/<name>-<organism>/"
	# -- TODO confirm against the metadata.
	target_name = target.split("/")[2]
	label = target_name.split("-")[0]

	fields = {}
	fields["experiment_id"] = document["_id"]
	fields["organism"] = organism[assembly[:2]]
	fields["genome"] = assembly
	if "/targets/H" in target:
		# Histone targets: the first two characters name the histone, the
		# remainder (up to the first "-") is stored as the factor.
		fields["histone_modification"] = target_name[:2]
		fields["transcription_factor"] = target_name[2:].split("-")[0]
	elif label.lower() == "control":
		fields["control"] = True
	else:
		fields["transcription_factor"] = label
	fields["cell_type"] = document["biosample_term_name"]
	return fields


# Create one sample document per file of every selected experiment.
for i, document in enumerate(cursor):
	files = document["files"]
	# Only the url differs between files of one experiment, so the shared
	# fields are computed once per experiment -- and, as before, not at all
	# for an experiment without files.
	if files:
		shared = _sample_fields(document)
		for f in files:
			# Fresh dict per file: insert_one() adds an "_id" to its argument.
			doc = dict(shared)
			doc["url"] = "encodeproject.org%s@@download" % (f,)
			db.samples.insert_one(doc)
	progress(i, total)

progress(total, total)
# Terminate the progress line.  A bare `print` only emits a newline on
# Python 2 (on Python 3 it is a no-op expression); this works on both.
sys.stdout.write("\n")