Commit f234af91 authored by Avi Knecht
Browse files

Added metadata, and metadata import script.

parent d5e98d68
from pymongo import MongoClient
import argparse
import json
import sys
def progress(current, end):
    """Redraw a one-line, 20-character progress bar on stdout.

    The line begins with a carriage return so that each call overwrites
    the previous one.  No newline is written; the caller has to emit one
    once the run is finished.

    Args:
        current: Number of entries processed so far.
        end: Total number of entries expected.  When this is 0 there is
            nothing to process, so the bar is drawn as complete instead
            of raising ZeroDivisionError.
    """
    # An empty workload is trivially finished; avoid dividing by zero.
    percent = float(current) / end if end else 1.0
    hashes = "#" * int(round(percent * 20))
    # Pad with spaces so the closing bracket stays in the same column.
    spaces = " " * (20 - len(hashes))
    sys.stdout.write("\rProcessed %s / %s entries. [%s] %s%%" % (current, end, hashes + spaces, int(round(percent * 100))))
    # The line has no trailing newline, so flush to make it visible now.
    sys.stdout.flush()
# Command-line interface.  Only --password has to be supplied; --username
# and --host fall back to the defaults given here.
parser = argparse.ArgumentParser(description = "Import all test data into the database.")
parser.add_argument("--password", dest="password", required=True, help="Database user password.")
# required=True was removed from the next two options: a required option
# must always be given on the command line, so its default was never used.
parser.add_argument("--username", dest="username", default="aknecht", help="Database user.")
parser.add_argument("--host", dest="host", default="hcc-anvil-241-41.unl.edu", help="Database host.")
parser.add_argument("--drop", dest="drop", default=False, action="store_true", help="Drop data if it exists.")
args = parser.parse_args()
# All data lives in the "chipseq" database: open it on the requested host
# and log in with the credentials taken from the command line.
client = MongoClient(args.host)
db = client["chipseq"]
db.authenticate(args.username, args.password, mechanism="SCRAM-SHA-1")
# With --drop, remove the experiments and samples collections first.
if args.drop:
    for stale in (db.experiments, db.samples):
        stale.drop()
# Read the experiment metadata (expected at meta/meta_clean.json) and
# store every entry in the experiments collection.
with open("meta/meta_clean.json", "r") as meta_file:
    data = json.load(meta_file)
exp_ids = db["experiments"].insert_many(data)
# Maps the first two letters of an assembly name to the organism name.
organism = dict(mm="mouse", hg="human", ce="celegans")
# Create one sample per file of every usable experiment.  The query keeps
# experiments that have a target, no revoked files and exactly one
# assembly (element 0 exists, element 1 does not).
cursor = db.experiments.find({
    "target": {"$exists": True},
    "revoked_files.0": {"$exists": False},
    "assembly.0": {"$exists": True},
    "assembly.1": {"$exists": False}
})
total = cursor.count()
for i, document in enumerate(cursor):
    files = document["files"]
    # Experiments without files produce no samples and are not inspected
    # any further.
    if files:
        # Every field except the URL is the same for all files of one
        # experiment, so it is computed once here instead of per file.
        assembly = document["assembly"][0]
        # Third "/"-separated piece of the target path.
        target_name = document["target"].split("/")[2]
        base = {
            "experiment_id": document["_id"],
            # Raises KeyError for an assembly prefix missing from organism.
            "organism": organism[assembly[:2]],
            "genome": assembly,
        }
        # NOTE(review): every target whose name starts with "H" takes this
        # branch -- confirm the input never holds non-histone targets
        # that start with "H".
        if "/targets/H" in document["target"]:
            base["histone_modification"] = target_name[:2]
            base["transcription_factor"] = target_name[2:].split("-")[0]
        else:
            base["transcription_factor"] = target_name.split("-")[0]
        base["cell_type"] = document["biosample_term_name"]
        for f in files:
            # A fresh dict is built for every insert so that no two
            # inserts share one object.
            db.samples.insert_one(dict(base, url="encodeproject.org%s@@download" % (f,)))
    progress(i, total)
progress(total, total)
# Terminate the progress line.  A bare `print` statement only emits the
# newline on Python 2; on Python 3 it is an expression that does nothing.
sys.stdout.write("\n")
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment