Commit 0a181c3f authored by aknecht2's avatar aknecht2
Browse files

Added new tests for experiment files. Updated db class with sample_file grabbing.

parent d52a0651
......@@ -2,14 +2,15 @@ from pymongo import MongoClient
import gridfs
import sys
import traceback
from pprint import pprint
class MongoDB(object):
def __init__(self, host, username, password):
self.client = MongoClient(args.host)
self.db = client.chipseq
self.client = MongoClient(host)
self.db = self.client.chipseq
try:
self.db.authenticate(args.username, args.password, mechanism="SCRAM-SHA-1")
self.db.authenticate(username, password, mechanism="SCRAM-SHA-1")
except:
print("Could not authenticate to db %s!" % (host,))
print traceback.format_exc()
......@@ -21,4 +22,112 @@ class MongoDB(object):
return
def get_samples(self, experiment_id):
    """
    Look up the fastq input samples for an experiment and for its controls.

    The experiment is validated in stages and the returned message names the
    first stage that fails: the experiment must exist, it must carry the
    required metadata (a target, an assembly with exactly one entry, no
    revoked files), and it must list at least one possible control.

    :param experiment_id: Experiment accession; matched against the "@id"
        field as "/experiments/<experiment_id>/".
    :returns: A (valid, msg, data) tuple. On success data is
        {"control": [...], "experiment": [...]} holding the sample documents
        whose filetype is "fastq"; on any failure data is {}.
    """
    # Each validation stage adds constraints on top of the previous one, so
    # the filter is grown in place and a copy is handed to every query.
    query = {"@id": "/experiments/%s/" % (experiment_id,)}
    if self.db.experiments.find(dict(query)).count() != 1:
        return (False, "Experiment with id '%s' does not exist." % (experiment_id,), {})

    query.update({
        "target": {"$exists": True},
        "revoked_files.0": {"$exists": False},
        "assembly.0": {"$exists": True},
        "assembly.1": {"$exists": False},
    })
    if self.db.experiments.find(dict(query)).count() != 1:
        return (False, "Experiment with id '%s' does not have all required metadata (assembly, target, no revoked_files)." % (experiment_id,), {})

    query["possible_controls.0"] = {"$exists": True}
    if self.db.experiments.find(dict(query)).count() != 1:
        return (False, "Experiment with id '%s' does not have possible_controls." % (experiment_id,), {})

    # Aggregation pipeline:
    # 1. Find the experiment that matches the given id
    # 2. Join samples into the experiment by experiment_id
    # 3. Iterate through possible_controls
    # 4. Join possible_control data into control_exps
    # 5. Iterate through control_exps
    # 6. Join samples into the control_exps by experiment_id
    # 7. Re-aggregate all data into arrays
    cursor = self.db.experiments.aggregate([
        {"$match": dict(query)},
        {
            "$lookup": {
                "from": "samples",
                "localField": "_id",
                "foreignField": "experiment_id",
                "as": "samples"
            }
        },
        {"$unwind": "$possible_controls"},
        {
            "$lookup": {
                "from": "experiments",
                "localField": "possible_controls",
                "foreignField": "@id",
                "as": "control_exps"
            }
        },
        {"$unwind": "$control_exps"},
        {
            "$lookup": {
                "from": "samples",
                "localField": "control_exps._id",
                "foreignField": "experiment_id",
                "as": "control_exps.samples"
            }
        },
        {
            "$group": {
                "_id": "$_id",
                "possible_controls": {"$push": "$possible_controls"},
                "control_exps": {"$push": "$control_exps"},
                "samples": {"$push": "$samples"}
            }
        }
    ])
    # At most one document comes back. There is none at all when unwinding
    # control_exps discards every row, i.e. no listed control could be joined
    # from the experiments collection; report that instead of raising
    # StopIteration.
    document = next(cursor, None)
    if document is None:
        return (False, "Experiment with id '%s' does not have any possible_controls present in the database." % (experiment_id,), {})

    control_inputs = [
        sample
        for control in document["control_exps"]
        for sample in control["samples"]
        if sample.get("filetype") == "fastq"
    ]
    # The group stage pushes the experiment's own samples array once per
    # unwound control, so every entry is the same list; use the first.
    experiment_inputs = [
        sample
        for sample in document["samples"][0]
        if sample.get("filetype") == "fastq"
    ]
    if not control_inputs or not experiment_inputs:
        return (False, "Experiment with id '%s' has '%s' possible control inputs, and '%s' possible experiment inputs." % (experiment_id, len(control_inputs), len(experiment_inputs)), {})

    # NOTE: the spelling of this message is compared verbatim by the tests.
    msg = "Succesfully retrieved input files for experiment with id '%s'." % (experiment_id,)
    return (True, msg, {"control": control_inputs, "experiment": experiment_inputs})
- run1:
experiment: "expeirment name"
experiment: "ENCSR000BSE"
align: bwa
peak: spp
- run2:
experiment: "experiment name"
experiment: "ENCSR000BSE"
align: bowtie2
peak: spp
#!/usr/bin/python
import chipathlon
import chipathlon.yaml_job
import chipathlon.db
import argparse
# Command-line options for the database connection used by the tests below.
parser = argparse.ArgumentParser(description="Perform a join between the experiment and sample collections.")
parser.add_argument("--password", dest="password", required=True, help="Database user password.")
# An option marked required never falls back to its default, so the options
# that declare a default are left optional to make that default reachable.
parser.add_argument("--username", dest="username", default="aknecht", help="Database user.")
parser.add_argument("--host", dest="host", default="hcc-anvil-241-41.unl.edu", help="Database host.")
args = parser.parse_args()
# Shared connection used by the exp_files tests.
mdb = chipathlon.db.MongoDB(args.host, args.username, args.password)
# Shamelessly stolen from: https://svn.blender.org/svnroot/bf-blender/trunk/blender/build_files/scons/tools/bcolors.py
class bcolors(object):
......@@ -29,7 +40,8 @@ def yaml_job_test_1():
# Test2, invalid arguments
def yaml_job_test_2():
    """Check that params2.yaml is rejected for specifying the unchangeable '-M' argument."""
    yj = chipathlon.yaml_job.YamlJob("bwa_align_paired", "test/yaml_job/params2.yaml")
    err = "Unchangeable argument '-M' specified in params file."
    # YamlJob appends newline-terminated messages to err, so the expected
    # message is searched for inside it; an inequality check would pass even
    # when no error was recorded at all.
    print_test("Yaml_Job_Test_2", err in yj.err)
    return
# Test3, non-yaml input file
......@@ -41,10 +53,45 @@ def yaml_job_test_3():
# Test4, ill-defined input file
def yaml_job_test_4():
    """Check that params4.yaml is rejected for using the unknown key 'batman'."""
    yj = chipathlon.yaml_job.YamlJob("bwa_align_paired", "test/yaml_job/params4.yaml")
    err = "Specified key 'batman' does not exist for job 'bwa_align_paired'."
    # YamlJob appends newline-terminated messages to err, so the expected
    # message is searched for inside it; an inequality check would pass even
    # when no error was recorded at all.
    print_test("Yaml_Job_Test_4", err in yj.err)
    return
# Test5, valid exp, with 2 experiment and 2 control samples
def exp_files_test_1():
    """Expect ENCSR000BSE to resolve to 2 control and 2 experiment inputs."""
    expected_msg = "Succesfully retrieved input files for experiment with id 'ENCSR000BSE'."
    ok, message, inputs = mdb.get_samples("ENCSR000BSE")
    # The chain short-circuits, so inputs is only indexed after a valid result.
    passed = (
        ok
        and message == expected_msg
        and len(inputs["control"]) == 2
        and len(inputs["experiment"]) == 2
    )
    print_test("Exp_Files_Test_1", passed)
    return
# Test6, invalid exp id
def exp_files_test_2():
    """Expect an unknown experiment id to be reported as non-existent."""
    expected_msg = "Experiment with id 'NOT_AN_ID' does not exist."
    ok, message, _inputs = mdb.get_samples("NOT_AN_ID")
    passed = not ok and message == expected_msg
    print_test("Exp_Files_Test_2", passed)
    return
# Test7, invalid metadata
def exp_files_test_3():
    """Expect ENCSR329RIP to be rejected for incomplete metadata."""
    expected_msg = "Experiment with id 'ENCSR329RIP' does not have all required metadata (assembly, target, no revoked_files)."
    ok, message, _inputs = mdb.get_samples("ENCSR329RIP")
    passed = not ok and message == expected_msg
    print_test("Exp_Files_Test_3", passed)
    return
# Test8, multiple control experiments, with 4 control and 2 experiment samples
def exp_files_test_4():
    """Expect ENCSR000CWZ to resolve to 4 control and 2 experiment inputs."""
    expected_msg = "Succesfully retrieved input files for experiment with id 'ENCSR000CWZ'."
    ok, message, inputs = mdb.get_samples("ENCSR000CWZ")
    # The chain short-circuits, so inputs is only indexed after a valid result.
    passed = (
        ok
        and message == expected_msg
        and len(inputs["control"]) == 4
        and len(inputs["experiment"]) == 2
    )
    print_test("Exp_Files_Test_4", passed)
    return
# Every test to run, in execution order: the YamlJob parameter checks first,
# then the experiment file lookups against the database.
tests = [
    yaml_job_test_1,
    yaml_job_test_2,
    yaml_job_test_3,
    yaml_job_test_4,
    exp_files_test_1,
    exp_files_test_2,
    exp_files_test_3,
    exp_files_test_4,
]
for test in tests:
    test()
......@@ -44,7 +44,7 @@ class YamlJob(object):
self.err += "Required key '%s' not defined for job '%s'.\n" % (key, self.jobname)
for key in self.params:
if key not in (chipathlon.conf.param_keys["required"] + chipathlon.conf.param_keys["optional"]):
self.err += "Key '%s' does not exist for job '%s'.\n" % (key, self.jobname)
self.err += "Specified key '%s' does not exist for job '%s'.\n" % (key, self.jobname)
return
def _validate_arguments(self):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment