Commit 1ad7c145 authored by aknecht2

Fixed issues with submitting multiple jobs for the same files. Added a new example to find experiment_ids from the TF and HM collections. Download now prioritizes hcc_url if it exists.
parent 3c5242b4
@@ -5,3 +5,4 @@ build/
 dist/
 chipathlon.egg-info/
 MANIFEST
+examples/all_files.list
@@ -25,7 +25,7 @@ def downloadFile(url, localpath, urltype="ftp://", retries=3, overwrite=True, ch
     hash_md5 = hashlib.md5()
     conn = urllib2.urlopen(urltype + url)
     with open(localpath, "w") as fwh:
-        for chunk in iter(lambda: conn.read(4 * 1024), ""):
+        for chunk in iter(lambda: conn.read(16 * 1024), ""):
             fwh.write(chunk)
             hash_md5.update(chunk)
     if checkmd5:
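For context, this change only bumps the streaming chunk size from 4 KiB to 16 KiB; the surrounding pattern downloads and hashes in one pass. A minimal standalone sketch of that pattern, assuming the url already includes its scheme (the function name download_with_md5 is ours, not chipathlon's):

import hashlib
import urllib2

def download_with_md5(url, localpath, chunk_size=16 * 1024):
    # Stream the remote file in fixed-size chunks, hashing as we go,
    # so the whole file never needs to fit in memory.
    hash_md5 = hashlib.md5()
    conn = urllib2.urlopen(url)
    with open(localpath, "w") as fwh:
        # iter() with a "" sentinel stops once read() returns an empty
        # string, i.e. at end of stream.
        for chunk in iter(lambda: conn.read(chunk_size), ""):
            fwh.write(chunk)
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

The caller can then compare the returned digest against the expected md5sum and retry on mismatch, which is what the retries parameter in the real function suggests.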
@@ -288,7 +288,7 @@ class Workflow(object):
        job = Job(self.executables["download_fastq.py"])
        job.profile("globus", "maxwalltime", 360)
        job.uses(output_file, link=Link.OUTPUT, transfer=True)
-       job.addArguments("-u", f["url"], "-p", output_file, "-t http://", "-m", f["md5sum"])
+       job.addArguments("-u", f["hcc_url"] if "hcc_url" in f else f["url"], "-p", output_file, "-t http://", "-m", f["md5sum"])
        self.jobs[name] = job
        self.dax.addJob(job)
        # USING PEGASUS TRANSFER:
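The new argument prefers a local HCC mirror whenever the file record carries one. The inline conditional is equivalent to a plain dict fallback; a minimal sketch, assuming file records shaped like those in this diff (the helper name is hypothetical):

def pick_download_url(f):
    # Prefer the HCC mirror url when present, otherwise fall back to
    # the original url; dict.get() expresses the same
    # "hcc_url if 'hcc_url' in f else url" conditional.
    return f.get("hcc_url", f["url"])

# e.g. (made-up records):
# pick_download_url({"url": "a"})                 -> "a"
# pick_download_url({"url": "a", "hcc_url": "b"}) -> "b"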
@@ -199,7 +199,7 @@ class WorkflowModule(object):
        job_inputs = []
        job_additional = []
        # If all outputs have already been added, we don't add the job
-       if not any([self._get_full_name(prefix, markers, file_name) in master_files for file_name in job_info["outputs"]]):
+       if not any([self._get_full_name(prefix, markers, file_dict.keys()[0]) in master_files for file_dict in job_info["outputs"]]):
            job_inputs = self._setup_param_list(master_files, job_info, "inputs", inputs, prefix, markers)
            job_additional = self._setup_param_list(master_files, job_info, "additional_inputs", additional_files, prefix, markers)
            job_outputs = self._setup_param_list(master_files, job_info, "outputs", {}, prefix, markers)
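The guard change reflects a new shape for job_info["outputs"]: each entry is now a single-key dict mapping the logical file name to its description, rather than a bare file name, so the name has to be pulled out with keys()[0] (valid in Python 2, where keys() returns a list). A minimal sketch of the skip logic, with hypothetical names:

def should_add_job(job_info, master_files, full_name_of):
    # Each output entry looks like {"some_name.bam": {...}}; the dict's
    # only key is the logical file name. The job is added only when
    # none of its outputs are already registered in master_files.
    return not any(full_name_of(file_dict.keys()[0]) in master_files
                   for file_dict in job_info["outputs"])

Here full_name_of stands in for the bound self._get_full_name(prefix, markers, ...) call from the diff.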
from pymongo import MongoClient
import argparse
import chipathlon.db
from pprint import pprint
import json

parser = argparse.ArgumentParser(description="Perform a join between the experiment and sample collections.")
parser.add_argument("--password", dest="password", help="Database user password.")
parser.add_argument("--username", dest="username", help="Database user.")
parser.add_argument("--host", dest="host", help="Database host.")
args = parser.parse_args()

# Everything in a db named chipseq
# Set up connection and authenticate
mdb = chipathlon.db.MongoDB(args.host, args.username, args.password)

write_data = []
counts = {}
experiment_ids = []
for c in ["TF_4cells", "HM_4cells"]:
    counts[c] = {}
    counts[c]["control"] = 0
    counts[c]["experiment"] = 0
    # Join the sample collection against the experiments collection,
    # then collect the set of ENCODE @id values into a single document.
    cursor = mdb.db.get_collection(c).aggregate([
        {
            "$lookup": {
                "from": "experiments",
                "localField": "experiment_id",
                "foreignField": "uuid",
                "as": "experiment_doc"
            }
        },
        {
            "$group": {
                "_id": None,
                "experiment_ids": {"$addToSet": "$experiment_doc.@id"}
            }
        }
    ])
    for document in cursor:
        for encode_id in document["experiment_ids"]:
            # @id values look like "/experiments/<accession>/";
            # index 2 of the split is the accession itself.
            experiment_ids.append(encode_id[0].split("/")[2])
        break
    for exp in experiment_ids:
        valid, msg, data = mdb.get_samples(exp)
        if valid:
            for x in ["control", "experiment"]:
                for sample in data[x]:
                    counts[c][x] += 1
                    write_data.append("%s,%s,%s" % (sample["url"], sample["filename"], sample["md5sum"]))
        else:
            print msg

with open("all_files.list", "w") as wh:
    for row in write_data:
        wh.write(row + "\n")
print "File counts: %s." % (counts,)
print "Final experiment_ids: %s." % (experiment_ids,)