From 73a0f8c19d8d19117a68d763f15ae5d513cf29e5 Mon Sep 17 00:00:00 2001 From: aknecht2 <aknecht2@unl.edu> Date: Thu, 14 Apr 2016 16:48:53 -0500 Subject: [PATCH] Added valid_sample checking in database. Fixed db_save_results to use sample ids. Adjusted memory request usage for spp & db_save. --- chipathlon/db.py | 24 ++++++++++++++++------ chipathlon/jobs/params/db_save_result.yaml | 2 +- chipathlon/jobs/params/r_spp_nodups.yaml | 2 +- chipathlon/jobs/scripts/db_save_result.py | 11 ++++++++-- 4 files changed, 29 insertions(+), 10 deletions(-) mode change 100644 => 100755 chipathlon/jobs/scripts/db_save_result.py diff --git a/chipathlon/db.py b/chipathlon/db.py index a181701..3cb7ec6 100644 --- a/chipathlon/db.py +++ b/chipathlon/db.py @@ -39,8 +39,9 @@ class MongoDB(object): # Make sure output_file exists if os.path.isfile(output_file): # Make sure that all control_ids & experiment_ids are valid - valid_controls = [self.is_valid_experiment(cid) for cid in control_ids] - valid_experiments = [self.is_valid_experiment(eid) for eid in experiment_ids] + # REMEMBER, these are ids for control & experiment SAMPLES + valid_controls = [self.is_valid_sample(cid) for cid in control_ids] + valid_experiments = [self.is_valid_sample(eid) for eid in experiment_ids] if all(valid_controls) and all(valid_experiments): # First, we load the output file into gfs with open(output_file, "r") as rh: @@ -59,11 +60,9 @@ class MongoDB(object): result = self.db.results.insert_one(result_entry) return (True, "Result created successfully.", result.inserted_id) else: - msg = "Not all input ids are valid. The following are invalid:" + msg = "Not all input ids are valid. The following are invalid: " for id_list, valid_list in zip([control_ids, experiment_ids], [valid_controls, valid_experiments]): - for i, valid in enumerate(valid_list): - if not valid: - msg += id_list[i] + ", " + msg += ",".join([id_list[i] for i, valid in enumerate(valid_list) if not valid]) else: msg = "Specified output_file %s does not exist." % (output_file,) return (False, msg, None) @@ -77,6 +76,7 @@ class MongoDB(object): # chr, start, end, name, score, strand # Load data using a list comprehension over lines, # then insert with insert_many() + print "loading bed_data..." with open(bed_file, "r") as rh: bed_data = [ { @@ -92,6 +92,7 @@ class MongoDB(object): for line_info in (line.split(),) ] try: + print "bed data loaded, inserting." self.db.bed.insert_many(bed_data) return (True, "Bed file successfully inserted.", result_id) except pymongo.errors.OperationFailure as e: @@ -132,6 +133,17 @@ class MongoDB(object): msg = "Error inserting peak_file %s: %s" % (peak_file, e) return (valid, msg, None) + def is_valid_sample(self, sample_accession): + try: + cursor = self.db.samples.find({ + "accession": sample_accession + }) + if cursor.count() == 1: + return True + except pymongo.errors.OperationFailure as e: + print "Error with sample_accession %s: %s" % (sample_accession, e) + return False + def is_valid_experiment(self, experiment_id): try: cursor = self.db.experiments.find({ diff --git a/chipathlon/jobs/params/db_save_result.yaml b/chipathlon/jobs/params/db_save_result.yaml index a90fde8..887ce4a 100644 --- a/chipathlon/jobs/params/db_save_result.yaml +++ b/chipathlon/jobs/params/db_save_result.yaml @@ -40,5 +40,5 @@ db_save_result: has_value: true default: $inputs.4 walltime: 2000 - memory: 2000 + memory: 16000 cores: 1 diff --git a/chipathlon/jobs/params/r_spp_nodups.yaml b/chipathlon/jobs/params/r_spp_nodups.yaml index 1195a84..1d03155 100644 --- a/chipathlon/jobs/params/r_spp_nodups.yaml +++ b/chipathlon/jobs/params/r_spp_nodups.yaml @@ -65,5 +65,5 @@ r_spp_nodups: has_value: true default: 8 walltime: 2000 - memory: 8000 + memory: 16000 cores: 8 diff --git a/chipathlon/jobs/scripts/db_save_result.py b/chipathlon/jobs/scripts/db_save_result.py old mode 100644 new mode 100755 index 822431f..b26e43f --- a/chipathlon/jobs/scripts/db_save_result.py +++ b/chipathlon/jobs/scripts/db_save_result.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import chipathlon.db import argparse import yaml @@ -15,7 +16,13 @@ if os.path.isfile(args.file) and os.path.isfile(args.meta): mdb = chipathlon.db.MongoDB(args.host, args.username, args.password) with open(args.meta, "r") as rh: meta = yaml.load(rh) + valid = False + msg = "" if meta["result_type"] == "bed": - mdb.save_bed(args.file, meta["control_ids"], meta["experiment_ids"], meta) + valid, msg, data = mdb.save_bed(args.file, meta["control_ids"], meta["experiment_ids"], meta) elif meta["result_type"] == "peak": - mdb.save_peak(args.file, meta["control_ids"], meta["experiment_ids"], meta) + valid, msg, data = mdb.save_peak(args.file, meta["control_ids"], meta["experiment_ids"], meta) + print msg + +else: + print "Either input file %s or meta file %s does not exist." % (args.file, args.meta) -- GitLab