diff --git a/chipathlon/db.py b/chipathlon/db.py index a181701e6fb7f76e631019b37ab61a47fd17c531..3cb7ec628e1d5ae417f35efc5b3b490f266fd54f 100644 --- a/chipathlon/db.py +++ b/chipathlon/db.py @@ -39,8 +39,9 @@ class MongoDB(object): # Make sure output_file exists if os.path.isfile(output_file): # Make sure that all control_ids & experiment_ids are valid - valid_controls = [self.is_valid_experiment(cid) for cid in control_ids] - valid_experiments = [self.is_valid_experiment(eid) for eid in experiment_ids] + # REMEMBER, these are ids for control & experiment SAMPLES + valid_controls = [self.is_valid_sample(cid) for cid in control_ids] + valid_experiments = [self.is_valid_sample(eid) for eid in experiment_ids] if all(valid_controls) and all(valid_experiments): # First, we load the output file into gfs with open(output_file, "r") as rh: @@ -59,11 +60,9 @@ class MongoDB(object): result = self.db.results.insert_one(result_entry) return (True, "Result created successfully.", result.inserted_id) else: - msg = "Not all input ids are valid. The following are invalid:" + msg = "Not all input ids are valid. The following are invalid: " for id_list, valid_list in zip([control_ids, experiment_ids], [valid_controls, valid_experiments]): - for i, valid in enumerate(valid_list): - if not valid: - msg += id_list[i] + ", " + msg += ",".join([id_list[i] for i, valid in enumerate(valid_list) if not valid]) else: msg = "Specified output_file %s does not exist." % (output_file,) return (False, msg, None) @@ -77,6 +76,7 @@ class MongoDB(object): # chr, start, end, name, score, strand # Load data using a list comprehension over lines, # then insert with insert_many() + print "loading bed_data..." with open(bed_file, "r") as rh: bed_data = [ { @@ -92,6 +92,7 @@ class MongoDB(object): for line_info in (line.split(),) ] try: + print "bed data loaded, inserting." self.db.bed.insert_many(bed_data) return (True, "Bed file successfully inserted.", result_id) except pymongo.errors.OperationFailure as e: @@ -132,6 +133,17 @@ class MongoDB(object): msg = "Error inserting peak_file %s: %s" % (peak_file, e) return (valid, msg, None) + def is_valid_sample(self, sample_accession): + try: + cursor = self.db.samples.find({ + "accession": sample_accession + }) + if cursor.count() == 1: + return True + except pymongo.errors.OperationFailure as e: + print "Error with sample_accession %s: %s" % (sample_accession, e) + return False + def is_valid_experiment(self, experiment_id): try: cursor = self.db.experiments.find({ diff --git a/chipathlon/jobs/params/db_save_result.yaml b/chipathlon/jobs/params/db_save_result.yaml index a90fde8fa51b8982c7be07234cd345602e187847..887ce4aba4befc82dfcfe167a83802a779371f21 100644 --- a/chipathlon/jobs/params/db_save_result.yaml +++ b/chipathlon/jobs/params/db_save_result.yaml @@ -40,5 +40,5 @@ db_save_result: has_value: true default: $inputs.4 walltime: 2000 - memory: 2000 + memory: 16000 cores: 1 diff --git a/chipathlon/jobs/params/r_spp_nodups.yaml b/chipathlon/jobs/params/r_spp_nodups.yaml index 1195a848d6f4a401289b87aba1b97f1fc15880b3..1d03155b3f73076fcaafc35a4a38737472eb07d6 100644 --- a/chipathlon/jobs/params/r_spp_nodups.yaml +++ b/chipathlon/jobs/params/r_spp_nodups.yaml @@ -65,5 +65,5 @@ r_spp_nodups: has_value: true default: 8 walltime: 2000 - memory: 8000 + memory: 16000 cores: 8 diff --git a/chipathlon/jobs/scripts/db_save_result.py b/chipathlon/jobs/scripts/db_save_result.py old mode 100644 new mode 100755 index 822431fb613526399cd6951dad458444b5b8ffb5..b26e43f1cf113a5c937f9a26ee73d020edd2ae18 --- a/chipathlon/jobs/scripts/db_save_result.py +++ b/chipathlon/jobs/scripts/db_save_result.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import chipathlon.db import argparse import yaml @@ -15,7 +16,13 @@ if os.path.isfile(args.file) and os.path.isfile(args.meta): mdb = chipathlon.db.MongoDB(args.host, args.username, args.password) with open(args.meta, "r") as rh: meta = yaml.load(rh) + valid = False + msg = "" if meta["result_type"] == "bed": - mdb.save_bed(args.file, meta["control_ids"], meta["experiment_ids"], meta) + valid, msg, data = mdb.save_bed(args.file, meta["control_ids"], meta["experiment_ids"], meta) elif meta["result_type"] == "peak": - mdb.save_peak(args.file, meta["control_ids"], meta["experiment_ids"], meta) + valid, msg, data = mdb.save_peak(args.file, meta["control_ids"], meta["experiment_ids"], meta) + print msg + +else: + print "Either input file %s or meta file %s does not exist." % (args.file, args.meta)