Commit 10f026fd authored by aknecht2

Added node support. Updated database to load files in chunks.

parent 73a0f8c1
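The chunked-loading change below replaces the old pattern of reading a whole BED/peak file with rh.readlines() and passing one large list to insert_many(). Instead, itertools.islice pulls a fixed number of lines per iteration, so only one chunk of documents is held in memory at a time. A minimal standalone sketch of that pattern, assuming a generic db handle and using the BED field layout from the diff (not a drop-in copy of the repo's MongoDB class):

```python
import itertools

def insert_bed_chunked(db, bed_file, result_id, n_lines=100000):
    """Sketch: insert a BED file into db.bed in chunks of n_lines documents."""
    with open(bed_file, "r") as rh:
        # islice reads at most n_lines lines; it never slurps the whole file.
        line_set = list(itertools.islice(rh, n_lines))
        while line_set:
            db.bed.insert_many([
                {
                    "result_id": result_id,
                    "chr": fields[0],
                    "start": fields[1],
                    "end": fields[2],
                    "name": fields[3],
                    "score": fields[4],
                    "strand": fields[5]
                }
                for line in line_set
                for fields in (line.split(),)
            ])
            # Fetch the next chunk; an empty list ends the loop at EOF.
            line_set = list(itertools.islice(rh, n_lines))
```

In the committed code each insert_many() call is additionally wrapped in a try/except on pymongo.errors.OperationFailure, so a failed chunk returns an error message instead of raising.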
......@@ -67,6 +67,10 @@ resources = {
"cores": {
"namespace": "pegasus",
"key": "cores"
},
"nodes": {
"namespace": "pegasus",
"key": "nodes"
}
}
......
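The new "nodes" entry above extends the mapping from resource names in the per-tool YAML configs (see the additions further down) to Pegasus profile namespace/key pairs, alongside the existing "cores" entry. A hedged sketch of how such a mapping is typically applied to a Pegasus DAX3 job; the add_job_resources helper and the yaml_params dict are illustrative assumptions, not code shown in this commit:

```python
from Pegasus.DAX3 import Profile

# Only these two entries are visible in the diff; other resources are omitted here.
resources = {
    "cores": {"namespace": "pegasus", "key": "cores"},
    "nodes": {"namespace": "pegasus", "key": "nodes"}
}

def add_job_resources(job, yaml_params):
    """Attach one Pegasus profile per resource a tool defines in its YAML config."""
    for name, info in resources.items():
        if name in yaml_params:
            # e.g. "nodes: 1" in the YAML becomes Profile("pegasus", "nodes", "1")
            job.addProfile(Profile(info["namespace"], info["key"], str(yaml_params[name])))
```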
......@@ -4,6 +4,7 @@ import gridfs
import sys
import traceback
import os
import itertools
from pprint import pprint
......@@ -62,7 +63,7 @@ class MongoDB(object):
else:
msg = "Not all input ids are valid. The following are invalid: "
for id_list, valid_list in zip([control_ids, experiment_ids], [valid_controls, valid_experiments]):
msg += ",".join([id_list[i] for i, valid in enumerate(valid_list) if not valid])
msg += ", ".join([id_list[i] for i, valid in enumerate(valid_list) if not valid])
else:
msg = "Specified output_file %s does not exist." % (output_file,)
return (False, msg, None)
......@@ -78,7 +79,13 @@ class MongoDB(object):
# then insert with insert_many()
print "loading bed_data..."
with open(bed_file, "r") as rh:
bed_data = [
msg = "Bed file successfully inserted."
# Lazily load the file n_lines (~100k) lines at a time
n_lines = 100000
line_set = list(itertools.islice(rh, n_lines))
while line_set:
try:
self.db.bed.insert_many([
{
"result_id": result_id,
"chr": line_info[0],
......@@ -88,17 +95,14 @@ class MongoDB(object):
"score": line_info[4],
"strand": line_info[5]
}
for line in rh.readlines()
for line in line_set
for line_info in (line.split(),)
]
try:
print "bed data loaded, inserting."
self.db.bed.insert_many(bed_data)
return (True, "Bed file successfully inserted.", result_id)
])
except pymongo.errors.OperationFailure as e:
valid = False
msg = "Error inserting bed_file %s: %s" % (bed_file, e)
return (valid, msg, None)
line_set = list(itertools.islice(rh, n_lines))
return (valid, msg, result_id)
def save_peak(self, peak_file, control_ids, experiment_ids, additional_data = {}):
# Create result_entry for peak_file
......@@ -108,7 +112,13 @@ class MongoDB(object):
# Data is in a 10 column format
# chr, start, end, name, score, strand, signal_value, p_value, q_value, summit
with open(peak_file, "r") as rh:
peak_data = [
msg = "Peak file successfully inserted."
# Lazily load the file n_lines (~10k) lines at a time
n_lines = 10000
line_set = list(itertools.islice(rh, n_lines))
while line_set:
try:
self.db.peak.insert_many([
{
"result_id": result_id,
"chr": line_info[0],
......@@ -122,16 +132,14 @@ class MongoDB(object):
"q_value": line_info[8],
"summit": line_info[9]
}
for line in rh.readlines()
for line in line_set
for line_info in (line.split(),)
]
try:
self.db.peak.insert_many(peak_data)
return (True, "Peak file successfully inserted.", result_id)
])
except pymongo.errors.OperationFailure as e:
valid = False
msg = "Error inserting peak_file %s: %s" % (peak_file, e)
return (valid, msg, None)
line_set = list(itertools.islice(rh, n_lines))
return (valid, msg, result_id)
def is_valid_sample(self, sample_accession):
try:
......
......@@ -20,3 +20,4 @@ bedtools_bam_to_bed:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -56,3 +56,4 @@ bowtie2_align_paired:
walltime: 2000
memory: 2000
cores: 8
nodes: 1
......@@ -49,3 +49,4 @@ bowtie2_align_single:
walltime: 2000
memory: 2000
cores: 8
nodes: 1
......@@ -50,3 +50,4 @@ bwa_align_paired:
walltime: 2000
memory: 2000
cores: 8
nodes: 1
......@@ -60,3 +60,4 @@ bwa_align_single:
walltime: 2000
memory: 2000
cores: 8
nodes: 1
......@@ -46,3 +46,4 @@ bwa_sai_to_sam:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -19,3 +19,4 @@ cat_awk_sort_peaks:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -19,3 +19,4 @@ cp_bed_tagalign:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -42,3 +42,4 @@ db_save_result:
walltime: 2000
memory: 16000
cores: 1
nodes: 1
......@@ -75,3 +75,4 @@ macs2_callpeak:
walltime: 2000
memory: 8000
cores: 1
nodes: 1
......@@ -47,3 +47,4 @@ picard_mark_duplicates:
walltime: 2000
memory: 8000
cores: 1
nodes: 1
......@@ -30,3 +30,4 @@ picard_sort_sam:
walltime: 2000
memory: 8000
cores: 1
nodes: 1
......@@ -67,3 +67,4 @@ r_spp_nodups:
walltime: 2000
memory: 16000
cores: 8
nodes: 1
......@@ -38,3 +38,4 @@ samtools_filter_bam:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -33,3 +33,4 @@ samtools_remove_duplicates:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -28,3 +28,4 @@ samtools_sam_to_bam:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -19,3 +19,4 @@ sort_awk_sort_peaks:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
......@@ -19,3 +19,4 @@ zcat_awk_sort_peaks:
walltime: 2000
memory: 2000
cores: 1
nodes: 1
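Taken together, the YAML changes add a nodes: 1 line to every tool's resource block, so each job now requests an explicit node count in addition to walltime, memory, and cores. A reconstructed example for one tool; only the tail of each block is visible in the diff, so the indentation and any preceding keys are assumptions:

```yaml
bedtools_bam_to_bed:
  # ... other tool settings not shown in the diff ...
  walltime: 2000
  memory: 2000
  cores: 1
  nodes: 1   # new in this commit
```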