Skip to content
Snippets Groups Projects
Commit 10f026fd authored by aknecht2's avatar aknecht2
Browse files

Added node support. Updated database to load files in chunks.

parent 73a0f8c1
No related branches found
No related tags found
No related merge requests found
Showing
with 76 additions and 46 deletions
...@@ -67,6 +67,10 @@ resources = { ...@@ -67,6 +67,10 @@ resources = {
"cores": { "cores": {
"namespace": "pegasus", "namespace": "pegasus",
"key": "cores" "key": "cores"
},
"nodes": {
"namespace": "pegasus",
"key": "nodes"
} }
} }
......
...@@ -4,6 +4,7 @@ import gridfs ...@@ -4,6 +4,7 @@ import gridfs
import sys import sys
import traceback import traceback
import os import os
import itertools
from pprint import pprint from pprint import pprint
...@@ -78,7 +79,13 @@ class MongoDB(object): ...@@ -78,7 +79,13 @@ class MongoDB(object):
# then insert with insert_many() # then insert with insert_many()
print "loading bed_data..." print "loading bed_data..."
with open(bed_file, "r") as rh: with open(bed_file, "r") as rh:
bed_data = [ msg = "Bed file successfully inserted."
# Lazy load files in specified line chunk size ~100k lines
n_lines = 100000
line_set = list(itertools.islice(rh, n_lines))
while line_set:
try:
self.db.bed.insert_many([
{ {
"result_id": result_id, "result_id": result_id,
"chr": line_info[0], "chr": line_info[0],
...@@ -88,17 +95,14 @@ class MongoDB(object): ...@@ -88,17 +95,14 @@ class MongoDB(object):
"score": line_info[4], "score": line_info[4],
"strand": line_info[5] "strand": line_info[5]
} }
for line in rh.readlines() for line in line_set
for line_info in (line.split(),) for line_info in (line.split(),)
] ])
try:
print "bed data loaded, inserting."
self.db.bed.insert_many(bed_data)
return (True, "Bed file successfully inserted.", result_id)
except pymongo.errors.OperationFailure as e: except pymongo.errors.OperationFailure as e:
valid = False valid = False
msg = "Error inserting bed_file %s: %s" % (bed_file, e) msg = "Error inserting bed_file %s: %s" % (bed_file, e)
return (valid, msg, None) line_set = list(itertools.islice(rh, n_lines))
return (valid, msg, result_id)
def save_peak(self, peak_file, control_ids, experiment_ids, additional_data = {}): def save_peak(self, peak_file, control_ids, experiment_ids, additional_data = {}):
# Create result_entry for peak_file # Create result_entry for peak_file
...@@ -108,7 +112,13 @@ class MongoDB(object): ...@@ -108,7 +112,13 @@ class MongoDB(object):
# Data is in a 10 column format # Data is in a 10 column format
# chr, start, end, name, score, strand, signal_value, p_value, q_value, summit # chr, start, end, name, score, strand, signal_value, p_value, q_value, summit
with open(peak_file, "r") as rh: with open(peak_file, "r") as rh:
peak_data = [ msg = "Peak file successfully inserted."
# Lazy load files in specified line chunk size ~100k lines
n_lines = 10000
line_set = list(itertools.islice(rh, n_lines))
while line_set:
try:
self.db.peak.insert_many([
{ {
"result_id": result_id, "result_id": result_id,
"chr": line_info[0], "chr": line_info[0],
...@@ -122,16 +132,14 @@ class MongoDB(object): ...@@ -122,16 +132,14 @@ class MongoDB(object):
"q_value": line_info[8], "q_value": line_info[8],
"summit": line_info[9] "summit": line_info[9]
} }
for line in rh.readlines() for line in line_set
for line_info in (line.split(),) for line_info in (line.split(),)
] ])
try:
self.db.peak.insert_many(peak_data)
return (True, "Peak file successfully inserted.", result_id)
except pymongo.errors.OperationFailure as e: except pymongo.errors.OperationFailure as e:
valid = False valid = False
msg = "Error inserting peak_file %s: %s" % (peak_file, e) msg = "Error inserting peak_file %s: %s" % (peak_file, e)
return (valid, msg, None) line_set = list(itertools.islice(rh, n_lines))
return (valid, msg, result_id)
def is_valid_sample(self, sample_accession): def is_valid_sample(self, sample_accession):
try: try:
......
...@@ -20,3 +20,4 @@ bedtools_bam_to_bed: ...@@ -20,3 +20,4 @@ bedtools_bam_to_bed:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
...@@ -56,3 +56,4 @@ bowtie2_align_paired: ...@@ -56,3 +56,4 @@ bowtie2_align_paired:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 8 cores: 8
nodes: 1
...@@ -49,3 +49,4 @@ bowtie2_align_single: ...@@ -49,3 +49,4 @@ bowtie2_align_single:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 8 cores: 8
nodes: 1
...@@ -50,3 +50,4 @@ bwa_align_paired: ...@@ -50,3 +50,4 @@ bwa_align_paired:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 8 cores: 8
nodes: 1
...@@ -60,3 +60,4 @@ bwa_align_single: ...@@ -60,3 +60,4 @@ bwa_align_single:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 8 cores: 8
nodes: 1
...@@ -46,3 +46,4 @@ bwa_sai_to_sam: ...@@ -46,3 +46,4 @@ bwa_sai_to_sam:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
...@@ -19,3 +19,4 @@ cat_awk_sort_peaks: ...@@ -19,3 +19,4 @@ cat_awk_sort_peaks:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
...@@ -19,3 +19,4 @@ cp_bed_tagalign: ...@@ -19,3 +19,4 @@ cp_bed_tagalign:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
...@@ -42,3 +42,4 @@ db_save_result: ...@@ -42,3 +42,4 @@ db_save_result:
walltime: 2000 walltime: 2000
memory: 16000 memory: 16000
cores: 1 cores: 1
nodes: 1
...@@ -75,3 +75,4 @@ macs2_callpeak: ...@@ -75,3 +75,4 @@ macs2_callpeak:
walltime: 2000 walltime: 2000
memory: 8000 memory: 8000
cores: 1 cores: 1
nodes: 1
...@@ -47,3 +47,4 @@ picard_mark_duplicates: ...@@ -47,3 +47,4 @@ picard_mark_duplicates:
walltime: 2000 walltime: 2000
memory: 8000 memory: 8000
cores: 1 cores: 1
nodes: 1
...@@ -30,3 +30,4 @@ picard_sort_sam: ...@@ -30,3 +30,4 @@ picard_sort_sam:
walltime: 2000 walltime: 2000
memory: 8000 memory: 8000
cores: 1 cores: 1
nodes: 1
...@@ -67,3 +67,4 @@ r_spp_nodups: ...@@ -67,3 +67,4 @@ r_spp_nodups:
walltime: 2000 walltime: 2000
memory: 16000 memory: 16000
cores: 8 cores: 8
nodes: 1
...@@ -38,3 +38,4 @@ samtools_filter_bam: ...@@ -38,3 +38,4 @@ samtools_filter_bam:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
...@@ -33,3 +33,4 @@ samtools_remove_duplicates: ...@@ -33,3 +33,4 @@ samtools_remove_duplicates:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
...@@ -28,3 +28,4 @@ samtools_sam_to_bam: ...@@ -28,3 +28,4 @@ samtools_sam_to_bam:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
...@@ -19,3 +19,4 @@ sort_awk_sort_peaks: ...@@ -19,3 +19,4 @@ sort_awk_sort_peaks:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
...@@ -19,3 +19,4 @@ zcat_awk_sort_peaks: ...@@ -19,3 +19,4 @@ zcat_awk_sort_peaks:
walltime: 2000 walltime: 2000
memory: 2000 memory: 2000
cores: 1 cores: 1
nodes: 1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment