Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Holland Computing Center
chipathlon
Commits
10f026fd
Commit
10f026fd
authored
Apr 15, 2016
by
aknecht2
Browse files
Added node support. Updated database to load files in chunks.
parent
73a0f8c1
Changes
22
Show whitespace changes
Inline
Side-by-side
chipathlon/conf.py
View file @
10f026fd
...
...
@@ -67,6 +67,10 @@ resources = {
"cores"
:
{
"namespace"
:
"pegasus"
,
"key"
:
"cores"
},
"nodes"
:
{
"namespace"
:
"pegasus"
,
"key"
:
"nodes"
}
}
...
...
chipathlon/db.py
View file @
10f026fd
...
...
@@ -4,6 +4,7 @@ import gridfs
import
sys
import
traceback
import
os
import
itertools
from
pprint
import
pprint
...
...
@@ -62,7 +63,7 @@ class MongoDB(object):
else
:
msg
=
"Not all input ids are valid. The following are invalid: "
for
id_list
,
valid_list
in
zip
([
control_ids
,
experiment_ids
],
[
valid_controls
,
valid_experiments
]):
msg
+=
","
.
join
([
id_list
[
i
]
for
i
,
valid
in
enumerate
(
valid_list
)
if
not
valid
])
msg
+=
",
"
.
join
([
id_list
[
i
]
for
i
,
valid
in
enumerate
(
valid_list
)
if
not
valid
])
else
:
msg
=
"Specified output_file %s does not exist."
%
(
output_file
,)
return
(
False
,
msg
,
None
)
...
...
@@ -78,7 +79,13 @@ class MongoDB(object):
# then insert with insert_many()
print
"loading bed_data..."
with
open
(
bed_file
,
"r"
)
as
rh
:
bed_data
=
[
msg
=
"Bed file successfully inserted."
# Lazy load files in specified line chunk size ~100k lines
n_lines
=
100000
line_set
=
list
(
itertools
.
islice
(
rh
,
n_lines
))
while
line_set
:
try
:
self
.
db
.
bed
.
insert_many
([
{
"result_id"
:
result_id
,
"chr"
:
line_info
[
0
],
...
...
@@ -88,17 +95,14 @@ class MongoDB(object):
"score"
:
line_info
[
4
],
"strand"
:
line_info
[
5
]
}
for
line
in
rh
.
readlines
()
for
line
in
line_set
for
line_info
in
(
line
.
split
(),)
]
try
:
print
"bed data loaded, inserting."
self
.
db
.
bed
.
insert_many
(
bed_data
)
return
(
True
,
"Bed file successfully inserted."
,
result_id
)
])
except
pymongo
.
errors
.
OperationFailure
as
e
:
valid
=
False
msg
=
"Error inserting bed_file %s: %s"
%
(
bed_file
,
e
)
return
(
valid
,
msg
,
None
)
line_set
=
list
(
itertools
.
islice
(
rh
,
n_lines
))
return
(
valid
,
msg
,
result_id
)
def
save_peak
(
self
,
peak_file
,
control_ids
,
experiment_ids
,
additional_data
=
{}):
# Create result_entry for peak_file
...
...
@@ -108,7 +112,13 @@ class MongoDB(object):
# Data is in a 10 column format
# chr, start, end, name, score, strand, signal_value, p_value, q_value, summit
with
open
(
peak_file
,
"r"
)
as
rh
:
peak_data
=
[
msg
=
"Peak file successfully inserted."
# Lazy load files in specified line chunk size ~100k lines
n_lines
=
10000
line_set
=
list
(
itertools
.
islice
(
rh
,
n_lines
))
while
line_set
:
try
:
self
.
db
.
peak
.
insert_many
([
{
"result_id"
:
result_id
,
"chr"
:
line_info
[
0
],
...
...
@@ -122,16 +132,14 @@ class MongoDB(object):
"q_value"
:
line_info
[
8
],
"summit"
:
line_info
[
9
]
}
for
line
in
rh
.
readlines
()
for
line
in
line_set
for
line_info
in
(
line
.
split
(),)
]
try
:
self
.
db
.
peak
.
insert_many
(
peak_data
)
return
(
True
,
"Peak file successfully inserted."
,
result_id
)
])
except
pymongo
.
errors
.
OperationFailure
as
e
:
valid
=
False
msg
=
"Error inserting peak_file %s: %s"
%
(
peak_file
,
e
)
return
(
valid
,
msg
,
None
)
line_set
=
list
(
itertools
.
islice
(
rh
,
n_lines
))
return
(
valid
,
msg
,
result_id
)
def
is_valid_sample
(
self
,
sample_accession
):
try
:
...
...
chipathlon/jobs/params/bedtools_bam_to_bed.yaml
View file @
10f026fd
...
...
@@ -20,3 +20,4 @@ bedtools_bam_to_bed:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
chipathlon/jobs/params/bowtie2_align_paired.yaml
View file @
10f026fd
...
...
@@ -56,3 +56,4 @@ bowtie2_align_paired:
walltime
:
2000
memory
:
2000
cores
:
8
nodes
:
1
chipathlon/jobs/params/bowtie2_align_single.yaml
View file @
10f026fd
...
...
@@ -49,3 +49,4 @@ bowtie2_align_single:
walltime
:
2000
memory
:
2000
cores
:
8
nodes
:
1
chipathlon/jobs/params/bwa_align_paired.yaml
View file @
10f026fd
...
...
@@ -50,3 +50,4 @@ bwa_align_paired:
walltime
:
2000
memory
:
2000
cores
:
8
nodes
:
1
chipathlon/jobs/params/bwa_align_single.yaml
View file @
10f026fd
...
...
@@ -60,3 +60,4 @@ bwa_align_single:
walltime
:
2000
memory
:
2000
cores
:
8
nodes
:
1
chipathlon/jobs/params/bwa_sai_to_sam.yaml
View file @
10f026fd
...
...
@@ -46,3 +46,4 @@ bwa_sai_to_sam:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
chipathlon/jobs/params/cat_awk_sort_peaks.yaml
View file @
10f026fd
...
...
@@ -19,3 +19,4 @@ cat_awk_sort_peaks:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
chipathlon/jobs/params/cp_bed_tagalign.yaml
View file @
10f026fd
...
...
@@ -19,3 +19,4 @@ cp_bed_tagalign:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
chipathlon/jobs/params/db_save_result.yaml
View file @
10f026fd
...
...
@@ -42,3 +42,4 @@ db_save_result:
walltime
:
2000
memory
:
16000
cores
:
1
nodes
:
1
chipathlon/jobs/params/macs2_callpeak.yaml
View file @
10f026fd
...
...
@@ -75,3 +75,4 @@ macs2_callpeak:
walltime
:
2000
memory
:
8000
cores
:
1
nodes
:
1
chipathlon/jobs/params/picard_mark_duplicates.yaml
View file @
10f026fd
...
...
@@ -47,3 +47,4 @@ picard_mark_duplicates:
walltime
:
2000
memory
:
8000
cores
:
1
nodes
:
1
chipathlon/jobs/params/picard_sort_sam.yaml
View file @
10f026fd
...
...
@@ -30,3 +30,4 @@ picard_sort_sam:
walltime
:
2000
memory
:
8000
cores
:
1
nodes
:
1
chipathlon/jobs/params/r_spp_nodups.yaml
View file @
10f026fd
...
...
@@ -67,3 +67,4 @@ r_spp_nodups:
walltime
:
2000
memory
:
16000
cores
:
8
nodes
:
1
chipathlon/jobs/params/samtools_filter_bam.yaml
View file @
10f026fd
...
...
@@ -38,3 +38,4 @@ samtools_filter_bam:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
chipathlon/jobs/params/samtools_remove_duplicates.yaml
View file @
10f026fd
...
...
@@ -33,3 +33,4 @@ samtools_remove_duplicates:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
chipathlon/jobs/params/samtools_sam_to_bam.yaml
View file @
10f026fd
...
...
@@ -28,3 +28,4 @@ samtools_sam_to_bam:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
chipathlon/jobs/params/sort_awk_sort_peaks.yaml
View file @
10f026fd
...
...
@@ -19,3 +19,4 @@ sort_awk_sort_peaks:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
chipathlon/jobs/params/zcat_awk_sort_peaks.yaml
View file @
10f026fd
...
...
@@ -19,3 +19,4 @@ zcat_awk_sort_peaks:
walltime
:
2000
memory
:
2000
cores
:
1
nodes
:
1
Prev
1
2
Next
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment