chip-job-ccat-format-bed 1.53 KB
Newer Older
aknecht2's avatar
aknecht2 committed
1
2
#!/usr/bin/env python
import argparse
import os
import re
import sys

# The ccat output is *mostly* correct, however simply sorting is not enough
# as the fourth column labels are not done in correct chromosome order
# After sorting the output looks like this:
# chr1    3086860 3087035 ccat_131620  4   0   5.483551    0.577000
# chr1    3318040 3318245 ccat_131610  4   0   5.483551    0.577000
# chr1    3372210 3372465 ccat_87299  5   0   6.854439    0.462000
# When it should look like this:
# chr1    3086860 3087035 ccat_0  4   0   5.483551    0.577000    -1
# chr1    3318040 3318245 ccat_1  4   0   5.483551    0.577000    -1
# chr1    3372210 3372465 ccat_2  5   0   6.854439    0.462000    -1

# Splits a string into alternating non-digit / digit runs.
_DIGIT_RUNS = re.compile(r"(\d+)")


def _natural_key(text):
    """Return a key that compares digit runs in *text* numerically.

    This approximates the chromosome ordering of ``sort -V``: "chr2" sorts
    before "chr10", which a plain string comparison gets wrong.
    """
    pieces = _DIGIT_RUNS.split(text)
    # Splitting on a capturing group alternates text, digits, text, ... so
    # odd positions are always digit runs; same-index items share a type.
    return [int(piece) if idx % 2 else piece for idx, piece in enumerate(pieces)]


def _row_key(row):
    """Sort key equivalent to ``sort -k1,1V -k2,2n -k3,3n`` for one row."""
    return (_natural_key(row[0]), int(row[1]), int(row[2]))


def _read_rows(path):
    """Read *path* and return its non-blank lines split on whitespace."""
    rows = []
    with open(path, "r") as rh:
        for line in rh:
            fields = line.split()
            # A blank line has no columns to sort on; skip it instead of
            # failing later with an IndexError.
            if fields:
                rows.append(fields)
    return rows


def main(argv=None):
    """Sort a ccat result file and renumber its peak labels.

    Rows are ordered by chromosome (natural order), then start, then end.
    The fourth column of each row is replaced with ``ccat_<index>`` in the
    sorted order, and a trailing ``-1`` column is appended.

    argv -- argument list to parse; defaults to ``sys.argv[1:]`` when None.

    Exits with an error message if the input path is not an existing file.
    """
    parser = argparse.ArgumentParser(description = "Format ccat result files.")
    parser.add_argument("--input", "-i", dest="input", required=True, help="Path to input ccat file.")
    parser.add_argument("--output", "-o", dest="output", required=True, help="Output file to write formatted results.")
    args = parser.parse_args(argv)

    if not os.path.isfile(args.input):
        sys.exit("Could not open input file '%s' for reading." % (args.input,))

    # To sort we unfortunately need to load the whole file
    sorted_data = sorted(_read_rows(args.input), key=_row_key)

    with open(args.output, "w") as wh:
        for i, row in enumerate(sorted_data):
            # Fix the peak numbers
            row[3] = "ccat_%s" % (i,)
            wh.write("\t".join(row) + "\t-1\n")


if __name__ == "__main__":
    main()