Skip to content
Snippets Groups Projects
Commit 5e54c62a authored by Aden Hester's avatar Aden Hester
Browse files

Add converter for fastq files to filter only the sequences;

add tester to run on files produced by previous converter
parent c471e147
Branches
Tags
No related merge requests found
import os
import sys
# Input FASTQ file and the directory the sequence-only text file is written to.
fastqFilePath = "/work/vinod/ahester2/SRA/SRR31131061/SRR31131061_1.fastq"
newFileDirectory = "/work/vinod/ahester2/SRA/SRR31131061/"


def sequence_lines(fastq_lines):
    """Yield only the sequence line of each four-line FASTQ record.

    A record is laid out as header, sequence, separator, quality, so the
    sequence is always the second line of every group of four.  Lines are
    selected by position rather than by testing for a leading '@', because
    '@' is also a valid first character of a quality line; a leading-'@'
    test would treat such a quality line as a header and emit the next
    record's header as if it were a sequence.

    Assumes the input uses the unwrapped four-lines-per-record layout with
    no blank lines between records -- TODO confirm for other FASTQ sources.

    Args:
        fastq_lines: Iterable of lines (an open text file works).

    Yields:
        Each sequence line unchanged, including its line terminator.
    """
    for index, line in enumerate(fastq_lines):
        if index % 4 == 1:
            yield line


def main():
    """Write every sequence line of the FASTQ file to a .txt file and print the count."""
    numLines = 0
    with open(fastqFilePath, 'r') as f, open((newFileDirectory + "SRR31131061_1.txt"), 'w') as o:
        for sequence in sequence_lines(f):
            o.write(sequence)
            numLines += 1
    print(str(numLines))


if __name__ == "__main__":
    main()
\ No newline at end of file
import time
import string
import random
import os
import statistics
import sys
from algorithms.HashTable import HashTable
from algorithms.Estimator import Estimator
from algorithms.CVMHash import CVMHash
# Constants
FILEPATH = '/work/vinod/ahester2/SRA/SRR31131061/SRR31131061_1.txt'
BUFFERSIZES = [10_000]
NUMRUNS = 1
# Length of the sliding window fed to the estimators.
KMER_LENGTH = 10
# Stop after this many k-mers have been fed to the estimators.
LIMITATION = 1_000_000
# JOBID = os.environ["SLURM_JOB_ID"]


def iter_kmers(row, k=KMER_LENGTH):
    """Yield every length-k window of a single row, left to right.

    All whitespace in the row (including the trailing newline) is dropped
    before windowing.  Each row is windowed on its own, so no k-mer is ever
    assembled from the tail of one row and the head of the next.

    Args:
        row: One line of the input file.
        k: Window length; must be at least 1.

    Yields:
        Each k-character substring of the whitespace-free row.  Rows shorter
        than k yield nothing.

    Raises:
        ValueError: If k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    bases = "".join(row.split())
    for start in range(len(bases) - k + 1):
        yield bases[start:start + k]


def main():
    """Feed k-mers from FILEPATH to every estimator and print a CSV report.

    One group of NUMRUNS CVMHash instances is built per entry of
    BUFFERSIZES.  NOTE(review): CVMHash is presumably a distinct-element
    estimator sized by its constructor argument -- confirm against
    algorithms.CVMHash.
    """
    estimators = [
        [CVMHash(size) for _ in range(NUMRUNS)]
        for size in BUFFERSIZES
    ]
    counter = 0
    lines = 0

    # Main generation
    with open(FILEPATH, 'r') as file:
        for row in file:
            if counter >= LIMITATION:
                break
            # Count a row as soon as it is touched so the report is exact
            # whether the loop ends at the limit or at end of file.
            lines += 1
            for kmer in iter_kmers(row):
                counter += 1
                for group in estimators:
                    for estimator in group:
                        estimator.newInput(kmer)
                if counter >= LIMITATION:
                    break

    estimatesArray = [
        [estimator.getEstimate() for estimator in group]
        for group in estimators
    ]

    print(f"# of {KMER_LENGTH}mers: {counter}")
    print("# of lines processed: " + str(lines))
    print()

    # CSV report: one column per run, one row per buffer size.
    header = ",".join(["buffersize"] + [str(run + 1) for run in range(NUMRUNS)])
    print(header)
    for size, estimates in zip(BUFFERSIZES, estimatesArray):
        print(",".join([str(size)] + [str(estimate) for estimate in estimates]))


if __name__ == "__main__":
    main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment