#!/usr/bin/env python
#from Bio import SeqIO
#from scipy import stats
#
# project3.py
# Author: [YOUR NAME HERE]
# Date Created:
# Last Modified:
#
# Splits classified reads into two groups -- genus confidence below CUTOFF
# and genus confidence at or above CUTOFF -- and prints the average number
# of N's per read sequence in each group.
#

# CONSTANTS
CUTOFF = 0.8

# update these to the full data files when you are sure your code works!
FASTQ = "rand_100.fastq"
CLASSIF = "rand_100.txt"

# additional constants go here

# NOTE(review): the separator used inside a classification string is not
# shown anywhere in this file; ';' is an assumption -- confirm against the
# real contents of CLASSIF and adjust here if needed.
CLASSIF_SEP = ";"

# 0-based position of the "12th elt" of a classification string
GENUS_CONF_INDEX = 11

# a FASTQ record is: '@' header line, sequence, '+' line, quality line
FASTQ_RECORD_LINES = 4


# reads the FASTQ and classification files, then for every classified id
# that also has a read, appends the number of N's in its sequence to
# belowcutoffNs (genus confidence < CUTOFF) or abovecutoffNs (genus
# confidence >= CUTOFF), and finally prints the average of each list
# no inputs, no return value
#
def main():
    belowcutoffNs, abovecutoffNs = [], []  # initialize both as empty lists

    id2seq = read_fastq(FASTQ)
    classifseq = read_classifications(CLASSIF)

    for seqid, clstring in classifseq.items():
        sequence = id2seq.get(seqid)
        if sequence is None:
            # classified id with no matching read: nothing to count
            continue
        if getGenusConfidence(clstring) < CUTOFF:
            belowcutoffNs.append(countNs(sequence))
        else:
            abovecutoffNs.append(countNs(sequence))

    # then print the average number of N's in each group
    printListAvg("below", belowcutoffNs)
    printListAvg("above", abovecutoffNs)


# inputs: a string called 'label', and a list of numeric values called 'data'
# prints "<label>: <average>" for the numbers in the list; an empty list
# has no average, so "<label>: no data" is printed instead of dividing by 0
# no return value
#
def printListAvg(label, data):
    if not data:
        print("%s: no data" % label)
        return
    # float() keeps this a true division under Python 2 as well
    average = sum(data) / float(len(data))
    print("%s: %s" % (label, average))


# input: a classification string, split into elements on CLASSIF_SEP
# (the example in the original template comment was truncated, so the
# exact layout is an assumption -- see the note on CLASSIF_SEP)
# if 12th elt of the classification string exists (and is not blank),
# return its floating point value; else return zero
# raises ValueError if the 12th elt is present but is not a number
#
def getGenusConfidence(clstring):
    if not clstring:
        return 0.0
    elements = [elt.strip() for elt in clstring.split(CLASSIF_SEP)]
    if len(elements) <= GENUS_CONF_INDEX or not elements[GENUS_CONF_INDEX]:
        return 0.0
    return float(elements[GENUS_CONF_INDEX])


# input is a sequence
# output is an integer representing the number of Ns in the sequence
# (only upper-case "N" characters are counted)
#
def countNs (sequence):
    return sequence.count("N")


# take as input the name of the fastq file
# open it, read fastq formatted stuff from it (4 lines per record)
# return a dict whose keys are the rest of the id and values are the sequences
# here "the rest of the id" is the first whitespace-delimited token of the
# header line with the leading '@' removed
# NOTE(review): confirm that this is the same id form used in CLASSIF
# raises ValueError if a record's header line does not start with '@'
#
def read_fastq(filename):
    id2seq = {}
    with open(filename) as handle:
        while True:
            header = handle.readline()
            if not header:
                break  # end of file
            header = header.strip()
            if not header:
                continue  # tolerate blank lines between records
            if not header.startswith("@"):
                raise ValueError("malformed FASTQ header: %r" % header)
            sequence = handle.readline().strip()
            # skip the remaining lines of the record ('+' line and quality);
            # reading fixed-size records matters because a quality line may
            # itself begin with '@'
            for _ in range(FASTQ_RECORD_LINES - 2):
                handle.readline()
            tokens = header[1:].split(None, 1)
            id2seq[tokens[0] if tokens else ""] = sequence
    return id2seq


# input is the name of the classfication file
# reads from the file, one record per non-blank line
# removes the leading '>' chars from the start of the id
# creates a dict that maps sequence ids to classification strings
# NOTE(review): the original template comment said to remove the ">39 "
# chars, which looks garbled; only leading '>' characters are stripped here.
# NOTE(review): assumes each line is "<id><whitespace><classification>";
# the real layout of CLASSIF is not visible from this file -- confirm.
#
def read_classifications(filename):
    id2classif = {}
    with open(filename) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            parts = line.split(None, 1)
            seqid = parts[0].lstrip(">")
            id2classif[seqid] = parts[1] if len(parts) > 1 else ""
    return id2classif


# main program: just calls the main function (only when run as a script,
# so that importing this module does not read any files)
#
if __name__ == "__main__":
    main()