import argparse import sys import numpy as np import math import time # the q-gram length Q = 7 # alphabet size; the alphabet is {o,1,...,asize-1} ASIZE = 5 listStrings = [] def editDistance(a, b): # Edit-Matrix editMatrix = np.zeros((len(a) + 1, len(b) + 1)) # Initialisierung for i in range(0, len(a) + 1): editMatrix.itemset((i, 0), i) for j in range(0, len(b) + 1): editMatrix.itemset((0, j), j) # Berechnung der inneren Werte for i in range(1, len(a) + 1): for j in range(1, len(b) + 1): insertion = editMatrix[i][j - 1] + 1 deletion = editMatrix[i - 1][j] + 1 if a[i - 1] == b[j - 1]: copsub = editMatrix[i - 1][j - 1] else: copsub = editMatrix[i - 1][j - 1] + 1 # wähle Minimum if insertion < deletion and insertion < copsub: editMatrix[i][j] = insertion elif deletion < insertion and deletion < copsub: editMatrix[i][j] = deletion else: editMatrix[i][j] = copsub return editMatrix[len(a)][len(b)] def getCode(c): dic = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4} return dic[c] def getRank(s): p = 1 rank = 0 # Berechne Rank für Präfix der Länge q (erstes q-Gram) for pos in range(0, Q): rank = rank + getCode(s[pos]) * p p *= ASIZE return rank def updateRank(c, prevRank): #TODO: Berechne neuen Rank aus vorhergehendem Rank #(siehe SA-Skript, Seite 23, Achtung: Welches System wurde hier gewählt?) def readMultipleFasta(path: str): file = open(path, "r") seq = "" for line in file: if line[0] == ">": if seq != "": seq.strip() listStrings.append(seq) seq = "" else: seq = seq + line.strip(); listStrings.append(seq) def getProfile(s: str): # Erstes q-Gram rank = getRank(s) profile = [0]*int('''TODO: höchstmöglicher Rank = Länge des Profils''') #TODO: weitere q-Gramme return profile def qGramDistance(a: str, b: str) -> int: #TODO... return dist def compDistTimeQgram(a: str, b: str) -> float: start = time.time() qGramDistance(a,b) stop = time.time() return stop - start def compDistTimeEdit(a: str, b: str) -> float: start = time.time() editDistance(a,b) stop = time.time() return stop - start def main(): parser = argparse.ArgumentParser() parser.add_argument('fasta_file', help='File including the sequences', type=argparse.FileType('r')) if len(sys.argv) == 0: parser.print_help() args = parser.parse_args() #Einlesen Fasta readMultipleFasta(args.fasta_file.name) print("Read "+ str(len(listStrings) )+" sequences") #Berechnung aller Distanzen der Sequenz-Paare (0,1), (2,3) for i in range(0,len(listStrings)-1,2): a = listStrings[i] b = listStrings[i+1] #Ausgabe der Sequenzlängen, Distanzen und Zeiten print(str(len(a))+"\t"+str(len(b))+"\t"+str(editDistance(a,b))+"\t"+str(qGramDistance(a,b))+"\t"+str(compDistTimeEdit(a,b))+"\t"+str(compDistTimeQgram(a,b))) # print(len(a)) # print(len(b)) # print(editDistance(a,b)) # print(qGramDistance(a,b)) # print(compDistTimeEdit(a,b)) # print(compDistTimeQgram(a,b)) if __name__ == "__main__": main()