#!/usr/local/bin/python
"""Split the sorted mousecyc_data.txt gene table into per-chromosome files.

Each input line is a tab-separated gene row.  Columns used (0-based):

    0  MGI id            4  start base      8  -> DBLINK LOCUSLINK:
    1  symbol            5  end base        9  -> DBLINK SP:
    2  name              6  strand         10  -> DBLINK MIM:
    3  chromosome        7  EC number(s)   11  -> DBLINK MIM:
                                           12  product type

Columns 7-11 may hold several comma-separated values, optionally wrapped
in double quotes.  One record is written per gene to chr<label>.txt; a row
whose MGI id equals the id of the row directly before it is skipped, so the
input must be sorted by id.
"""

import os

INPUT_PATH = "mousecyc_data.txt"

# Number of columns a complete row has (indices 0..12).
COLUMN_COUNT = 13

# Chromosome label (column 3) -> output file name.  Rows carrying any other
# label are not written anywhere.
CHROMOSOME_FILES = dict((str(n), "chr%d.txt" % n) for n in range(1, 20))
CHROMOSOME_FILES.update({
    "UN": "chru.txt",   # unknown chromosome
    "X": "chrx.txt",
    "Y": "chry.txt",
    "MT": "chrm.txt",
    "XY": "chrxy.txt",
})


def pad_fields(fields):
    """Return a copy of *fields* padded with "" up to COLUMN_COUNT entries.

    Rows whose trailing empty columns were stripped would otherwise raise
    IndexError when the later columns are read.
    """
    return list(fields) + [""] * (COLUMN_COUNT - len(fields))


def clean_coordinate(raw):
    """Normalise a start/end base column.

    An empty column becomes "." (the "no value" marker); a trailing ".00"
    is dropped.  Only a *trailing* ".00" is removed, so a value such as
    "1.005" is left untouched instead of being collapsed to "15".
    """
    if raw == "":
        return "."
    if raw.endswith(".00"):
        return raw[:-3]
    return raw


def split_multi(field):
    """Return the list of values held in a possibly multi-valued column.

    Surrounding double quotes are removed before splitting on commas, so
    '"a,b"' yields ["a", "b"].  An empty column yields [].
    """
    unquoted = field.strip('"')
    if not unquoted:
        return []
    return unquoted.split(",")


def format_record(fields):
    """Build the output record text for one gene row.

    *fields* is the list of tab-separated columns of the row.  The returned
    string starts with "\\n//" and has no trailing newline, so records can
    be written back to back.
    """
    fields = pad_fields(fields)
    mgiid = fields[0]

    start = clean_coordinate(fields[4])
    end = clean_coordinate(fields[5])
    if fields[6] == "-":
        # Reverse-strand genes have their start and end swapped.
        start, end = end, start

    lines = [
        "//",
        "ID\t" + mgiid,
        "NAME\t" + fields[1],
        "FUNCTION\t" + fields[2],
        # An empty product-type column defaults to "P".
        "PRODUCT-TYPE\t" + (fields[12] or "P"),
    ]
    lines.extend("EC\t" + value for value in split_multi(fields[7]))
    if start != ".":
        lines.append("STARTBASE\t" + start)
    if end != ".":
        lines.append("ENDBASE\t" + end)
    lines.append("DBLINK\t" + mgiid)
    # NOTE(review): column 9 is emitted as SP: and column 8 as LOCUSLINK:
    # for both single and multiple values.  Previously the multi-value case
    # had the two prefixes crossed; confirm the column order against the
    # data file.
    lines.extend("DBLINK\tSP:" + value for value in split_multi(fields[9]))
    lines.extend(
        "DBLINK\tLOCUSLINK:" + value for value in split_multi(fields[8]))
    lines.extend("DBLINK\tMIM:" + value for value in split_multi(fields[10]))
    lines.extend("DBLINK\tMIM:" + value for value in split_multi(fields[11]))
    return "".join("\n" + line for line in lines)


def parse_mousecyc(input_path=INPUT_PATH, output_dir=""):
    """Read *input_path* and write one record file per chromosome.

    Every file named in CHROMOSOME_FILES is created (truncated) inside
    *output_dir*, even when no row belongs to it.  Blank input lines are
    ignored.  Returns the number of records written.
    """
    with open(input_path, "r") as source:
        rows = source.read().splitlines()

    outputs = {}
    written = 0
    try:
        for label, filename in CHROMOSOME_FILES.items():
            outputs[label] = open(os.path.join(output_dir, filename), "w")

        previous_id = "-"
        for row in rows:
            if not row:
                continue
            fields = pad_fields(row.split("\t"))
            mgiid = fields[0]
            # Consecutive rows sharing an MGI id describe the same gene;
            # only the first of them produces a record.
            if mgiid != previous_id:
                target = outputs.get(fields[3])
                if target is not None:
                    target.write(format_record(fields))
                    written += 1
            previous_id = mgiid
    finally:
        # Close every file that was opened, even if parsing failed midway.
        for handle in outputs.values():
            handle.close()
    return written


def main():
    """Run the parser on the default input, then wait for the enter key."""
    parse_mousecyc()
    try:
        wait_for_enter = raw_input  # Python 2
    except NameError:
        wait_for_enter = input  # Python 3
    # Reaching this prompt shows the program ran to the end.
    wait_for_enter("Press the enter key to exit")


if __name__ == "__main__":
    main()