"""Split the sorted MGI tr7545 report into one annotation file per chromosome.

The report is read as tab-separated text.  Each row is turned into a block of
"KEY<TAB>value" lines and appended to the output file of the chromosome named
in the row.  Consecutive rows sharing the same MGI id only contribute an extra
FUNCTION / FUNCTION-COMMENT pair to the entry started by the first row.
"""

# Input report (tab-separated, one gene/GO association per line).
INPUT_PATH = "tr7545_rpt_09072007_srtd.txt"

# Chromosome label as it appears in the report -> output file name.
# Rows whose label is not listed here are not written anywhere.
OUTPUT_PATHS = dict((str(n), "chr%d.txt" % n) for n in range(1, 20))
OUTPUT_PATHS.update({
    "UN": "chru.txt",  # genes with unknown chromosome
    "X": "chrx.txt",
    "Y": "chry.txt",
    "MT": "chrm.txt",
})

# A lone period marks "no value" in the report.
MISSING = "."

# Columns are read up to index 10, so a usable row has at least 11 fields.
MIN_COLUMNS = 11


def _trim_coordinate(value):
    """Return a base coordinate without a zero-valued fractional part.

    "3000000.0" becomes "3000000"; values without a decimal point and the
    missing-value marker are returned unchanged.  (The previous
    ``value.strip(".0")`` removed every leading/trailing "0" and ".", so
    "3000000.0" was written as "3".)
    """
    if value == MISSING or "." not in value:
        return value
    return value.rstrip("0").rstrip(".")


def _split_ec(field):
    """Return the list of EC numbers held in the EC column.

    The column holds either the missing-value marker, a single EC number, or
    a comma-separated list that may be wrapped in double quotes.  Surrounding
    quotes and whitespace are removed and empty pieces are dropped.
    """
    if field == MISSING:
        return []
    pieces = (piece.strip() for piece in field.strip('"').split(","))
    return [piece for piece in pieces if piece]


def convert_lines(lines):
    """Convert report lines into ``(chromosome, text)`` pairs.

    Args:
        lines: iterable of report lines without trailing newlines.

    Returns:
        A list with one ``(chromosome, text)`` tuple per non-blank line, in
        input order.  ``text`` is the block to append to that chromosome's
        output file.

    Raises:
        ValueError: a non-blank line has fewer than ``MIN_COLUMNS`` fields.
    """
    entries = []
    previous_id = "-"  # sentinel: never equal to a real id on the first row
    for number, line in enumerate(lines, 1):
        if not line.strip():
            # e.g. an empty line at the end of the file
            continue
        fields = line.split("\t")
        if len(fields) < MIN_COLUMNS:
            raise ValueError(
                "line %d has %d tab-separated fields, expected at least %d"
                % (number, len(fields), MIN_COLUMNS)
            )

        # Column layout: 0 MGI id, 1 symbol, 2 chromosome, 3 start base,
        # 4 end base, 6 GO id, 7 GO comment, 8 EC number(s), 9 UniProt id,
        # 10 Entrez id.  Column 5 is not used.
        mgi_id, symbol, chromosome = fields[0], fields[1], fields[2]
        go_id, go_comment = fields[6], fields[7]

        # FUNCTION carries the GO comment and FUNCTION-COMMENT the GO id,
        # exactly as the output format has always been written.
        function_lines = [
            "\nFUNCTION\t" + go_comment,
            "\nFUNCTION-COMMENT\t" + go_id,
        ]

        if mgi_id == previous_id:
            # Same gene as the previous row: only add the extra GO term.
            pieces = function_lines
        else:
            pieces = ["\n//", "\nID\t" + mgi_id, "\nNAME\t" + symbol]
            pieces.extend(function_lines)
            pieces.append("\nPRODUCT-TYPE\tP")
            pieces.extend("\nEC\t" + ec for ec in _split_ec(fields[8]))
            start_base = _trim_coordinate(fields[3])
            if start_base != MISSING:
                pieces.append("\nSTARTBASE\t" + start_base)
            end_base = _trim_coordinate(fields[4])
            if end_base != MISSING:
                pieces.append("\nENDBASE\t" + end_base)
            if fields[9] != MISSING:
                pieces.append("\nDBLINK\tSP:" + fields[9])  # UniProt prefix
            if fields[10] != MISSING:
                pieces.append("\nDBLINK\tEG:" + fields[10])  # Entrez prefix

        entries.append((chromosome, "".join(pieces)))
        previous_id = mgi_id
    return entries


def main():
    """Read the report, write the per-chromosome files, then wait for Enter."""
    with open(INPUT_PATH, "r") as report:
        lines = report.read().splitlines()

    # Parse before touching the outputs so a malformed report does not leave
    # truncated files behind.
    entries = convert_lines(lines)

    outputs = {}
    try:
        # Every output file is created (or truncated) even if it stays empty.
        for chromosome, path in OUTPUT_PATHS.items():
            outputs[chromosome] = open(path, "w")
        for chromosome, text in entries:
            handle = outputs.get(chromosome)
            if handle is not None:
                handle.write(text)
    finally:
        # Close every file that was opened, including chrm.txt.
        for handle in outputs.values():
            handle.close()

    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        prompt = raw_input
    except NameError:
        prompt = input
    # Seeing this prompt means the program reached its actual end.
    prompt("Press the enter key to exit")


if __name__ == "__main__":
    main()