#!/usr/local/bin/python
#This program will parse sorted mousecyc_data file

"""Split the sorted mousecyc_data table into one annotation file per chromosome.

The input is a tab-separated text file with at least 13 columns per row.
Each row whose first column (the MGI id) differs from the previous row's
is turned into a multi-line record and appended to the output file that
belongs to the row's chromosome column.  Rows are expected to be sorted so
that repeated ids are adjacent; only the first row of such a run is written.
"""

import os

INPUT_FILENAME = "mousecyc_data.txt"

# Zero-based column positions in the tab-separated input.
# NOTE(review): the meanings of columns 8-11 are taken from the variable
# names and the single-value output path of the original script; confirm
# them against the actual export before relying on them.
COL_MGI_ID = 0
COL_SYMBOL = 1
COL_NAME = 2
COL_CHROMOSOME = 3
COL_START = 4
COL_END = 5
COL_STRAND = 6
COL_EC = 7
COL_ENTREZ = 8
COL_UNIPROT = 9
COL_ORTHOLOG_MIM = 10
COL_DISORDER_MIM = 11
COL_PRODUCT_TYPE = 12
MIN_COLUMNS = COL_PRODUCT_TYPE + 1

# Chromosome label (as it appears in the chromosome column) -> output file.
# Rows with any other label are not written anywhere.
CHROMOSOME_FILES = dict(("%d" % n, "chr%d.txt" % n) for n in range(1, 20))
CHROMOSOME_FILES.update({
    "UN": "chru.txt",
    "X": "chrx.txt",
    "Y": "chry.txt",
    "MT": "chrm.txt",
    "XY": "chrxy.txt",
})

# (column, DBLINK prefix) pairs, in the order the links are emitted.
DBLINK_COLUMNS = (
    (COL_UNIPROT, "SP:"),
    (COL_ENTREZ, "LOCUSLINK:"),
    (COL_ORTHOLOG_MIM, "MIM:"),
    (COL_DISORDER_MIM, "MIM:"),
)

# Placeholder used internally for a missing start/end coordinate.
NO_COORDINATE = "."


def _clean_coordinate(raw):
    """Return a coordinate field without a trailing ".00".

    An empty field becomes ``NO_COORDINATE``.  Only a trailing ".00" is
    removed, so a value such as "1000.005" is left intact.
    """
    if raw == "":
        return NO_COORDINATE
    if raw.endswith(".00"):
        return raw[:-len(".00")]
    return raw


def _split_values(raw):
    """Return the individual values held in a possibly multi-valued field.

    An empty field yields an empty list.  A field containing commas is
    split on them after removing the double quotes that surround it; any
    other field yields a one-element list.
    """
    if raw == "":
        return []
    if "," in raw:
        return raw.strip('"').split(",")
    return [raw]


def format_record(fields):
    """Build the output text for one input row.

    fields -- list of column strings for the row; must contain at least
              ``MIN_COLUMNS`` entries.

    Returns the record as a single string that starts with "\\n//".
    """
    mgi_id = fields[COL_MGI_ID]
    # An empty product-type column defaults to "P".
    product_type = fields[COL_PRODUCT_TYPE] or "P"

    start = _clean_coordinate(fields[COL_START])
    end = _clean_coordinate(fields[COL_END])
    if fields[COL_STRAND] == "-":
        # Minus-strand rows have their start and end exchanged.
        start, end = end, start

    parts = [
        "\n//",
        "\nID\t" + mgi_id,
        "\nNAME\t" + fields[COL_SYMBOL],
        "\nFUNCTION\t" + fields[COL_NAME],
        "\nPRODUCT-TYPE\t" + product_type,
    ]
    parts.extend("\nEC\t" + value for value in _split_values(fields[COL_EC]))
    if start != NO_COORDINATE:
        parts.append("\nSTARTBASE\t" + start)
    if end != NO_COORDINATE:
        parts.append("\nENDBASE\t" + end)

    parts.append("\nDBLINK\t" + mgi_id)
    for column, prefix in DBLINK_COLUMNS:
        parts.extend("\nDBLINK\t" + prefix + value
                     for value in _split_values(fields[column]))
    return "".join(parts)


def split_by_chromosome(input_path=INPUT_FILENAME, output_dir=""):
    """Read the table at *input_path* and write the per-chromosome files.

    input_path -- path of the tab-separated input file.
    output_dir -- directory for the output files; the default writes them
                  relative to the current working directory.

    Every file named in ``CHROMOSOME_FILES`` is created (or truncated),
    even when no row belongs to it.  Blank lines are skipped.

    Raises ValueError when a non-blank row has fewer than ``MIN_COLUMNS``
    columns.
    """
    # The input is read before any output is opened, so a missing input
    # file does not truncate existing outputs.  read().splitlines() is kept
    # (rather than iterating the file) so line splitting stays the same.
    with open(input_path, "r") as source:
        lines = source.read().splitlines()

    outputs = {}
    try:
        for label, filename in CHROMOSOME_FILES.items():
            outputs[label] = open(os.path.join(output_dir, filename), "w")

        previous_id = "-"
        for line_number, line in enumerate(lines, 1):
            if not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) < MIN_COLUMNS:
                raise ValueError(
                    "%s line %d: expected at least %d tab-separated columns, "
                    "found %d" % (input_path, line_number, MIN_COLUMNS,
                                  len(fields)))

            mgi_id = fields[COL_MGI_ID]
            # A row repeating the previous row's id adds nothing.
            if mgi_id != previous_id:
                target = outputs.get(fields[COL_CHROMOSOME])
                if target is not None:
                    target.write(format_record(fields))
            previous_id = mgi_id
    finally:
        # Close every file that was opened, even if a row failed.
        for handle in outputs.values():
            handle.close()


def main():
    """Run the conversion, then wait for Enter so the console stays open."""
    split_by_chromosome()
    try:
        prompt = raw_input  # Python 2
    except NameError:
        prompt = input  # Python 3
    prompt("Press the enter key to exit")  # shown only after a complete run


if __name__ == "__main__":
    main()