#!C:\Python23\python.exe # # Author: # Chris Hollenbeck [ hollec@rpi.edu ] # # Version: # Initial release - works with Python 1.4 (July 30, 2004) # Updated to work with Python 2.3 (August 3, 2004) # Now splits the graphs into their own DOT files (August 5, 2004) # Modified cleanup.py to only print out pairs, which were discarded # in that version (August 17, 2004) # # Usage: # binary.py < input.txt > output.dot # # Debugging: # Enable debugging to have the reasons for # removing a pair printed to STDERR. # # Directedness: # Change the arrow style to either get a directed or an undirected # graph in the DOT file. # directed: -> # undirected: -- # # Input File: # The input file has lines in the form # gene1 gene2 # # Removal Information: # This version removes the following from # the original file: # 1. Anything that is not a pair (A->B only, or A->B and B->A) # 2. Self-references (e.g. A->A only) # 3. Duplicate connections (a connection is printed only once) # import os import re import sys True, False = 1, 0 # Python 1.4 has no idea what these mean DEBUG = False # set debugging arrow = "--" # '->' for directed, '--' for undirected start_relations = {} # dictionary indexed by the starting gene end_relations = {} # dictionary indexed by the ending gene self_refs = {} # dictionary to hold self-references printed = {} done = {} valid_connections = {} marked = {} # marked connections was = {"printed":False} ##################### ##### FUNCTIONS ##### ##################### def fix_name (name): # put quotes around the name to prevent errors fixed = "\"%s\"" % (name) return fixed #-------------------# def print_debug (msg): if DEBUG: sys.stderr.write(msg) return #-------------------# def add_valid (start, end): # put the information in one direction if valid_connections.has_key(start): if not valid_connections[start].has_key(end): valid_connections[start][end] = 1 else: valid_connections[start] = {end:1} # and now the other if valid_connections.has_key(end): if not valid_connections[end].has_key(start): valid_connections[end][start] = 1 else: valid_connections[end] = {start:1} return #-------------------# def traverse (start, outfile): endings = valid_connections[start].keys() endings.sort() for end in endings: if marked.has_key(start) and marked.has_key(end): if not marked[start].has_key(end) and \ not marked[end].has_key(start): marked[start][end] = 1 marked[end][start] = 1 # move on to the next node print_connection(start, end, outfile) traverse(end, outfile) else: if marked.has_key(start): marked[start][ending] = 1 else: marked[start] = {end:1} if marked.has_key(end): marked[end][start] = 1 else: marked[end] = {start:1} # move on to the next node print_connection(start, end, outfile) traverse(end, outfile) return #-------------------# def print_connection(start, ending, outfile): # only print out an edge if it hasn't already been done if done.has_key(start) and done.has_key(ending): if not done[start].has_key(ending) and \ not done[ending].has_key(start): done[start][ending] = 1 done[ending][start] = 1 msg = "\t%s %s %s;\n" % (fix_name(start), arrow, fix_name(ending)) outfile.write(msg) was["printed"] = True else: # only print out an edge if it hasn't been done if done.has_key(start): done[start][ending] = 1 else: done[start] = {ending:1} if done.has_key(ending): done[ending][start] = 1 else: done[ending] = {start:1} msg = "\t%s %s %s;\n" % (fix_name(start), arrow, fix_name(ending)) outfile.write(msg) was["printed"] = True # print out all self-references if self_refs.has_key(ending) and printed.has_key(ending) == False: for self in self_refs[ending]: msg = "\t%s %s %s;\n" % (fix_name(self), arrow, fix_name(self)) outfile.write(msg) printed[self] = True was["printed"] = True ##################### ####### MAIN ######## ##################### while 1: line = sys.stdin.readline() if not line: break p = re.compile("^(.+)\t$") m = p.search(line) if m: msg = "Single: %s\n" % (m.group(1)) print_debug(msg) else: # load data into the respective dictionaries q = re.compile("^(.+)\t(.+)$") n = q.search(line) if n: lhs = n.group(1) rhs = n.group(2) if lhs == rhs: if self_refs.has_key(lhs): self_refs[lhs].append(rhs) else: self_refs[lhs] = [rhs] else: # add it in by starting if start_relations.has_key(lhs): start_relations[lhs].append(rhs) else: start_relations[lhs] = [rhs] # add it in by ending if end_relations.has_key(rhs): end_relations[rhs].append(lhs) else: end_relations[rhs] = [lhs] starting = start_relations.keys() starting.sort() # choose a directed or an undirected graph based on the type # of arrow set at the top of this file if arrow == "->": print "digraph relationships\t{" else: print "graph relationships\t{" for start in starting: valid = True end = start_relations[start] if len(end) == 1: # if the ending isn't the start of any other relationships if start_relations.has_key(end[0]) == 0: if end_relations.has_key(start) == 0: if len(end_relations[end[0]]) == 1: msg = "Unary: %s -> %s\n" % (start, end[0]) print_debug(msg) valid = False # these do count as they represent A->B only else: # the start is the ending of at least one other relationship if end_relations.has_key(start): # it only ends one other relationship end_length = len(end_relations[start]) if end_length == 1: if end_relations[start][0] == end[0]: # the ending only starts one relationship # this is the one they are in if len(start_relations[end[0]]) == 1: # the ending only ends one relationship # this is the one they are in if len(end_relations[end[0]]) == 1: msg = "Binary: %s -> %s\n" % (start, end[0]) print_debug(msg) valid = False # it ends multiple relationships, but they are all from the # same start elif end_relations[start].count(end[0]) == end_length: if len(start_relations[end[0]]) == end_length: msg = "Binary: %s -> %s\n" % (start, end[0]) print_debug(msg) valid = False else: # the ending is only the end of the given relationship if end.count(end[0]) == len(end): if end_relations.has_key(start): end_length = len(end_relations[start]) if end_length == 1: if end_relations[start][0] == end[0]: # the ending only starts one relationship # this is the one they are in if len(start_relations[end[0]]) == 1: # the ending only ends one relationship # this is the one they are in if len(end_relations[end[0]]) == \ end_relations[end[0]].count(start): msg = "Binary: %s -> %s\n" % (start, end[0]) print_debug(msg) valid = False elif end_relations[start].count(end[0]) == end_length: if len(end_relations[end[0]]) == end_length: msg = "Binary: %s -> %s\n" % (start, end[0]) print_debug(msg) valid = False else: # Removes cases like Homer2 # e.g. Homer2 -> Grm1; Homer2 -> Grm1 msg = "Binary: %s -> %s\n" % (start, end[0]) print_debug(msg) valid = False # changed to False so only binaries would be printed if valid == False: # print out all 1-1 relationships for ending in end: # only print out an edge if it hasn't already been done if done.has_key(start) and done.has_key(ending): if not done[start].has_key(ending) and \ not done[ending].has_key(start): done[start][ending] = 1 done[ending][start] = 1 msg = "\t%s %s %s;\n" % (fix_name(start), arrow, fix_name(ending)) sys.stdout.write(msg) add_valid(start, ending) else: # only print out an edge if it hasn't been done if done.has_key(start): done[start][ending] = 1 else: done[start] = {ending:1} if done.has_key(ending): done[ending][start] = 1 else: done[ending] = {start:1} msg = "\t%s %s %s;\n" % (fix_name(start), arrow, fix_name(ending)) sys.stdout.write(msg) add_valid(start, ending) # print out all self-references if self_refs.has_key(ending) and printed.has_key(ending) == False: for self in self_refs[ending]: msg = "\t%s %s %s;\n" % (fix_name(self), arrow, fix_name(self)) sys.stdout.write(msg) add_valid(start, ending) printed[self] = True # print out all self-references if self_refs.has_key(start) and printed.has_key(start) == False: for self in self_refs[start]: msg = "\t%s %s %s;\n" % (fix_name(self), arrow, fix_name(self)) sys.stdout.write(msg) add_valid(start, ending) printed[self] = True print "}" # We don't want to print out seperate files - only the large file with all pairs # so the method for doing this has been removed