#!/usr/local/bin/python

""" This program computes the scores of functional relationships between proteins derived from Sequence data.
    So the porgram takes either the file containing BLAST similarity score between proteins sequences in format
	prot1\tprot2\tS 
	Meaning that file will have 3 columns separated by tab: column1 and 2 are respective protein ID and the third column
	is the similarity score obtained by running BLAST,
	or a file containing two column in the format: prot\tsignature_entry
	where the first column is the protein ID and the second column is the signature ID found in the proteins

	The program outputs the file containing the list of the predicted interaction with respective relationscore
	in format: prot1\tprot2\trelationship_core

	The program is run under Linux environment as follow: python name_of_datafile n
	where n is either 0 if the data under consideration is from family protein data or 1 if data is from sequence similarity

	#Copyright (C) May 2009 @Gaston Kuzamunu Mazandu @Computational Biology Group, University of Cape Town
"""

import sys
from scipy import *
from scipy.stats import *
from scipy.integrate import quad, inf

SignatureNumber = {}
ControlProt = {}

def quartile(x, n = 2):
	'''This function computes the quartile 1, 2 (median) and 3 for a given no empty array or list of data x
	   and by default, it will be computing the median
	'''
	ax = array(sorted(x))
	if n==1: r = .25
	elif n==2: r = .5
	else: r = .75
	p = 1.0 + (len(ax)-1)*r
	ip = int(p)
	delta = (ax[ip]-ax[ip-1])*(p-ip)
	return ax[ip-1]+delta
    
def h2X(f):
	'''This function implements the binary entropy function for a given real variable f between 0 and 1'''
	if f==0.0 or f==1.0: return 0.0
	return -f*log2(f)-(1.0-f)*log2(1.0-f)

def readFile(familyfile):
	'''This function reads the protein family data and counts the number of occurrence of every signature
	   associates every protein to their signature  for computation of scoring score purpose'''
	try:
		pfile = open(familyfile,'r')
	except:
		print "\tAn error occurs while attempting to read the file ..."
		print "\tCheck your input file, and try again please ..."
		print "\tFor now, the program cannot be pursued ..."
		print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
		sys.exit()
	else:
		cline = 0
		for line in pfile:
			cline += 1
			ligne = line.strip()
			if not ligne or ligne.startswith('!') or ligne.startswith('$') or \
				   ligne.startswith('#'): continue  								  # skip empty or comments lines
			ligne = ligne.split('\t')
			if len(ligne) != 2:
				print "\tAn error occurs when reading file at line [%d]"%(cline,)
				print "\tPossibly, the line [%d] is not in the right format"%(cline,)
				print "\tPlease, check it and then try again ..."
				print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
				sys.exit()

			Protein = ligne[0].strip(); Signature = ligne[1].strip()

			try:                                          						 	 # Protein has been already read 
				ControlProt[Protein].append(Signature)                             	 # Add new signature in the protein Signature
			except:                                                                  # New Protein discovered, add it in dictionaries and create its signatures list
				ControlProt[Protein] = [Signature]    

			try: SignatureNumber[Signature] += 1
			except: SignatureNumber[Signature] = 1		
		pfile.close()

def familyScoreComputation(familyfile, alpha = 1.0, penal = 0.45):
	'''This function computes the scores of between proteins sharing common signature and output
	   file containing these interactions, with calibration parameter alpha = 1.0 and sigma penalized with a factor 0.45'''
	resultfile = familyfile.split('.')[0]+'_Results.txt'
	out = open(resultfile,'w')
	
	x = SignatureNumber.values()
	Q1 = quartile(x,1); Q3 = quartile(x,3); IQR = Q3-Q1
	data = [b for b in x if (b >= Q1-1.5*IQR and b <= Q3+1.5*IQR)]
	ss = std(data)

	Protein = ControlProt.keys()
	for i in xrange(len(Protein)):
		origin = Protein[i]
		for j in xrange(i+1,len(Protein)):
			dest = Protein[j]
			CommonSig = list(set(ControlProt[origin]).intersection(ControlProt[dest]))
			if len(CommonSig) != 0:                                                # Meaning that the 2 proteins have commun signature!
				eta = 0.0
				for com in CommonSig:
					eta += min(ControlProt[origin].count(com), ControlProt[dest].count(com))
				conf = quad(lambda y: exp(-(y*y)/(2.0))/sqrt(2.0*pi),-inf,pow(1.0*eta,alpha)/(penal*ss))[0]
				uncert = h2X(conf)
				score = 1.0 - uncert
				out.write(origin+'\t'+dest+'\t'+'%1.3f'%round(score,3)+'\n')
				del eta, conf, uncert, score
			del dest, CommonSig
		del origin
	out.close()

def computationRscoreBlast(similarityfile):
	""" This program computes link reliability using similarity score produced from BLAST
	    and return all result in the file named yournamefile_Result.txt whose fields are: 
	    source_id, dest_id, link reliability obtained!
	"""
	try:
		fp = open(similarityfile,'r')
	except:
		print "\tAn error occurs while attempting to read the file ..."
		print "\tCheck your input file, and try again please ..."
		print "\tFor now, the program cannot be pursued ..."
		print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
		sys.exit()
	else:
		result = similarityfile.split('.')[0]+'_Result.txt'
		rfile = open(result,'w')
		cline, control, controlg = 0, {}, {}
		for line in fp:
			cline += 1
			ligne = line.strip() 
			if not ligne or ligne.startswith('!') or ligne.startswith('$') or \
				   ligne.startswith('#'): continue  								  # skip empty or comments lines
			ligne = ligne.split('\t')
			if len(ligne) != 3:
				print "\tAn error occurs when reading file at line [%d]"%(cline,)
				print "\tPossibly, the line [%d] is not in the right format"%(cline,)
				print "\tPlease, check it and then try again ..."
				print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
				sys.exit()        
			key = ligne[0].strip(),ligne[1].strip()

			if key in control.keys():
				if control[key] < float(ligne[-1].strip()):
					control[key] = float(ligne[-1].strip())
			else:
				control[key] = float(ligne[-1].strip())
		fp.close()

		for key in control:
			if key[0] != key[1]:
				try:	        
					Lrel = control[key]+control[key[1],key[0]]
					Lrel /= 2*max(control[key[0],key[0]],control[key[1],key[1]])
					rfile.write(key[0]+'\t'+key[1]+'\t'+'%1.3f'%round(Lrel,3)+'\n')
				except:
					pass
		rfile.close()


if __name__=='__main__':
	print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
	print "       New Approach for Scoring Protein Relationships From Sequence Data        "
	print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
	
	if len(sys.argv) < 3:
		print "       Usage: python datafile_name datatype other_parameters"
		print "       Program parameter(s) missing. Aborting...\n"
		print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
	
	elif sys.argv[2].strip()=='0':
		print "\tWait Please This may take time, processing data ..."
		print "\tIf everything goes OK, Functional Protein Relationship score will be"
		print "\toutput in the file [%s]"%(sys.argv[1].strip().split('.')[0]+'_Result.txt',)
		readFile(sys.argv[1].strip())
		familyScoreComputation(sys.argv[1].strip())
		print "\nScores successfully computed ..., Please, find the output file as indicated above /~|:)"
		print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
	elif sys.argv[2].strip()=='1': 
		print "\tWait Please This may take time, processing data ..."
		print "\tIf everything goes OK, Functional Protein Relationship score will be"
		print "\toutput in the file [%s]"%(sys.argv[1].strip().split('.')[0]+'_Result.txt',)
		computationRscoreBlast(sys.argv[1].strip())
		print "\nScores successfully computed ..., Please, find the output file as indicated above /~|:)"
		print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
	else:
		print "\tAccording to the data, the choice should be 0 or 1 ..."
		print "\t0 for protein family and 1 for similarity sequence data from BLAST ..."
		print "\tConsider the above information and try again ..."
		print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
