##
##
##     sarchitect_designer_2.2 script to get "regression diagnostics"
##
##     Shaillay Kumar Dogra
##     18 Oct 2006    
##     shaillay@strandls.com
##
##
##     INPUTs required:  (in that order)
##                 -> "actual values" column
##                 -> "predicted values" column
##                 -> "standard error" column
##
##                 -> "number of descriptors" used in LR model
##
##
##     OUTPUTs:
##  	      	  -> 'Ordinary Residuals' column
##      	  -> 'Standardized Residuals' column
##      	  -> 'Jack-Knifed Residuals' column
##	  	  -> 'Leverage' column 
##        	  -> 'Leverage Status' column
##        	  -> 'Cook's Distance' column
##
##        	  -> launches scatterplot
##        	  -> launches histogram
##
##
##
##     Mathematics:
##
## 		'ordinary residuals' is (actual - predicted)
##		'standardized residuals' is (actual - predicted)/{SQRT(1 - h_i) * stdDev} 
##      	'jack-knifed residuals' is (std-resid_i) * SQRT{(n-p-1)/(n-p-(std-resid_i)^2)}
##      	'leverage' is  (stdErr_i/stdDev)^2
##      	'leverage status' is 'high' if leverage > 3(p+1)/n
##      	'Cook's distance' is  ((stdErr_i)^2 * h_i) / (1-h_i)*(p+1)
##
##
##      	Now, Standard-error (reported in sarchitect regression results) is:
##             	stdErr_i = stdDev * sqrt(h_i)
##          	=>    h_i = (stdErr_i/stdDev)^2
##
##      	Also,  (stdDev)^2  =  RSS / dfE
##      	where - RSS = SUM(actual - predicted)^2  
##             	- dfE = (n-p-1)  [when LR is done withIntercept]
##              or dfE = (n-p) [when LR is done withoutIntercept]
##
##             'n' - no. of compounds, 'p' - no. of descriptors used in LR model
##       
##      	Since division by 'dfE' (instead of 'n') is a bias correction in the estimation of
##      	(stdDev)^2  or 'variance' we will stay conservative and always use dfE = (n-p-1) 
##     		whether LR model happens to be 'withIntercept' or 'withoutIntercept'.
##
##     		Thus, 'standardized residuals' becomes -
##
##                 'ordinary residuals' / [ SQRT( (stdDev)^2 - (stdErr_i)^2 ) ]
## 
##
##


import script
from script.dataset import *
from script.omega import createComponent, showDialog
from javax.swing import *
from math import *
from script.view import *

##--------------------------
def textarea(text):
        t = JTextArea(text)
        t.setBackground(JLabel().getBackground())
        return t

##------------------------------------------------
## PANEL TO GET COLUMN SELECTION
def getInputColumns():
        helptext="""
   Select 'actual', 'predicted' and 'standard-error' columns. 
   Make 'actual' column the first selection, 'predicted' column the second selection and so on. 
            """
        helpPanel=createComponent(type="ui",id="help", description="",component=textarea(helptext))
        colPanel = createComponent(type="ColumnSelector",id="columns", description="Select Columns",dataset=script.project.getActiveDataset())
        finalPanel= createComponent(type="group",id="Provide Actual and Predicted Columns", description="Select Columns",components=[helpPanel,colPanel])
        result=showDialog(finalPanel)
        return result['columns']

##-----------------------------------------------------
## PANEL TO GET NUMBER OF DESCRPTORS USED IN MODEL
def getNumDescriptors():
	p = createComponent(type="int", id="name", description="No. of Descriptors used in model?",value="0")
	result=showDialog(p)
	return result

##-------------------------------------------------------
## COMPUTE SQUARE OF STANDARD DEVIATION (VARIANCE)
def StdDevSqr():
	## compute sum of squares of residuals
	RSS = 0
	for i in residual_col:
		RSS =  RSS + (i*i)
	
	degree_of_freedom = (no_of_compounds - no_of_descriptors - 1)
	sqr_stdDev = (RSS / degree_of_freedom)
	return sqr_stdDev

##------------------------------------------------------
## COMPUTE STANDARDIZED RESIDUALS (INTERNALLY STUDENTIZED RESIDUALS)
def IntStdResid():
	stdDev_sqr = StdDevSqr()
	int_standardized_residual = [ ]
	for idx in range(no_of_compounds):
		denominator = sqrt((stdDev_sqr)-(std_err[idx]*std_err[idx]))  
		std_resid = (residual_col[idx] / denominator)
		int_standardized_residual.append(std_resid)

	return int_standardized_residual

##-----------------------------------------------------------
## COMPUTE JACK-KNIFED RESIDUALS (EXTERNALLY STUDENTIZED RESIDUALS)
def ExtStdResid():
	int_std_resid = dataset.getColumn("Standardized Residuals")
	ext_standardized_residual = [ ]
	for val in int_std_resid:
		std_resid =  val * sqrt( (no_of_compounds - no_of_descriptors - 1) / (no_of_compounds - no_of_descriptors - (val * val)) )
		ext_standardized_residual.append(std_resid)

	return ext_standardized_residual

##-----------------------------------------------------------
## COMPUTE LEVERAGE
def Leverage():
	stdDev_sqr = StdDevSqr()
	threshold = (3.00 * (no_of_descriptors + 1.00) / (no_of_compounds))
	leverage = [ ]
	warning = [ ]
	for idx in range(no_of_compounds):
		lvrg = ( std_err[idx] * std_err[idx] ) / stdDev_sqr
		leverage.append(lvrg)
		if (lvrg > threshold):
			warning.append("HIGH")
		else: 
			warning.append("OK")
			
	return leverage, warning

##-----------------------------------------------------------
## COMPUTE COOK'S DISTANCE
def CooksDist():
	StdResidCol = dataset.getColumn("Standardized Residuals")
	LvrgCol = dataset.getColumn("Leverage")
	cooks_distance = [ ]
	for idx in range(no_of_compounds):
		numrtr =  (StdResidCol[idx] * StdResidCol[idx]) * LvrgCol[idx]
		denmntr = (1 - LvrgCol[idx]) * (no_of_descriptors + 1)
		distance = (numrtr/denmntr)
		
		cooks_distance.append(distance)

	return cooks_distance

##-----------------------------------------------------------


## MAIN 

dataset=script.project.getActiveDataset()

## GET REQUIRED COLUMNS
columns=getInputColumns()

## get 'n'
no_of_compounds = dataset.getRowCount()

## get 'p'
no_of_descriptors = getNumDescriptors()


actual_value = dataset[columns.get(0)]
pred_value = dataset[columns.get(1)]
std_err = dataset[columns.get(2)]

residual_col = (actual_value - pred_value)
dataset.addColumn(createFloatColumn("Ordinary Residuals", residual_col))	

std_residual_col = IntStdResid()
dataset.addColumn(createFloatColumn("Standardized Residuals", std_residual_col))

jack_knife_resid_col = ExtStdResid()
dataset.addColumn(createFloatColumn("Jack-Knifed Residuals", jack_knife_resid_col))

(leverage_col, warning_col) = Leverage()
dataset.addColumn(createFloatColumn("Leverage", leverage_col))
dataset.addColumn(createStringColumn("Leverage Status", warning_col))

cooks_dist_col = CooksDist()
dataset.addColumn(createFloatColumn("Cook's Distance", cooks_dist_col))



## SCATTER PLOT
xcol = dataset.index(dataset[columns.get(1)])
ycol = dataset.index("Ordinary Residuals")
plot = ScatterPlot(yaxis = ycol, xaxis = xcol)
colorIdx = dataset.index("Leverage Status")
plot.colorBy.columnIndex = colorIdx
plot.show()

## HISTOGRAM
col = dataset.index("Ordinary Residuals")
Histogram(column = col).show()


## REPORT COMPLETION
parent=script.tool.getTool().getFrame()
mesg = "Done With Script Execution."
JOptionPane.showMessageDialog(parent,mesg,"STATUS!",JOptionPane.INFORMATION_MESSAGE)

