##
##
## Sarchitect designer 2.3 script to filter out descriptors that have fewer than
## a certain number of distinct values
##
## Shaillay Dogra
## 25 July 2007
## editor@qsarworld.com
##
##
## User sets a cutoff of how many minimum distinct values should exist in a given descriptor column
## Based on the cutoff the descriptor is retained or rejected
## The descriptors that were filtered out are displayed in a scatter plot
## A subset is created containing those descriptors that passed the filtering criterion
##
##

import script
from script.dataset import *
from script.algorithm import *
from script.project import *
from script.view import *
from script.omega import createComponent, showDialog
from javax.swing import *
from math import *


##-----------------------------------------------------------------
## DEFINE CHECKDATA
def checkdata(dataset):
	"""Return 1 if `dataset` has at least one continuous column, else 0.

	When no continuous column is found, a "No Descriptor Data" message
	dialog is shown (parented to the frame returned by
	script.tool.getTool().getFrame()) before 0 is returned.
	"""
	continuous = DatasetUtil.getContinuousColumnIndices(dataset)
	if continuous.getSize() != 0:
		return 1

	## Nothing to work on: tell the user, then report failure to the caller
	frame = script.tool.getTool().getFrame()
	JOptionPane.showMessageDialog(frame, "No Descriptor Data", "ERROR!", JOptionPane.INFORMATION_MESSAGE)
	return 0
##-----------------------------------------------------------------

## DEFINE DISTINCTVALUES
def distinctvalues(column):
	"""Return how many distinct values `column` contains.

	Every row value (column.get(0) .. column.get(getSize()-1)) is added to a
	java.util.HashSet; the size of that set is the result, so two values are
	counted once whenever the HashSet considers them equal.
	"""
	from java.util import HashSet

	unique = HashSet()
	nrows = column.getSize()
	for row in range(nrows):
		unique.add(column.get(row))
	return unique.size()

##--------------------------------------------------



##
## DEFINE MAIN
##

def main(dataset):
	"""Filter the descriptor columns of `dataset` by number of distinct values.

	Steps:
	  1. Take the continuous, unmarked columns as the descriptors.
	  2. Pick the endpoint column: the first non-categorical column marked
	     "Endpoint", or the first continuous column if there is none.
	  3. Ask the user for the minimum number of distinct values (the default
	     offered is one tenth of the row count).
	  4. Show a scatter plot ("Failed Set") of the rejected descriptors with
	     the endpoint column as the y axis.
	  5. Create a child subset ("Filtered Set") holding the retained
	     descriptors and show it in a table.

	dataset -- the dataset to filter; it must contain at least one continuous
	           column (see checkdata), because the fallback endpoint is
	           indices_continuous.get(0).
	"""

	## Get descriptor columns, assumption: continuous and unmarked columns
	indices_continuous = DatasetUtil.getContinuousColumnIndices(dataset)
	columnList = script.project.removeMarkedColumns(dataset, indices_continuous)

	## Get endpoint column; default to the first continuous column
	classlabelcolumnIndex = indices_continuous.get(0)
	for col in DatasetUtil.getMarkedColumns(dataset, "Endpoint"):
		if not col.isCategorical():
			classlabelcolumnIndex = dataset.indexOf(col)
			## Stop only once a usable (non-categorical) endpoint is found.
			## Breaking unconditionally would give up after the first marked
			## column even when that column is categorical.
			break

	total_points = dataset.getRowCount()
	## Default cutoff offered in the dialog: a tenth of the number of rows
	dynamic_default = total_points/10

	## Ask user for minimum number of distinct values to be used as cut-off
	p = createComponent(type="float", id="name", description="Minimum Distinct Values?", value=dynamic_default)
	cutoff = showDialog(p)
	## NOTE(review): assumes showDialog returns None when the dialog is
	## cancelled -- confirm against the script.omega API. Without this guard a
	## None cutoff compares as smaller than every count under Python 2, so all
	## descriptors would silently pass.
	if cutoff is None:
		return

	## Split descriptors into those meeting the cutoff and those that do not
	passlist = []
	faillist = []
	for i in range(columnList.getSize()):
		col = dataset[columnList.get(i)]
		if distinctvalues(col) >= cutoff:
			passlist.append(col)
		else:
			faillist.append(col)

	## Show a scatterplot containing columns that failed the filtering above.
	## The endpoint is appended last (to a new list, leaving faillist intact)
	## so that its position is the y-axis index.
	failedColumns = faillist + [dataset[classlabelcolumnIndex]]
	endIdx = len(failedColumns) - 1   # since count starts from zero
	tmpdataset = script.dataset.createDataset("Failed Set", failedColumns)
	script.view.ScatterPlot(dataset=tmpdataset, yaxis=endIdx).show()

	## Define a new child dataset containing columns that passed the filtering
	## above: all rows, no columns selected explicitly (marked columns are
	## requested via addMarkedColumns=1), then the retained descriptors added.
	rowIndices = list(range(total_points))
	script.project.addSubsetChild(rowIndices, [], name="Filtered Set", addMarkedColumns=1)
	## NOTE(review): relies on the newly added subset being the active dataset
	subset = script.project.getActiveDataset()

	for col in passlist:
		subset.addColumn(col)
	script.view.Table(dataset=subset).show()

##-----------------------------------------------------------------


## Call main
## Run the filter on the active dataset, but only when it has descriptor data
dataset = script.project.getActiveDataset()
if checkdata(dataset):
	main(dataset)

	## Report completion
	JOptionPane.showMessageDialog(
		script.tool.getTool().getFrame(),
		"Done With Script Execution.",
		"STATUS!",
		JOptionPane.INFORMATION_MESSAGE)


##
## END
##