|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
weka.deduping.Deduper
weka.deduping.BasicDeduper
A basic deduper class that takes a set of objects and identifies disjoint subsets of duplicates
Field Summary | |
protected int[] |
m_attrIdxs
the attribute indices on which to do deduping |
protected double[] |
m_classValues
An array containing class values for instances (for faster statistics) |
protected int[] |
m_clusterAssignments
temporary variable holding cluster assignments |
protected java.util.ArrayList |
m_clusters
holds the clusters |
protected boolean |
m_debug
verbose? |
protected double[][] |
m_distanceMatrix
distance matrix containing the distance between each pair |
protected java.util.HashMap |
m_instancesHash
instance hash, where each Integer index is hashed to an instance |
protected int |
m_numActualDupePairsTrain
|
protected int |
m_numActualNonDupePairsTrain
|
protected int |
m_numCurrentObjects
Number of clusters in the process |
protected int |
m_numGoodPairs
|
protected int |
m_numObjects
The total number of true objects |
protected int |
m_numPotentialDupePairsTrain
|
protected int |
m_numPotentialNonDupePairsTrain
|
protected int |
m_numTotalPairs
Statistics |
protected int |
m_numTotalPairsTest
|
protected int |
m_numTotalPairsTrain
|
protected int |
m_numTruePairs
|
protected java.util.HashMap |
m_reverseInstancesHash
reverse instance hash, where each instance is hashed to its Integer index |
protected Instances |
m_testInstances
A set of instances to dedupe |
protected double |
m_testTimeStart
|
protected double |
m_trainProportion
The proportion of the training fold that should be used for training |
protected double |
m_trainTime
|
protected boolean |
m_useBlocking
Use blocking ? |
Fields inherited from class weka.deduping.Deduper |
m_statistics |
Constructor Summary | |
BasicDeduper()
|
Method Summary | |
protected void |
accumulateStatistics()
Add the current state of things to statistics |
void |
buildDeduper(Instances trainFold,
Instances testInstances)
Given training data, build the metrics required by the deduper |
protected double |
clusterDistance(Cluster cluster1,
Cluster cluster2)
internal method that returns the distance between two clusters |
static java.lang.String |
concatStringArray(java.lang.String[] strings)
A little helper to create a single String from an array of Strings |
protected void |
createDistanceMatrix()
Fill the distance matrix with values using the metric |
void |
findDuplicates(Instances testInstances,
int numObjects)
Identify duplicates within the testing data |
boolean |
getDebug()
See whether debugging output is on/off |
InstanceMetric |
getMetric()
Get the InstanceMetric that is used |
java.lang.String[] |
getOptions()
Gets the current settings of Greedy Agglomerative Clustering |
double |
getTrainProportion()
Get the amount of training |
boolean |
getUseBlocking()
See whether blocking is on/off |
protected void |
hashInstances(Instances data)
Create the hashtable from given Instances; keys are numeric indices, values are actual Instances |
java.util.ArrayList |
initIntClusters()
Computes the clusters from the cluster assignments |
java.util.Enumeration |
listOptions()
Returns an enumeration describing the available options |
protected Cluster |
mergeClusters(int cluster1Idx,
int cluster2Idx)
Internal method to merge two clusters and update distances |
protected void |
mergeStep()
Internal method that finds two most similar clusters and merges them |
protected int |
numCrossClusterTruePairs(Cluster cluster1,
Cluster cluster2)
Given two clusters, calculate the number of true pairs that will be added when the clusters are merged |
protected int |
numTruePairs(Instances instances)
Given a test set, calculate the number of true pairs |
void |
printIntClusters()
Outputs the current clustering |
protected void |
resetStatistics()
Reset the current statistics |
void |
setDebug(boolean debug)
Turn debugging output on/off |
void |
setMetric(InstanceMetric metric)
Set the InstanceMetric that is used |
void |
setOptions(java.lang.String[] options)
Parses a given list of options. |
void |
setTrainProportion(double trainProportion)
Set the amount of training |
void |
setUseBlocking(boolean useBlocking)
Turn blocking on/off |
Methods inherited from class weka.deduping.Deduper |
forName, getStatistics |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
protected double m_trainProportion
protected double[][] m_distanceMatrix
protected java.util.HashMap m_instancesHash
protected java.util.HashMap m_reverseInstancesHash
protected int[] m_attrIdxs
protected int m_numObjects
protected double[] m_classValues
protected int m_numCurrentObjects
protected java.util.ArrayList m_clusters
protected Instances m_testInstances
protected boolean m_useBlocking
protected int[] m_clusterAssignments
protected boolean m_debug
protected int m_numTotalPairs
protected int m_numGoodPairs
protected int m_numTruePairs
protected int m_numTotalPairsTrain
protected int m_numTotalPairsTest
protected int m_numPotentialDupePairsTrain
protected int m_numActualDupePairsTrain
protected int m_numPotentialNonDupePairsTrain
protected int m_numActualNonDupePairsTrain
protected double m_trainTime
protected double m_testTimeStart
Constructor Detail |
public BasicDeduper()
Method Detail |
public void buildDeduper(Instances trainFold, Instances testInstances) throws java.lang.Exception
buildDeduper
in class Deduper
java.lang.Exception
public void findDuplicates(Instances testInstances, int numObjects) throws java.lang.Exception
findDuplicates
in class Deduper
testInstances
- a set of instances among which to identify duplicates
numObjects
- the number of "true object" sets to create
java.lang.Exception
public java.util.ArrayList initIntClusters() throws java.lang.Exception
java.lang.Exception
- if clusters could not be computed successfully
protected void mergeStep() throws java.lang.Exception
java.lang.Exception
protected double clusterDistance(Cluster cluster1, Cluster cluster2)
protected Cluster mergeClusters(int cluster1Idx, int cluster2Idx) throws java.lang.Exception
java.lang.Exception
protected void hashInstances(Instances data)
data
- Instances
protected void createDistanceMatrix() throws java.lang.Exception
java.lang.Exception
public void printIntClusters() throws java.lang.Exception
java.lang.Exception
- if something goes wrong
public void setTrainProportion(double trainProportion)
trainProportion
- the proportion of the training set that will be used for learning
public double getTrainProportion()
protected int numTruePairs(Instances instances)
instances
- a set of objects, class has the true object ID
protected int numCrossClusterTruePairs(Cluster cluster1, Cluster cluster2)
cluster1
- the first cluster to merge
cluster2
- the second cluster to merge
protected void accumulateStatistics()
protected void resetStatistics()
public void setMetric(InstanceMetric metric)
metric
- the InstanceMetric that is used to dedupe
public InstanceMetric getMetric()
public void setDebug(boolean debug)
debug
- if true, debugging info will be printed
public boolean getDebug()
public void setUseBlocking(boolean useBlocking)
public boolean getUseBlocking()
public java.util.Enumeration listOptions()
listOptions
in interface OptionHandler
public void setOptions(java.lang.String[] options) throws java.lang.Exception
-M metric options
InstanceMetric used
setOptions
in interface OptionHandler
options
- the list of options as an array of strings
java.lang.Exception
- if an option is not supported
public java.lang.String[] getOptions()
getOptions
in interface OptionHandler
public static java.lang.String concatStringArray(java.lang.String[] strings)
strings
- an array of strings
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |