weka.deduping
Class PairwiseSelector

java.lang.Object
  extended by weka.deduping.PairwiseSelector
All Implemented Interfaces:
OptionHandler, java.io.Serializable

public class PairwiseSelector
extends java.lang.Object
implements OptionHandler, java.io.Serializable

PairwiseSelector class. Given a string metric and training data, create a set of instance pairs that correspond to metric training data

See Also:
Serialized Form

Nested Class Summary
 class PairwiseSelector.ReverseComparator
          We will need this reverse comparator class to traverse a TreeSet backwards
 
Field Summary
protected  java.util.HashMap m_classInstanceMap
          A hashmap where true object IDs are mapped to lists of strings of that object
protected  java.util.ArrayList m_classValueList
          A list of classes, each element is the double value of the class attribute
protected  boolean m_debug
          Output debugging information
protected  Instances m_instances
          The set of instances used for training
protected  double m_maxImplicitCommonTokenFraction
          The maximum fraction of common tokens that instances can have to be included as implicit negatives
protected  int m_negativesMode
           
protected  java.util.ArrayList m_negPairList
          A list with a sufficient pool of negative examples as TrainingPair's
protected  int m_negStringMode
           
protected  int m_numPotentialNegatives
          The number of possible different-class pairs
protected  int m_numPotentialPositives
          The number of possible same-class pairs
protected  int m_positivesMode
           
protected  java.util.ArrayList m_posPairList
          A list with all the positive examples as TrainingPair's
protected  int m_posStringMode
           
protected  boolean m_useFalseImplicitNegatives
           
protected  boolean m_useRejectedPositives
           
static int NEG_MODE_IMPLICIT_NEGATIVES
           
static int NEG_MODE_RANDOM_NEGATIVES
           
static int NEG_MODE_RANDOM_RECORDS
           
static int POS_MODE_RANDOM_POSITIVES
           
static int POS_MODE_RANDOM_RECORDS
          The record pair selection method
static int POS_MODE_STATIC_ACTIVE
           
static int STRING_PAIRS_EASIEST
           
static int STRING_PAIRS_HARDEST
           
static int STRING_PAIRS_RANDOM
          String pair selection method
static Tag[] TAGS_NEG_MODE
           
static Tag[] TAGS_POS_MODE
           
static Tag[] TAGS_STRING_PAIR_MODE
           
 
Constructor Summary
PairwiseSelector()
          A default constructor
 
Method Summary
protected  double addUniquePair(java.util.TreeSet set, StringPair pair)
          Add a pair to a TreeSet so that there are no collisions, and no values are erased
protected  Instance createInstance(InstancePair pair, int[] attrIdxs, StringMetric[][] metrics)
          Create a nonsparse instance with features corresponding to the metric values between used fields of the two given instances
protected  void createNegPairList()
          Populate m_negPairList with negative InstancePair's
protected  void createPosPairList()
          Populate m_posPairList with all positive InstancePair's
protected  InstancePair createRandomTrainInstancePair(java.util.HashSet usedPairSet, java.util.HashMap checksumMap)
           
static double fractionCommonTokens(java.lang.String s1, java.lang.String s2)
          return the number of common tokens that two strings have
 boolean getDebug()
          See whether debugging output is on/off
 Instances getInstances(int[] attrIdxs, StringMetric[][] stringMetrics, int numPosPairs, int numNegPairs)
          Generate a training set of diffInstances.
 double getMaxImplicitCommonTokenFraction()
          Get the maximum fraction of common tokens that instances can have to be included as implicit negatives
 SelectedTag getNegativesMode()
          return the selection mode for negatives
 SelectedTag getNegStringMode()
          return the selection mode for negative string examples
 java.lang.String[] getOptions()
          Gets the current settings of PairwiseSelector.
 SelectedTag getPositivesMode()
          return the selection mode for positives
 SelectedTag getPosStringMode()
          return the selection mode for positive string examples
 java.util.ArrayList getStringPairList(Instances instances, int attrIdx, int numPosPairs, int numNegPairs, StringMetric metric)
          Provide a list of string pairs for training a metric, using the given training instances
 boolean getUseFalseImplicitNegatives()
          Check whether using false implicit negatives is on/off
 boolean getUseRejectedPositives()
          Check whether using rejected positives as negatives is on or off
static boolean haveCommonTokens(java.lang.String s1, java.lang.String s2)
          return true if two strings have common tokens
 void initSelector(Instances instances)
          Initialize m_classInstanceMap and m_classValueList using a given set of instances
protected  boolean isUniqueInstance(Instance instance, java.util.HashMap checksumMap, double[] checksumCoeffs)
          Check whether an instance is unique
 java.util.Enumeration listOptions()
          Returns an enumeration describing the available options.
protected  double[] populateNegStrPairSet(StringMetric metric, java.util.TreeSet strPairSet, int attrIdx)
          Populate a provided treeset with a sufficient population of negative StringPair's
protected  double[] populatePosStrPairSet(StringMetric metric, java.util.TreeSet strPairSet, int attrIdx)
          Populate a provided treeset with all positive StringPair's
static int[] randomSubset(int numIdxs, int maxIdx)
          get an array of random indices out of n possible values.
 java.util.TreeSet reverseCopy(java.util.Set set)
          Given a set, return a TreeSet whose items are accessed in descending order
 void setDebug(boolean debug)
          Turn debugging output on/off
 void setMaxImplicitCommonTokenFraction(double maxImplicitCommonTokenFraction)
          Set the maximum fraction of common tokens that instances can have to be included as implicit negatives
 void setNegativesMode(SelectedTag mode)
          Set the selection mode for negatives
 void setNegStringMode(SelectedTag mode)
          Set the selection mode for negative string examples
 void setOptions(java.lang.String[] options)
          Parses a given list of options.
 void setPositivesMode(SelectedTag mode)
          Set the selection mode for positives
 void setPosStringMode(SelectedTag mode)
          Set the selection mode for positive string examples
 void setUseFalseImplicitNegatives(boolean useFalseImplicitNegatives)
          Turn using false implicit negatives on/off
 void setUseRejectedPositives(boolean useRejectedPositives)
          Turn using rejected positives as negatives on/off
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

m_instances

protected Instances m_instances
The set of instances used for training


m_classInstanceMap

protected java.util.HashMap m_classInstanceMap
A hashmap where true object IDs are mapped to lists of strings of that object


m_classValueList

protected java.util.ArrayList m_classValueList
A list of classes, each element is the double value of the class attribute


m_posPairList

protected java.util.ArrayList m_posPairList
A list with all the positive examples as TrainingPair's


m_negPairList

protected java.util.ArrayList m_negPairList
A list with a sufficient pool of negative examples as TrainingPair's


m_numPotentialPositives

protected int m_numPotentialPositives
The number of possible same-class pairs


m_numPotentialNegatives

protected int m_numPotentialNegatives
The number of possible different-class pairs


m_debug

protected boolean m_debug
Output debugging information


POS_MODE_RANDOM_RECORDS

public static final int POS_MODE_RANDOM_RECORDS
The record pair selection method

See Also:
Constant Field Values

POS_MODE_RANDOM_POSITIVES

public static final int POS_MODE_RANDOM_POSITIVES
See Also:
Constant Field Values

POS_MODE_STATIC_ACTIVE

public static final int POS_MODE_STATIC_ACTIVE
See Also:
Constant Field Values

TAGS_POS_MODE

public static final Tag[] TAGS_POS_MODE

m_positivesMode

protected int m_positivesMode

m_useRejectedPositives

protected boolean m_useRejectedPositives

NEG_MODE_RANDOM_RECORDS

public static final int NEG_MODE_RANDOM_RECORDS
See Also:
Constant Field Values

NEG_MODE_RANDOM_NEGATIVES

public static final int NEG_MODE_RANDOM_NEGATIVES
See Also:
Constant Field Values

NEG_MODE_IMPLICIT_NEGATIVES

public static final int NEG_MODE_IMPLICIT_NEGATIVES
See Also:
Constant Field Values

TAGS_NEG_MODE

public static final Tag[] TAGS_NEG_MODE

m_negativesMode

protected int m_negativesMode

m_useFalseImplicitNegatives

protected boolean m_useFalseImplicitNegatives

STRING_PAIRS_RANDOM

public static final int STRING_PAIRS_RANDOM
String pair selection method

See Also:
Constant Field Values

STRING_PAIRS_HARDEST

public static final int STRING_PAIRS_HARDEST
See Also:
Constant Field Values

STRING_PAIRS_EASIEST

public static final int STRING_PAIRS_EASIEST
See Also:
Constant Field Values

TAGS_STRING_PAIR_MODE

public static final Tag[] TAGS_STRING_PAIR_MODE

m_posStringMode

protected int m_posStringMode

m_negStringMode

protected int m_negStringMode

m_maxImplicitCommonTokenFraction

protected double m_maxImplicitCommonTokenFraction
The maximum fraction of common tokens that instances can have to be included as implicit negatives

Constructor Detail

PairwiseSelector

public PairwiseSelector()
A default constructor

Method Detail

initSelector

public void initSelector(Instances instances)
Initialize m_classInstanceMap and m_classValueList using a given set of instances

Parameters:
instances - a set of instances from which pair examples will be selected

getInstances

public Instances getInstances(int[] attrIdxs,
                              StringMetric[][] stringMetrics,
                              int numPosPairs,
                              int numNegPairs)
                       throws java.lang.Exception
Generate a training set of diffInstances. initSelector must have been called earlier to initialize m_posPairList and m_negPairList.

Parameters:
attrIdxs - indices of fields that should be utilized
stringMetrics - metrics that should be used on training pairs to generate diffInstances
numPosPairs - the desired number of positive (same-class) diffInstance's
numNegPairs - the desired number of negative (different-class) diffInstance's
Throws:
java.lang.Exception

createRandomTrainInstancePair

protected InstancePair createRandomTrainInstancePair(java.util.HashSet usedPairSet,
                                                     java.util.HashMap checksumMap)

createInstance

protected Instance createInstance(InstancePair pair,
                                  int[] attrIdxs,
                                  StringMetric[][] metrics)
                           throws java.lang.Exception
Create a nonsparse instance with features corresponding to the metric values between used fields of the two given instances

Parameters:
attrIdxs - indices of fields that should be utilized
metrics - the string metrics that are used to create the training instances
Returns:
a newly created diffInstance, or null if all diff-values are 0
Throws:
java.lang.Exception

isUniqueInstance

protected boolean isUniqueInstance(Instance instance,
                                   java.util.HashMap checksumMap,
                                   double[] checksumCoeffs)
Check whether an instance is unique

Parameters:
instance - instance to be checked
checksumMap - a map where checksum values are mapped to lists of instances
Returns:
true if the instance is unique, false otherwise

getStringPairList

public java.util.ArrayList getStringPairList(Instances instances,
                                             int attrIdx,
                                             int numPosPairs,
                                             int numNegPairs,
                                             StringMetric metric)
                                      throws java.lang.Exception
Provide a list of string pairs for training a metric, using the given training instances

Parameters:
metric - the metric to train
instances - data to train the metric on
Returns:
a list of StringPair's that is training data for a particular field
Throws:
java.lang.Exception - if training has gone bad.

addUniquePair

protected double addUniquePair(java.util.TreeSet set,
                               StringPair pair)
Add a pair to a TreeSet so that there are no collisions, and no values are erased

Parameters:
set - a set to which a new pair should be added
pair - a new pair of strings that is to be added; its value field holds the distance between the strings
Returns:
the unique value of the distance (possibly perturbed) with which the pair was added

populatePosStrPairSet

protected double[] populatePosStrPairSet(StringMetric metric,
                                         java.util.TreeSet strPairSet,
                                         int attrIdx)
                                  throws java.lang.Exception
Populate a provided treeset with all positive StringPair's

Parameters:
metric - a metric that will be used to calculate distance
attrIdx - the index of the attribute for which positive string pairs are being accumulated
Returns:
an array with distance values of the created pairs
Throws:
java.lang.Exception

populateNegStrPairSet

protected double[] populateNegStrPairSet(StringMetric metric,
                                         java.util.TreeSet strPairSet,
                                         int attrIdx)
                                  throws java.lang.Exception
Populate a provided treeset with a sufficient population of negative StringPair's

Parameters:
metric - a metric that will be used to calculate distance between strings
attrIdx - the index of the attribute for which negative string pairs are being accumulated
Returns:
an array with distance values of the created pairs
Throws:
java.lang.Exception

createPosPairList

protected void createPosPairList()
Populate m_posPairList with all positive InstancePair's


createNegPairList

protected void createNegPairList()
Populate m_negPairList with negative InstancePair's


reverseCopy

public java.util.TreeSet reverseCopy(java.util.Set set)
Given a set, return a TreeSet whose items are accessed in descending order

Parameters:
set - any set containing Comparable objects
Returns:
a new ordered set with those objects in reverse order

setPositivesMode

public void setPositivesMode(SelectedTag mode)
Set the selection mode for positives

Parameters:
mode - selection mode for positive examples

getPositivesMode

public SelectedTag getPositivesMode()
return the selection mode for positives

Returns:
one of the selection modes

setNegativesMode

public void setNegativesMode(SelectedTag mode)
Set the selection mode for negatives

Parameters:
mode - selection mode for negative examples

getNegativesMode

public SelectedTag getNegativesMode()
return the selection mode for negatives

Returns:
one of the selection modes

setMaxImplicitCommonTokenFraction

public void setMaxImplicitCommonTokenFraction(double maxImplicitCommonTokenFraction)
Set the maximum fraction of common tokens that instances can have to be included as implicit negatives

Parameters:
maxImplicitCommonTokenFraction - the maximum fraction of common tokens that instances can have to be included as implicit negatives

getMaxImplicitCommonTokenFraction

public double getMaxImplicitCommonTokenFraction()
Get the maximum fraction of common tokens that instances can have to be included as implicit negatives

Returns:
the fraction

setUseRejectedPositives

public void setUseRejectedPositives(boolean useRejectedPositives)
Turn using rejected positives as negatives on/off

Parameters:
useRejectedPositives - if true, false positives that were picked during the static-active selection will be added to the negatives set

getUseRejectedPositives

public boolean getUseRejectedPositives()
Check whether using rejected positives as negatives is on or off

Returns:
returns true if false positives that were picked during the static-active selection are added to the negatives set

setUseFalseImplicitNegatives

public void setUseFalseImplicitNegatives(boolean useFalseImplicitNegatives)
Turn using false implicit negatives on/off

Parameters:
useFalseImplicitNegatives - if true, false implicit negatives will be added to positives

getUseFalseImplicitNegatives

public boolean getUseFalseImplicitNegatives()
Check whether using false implicit negatives is on/off

Returns:
true if false implicit negatives are added to positives

setPosStringMode

public void setPosStringMode(SelectedTag mode)
Set the selection mode for positive string examples

Parameters:
mode - selection mode for positive string examples

getPosStringMode

public SelectedTag getPosStringMode()
return the selection mode for positive string examples

Returns:
one of the selection modes for positive string examples

setNegStringMode

public void setNegStringMode(SelectedTag mode)
Set the selection mode for negative string examples

Parameters:
mode - selection mode for negative string examples

getNegStringMode

public SelectedTag getNegStringMode()
return the selection mode for negative string examples

Returns:
one of the selection modes for negative string examples

setDebug

public void setDebug(boolean debug)
Turn debugging output on/off

Parameters:
debug - if true, debugging info will be printed

getDebug

public boolean getDebug()
See whether debugging output is on/off


randomSubset

public static int[] randomSubset(int numIdxs,
                                 int maxIdx)
get an array of random indices out of n possible values. If the number of requested indices is larger than maxIdx, returns maxIdx permuted values

Parameters:
maxIdx - the maximum index of the set
numIdxs - number of indexes to return
Returns:
an array of indexes

haveCommonTokens

public static boolean haveCommonTokens(java.lang.String s1,
                                       java.lang.String s2)
return true if two strings have common tokens


fractionCommonTokens

public static double fractionCommonTokens(java.lang.String s1,
                                          java.lang.String s2)
return the number of common tokens that two strings have

Parameters:
s1 - string 1
s2 - string 2
Returns:
the number of common tokens the strings have

getOptions

public java.lang.String[] getOptions()
Gets the current settings of PairwiseSelector.

Specified by:
getOptions in interface OptionHandler
Returns:
an array of strings suitable for passing to setOptions()

setOptions

public void setOptions(java.lang.String[] options)
                throws java.lang.Exception
Parses a given list of options. Valid options are:

Specified by:
setOptions in interface OptionHandler
Parameters:
options - the list of options as an array of strings
Throws:
java.lang.Exception - if an option is not supported

listOptions

public java.util.Enumeration listOptions()
Returns an enumeration describing the available options.

Specified by:
listOptions in interface OptionHandler
Returns:
an enumeration of all the available options.