|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
weka.deduping.PairwiseSelector
PairwiseSelector class. Given a string metric and training data, create a set of instance pairs that correspond to metric training data
Nested Class Summary | |
class |
PairwiseSelector.ReverseComparator
We will need this reverse comparator class to traverse a TreeSet backwards |
Field Summary | |
protected java.util.HashMap |
m_classInstanceMap
A hashmap where true object IDs are mapped to lists of strings of that object |
protected java.util.ArrayList |
m_classValueList
A list of classes, each element is the double value of the class attribute |
protected boolean |
m_debug
Output debugging information |
protected Instances |
m_instances
The set of instances used for training |
protected double |
m_maxImplicitCommonTokenFraction
The maximum fraction of common tokens that instances can have to be included as implicit negatives |
protected int |
m_negativesMode
|
protected java.util.ArrayList |
m_negPairList
A list with a sufficient pool of negative examples as TrainingPair's |
protected int |
m_negStringMode
|
protected int |
m_numPotentialNegatives
The number of possible different-class pairs |
protected int |
m_numPotentialPositives
The number of possible same-class pairs |
protected int |
m_positivesMode
|
protected java.util.ArrayList |
m_posPairList
A list with all the positive examples as TrainingPair's |
protected int |
m_posStringMode
|
protected boolean |
m_useFalseImplicitNegatives
|
protected boolean |
m_useRejectedPositives
|
static int |
NEG_MODE_IMPLICIT_NEGATIVES
|
static int |
NEG_MODE_RANDOM_NEGATIVES
|
static int |
NEG_MODE_RANDOM_RECORDS
|
static int |
POS_MODE_RANDOM_POSITIVES
|
static int |
POS_MODE_RANDOM_RECORDS
The record pair selection method |
static int |
POS_MODE_STATIC_ACTIVE
|
static int |
STRING_PAIRS_EASIEST
|
static int |
STRING_PAIRS_HARDEST
|
static int |
STRING_PAIRS_RANDOM
String pair selection method |
static Tag[] |
TAGS_NEG_MODE
|
static Tag[] |
TAGS_POS_MODE
|
static Tag[] |
TAGS_STRING_PAIR_MODE
|
Constructor Summary | |
PairwiseSelector()
A default constructor |
Method Summary | |
protected double |
addUniquePair(java.util.TreeSet set,
StringPair pair)
Add a pair to a TreeSet so that there are no collisions, and no values are erased |
protected Instance |
createInstance(InstancePair pair,
int[] attrIdxs,
StringMetric[][] metrics)
Create a nonsparse instance with features corresponding to the metric values between used fields of the two given instances |
protected void |
createNegPairList()
Populate m_negPairList with negative InstancePair's |
protected void |
createPosPairList()
Populate m_posPairList with all positive InstancePair's |
protected InstancePair |
createRandomTrainInstancePair(java.util.HashSet usedPairSet,
java.util.HashMap checksumMap)
|
static double |
fractionCommonTokens(java.lang.String s1,
java.lang.String s2)
Return the number of common tokens that two strings have
boolean |
getDebug()
See whether debugging output is on/off |
Instances |
getInstances(int[] attrIdxs,
StringMetric[][] stringMetrics,
int numPosPairs,
int numNegPairs)
Generate a training set of diffInstances. |
double |
getMaxImplicitCommonTokenFraction()
Get the maximum fraction of common tokens that instances can have to be included as implicit negatives |
SelectedTag |
getNegativesMode()
return the selection mode for negatives |
SelectedTag |
getNegStringMode()
return the selection mode for negative string examples |
java.lang.String[] |
getOptions()
Gets the current settings of PairwiseSelector. |
SelectedTag |
getPositivesMode()
return the selection mode for positives |
SelectedTag |
getPosStringMode()
return the selection mode for positive string examples |
java.util.ArrayList |
getStringPairList(Instances instances,
int attrIdx,
int numPosPairs,
int numNegPairs,
StringMetric metric)
Provide a list of string pairs for training the given metric, using the given training instances |
boolean |
getUseFalseImplicitNegatives()
Check whether using false implicit negatives is on/off |
boolean |
getUseRejectedPositives()
Check whether using rejected positives as negatives is on or off |
static boolean |
haveCommonTokens(java.lang.String s1,
java.lang.String s2)
Return true if two strings have common tokens
void |
initSelector(Instances instances)
Initialize m_classInstanceMap and m_classValueList using a given set of instances |
protected boolean |
isUniqueInstance(Instance instance,
java.util.HashMap checksumMap,
double[] checksumCoeffs)
Check whether an instance is unique |
java.util.Enumeration |
listOptions()
Returns an enumeration describing the available options. |
protected double[] |
populateNegStrPairSet(StringMetric metric,
java.util.TreeSet strPairSet,
int attrIdx)
Populate a provided treeset with a sufficient population of negative StringPair's |
protected double[] |
populatePosStrPairSet(StringMetric metric,
java.util.TreeSet strPairSet,
int attrIdx)
Populate a provided treeset with all positive StringPair's |
static int[] |
randomSubset(int numIdxs,
int maxIdx)
Get an array of random indices out of n possible values. |
java.util.TreeSet |
reverseCopy(java.util.Set set)
Given a set, return a TreeSet whose items are accessed in descending order |
void |
setDebug(boolean debug)
Turn debugging output on/off |
void |
setMaxImplicitCommonTokenFraction(double maxImplicitCommonTokenFraction)
Set the maximum fraction of common tokens that instances can have to be included as implicit negatives |
void |
setNegativesMode(SelectedTag mode)
Set the selection mode for negatives |
void |
setNegStringMode(SelectedTag mode)
Set the selection mode for negative string examples |
void |
setOptions(java.lang.String[] options)
Parses a given list of options. |
void |
setPositivesMode(SelectedTag mode)
Set the selection mode for positives |
void |
setPosStringMode(SelectedTag mode)
Set the selection mode for positive string examples |
void |
setUseFalseImplicitNegatives(boolean useFalseImplicitNegatives)
Turn using false implicit negatives on/off |
void |
setUseRejectedPositives(boolean useRejectedPositives)
Turn using rejected positives as negatives on/off |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
protected Instances m_instances
protected java.util.HashMap m_classInstanceMap
protected java.util.ArrayList m_classValueList
protected java.util.ArrayList m_posPairList
protected java.util.ArrayList m_negPairList
protected int m_numPotentialPositives
protected int m_numPotentialNegatives
protected boolean m_debug
public static final int POS_MODE_RANDOM_RECORDS
public static final int POS_MODE_RANDOM_POSITIVES
public static final int POS_MODE_STATIC_ACTIVE
public static final Tag[] TAGS_POS_MODE
protected int m_positivesMode
protected boolean m_useRejectedPositives
public static final int NEG_MODE_RANDOM_RECORDS
public static final int NEG_MODE_RANDOM_NEGATIVES
public static final int NEG_MODE_IMPLICIT_NEGATIVES
public static final Tag[] TAGS_NEG_MODE
protected int m_negativesMode
protected boolean m_useFalseImplicitNegatives
public static final int STRING_PAIRS_RANDOM
public static final int STRING_PAIRS_HARDEST
public static final int STRING_PAIRS_EASIEST
public static final Tag[] TAGS_STRING_PAIR_MODE
protected int m_posStringMode
protected int m_negStringMode
protected double m_maxImplicitCommonTokenFraction
Constructor Detail |
public PairwiseSelector()
Method Detail |
public void initSelector(Instances instances)
instances
- a set of instances from which pair examples will be selected
public Instances getInstances(int[] attrIdxs, StringMetric[][] stringMetrics, int numPosPairs, int numNegPairs) throws java.lang.Exception
attrIdxs
- indices of fields that should be utilized
stringMetrics
- metrics that should be used on training pairs to generate diffInstances
numPosPairs
- the desired number of positive (same-class) diffInstances
numNegPairs
- the desired number of negative (different-class) diffInstances
java.lang.Exception
protected InstancePair createRandomTrainInstancePair(java.util.HashSet usedPairSet, java.util.HashMap checksumMap)
protected Instance createInstance(InstancePair pair, int[] attrIdxs, StringMetric[][] metrics) throws java.lang.Exception
attrIdxs
- indices of fields that should be utilized
metrics
- the string metrics that are used to create the training instances
java.lang.Exception
protected boolean isUniqueInstance(Instance instance, java.util.HashMap checksumMap, double[] checksumCoeffs)
instance
- instance to be checked
checksumMap
- a map where checksum values are mapped to lists of instances
public java.util.ArrayList getStringPairList(Instances instances, int attrIdx, int numPosPairs, int numNegPairs, StringMetric metric) throws java.lang.Exception
metric
- the metric to train
instances
- data to train the metric on
java.lang.Exception
- if training has gone bad.
protected double addUniquePair(java.util.TreeSet set, StringPair pair)
set
- a set to which a new pair should be added
pair
- a new pair of strings that is to be added; its value field holds the distance between the strings
protected double[] populatePosStrPairSet(StringMetric metric, java.util.TreeSet strPairSet, int attrIdx) throws java.lang.Exception
metric
- a metric that will be used to calculate distance
attrIdx
- the index of the attribute for which positive string pairs are being accumulated
java.lang.Exception
protected double[] populateNegStrPairSet(StringMetric metric, java.util.TreeSet strPairSet, int attrIdx) throws java.lang.Exception
metric
- a metric that will be used to calculate distance between strings
attrIdx
- the index of the attribute for which negative string pairs are being accumulated
java.lang.Exception
protected void createPosPairList()
protected void createNegPairList()
public java.util.TreeSet reverseCopy(java.util.Set set)
set
- any set containing Comparable objects
public void setPositivesMode(SelectedTag mode)
mode
- selection mode for positive examples
public SelectedTag getPositivesMode()
public void setNegativesMode(SelectedTag mode)
mode
- selection mode for negative examples
public SelectedTag getNegativesMode()
public void setMaxImplicitCommonTokenFraction(double maxImplicitCommonTokenFraction)
maxImplicitCommonTokenFraction
- the maximum fraction of common tokens that instances can have to be included as implicit negatives
public double getMaxImplicitCommonTokenFraction()
public void setUseRejectedPositives(boolean useRejectedPositives)
useRejectedPositives
- if true, false positives that were picked during the static-active selection will be added to the negatives set
public boolean getUseRejectedPositives()
public void setUseFalseImplicitNegatives(boolean useFalseImplicitNegatives)
useFalseImplicitNegatives
- if true, false implicit negatives will be added to positives
public boolean getUseFalseImplicitNegatives()
public void setPosStringMode(SelectedTag mode)
mode
- selection mode for positive string examples
public SelectedTag getPosStringMode()
public void setNegStringMode(SelectedTag mode)
mode
- selection mode for negative string examples
public SelectedTag getNegStringMode()
public void setDebug(boolean debug)
debug
- if true, debugging info will be printed
public boolean getDebug()
public static int[] randomSubset(int numIdxs, int maxIdx)
maxIdx
- the maximum index of the set
numIdxs
- number of indexes to return
public static boolean haveCommonTokens(java.lang.String s1, java.lang.String s2)
public static double fractionCommonTokens(java.lang.String s1, java.lang.String s2)
s1
- string 1
s2
- string 2
public java.lang.String[] getOptions()
getOptions
in interface OptionHandler
public void setOptions(java.lang.String[] options) throws java.lang.Exception
setOptions
in interface OptionHandler
options
- the list of options as an array of strings
java.lang.Exception
- if an option is not supported
public java.util.Enumeration listOptions()
listOptions
in interface OptionHandler
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |