|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
  weka.deduping.metrics.StringMetric
    weka.deduping.metrics.AffineProbMetric
The AffineProbMetric class implements a probabilistic model of string edit distance with affine-cost gaps.
Field Summary | |
protected char |
blank
A handy constant for insertions/deletions; we treat them as substitutions with a null character |
static int |
CONVERSION_EXPONENTIAL
|
static int |
CONVERSION_LAPLACIAN
We can have different ways of converting from distance to similarity |
static int |
CONVERSION_UNIT
|
protected double |
m_clampProb
Minimal value of a probability parameter. |
protected int |
m_conversionType
The method of converting, by default laplacian |
protected double[][] |
m_editopCosts
parameters for the additive model, obtained from log-probs to speed up computations in the "testing" phase after weights have been learned |
protected double[][] |
m_editopLogProbs
|
protected double[][] |
m_editopOccs
|
protected double[][] |
m_editopProbs
|
protected double |
m_endAtGapCost
|
protected double |
m_endAtGapLogProb
|
protected double |
m_endAtGapOccs
|
protected double |
m_endAtGapProb
|
protected double |
m_endAtSubCost
|
protected double |
m_endAtSubLogProb
|
protected double |
m_endAtSubOccs
|
protected double |
m_endAtSubProb
|
protected double |
m_gapEndCost
|
protected double |
m_gapEndLogProb
|
protected double |
m_gapEndOccs
|
protected double |
m_gapEndProb
|
protected double |
m_gapExtendCost
|
protected double |
m_gapExtendLogProb
|
protected double |
m_gapExtendOccs
|
protected double |
m_gapExtendProb
|
protected double |
m_gapStartCost
|
protected double |
m_gapStartLogProb
|
protected double |
m_gapStartOccs
|
protected double |
m_gapStartProb
|
protected double |
m_noopCost
|
protected double |
m_noopLogProb
Parameters for the generative model |
protected double |
m_noopOccs
Parameters for the generative model |
protected double |
m_noopProb
Parameters for the generative model |
protected boolean |
m_normalized
Normalization of edit distance by string length; equivalent to using the posterior probability in the generative model |
protected int |
m_numIterations
Maximum number of iterations for training the model; training usually converges in fewer than 10 iterations |
protected double |
m_subCost
|
protected double |
m_subLogProb
|
protected double |
m_subOccs
|
protected double |
m_subProb
|
protected char[] |
m_usedChars
TODO: given a corpus, populate this array with the characters that are actually encountered |
protected boolean |
m_useGenerativeModel
True if we are using a generative model for distance in the "testing" phase after learning the parameters. By default we want to use the additive model that uses probabilities converted to costs |
protected boolean |
m_verbose
|
static Tag[] |
TAGS_CONVERSION
|
Constructor Summary | |
AffineProbMetric()
set up an instance of AffineProbMetric |
Method Summary | |
protected double[][][] |
backward(java.lang.String _s1,
java.lang.String _s2)
Calculate the backward matrices |
java.lang.Object |
clone()
Create a copy of this metric |
double |
costDistance(java.lang.String string1,
java.lang.String string2)
Calculate affine gapped distance using learned costs |
double |
distance(java.lang.String s1,
java.lang.String s2)
Get the distance between two strings |
protected double |
expectationStep(java.lang.String _s1,
java.lang.String _s2,
int lambda,
boolean pos_training)
Expectation part of the EM algorithm: accumulates expectations of editop probabilities over example pairs. The expectation is calculated based on two examples which are either duplicates (pos_training=true) or non-duplicates (pos_training=false). |
protected double[][][] |
forward(java.lang.String _s1,
java.lang.String _s2)
Calculate the forward matrices |
double |
getClampProb()
Get the clamping probability value |
SelectedTag |
getConversionType()
return the type of similarity to distance conversion |
boolean |
getNormalized()
Get whether the distance is normalized by the sum of the strings' lengths |
java.lang.String[] |
getOptions()
Gets the current settings of AffineProbMetric. |
boolean |
getUseGenerativeModel()
Do we use the generative model or convert back to the additive model? |
protected void |
initCosts()
initialize the costs using current values of the probabilities |
protected void |
initProbs()
initialize the probabilities to some startup values |
boolean |
isDistanceBased()
The computation of a metric can be either based on distance, or on similarity |
java.util.Enumeration |
listOptions()
Returns an enumeration describing the available options. |
protected double |
logSum(double _logA,
double _logB)
Calculation of log(a+b) with a correction for machine precision |
static void |
main(java.lang.String[] args)
|
protected void |
maximizationStep()
Maximization step of the EM algorithm |
protected void |
normalizeEmissionProbs()
Normalize the probabilities of emission editops so that they sum to 1 for each state |
protected void |
normalizeTransitionProbs()
Normalize the probabilities of transitions so that they sum to 1 for each state |
static void |
print3dMatrix(double[][][] matrix)
|
void |
printAlignmentMatrix(java.lang.String _s1,
java.lang.String _s2,
int idx,
double[][][] matrix)
|
void |
printMatrices(java.lang.String s1,
java.lang.String s2)
print out the three matrices |
protected void |
printOpProbs()
print out some data in case things go wrong |
protected void |
resetOccurrences()
reset the number of occurrences of all ops in the set |
void |
setClampProb(double clampProb)
Set the clamping probability value |
void |
setConversionType(SelectedTag conversionType)
Set the type of similarity to distance conversion. |
void |
setNormalized(boolean normalized)
Set the distance to be normalized by the sum of the strings' lengths |
int |
setNumIterations()
Get the number of training iterations (despite the "set" prefix, this no-argument overload is the getter) |
void |
setNumIterations(int numIterations)
Set the number of training iterations |
void |
setOptions(java.lang.String[] options)
Parses a given list of options. |
void |
setUseGenerativeModel(boolean useGenerativeModel)
Set the distance to use the generative model or convert back to the additive model |
double |
similarity(java.lang.String string1,
java.lang.String string2)
Returns a similarity estimate between two strings. |
void |
trainMetric(java.util.ArrayList pairList)
Train the distance parameters using provided examples using EM |
protected void |
updateLogProbs()
store logs of all probabilities in m_editopLogProbs |
Methods inherited from class weka.deduping.metrics.StringMetric |
forName |
Methods inherited from class java.lang.Object |
equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
protected double[][] m_editopProbs
protected double[][] m_editopLogProbs
protected double[][] m_editopOccs
protected double m_noopProb
protected double m_noopLogProb
protected double m_noopOccs
protected double m_endAtSubProb
protected double m_endAtSubLogProb
protected double m_endAtSubOccs
protected double m_endAtGapProb
protected double m_endAtGapLogProb
protected double m_endAtGapOccs
protected double m_gapStartProb
protected double m_gapStartLogProb
protected double m_gapStartOccs
protected double m_gapExtendProb
protected double m_gapExtendLogProb
protected double m_gapExtendOccs
protected double m_gapEndProb
protected double m_gapEndLogProb
protected double m_gapEndOccs
protected double m_subProb
protected double m_subLogProb
protected double m_subOccs
protected double[][] m_editopCosts
protected double m_noopCost
protected double m_endAtSubCost
protected double m_endAtGapCost
protected double m_gapStartCost
protected double m_gapExtendCost
protected double m_gapEndCost
protected double m_subCost
protected boolean m_useGenerativeModel
protected int m_numIterations
protected boolean m_normalized
protected double m_clampProb
protected final char blank
protected char[] m_usedChars
public static final int CONVERSION_LAPLACIAN
public static final int CONVERSION_UNIT
public static final int CONVERSION_EXPONENTIAL
public static final Tag[] TAGS_CONVERSION
protected int m_conversionType
protected boolean m_verbose
Constructor Detail |
public AffineProbMetric()
Method Detail |
protected double[][][] forward(java.lang.String _s1, java.lang.String _s2)
_s1
- first string
_s2
- second string
protected double[][][] backward(java.lang.String _s1, java.lang.String _s2)
_s1
- first string
_s2
- second string
public void printMatrices(java.lang.String s1, java.lang.String s2)
public void printAlignmentMatrix(java.lang.String _s1, java.lang.String _s2, int idx, double[][][] matrix)
protected void printOpProbs()
public void trainMetric(java.util.ArrayList pairList) throws java.lang.Exception
trainMetric
in interface LearnableStringMetric
pairList
- the training data as a list of StringPair's
java.lang.Exception
protected double expectationStep(java.lang.String _s1, java.lang.String _s2, int lambda, boolean pos_training)
_s1
- first string
_s2
- second string
lambda
- learning rate parameter, 1 by default
pos_training
- true if strings are matched, false if mismatched
protected void maximizationStep()
protected void normalizeEmissionProbs()
protected void normalizeTransitionProbs()
protected void resetOccurrences()
protected void initProbs()
protected void initCosts()
protected void updateLogProbs()
public double distance(java.lang.String s1, java.lang.String s2)
distance
in class StringMetric
s1
- first string
s2
- second string
protected double logSum(double _logA, double _logB)
public double costDistance(java.lang.String string1, java.lang.String string2)
public static void print3dMatrix(double[][][] matrix)
public void setNormalized(boolean normalized)
normalized
- if true, distance is normalized by the sum of the strings' lengths
public boolean getNormalized()
public void setUseGenerativeModel(boolean useGenerativeModel)
useGenerativeModel
- if true, the generative model is used
public boolean getUseGenerativeModel()
public void setClampProb(double clampProb)
clampProb
- a lower bound for all probability values to prevent underflow
public double getClampProb()
public void setNumIterations(int numIterations)
numIterations
- the number of iterations
public int setNumIterations()
public java.lang.Object clone()
clone
in class StringMetric
public java.lang.String[] getOptions()
getOptions
in interface OptionHandler
public void setOptions(java.lang.String[] options) throws java.lang.Exception
-N normalize by length -m matchCost -s subCost -g gapStartCost -e gapExtendCost
setOptions
in interface OptionHandler
options
- the list of options as an array of strings
java.lang.Exception
- if an option is not supported
public java.util.Enumeration listOptions()
listOptions
in interface OptionHandler
public boolean isDistanceBased()
isDistanceBased
in class StringMetric
public double similarity(java.lang.String string1, java.lang.String string2) throws java.lang.Exception
similarity
in class StringMetric
string1
- First string.
string2
- Second string.
java.lang.Exception
- if similarity could not be estimated.
public void setConversionType(SelectedTag conversionType)
public SelectedTag getConversionType()
public static void main(java.lang.String[] args)
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |