|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
  weka.deduping.metrics.Tokenizer
This abstract class defines a tokenizer that turns strings into HashMapVectors
Field Summary | |
protected boolean |
m_caseInsensitive
Converting all tokens to lowercase |
protected Porter |
m_stemmer
|
protected boolean |
m_stemming
Stemming |
protected static java.lang.String |
m_stopwordFilename
The file with the stopword list |
protected boolean |
m_stopwordRemoval
Stopword removal |
protected static java.util.HashSet |
m_stopwordSet
Stopword hash |
Constructor Summary | |
Tokenizer()
|
Method Summary | |
boolean |
getCaseInsensitive()
Find out whether case-insensitivity is on/off |
boolean |
getStemming()
Find out whether stemming is on/off |
boolean |
getStopwordRemoval()
Get whether stopword removal is on or off |
void |
setCaseInsensitive(boolean caseInsensitive)
Turn case-insensitivity on/off |
void |
setStemming(boolean stemming)
Turn stemming on/off |
void |
setStopwordRemoval(boolean stopwordRemoval)
Turn stopword removal on/off and load the stopwords |
java.lang.String |
stem(java.lang.String token)
Stem a given token |
abstract HashMapVector |
tokenize(java.lang.String string)
Take a string and create a vector of tokens from it |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
protected boolean m_caseInsensitive
protected boolean m_stemming
protected Porter m_stemmer
protected boolean m_stopwordRemoval
protected static java.lang.String m_stopwordFilename
protected static java.util.HashSet m_stopwordSet
Constructor Detail |
public Tokenizer()
Method Detail |
public abstract HashMapVector tokenize(java.lang.String string)
string
- a String to tokenize
public void setCaseInsensitive(boolean caseInsensitive)
caseInsensitive
- if true, the tokenizer is case-insensitive
public boolean getCaseInsensitive()
public void setStemming(boolean stemming)
stemming
- if true, stemming is used
public boolean getStemming()
public java.lang.String stem(java.lang.String token)
token
- the token to be stemmed
public void setStopwordRemoval(boolean stopwordRemoval)
stopwordRemoval
- if true, stopwords from m_stopwordFilename will be removed
public boolean getStopwordRemoval()
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |