#include <tokenizer.h>
Public Member Functions | |
tokenizer (const string &) | |
Constructor. | |
list< word > | tokenize (const string &) |
tokenize string with default options | |
list< word > | tokenize (const string &, int &) |
tokenize string with default options, tracking offset | |
Private Attributes | |
set< string > | abrevs |
abbreviations set (Dr., Mrs., etc.; the period is not separated) | |
vector< pair< string, RegEx > > | rules |
tokenization rules | |
map< string, int > | matches |
substrings to convert into tokens in each rule |
|
Constructor.
|
|
tokenize string with default options, tracking offset
|
|
tokenize string with default options
|
|
abbreviations set (Dr., Mrs., etc.; the period is not separated)
|
|
substrings to convert into tokens in each rule
|
|
tokenization rules
|