Logo Search packages:      
Sourcecode: ucto version File versions  Download package

Public Member Functions | Private Member Functions | Private Attributes

Tokenizer::TokenizerClass Class Reference

Collaboration diagram for Tokenizer::TokenizerClass:
Collaboration graph

List of all members.

Public Member Functions

int countSentences (bool forceentirebuffer=false)
void detectQuoteBounds (const int, const UChar)
void detectSentenceBounds (const int offset=0)
bool empty () const
int flushSentences (const int)
int getDebug ()
std::string getDocID ()
std::string getEosMarker ()
bool getFiltering () const
std::string getInputEncoding () const
bool getLowercase ()
std::string getNormalization () const
bool getParagraphDetection ()
bool getPassThru ()
bool getQuoteDetection () const
bool getSentence (int, int &begin, int &end)
std::vector< Token * > getSentence (int)
bool getSentenceDetection ()
bool getSentencePerLineInput ()
bool getSentencePerLineOutput ()
std::vector< std::string > getSentences () const
std::string getSentenceString (unsigned int, const bool=false)
bool getUppercase ()
bool getVerbose ()
bool getXMLOutput ()
bool init (const std::string &, const std::string &)
void outputTokens (std::ostream &, const size_t, const size_t, const bool=false) const
void outputTokensXML (std::ostream &, const size_t, const size_t, bool &)
void outputXMLFooter (std::ostream &, bool)
void outputXMLHeader (std::ostream &)
void passthruLine (const std::string &, bool &)
int setDebug (int d)
std::string setEosMarker (const std::string &s="<utt>")
void setErrorLog (std::ostream *os)
bool setFiltering (bool b=true)
std::string setInputEncoding (const std::string &)
bool setLowercase (bool b=true)
std::string setNormalization (const std::string &s)
bool setParagraphDetection (bool b=true)
bool setPassThru (bool b=true)
bool setQuoteDetection (bool b=true)
bool setSentenceDetection (bool b=true)
bool setSentencePerLineInput (bool b=true)
bool setSentencePerLineOutput (bool b=true)
bool setUppercase (bool b=true)
bool setVerbose (bool b=true)
bool setXMLOutput (bool b, std::string id)
void signalParagraph (bool b=true)
bool terminatesWithEOS () const
void tokenize (std::istream &, std::ostream &)
void tokenize (std::istream *in, std::ostream *out)
int tokenizeLine (const UnicodeString &)
int tokenizeLine (const std::string &)

Private Member Functions

void clear ()
bool detectEos (UChar)
bool readrules (const std::string &)
bool readsettings (const std::string &, const std::string &)
bool resolveQuote (int, const UnicodeString &)
void tokenizeWord (const UnicodeString &, bool)

Private Attributes

int count_p
int count_s
int count_w
bool detectBounds
bool detectPar
bool detectQuotes
std::string docid
bool doFilter
std::string eosmark
UnicodeString eosmarkers
UnicodeFilter filter
bool firstoutput
std::string inputEncoding
bool lowercase
UnicodeNormalizer normalizer
bool paragraphsignal
bool passthru
Quoting quotes
std::vector< Rule * > rules
bool sentenceperlineinput
bool sentenceperlineoutput
bool sentencesignal
std::string settingsfilename
std::ostream * theErrLog
int tokDebug
std::vector< Token > tokens
bool uppercase
bool verbose
bool xmlout

Detailed Description

Definition at line 131 of file tokenize.h.

The documentation for this class was generated from the following files:

Generated by  Doxygen 1.6.0   Back to index