KJB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Tokenizer.h
Go to the documentation of this file.
1 #ifndef TOKENIZER_H_
2 #define TOKENIZER_H_
3 
11 #include <vector>
12 #include <string>
13 
14 #include "spear/Wide.h"
15 #include "spear/RCIPtr.h"
16 #include "spear/StringMap.h"
17 #include "spear/Word.h"
18 
19 namespace spear {
20 
// Tokenizer: declared as a public subclass of RCObject. Per the
// cross-references listed after this listing, handles to it are typed as
// RCIPtr<Tokenizer> (TokenizerPtr, Tokenizer.h:44) and `String` is a macro
// defined in spear/Wide.h. All member functions are only declared here; the
// cross-references place the public ones in Tokenizer.cc.
// NOTE(review): the class keeps mutable static state (`initialized`,
// `multiWords`); no synchronization is visible in this header, so
// thread-safety cannot be judged from here.
21 class Tokenizer : public RCObject {
22 
23  public:
24 
     // Constructs a tokenizer from `dataPath`.
     // NOTE(review): presumably forwards `dataPath` to initialize() to load
     // the shared data; confirm in Tokenizer.cc.
     // NOTE(review): single-argument and not `explicit`, so a std::string
     // converts implicitly to Tokenizer; confirm that is intended.
25  Tokenizer(const std::string & dataPath);
26 
     // Takes `buffer` read-only and `tokens` as a mutable output vector of
     // spear::Word.
     // NOTE(review): presumably splits `buffer` into words stored in `tokens`;
     // whether `tokens` is cleared first is not visible here. Confirm in
     // Tokenizer.cc.
27  void split(const String & buffer, std::vector<spear::Word> & tokens);
28 
     // Static helper: takes a String read-only and returns a String by value.
     // The normalization rules are not visible in this header; see
     // Tokenizer.cc.
29  static String normalize(const String & s);
30 
31  private:
32 
     // Static, class-wide initialization taking the same `dataPath` type as
     // the constructor.
     // NOTE(review): the bool result presumably signals success; TODO confirm.
33  static bool initialize(const std::string & dataPath);
34 
     // Class-wide flag shared by every Tokenizer instance.
     // NOTE(review): presumably records that initialize() has already run;
     // verify against Tokenizer.cc.
35  static bool initialized;
36 
     // Class-wide string-keyed map with bool values, shared by every instance.
     // NOTE(review): presumably the set of known multi-word expressions,
     // filled by initialize(); TODO confirm.
37  static spear::StringMap<bool> multiWords;
38 
     // Const member: takes a vector of String tokens plus `start` and `end`
     // positions and returns a String by value.
     // NOTE(review): presumably joins tokens[start..end]; whether `end` is
     // inclusive and what separator is used are not visible here. TODO
     // confirm.
39  String concatenate(const std::vector<String> & tokens,
40  unsigned int start,
41  unsigned int end) const;
42 };
43 
45 
46 }
47 
48 #endif
static String normalize(const String &s)
Definition: Tokenizer.cc:51
Reference counting pointer class. This file contains the code for the classes and class templates maki...
Definition: RCIPtr.h:67
RCIPtr< Tokenizer > TokenizerPtr
Definition: Tokenizer.h:44
Definition: Tokenizer.h:21
#define String
Definition: Wide.h:36
Definition: RCIPtr.h:24
void split(const String &buffer, std::vector< spear::Word > &tokens)
Definition: Tokenizer.cc:84
Tokenizer(const std::string &dataPath)
Definition: Tokenizer.cc:44