KJB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Semspear_tree_parser.h
Go to the documentation of this file.
1 #ifndef SEMSPEAR_TREE_PARSER_H_
2 #define SEMSPEAR_TREE_PARSER_H_
3 
11 #include "semantics/Lexicon_db.h"
13 #include "spear/Pair.h"
14 #include "spear/EdgeLexer.h"
15 #include "spear/Lexicon.h"
16 #include "spear/CharUtils.h"
17 #include "spear/Exception.h"
18 #include <boost/shared_ptr.hpp>
19 #include <boost/tuple/tuple.hpp>
20 #include <iostream>
21 #include <list>
22 #include <string>
23 
24 
25 namespace semantics {
26 
// Semspear_tree_parser<T>: reads parenthesised constituent trees from a token
// stream and builds them as objects of type T.
//
// T is used by the member definitions in this file through: T::lexicon(),
// T::nt_lexicon(), T::lf_word_map(), T::TOP, T::DEPENDENCY, T::Child_list,
// a constructor taking (node kind, bool learn), and the members set_label(),
// set_word(), add_child(), children(), set_head(), preprocess_tree(),
// complete_tree() and print_dependency_tree().
//
// NOTE(review): the class-head line and the constructor's name line are not
// present in this listing; only the surrounding lines are shown.
35  template <class T>
37  {
38  public:
     // Stream type used for both the tree input and the lexicon input.
39  typedef std::istream Input_stream;
     // Text of a single token.
40  typedef std::string Token_contents;
     // Token-type code; compared against the spear::EdgeLexer::TOKEN_* constants.
41  typedef int Token_code;
     // (text, type code) pair describing one token.
42  typedef boost::tuple<Token_contents, Token_code> Token_tuple;
     // Holds pushed-back tokens; used last-in/first-out via push_back/pop_back.
43  typedef std::list<Token_tuple> Tuple_stack;
     // Shared-ownership handle to a parsed constituent.
44  typedef boost::shared_ptr<T> Constituent_ptr;
45  public:
     // Both accessors simply forward to the corresponding static member of T.
46  static Lexicon_db& lexicon() {return T::lexicon();}
47  static Nonterminal_db& nt_lexicon() {return T::nt_lexicon();}
     // When true, diagnostic messages are written to std::cerr.
48  static bool VERBOSE;
49  public:
     // Constructor parameters: two input streams and a learning flag that
     // defaults to true.
     // NOTE(review): presumably tree_stream feeds the lexer and lexicon_stream
     // feeds initialize_lexicon() -- confirm against the definition.
56  Input_stream& tree_stream,
57  Input_stream& lexicon_stream,
58  bool learn = true);
59 
     // Parses the next constituent. at_root selects the node kind
     // (T::TOP vs T::DEPENDENCY) and enables the whole-tree post-processing.
63  Constituent_ptr parse_constituent(bool at_root = true);
64 
65  private:
     // Underlying tokenizer.
66  spear::EdgeLexer lexer_;
     // Reads lexicon entries line by line from `input`.
70  void initialize_lexicon(Input_stream& input);
71 
     // Returns the next token: a pushed-back one if any, else one from lexer_.
75  Token_tuple lexem();
76 
     // Pushes a token back so that the next lexem() call returns it.
79  void unget(Token_contents& text, Token_code lex);
80 
     // Pushed-back tokens awaiting re-delivery by lexem().
81  Tuple_stack buffer_;
     // Learning flag handed to encode() and to the T constructor.
82  bool learn_;
84  };
85 
87 
// Tail of the constructor definition.
// NOTE(review): the first lines of the signature (including the declarations
// of syn_stream and lex_stream) are not present in this listing.
91  bool learn
     // The lexer is constructed from syn_stream; the learning flag is stored.
92  ) : lexer_(syn_stream), learn_(learn)
93  {
     // Load the lexicon entries from lex_stream at construction time.
94  initialize_lexicon(lex_stream);
     // NOTE(review): the ';' after the closing brace is a redundant empty
     // declaration.
95  };
96 
// parse_constituent: parse one parenthesised constituent from the token
// stream and return it.
//
// Accepted shapes, as implemented below:
//   ( label ( child ) ( child ) ... [head_index] )   -- non-terminal
//   ( label word )                                    -- terminal
//
// at_root: when true the node is created with kind T::TOP and, once the
//          closing parenthesis has been read, preprocess_tree() and
//          complete_tree() are called on it; when false the node kind is
//          T::DEPENDENCY (this is what the recursive calls pass).
// Returns: the new constituent, or an empty Constituent_ptr if the first
//          token read is TOKEN_EOF.
// Throws:  spear::Exception when the closing parenthesis is missing.
//          NOTE(review): the other two throw sites are missing a line in this
//          listing; presumably they also construct spear::Exception -- confirm.
//
// NOTE(review): the signature line of this definition is not present in this
// listing.
97  template<class T>
100  {
101  Token_contents contents;
102  Token_code code;
103 
104  // lexem() returns the token text together with a code
105  // for the token type (paren, EOF or content string);
106  // boost::tie unpacks the pair into 'contents' and 'code'.
107  // lexem() is the buffered wrapper around lexer_ (see lexem() in this file)
108  boost::tie(contents, code) = lexem();
109 
110  // end of file reached
111  if(code == spear::EdgeLexer::TOKEN_EOF)
112  {
113  if(VERBOSE)
114  {
115  std::cerr << "parse_constituent() encountered EOF"
116  << std::endl;
117  }
118  return Constituent_ptr();
119  }
120 
121  // should start with left paren
122  if(code != spear::EdgeLexer::TOKEN_LP)
123  {
124  throw(
126  "Syntax error: Left parenthesis expected",
127  lexer_.getLineCount()
128  )
129  );
130  }
131 
132  // if so, create the node (TOP at the root, DEPENDENCY otherwise)
133  Constituent_ptr result(new T(at_root ? T::TOP : T::DEPENDENCY, learn_));
134 
135  // read in the next token
136  boost::tie(contents,code) = lexem();
137 
138  // Phrase label
139  // (have already found left paren, so next
140  // string is a phrase label)
     // NOTE(review): 'code' is not checked here, so whatever text the
     // token carries is encoded as the label.
141 
142  // 'contents' contains the phrase label,
143  // so attach this label to the node
144  result -> set_label(nt_lexicon().encode(contents, learn_));
145  // read in the next token
146  boost::tie(contents, code) = lexem();
147 
148  // This is a non-terminal phrase
149  // (if not, should encounter another string,
150  // not a left paren)
151  if(code == spear::EdgeLexer::TOKEN_LP)
152  {
153  // The head position might be specified immediately
154  // after the children
155  int head_position = -1;
156 
157  // Parse all children
158  while(code == spear::EdgeLexer::TOKEN_LP)
159  {
160  // push the left paren back so the recursive call reads it
161  unget(contents, code);
162  // recursively parse the constituent below
     // NOTE(review): the recursive call returns an empty pointer on
     // EOF; it is passed to add_child() unchecked.
163  Constituent_ptr child = parse_constituent(false);
164  // once parsed, add the subtree as a child
165  // of this node
166  result -> add_child(child);
167  // get the next token
168  boost::tie(contents, code) = lexem();
169  }
170 
171  // This token might be the head position: it is consumed only if it
     // is a string whose integer value is a valid child index.
172  if(code == spear::EdgeLexer::TOKEN_STRING &&
173  (head_position = spear::toInteger(contents)) >= 0 &&
174  head_position < (int) result -> children().size() )
175  {
176  // Set the head: walk the child list, counting head_position
     // down to zero to reach the requested child
177  for(typename T::Child_list::const_iterator it =
178  result -> children().begin();
179  it != result -> children().end();
180  it++, head_position--)
181  {
182  // Found the head child
183  if(head_position == 0)
184  {
185  result -> set_head(it);
186  break;
187  }
188  }
     // otherwise the token is pushed back for the right-paren check below
189  } else unget(contents, code);
190 
191  // Phrase word for terminal phrases
192  } else if(code == spear::EdgeLexer::TOKEN_STRING) {
193  result -> set_word(lexicon().encode(contents, learn_));
194  } else {
195  throw(
197  "Syntax error: Left parenthesis or string expected",
198  lexer_.getLineCount()));
199  }
200 
     // every constituent must be closed by a right paren
     // NOTE(review): the opening brace below ends with a stray
     // line-continuation character; it looks unintended.
201  boost::tie(contents, code) = lexem();
202  if(code != spear::EdgeLexer::TOKEN_RP)
203  { \
204  throw spear::Exception(
205  "Syntax error: Right parenthesis expected",
206  lexer_.getLineCount()
207  );
208  }
209 
     // whole-tree post-processing happens once, at the outermost call
210  if(at_root)
211  {
212  if(VERBOSE) std::cerr << "Setting events." << std::endl;
213  result -> preprocess_tree();
214  result -> complete_tree();
215  if(VERBOSE)
216  {
217  std::cerr << "Resulting tree is:" << std::endl;
218  result -> print_dependency_tree(std::cerr);
219  // std::cerr << "Done setting events. Remaining input is ";
220  // lexer_.print_stream(std::cerr);
221  }
222  }
223 
224  return result;
225  }
226 
// lexem: return the next token as a (text, type code) tuple.
// If tokens have been pushed back with unget(), the most recently pushed one
// is returned and removed from buffer_; otherwise a fresh token is read from
// lexer_.
//
// NOTE(review): the signature line of this definition is not present in this
// listing; the class declares it as `Token_tuple lexem();`.
227  template<class T>
230  {
231  if(buffer_.empty() == true)
232  {
233  // call EdgeLexer lexem method
234  // once the buffer is empty
235  // NOTE: contents gets written to by lexer_.lexem
236  Token_contents contents;
237  Token_code code = lexer_.lexem(contents);
     // boost::tie yields a tuple of references to the two locals; it is
     // converted to the by-value Token_tuple that is returned.
238  return boost::tie(contents, code);
239  }
240 
241  // buffer stack contains tuples whose first
242  // element is the token text, and whose second is a
243  // code denoting the token type
244 
245  // last is at the top of the buffer stack
246  Token_tuple last = buffer_.back();
247  buffer_.pop_back();
248 
249  return last;
250  }
251 
252  template<class T>
253  void Semspear_tree_parser<T>::unget(
254  Semspear_tree_parser<T>::Token_contents& contents,
255  Semspear_tree_parser<T>::Token_code code
256  )
257  {
258  buffer_.push_back(Token_tuple(contents, code));
259  }
260 
261  template<class T>
262  void Semspear_tree_parser<T>::initialize_lexicon(Input_stream& input)
263  {
264  std::string word, tag;
265  bool is_hi_freq;
266  std::string line_string;
267 
268  while(std::getline(input, line_string))
269  {
270  std::istringstream line(line_string);
271  line >> word;
272  line >> tag;
273  line >> is_hi_freq;
274  Token_map::Val_type wcode = lexicon().encode(word, true);
275  T::lf_word_map()[wcode] = is_hi_freq;
276  }
277 
278  }
279 
280 
282 
// Definition of the static VERBOSE flag; initialised to false, so the
// std::cerr diagnostics guarded by it are off unless a caller sets it.
283  template<class T>
284  bool Semspear_tree_parser<T>::VERBOSE = false;
285 
286 } // end namespace semantics
287 
288 #endif
boost::tuple< Token_contents, Token_code > Token_tuple
Definition: Semspear_tree_parser.h:42
Constituent_ptr parse_constituent(bool at_root=true)
parse the next edge in the stream
Definition: Semspear_tree_parser.h:99
static const int TOKEN_STRING
Definition: EdgeLexer.h:24
int toInteger(const String &s)
Definition: CharUtils.cc:74
static Nonterminal_db & nt_lexicon()
Definition: Semspear_tree_parser.h:47
std::list< Token_tuple > Tuple_stack
Definition: Semspear_tree_parser.h:43
std::string Token_contents
Definition: Semspear_tree_parser.h:40
static const int TOKEN_RP
Definition: EdgeLexer.h:26
static bool VERBOSE
Static initialization.
Definition: Semspear_tree_parser.h:48
#define encode(triedge)
Definition: triangle.c:819
size_t Val_type
Definition: Token_map.h:24
Definition: Exception.h:15
Definition: EdgeLexer.h:19
Definition: Lexicon_db.h:17
boost::shared_ptr< T > Constituent_ptr
Definition: Semspear_tree_parser.h:44
int getline(FILE *fp, std::string *line, char EOL= '\n')
Like C's fgets but with std::string, or C++'s getline but with FILE*.
std::istream Input_stream
Definition: Semspear_tree_parser.h:39
static const int TOKEN_EOF
Definition: EdgeLexer.h:23
Definition: Nonterminal_db.h:16
static const int TOKEN_LP
Definition: EdgeLexer.h:25
Definition: Semspear_tree_parser.h:36
static Lexicon_db & lexicon()
Definition: Semspear_tree_parser.h:46
int Token_code
Definition: Semspear_tree_parser.h:41
Semspear_tree_parser(Input_stream &tree_stream, Input_stream &lexicon_stream, bool learn=true)
construct a parser object with text input 'istream'
Definition: Semspear_tree_parser.h:88