Orcus
sax_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_SAX_PARSER_HPP
9 #define ORCUS_SAX_PARSER_HPP
10 
11 #include "sax_parser_base.hpp"
12 
13 namespace orcus {
14 
/**
 * Default compile-time configuration for sax_parser.  A custom
 * configuration type can be passed as the second template argument of
 * sax_parser as long as it provides the same static members.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version encoded as an integer: 10 stands for XML 1.0 and
     * 11 for XML 1.1.  The parser insists on a leading <?xml ... ?>
     * declaration only when this value is 11 or greater.
     */
    static const uint8_t baseline_version = 10;
};
24 
29 template<typename _Handler, typename _Config = sax_parser_default_config>
31 {
32 public:
33  typedef _Handler handler_type;
34  typedef _Config config_type;
35 
36  sax_parser(const char* content, const size_t size, handler_type& handler);
37  ~sax_parser();
38 
39  void parse();
40 
41 private:
42 
47  void header();
48  void body();
49  void element();
50  void element_open(const char* begin_pos);
51  void element_close(const char* begin_pos);
52  void special_tag();
53  void declaration(const char* name_check);
54  void cdata();
55  void doctype();
56  void characters();
57  void attribute();
58 
59 private:
60  handler_type& m_handler;
61 };
62 
63 template<typename _Handler, typename _Config>
65  const char* content, const size_t size, handler_type& handler) :
66  sax::parser_base(content, size),
67  m_handler(handler)
68 {
69 }
70 
71 template<typename _Handler, typename _Config>
73 {
74 }
75 
76 template<typename _Handler, typename _Config>
78 {
79  m_nest_level = 0;
80  mp_char = mp_begin;
81  header();
82  blank();
83  body();
84 
85  assert(m_buffer_pos == 0);
86 }
87 
88 template<typename _Handler, typename _Config>
90 {
91  // we don't handle multi byte encodings so we can just skip bom entry if exists.
92  skip_bom();
93  blank();
94  if (!has_char() || cur_char() != '<')
95  throw sax::malformed_xml_error("xml file must begin with '<'.", offset());
96 
97  if (config_type::baseline_version >= 11)
98  {
99  // XML version 1.1 requires a header declaration whereas in 1.0 it's
100  // optional.
101  if (next_char_checked() != '?')
102  throw sax::malformed_xml_error("xml file must begin with '<?'.", offset());
103 
104  declaration("xml");
105  }
106 }
107 
108 template<typename _Handler, typename _Config>
110 {
111  while (has_char())
112  {
113  if (cur_char() == '<')
114  {
115  element();
116  if (!m_root_elem_open)
117  // Root element closed. Stop parsing.
118  return;
119  }
120  else if (m_nest_level)
121  // Call characters only when in xml hierarchy.
122  characters();
123  else
124  next();
125  }
126 }
127 
128 template<typename _Handler, typename _Config>
130 {
131  assert(cur_char() == '<');
132  const char* pos = mp_char;
133  char c = next_char_checked();
134  switch (c)
135  {
136  case '/':
137  element_close(pos);
138  break;
139  case '!':
140  special_tag();
141  break;
142  case '?':
143  declaration(nullptr);
144  break;
145  default:
146  if (!is_alpha(c) && c != '_')
147  throw sax::malformed_xml_error("expected an alphabet.", offset());
148  element_open(pos);
149  }
150 }
151 
152 template<typename _Handler, typename _Config>
153 void sax_parser<_Handler,_Config>::element_open(const char* begin_pos)
154 {
155  assert(is_alpha(cur_char()) || cur_char() == '_');
156 
157  sax::parser_element elem;
158  element_name(elem, begin_pos);
159 
160  while (true)
161  {
162  blank();
163  char c = cur_char();
164  if (c == '/')
165  {
166  // Self-closing element: <element/>
167  if (next_and_char() != '>')
168  throw sax::malformed_xml_error("expected '/>' to self-close the element.", offset());
169  next();
170  elem.end_pos = mp_char;
171  m_handler.start_element(elem);
172  reset_buffer_pos();
173  m_handler.end_element(elem);
174 #if ORCUS_DEBUG_SAX_PARSER
175  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
176 #endif
177  return;
178  }
179  else if (c == '>')
180  {
181  // End of opening element: <element>
182  next();
183  elem.end_pos = mp_char;
184  nest_up();
185  m_handler.start_element(elem);
186  reset_buffer_pos();
187 #if ORCUS_DEBUG_SAX_PARSER
188  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
189 #endif
190  return;
191  }
192  else
193  attribute();
194  }
195 }
196 
197 template<typename _Handler, typename _Config>
198 void sax_parser<_Handler,_Config>::element_close(const char* begin_pos)
199 {
200  assert(cur_char() == '/');
201  nest_down();
202  next_check();
203  sax::parser_element elem;
204  element_name(elem, begin_pos);
205 
206  if (cur_char() != '>')
207  throw sax::malformed_xml_error("expected '>' to close the element.", offset());
208  next();
209  elem.end_pos = mp_char;
210 
211  m_handler.end_element(elem);
212 #if ORCUS_DEBUG_SAX_PARSER
213  cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
214 #endif
215  if (!m_nest_level)
216  m_root_elem_open = false;
217 }
218 
219 template<typename _Handler, typename _Config>
221 {
222  assert(cur_char() == '!');
223  // This can be either <![CDATA, <!--, or <!DOCTYPE.
224  size_t len = remains();
225  if (len < 2)
226  throw sax::malformed_xml_error("special tag too short.", offset());
227 
228  switch (next_and_char())
229  {
230  case '-':
231  {
232  // Possibly comment.
233  if (next_and_char() != '-')
234  throw sax::malformed_xml_error("comment expected.", offset());
235 
236  len -= 2;
237  if (len < 3)
238  throw sax::malformed_xml_error("malformed comment.", offset());
239 
240  next();
241  comment();
242  }
243  break;
244  case '[':
245  {
246  // Possibly a CDATA.
247  expects_next("CDATA[", 6);
248  if (has_char())
249  cdata();
250  }
251  break;
252  case 'D':
253  {
254  // check if this is a DOCTYPE.
255  expects_next("OCTYPE", 6);
256  blank();
257  if (has_char())
258  doctype();
259  }
260  break;
261  default:
262  throw sax::malformed_xml_error("failed to parse special tag.", offset());
263  }
264 }
265 
266 template<typename _Handler, typename _Config>
267 void sax_parser<_Handler,_Config>::declaration(const char* name_check)
268 {
269  assert(cur_char() == '?');
270  next_check();
271 
272  // Get the declaration name first.
273  pstring decl_name;
274  name(decl_name);
275 #if ORCUS_DEBUG_SAX_PARSER
276  cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
277 #endif
278 
279  if (name_check && decl_name != name_check)
280  {
281  std::ostringstream os;
282  os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
283  throw sax::malformed_xml_error(os.str(), offset());
284  }
285 
286  m_handler.start_declaration(decl_name);
287  blank();
288 
289  // Parse the attributes.
290  while (cur_char_checked() != '?')
291  {
292  attribute();
293  blank();
294  }
295  if (next_char_checked() != '>')
296  throw sax::malformed_xml_error("declaration must end with '?>'.", offset());
297 
298  m_handler.end_declaration(decl_name);
299  reset_buffer_pos();
300  next();
301 #if ORCUS_DEBUG_SAX_PARSER
302  cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
303 #endif
304 }
305 
306 template<typename _Handler, typename _Config>
308 {
309  size_t len = remains();
310  assert(len > 3);
311 
312  // Parse until we reach ']]>'.
313  const char* p0 = mp_char;
314  size_t i = 0, match = 0;
315  for (char c = cur_char(); i < len; ++i, c = next_and_char())
316  {
317  if (c == ']')
318  {
319  // Be aware that we may encounter a series of more than two ']'
320  // characters, in which case we'll only count the last two.
321 
322  if (match == 0)
323  // First ']'
324  ++match;
325  else if (match == 1)
326  // Second ']'
327  ++match;
328  }
329  else if (c == '>' && match == 2)
330  {
331  // Found ']]>'.
332  size_t cdata_len = i - 2;
333  m_handler.characters(pstring(p0, cdata_len), false);
334  next();
335  return;
336  }
337  else
338  match = 0;
339  }
340  throw sax::malformed_xml_error("malformed CDATA section.", offset());
341 }
342 
343 template<typename _Handler, typename _Config>
345 {
346  // Parse the root element first.
348  name(param.root_element);
349  blank();
350 
351  // Either PUBLIC or SYSTEM.
352  size_t len = remains();
353  if (len < 6)
354  throw sax::malformed_xml_error("DOCTYPE section too short.", offset());
355 
356  param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
357  char c = cur_char();
358  if (c == 'P')
359  {
360  if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
361  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
362 
363  param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
364  }
365  else if (c == 'S')
366  {
367  if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
368  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
369  }
370 
371  next_check();
372  blank();
373  has_char_throw("DOCTYPE section too short.");
374 
375  // Parse FPI.
376  value(param.fpi, false);
377 
378  has_char_throw("DOCTYPE section too short.");
379  blank();
380  has_char_throw("DOCTYPE section too short.");
381 
382  if (cur_char() == '>')
383  {
384  // Optional URI not given. Exit.
385 #if ORCUS_DEBUG_SAX_PARSER
386  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
387 #endif
388  m_handler.doctype(param);
389  next();
390  return;
391  }
392 
393  // Parse optional URI.
394  value(param.uri, false);
395 
396  has_char_throw("DOCTYPE section too short.");
397  blank();
398  has_char_throw("DOCTYPE section too short.");
399 
400  if (cur_char() != '>')
401  throw sax::malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
402 
403 #if ORCUS_DEBUG_SAX_PARSER
404  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
405 #endif
406  m_handler.doctype(param);
407  next();
408 }
409 
410 template<typename _Handler, typename _Config>
412 {
413  const char* p0 = mp_char;
414  for (; has_char(); next())
415  {
416  if (cur_char() == '<')
417  break;
418 
419  if (cur_char() == '&')
420  {
421  // Text span with one or more encoded characters. Parse using cell buffer.
422  cell_buffer& buf = get_cell_buffer();
423  buf.reset();
424  buf.append(p0, mp_char-p0);
425  characters_with_encoded_char(buf);
426  if (buf.empty())
427  m_handler.characters(pstring(), false);
428  else
429  m_handler.characters(pstring(buf.get(), buf.size()), true);
430  return;
431  }
432  }
433 
434  if (mp_char > p0)
435  {
436  pstring val(p0, mp_char-p0);
437  m_handler.characters(val, false);
438  }
439 }
440 
441 template<typename _Handler, typename _Config>
443 {
445  pstring attr_ns_name, attr_name, attr_value;
446  attribute_name(attr.ns, attr.name);
447 
448 #if ORCUS_DEBUG_SAX_PARSER
449  std::ostringstream os;
450  os << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'";
451 #endif
452 
453  char c = cur_char();
454  if (c != '=')
455  {
456  std::ostringstream os;
457  os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
458  throw sax::malformed_xml_error(os.str(), offset());
459  }
460 
461  next_check();
462  attr.transient = value(attr.value, true);
463  if (attr.transient)
464  // Value is stored in a temporary buffer. Push a new buffer.
465  inc_buffer_pos();
466 
467 #if ORCUS_DEBUG_SAX_PARSER
468  os << " value='" << attr.value << "'" << endl;
469  cout << os.str();
470 #endif
471 
472  m_handler.attribute(attr);
473 }
474 
475 }
476 
477 #endif
478 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: pstring.hpp:24
Definition: cell_buffer.hpp:21
Definition: sax_parser_base.hpp:33
Definition: sax_parser.hpp:15
static const uint8_t baseline_version
Definition: sax_parser.hpp:22
Definition: sax_parser_base.hpp:100
Definition: sax_parser_base.hpp:85
Definition: sax_parser_base.hpp:45
Definition: base64.hpp:15
Definition: sax_parser.hpp:30
Definition: sax_parser_base.hpp:108