Orcus
Loading...
Searching...
No Matches
sax_parser.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8#ifndef INCLUDED_ORCUS_SAX_PARSER_HPP
9#define INCLUDED_ORCUS_SAX_PARSER_HPP
10
11#include "sax_parser_base.hpp"
12
13#include <string_view>
14#include <cstdint>
15
16namespace orcus {
17
/**
 * Default configuration type for sax_parser.  It is used whenever no custom
 * configuration type is given as the parser's second template argument.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version the parser assumes, written without the dot
     * (10 for XML 1.0, 11 for XML 1.1).  sax_parser::header() insists on a
     * leading '<?xml' declaration only when this value is 11 or higher.
     */
    static constexpr uint8_t baseline_version = 10;
};
27
29{
30public:
37 {
38 (void)dtd;
39 }
40
48 void start_declaration(std::string_view decl)
49 {
50 (void)decl;
51 }
52
58 void end_declaration(std::string_view decl)
59 {
60 (void)decl;
61 }
62
69 {
70 (void)elem;
71 }
72
79 {
80 (void)elem;
81 }
82
97 void characters(std::string_view val, bool transient)
98 {
99 (void)val; (void)transient;
100 }
101
111 {
112 (void)attr;
113 }
114};
115
131template<typename HandlerT, typename ConfigT = sax_parser_default_config>
132class sax_parser : public sax::parser_base
133{
134public:
135 typedef HandlerT handler_type;
136 typedef ConfigT config_type;
137
138 sax_parser(std::string_view content, handler_type& handler);
139 ~sax_parser() = default;
140
141 void parse();
142
143private:
144
149 void header();
150 void body();
151 void element();
152 void element_open(std::ptrdiff_t begin_pos);
153 void element_close(std::ptrdiff_t begin_pos);
154 void special_tag();
155 void declaration(const char* name_check);
156 void cdata();
157 void doctype();
158 void characters();
159 void attribute();
160
161private:
162 handler_type& m_handler;
163};
164
165template<typename HandlerT, typename ConfigT>
166sax_parser<HandlerT,ConfigT>::sax_parser(std::string_view content, handler_type& handler) :
167 sax::parser_base(content.data(), content.size()),
168 m_handler(handler)
169{
170}
171
172template<typename HandlerT, typename ConfigT>
173void sax_parser<HandlerT,ConfigT>::parse()
174{
175 m_nest_level = 0;
176 mp_char = mp_begin;
177 header();
178 skip_space_and_control();
179 body();
180
181 assert(m_buffer_pos == 0);
182}
183
184template<typename HandlerT, typename ConfigT>
185void sax_parser<HandlerT,ConfigT>::header()
186{
187 // we don't handle multi byte encodings so we can just skip bom entry if exists.
188 skip_bom();
189
190 // Allow leading whitespace in the XML stream.
191 // TODO : Make this configurable since strictly speaking such an XML
192 // sttream is invalid.
193 skip_space_and_control();
194
195 if (!has_char() || cur_char() != '<')
196 throw malformed_xml_error("xml file must begin with '<'.", offset());
197
198 if (config_type::baseline_version >= 11)
199 {
200 // XML version 1.1 requires a header declaration whereas in 1.0 it's
201 // optional.
202 if (next_char_checked() != '?')
203 throw malformed_xml_error("xml file must begin with '<?'.", offset());
204
205 declaration("xml");
206 }
207}
208
209template<typename HandlerT, typename ConfigT>
210void sax_parser<HandlerT,ConfigT>::body()
211{
212 while (has_char())
213 {
214 if (cur_char() == '<')
215 {
216 element();
217 if (!m_root_elem_open)
218 // Root element closed. Stop parsing.
219 return;
220 }
221 else if (m_nest_level)
222 // Call characters only when in xml hierarchy.
223 characters();
224 else
225 next();
226 }
227}
228
229template<typename HandlerT, typename ConfigT>
230void sax_parser<HandlerT,ConfigT>::element()
231{
232 assert(cur_char() == '<');
233 std::ptrdiff_t pos = offset();
234 char c = next_char_checked();
235 switch (c)
236 {
237 case '/':
238 element_close(pos);
239 return;
240 case '!':
241 special_tag();
242 return;
243 case '?':
244 declaration(nullptr);
245 return;
246 }
247
248 element_open(pos);
249}
250
251template<typename HandlerT, typename ConfigT>
252void sax_parser<HandlerT,ConfigT>::element_open(std::ptrdiff_t begin_pos)
253{
255 element_name(elem, begin_pos);
256
257 while (true)
258 {
259 skip_space_and_control();
260 char c = cur_char_checked();
261 if (c == '/')
262 {
263 // Self-closing element: <element/>
264 if (next_and_char() != '>')
265 throw malformed_xml_error("expected '/>' to self-close the element.", offset());
266 next();
267 elem.end_pos = offset();
268 m_handler.start_element(elem);
269 reset_buffer_pos();
270 m_handler.end_element(elem);
271 if (!m_nest_level)
272 m_root_elem_open = false;
273#if ORCUS_DEBUG_SAX_PARSER
274 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
275#endif
276 return;
277 }
278 else if (c == '>')
279 {
280 // End of opening element: <element>
281 next();
282 elem.end_pos = offset();
283 nest_up();
284 m_handler.start_element(elem);
285 reset_buffer_pos();
286#if ORCUS_DEBUG_SAX_PARSER
287 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
288#endif
289 return;
290 }
291 else
292 attribute();
293 }
294}
295
296template<typename HandlerT, typename ConfigT>
297void sax_parser<HandlerT,ConfigT>::element_close(std::ptrdiff_t begin_pos)
298{
299 assert(cur_char() == '/');
300 nest_down();
301 next_check();
303 element_name(elem, begin_pos);
304
305 if (cur_char() != '>')
306 throw malformed_xml_error("expected '>' to close the element.", offset());
307 next();
308 elem.end_pos = offset();
309
310 m_handler.end_element(elem);
311#if ORCUS_DEBUG_SAX_PARSER
312 cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
313#endif
314 if (!m_nest_level)
315 m_root_elem_open = false;
316}
317
318template<typename HandlerT, typename ConfigT>
319void sax_parser<HandlerT,ConfigT>::special_tag()
320{
321 assert(cur_char() == '!');
322 // This can be either <![CDATA, <!--, or <!DOCTYPE.
323 size_t len = available_size();
324 if (len < 2)
325 throw malformed_xml_error("special tag too short.", offset());
326
327 switch (next_and_char())
328 {
329 case '-':
330 {
331 // Possibly comment.
332 if (next_and_char() != '-')
333 throw malformed_xml_error("comment expected.", offset());
334
335 len -= 2;
336 if (len < 3)
337 throw malformed_xml_error("malformed comment.", offset());
338
339 next();
340 comment();
341 }
342 break;
343 case '[':
344 {
345 // Possibly a CDATA.
346 expects_next("CDATA[", 6);
347 if (has_char())
348 cdata();
349 }
350 break;
351 case 'D':
352 {
353 // check if this is a DOCTYPE.
354 expects_next("OCTYPE", 6);
355 skip_space_and_control();
356 if (has_char())
357 doctype();
358 }
359 break;
360 default:
361 throw malformed_xml_error("failed to parse special tag.", offset());
362 }
363}
364
365template<typename HandlerT, typename ConfigT>
366void sax_parser<HandlerT,ConfigT>::declaration(const char* name_check)
367{
368 assert(cur_char() == '?');
369 next_check();
370
371 // Get the declaration name first.
372 std::string_view decl_name;
373 name(decl_name);
374#if ORCUS_DEBUG_SAX_PARSER
375 cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
376#endif
377
378 if (name_check && decl_name != name_check)
379 {
380 std::ostringstream os;
381 os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
382 throw malformed_xml_error(os.str(), offset());
383 }
384
385 m_handler.start_declaration(decl_name);
386 skip_space_and_control();
387
388 // Parse the attributes.
389 while (cur_char_checked() != '?')
390 {
391 attribute();
392 skip_space_and_control();
393 }
394 if (next_char_checked() != '>')
395 throw malformed_xml_error("declaration must end with '?>'.", offset());
396
397 m_handler.end_declaration(decl_name);
398 reset_buffer_pos();
399 next();
400#if ORCUS_DEBUG_SAX_PARSER
401 cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
402#endif
403}
404
405template<typename HandlerT, typename ConfigT>
406void sax_parser<HandlerT,ConfigT>::cdata()
407{
408 size_t len = available_size();
409 assert(len > 3);
410
411 // Parse until we reach ']]>'.
412 const char* p0 = mp_char;
413 size_t i = 0, match = 0;
414 for (char c = cur_char(); i < len; ++i, c = next_and_char())
415 {
416 if (c == ']')
417 {
418 // Be aware that we may encounter a series of more than two ']'
419 // characters, in which case we'll only count the last two.
420
421 if (match == 0)
422 // First ']'
423 ++match;
424 else if (match == 1)
425 // Second ']'
426 ++match;
427 }
428 else if (c == '>' && match == 2)
429 {
430 // Found ']]>'.
431 size_t cdata_len = i - 2;
432 m_handler.characters(std::string_view(p0, cdata_len), false);
433 next();
434 return;
435 }
436 else
437 match = 0;
438 }
439 throw malformed_xml_error("malformed CDATA section.", offset());
440}
441
442template<typename HandlerT, typename ConfigT>
443void sax_parser<HandlerT,ConfigT>::doctype()
444{
445 // Parse the root element first.
447 name(param.root_element);
448 skip_space_and_control();
449
450 // Either PUBLIC or SYSTEM.
451 size_t len = available_size();
452 if (len < 6)
453 throw malformed_xml_error("DOCTYPE section too short.", offset());
454
455 param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
456 char c = cur_char();
457 if (c == 'P')
458 {
459 if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
460 throw malformed_xml_error("malformed DOCTYPE section.", offset());
461
462 param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
463 }
464 else if (c == 'S')
465 {
466 if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
467 throw malformed_xml_error("malformed DOCTYPE section.", offset());
468 }
469
470 next_check();
471 skip_space_and_control();
472
473 // Parse FPI.
474 value(param.fpi, false);
475
476 has_char_throw("DOCTYPE section too short.");
477 skip_space_and_control();
478 has_char_throw("DOCTYPE section too short.");
479
480 if (cur_char() == '>')
481 {
482 // Optional URI not given. Exit.
483#if ORCUS_DEBUG_SAX_PARSER
484 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
485#endif
486 m_handler.doctype(param);
487 next();
488 return;
489 }
490
491 // Parse optional URI.
492 value(param.uri, false);
493
494 has_char_throw("DOCTYPE section too short.");
495 skip_space_and_control();
496 has_char_throw("DOCTYPE section too short.");
497
498 if (cur_char() != '>')
499 throw malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
500
501#if ORCUS_DEBUG_SAX_PARSER
502 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
503#endif
504 m_handler.doctype(param);
505 next();
506}
507
508template<typename HandlerT, typename ConfigT>
509void sax_parser<HandlerT,ConfigT>::characters()
510{
511 const char* p0 = mp_char;
512 for (; has_char(); next())
513 {
514 if (cur_char() == '<')
515 break;
516
517 if (cur_char() == '&')
518 {
519 // Text span with one or more encoded characters. Parse using cell buffer.
520 cell_buffer& buf = get_cell_buffer();
521 buf.reset();
522 buf.append(p0, mp_char-p0);
523 characters_with_encoded_char(buf);
524 if (buf.empty())
525 m_handler.characters(std::string_view{}, false);
526 else
527 m_handler.characters(buf.str(), true);
528 return;
529 }
530 }
531
532 if (mp_char > p0)
533 {
534 std::string_view val(p0, mp_char-p0);
535 m_handler.characters(val, false);
536 }
537}
538
539template<typename HandlerT, typename ConfigT>
540void sax_parser<HandlerT,ConfigT>::attribute()
541{
543 attribute_name(attr.ns, attr.name);
544
545#if ORCUS_DEBUG_SAX_PARSER
546 cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
547#endif
548
549 skip_space_and_control();
550
551 char c = cur_char_checked();
552 if (c != '=')
553 {
554 std::ostringstream os;
555 os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
556 throw malformed_xml_error(os.str(), offset());
557 }
558
559 next_check(); // skip the '='.
560 skip_space_and_control();
561
562 attr.transient = value(attr.value, true);
563 if (attr.transient)
564 // Value is stored in a temporary buffer. Push a new buffer.
565 inc_buffer_pos();
566
567#if ORCUS_DEBUG_SAX_PARSER
568 cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
569#endif
570
571 m_handler.attribute(attr);
572}
573
574}
575
576#endif
577/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition cell_buffer.hpp:22
Definition exception.hpp:121
Definition parser_base.hpp:23
Definition sax_parser_base.hpp:108
Definition sax_parser.hpp:29
void end_declaration(std::string_view decl)
Definition sax_parser.hpp:58
void doctype(const orcus::sax::doctype_declaration &dtd)
Definition sax_parser.hpp:36
void attribute(const orcus::sax::parser_attribute &attr)
Definition sax_parser.hpp:110
void characters(std::string_view val, bool transient)
Definition sax_parser.hpp:97
void start_declaration(std::string_view decl)
Definition sax_parser.hpp:48
void end_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:78
void start_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:68
Definition sax_parser_base.hpp:37
Definition sax_parser_base.hpp:96
Definition sax_parser_base.hpp:77
Definition sax_parser.hpp:19
static constexpr uint8_t baseline_version
Definition sax_parser.hpp:25