Teuchos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
Teuchos_XMLParser.cpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Teuchos: Common Tools Package
5// Copyright (2004) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ***********************************************************************
40// @HEADER
41
42// BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82
43// when printing attribute values, one must check if the value contains quote
44// or apost;
45// a quot'd attval cannot contain literal quot
46// a apos'd attval cannot contain literal apos
47// either they have to be matched appropriately or (easier) all quot and apos must
48// be replaced by &quot; and &apos;
49
50#include "Teuchos_XMLParser.hpp"
52#include "Teuchos_Assert.hpp"
53#include <stack>
54
55using namespace Teuchos;
56
57// this parser currently does not support:
58// * processing instructions
59// * XML schemas
60// * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
61// * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
62//
63// it tolerates (read: ignores) xml declarations, at any point in the file where a tag would be valid
64//
65// it currently does support:
66// * comments
67// * empty element tags, e.g. <hello />
68// * entity references: &amp; &lt; &gt; &apos; &quot;
69// * numeric character references: &#32;
70// * std::exception/error handling on parse errors
71
72
73/* From the W3C XML 1.0 Third Edition
74 http://www.w3.org/TR/2004/REC-xml-20040204/
75
76 The following productions specify well-formed XML documents.
77 These have been reduced to the subset that this parser is expected to support.
78
79 element ::= EmptyElemTag
80 | STag content ETag
81 STag ::= '<' Name (S Attribute)* S? '>'
82 Attribute ::= Name Eq AttValue
83 ETag ::= '</' Name S? '>'
84 content ::= CharData? ((element | Reference | CDSect | Comment) CharData?)*
85 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
86
87 AttValue ::= '"' ([^<&"] | Reference)* '"'
88 | "'" ([^<&'] | Reference)* "'"
89
90 CharRef ::= '&#' [0-9]+ ';'
91 EntityRef ::= '&' Name ';'
92 Reference ::= EntityRef | CharRef
93
94 #x20 (space)
95 #x9 (horizontal tab)
96 #xD (carriage return)
97 #xA (new line, new line line feed)
98
99 S ::= (#x20 | #x9 | #xD | #xA)+
100 Eq ::= S? '=' S?
101 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
102 Name ::= (Letter | '_' | ':') (NameChar)*
103
104 Letter ::= [#x0041-#x005A] | [#x0061-#x007A]
105 | [#x00C0-#x00D6] | [#x00D8-#x00F6]
106 | [#x00F8-#x00FF]
107 Digit ::= [#x0030-#x0039]
108
109 Char ::= #x9 | #xA | #xD | [#x20-#xFF]
110 CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
111 that is, some std::string of characters not containing '<' or '&' or ']]>'
112 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
113 that is, '<!--' txt '-->', where txt does not contain '--'
114
115 CDSect ::= CDStart CData CDEnd
116 CDStart ::= '<![CDATA['
117 CData ::= (Char* - (Char* ']]>' Char*))
118 CDEnd ::= ']]>'
119
120 document ::= prolog element Misc*
121 prolog ::= XMLDecl? Misc*
122 XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
123 Misc ::= Comment | S
124
125 VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
126 Eq ::= S? '=' S?
127 VersionNum ::= '1.' [0-9]+
128 Misc ::= Comment | S
129
130
131
132*/
133
// Parse-error helper: forwards to TEUCHOS_TEST_FOR_EXCEPTION with
// std::runtime_error and prefixes the message with the parser's current line.
// Call sites in this file pass a condition that is true when the input is bad.
// Must be expanded inside an XMLParser member, because it reads `_lineNo`.
#define XMLPARSER_TFE( failed , details ) \
  TEUCHOS_TEST_FOR_EXCEPTION( failed, std::runtime_error, \
                              "XML parse error at line " << _lineNo << ": " << details )
136
138{
139
141
142 _entities.clear();
143 _entities["apos"] = "'";
144 _entities["quot"] = "\"";
145 _entities["lt"] = "<";
146 _entities["gt"] = ">";
147 _entities["amp"] = "&";
148
149 bool done = false;
150 int curopen = 0; // number of currently open tags, or "do we process character data?"
151 bool gotRoot = false;
152 std::stack<long> tagLineStarts;
153 std::stack<string> tags;
154
155 while (!done) {
156
157 std::string tag, cdata;
158 unsigned char c1, c2;
159 Teuchos::map<std::string,string> attrs;
160
161 // Consume any whitespace
162 if (curopen == 0) {
163 // this will leave a lookahead in c1
164 c1 = '\0';
165 if ( getSpace(c1) ) {
166 done = true;
167 break;
168 }
169 }
170 else {
171 // need to manually lookahead
172 if (_is->readBytes(&c1,1) < 1) {
173 done = true;
174 break;
175 }
176 if (c1 == '\n') ++_lineNo; // a newline while processing character data; not an error
177 }
178
179 if (c1 == '<') {
180 // determine if it is a STag/EmptyElemTag or ETag or Comment
181 // get lookahead
182 XMLPARSER_TFE( _is->readBytes(&c2,1) < 1 , "stream ended in tag begin/end");
183
184 if (c2 == '/') {
185 // we have: </
186 // try to get an ETag
187 getETag(tag);
188 // have to check whether we have an enclosing, otherwise tags and tagLineStarts have no top()
189 XMLPARSER_TFE( curopen == 0, "document not well-formed: encountered end element '" << tag << "' while not enclosed." );
190 XMLPARSER_TFE( handler->endElement(tag)!=0, "document not well-formed: end element tag = '" << tag << "'"
191 << " did not match start element '" << tags.top()
192 << "' from line " << tagLineStarts.top() );
193 curopen--;
194 tagLineStarts.pop();
195 tags.pop();
196 }
197 else if (isLetter(c2) || c2==':' || c2=='_') {
198 // it looks like a STag or an EmptyElemTag
199 bool emptytag;
200 tagLineStarts.push(_lineNo);
201 getSTag(c2, tag, attrs, emptytag);
202 tags.push(tag);
203 handler->startElement(tag,attrs);
204 if (curopen == 0) {
205 XMLPARSER_TFE(gotRoot == true, "document not well-formed: more than one root element specified" );
206 gotRoot = true;
207 }
208 curopen++;
209 if (emptytag) {
210 // we just open this tag, so we should have any trouble closing it
211 XMLPARSER_TFE( handler->endElement(tag)!=0, "unknown failure from handler while processing tag '" << tag << "'" );
212 curopen--;
213 tagLineStarts.pop();
214 tags.pop();
215 }
216 }
217 else if (c2 == '?') {
218 // it is starting to look like an xml declaration
219 XMLPARSER_TFE( assertChar('x') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
220 XMLPARSER_TFE( assertChar('m') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
221 XMLPARSER_TFE( assertChar('l') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
223 }
224 else if (c2 == '!') {
225 // it is starting to look like a comment; we need '--'
226 // if we don't get this, it means
227 // * the document is not well-formed
228 // * the document employs a feature not supported by this parser,
229 // e.g. <!ELEMENT... <!ATTLIST... <!DOCTYPE... <![CDATA[...
230 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
231 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
233 }
234 else {
235 XMLPARSER_TFE(true, "element not well-formed or exploits unsupported feature" );
236 }
237 }
238 else if ( (curopen > 0) && (c1 == '&') ) {
239 std::string chars = "";
240 getReference(chars);
241 handler->characters(chars);
242 }
243 else if ( (curopen > 0) ) {
244 std::string chars = "";
245 chars.push_back(c1);
246 handler->characters(chars);
247 }
248 else {
249 XMLPARSER_TFE(1 , "document not well-formed: character data outside of an enclosing tag");
250 }
251 }
252
253 XMLPARSER_TFE( curopen != 0 , "file ended before closing element '" << tags.top() << "' from line " << tagLineStarts.top() );
254
255 return handler->getObject();
256
257}
258
259
260void XMLParser::getETag(std::string &tag)
261{
262 /* Recall from the specification:
263 ETag ::= '</' Name S? '>'
264 Name ::= (Letter | '_' | ':') (NameChar)*
265
266 We have already consumed: </
267 */
268
269 bool tagover = false;
270 unsigned char c;
271 // clear tag
272 tag = "";
273 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
274 XMLPARSER_TFE( !isLetter(c) && c!='_' && c!=':' , "tag not well-formed");
275 tag.push_back(c);
276 while (1) {
277 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
278 if ( isNameChar(c) ) {
279 if (tagover) {
280 XMLPARSER_TFE(1, "end element not well-formed: expected '>'");
281 }
282 tag.push_back(c);
283 }
284 else if (isSpace(c)) {
285 // mark the end of the tag and consume the whitespace
286 // if it is ia newline, it isn't an error
287 if (c == '\n') ++_lineNo;
288 tagover = true;
289 }
290 else if (c == '>') {
291 break;
292 }
293 else {
294 XMLPARSER_TFE(1, "end element not well-formed");
295 }
296 }
297}
298
299
300void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag)
301{
302
303 /* Recall from the specification:
304
305 STag ::= '<' Name (S Attribute)* S? '>'
306 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
307 Name ::= (Letter | '_' | ':') (NameChar)*
308 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
309
310 S ::= (#x20 | #x9 | #xD | #xA)+
311 Attribute ::= Name Eq AttValue
312 Eq ::= S? '=' S?
313 AttValue ::= '"' ([^<&"] | Reference)* '"'
314 | "'" ([^<&'] | Reference)* "'"
315 Reference ::= EntityRef | CharRef
316 CharRef ::= '&#' [0-9]+ ';'
317 EntityRef ::= '&' Name ';'
318
319 We have already consumed: <lookahead
320 */
321
322 unsigned char c;
323 attrs.clear();
324
325 tag = lookahead;
326 // get the rest of the tag: (NameChar)*
327 while (1) {
328 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before start element was terminated");
329 if (isNameChar(c)) {
330 tag.push_back(c);
331 }
332 else {
333 break;
334 }
335 }
336
337 // after the name: should be one of the following
338 // (S Attribute) | S? '>' | S? '/>'
339 do {
340
341 bool hadspace = false;
342
343 // if space, consume the whitespace
344 if ( isSpace(c) ) {
345 hadspace = true;
346 XMLPARSER_TFE( getSpace(c)!=0, "EOF before start element was terminated");
347 }
348
349 // now, either Attribute | '>' | '/>'
350 if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
351
352 // Attribute
353 // get attribute name, starting with contents of c
354 std::string attname, attval;
355 attname = c;
356 do {
357 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
358 if ( isNameChar(c) ) {
359 attname.push_back(c);
360 }
361 else if ( isSpace(c) || c=='=' ) {
362 break;
363 }
364 else {
365 XMLPARSER_TFE(1, "attribute not well-formed: expected whitespace or '='");
366 }
367 } while (1);
368
369 // if whitespace, consume it
370 if (isSpace(c)) {
371 getSpace(c);
372 }
373 // should be on '='
374 if (c != '=') {
375 XMLPARSER_TFE(1, "attribute not well-formed: expected '='");
376 }
377
378 // get any whitespace following the '='
379 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
380 if (isSpace(c)) {
381 getSpace(c);
382 }
383
384 // now get the quoted attribute value
385 bool apost;
386 attval = "";
387 if (c == '\'') {
388 apost = true;
389 }
390 else if (c == '\"') {
391 apost = false;
392 }
393 else {
394 XMLPARSER_TFE(1, "attribute value must be quoted with either ''' or '\"'");
395 }
396 do {
397 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
398 if (apost && c=='\'') {
399 // end of attval
400 break;
401 }
402 else if (!apost && c=='\"') {
403 // end of attval
404 break;
405 }
406 else if ( c == '&' ) {
407 // finish: need to add support for Reference
408 std::string refstr;
409 getReference(refstr);
410 attval += refstr;
411 }
412 else if ( c!='<' ) {
413 // valid character for attval
414 attval.push_back(c);
415 }
416 else {
417 XMLPARSER_TFE(1, "invalid character in attribute value");
418 }
419 } while(1);
420
421 // add attribute to list
422 XMLPARSER_TFE( attrs.find(attname) != attrs.end() , "cannot have two attributes with the same name");
423 attrs[attname] = attval;
424 }
425 else if (c == '>') {
426 emptytag = false;
427 break;
428 }
429 else if (c == '/') {
430 XMLPARSER_TFE(assertChar('>')!=0, "empty element tag not well-formed: expected '>'");
431 emptytag = true;
432 break;
433 }
434 else {
435 XMLPARSER_TFE(1, "start element not well-formed: invalid character");
436 }
437
438 // get next char
439 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
440
441 } while(1);
442}
443
444
445void XMLParser::getComment(long /* startLine */)
446{
447 /* Recall from the specification:
448 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
449 that is, '<!--' txt '-->', where txt does not contain '--'
450 We have already consumed: <!--
451
452 Be wary here of the fact that c=='-' implies isChar(c)
453 */
454 unsigned char c;
455 while (1) {
456 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
457 if (c == '\n') ++_lineNo;
458 // if we have a -
459 if (c=='-') {
460 // then it must be the end of the comment or be a Char
461 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
462 if (c == '\n') ++_lineNo;
463 if (c=='-') {
464 // this had better be leading to the end of the comment
465 XMLPARSER_TFE( assertChar('>')!=0, "comment not well-formed: missing expected '>' at line " << _lineNo );
466 break;
467 }
468 else if (!isChar(c)) {
469 XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
470 }
471 }
472 else if (!isChar(c)) {
473 XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
474 }
475 }
476}
477
478
479void XMLParser::getReference(std::string &refstr) {
480 // finish: does CharRef support only dec, or hex as well?
481 unsigned char c;
482 unsigned int num, base;
483 refstr = "";
484 // none of these bytes read are allowed to be a newline, so don't do any incrementing of _lineNo
485 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
486 if (c == '#') {
487 // get a CharRef
488 // CharRef ::= '&#' [0-9]+ ';'
489 // | '&#x' [0-9]+ ';'
490 // get first number
491 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
492 if (c == 'x') {
493 base = 16;
494 num = 0;
495 }
496 else if ('0' <= c && c <= '9') {
497 base = 10;
498 num = c - '0';
499 }
500 else {
501 XMLPARSER_TFE(1, "invalid character in character reference: expected 'x' or [0-9]");
502 }
503
504 do {
505 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
506 XMLPARSER_TFE( c != ';' && !('0' <= c && c <= '9') , "invalid character in character reference: expected [0-9] or ';'");
507 if (c == ';') {
508 break;
509 }
510 num = num*base + (c-'0');
511 } while (1);
512 XMLPARSER_TFE(num > 0xFF, "character reference value out of range");
513 refstr.push_back( (unsigned char)num );
514 }
515 else if (isLetter(c) || c=='_' || c==':') {
516 // get an EntityRef
517 // EntityRef ::= '&' Name ';'
518 std::string entname = "";
519 entname.push_back(c);
520 do {
521 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
522 if (c==';') {
523 break;
524 }
525 else if ( isLetter(c) || ('0' <= c && c <= '9')
526 || c=='.' || c=='-' || c=='_' || c==':'
527 || c==0xB7 ) {
528 entname.push_back(c);
529 }
530 else {
531 XMLPARSER_TFE(1, "entity reference not well-formed: invalid character");
532 }
533 } while (1);
534 XMLPARSER_TFE( _entities.find(entname) == _entities.end(), "entity reference not well-formed: undefined entity");
535 refstr = _entities[entname];
536 }
537 else {
538 XMLPARSER_TFE(1, "reference not well-formed: expected name or '#'");
539 }
540}
541
542
543int XMLParser::getSpace(unsigned char &lookahead) {
544 // if space, consume the whitespace
545 do {
546 if (lookahead == '\n') ++_lineNo;
547 if (_is->readBytes(&lookahead,1) < 1) {
548 return 1; // inform caller that we reached the end
549 }
550 }
551 while (isSpace(lookahead));
552 return 0;
553}
554
555
556bool XMLParser::isLetter(unsigned char c) {
557 if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
558 (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
559 (0xF8 <= c) /* unsigned char must be <= 0xFF */ )
560 {
561 return true;
562 }
563 return false;
564}
565
566
567bool XMLParser::isNameChar(unsigned char c) {
568 if ( isLetter(c) || ('0' <= c && c <= '9') ||
569 c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 )
570 {
571 return true;
572 }
573 return false;
574}
575
576
577bool XMLParser::isSpace(unsigned char c) {
578 if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
579 {
580 return true;
581 }
582 return false;
583}
584
585
586bool XMLParser::isChar(unsigned char c) {
587 if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) { // unsigned char must be <= 0xFF
588 return true;
589 }
590 return false;
591}
592
593
594int XMLParser::assertChar(unsigned char cexp)
595{
596 // pull the next character off the stream and verify that it is what is expected
597 // if not, return an error to the caller
598 unsigned char c;
599 // don't worry about newlines; assertChar is always wrapped in TEST_FOR_EXCEPTION, so we don't want to advance the line counter
600 if (_is->readBytes(&c,1) < 1) {
601 return 1;
602 }
603 if (c != cexp) {
604 return 2;
605 }
606 return 0;
607}
608
610{
611 /* Be a little lax on the spec here; read until we get to '?', then assert '>'
612 We have already consumed: <xml
613 */
614 unsigned char c;
615 while (1) {
616 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating XML declaration begun at line " << _lineNo );
617 if (c == '\n') ++_lineNo;
618 // if we have a -
619 if (c=='?') {
620 // this had better be leading to the end of the declaration
621 XMLPARSER_TFE( assertChar('>')!=0, "XML declaration not well-formed: missing expected '>' at line " << _lineNo );
622 break;
623 }
624 }
625}
Defines a class for assembling an XMLObject from XML input.
#define XMLPARSER_TFE(T, S)
A class providing a simple XML parser. Methods can be overloaded to exploit external XML parsing libr...
Smart reference counting pointer class for automatic garbage collection.
TreeBuildingXMLHandler assembles a XMLObject from your XML input.
Representation of an XML data tree. XMLObject is a ref-counted handle to a XMLObjectImplem object,...
static bool isLetter(unsigned char c)
Determine whether c matches the Letter production according to the XML specification.
void getReference(std::string &refstr)
Consumes a Reference production according to the XML specification.
static bool isSpace(unsigned char c)
Determine whether c matches the Space production according to the XML specification.
int getSpace(unsigned char &lookahead)
Consumes a Space (block of whitespace) production according to the XML specification.
static bool isNameChar(unsigned char c)
Determine whether c matches the NameChar production according to the XML specification.
XMLObject parse()
Consume the XMLInputStream to build an XMLObject.
void ignoreXMLDeclaration()
Ignore the rest of an XML declaration tag.
void getSTag(unsigned char lookahead, std::string &tag, Teuchos::map< std::string, string > &attrs, bool &emptytag)
Consume a STag production according to the XML specification. getSTag throws an std::exception if the...
RCP< XMLInputStream > _is
int assertChar(unsigned char cexp)
Determines whether the next character on the stream is the expected character.
void getETag(std::string &tag)
Consume a ETag production according to the XML specification. getETag throws an std::exception if the...
static bool isChar(unsigned char c)
Determine whether c matches the Char production according to the XML specification.
void getComment(long startLine)
Consume a Comment production according to the XML specification. getComment throws an std::exception ...
Teuchos::map< std::string, string > _entities
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Deprecated.