1 /*
2  * This file is part of gtkD.
3  *
4  * gtkD is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU Lesser General Public License
6  * as published by the Free Software Foundation; either version 3
7  * of the License, or (at your option) any later version, with
8  * some exceptions, please read the COPYING file.
9  *
10  * gtkD is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with gtkD; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
18  */
19 
20 module utils.XML;
21 
22 import std.algorithm;
23 import std.array;
24 import std.conv : to;
25 import std.exception;
26 import std.range;
27 import std.string;
28 import std.traits: isSomeChar;
29 import std.uni;
30 
/**
 * A single node produced by XMLReader.
 *
 * XMLReader.popFront replaces the reader's `front` with a fresh XMLNode
 * and then fills it in while parsing.
 */
struct XMLNode
{
	/// Kind of node; XMLNodeType.None until a parser assigns one.
	XMLNodeType type;

	/// For start, empty and end tags this is the tag name; for text nodes
	/// it is the text with the supported character references decoded.
	string value;
	/// Attribute values keyed by attribute name, filled in while parsing a tag.
	string[string] attributes;
}
38 
/**
 * The kinds of node XMLReader can report in XMLNode.type.
 */
enum XMLNodeType
{
	/// Default value of a freshly created XMLNode.
	None,
	/// Processing instruction: `<? ... ?>`.
	PI,
	/// Opening tag: `<name ...>`.
	StartTag,
	/// Character data between tags.
	Text,
	/// `<![CDATA[ ... ]]>` section.
	CData,
	/// `<!DOCTYPE ...>` declaration.
	DocType,
	/// `<!-- ... -->` comment.
	Comment,
	/// Self-closing tag: `<name ... />`.
	EmptyTag,
	/// Closing tag: `</name>`.
	EndTag,
	/// Set by XMLReader.popFront once the underlying document is exhausted.
	DocumentEnd
}
52 
53 class XMLReader(T)
54 	if (isInputRange!T &&  isSomeChar!(ElementType!T) )
55 {
56 	XMLNode front;
57 	string fileName;
58 
59 	static if ( is( T == string ) )
60 		private CountLines!ByChar document;
61 	else
62 		private CountLines!T document;
63 
64 	/**
65 	 * Params:
66 	 *     document = The XML document to parse.
67 	 *     fileName = File name to print in diagnostic messages.
68 	 */
69 	this(T document, string fileName = null)
70 	{
71 		static if ( is( T == string ) )
72 			this.document = CountLines!ByChar(ByChar(document));
73 		else
74 			this.document = CountLines!T(document);
75 
76 		this.fileName = fileName;
77 
78 		popFront();
79 	}
80 
81 	@property size_t line()
82 	{
83 		return document.line;
84 	}
85 
86 	void popFront()
87 	{
88 		front = XMLNode();
89 
90 		if ( document.empty )
91 		{
92 			front.type = XMLNodeType.DocumentEnd;
93 			return;
94 		}
95 
96 		if ( document.front == '<' )
97 			parseTag();
98 		else
99 			parseText();
100 	}
101 
102 	@property bool empty()
103 	{
104 		return document.empty && front.type == XMLNodeType.DocumentEnd;
105 	}
106 
107 	private void parseTag()
108 	{
109 		document.popFront();
110 
111 		switch ( document.front )
112 		{
113 			case '!':
114 				document.popFront();
115 				switch ( document.front )
116 				{
117 					case '[':
118 						enforce(document.skipOver("[CDATA["));
119 						parseCDATA();
120 						break;
121 					case 'D':
122 						enforce(document.skipOver("!DOCTYPE"));
123 						parseDocType();
124 						break;
125 					case '-':
126 						enforce(document.skipOver("--"));
127 						parseComment();
128 						break;
129 					default:
130 						throw new XMLException(this, "Invalid XML tag");
131 				}
132 				break;
133 			case '?':
134 				document.popFront();
135 				parsePI();
136 				break;
137 			case '/':
138 				document.popFront();
139 				parseStartTag();
140 				front.type = XMLNodeType.EndTag;
141 				break;
142 			default:
143 				parseStartTag();
144 				break;
145 		}
146 
147 		skipWhitespace();
148 	}
149 
150 	private void parseCDATA()
151 	{
152 		front.type = XMLNodeType.CData;
153 		auto buff = appender!string();
154 
155 		while ( !document.empty )
156 		{
157 			if ( document.front == ']' )
158 			{
159 				document.popFront();
160 
161 				if ( document.front != ']' )
162 				{
163 					buff.put(']');
164 					buff.put(document.front);
165 					document.popFront();
166 					continue;
167 				}
168 
169 				document.popFront();
170 
171 				if ( document.front == '>' )
172 				{
173 					document.popFront();
174 					return;
175 				}
176 			}
177 
178 			buff.put(document.front);
179 			document.popFront();
180 		}
181 
182 		front.value = buff.data;
183 	}
184 
185 	private void parseDocType()
186 	{
187 		front.type = XMLNodeType.DocType;
188 		auto buff = appender!string();
189 		int bracketCount;
190 
191 		skipWhitespace();
192 
193 		while ( !document.empty )
194 		{
195 			switch ( document.front )
196 			{
197 				case '[':
198 					bracketCount++;
199 					break;
200 				case ']':
201 					bracketCount--;
202 					break;
203 				case '>':
204 					if ( bracketCount == 0 )
205 					{
206 						document.popFront();
207 						return;
208 					}
209 					break;
210 				default: break;
211 			}
212 
213 			buff.put(document.front);
214 			document.popFront();
215 		}
216 
217 		front.value = buff.data.stripRight();
218 	}
219 
220 	private void parseComment()
221 	{
222 		front.type = XMLNodeType.Comment;
223 		auto buff = appender!string();
224 
225 		while ( !document.empty )
226 		{
227 			if ( document.front == '-' )
228 			{
229 				document.popFront();
230 
231 				if ( document.front != '-' )
232 				{
233 					buff.put('-');
234 					buff.put(document.front);
235 					document.popFront();
236 					continue;
237 				}
238 
239 				document.popFront();
240 
241 				if ( document.front == '>' )
242 				{
243 					document.popFront();
244 					return;
245 				}
246 
247 				throw new XMLException(this, "-- not allowed in comments.");
248 			}
249 
250 			buff.put(document.front);
251 			document.popFront();
252 		}
253 
254 		front.value = buff.data.strip();
255 	}
256 
257 	private void parsePI()
258 	{
259 		front.type = XMLNodeType.PI;
260 		auto buff = appender!string();
261 
262 		while ( !document.empty )
263 		{
264 			if ( document.front == '?' )
265 			{
266 				document.popFront();
267 
268 				if ( document.front == '>' )
269 				{
270 					document.popFront();
271 					return;
272 				}
273 
274 				buff.put('?');
275 			}
276 
277 			buff.put(document.front);
278 			document.popFront();
279 		}
280 
281 		front.value = buff.data.stripRight();
282 	}
283 
284 	private void parseStartTag()
285 	{
286 		front.type = XMLNodeType.StartTag;
287 		auto buff = appender!string();
288 
289 		while ( !document.empty && !(document.front.isWhite() || document.front == '/' || document.front == '>') )
290 		{
291 			buff.put(document.front);
292 			document.popFront();
293 		}
294 
295 		front.value = buff.data;
296 
297 		while ( !document.empty )
298 		{
299 			skipWhitespace();
300 
301 			if ( document.front == '/' )
302 			{
303 				front.type = XMLNodeType.EmptyTag;
304 				document.popFront();
305 			}
306 
307 			if ( document.front == '>' )
308 			{
309 				document.popFront();
310 				return;
311 			}
312 
313 			buff = appender!string();
314 			string attName;
315 
316 			while ( !document.empty && !(document.front.isWhite() || document.front == '=') )
317 			{
318 				buff.put(document.front);
319 				document.popFront();
320 			}
321 
322 			document.popFront();
323 			if ( document.front == '=' )
324 				document.popFront();
325 
326 			attName = buff.data;
327 			buff = appender!string();
328 
329 			if ( document.front.isWhite() )
330 				skipWhitespace();
331 
332 			ElementType!(typeof(document)) quote = document.front;
333 			document.popFront();
334 
335 			AttValue: while ( !document.empty )
336 			{
337 				switch ( document.front )
338 				{
339 					case '\'':
340 					case '"':
341 						if ( document.front != quote )
342 							goto default;
343 
344 						document.popFront();
345 						break AttValue;
346 					case '&':
347 						parseAmpersand(buff);
348 						break;
349 					default:
350 						buff.put(document.front);
351 						break;
352 				}
353 
354 				document.popFront();
355 			}
356 
357 			front.attributes[attName] = buff.data;
358 		}
359 	}
360 
361 	private void parseText()
362 	{
363 		front.type = XMLNodeType.Text;
364 		auto buff = appender!string();
365 
366 		Text: while ( !document.empty )
367 		{
368 			switch ( document.front )
369 			{
370 				case '<':
371 					break Text;
372 				case '&':
373 					parseAmpersand(buff);
374 					break;
375 				default:
376 					buff.put(document.front);
377 					break;
378 			}
379 
380 			document.popFront();
381 		}
382 
383 		front.value = buff.data.stripRight();
384 	}
385 
386 	private void skipWhitespace()
387 	{
388 		while ( !document.empty && isWhite(document.front) )
389 			document.popFront();
390 	}
391 
392 	private void parseAmpersand(Appender!(string) buff)
393 	{
394 		ElementType!(typeof(document))[5] sequence;
395 		int index;
396 
397 		document.popFront();
398 
399 		while ( document.front != ';' )
400 		{
401 			sequence[index++] = document.front;
402 			document.popFront();
403 		}
404 
405 		switch ( sequence[0 .. index] )
406 		{
407 			case "#34":
408 			case "quot":
409 				buff.put('"');
410 				break;
411 			case "#38":
412 			case "amp":
413 				buff.put('&');
414 				break;
415 			case "#39":
416 			case "apos":
417 				buff.put('\'');
418 				break;
419 			case "#60":
420 			case "lt":
421 				buff.put('<');
422 				break;
423 			case "#62":
424 			case "gt":
425 				buff.put('>');
426 				break;
427 			default:
428 				throw new XMLException(this, "Unregonized escape secuence");
429 		}
430 	}
431 
	// Entity references in text are decoded before the value is stored.
	unittest
	{
		auto reader = new XMLReader("&lt;test&gt;");
		assert(reader.front.value == "<test>");
	}
437 }
438 
/**
 * Skip the current tag and its content.
 * Leaves the reader pointing to the end tag with the same depth.
 *
 * An empty tag has no content, so the reader is left untouched.
 * For any other node that isn't a start tag only that node is skipped.
 */
void skipTag(T)(XMLReader!T reader)
{
	immutable startType = reader.front.type;

	if ( startType == XMLNodeType.EmptyTag )
		return;

	if ( startType != XMLNodeType.StartTag )
	{
		reader.popFront();
		return;
	}

	immutable name = reader.front.value;
	// Number of start tags that are still waiting for their end tag.
	size_t nesting;

	for ( ; !reader.empty; reader.popFront() )
	{
		switch ( reader.front.type )
		{
			case XMLNodeType.StartTag:
				++nesting;
				break;
			case XMLNodeType.EndTag:
				--nesting;
				break;
			default:
				break;
		}

		if ( nesting == 0 && reader.front.value == name )
			return;
	}
}
470 
/**
 * Is the current node an end tag with the name tagName.
 */
bool endTag(T)(XMLReader!T reader, string tagName)
{
	if ( reader.front.type != XMLNodeType.EndTag )
		return false;

	return reader.front.value == tagName;
}
478 
/// Is the current node an end tag whose name is one of tagNames.
bool endTag(T)(XMLReader!T reader, string[] tagNames ...)
{
	if ( reader.front.type != XMLNodeType.EndTag )
		return false;

	foreach ( candidate; tagNames )
	{
		if ( candidate == reader.front.value )
			return true;
	}

	return false;
}
484 
/**
 * Exception thrown by XMLReader. It carries the file name and the line
 * the reader was at when the problem was detected.
 */
class XMLException : Exception
{
	/**
	 * Params:
	 *     reader = The reader that detected the problem, supplies the
	 *              file name and line number.
	 *     msg    = Description of the problem.
	 */
	this (T)(XMLReader!T reader, string msg)
	{
		super(msg, reader.fileName, reader.line, null);
	}

	/**
	 * Formats the exception as `file(line): message`.
	 */
	override string toString()
	{
		string text;
		toString((in char[] chunk) { text ~= chunk; });
		return text;
	}

	/// ditto
	override void toString(scope void delegate(in char[]) sink) const
	{
		sink(file);
		sink("(");
		sink(to!string(line));
		sink(")");

		// Nothing more to print without a message.
		if ( msg.length == 0 )
			return;

		sink(": ");
		sink(msg);
	}

}
511 
/**
 * Forward range over the code units of a string,
 * no decoding is performed while iterating.
 */
struct ByChar
{
	/// The part of the string that has not been consumed yet.
	string data;

	alias data this;

	/// True when every code unit has been consumed.
	@property bool empty()
	{
		return data.length == 0;
	}

	/// The next code unit.
	@property char front()
	{
		return data[0];
	}

	/// Drops the first code unit.
	void popFront()
	{
		assert(data.length, "Attempting to popFront() past the end of an array");
		data = data[1 .. data.length];
	}

	/// An independent copy positioned at the same code unit.
	@property ByChar save()
	{
		return this;
	}
}
539 
/**
 * Range adapter that forwards to a character range and keeps track of the
 * current line number.
 *
 * `line` starts at 1 and is incremented every time a newline is consumed
 * by popFront, so it is the line of the character `front` returns.
 */
struct CountLines(Source) if (isSomeChar!(ElementType!Source))
{
	import std.range.primitives : ElementType, isForwardRange;

	/// The wrapped range.
	Source src;
	/// One based number of the line the range is currently at.
	size_t line = 1;

	this(Source src)
	{
		this.src = src;
	}

	@property ElementType!Source front()
	{
		return src.front;
	}

	@property bool empty()
	{
		return src.empty;
	}

	void popFront()
	{
		// Inspect the character before it is dropped: looking at the new
		// front afterwards would read past the end on the last element
		// and would never count a newline at the very start.
		if ( src.front == '\n' )
			line++;

		src.popFront();
	}

	// Only forward ranges can be saved, this keeps the adapter usable
	// with sources that are plain input ranges.
	static if ( isForwardRange!Source )
	{
		@property typeof(this) save()
		{
			auto copy = typeof(this)(src.save);
			// The copy continues counting from the current line.
			copy.line = line;
			return copy;
		}
	}
}