utils.HtmlStrip source code

1 /*
2  * This file is part of gtkD.
3  * 
4  * gtkD is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU Lesser General Public License
6  * as published by the Free Software Foundation; either version 3
7  * of the License, or (at your option) any later version, with
8  * some exceptions, please read the COPYING file.
9  * 
10  * gtkD is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public License for more details.
14  * 
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with gtkD; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
18  */
19 
20 /**
21  * grabs the text of an html doc
22  */
23 
24 module utils.HtmlStrip;
25 
26 //debug=amper;
27 //debug=file;
28 
/**
 * Extracts the readable text of an HTML document.
 *
 * Markup between '<' and '>' is dropped, a few tags are turned into line
 * breaks or markers, and a small set of character entities is decoded.
 * The public flags below select which of these conversions are applied.
 */
public class HtmlStrip
{
	
	private import std.file;
	private import std.stdio;
	private import std.conv;
	private import std.string;
	private import std.utf;

	/// Rewrite C comment delimiters found in the text: "/*" is emitted as "/+*" and "*/" as "+/".
	public bool convertComment = true;
	/// Emit "\n<hr>\n" for every <hr> / <HR> tag.
	public bool markHR = true;
	/// Emit a line break for every <p> / <P> tag.
	public bool markP = true;
	/// Emit a line break for every closing </h2> / </h3> tag (either case).
	public bool markH = true;
	/// Never emit more than two consecutive line breaks.
	public bool removeEmptyLines = true;
	/// Drop a space that directly follows another space.
	public bool removeExtraSpaces = true;

	public this()
	{
		
	}
	
	/**
	 * Strips the markup from htmlText and returns the remaining text.
	 *
	 * Handling of the input:
	 * $(UL
	 *   $(LI everything between '<' and '>' is collected as the tag text and
	 *        not copied to the output;)
	 *   $(LI the content of div class="informalexample" / div class="example"
	 *        is replaced by a "$(DDOC_COMMENT example)" marker line and skipped
	 *        until the next closing div tag;)
	 *   $(LI the entities &lt; &gt; and &nbsp; are decoded, every other
	 *        entity is dropped;)
	 *   $(LI a U+00A0 character is emitted as a plain space.)
	 * )
	 *
	 * Known limitation: an '&' that is never followed by ';' keeps the
	 * entity state active, so the text after it is swallowed up to the
	 * next ';'.
	 *
	 * Params:
	 *  htmlText = the HTML source, expected to be UTF-8
	 *  checkUTF = when true, bytes of htmlText that cannot be decoded as
	 *             UTF-8 are replaced by spaces before processing (instead of
	 *             the decoding loop aborting with an exception), and the
	 *             result is passed through cleanUTF
	 * Returns: the stripped text
	 */
	string strip(string htmlText, bool checkUTF=true)
	{
		int markupCount = 0;
		dchar[] stripped;
		dchar pc = ' ';
		string mark;
		bool inAmper = false;
		bool inCode = false;
		string amper;

		// The foreach below decodes the input and throws on malformed UTF-8,
		// so the clean-up has to happen before the loop to have any effect.
		string text = checkUTF ? sanitizeUTF8(htmlText) : htmlText;

		foreach ( dchar c ; text )
		{
			switch ( c )
			{
				case '<':
					++markupCount;
					mark.length = 0;
					break;
					
				case '>': 
					--markupCount;
					if ( markHR && (mark == "hr" || mark == "HR") )
					{
						stripped ~= "\n<hr>\n";
					}
					else if ( markP && (mark == "p" || mark == "P") )
					{
						stripped ~= "\n";
					}
					else if ( markH && (mark == "/h2" || mark == "/H2") )
					{
						stripped ~= "\n";
					}
					else if ( markH && (mark == "/h3" || mark == "/H3") )
					{
						stripped ~= "\n";
					}
					else if ( mark == "div class=\"informalexample\"" || mark == "div class=\"example\"" )
					{
						inCode = true;
						stripped ~= "\n$(DDOC_COMMENT example)\n";
					}
					else if ( mark == "/div" )
					{
						inCode = false;
					}
					break;
					
				case '&':
					inAmper = true;
					amper = "&";
					break;
				case '\u00A0':
					// Guard against an empty buffer: the non-breaking space
					// may be the very first character that is emitted.
					if ( stripped.length == 0 || stripped[$-1] != ' ' )
						stripped ~= ' ';
					break;

				default:
					if ( inAmper )
					{
						if ( c==';' )
						{
							debug(amper) writefln("amper = %s", amper);
							switch ( amper )
							{
								case "&lt"	: c = '<'; break;
								case "&gt"	: c = '>'; break;
								case "&nbsp": c = ' '; break;
								default: c = '\0'; break;
							}
							inAmper = false;
							amper.length = 0;
						}
						else
						{
							// still collecting the entity name; '\0' marks
							// the character as consumed
							amper ~= c;
							c = '\0';
						}
					}
					if ( c == '\0' )
					{
						// ignore it
					}
					else if ( markupCount <= 0 && !inCode )
					{
						if ( convertComment && pc == '/' && c == '*' )
						{
							// the '*' itself is still appended by the chain
							// below, giving "/+*"
							stripped ~= '+';
						}
						if ( convertComment && pc == '*' && c == '/' )
						{
							// turn the already emitted '*' into '+', giving "+/"
							stripped[stripped.length-1] = '+';
							stripped ~= c;
						}
						else if ( removeEmptyLines 
									&& c == '\n' 
									&& stripped.length >= 2
									&& stripped[stripped.length-1] == '\n'
									&& stripped[stripped.length-2] == '\n'
									)
						{
							// ignore this EOL, two are already in place
						}
						else if ( removeExtraSpaces && c == ' ' && pc == ' ' )
						{
							// skip this space
						}
						else
						{
							stripped ~= c;
						}
					}
					else if ( markupCount > 0 )
					{
						// inside a tag: collect its text for the '>' handler
						mark ~= c;
					}
					break;
			}
			pc = c;
		}

		if ( checkUTF )
		{
			cleanUTF(stripped);
		}

		return to!string(stripped);
	}
	
	/**
	 * Reads the file and strips its content, see strip().
	 *
	 * Params:
	 *  filename = path of the HTML file to read
	 * Returns: the stripped text of the file
	 */
	string stripFile(string filename)
	{
		debug(file)writefln("HtmlStrip.stripFile filename = %s", filename);
		string text = cast(string)std.file.read(filename);
		
		//writefln("Original html:\n%s", text);

		return strip(text);
	}
	
	/**
	 * Replaces, in place, every element of str that std.utf.decode rejects
	 * with a space.
	 *
	 * Params:
	 *  str = the UTF-32 text to clean; its elements are modified directly
	 */
	public void cleanUTF(dchar[] str)
	{
		//printf("before utfClean\n%s\nend before utfClean\n", (str~"\0").ptr);
		size_t i = 0;
		while ( i < str.length )
		{
			immutable size_t start = i;
			try
			{
				std.utf.decode(str, i);
			}
			catch ( UTFException e )
			{
				// do not rely on where decode left the index
				str[start] = ' ';
				i = start + 1;
			}

		}
		
		//writefln("after utfClean\n%s\nend after utfClean", str);
	}
	
	/**
	 * Returns text with every byte that cannot be decoded as UTF-8 replaced
	 * by a space. Well-formed input is returned unchanged, without copying.
	 */
	private string sanitizeUTF8(string text)
	{
		try
		{
			std.utf.validate(text);
			return text;
		}
		catch ( UTFException e )
		{
			// malformed input, rebuild it below
		}

		char[] cleaned;
		cleaned.reserve(text.length);
		size_t pos = 0;
		while ( pos < text.length )
		{
			immutable size_t start = pos;
			try
			{
				std.utf.decode(text, pos);
				cleaned ~= text[start .. pos];
			}
			catch ( UTFException e )
			{
				// replace the offending byte and resume right after it
				cleaned ~= ' ';
				pos = start + 1;
			}
		}
		return cleaned.idup;
	}
	
}
221 
222 
version (standAlone)
{
	
	private import std.stdio;

	/**
	 * Stand-alone driver: strips one HTML file and prints the result.
	 *
	 * Params:
	 *  args = command line; args[1], when present, is the file to strip.
	 *         Without it the original sample path is used.
	 * Returns: 0
	 */
	int main(string[] args)
	{
		// Fall back to the historic sample file so running the program
		// without arguments behaves as before.
		string filename = args.length > 1
			? args[1]
			: "/home/mike/D/gtkD/gtkD-2.20/wrap/gtkdocs/glib-html-2.24.0/glib-Lexical-Scanner.html";

		HtmlStrip strip = new HtmlStrip();
		string stripped = strip.stripFile(filename);
	
		writefln("Stripped html:\n%s", stripped);
	
		return 0;
	}
	
}