/*
 * This file is part of gtkD.
 *
 * gtkD is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 3
 * of the License, or (at your option) any later version, with
 * some exceptions, please read the COPYING file.
 *
 * gtkD is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with gtkD; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 */

/**
 * Grabs the text of an html doc.
 */

module utils.HtmlStrip;

//debug=amper;
//debug=file;

/**
 * Extracts the plain text of an HTML document.
 *
 * Markup between '<' and '>' is dropped; a few tags are turned into
 * line breaks or markers depending on the public option flags below.
 */
public class HtmlStrip
{

	private import std.file;
	private import std.stdio;
	private import std.conv;
	private import std.string;
	private import std.utf;

	/// Rewrite C style comment delimiters found in the text ("/*" and "*/") using '+'.
	public bool convertComment = true;
	/// Emit "\n<hr>\n" for every hr tag.
	public bool markHR = true;
	/// Emit a line break for every opening p tag.
	public bool markP = true;
	/// Emit a line break for every closing h2 / h3 tag.
	public bool markH = true;
	/// Do not let more than two consecutive line breaks through.
	public bool removeEmptyLines = true;
	/// Drop a space that directly follows another space in the input.
	public bool removeExtraSpaces = true;

	public this()
	{

	}

	/**
	 * Strips the markup from htmlText.
	 *
	 * Params:
	 *  htmlText = the html source
	 *  checkUTF = when true the result is passed through cleanUTF
	 *             before it is converted back to a string
	 *
	 * Returns: the text found outside of the markup. Text inside a
	 *  div of class "informalexample" or "example" is left out and
	 *  replaced by a "$(DDOC_COMMENT example)" marker line.
	 */
	string strip(string htmlText, bool checkUTF=true)
	{
		int markupCount = 0;    // > 0 while between '<' and '>'
		dchar[] stripped;       // the text collected so far
		dchar pc = ' ';         // previous input character ('\0' when it was swallowed)
		string mark;            // content of the tag currently being read
		bool inAmper = false;   // true while collecting an "&...;" entity
		bool inCode = false;    // true while inside an example div
		string amper;           // entity collected so far, starts with '&', never holds the ';'

		foreach ( dchar c ; htmlText )
		{
			switch ( c )
			{
				case '<':
					++markupCount;
					mark.length = 0;
					break;

				case '>':
					--markupCount;
					if ( markHR && (mark == "hr" || mark == "HR") )
					{
						stripped ~= "\n<hr>\n";
					}
					else if ( markP && (mark == "p" || mark == "P") )
					{
						stripped ~= "\n";
					}
					else if ( markH && (mark == "/h2" || mark == "/H2") )
					{
						stripped ~= "\n";
					}
					else if ( markH && (mark == "/h3" || mark == "/H3") )
					{
						stripped ~= "\n";
					}
					else if ( mark == "div class=\"informalexample\"" || mark == "div class=\"example\"" )
					{
						inCode = true;
						stripped ~= "\n$(DDOC_COMMENT example)\n";
					}
					else if ( mark == "/div" )
					{
						inCode = false;
					}
					break;

				case '&':
					// NOTE(review): an '&' that is never followed by ';' keeps
					// swallowing the input until the next ';' shows up.
					inAmper = true;
					amper = "&";
					break;

				case '\u00A0':
					// The length test avoids indexing an empty buffer when the
					// document starts with a no-break space.
					if ( stripped.length == 0 || stripped[$-1] != ' ' )
						stripped ~= ' ';
					break;

				default:
					if ( inAmper )
					{
						if ( c==';' )
						{
							debug(amper) writefln("amper = %s", amper);
							// amper holds '&' plus the entity name; the closing
							// ';' is never appended, so the labels omit it.
							switch ( amper )
							{
								case "&lt":   c = '<'; break;
								case "&gt":   c = '>'; break;
								case "&nbsp": c = ' '; break;
								default:      c = '\0'; break;
							}
							inAmper = false;
							amper.length = 0;
						}
						else
						{
							amper ~= c;
							c = '\0';
						}
					}
					if ( c == '\0' )
					{
						// ignore it
					}
					else if ( markupCount <= 0 && !inCode )
					{
						// NOTE(review): this test is not chained to the ones
						// below, so "/*" comes out as "/+*". Left as is because
						// chaining it would change the generated text.
						if ( convertComment && pc == '/' && c == '*' )
						{
							stripped ~= '+';
						}
						if ( convertComment && pc == '*' && c == '/' )
						{
							// the '*' was appended on the previous iteration
							stripped[stripped.length-1] = '+';
							stripped ~= c;
						}
						else if ( removeEmptyLines
								&& stripped.length > 2
								&& c == '\n'
								&& stripped[stripped.length-1] == '\n'
								&& stripped[stripped.length-2] == '\n'
								)
						{
							// ignore this EOL
						}
						else if ( removeExtraSpaces && c == ' ' && pc == ' ' )
						{
							// skip this space
						}
						else
						{
							stripped ~= c;
						}
					}
					else if ( markupCount > 0 )
					{
						mark ~= c;
					}
					break;
			}
			pc = c;
		}

		if ( checkUTF )
		{
			cleanUTF(stripped);
		}

		return to!string(stripped);
	}

	/**
	 * Reads the whole file and strips it with the default checkUTF setting.
	 *
	 * Params:
	 *  filename = path of the html file to read
	 *
	 * Returns: the stripped text, see strip
	 */
	string stripFile(string filename)
	{
		debug(file)writefln("HtmlStrip.stripFile filename = %s", filename);
		string text = cast(string)std.file.read(filename);

		//writefln("Original html:\n%s", text);

		return strip(text);
	}

	/**
	 * Replaces, in place, every element of str that std.utf.decode
	 * rejects with a space.
	 *
	 * Params:
	 *  str = the characters to check; the elements are modified directly
	 */
	public void cleanUTF(dchar[] str)
	{
		//printf("before utfClean\n%s\nend before utfClean\n", (str~"\0").ptr);
		size_t i = 0;
		while ( i < str.length )
		{
			try
			{
				// advances i past the element on success
				std.utf.decode(str, i);
			}
			catch ( UTFException e )
			{
				str[i] = ' ';
				++i;
			}

		}

		//writefln("after utfClean\n%s\nend after utfClean", str);
	}

}


version (standAlone)
{

	private import std.stdio;

	/**
	 * Strips one html file and prints the result.
	 *
	 * The file is taken from the first command line argument; without
	 * an argument the original hard coded sample path is used.
	 */
	int main(string[] args)
	{

		string filename = args.length > 1
			? args[1]
			: "/home/mike/D/gtkD/gtkD-2.20/wrap/gtkdocs/glib-html-2.24.0/glib-Lexical-Scanner.html";

		HtmlStrip strip = new HtmlStrip();
		string stripped = strip.stripFile(filename);

		writefln("Stripped html:\n%s", stripped);

		return 0;
	}

}