/**
* Copyright © DiamondMVC 2019
* License: MIT (https://github.com/DiamondMVC/Diamond/blob/master/LICENSE)
* Author: Jacob Jensen (bausshf)
*/
module diamond.dom.domparser;

import std.uni : isWhite;
import std.string : format, strip, toLower, indexOf;
import std.algorithm : canFind;
import std.conv : to;

import diamond.dom.domdocument;
import diamond.dom.domnode;
import diamond.dom.domattribute;
import diamond.dom.domexception;
import diamond.dom.domparsersettings;
import diamond.errors.checks;

/**
* Parses a string of dom into a dom document.
* The string is first turned into dom nodes by parseDomElements,
* which are then handed to the document's parseElements().
* Params:
*   dom =            The dom string to parse.
*   parserSettings = The settings used for parsing. Must not be null.
* Returns:
*   The parsed dom document.
*/
TDocument parseDom(TDocument : DomDocument)(string dom, DomParserSettings parserSettings) @safe
{
  enforce(parserSettings !is null, "Missing parsing settings.");

  auto doc = new TDocument(parserSettings);

  auto elements = parseDomElements(dom, parserSettings);

  doc.parseElements(elements);

  return doc;
}

/**
* Checks whether a piece of text occurs anywhere in a dom string.
* Params:
*   dom =    The dom string to search in.
*   toFind = The text to look for.
* Returns:
*   True if dom is non-empty and contains toFind, false otherwise.
*/
private bool findAhead(string dom, string toFind) @safe
{
  // An explicit length check is used instead of testing the array itself,
  // since an array in boolean context only tests its pointer.
  return dom.length > 0 && dom.canFind(toFind);
}

/**
* Parses a dom string into an array of dom nodes.
* Params:
*   dom =            The dom string to parse.
*   parserSettings = The settings used for parsing.
* Returns:
*   An array of the parsed dom nodes. Null if the string is not dom.
*
* Throws:
*   DomException if strict parsing is enabled in the parser settings
*   and a character is encountered outside of any node.
*/
package(diamond.dom) DomNode[] parseDomElements(string dom, DomParserSettings parserSettings) @safe
{
  enforce(parserSettings !is null, "Missing parsing settings.");

  if (!dom || !dom.length)
  {
    return null;
  }

  dom = dom.strip();

  if (dom.length < 2)
  {
    return null;
  }

  // The input is only rejected when it neither starts with '<' nor ends with '>'.
  // NOTE(review): this is lenient on purpose or by accident - confirm before tightening to '||'.
  if (dom[0] != '<' && dom[$-1] != '>')
  {
    return null;
  }

  DomNode[] elements;           // Finished top-level nodes.
  DomNode currentNode;          // The node currently being parsed.
  bool isHeader;                // True while inside a "<?" or "<!" tag.
  bool evaluated;               // True once the opening tag of the current node has been consumed.
  string attributeName;
  string attributeValue;
  DomAttribute attribute;       // The attribute whose value is currently being collected.
  string text;                  // Text collected between tags.
  bool comment;                 // True while inside "<!-- ... -->".
  char headerChar;
  char attributeStringChar = '\0'; // The quote character of the attribute string being read.

  // Attaches the current node to its parent (or to the top-level result
  // when it has no parent) and makes the parent the current node again.
  void finalizeNode() @trusted
  {
    if (!currentNode)
    {
      return;
    }

    if (currentNode.parent)
    {
      currentNode.parent.addChild(currentNode);
    }
    else
    {
      elements ~= currentNode;
    }

    currentNode = currentNode.parent;
  }

  // The index is taken by ref because several branches move it forward or backward.
  // Note that "continue" still performs the loop's own increment afterwards.
  foreach (ref i; 0 .. dom.length)
  {
    char last = i > 0 ? dom[i - 1] : '\0';
    char current = dom[i];
    char next = i < (dom.length - 1) ? dom[i + 1] : '\0';

    // Skip control characters, except the ones in the range 8-13.
    if (current < 32 && (current < 8 || current > 13))
    {
      continue;
    }

    if (comment)
    {
      // Look for the terminating "-->".
      if (current == '-' && next == '-' && i < (dom.length - 2))
      {
        auto afterNext = dom[i + 2];

        if (afterNext == '>')
        {
          comment = false;
          i += 2; // Lands on '>', the loop increment moves past it.
        }
      }

      continue;
    }

    // The content of flexible tags is taken as raw text up to the matching closing tag.
    if (currentNode && evaluated && parserSettings.isFlexibleTag(currentNode.name))
    {
      string content = "";
      bool inString;
      char stringChar;

      auto j = i;

      while (j < (dom.length - 1))
      {
        last = j > 0 ? dom[j - 1] : '\0';
        current = dom[j];
        next = j < (dom.length - 1) ? dom[j + 1] : '\0';

        // Track quoted strings so a "</" inside a string does not end the content.
        // A line break also ends a string.
        if ((current == '\"' || current == '\'') && !inString)
        {
          stringChar = current;
          inString = true;
        }
        else if ((current == stringChar || current == '\r' || current == '\n') && inString)
        {
          inString = false;
        }

        if (current == '<' && next == '/' && !inString)
        {
          auto endIndex = dom[j .. $].indexOf('>');

          // fromLen is the first character of the closing tag's name,
          // toLen is the index of the closing tag's '>'.
          auto fromLen = j + 2;
          auto toLen = fromLen + (endIndex - 2);

          if (endIndex >= 0 && dom[fromLen .. (toLen > $ ? $ : toLen)].toLower() == currentNode.name)
          {
            j = toLen;
            break;
          }
        }

        content ~= current;
        j++;
      }

      // j is the index of the closing tag's '>' (or the last index when no closing
      // tag was found). The loop increment performed by "continue" moves past it.
      // Assigning j + 1 here would skip the character right after the closing tag.
      i = j;

      currentNode.rawText = content;

      finalizeNode();
      continue;
    }

    // Line feeds are only ignored while still inside an opening tag.
    if (!current || current == '\r' || (current == '\n' && !evaluated))
    {
      continue;
    }

    if (current == '<')
    {
      // Start of a comment: "<!--".
      if (next == '!' && i < (dom.length - 3))
      {
        auto afterNext = dom[i + 2];
        auto nextAfterNext = dom[i + 3];

        if (afterNext == '-' && nextAfterNext == '-')
        {
          comment = true;
          i += 3;
          continue;
        }
      }

      // Any non-blank text collected so far is stored on the current node
      // and also added to it as a separate text node.
      if (currentNode && text && text.strip().length)
      {
        currentNode.rawText = text;

        currentNode = new DomNode(currentNode);
        currentNode.isTextNode = true;
        currentNode.rawText = text;
        currentNode.parserSettings = parserSettings;

        finalizeNode();

        text = null;
      }

      if (currentNode && next == '/')
      {
        // Closing tag: skip forward to its '>' and finish the current node.
        while (current != '>' && i < (dom.length - 1))
        {
          i++;

          if (i < (dom.length - 1))
          {
            last = i > 0 ? dom[i - 1] : '\0';
            current = dom[i];
            next = i < (dom.length - 1) ? dom[i + 1] : '\0';
          }
        }

        finalizeNode();
      }
      else
      {
        // Opening tag, where "<?" and "<!" start a header tag.
        if (next == '?' || next == '!')
        {
          isHeader = true;
          headerChar = next;
          i++;
        }

        currentNode = new DomNode(currentNode);
        currentNode.parserSettings = parserSettings;
        evaluated = false;
      }
    }
    else if (currentNode && (current == '?' || current == '!') && isHeader)
    {
      continue;
    }
    else if (currentNode && next == '>' && current == '/')
    {
      // Explicitly self-closed tag: "/>".
      i++;

      finalizeNode();
    }
    else if (current == '>')
    {
      if
      (
        currentNode &&
        parserSettings.allowSelfClosingTags &&
        (
          parserSettings.isSelfClosingTag(currentNode.name) ||
          (
            !parserSettings.isStandardTag(currentNode.name) &&
            !findAhead(dom[i .. $], "/" ~ currentNode.name)
          )
        )
      )
      {
        // Self-closing tag, or a non-standard tag for which no "/name" follows.
        finalizeNode();
      }
      else if (currentNode && last == '/')
      {
        finalizeNode();
      }
      else if (currentNode && isHeader && headerChar == '!')
      {
        headerChar = '\0';
        elements ~= currentNode;
        isHeader = false;
        currentNode = null;
      }
      else if (currentNode && isHeader && last == '?')
      {
        elements ~= currentNode;
        isHeader = false;
        currentNode = null;
      }
      else if (currentNode)
      {
        evaluated = true;
      }
    }
    else if (currentNode && !currentNode.name)
    {
      // Read the tag name up to the first whitespace, '>' or '/'.
      string name;

      while (i < (dom.length - 1))
      {
        if (!current.isWhite)
        {
          name ~= current;
        }

        i++;

        if (i < (dom.length - 1))
        {
          last = i > 0 ? dom[i - 1] : '\0';
          current = dom[i];
          next = i < (dom.length - 1) ? dom[i + 1] : '\0';
        }

        if (current.isWhite || current == '>' || current == '/')
        {
          if (current == '>')
          {
            evaluated = true;

            if
            (
              currentNode &&
              parserSettings.allowSelfClosingTags &&
              (
                parserSettings.isSelfClosingTag(name) ||
                (
                  !parserSettings.isStandardTag(name) &&
                  !findAhead(dom[i .. $], "/" ~ name)
                )
              )
            )
            {
              evaluated = true;
              i--; // Step back so the '>' is handled by the main loop.
            }
          }

          if (current == '/')
          {
            evaluated = true;
            i--; // Step back so the '/' is handled by the main loop.
          }
          break;
        }
      }

      currentNode.name = name;
    }
    else if (currentNode && !evaluated)
    {
      // Inside an opening tag after its name: attributes.
      if (!attribute && (current == '\"' || current == '\''))
      {
        // A quoted string that does not belong to a named attribute is
        // added as an attribute on its own, including its quotes.
        attributeStringChar = current;

        auto j = i;
        DomAttribute tempAttribute;

        while (j < (dom.length - 1))
        {
          j++;

          // The escape check looks at the character directly before the candidate
          // quote. j is at least i + 1 here, so j - 1 is always a valid index.
          if (dom[j] == attributeStringChar && dom[j - 1] != '\\')
          {
            tempAttribute = new DomAttribute(dom[i .. j + 1], null);
            currentNode.addAttribute(tempAttribute);
            attributeStringChar = '\0';
            break;
          }
        }

        if (tempAttribute)
        {
          i = j;

          continue;
        }
      }

      // Attribute without a value, terminated by whitespace or the end of the tag.
      if ((next.isWhite || next == '>') && !attribute && attributeName && attributeName.length)
      {
        attributeName ~= current;

        attribute = new DomAttribute(attributeName, null);

        currentNode.addAttribute(attribute);

        attribute = null;
        attributeName = null;
        attributeValue = null;
        continue;
      }

      if (attributeStringChar == '\0' && (current == '\"' || current == '\'') && last != '\\')
      {
        attributeStringChar = current;
      }

      if ((current == attributeStringChar && last != '\\' && (attributeValue || last == attributeStringChar)) || (current == '=' && !attribute))
      {
        if (!attribute)
        {
          // '=' ends the attribute name, the value follows.
          attribute = new DomAttribute(attributeName, null);
        }
        else
        {
          // Closing quote ends the attribute value.
          attribute.value = attributeValue;

          currentNode.addAttribute(attribute);

          attributeStringChar = '\0';
          attribute = null;
          attributeName = null;
          attributeValue = null;
        }
      }
      else if (!attribute)
      {
        attributeName ~= current;
      }
      else if (((current == attributeStringChar && last != '=' && last != '\\') || current != attributeStringChar))
      {
        attributeValue ~= current;
      }
    }
    else if (currentNode && evaluated)
    {
      text ~= current;
    }
    else if (parserSettings.strictParsing)
    {
      throw new DomException("Encountered unexpected character: '%s' at index: '%d'.".format(current, i));
    }
  }

  // A node that was never closed is still returned as a top-level node.
  if (currentNode)
  {
    elements ~= currentNode;
  }

  return elements;
}