diamond.dom.domparser source code

1 /**
2 * Copyright © DiamondMVC 2019
3 * License: MIT (https://github.com/DiamondMVC/Diamond/blob/master/LICENSE)
4 * Author: Jacob Jensen (bausshf)
5 */
6 module diamond.dom.domparser;
7 
8 import std.uni : isWhite;
9 import std.string : format, strip, toLower, indexOf;
10 import std.algorithm : canFind;
11 import std.conv : to;
12 
13 import diamond.dom.domdocument;
14 import diamond.dom.domnode;
15 import diamond.dom.domattribute;
16 import diamond.dom.domexception;
17 import diamond.dom.domparsersettings;
18 import diamond.errors.checks;
19 
/**
* Builds a dom document from its textual representation.
* Params:
*   dom =            The dom text that should be parsed.
*   parserSettings = The parsing settings; a null value is rejected.
* Returns:
*   A new document of type TDocument that has been handed the parsed nodes.
*/
TDocument parseDom(TDocument : DomDocument)(string dom, DomParserSettings parserSettings) @safe
{
  enforce(parserSettings !is null, "Missing parsing settings.");

  // The document is created before the text is parsed, then receives the parsed nodes.
  auto document = new TDocument(parserSettings);
  document.parseElements(parseDomElements(dom, parserSettings));

  return document;
}
40 
/**
* Checks whether a piece of dom text contains a given sub string.
* Params:
*   dom =    The text to search in.
*   toFind = The sub string to search for.
* Returns:
*   True if dom is non-empty and contains toFind, false otherwise.
*/
private bool findAhead(string dom, string toFind) @safe
{
  // Nothing can be found in a null or empty text.
  if (dom is null || dom.length == 0)
  {
    return false;
  }

  return dom.canFind(toFind);
}
45 
/**
* Parses a dom string into an array of dom nodes.
*
* The text is scanned character by character in a single pass. The parsing
* state (the node currently being built, the pending attribute, the pending
* text and the comment/header flags) is kept in local variables.
*
* Params:
*   dom =            The dom string to parse.
*   parserSettings = The settings used for parsing. Must not be null.
* Returns:
*   An array of the parsed dom nodes. Null if the string is null or empty,
*   is shorter than two characters after stripping, or neither starts
*   with '<' nor ends with '>'.
* Throws:
*   DomException if strict parsing is enabled and a character that cannot be
*   attributed to any node is encountered.
*/
package(diamond.dom) DomNode[] parseDomElements(string dom, DomParserSettings parserSettings) @safe
{
  enforce(parserSettings !is null, "Missing parsing settings.");

  if (!dom || !dom.length)
  {
    return null;
  }

  dom = dom.strip();

  if (dom.length < 2)
  {
    return null;
  }

  // NOTE(review): the text is only rejected when it BOTH does not start with '<'
  // AND does not end with '>'. Text satisfying just one of the two is still parsed.
  // Left unchanged because callers may rely on it - confirm whether '||' was intended.
  if (dom[0] != '<' && dom[$-1] != '>')
  {
    return null;
  }

  DomNode[] elements;              // Finished nodes that have no parent.
  DomNode currentNode;             // The node currently being built.
  bool isHeader;                   // True while inside a "<?" or "<!" tag.
  bool evaluated;                  // True once the opening tag of the current node has been read.
  string attributeName;            // Name of the attribute being read.
  string attributeValue;           // Value of the attribute being read.
  DomAttribute attribute;          // Non-null between the '=' and the end of an attribute value.
  string text;                     // Text collected between tags.
  bool comment;                    // True while inside "<!-- ... -->".
  char headerChar;                 // The '?' or '!' that started the current header tag.
  char attributeStringChar = '\0'; // Quote character of the attribute string being read.

  // Attaches the current node to its parent (or to the result when it has no parent)
  // and makes the parent the current node again.
  void finalizeNode() @trusted
  {
    if (!currentNode)
    {
      return;
    }

    if (currentNode.parent)
    {
      currentNode.parent.addChild(currentNode);
    }
    else
    {
      elements ~= currentNode;
    }

    currentNode = currentNode.parent;
  }

  // "ref i" makes the loop counter writable: the branches below move it forward
  // (or one step back) and the loop still increments it after every iteration.
  foreach (ref i; 0 .. dom.length)
  {
    char last = i > 0 ? dom[i - 1] : '\0';
    char current = dom[i];
    char next =  i < (dom.length - 1) ? dom[i + 1] : '\0';

    // Ignore control characters, except for the codes 8 to 13.
    if (current < 32 && (current < 8 || current > 13))
    {
      continue;
    }

    // Inside a comment everything is dropped until "-->" is found.
    if (comment)
    {
      if (current == '-' && next == '-' && i < (dom.length - 2))
      {
        auto afterNext = dom[i + 2];

        if (afterNext == '>')
        {
          comment = false;
          i += 2; // Move to '>'; the loop increment steps past it.
        }
      }

      continue;
    }

    // Content of a flexible tag is taken verbatim up to the matching closing tag.
    if (currentNode && evaluated && parserSettings.isFlexibleTag(currentNode.name))
    {
      string content = "";
      bool inString;
      char stringChar;

      auto j = i;

      while (j < (dom.length - 1))
      {
        last = j > 0 ? dom[j - 1] : '\0';
        current = dom[j];
        next =  j < (dom.length - 1) ? dom[j + 1] : '\0';

        // Track quoted strings so that a "</" inside a string does not end the content.
        // A line break also ends a string.
        if ((current == '\"' || current == '\'') && !inString)
        {
          stringChar = current;
          inString = true;
        }
        else if ((current == stringChar || current == '\r' || current == '\n') && inString)
        {
          inString = false;
        }

        if (current == '<' && next == '/' && !inString)
        {
          auto endIndex = dom[j .. $].indexOf('>');

          // fromLen is the first character after "</", toLen is the index of the '>'.
          auto fromLen = j + 2;
          auto toLen = fromLen + (endIndex - 2);

          if (endIndex >= 0 && dom[fromLen .. (toLen > $ ? $ : toLen)].toLower() == currentNode.name)
          {
            j = toLen;
            break;
          }
        }

        content ~= current;
        j++;
      }

      // j is the index of the closing tag's '>' (or the last index when no closing tag
      // was found). The loop increment moves on to the character right after it.
      // Assigning j + 1 here would skip that character, e.g. the '<' of a tag that
      // directly follows the closing tag.
      i = j;

      currentNode.rawText = content;

      finalizeNode();
      continue;
    }

    if (!current || current == '\r' || (current == '\n' && !evaluated))
    {
      continue;
    }

    if (current == '<')
    {
      // "<!--" starts a comment.
      if (next == '!' && i < (dom.length - 3))
      {
        auto afterNext = dom[i + 2];
        auto nextAfterNext = dom[i + 3];

        if (afterNext == '-' && nextAfterNext == '-')
        {
          comment = true;
          i += 3;
          continue;
        }
      }

      // Flush pending non-blank text: it is stored on the current node and also
      // added as a separate text node, after which the current node is restored
      // by finalizeNode().
      if (currentNode && text && text.strip().length)
      {
        currentNode.rawText = text;

        currentNode = new DomNode(currentNode);
        currentNode.isTextNode = true;
        currentNode.rawText = text;
        currentNode.parserSettings = parserSettings;

        finalizeNode();

        text = null;
      }

      if (currentNode && next == '/')
      {
        // Closing tag: skip ahead to its '>' and finish the current node.
        while (current != '>' && i < (dom.length - 1))
        {
          i++;

          if (i < (dom.length - 1))
          {
            last = i > 0 ? dom[i - 1] : '\0';
            current = dom[i];
            next =  i < (dom.length - 1) ? dom[i + 1] : '\0';
          }
        }

        finalizeNode();
      }
      else
      {
        // Opening tag; "<?" and "<!" start a header tag.
        if (next == '?' || next == '!')
        {
          isHeader = true;
          headerChar = next;
          i++;
        }

        currentNode = new DomNode(currentNode);
        currentNode.parserSettings = parserSettings;
        evaluated = false;
      }
    }
    else if (currentNode && (current == '?' || current == '!') && isHeader)
    {
      continue;
    }
    else if (currentNode && next == '>' && current == '/')
    {
      // "/>" closes the current node.
      i++;

      finalizeNode();
    }
    else if (current == '>')
    {
      if
      (
        currentNode &&
        parserSettings.allowSelfClosingTags &&
        (
          parserSettings.isSelfClosingTag(currentNode.name) ||
          (
            !parserSettings.isStandardTag(currentNode.name) &&
            !findAhead(dom[i .. $], "/" ~ currentNode.name)
          )
        )
      )
      {
        // Self-closing tag, or a non-standard tag for which no "/name" follows.
        finalizeNode();
      }
      else if (currentNode && last == '/')
      {
        finalizeNode();
      }
      else if (currentNode && isHeader && headerChar == '!')
      {
        // End of a "<! ... >" header; it is added to the result directly.
        headerChar = '\0';
        elements ~= currentNode;
        isHeader = false;
        currentNode = null;
      }
      else if (currentNode && isHeader && last == '?')
      {
        // End of a "<? ... ?>" header; it is added to the result directly.
        elements ~= currentNode;
        isHeader = false;
        currentNode = null;
      }
      else if (currentNode)
      {
        // End of a regular opening tag; what follows is the node's content.
        evaluated = true;
      }
    }
    else if (currentNode && !currentNode.name)
    {
      // Read the tag name up to the first white space, '>' or '/'.
      string name;

      while (i < (dom.length - 1))
      {
        if (!current.isWhite)
        {
          name ~= current;
        }

        i++;

        if (i < (dom.length - 1))
        {
          last = i > 0 ? dom[i - 1] : '\0';
          current = dom[i];
          next =  i < (dom.length - 1) ? dom[i + 1] : '\0';
        }

        if (current.isWhite || current == '>' || current == '/')
        {
          if (current == '>')
          {
            evaluated = true;

            if
            (
              currentNode &&
              parserSettings.allowSelfClosingTags &&
              (
                parserSettings.isSelfClosingTag(name) ||
                (
                  !parserSettings.isStandardTag(name) &&
                  !findAhead(dom[i .. $], "/" ~ name)
                )
              )
            )
            {
              evaluated = true;
              i--; // Step back so that the '>' is handled by the main loop.
            }
          }

          if (current == '/')
          {
            evaluated = true;
            i--; // Step back so that the "/>" is handled by the main loop.
          }
          break;
        }
      }

      currentNode.name = name;
    }
    else if (currentNode && !evaluated)
    {
      // Inside an opening tag, after the name: attributes.

      // A quoted string that is not the value of an attribute is added as an
      // attribute whose name is the string including its quotes.
      if (!attribute && (current == '\"' || current == '\''))
      {
        attributeStringChar = current;

        auto j = i;
        DomAttribute tempAttribute;

        while (j < (dom.length - 1))
        {
          j++;

          // The string ends at the first quote that is not preceded by a backslash.
          // The check has to look at dom[j - 1]; "last" is the character in front of
          // the opening quote and does not change while scanning. As before, no end
          // is accepted when the opening quote itself is preceded by a backslash.
          if (last != '\\' && dom[j] == attributeStringChar && dom[j - 1] != '\\')
          {
            tempAttribute = new DomAttribute(dom[i .. j + 1], null);
            currentNode.addAttribute(tempAttribute);
            attributeStringChar = '\0';
            break;
          }
        }

        if (tempAttribute)
        {
          i = j;

          continue;
        }
      }

      // An attribute name followed by white space or '>' is an attribute without a value.
      if ((next.isWhite || next == '>') && !attribute && attributeName && attributeName.length)
      {
        attributeName ~= current;

        attribute = new DomAttribute(attributeName, null);

        currentNode.addAttribute(attribute);

        attribute = null;
        attributeName = null;
        attributeValue = null;
        continue;
      }

      // Remember the quote character that opens an attribute value.
      if (attributeStringChar == '\0' && (current == '\"' || current == '\'') && last != '\\')
      {
        attributeStringChar = current;
      }

      if ((current == attributeStringChar && last != '\\' && (attributeValue || last == attributeStringChar)) || (current == '=' && !attribute))
      {
        if (!attribute)
        {
          // '=' after the name: the value follows.
          attribute = new DomAttribute(attributeName, null);
        }
        else
        {
          // Closing quote: the attribute is complete.
          attribute.value = attributeValue;

          currentNode.addAttribute(attribute);

          attributeStringChar = '\0';
          attribute = null;
          attributeName = null;
          attributeValue = null;
        }
      }
      else if (!attribute)
      {
        attributeName ~= current;
      }
      else if (((current == attributeStringChar && last != '=' && last != '\\') || current != attributeStringChar))
      {
        attributeValue ~= current;
      }
    }
    else if (currentNode && evaluated)
    {
      // Content of the current node.
      text ~= current;
    }
    else if (parserSettings.strictParsing)
    {
      throw new DomException("Encountered unexpected character: '%s' at index: '%d'.".format(current, i));
    }
  }

  // A node that is still open at the end of the text is added to the result.
  if (currentNode)
  {
    elements ~= currentNode;
  }

  return elements;
}