1 /**
2 * Copyright © DiamondMVC 2019
3 * License: MIT (https://github.com/DiamondMVC/Diamond/blob/master/LICENSE)
4 * Author: Jacob Jensen (bausshf)
5 */
6 module diamond.markdown.parser;
7 
import std.algorithm : canFind, startsWith, count, map, filter;
import std.array : replace, split, array;
import std.conv : to;
import std.string : format, strip, indexOf, isNumeric, toLower;
12 
13 import diamond.markdown.type;
14 import diamond.markdown.part;
15 
/**
* Parses markdown to html.
*
* Values placed inside html attributes (link/image url and title, image alt
* text, header id and the code language class) have their double quotes
* escaped so they cannot terminate the attribute they are written into.
* Text content is emitted as-is.
*
* Params:
*   markdown =     The markdown to parse.
*   customParser = A custom parser that can be used to customize certain markdown parts. (If it returns null, then it will skip to the default parser.)
* Returns:
*   A string equivalent to the parsed html from the markdown.
*/
string parseToHtml(string markdown, string delegate(MarkdownPart) customParser = null)
{
  // Escapes double quotes so a value cannot break out of a quoted attribute.
  // '&' is intentionally left alone to avoid double-encoding existing entities.
  static string escapeAttribute(string value)
  {
    return value.replace("\"", "&quot;");
  }

  // Maps the name of an inline wrap to its opening or closing tag.
  // Unknown names produce no markup.
  static string wrapTag(string name, bool closing)
  {
    switch (name)
    {
      case "bold": return closing ? "</strong>" : "<strong>";
      case "italic": return closing ? "</em>" : "<em>";
      case "underline": return closing ? "</span>" : "<span style=\"text-decoration: underline\">";
      case "strike": return closing ? "</del>" : "<del>";
      case "inlineCode": return closing ? "</code>" : "<code>";
      default: return "";
    }
  }

  string result;

  // Column alignments collected from the table head, and the column of the current body cell.
  string[] cellAlignment;
  size_t cellIndex;

  foreach (part; parse(markdown))
  {
    // Alignment of the current body cell (empty when there is none).
    string cellAlign;

    // The table state is tracked before the custom parser runs, so that it stays
    // correct even when the custom parser renders some of the table parts itself.
    switch (part.type)
    {
      case MarkdownType.tableStart:
      {
        cellAlignment = null;
        break;
      }

      case MarkdownType.tableRowStart:
      {
        cellIndex = 0;
        break;
      }

      case MarkdownType.tableHeadStart:
      {
        auto alignment = part.getMetadata("align");

        cellAlignment ~= alignment ? alignment : "";
        cellIndex = 0;
        break;
      }

      case MarkdownType.tableCellStart:
      {
        if (cellIndex < cellAlignment.length)
        {
          cellAlign = cellAlignment[cellIndex];
          cellIndex++;
        }
        break;
      }

      default: break;
    }

    if (customParser)
    {
      auto customResult = customParser(part);

      if (customResult)
      {
        result ~= customResult;
        continue;
      }
    }

    switch (part.type)
    {
      case MarkdownType.content:
      {
        result ~= part.content;
        break;
      }

      case MarkdownType.newline:
      {
        result ~= "<br>\r\n";
        break;
      }

      case MarkdownType.ulistStart:
      {
        result ~= "<ul>\r\n";
        break;
      }

      case MarkdownType.ulistEnd:
      {
        result ~= "</ul>\r\n";
        break;
      }

      case MarkdownType.olistStart:
      {
        result ~= "<ol>\r\n";
        break;
      }

      case MarkdownType.olistEnd:
      {
        result ~= "</ol>\r\n";
        break;
      }

      case MarkdownType.listItemStart:
      {
        result ~= "<li>\r\n";
        break;
      }

      case MarkdownType.listItemEnd:
      {
        result ~= "</li>\r\n";
        break;
      }

      case MarkdownType.link:
      {
        auto title = part.getMetadata("title");

        result ~= "<a href=\"%s\"%s>%s</a>".format
        (
          escapeAttribute(part.getMetadata("url")),
          title ? " title=\"" ~ escapeAttribute(title) ~ "\"" : "",
          part.content
        );
        break;
      }

      case MarkdownType.image:
      {
        auto title = part.getMetadata("title");

        result ~= "<img src=\"%s\" alt=\"%s\"%s>".format
        (
          escapeAttribute(part.getMetadata("url")),
          escapeAttribute(part.content),
          title ? " title=\"" ~ escapeAttribute(title) ~ "\"" : ""
        );
        break;
      }

      case MarkdownType.blockQuoteStart:
      {
        result ~= "<blockquote>\r\n";
        break;
      }

      case MarkdownType.blockQuoteEnd:
      {
        result ~= "</blockquote>\r\n";
        break;
      }

      case MarkdownType.horizontal:
      {
        result ~= "<hr>\r\n";
        break;
      }

      case MarkdownType.tableStart:
      {
        result ~= "<table>\r\n";
        break;
      }

      case MarkdownType.tableEnd:
      {
        result ~= "</table>\r\n";
        break;
      }

      case MarkdownType.tableRowStart:
      {
        result ~= "<tr>\r\n";
        break;
      }

      case MarkdownType.tableRowEnd:
      {
        result ~= "\r\n</tr>\r\n";
        break;
      }

      case MarkdownType.tableHeadStart:
      {
        result ~= "<th>\r\n";
        break;
      }

      case MarkdownType.tableHeadEnd:
      {
        result ~= "</th>\r\n";
        break;
      }

      case MarkdownType.tableCellStart:
      {
        if (cellAlign.length)
        {
          result ~= "<td align=\"%s\">\r\n".format(escapeAttribute(cellAlign));
        }
        else
        {
          result ~= "<td>\r\n";
        }
        break;
      }

      case MarkdownType.tableCellEnd:
      {
        result ~= "</td>\r\n";
        break;
      }

      case MarkdownType.codeStart:
      {
        auto language = part.content;

        // Check the length, so an empty language does not produce a dangling "highlight-source-" class.
        if (language.length)
        {
          result ~= "<pre class=\"highlight highlight-source-%s\"><code>".format(escapeAttribute(language));
        }
        else
        {
          result ~= "<pre><code>";
        }
        break;
      }

      case MarkdownType.codeEnd:
      {
        result ~= "</code></pre>\r\n";
        break;
      }

      case MarkdownType.contentWrapStart:
      {
        result ~= wrapTag(part.content, false);
        break;
      }

      case MarkdownType.contentWrapEnd:
      {
        result ~= wrapTag(part.content, true);
        break;
      }

      case MarkdownType.header:
      {
        // The id is the lower-cased header text with spaces replaced by dashes.
        auto id = part.content.replace(" ", "-").toLower();

        result ~= "<h%d id=\"%s\">%s</h%d>\r\n".format(part.volume, escapeAttribute(id), part.content, part.volume);
        break;
      }

      default: break;
    }
  }

  return result ? result : "";
}
319 
/**
* Parses markdown into parts.
*
* The markdown is processed line by line. Each line is matched, in order,
* against: block-quote, table, horizontal rule, header ("# text"),
* alternative header (text followed by "======" or "------"), code fence,
* unordered list, ordered list, link, image and finally plain content.
* Lines that are empty or contain only whitespace close all open lists,
* block-quotes and tables. Inline formatting state (bold, italic, ...) is
* kept across lines.
*
* Note: list nesting compares the number of open lists with the column of
* the list marker, so nesting is only tracked reliably when each level is
* indented by a single character.
*
* Params:
*   markdown = The markdown to parse.
* Returns:
*   An array of markdown parts.
*/
MarkdownPart[] parse(string markdown)
{
  const tab = cast(char)0x9;

  MarkdownPart[] parts;

  auto lines = markdown.replace("\r", "").split("\n");

  // Inline formatting state.
  bool bold = false;
  bool italic = false;
  bool underline = false;
  bool strike = false;
  bool inlineCode = false;

  // Number of currently open unordered / ordered lists.
  size_t ulist = 0;
  size_t olist = 0;

  bool code = false;

  bool quote = false;

  bool table = false;

  // Closes every list, block-quote and table that is still open.
  void closeOpenBlocks()
  {
    while (ulist)
    {
      parts ~= new MarkdownPart(MarkdownType.ulistEnd);
      ulist--;
    }

    while (olist)
    {
      parts ~= new MarkdownPart(MarkdownType.olistEnd);
      olist--;
    }

    if (quote)
    {
      parts ~= new MarkdownPart(MarkdownType.blockQuoteEnd);
      quote = false;
    }

    if (table)
    {
      parts ~= new MarkdownPart(MarkdownType.tableEnd);
      table = false;
    }
  }

  // Splits inline content into text parts and wrap start/end markers.
  void parseContent(string content)
  {
    const backSlash = cast(char)0x5c;

    MarkdownPart currentPart;

    // Flushes the pending text, flips the given wrap state and emits the matching marker.
    void toggleWrap(ref bool active, string name)
    {
      if (currentPart)
      {
        parts ~= currentPart;
        currentPart = null;
      }

      active = !active;

      auto wrap = new MarkdownPart(active ? MarkdownType.contentWrapStart : MarkdownType.contentWrapEnd);
      wrap.content = name;

      parts ~= wrap;
    }

    foreach (ref j; 0 .. content.length)
    {
      auto lastChar = j > 0 ? content[j - 1] : cast(char)0;
      auto currentChar = content[j];
      auto nextChar = j < (content.length - 1) ? content[j + 1] : cast(char)0;

      // A marker preceded by a backslash is literal text.
      const escaped = lastChar == backSlash;
      // A wrap may only be opened at the start of the content or after whitespace;
      // an already open wrap may be closed anywhere.
      const canOpen = lastChar == ' ' || lastChar == tab || j == 0;

      if (currentChar == '*' && nextChar == '*' && !escaped && (bold || canOpen))
      {
        toggleWrap(bold, "bold");

        j++; // Skip the second '*'.
      }
      else if (currentChar == '*' && !escaped && (italic || canOpen))
      {
        toggleWrap(italic, "italic");
      }
      else if (currentChar == '_' && !escaped && (underline || canOpen))
      {
        toggleWrap(underline, "underline");
      }
      else if (currentChar == '~' && !escaped && (strike || canOpen))
      {
        toggleWrap(strike, "strike");
      }
      else if (currentChar == '`' && !escaped && (inlineCode || canOpen))
      {
        toggleWrap(inlineCode, "inlineCode");
      }
      else if (currentChar != backSlash || lastChar == backSlash)
      {
        // A single backslash is dropped (it only escapes the next character);
        // a backslash following another backslash is kept.
        if (currentPart)
        {
          currentPart.content = currentPart.content ~ to!string(currentChar);
        }
        else
        {
          currentPart = new MarkdownPart(MarkdownType.content);
          currentPart.content = to!string(currentChar);
        }
      }
    }

    if (currentPart)
    {
      parts ~= currentPart;
    }
  }

  // Emits an unordered list item for a line whose marker is `ulistChar`.
  void parseUList(string listLine, char ulistChar)
  {
    size_t indentation = listLine.indexOf(ulistChar) + 1;

    if (!ulist || ulist < indentation)
    {
      ulist++;

      parts ~= new MarkdownPart(MarkdownType.ulistStart);
    }
    else if (indentation < ulist)
    {
      parts ~= new MarkdownPart(MarkdownType.ulistEnd);
      ulist--;
    }

    parts ~= new MarkdownPart(MarkdownType.listItemStart);

    parseContent(listLine[listLine.indexOf(ulistChar) + 1 .. $].strip());

    parts ~= new MarkdownPart(MarkdownType.listItemEnd);
  }

  // Parses a table separator row such as "|:---|:---:|---:|".
  // Returns false when the row is not a valid separator; otherwise fills
  // `alignments` with "left", "right", "center" or "" for each column.
  bool parseTableSeparator(string separatorLine, out string[] alignments)
  {
    auto cells = separatorLine.strip().split("|").filter!(e => e.length > 0).array;

    if (!cells.length)
    {
      return false;
    }

    foreach (cell; cells.map!(e => e.strip()))
    {
      auto dashes = cell.replace(":", "").strip();

      // Apart from the colons, a cell must consist of at least three dashes and nothing else.
      if (dashes.length < 3 || cell.count('-') != dashes.length)
      {
        return false;
      }

      const leftColon = cell[0] == ':';
      const rightColon = cell[$ - 1] == ':';

      if (leftColon && rightColon)
      {
        alignments ~= "center";
      }
      else if (leftColon)
      {
        alignments ~= "left";
      }
      else if (rightColon)
      {
        alignments ~= "right";
      }
      else
      {
        alignments ~= "";
      }
    }

    return true;
  }

  // True when the text consists of three or more repetitions of `c` and nothing else.
  static bool isRuleOf(string text, char c)
  {
    return text.length >= 3 && text.count(c) == text.length;
  }

  foreach (ref i; 0 .. lines.length)
  {
    auto line = lines[i];
    auto nextLine = i < (lines.length - 1) ? lines[i + 1] : null;

    if (!line)
    {
      continue;
    }

    if (code)
    {
      if (line.strip() == "```")
      {
        parts ~= new MarkdownPart(MarkdownType.codeEnd);

        code = false;
      }
      else
      {
        auto part = new MarkdownPart(MarkdownType.content);
        // '&' must be escaped first, otherwise the entities produced below would be escaped again.
        part.content = line.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") ~ "\r\n";

        parts ~= part;
      }

      continue;
    }

    auto stripped = line.strip();
    auto nextStripped = nextLine.strip();

    // Whitespace-only lines are treated like empty lines. This also guarantees
    // that `stripped` is never empty in the checks below.
    if (!stripped.length)
    {
      closeOpenBlocks();

      parts ~= new MarkdownPart(MarkdownType.newline);
      continue;
    }

    // A table starts only when the next line is a valid separator row with the same
    // amount of pipes. Otherwise the line falls through to the remaining checks.
    string[] tableAlignments;
    const startsTable =
      !table &&
      line.canFind('|') &&
      nextLine.length > 0 &&
      nextLine.count('|') == line.count('|') &&
      parseTableSeparator(nextLine, tableAlignments);

    // A header is one or more '#' followed by whitespace and some text.
    size_t hashes = 0;

    while (hashes < line.length && line[hashes] == '#')
    {
      hashes++;
    }

    const isHeader =
      hashes > 0 &&
      hashes < line.length &&
      (line[hashes] == ' ' || line[hashes] == tab) &&
      line[hashes .. $].strip().length > 0;

    // Block-quote
    if (line[0] == '>')
    {
      if (!quote)
      {
        parts ~= new MarkdownPart(MarkdownType.blockQuoteStart);
        quote = true;
      }

      parseContent(line[1 .. $].strip());
    }
    // Table
    else if (line.canFind('|') && (table || startsTable))
    {
      auto entries = stripped.split("|").filter!(e => e.length > 0).array;

      if (!table)
      {
        i++; // Skip the separator row.

        parts ~= new MarkdownPart(MarkdownType.tableStart);
        parts ~= new MarkdownPart(MarkdownType.tableRowStart);

        size_t alignmentCounter;
        foreach (entry; entries.map!(e => e.strip()))
        {
          auto head = new MarkdownPart(MarkdownType.tableHeadStart);

          if (alignmentCounter < tableAlignments.length)
          {
            head.setMetadata("align", tableAlignments[alignmentCounter]);
          }

          alignmentCounter++;

          parts ~= head;

          parseContent(entry);

          parts ~= new MarkdownPart(MarkdownType.tableHeadEnd);
        }

        table = true;

        parts ~= new MarkdownPart(MarkdownType.tableRowEnd);
      }
      else
      {
        parts ~= new MarkdownPart(MarkdownType.tableRowStart);

        foreach (entry; entries)
        {
          parts ~= new MarkdownPart(MarkdownType.tableCellStart);

          parseContent(entry.strip());

          parts ~= new MarkdownPart(MarkdownType.tableCellEnd);
        }

        parts ~= new MarkdownPart(MarkdownType.tableRowEnd);
      }

      continue; // Don't want new-lines in tables ...
    }
    // Horizontal rule
    else if (isRuleOf(stripped, '-') || isRuleOf(stripped, '*') || isRuleOf(stripped, '_'))
    {
      parts ~= new MarkdownPart(MarkdownType.horizontal);
      continue; // hr shouldn't have a new line
    }
    // Header
    else if (isHeader)
    {
      // The level is the amount of leading '#', capped at 6.
      ptrdiff_t headerLevel = hashes > 6 ? 6 : hashes;

      auto part = new MarkdownPart(MarkdownType.header);
      part.content = line[hashes .. $].strip();
      part.volume = headerLevel;

      parts ~= part;

      continue; // Headers shouldn't have a new line
    }
    // Header alt
    else if (nextStripped == "======" || nextStripped == "------")
    {
      auto part = new MarkdownPart(MarkdownType.header);
      part.content = stripped;
      part.volume = nextStripped == "======" ? 1 : 2;

      parts ~= part;

      i++; // Skip the underline.

      continue; // Headers shouldn't have a new line
    }
    // Code fence
    else if (stripped.startsWith("```"))
    {
      auto part = new MarkdownPart(MarkdownType.codeStart);

      // The language is whatever follows the fence, without surrounding whitespace.
      auto language = stripped[3 .. $].strip();

      if (language.length)
      {
        part.content = language;
      }

      parts ~= part;
      code = true;
      continue; // Code shouldn't have a new line
    }
    // unordered list
    else if (stripped.length > 2 && (stripped[0] == '*' || stripped[0] == '+' || stripped[0] == '-') && stripped[1] == ' ')
    {
      parseUList(line, stripped[0]);
      continue; // Don't want <br> after </li>
    }
    // ordered list
    else if (stripped.length > 3 && stripped.indexOf('.') > 0 && stripped[0 .. stripped.indexOf('.')].isNumeric)
    {
      // One plus the amount of leading spaces/tabs.
      size_t indentation = 1;

      foreach (c; line)
      {
        if (c != ' ' && c != tab)
        {
          break;
        }

        indentation++;
      }

      if (!olist || olist < indentation)
      {
        olist++;

        parts ~= new MarkdownPart(MarkdownType.olistStart);
      }
      else if (indentation < olist)
      {
        parts ~= new MarkdownPart(MarkdownType.olistEnd);
        olist--;
      }

      parts ~= new MarkdownPart(MarkdownType.listItemStart);

      parseContent(line[line.indexOf('.') + 1 .. $].strip());

      parts ~= new MarkdownPart(MarkdownType.listItemEnd);
      continue; // Don't want <br> after </li>
    }
    // link
    else if (stripped[0] == '[' && line.canFind(']') && line.canFind('(') && stripped[$ - 1] == ')')
    {
      auto text = stripped[1 .. stripped.indexOf(']')];
      auto href = stripped[stripped.indexOf('(') + 1 .. $ - 1];

      // Everything up to the first space is the url, the rest is the title.
      auto firstHrefSpace = href.indexOf(' ');
      auto url = href[0 .. firstHrefSpace == -1 ? href.length : firstHrefSpace];
      string title;

      if (firstHrefSpace > 0)
      {
        title = href[firstHrefSpace + 1 .. $].strip();

        // Remove the quotes surrounding the title, so they don't become a part of it.
        if (title.length >= 2 && title[0] == '"' && title[$ - 1] == '"')
        {
          title = title[1 .. $ - 1];
        }
      }

      auto part = new MarkdownPart(MarkdownType.link);
      part.content = text;
      part.setMetadata("url", url.strip());

      if (title.length)
      {
        part.setMetadata("title", title.strip());
      }

      parts ~= part;
      continue; // We don't want <br> after <a></a>
    }
    // image
    else if (stripped.length > 1 && stripped[0] == '!' && stripped[1] == '[' && line.canFind(']') && line.canFind('(') && stripped[$ - 1] == ')')
    {
      auto text = stripped[2 .. stripped.indexOf(']')];
      auto href = stripped[stripped.indexOf('(') + 1 .. $ - 1];

      const stringTerminator = cast(char)("\""[0]);

      // Everything up to the first quote is the url, the quoted text is the title.
      auto stringTerminatorIndex = href.indexOf(stringTerminator);

      auto url = href[0 .. stringTerminatorIndex == -1 ? href.length : stringTerminatorIndex];
      string title;

      // The title needs both an opening and a (distinct) closing quote.
      if (stringTerminatorIndex > 0 && href[$ - 1] == stringTerminator && stringTerminatorIndex < href.length - 1)
      {
        title = href[stringTerminatorIndex + 1 .. $ - 1];
      }

      auto part = new MarkdownPart(MarkdownType.image);
      part.content = text;
      part.setMetadata("url", url.strip());

      if (title.length)
      {
        part.setMetadata("title", title.strip());
      }

      parts ~= part;
      continue; // We don't want <br> after <img>
    }
    // Content
    else
    {
      parseContent(stripped);
    }

    parts ~= new MarkdownPart(MarkdownType.newline);
  }

  // Close whatever is still open at the end of the input.
  closeOpenBlocks();

  return parts ? parts : [];
}