aboutsummaryrefslogtreecommitdiffstats
path: root/filters/html-converters/resources/markdown.pl
diff options
context:
space:
mode:
authorGravatar Jason A. Donenfeld <Jason@zx2c4.com>2013-05-28 04:39:43 (JST)
committerGravatar Jason A. Donenfeld <Jason@zx2c4.com>2013-05-28 04:54:16 (JST)
commit8149be213f1c8f52b0dbe6c213f6073af57fa954 (patch)
treee4d0315f53022bb7335f782ad394d8e7602f1b52 /filters/html-converters/resources/markdown.pl
parentdcbc0438b2543a733858d62170f3110a89edbed6 (diff)
downloadcgit-8149be213f1c8f52b0dbe6c213f6073af57fa954.zip
cgit-8149be213f1c8f52b0dbe6c213f6073af57fa954.tar.gz
filters: import more modern scripts
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'filters/html-converters/resources/markdown.pl')
-rwxr-xr-xfilters/html-converters/resources/markdown.pl1731
1 files changed, 1731 insertions, 0 deletions
diff --git a/filters/html-converters/resources/markdown.pl b/filters/html-converters/resources/markdown.pl
new file mode 100755
index 0000000..abec173
--- /dev/null
+++ b/filters/html-converters/resources/markdown.pl
@@ -0,0 +1,1731 @@
1#!/usr/bin/perl
2
3#
4# Markdown -- A text-to-HTML conversion tool for web writers
5#
6# Copyright (c) 2004 John Gruber
7# <http://daringfireball.net/projects/markdown/>
8#
9
10
11package Markdown;
12require 5.006_000;
13use strict;
14use warnings;
15
16use Digest::MD5 qw(md5_hex);
17use vars qw($VERSION);
18$VERSION = '1.0.1';
19# Tue 14 Dec 2004
20
## NOTE: this comment used to read "Disabled; causes problems under Perl 5.6.1",
## but the two statements below are enabled in this copy:
22use utf8;
23binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
24
25
#
# Global default settings:
#
my $g_empty_element_suffix = " />";    # Change to ">" for HTML output
my $g_tab_width            = 4;


#
# Globals:
#

# Pattern matching a run of text whose [brackets] are balanced, to any
# nesting depth. See Friedl's "Mastering Regular Expressions", 2nd Ed.,
# pp. 328-331. The variable has to exist before the pattern is compiled
# because the pattern refers back to itself.
my $g_nested_brackets;
$g_nested_brackets = qr{
    (?>                                 # Atomic matching
        [^\[\]]+                        # Anything other than brackets
      |
        \[
            (??{ $g_nested_brackets })  # Recursive set of nested brackets
        \]
    )*
}x;


# Table of hash values for escaped characters: each special character
# maps to the MD5 hex digest of itself.
my %g_escape_table =
    map { ( $_ => md5_hex($_) ) } split //, '\\`*_{}[]()>#+-.!';


# Global hashes, used by various utility routines
my ( %g_urls, %g_titles, %g_html_blocks );

# Used to track when we're inside an ordered or unordered list
# (see _ProcessListItems() for details):
my $g_list_level = 0;
66
67
#### Blosxom plug-in interface ##########################################

# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
# which posts Markdown should process, using a "meta-markup: markdown"
# header. If it's set to 0 (the default), Markdown will process all
# entries.
my $g_blosxom_use_meta = 0;

# Always returns 1.
sub start {
    return 1;
}

# Runs the story body (passed by reference as the sixth argument) through
# Markdown() in place. When $g_blosxom_use_meta is set, the body is only
# converted if $meta::markup is defined and reads "markdown"
# (case-insensitive, surrounding whitespace allowed). Always returns 1.
sub story {
    my ($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;

    my $wanted = 1;
    if ($g_blosxom_use_meta) {
        $wanted = ( defined($meta::markup)
                and ($meta::markup =~ /^\s*markdown\s*$/i) );
    }

    if ($wanted) {
        $$body_ref = Markdown($$body_ref);
    }
    return 1;
}
87
88
#### Movable Type plug-in interface #####################################
#
# If the MT module can be loaded we register Markdown with Movable Type:
#   * an MT::Plugin entry (only when MT::Plugin can be loaded),
#   * a <MarkdownOptions output="..."> container tag that stashes the
#     lower-cased "output" argument as 'markdown_output',
#   * a 'markdown' text filter, and
#   * a 'markdown_with_smartypants' text filter when a 'smarty_pants'
#     global filter is present.
# Otherwise we fall through to the command-line filter interface.
#
eval { require MT };    # Test to see if we're running in MT.
unless ($@) {
    MT->import;
    require MT::Template::Context;
    MT::Template::Context->import;

    eval { require MT::Plugin };    # Test to see if we're running >= MT 3.0.
    unless ($@) {
        MT::Plugin->import;
        my $plugin = MT::Plugin->new({
            name        => "Markdown",
            description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
            doc_link    => 'http://daringfireball.net/projects/markdown/'
        });
        MT->add_plugin( $plugin );
    }

    MT::Template::Context->add_container_tag(MarkdownOptions => sub {
        my ($ctx, $args) = @_;
        my $builder = $ctx->stash('builder');
        my $tokens  = $ctx->stash('tokens');

        if (defined ($args->{'output'}) ) {
            $ctx->stash('markdown_output', lc $args->{'output'});
        }

        defined (my $str = $builder->build($ctx, $tokens) )
            or return $ctx->error($builder->errstr);
        return $str;
    });

    MT->add_text_filter('markdown' => {
        label     => 'Markdown',
        docs      => 'http://daringfireball.net/projects/markdown/',
        on_format => sub {
            my ($text, $ctx) = @_;
            my $raw = 0;
            if (defined $ctx) {
                my $output = $ctx->stash('markdown_output');
                if (defined $output && $output =~ m/^html/i) {
                    # HTML-style empty elements; the option is consumed.
                    $g_empty_element_suffix = ">";
                    $ctx->stash('markdown_output', '');
                }
                elsif (defined $output && $output eq 'raw') {
                    # Pass the text through untouched; the option is consumed.
                    $raw = 1;
                    $ctx->stash('markdown_output', '');
                }
                else {
                    $g_empty_element_suffix = " />";
                }
            }
            return $raw ? $text : Markdown($text);
        },
    });

    # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
    my $smartypants;

    {
        no warnings "once";
        $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
    }

    if ($smartypants) {
        MT->add_text_filter('markdown_with_smartypants' => {
            label     => 'Markdown With SmartyPants',
            docs      => 'http://daringfireball.net/projects/markdown/',
            on_format => sub {
                my ($text, $ctx) = @_;
                if (defined $ctx) {
                    my $output = $ctx->stash('markdown_output');
                    # Use the same prefix test as the plain 'markdown'
                    # filter, so that e.g. output="html4" selects HTML
                    # tags here as well (this used to be `eq 'html'`).
                    if (defined $output && $output =~ m/^html/i) {
                        $g_empty_element_suffix = ">";
                    }
                    else {
                        $g_empty_element_suffix = " />";
                    }
                }
                $text = Markdown($text);
                $text = $smartypants->($text, '1');
                return $text;
            },
        });
    }
}
else {
#### BBEdit/command-line text filter interface ##########################
# Needs to be hidden from MT (and Blosxom when running in static mode).

    # We're only using $blosxom::version once; tell Perl not to warn us:
    no warnings 'once';
    unless ( defined($blosxom::version) ) {
        use warnings;

        #### Check for command-line switches: #################
        # Unrecognised switches are left in @ARGV ('pass_through').
        my %cli_opts;
        use Getopt::Long;
        Getopt::Long::Configure('pass_through');
        GetOptions(\%cli_opts,
            'version',
            'shortversion',
            'html4tags',
        );
        if ($cli_opts{'version'}) {         # Version info
            print "\nThis is Markdown, version $VERSION.\n";
            print "Copyright 2004 John Gruber\n";
            print "http://daringfireball.net/projects/markdown/\n\n";
            exit 0;
        }
        if ($cli_opts{'shortversion'}) {    # Just the version number string.
            print $VERSION;
            exit 0;
        }
        if ($cli_opts{'html4tags'}) {       # Use HTML tag style instead of XHTML
            $g_empty_element_suffix = ">";
        }


        #### Process incoming text: ###########################
        my $text;
        {
            local $/;               # Slurp the whole file
            $text = <>;
        }

        # Emit the stylesheet first, then the converted document wrapped in
        # the <div class='markdown-body'> element that the stylesheet targets.
        print <<'EOT';
<style>
.markdown-body {
 font-size: 14px;
 line-height: 1.6;
 overflow: hidden;
}
.markdown-body>*:first-child {
 margin-top: 0 !important;
}
.markdown-body>*:last-child {
 margin-bottom: 0 !important;
}
.markdown-body a.absent {
 color: #c00;
}
.markdown-body a.anchor {
 display: block;
 padding-left: 30px;
 margin-left: -30px;
 cursor: pointer;
 position: absolute;
 top: 0;
 left: 0;
 bottom: 0;
}
.markdown-body h1, .markdown-body h2, .markdown-body h3, .markdown-body h4, .markdown-body h5, .markdown-body h6 {
 margin: 20px 0 10px;
 padding: 0;
 font-weight: bold;
 -webkit-font-smoothing: antialiased;
 cursor: text;
 position: relative;
}
.markdown-body h1 .mini-icon-link, .markdown-body h2 .mini-icon-link, .markdown-body h3 .mini-icon-link, .markdown-body h4 .mini-icon-link, .markdown-body h5 .mini-icon-link, .markdown-body h6 .mini-icon-link {
 display: none;
 color: #000;
}
.markdown-body h1:hover a.anchor, .markdown-body h2:hover a.anchor, .markdown-body h3:hover a.anchor, .markdown-body h4:hover a.anchor, .markdown-body h5:hover a.anchor, .markdown-body h6:hover a.anchor {
 text-decoration: none;
 line-height: 1;
 padding-left: 0;
 margin-left: -22px;
 top: 15%}
.markdown-body h1:hover a.anchor .mini-icon-link, .markdown-body h2:hover a.anchor .mini-icon-link, .markdown-body h3:hover a.anchor .mini-icon-link, .markdown-body h4:hover a.anchor .mini-icon-link, .markdown-body h5:hover a.anchor .mini-icon-link, .markdown-body h6:hover a.anchor .mini-icon-link {
 display: inline-block;
}
.markdown-body h1 tt, .markdown-body h1 code, .markdown-body h2 tt, .markdown-body h2 code, .markdown-body h3 tt, .markdown-body h3 code, .markdown-body h4 tt, .markdown-body h4 code, .markdown-body h5 tt, .markdown-body h5 code, .markdown-body h6 tt, .markdown-body h6 code {
 font-size: inherit;
}
.markdown-body h1 {
 font-size: 28px;
 color: #000;
}
.markdown-body h2 {
 font-size: 24px;
 border-bottom: 1px solid #ccc;
 color: #000;
}
.markdown-body h3 {
 font-size: 18px;
}
.markdown-body h4 {
 font-size: 16px;
}
.markdown-body h5 {
 font-size: 14px;
}
.markdown-body h6 {
 color: #777;
 font-size: 14px;
}
.markdown-body p, .markdown-body blockquote, .markdown-body ul, .markdown-body ol, .markdown-body dl, .markdown-body table, .markdown-body pre {
 margin: 15px 0;
}
.markdown-body hr {
 background: transparent url("/dirty-shade.png") repeat-x 0 0;
 border: 0 none;
 color: #ccc;
 height: 4px;
 padding: 0;
}
.markdown-body>h2:first-child, .markdown-body>h1:first-child, .markdown-body>h1:first-child+h2, .markdown-body>h3:first-child, .markdown-body>h4:first-child, .markdown-body>h5:first-child, .markdown-body>h6:first-child {
 margin-top: 0;
 padding-top: 0;
}
.markdown-body a:first-child h1, .markdown-body a:first-child h2, .markdown-body a:first-child h3, .markdown-body a:first-child h4, .markdown-body a:first-child h5, .markdown-body a:first-child h6 {
 margin-top: 0;
 padding-top: 0;
}
.markdown-body h1+p, .markdown-body h2+p, .markdown-body h3+p, .markdown-body h4+p, .markdown-body h5+p, .markdown-body h6+p {
 margin-top: 0;
}
.markdown-body li p.first {
 display: inline-block;
}
.markdown-body ul, .markdown-body ol {
 padding-left: 30px;
}
.markdown-body ul.no-list, .markdown-body ol.no-list {
 list-style-type: none;
 padding: 0;
}
.markdown-body ul li>:first-child, .markdown-body ul li ul:first-of-type, .markdown-body ul li ol:first-of-type, .markdown-body ol li>:first-child, .markdown-body ol li ul:first-of-type, .markdown-body ol li ol:first-of-type {
 margin-top: 0px;
}
.markdown-body ul li p:last-of-type, .markdown-body ol li p:last-of-type {
 margin-bottom: 0;
}
.markdown-body ul ul, .markdown-body ul ol, .markdown-body ol ol, .markdown-body ol ul {
 margin-bottom: 0;
}
.markdown-body dl {
 padding: 0;
}
.markdown-body dl dt {
 font-size: 14px;
 font-weight: bold;
 font-style: italic;
 padding: 0;
 margin: 15px 0 5px;
}
.markdown-body dl dt:first-child {
 padding: 0;
}
.markdown-body dl dt>:first-child {
 margin-top: 0px;
}
.markdown-body dl dt>:last-child {
 margin-bottom: 0px;
}
.markdown-body dl dd {
 margin: 0 0 15px;
 padding: 0 15px;
}
.markdown-body dl dd>:first-child {
 margin-top: 0px;
}
.markdown-body dl dd>:last-child {
 margin-bottom: 0px;
}
.markdown-body blockquote {
 border-left: 4px solid #DDD;
 padding: 0 15px;
 color: #777;
}
.markdown-body blockquote>:first-child {
 margin-top: 0px;
}
.markdown-body blockquote>:last-child {
 margin-bottom: 0px;
}
.markdown-body table th {
 font-weight: bold;
}
.markdown-body table th, .markdown-body table td {
 border: 1px solid #ccc;
 padding: 6px 13px;
}
.markdown-body table tr {
 border-top: 1px solid #ccc;
 background-color: #fff;
}
.markdown-body table tr:nth-child(2n) {
 background-color: #f8f8f8;
}
.markdown-body img {
 max-width: 100%;
 -moz-box-sizing: border-box;
 box-sizing: border-box;
}
.markdown-body span.frame {
 display: block;
 overflow: hidden;
}
.markdown-body span.frame>span {
 border: 1px solid #ddd;
 display: block;
 float: left;
 overflow: hidden;
 margin: 13px 0 0;
 padding: 7px;
 width: auto;
}
.markdown-body span.frame span img {
 display: block;
 float: left;
}
.markdown-body span.frame span span {
 clear: both;
 color: #333;
 display: block;
 padding: 5px 0 0;
}
.markdown-body span.align-center {
 display: block;
 overflow: hidden;
 clear: both;
}
.markdown-body span.align-center>span {
 display: block;
 overflow: hidden;
 margin: 13px auto 0;
 text-align: center;
}
.markdown-body span.align-center span img {
 margin: 0 auto;
 text-align: center;
}
.markdown-body span.align-right {
 display: block;
 overflow: hidden;
 clear: both;
}
.markdown-body span.align-right>span {
 display: block;
 overflow: hidden;
 margin: 13px 0 0;
 text-align: right;
}
.markdown-body span.align-right span img {
 margin: 0;
 text-align: right;
}
.markdown-body span.float-left {
 display: block;
 margin-right: 13px;
 overflow: hidden;
 float: left;
}
.markdown-body span.float-left span {
 margin: 13px 0 0;
}
.markdown-body span.float-right {
 display: block;
 margin-left: 13px;
 overflow: hidden;
 float: right;
}
.markdown-body span.float-right>span {
 display: block;
 overflow: hidden;
 margin: 13px auto 0;
 text-align: right;
}
.markdown-body code, .markdown-body tt {
 margin: 0 2px;
 padding: 0px 5px;
 border: 1px solid #eaeaea;
 background-color: #f8f8f8;
 border-radius: 3px;
}
.markdown-body code {
 white-space: nowrap;
}
.markdown-body pre>code {
 margin: 0;
 padding: 0;
 white-space: pre;
 border: none;
 background: transparent;
}
.markdown-body .highlight pre, .markdown-body pre {
 background-color: #f8f8f8;
 border: 1px solid #ccc;
 font-size: 13px;
 line-height: 19px;
 overflow: auto;
 padding: 6px 10px;
 border-radius: 3px;
}
.markdown-body pre code, .markdown-body pre tt {
 margin: 0;
 padding: 0;
 background-color: transparent;
 border: none;
}
</style>
EOT
        print "<div class='markdown-body'>";
        print Markdown($text);
        print "</div>";
    }
}
504
505
506
sub Markdown {
#
# Main function: takes Markdown source text and returns the converted
# text followed by a newline. The order of the steps below is essential.
#
    my ($doc) = @_;

    # Reset the per-document state. If we don't, definitions from one
    # article leak into the next when a page is built from several
    # articles (e.g. an index page showing the N most recent ones).
    %g_html_blocks = ();
    %g_titles      = ();
    %g_urls        = ();

    # Standardize line endings:
    $doc =~ s{\r\n}{\n}g;   # DOS to Unix
    $doc =~ s{\r}{\n}g;     # Mac to Unix

    # Make sure the text ends with a couple of newlines:
    $doc .= "\n\n";

    # Convert all tabs to spaces.
    $doc = _Detab($doc);

    # Empty out lines consisting only of spaces and tabs, so that later
    # patterns can match consecutive blank lines with /\n+/ instead of
    # something contorted like /[ \t]*\n+/ .
    $doc =~ s/^[ \t]+$//mg;

    # Remaining stages, in order:
    #   1. turn block-level HTML blocks into hash entries
    #   2. strip link definitions, storing them in the global hashes
    #   3. run the block-level transformations
    #   4. restore the characters that were escaped along the way
    for my $stage (
        \&_HashHTMLBlocks,
        \&_StripLinkDefinitions,
        \&_RunBlockGamut,
        \&_UnescapeSpecialChars,
    ) {
        $doc = $stage->($doc);
    }

    return $doc . "\n";
}
553
554
sub _StripLinkDefinitions {
#
# Strips link definitions from text, storing the URLs in %g_urls and the
# titles in %g_titles, both keyed by the lower-cased link id.
#
# Takes the document text; returns the text with every link definition
# removed.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Link defs are in the form: ^[id]: url "optional title"
    # One definition is removed per iteration (no /g), so the loop runs
    # until no definition is left.
    while ($text =~ s{
                        ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
                          [ \t]*
                          \n?               # maybe *one* newline
                          [ \t]*
                        <?(\S+?)>?          # url = $2
                          [ \t]*
                          \n?               # maybe one newline
                          [ \t]*
                        (?:
                            (?<=\s)         # lookbehind for whitespace
                            ["(]
                            (.+?)           # title = $3
                            [")]
                            [ \t]*
                        )?  # title is optional
                        (?:\n+|\Z)
                    }
                    {}mx) {
        # Copy the captures before calling anything else, so that nothing
        # below depends on the capture variables staying intact.
        my ($id, $url, $title) = (lc $1, $2, $3);   # Link IDs are case-insensitive

        $g_urls{$id} = _EncodeAmpsAndAngles($url);

        # Test for definedness rather than truth: a title consisting of
        # the single character "0" is a real title and must be kept.
        if (defined $title) {
            $title =~ s/"/&quot;/g;
            $g_titles{$id} = $title;
        }
    }

    return $text;
}
592
593
sub _HashHTMLBlocks {
#
# Replaces each block-level HTML block in the text with the MD5 hex digest
# of that block (surrounded by blank lines) and records the original block
# in %g_html_blocks under that digest. Returns the modified text.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Shared replacement step for all four substitutions below.
    my $stash_block = sub {
        my ($html) = @_;
        my $digest = md5_hex($html);
        $g_html_blocks{$digest} = $html;
        return "\n\n" . $digest . "\n\n";
    };

    # Hashify HTML blocks:
    # We only want to do this for block-level HTML tags, such as headers,
    # lists, and tables. That's because we still want to wrap <p>s around
    # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
    # phrase emphasis, and spans. The list of tags we're looking for is
    # hard-coded:
    my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
    my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;

    # First pass: nested blocks, e.g.:
    #   <div>
    #       <div>
    #       tags for inner block must be indented.
    #       </div>
    #   </div>
    #
    # The outermost tags must start at the left margin for this to match,
    # and the inner nested divs must be indented. This has to run before
    # the more liberal pass, which would start at the first `<div>` and
    # stop at the first `</div>`.
    $text =~ s{
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <($block_tags_a)    # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    </\2>               # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)  # followed by a newline or end of document
                )
            }{
                $stash_block->($1);
            }egmx;


    #
    # Second pass: match more liberally, simply from `\n<tag>` to `</tag>\n`
    #
    $text =~ s{
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <($block_tags_b)    # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    .*</\2>             # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)  # followed by a newline or end of document
                )
            }{
                $stash_block->($1);
            }egmx;

    # Third pass: <hr />. It was easier to make a special case than to
    # make the other regex more complicated.
    $text =~ s{
                (?:
                    (?<=\n\n)       # Starting after a blank line
                    |               # or
                    \A\n?           # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{0,$less_than_tab}
                    <(hr)               # start tag = $2
                    \b                  # word break
                    ([^<>])*?           #
                    /?>                 # the matching end tag
                    [ \t]*
                    (?=\n{2,}|\Z)       # followed by a blank line or end of document
                )
            }{
                $stash_block->($1);
            }egx;

    # Fourth pass: standalone HTML comments.
    $text =~ s{
                (?:
                    (?<=\n\n)       # Starting after a blank line
                    |               # or
                    \A\n?           # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{0,$less_than_tab}
                    (?s:
                        <!
                        (--.*?--\s*)+
                        >
                    )
                    [ \t]*
                    (?=\n{2,}|\Z)       # followed by a blank line or end of document
                )
            }{
                $stash_block->($1);
            }egx;


    return $text;
}
702
703
sub _RunBlockGamut {
#
# Applies, in order, every transformation that produces block-level
# markup: headers, horizontal rules, lists, code blocks, block quotes
# and finally paragraphs. Returns the transformed text.
#
    my ($text) = @_;

    $text = _DoHeaders($text);

    # Horizontal rules: a line holding three or more *, - or _ characters
    # (optionally space-separated) becomes an <hr>.
    my $rule = "\n<hr$g_empty_element_suffix\n";
    $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{$rule}gmx;
    $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{$rule}gmx;
    $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{$rule}gmx;

    # _HashHTMLBlocks() already ran once in Markdown(), to protect raw HTML
    # in the original source. It runs again here, just before paragraphs
    # are formed, to protect the markup generated by the steps above so
    # that <p> tags don't get wrapped around block-level tags.
    for my $step (
        \&_DoLists,
        \&_DoCodeBlocks,
        \&_DoBlockQuotes,
        \&_HashHTMLBlocks,
        \&_FormParagraphs,
    ) {
        $text = $step->($text);
    }

    return $text;
}
734
735
sub _RunSpanGamut {
#
# Applies, in order, every transformation that happens *inside*
# block-level elements such as paragraphs, headers and list items.
# Returns the transformed text.
#
    my ($text) = @_;

    # Ordering notes:
    #   - Images are handled before anchors, because ![foo][f] looks
    #     like an anchor.
    #   - Auto-links (things like `<http://example.com/>`) come after
    #     anchors, because < and > may be used as delimiters in inline
    #     links like [this](<url>).
    for my $pass (
        \&_DoCodeSpans,
        \&_EscapeSpecialChars,
        \&_DoImages,
        \&_DoAnchors,
        \&_DoAutoLinks,
        \&_EncodeAmpsAndAngles,
        \&_DoItalicsAndBold,
    ) {
        $text = $pass->($text);
    }

    # Hard breaks: two or more spaces at the end of a line.
    $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;

    return $text;
}
766
767
sub _EscapeSpecialChars {
#
# Rebuilds the text from its HTML tokens (as produced by _TokenizeHTML()),
# protecting the characters that would otherwise be mistaken for Markdown
# syntax:
#   - inside "tag" tokens, * and _ are replaced by their entries in
#     %g_escape_table, so they don't conflict with their use for italics
#     and strong;
#   - every other token is passed through _EncodeBackslashEscapes().
# Returns the rebuilt text.
#
    my $text = shift;

    # Plain assignment: the former `my $tokens ||= ...` applied `||=` to a
    # freshly declared (always undefined) variable, which was misleading.
    my $tokens = _TokenizeHTML($text);

    my @pieces;
    foreach my $cur_token (@$tokens) {
        my ($type, $chunk) = @$cur_token;
        if ($type eq "tag") {
            # Replacing each character with its MD5 checksum is likely
            # overkill, but it should prevent us from colliding with the
            # escape values by accident.
            $chunk =~ s! \* !$g_escape_table{'*'}!gx;
            $chunk =~ s! _  !$g_escape_table{'_'}!gx;
        }
        else {
            $chunk = _EncodeBackslashEscapes($chunk);
        }
        push @pieces, $chunk;
    }

    return join '', @pieces;
}
795
796
sub _DoAnchors {
#
# Turn Markdown link shortcuts into XHTML <a> tags. Handles reference-style
# links first, then inline-style links, and returns the resulting text.
#
    my $text = shift;

    # Replaces * and _ by their escape-table entries, so that URLs and
    # titles can't be mistaken for italics/bold markup later on.
    my $protect = sub {
        my ($value) = @_;
        $value =~ s! \* !$g_escape_table{'*'}!gx;
        $value =~ s!  _ !$g_escape_table{'_'}!gx;
        return $value;
    };

    #
    # Reference-style links: [link text] [id]
    # Left untouched when the id has no stored URL.
    #
    $text =~ s{
        (                   # wrap whole match in $1
          \[
            ($g_nested_brackets)    # link text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)       # id = $3
          \]
        )
    }{
        my ($whole_match, $link_text, $link_id) = ($1, $2, lc $3);

        # An empty id means a shortcut link like [this][].
        $link_id = lc $link_text if $link_id eq "";

        my $result = $whole_match;
        if (defined $g_urls{$link_id}) {
            my $url = $protect->($g_urls{$link_id});
            $result = "<a href=\"$url\"";
            if ( defined $g_titles{$link_id} ) {
                my $title = $protect->($g_titles{$link_id});
                $result .=  " title=\"$title\"";
            }
            $result .= ">$link_text</a>";
        }
        $result;
    }xsge;

    #
    # Inline-style links: [link text](url "optional title")
    #
    $text =~ s{
        (               # wrap whole match in $1
          \[
            ($g_nested_brackets)    # link text = $2
          \]
          \(            # literal paren
            [ \t]*
            <?(.*?)>?   # href = $3
            [ \t]*
            (           # $4
              (['"])    # quote char = $5
              (.*?)     # Title = $6
              \5        # matching quote
            )?          # title is optional
          \)
        )
    }{
        my ($link_text, $url, $title) = ($2, $3, $6);

        $url = $protect->($url);
        my $result = "<a href=\"$url\"";

        if (defined $title) {
            $title =~ s/"/&quot;/g;
            $title = $protect->($title);
            $result .=  " title=\"$title\"";
        }

        $result . ">$link_text</a>";
    }xsge;

    return $text;
}
892
893
sub _DoImages {
#
# Turn Markdown image shortcuts into <img> tags. Handles reference-style
# images first, then inline images, and returns the resulting text.
#
# In both styles a title attribute is emitted only when a title was
# actually supplied. (The inline branch used to initialise the title to
# the empty string and then test it with defined(), which is always true,
# so every title-less inline image came out with title="".)
#
    my $text = shift;

    # Replaces * and _ by their escape-table entries, so that URLs and
    # titles can't be mistaken for italics/bold markup later on.
    my $protect = sub {
        my ($value) = @_;
        $value =~ s! \* !$g_escape_table{'*'}!gx;
        $value =~ s!  _ !$g_escape_table{'_'}!gx;
        return $value;
    };

    #
    # First, handle reference-style labeled images: ![alt text][id]
    #
    $text =~ s{
        (               # wrap whole match in $1
          !\[
            (.*?)       # alt text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)       # id = $3
          \]

        )
    }{
        my $result;
        my $whole_match = $1;
        my $alt_text    = $2;
        my $link_id     = lc $3;

        if ($link_id eq "") {
            $link_id = lc $alt_text;     # for shortcut links like ![this][].
        }

        # Escape quotes only after the id lookup key has been derived.
        $alt_text =~ s/"/&quot;/g;
        if (defined $g_urls{$link_id}) {
            my $url = $protect->($g_urls{$link_id});
            $result = "<img src=\"$url\" alt=\"$alt_text\"";
            if (defined $g_titles{$link_id}) {
                my $title = $protect->($g_titles{$link_id});
                $result .=  " title=\"$title\"";
            }
            $result .= $g_empty_element_suffix;
        }
        else {
            # If there's no such link ID, leave intact:
            $result = $whole_match;
        }

        $result;
    }xsge;

    #
    # Next, handle inline images:  ![alt text](url "optional title")
    # Don't forget: encode * and _
    #
    $text =~ s{
        (               # wrap whole match in $1
          !\[
            (.*?)       # alt text = $2
          \]
          \(            # literal paren
            [ \t]*
            <?(\S+?)>?  # src url = $3
            [ \t]*
            (           # $4
              (['"])    # quote char = $5
              (.*?)     # title = $6
              \5        # matching quote
              [ \t]*
            )?          # title is optional
          \)
        )
    }{
        my $result;
        my $alt_text = $2;
        my $url      = $3;
        my $title    = $6;      # undef when no title was given

        $alt_text =~ s/"/&quot;/g;
        $url = $protect->($url);
        $result = "<img src=\"$url\" alt=\"$alt_text\"";
        if (defined $title) {
            $title =~ s/"/&quot;/g;
            $title = $protect->($title);
            $result .=  " title=\"$title\"";
        }
        $result .= $g_empty_element_suffix;

        $result;
    }xsge;

    return $text;
}
997
998
sub _DoHeaders {
#
# Converts Setext-style and atx-style headers into <h1>..<h6> elements,
# running the header text through _RunSpanGamut(). Returns the text.
#
    my $text = shift;

    # Builds one header element followed by a blank line.
    my $heading = sub {
        my ($level, $content) = @_;
        return "<h$level>" . _RunSpanGamut($content) . "</h$level>\n\n";
    };

    # Setext-style headers:
    #     Header 1
    #     ========
    #
    #     Header 2
    #     --------
    #
    $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
        $heading->(1, $1);
    }egmx;

    $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
        $heading->(2, $1);
    }egmx;


    # atx-style headers:
    #   # Header 1
    #   ## Header 2
    #   ## Header 2 with closing hashes ##
    #   ...
    #   ###### Header 6
    #
    # The header level is the number of leading #'s.
    $text =~ s{
            ^(\#{1,6})  # $1 = string of #'s
            [ \t]*
            (.+?)       # $2 = Header text
            [ \t]*
            \#*         # optional closing #'s (not counted)
            \n+
        }{
            $heading->(length($1), $2);
        }egmx;

    return $text;
}
1039
1040
sub _DoLists {
#
# Form HTML ordered (numbered) and unordered (bulleted) lists.
# Returns the text with every recognised list replaced by <ul>/<ol> markup.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Re-usable patterns to match list item bullets and number markers:
    my $marker_ul  = qr/[*+-]/;
    my $marker_ol  = qr/\d+[.]/;
    my $marker_any = qr/(?:$marker_ul|$marker_ol)/;

    # Re-usable pattern to match any entire ul or ol list:
    my $whole_list = qr{
        (                               # $1 = whole list
          (                             # $2
            [ ]{0,$less_than_tab}
            (${marker_any})             # $3 = first list item marker
            [ \t]+
          )
          (?s:.+?)
          (                             # $4
              \z
            |
              \n{2,}
              (?=\S)
              (?!                       # Negative lookahead for another list item marker
                [ \t]*
                ${marker_any}[ \t]+
              )
          )
        )
    }mx;

    # Turns one matched list into its HTML form. The list type is taken
    # from the first item's marker.
    my $render_list = sub {
        my ($list, $first_marker) = @_;
        my $list_type = ($first_marker =~ m/$marker_ul/) ? "ul" : "ol";
        # Turn double returns into triple returns, so that we can make a
        # paragraph for the last item in a list, if necessary:
        $list =~ s/\n{2,}/\n\n\n/g;
        my $items = _ProcessListItems($list, $marker_any);
        return "<$list_type>\n" . $items . "</$list_type>\n";
    };

    # We use a different prefix before nested lists than top-level lists.
    # See extended comment in _ProcessListItems().
    #
    # Note: the two substitutions below are deliberately written out as two
    # static s/// patterns instead of one pattern chosen at run time. The
    # original author reports that the single-conditional-pattern version
    # worked everywhere except under MT on one hosting account, where
    # rebuilds were killed; his guess was that Perl optimizes a
    # substitution whose pattern it can tell never changes. Keep the
    # patterns static.

    if ($g_list_level) {
        # Inside a list: a sub-list may start at the beginning of any line.
        $text =~ s{
                ^
                $whole_list
            }{
                $render_list->($1, $3);
            }egmx;
    }
    else {
        # Top level: a list must follow a blank line or start the document.
        $text =~ s{
                (?:(?<=\n\n)|\A\n?)
                $whole_list
            }{
                $render_list->($1, $3);
            }egmx;
    }


    return $text;
}
1127
1128
sub _ProcessListItems {
#
# Process the contents of a single ordered or unordered list, splitting it
# into individual list items. Takes the list text and the pattern that
# matches any list marker; returns the list as a run of <li> elements.
#
    my ($list_str, $marker_any) = @_;

    # $g_list_level tracks whether we are inside a list: it is incremented
    # on entry here and decremented on exit, so zero means "not in a list".
    #
    # This matters because outside of a list we want something like
    #
    #       I recommend upgrading to version
    #       8. Oops, now this line is treated
    #       as a sub-list.
    #
    # to stay a single paragraph, even though its second line starts with
    # a digit-period-space sequence, whereas inside a list (or sub-list)
    # such a line is treated as the start of a sub-list. What a kludge,
    # huh? This is an aspect of Markdown's syntax that's hard to parse
    # perfectly without resorting to mind-reading. Perhaps the solution is
    # to change the syntax rules such that sub-lists must start with a
    # starting cardinal number; e.g. "1." or "a.".
    $g_list_level++;

    # trim trailing blank lines:
    $list_str =~ s/\n{2,}\z/\n/;


    $list_str =~ s{
        (\n)?                           # leading line = $1
        (^[ \t]*)                       # leading whitespace = $2
        ($marker_any) [ \t]+            # list marker = $3
        ((?s:.+?)                       # list item text   = $4
        (\n{1,2}))
        (?= \n* (\z | \2 ($marker_any) [ \t]+))
    }{
        my ($blank_before, $body) = ($1, $4);

        if ($blank_before or ($body =~ m/\n{2,}/)) {
            # Preceded by a blank line, or containing one: block-level item.
            $body = _RunBlockGamut(_Outdent($body));
        }
        else {
            # Recursion for sub-lists:
            $body = _DoLists(_Outdent($body));
            chomp $body;
            $body = _RunSpanGamut($body);
        }

        "<li>" . $body . "</li>\n";
    }egmx;

    $g_list_level--;
    return $list_str;
}
1194
1195
1196
sub _DoCodeBlocks {
#
# Convert indented Markdown code blocks into `<pre><code>` elements
# and return the resulting text.
#
    my ($text) = @_;

    $text =~ s{
        (?:\n\n|\A)
        (                               # $1 = the code block -- one or more lines, starting with a space/tab
          (?:
            (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
            .*\n+
          )+
        )
        ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
    }{
        # Copy the capture before the helpers get a chance to run
        # matches of their own.
        my $raw  = $1;
        my $code = _Detab(_EncodeCode(_Outdent($raw)));

        $code =~ s/\A\n+//;     # drop leading newlines
        $code =~ s/\s+\z//;     # drop trailing whitespace

        join '', "\n\n<pre><code>", $code, "\n</code></pre>\n\n";
    }egmx;

    return $text;
}
1229
1230
sub _DoCodeSpans {
#
# Rewrite backtick-quoted runs as <code></code> spans.
#
#   * The delimiter may be any number of backticks, which lets the span
#     itself contain literal backticks. This input:
#
#         Just type ``foo `bar` baz`` at the prompt.
#
#     becomes:
#
#         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
#
#     There is no arbitrary limit on the length of the delimiter; to put
#     three consecutive backticks in the code, delimit with four, etc.
#
#   * Padding with spaces allows literal backticks at the edges:
#
#         ... type `` `bar` `` ...
#
#     becomes:
#
#         ... type <code>`bar`</code> ...
#
    my ($text) = @_;

    $text =~ s{
        (`+)        # $1 = Opening run of `
        (.+?)       # $2 = The code block
        (?<!`)
        \1          # Matching closer
        (?!`)
    }{
        (my $code = "$2") =~ s/^[ \t]*//g;  # leading whitespace
        $code =~ s/[ \t]*$//g;              # trailing whitespace
        '<code>' . _EncodeCode($code) . '</code>';
    }egsx;

    return $text;
}
1275
1276
sub _EncodeCode {
#
# Escape characters inside a Markdown code run so that they are taken
# literally and lose their special Markdown meaning. Takes the code text
# as its only argument and returns the encoded copy.
#
    my ($code) = @_;

    # Every ampersand is encoded: HTML entities are not entities
    # inside a Markdown code span.
    $code =~ s/&/&amp;/g;

    # Dollar signs are encoded only when running under Blosxom, which
    # interpolates Perl variables in article bodies.
    {
        no warnings 'once';
        $code =~ s/\$/&#036;/g if defined $blosxom::version;
    }

    # Angle brackets:
    $code =~ s! < !&lt;!gx;
    $code =~ s! > !&gt;!gx;

    # Characters that are magic in Markdown are swapped for their
    # entries in the escape table:
    $code =~ s! \* !$g_escape_table{'*'}!gx;
    $code =~ s! _ !$g_escape_table{'_'}!gx;
    $code =~ s! { !$g_escape_table{'{'}!gx;
    $code =~ s! } !$g_escape_table{'}'}!gx;
    $code =~ s! \[ !$g_escape_table{'['}!gx;
    $code =~ s! \] !$g_escape_table{']'}!gx;
    $code =~ s! \\ !$g_escape_table{'\\'}!gx;

    return $code;
}
1314
1315
sub _DoItalicsAndBold {
#
# Replace **text** / __text__ with <strong> and *text* / _text_ with
# <em>, returning the converted text.
#
    my ($text) = @_;

    # Ordered passes: <strong> must go first, otherwise the single-character
    # <em> delimiters would consume the doubled ones.
    my @passes = (
        [ 'strong', qr{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }sx ],
        [ 'em',     qr{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }sx ],
    );

    for my $pass (@passes) {
        my ($tag, $pattern) = @$pass;
        $text =~ s{$pattern}{<$tag>$2</$tag>}g;
    }

    return $text;
}
1328
1329
sub _DoBlockQuotes {
#
# Wrap runs of '>'-prefixed lines in <blockquote> elements. The quoted
# content is passed back through the block-level gamut, so nested
# constructs inside a quote are processed recursively. Returns the
# converted text.
#
    my ($text) = @_;

    $text =~ s{
        (                       # Wrap whole match in $1
          (
            ^[ \t]*>[ \t]?      # '>' at the start of a line
            .+\n                # rest of the first line
            (.+\n)*             # subsequent consecutive lines
            \n*                 # blanks
          )+
        )
    }{
        my $quoted = $1;

        $quoted =~ s/^[ \t]*>[ \t]?//gm;    # strip one level of quoting
        $quoted =~ s/^[ \t]+$//mg;          # empty out whitespace-only lines
        $quoted = _RunBlockGamut($quoted);  # recurse

        # Add leading indentation to the rendered quote ...
        $quoted =~ s/^/ /g;
        # ... then take it back out of <pre> content, where leading
        # spaces would alter the preformatted text:
        $quoted =~ s{
            (\s*<pre>.+?</pre>)
        }{
            (my $pre = $1) =~ s/^ //mg;
            $pre;
        }egsx;

        "<blockquote>\n" . $quoted . "\n</blockquote>\n\n";
    }egmx;

    return $text;
}