filters: import more modern scripts

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
author: Jason A. Donenfeld <Jason@zx2c4.com> 2013-05-28 04:39:43 (JST)
committer: Jason A. Donenfeld <Jason@zx2c4.com> 2013-05-28 04:54:16 (JST)
commit: 8149be213f1c8f52b0dbe6c213f6073af57fa954 (patch)
tree: e4d0315f53022bb7335f782ad394d8e7602f1b52 /filters
parent: dcbc0438b2543a733858d62170f3110a89edbed6 (diff)
download: cgit-8149be213f1c8f52b0dbe6c213f6073af57fa954.zip
cgit-8149be213f1c8f52b0dbe6c213f6073af57fa954.tar.gz
8 files changed, 1813 insertions, 0 deletions
diff --git a/filters/about-formatting.sh b/filters/about-formatting.sh
new file mode 100755
index 0000000..313a4e6
--- /dev/null
+++ b/filters/about-formatting.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# This may be used with the about-filter or repo.about-filter setting in cgitrc.
+# It passes formatting of about pages to differing programs, depending on the usage.
+# Markdown support requires perl.
+# RestructuredText support requires python and docutils.
+# Man page support requires groff.
+# The following environment variables can be used to retrieve the configuration
+# of the repository for which this script is called:
+# CGIT_REPO_URL        ( = repo.url       setting )
+# CGIT_REPO_NAME       ( = repo.name      setting )
+# CGIT_REPO_PATH       ( = repo.path      setting )
+# CGIT_REPO_OWNER      ( = repo.owner     setting )
+# CGIT_REPO_DEFBRANCH  ( = repo.defbranch setting )
+# CGIT_REPO_SECTION    ( = section        setting )
+# CGIT_REPO_CLONE_URL  ( = repo.clone-url setting )
+#
+# $1 is matched (case-insensitively) against file-name extensions to pick the
+# converter; the chosen converter replaces this process via exec.
+cd "$(dirname "$0")/html-converters/" || exit 1
+# Lower-case $1 through a pipe: the here-string form (<<<) is a bash extension
+# and is a syntax error when /bin/sh is a strict POSIX shell such as dash.
+case "$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')" in
+        *.md|*.mkd) exec ./md2html; ;;
+        *.rst) exec ./rst2html; ;;
+        *.[1-9]) exec ./man2html; ;;
+        *.htm|*.html) exec cat; ;;
+        *.txt|*) exec ./txt2html; ;;
+esac
diff --git a/filters/html-converters/man2html b/filters/html-converters/man2html
new file mode 100755
index 0000000..1b28437
--- /dev/null
+++ b/filters/html-converters/man2html
@@ -0,0 +1,5 @@
+#!/bin/sh
+echo "<div style=\"font-family: monospace\">"
+groff -mandoc -T html -P -r -P -l | egrep -v '(<html>|<head>|<meta|<title>|</title>|</head>|<body>|</body>|</html>|<!DOCTYPE|"http://www.w3.org)'
+echo "</div>"
diff --git a/filters/html-converters/md2html b/filters/html-converters/md2html
new file mode 100755
index 0000000..5cab749
--- /dev/null
+++ b/filters/html-converters/md2html
@@ -0,0 +1,2 @@
+#!/bin/sh
+# Thin wrapper: replace this process with the Markdown converter that lives
+# in the resources/ directory next to this script.
+converter_dir="$(dirname "$0")/resources"
+exec "$converter_dir/markdown.pl"
diff --git a/filters/html-converters/resources/markdown.pl b/filters/html-converters/resources/markdown.pl
new file mode 100755
index 0000000..abec173
--- /dev/null
+++ b/filters/html-converters/resources/markdown.pl
@@ -0,0 +1,1731 @@
+#!/usr/bin/perl
+#
+# Markdown -- A text-to-HTML conversion tool for web writers
+#
+# Copyright (c) 2004 John Gruber
+# <http://daringfireball.net/projects/markdown/>
+#
+package Markdown;
+require 5.006_000;
+use strict;
+use warnings;
+use Digest::MD5 qw(md5_hex);
+# Package variable $Markdown::VERSION, declared with `our` instead of the
+# obsolete `use vars` pragma.
+our $VERSION = '1.0.1';
+# Tue 14 Dec 2004
+## The next two lines were once disabled because they caused problems under
+## Perl 5.6.1; they are ENABLED here: the source is treated as UTF-8 and
+## everything printed to STDOUT is encoded as UTF-8.
+use utf8;
+binmode( STDOUT, ":utf8" );  # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
+#
+# Global default settings:
+#
+my $g_empty_element_suffix = " />";     # Change to ">" for HTML output
+my $g_tab_width = 4;
+#
+# Globals:
+#
+# Regex to match balanced [brackets]. See Friedl's
+# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
+# Declared before it is assigned because the pattern refers to itself
+# through the deferred (??{ ... }) construct.
+my $g_nested_brackets;
+$g_nested_brackets = qr{
+        (?>                                                             # Atomic matching
+           [^\[\]]+                                                     # Anything other than brackets
+         |
+           \[
+                 (??{ $g_nested_brackets })             # Recursive set of nested brackets
+           \]
+        )*
+}x;
+# Table of hash values for escaped characters: each special character maps
+# to the MD5 hex digest of itself.
+my %g_escape_table;
+foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
+        $g_escape_table{$char} = md5_hex($char);
+}
+# Global hashes, used by various utility routines
+my %g_urls;
+my %g_titles;
+my %g_html_blocks;
+# Used to track when we're inside an ordered or unordered list
+# (see _ProcessListItems() for details):
+my $g_list_level = 0;
+#### Blosxom plug-in interface ##########################################
+# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
+# which posts Markdown should process, using a "meta-markup: markdown"
+# header. If it's set to 0 (the default), Markdown will process all
+# entries.
+my $g_blosxom_use_meta = 0;
+sub start {
+        # Blosxom start hook: always reports success (returns 1).
+        return 1;
+}
+sub story {
+        # Blosxom per-entry hook. Rewrites the entry body (through $body_ref)
+        # with its Markdown rendering, unless $g_blosxom_use_meta is set and
+        # $meta::markup does not name "markdown". Always returns 1.
+        my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
+        my $wants_markdown = !$g_blosxom_use_meta
+                || (defined($meta::markup) && $meta::markup =~ /^\s*markdown\s*$/i);
+        if ($wants_markdown) {
+                $$body_ref = Markdown($$body_ref);
+        }
+        return 1;
+}
+#### Movable Type plug-in interface #####################################
+# Everything in this branch runs only when the MT module can be loaded.
+# Class methods are called with arrow syntax (Class->method) rather than
+# the indirect-object form (method Class), which is parse-ambiguous.
+eval {require MT};  # Test to see if we're running in MT.
+unless ($@) {
+    require MT;
+    MT->import;
+    require MT::Template::Context;
+    MT::Template::Context->import;
+        eval {require MT::Plugin};  # Test to see if we're running >= MT 3.0.
+        unless ($@) {
+                require MT::Plugin;
+                MT::Plugin->import;
+                my $plugin = MT::Plugin->new({
+                        name => "Markdown",
+                        description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
+                        doc_link => 'http://daringfireball.net/projects/markdown/'
+                });
+                MT->add_plugin( $plugin );
+        }
+        # Container tag: records the lower-cased 'output' argument (if any)
+        # in the context stash under 'markdown_output', then builds and
+        # returns the enclosed tokens.
+        MT::Template::Context->add_container_tag(MarkdownOptions => sub {
+                my $ctx  = shift;
+                my $args = shift;
+                my $builder = $ctx->stash('builder');
+                my $tokens = $ctx->stash('tokens');
+                if (defined ($args->{'output'}) ) {
+                        $ctx->stash('markdown_output', lc $args->{'output'});
+                }
+                defined (my $str = $builder->build($ctx, $tokens) )
+                        or return $ctx->error($builder->errstr);
+                $str;           # return value
+        });
+        # Plain Markdown text filter. The stashed 'markdown_output' value
+        # selects HTML-style empty elements (value starting with "html"),
+        # raw pass-through ("raw"), or the XHTML default.
+        MT->add_text_filter('markdown' => {
+                label     => 'Markdown',
+                docs      => 'http://daringfireball.net/projects/markdown/',
+                on_format => sub {
+                        my $text = shift;
+                        my $ctx  = shift;
+                        my $raw  = 0;
+                    if (defined $ctx) {
+                        my $output = $ctx->stash('markdown_output');
+                                if (defined $output  &&  $output =~ m/^html/i) {
+                                        $g_empty_element_suffix = ">";
+                                        $ctx->stash('markdown_output', '');
+                                }
+                                elsif (defined $output  &&  $output eq 'raw') {
+                                        $raw = 1;
+                                        $ctx->stash('markdown_output', '');
+                                }
+                                else {
+                                        $raw = 0;
+                                        $g_empty_element_suffix = " />";
+                                }
+                        }
+                        $text = $raw ? $text : Markdown($text);
+                        $text;
+                },
+        });
+        # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
+        my $smartypants;
+        {
+                no warnings "once";
+                $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
+        }
+        if ($smartypants) {
+                MT->add_text_filter('markdown_with_smartypants' => {
+                        label     => 'Markdown With SmartyPants',
+                        docs      => 'http://daringfireball.net/projects/markdown/',
+                        on_format => sub {
+                                my $text = shift;
+                                my $ctx  = shift;
+                                if (defined $ctx) {
+                                        my $output = $ctx->stash('markdown_output');
+                                        # NOTE(review): this filter compares with eq 'html' and
+                                        # never clears the stash, whereas the plain filter uses
+                                        # m/^html/i and resets it -- confirm the difference is
+                                        # intentional.
+                                        if (defined $output  &&  $output eq 'html') {
+                                                $g_empty_element_suffix = ">";
+                                        }
+                                        else {
+                                                $g_empty_element_suffix = " />";
+                                        }
+                                }
+                                $text = Markdown($text);
+                                # The value of this assignment is the filter's return value.
+                                $text = $smartypants->($text, '1');
+                        },
+                });
+        }
+}
+else {
+#### BBEdit/command-line text filter interface ##########################
+# Needs to be hidden from MT (and Blosxom when running in static mode).
+# Reached only when MT could not be loaded; the body runs only when
+# $blosxom::version is undefined as well.
+    # We're only using $blosxom::version once; tell Perl not to warn us:
+        no warnings 'once';
+    unless ( defined($blosxom::version) ) {
+                use warnings;
+                #### Check for command-line switches: #################
+                my %cli_opts;
+                use Getopt::Long;
+                # pass_through leaves unrecognised arguments in @ARGV, so any
+                # remaining file names are still read by <> below.
+                Getopt::Long::Configure('pass_through');
+                GetOptions(\%cli_opts,
+                        'version',
+                        'shortversion',
+                        'html4tags',
+                );
+                if ($cli_opts{'version'}) {             # Version info
+                        print "\nThis is Markdown, version $VERSION.\n";
+                        print "Copyright 2004 John Gruber\n";
+                        print "http://daringfireball.net/projects/markdown/\n\n";
+                        exit 0;
+                }
+                if ($cli_opts{'shortversion'}) {                # Just the version number string.
+                        print $VERSION;
+                        exit 0;
+                }
+                if ($cli_opts{'html4tags'}) {                   # Use HTML tag style instead of XHTML
+                        $g_empty_element_suffix = ">";
+                }
+                #### Process incoming text: ###########################
+                my $text;
+                {
+                        local $/;               # Slurp the whole file
+                        $text = <>;
+                }
+                # Nothing readable: continue with empty text instead of
+                # handing undef to Markdown().
+                $text = '' unless defined $text;
+                # <> returns raw octets, but STDOUT has a :utf8 layer (set at
+                # the top of this file). Without decoding, every byte of a
+                # multi-byte UTF-8 character would be re-encoded on output
+                # (mojibake). utf8::decode converts in place and leaves the
+                # string untouched when it is not valid UTF-8.
+                utf8::decode($text);
+        print <<'EOT';
+<style>
+.markdown-body {
+    font-size: 14px;
+    line-height: 1.6;
+    overflow: hidden;
+}
+.markdown-body>*:first-child {
+    margin-top: 0 !important;
+}
+.markdown-body>*:last-child {
+    margin-bottom: 0 !important;
+}
+.markdown-body a.absent {
+    color: #c00;
+}
+.markdown-body a.anchor {
+    display: block;
+    padding-left: 30px;
+    margin-left: -30px;
+    cursor: pointer;
+    position: absolute;
+    top: 0;
+    left: 0;
+    bottom: 0;
+}
+.markdown-body h1, .markdown-body h2, .markdown-body h3, .markdown-body h4, .markdown-body h5, .markdown-body h6 {
+    margin: 20px 0 10px;
+    padding: 0;
+    font-weight: bold;
+    -webkit-font-smoothing: antialiased;
+    cursor: text;
+    position: relative;
+}
+.markdown-body h1 .mini-icon-link, .markdown-body h2 .mini-icon-link, .markdown-body h3 .mini-icon-link, .markdown-body h4 .mini-icon-link, .markdown-body h5 .mini-icon-link, .markdown-body h6 .mini-icon-link {
+    display: none;
+    color: #000;
+}
+.markdown-body h1:hover a.anchor, .markdown-body h2:hover a.anchor, .markdown-body h3:hover a.anchor, .markdown-body h4:hover a.anchor, .markdown-body h5:hover a.anchor, .markdown-body h6:hover a.anchor {
+    text-decoration: none;
+    line-height: 1;
+    padding-left: 0;
+    margin-left: -22px;
+    top: 15%}
+.markdown-body h1:hover a.anchor .mini-icon-link, .markdown-body h2:hover a.anchor .mini-icon-link, .markdown-body h3:hover a.anchor .mini-icon-link, .markdown-body h4:hover a.anchor .mini-icon-link, .markdown-body h5:hover a.anchor .mini-icon-link, .markdown-body h6:hover a.anchor .mini-icon-link {
+    display: inline-block;
+}
+.markdown-body h1 tt, .markdown-body h1 code, .markdown-body h2 tt, .markdown-body h2 code, .markdown-body h3 tt, .markdown-body h3 code, .markdown-body h4 tt, .markdown-body h4 code, .markdown-body h5 tt, .markdown-body h5 code, .markdown-body h6 tt, .markdown-body h6 code {
+    font-size: inherit;
+}
+.markdown-body h1 {
+    font-size: 28px;
+    color: #000;
+}
+.markdown-body h2 {
+    font-size: 24px;
+    border-bottom: 1px solid #ccc;
+    color: #000;
+}
+.markdown-body h3 {
+    font-size: 18px;
+}
+.markdown-body h4 {
+    font-size: 16px;
+}
+.markdown-body h5 {
+    font-size: 14px;
+}
+.markdown-body h6 {
+    color: #777;
+    font-size: 14px;
+}
+.markdown-body p, .markdown-body blockquote, .markdown-body ul, .markdown-body ol, .markdown-body dl, .markdown-body table, .markdown-body pre {
+    margin: 15px 0;
+}
+.markdown-body hr {
+    background: transparent url("/dirty-shade.png") repeat-x 0 0;
+    border: 0 none;
+    color: #ccc;
+    height: 4px;
+    padding: 0;
+}
+.markdown-body>h2:first-child, .markdown-body>h1:first-child, .markdown-body>h1:first-child+h2, .markdown-body>h3:first-child, .markdown-body>h4:first-child, .markdown-body>h5:first-child, .markdown-body>h6:first-child {
+    margin-top: 0;
+    padding-top: 0;
+}
+.markdown-body a:first-child h1, .markdown-body a:first-child h2, .markdown-body a:first-child h3, .markdown-body a:first-child h4, .markdown-body a:first-child h5, .markdown-body a:first-child h6 {
+    margin-top: 0;
+    padding-top: 0;
+}
+.markdown-body h1+p, .markdown-body h2+p, .markdown-body h3+p, .markdown-body h4+p, .markdown-body h5+p, .markdown-body h6+p {
+    margin-top: 0;
+}
+.markdown-body li p.first {
+    display: inline-block;
+}
+.markdown-body ul, .markdown-body ol {
+    padding-left: 30px;
+}
+.markdown-body ul.no-list, .markdown-body ol.no-list {
+    list-style-type: none;
+    padding: 0;
+}
+.markdown-body ul li>:first-child, .markdown-body ul li ul:first-of-type, .markdown-body ul li ol:first-of-type, .markdown-body ol li>:first-child, .markdown-body ol li ul:first-of-type, .markdown-body ol li ol:first-of-type {
+    margin-top: 0px;
+}
+.markdown-body ul li p:last-of-type, .markdown-body ol li p:last-of-type {
+    margin-bottom: 0;
+}
+.markdown-body ul ul, .markdown-body ul ol, .markdown-body ol ol, .markdown-body ol ul {
+    margin-bottom: 0;
+}
+.markdown-body dl {
+    padding: 0;
+}
+.markdown-body dl dt {
+    font-size: 14px;
+    font-weight: bold;
+    font-style: italic;
+    padding: 0;
+    margin: 15px 0 5px;
+}
+.markdown-body dl dt:first-child {
+    padding: 0;
+}
+.markdown-body dl dt>:first-child {
+    margin-top: 0px;
+}
+.markdown-body dl dt>:last-child {
+    margin-bottom: 0px;
+}
+.markdown-body dl dd {
+    margin: 0 0 15px;
+    padding: 0 15px;
+}
+.markdown-body dl dd>:first-child {
+    margin-top: 0px;
+}
+.markdown-body dl dd>:last-child {
+    margin-bottom: 0px;
+}
+.markdown-body blockquote {
+    border-left: 4px solid #DDD;
+    padding: 0 15px;
+    color: #777;
+}
+.markdown-body blockquote>:first-child {
+    margin-top: 0px;
+}
+.markdown-body blockquote>:last-child {
+    margin-bottom: 0px;
+}
+.markdown-body table th {
+    font-weight: bold;
+}
+.markdown-body table th, .markdown-body table td {
+    border: 1px solid #ccc;
+    padding: 6px 13px;
+}
+.markdown-body table tr {
+    border-top: 1px solid #ccc;
+    background-color: #fff;
+}
+.markdown-body table tr:nth-child(2n) {
+    background-color: #f8f8f8;
+}
+.markdown-body img {
+    max-width: 100%;
+    -moz-box-sizing: border-box;
+    box-sizing: border-box;
+}
+.markdown-body span.frame {
+    display: block;
+    overflow: hidden;
+}
+.markdown-body span.frame>span {
+    border: 1px solid #ddd;
+    display: block;
+    float: left;
+    overflow: hidden;
+    margin: 13px 0 0;
+    padding: 7px;
+    width: auto;
+}
+.markdown-body span.frame span img {
+    display: block;
+    float: left;
+}
+.markdown-body span.frame span span {
+    clear: both;
+    color: #333;
+    display: block;
+    padding: 5px 0 0;
+}
+.markdown-body span.align-center {
+    display: block;
+    overflow: hidden;
+    clear: both;
+}
+.markdown-body span.align-center>span {
+    display: block;
+    overflow: hidden;
+    margin: 13px auto 0;
+    text-align: center;
+}
+.markdown-body span.align-center span img {
+    margin: 0 auto;
+    text-align: center;
+}
+.markdown-body span.align-right {
+    display: block;
+    overflow: hidden;
+    clear: both;
+}
+.markdown-body span.align-right>span {
+    display: block;
+    overflow: hidden;
+    margin: 13px 0 0;
+    text-align: right;
+}
+.markdown-body span.align-right span img {
+    margin: 0;
+    text-align: right;
+}
+.markdown-body span.float-left {
+    display: block;
+    margin-right: 13px;
+    overflow: hidden;
+    float: left;
+}
+.markdown-body span.float-left span {
+    margin: 13px 0 0;
+}
+.markdown-body span.float-right {
+    display: block;
+    margin-left: 13px;
+    overflow: hidden;
+    float: right;
+}
+.markdown-body span.float-right>span {
+    display: block;
+    overflow: hidden;
+    margin: 13px auto 0;
+    text-align: right;
+}
+.markdown-body code, .markdown-body tt {
+    margin: 0 2px;
+    padding: 0px 5px;
+    border: 1px solid #eaeaea;
+    background-color: #f8f8f8;
+    border-radius: 3px;
+}
+.markdown-body code {
+    white-space: nowrap;
+}
+.markdown-body pre>code {
+    margin: 0;
+    padding: 0;
+    white-space: pre;
+    border: none;
+    background: transparent;
+}
+.markdown-body .highlight pre, .markdown-body pre {
+    background-color: #f8f8f8;
+    border: 1px solid #ccc;
+    font-size: 13px;
+    line-height: 19px;
+    overflow: auto;
+    padding: 6px 10px;
+    border-radius: 3px;
+}
+.markdown-body pre code, .markdown-body pre tt {
+    margin: 0;
+    padding: 0;
+    background-color: transparent;
+    border: none;
+}
+</style>
+EOT
+        # Emit the converted document inside the container element that the
+        # .markdown-body rules of the stylesheet printed above apply to.
+        print "<div class='markdown-body'>";
+        print Markdown($text);
+        print "</div>";
+    }
+}
+sub Markdown {
+#
+# Main function: takes Markdown source text and returns the converted text
+# followed by a newline. The order of the stages below is essential. Link
+# and image substitutions need to happen before _EscapeSpecialChars(), so
+# that any *'s or _'s in the <a> and <img> tags get encoded.
+#
+        my ($markup) = @_;
+        # Reset the per-document lookup tables. Without this, a page that
+        # renders more than one article (e.g. an index of the N most recent
+        # articles) would see definitions left over from earlier articles.
+        %$_ = () for (\%g_urls, \%g_titles, \%g_html_blocks);
+        # Standardize line endings:
+        $markup =~ s{\r\n}{\n}g;        # DOS to Unix
+        $markup =~ tr{\r}{\n};          # Mac to Unix
+        # Make sure the text ends with a couple of newlines:
+        $markup .= "\n\n";
+        # Convert all tabs to spaces.
+        $markup = _Detab($markup);
+        # Blank out lines consisting only of spaces and tabs, so that later
+        # patterns can match consecutive blank lines with a plain /\n+/.
+        $markup =~ s/^[ \t]+$//mg;
+        # Remaining stages, applied in this exact order:
+        #   1. turn block-level HTML blocks into hash entries
+        #   2. strip link definitions and store them in the global hashes
+        #   3. run the block-level conversions
+        #   4. restore the escaped special characters
+        my @stages = (
+                \&_HashHTMLBlocks,
+                \&_StripLinkDefinitions,
+                \&_RunBlockGamut,
+                \&_UnescapeSpecialChars,
+        );
+        foreach my $stage (@stages) {
+                $markup = $stage->($markup);
+        }
+        return $markup . "\n";
+}
+sub _StripLinkDefinitions {
+#
+# Strips link definitions from text and returns the remaining text.
+# Each definition's URL is stored in %g_urls and its optional title in
+# %g_titles, both keyed by the lower-cased link id.
+#
+        my $text = shift;
+        my $less_than_tab = $g_tab_width - 1;
+        # Link defs are in the form: ^[id]: url "optional title"
+        # No /g flag: each pass of the loop removes one definition, and the
+        # loop ends when none is left.
+        while ($text =~ s{
+                                                ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
+                                                  [ \t]*
+                                                  \n?                           # maybe *one* newline
+                                                  [ \t]*
+                                                <?(\S+?)>?                      # url = $2
+                                                  [ \t]*
+                                                  \n?                           # maybe one newline
+                                                  [ \t]*
+                                                (?:
+                                                        (?<=\s)                 # lookbehind for whitespace
+                                                        ["(]
+                                                        (.+?)                   # title = $3
+                                                        [")]
+                                                        [ \t]*
+                                                )?      # title is optional
+                                                (?:\n+|\Z)
+                                        }
+                                        {}mx) {
+                # Copy the captures into lexicals straight away, and lower-case
+                # the id once: link IDs are case-insensitive.
+                my ($id, $url, $title) = (lc($1), $2, $3);
+                $g_urls{$id} = _EncodeAmpsAndAngles( $url );
+                # Test definedness, not truth, so that a title consisting of
+                # the single character "0" is kept.
+                if (defined $title) {
+                        $title =~ s/"/&quot;/g;
+                        $g_titles{$id} = $title;
+                }
+        }
+        return $text;
+}
+sub _HashHTMLBlocks {
+        my $text = shift;
+        my $less_than_tab = $g_tab_width - 1;
+        # Hashify HTML blocks:
+        # We only want to do this for block-level HTML tags, such as headers,
+        # lists, and tables. That's because we still want to wrap <p>s around
+        # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
+        # phrase emphasis, and spans. The list of tags we're looking for is
+        # hard-coded:
+        my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
+        my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
+        # First, look for nested blocks, e.g.:
+        #       <div>
+        #               <div>
+        #               tags for inner block must be indented.
+        #               </div>
+        #       </div>
+        #
+        # The outermost tags must start at the left margin for this to match, and
+        # the inner nested divs must be indented.
+        # We need to do this before the next, more liberal match, because the next
+        # match will start at the first `<div>` and stop at the first `</div>`.
+        $text =~ s{
+                                (                                               # save in $1
+                                        ^                                       # start of line  (with /m)
+                                        <($block_tags_a)        # start tag = $2
+                                        \b                                      # word break
+                                        (.*\n)*?                        # any number of lines, minimally matching
+                                        </\2>                           # the matching end tag
+                                        [ \t]*                          # trailing spaces/tabs
+                                        (?=\n+|\Z)      # followed by a newline or end of document
+                                )
+                        }{
+                                my $key = md5_hex($1);
+                                $g_html_blocks{$key} = $1;
+                                "\n\n" . $key . "\n\n";
+                        }egmx;
+        #
+        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
+        #
+        $text =~ s{
+                                (                                               # save in $1
+                                        ^                                       # start of line  (with /m)
+                                        <($block_tags_b)        # start tag = $2
+                                        \b                                      # word break
+                                        (.*\n)*?                        # any number of lines, minimally matching
+                                        .*</\2>                         # the matching end tag
+                                        [ \t]*                          # trailing spaces/tabs
+                                        (?=\n+|\Z)      # followed by a newline or end of document
+                                )
+                        }{
+                                my $key = md5_hex($1);
+                                $g_html_blocks{$key} = $1;
+                                "\n\n" . $key . "\n\n";
+                        }egmx;
+        # Special case just for <hr />. It was easier to make a special case than
+        # to make the other regex more complicated.     
+        $text =~ s{
+                                (?:
+                                        (?<=\n\n)               # Starting after a blank line
+                                        |                               # or
+                                        \A\n?                   # the beginning of the doc
+                                )
+                                (                                               # save in $1
+                                        [ ]{0,$less_than_tab}
+                                        <(hr)                           # start tag = $2
+                                        \b                                      # word break
+                                        ([^<>])*?                       # 
+                                        /?>                                     # the matching end tag
+                                        [ \t]*
+                                        (?=\n{2,}|\Z)           # followed by a blank line or end of document
+                                )
+                        }{
+                                my $key = md5_hex($1);
+                                $g_html_blocks{$key} = $1;
+                                "\n\n" . $key . "\n\n";
+                        }egx;
+        # Special case for standalone HTML comments:
+        $text =~ s{
+                                (?:
+                                        (?<=\n\n)               # Starting after a blank line
+                                        |                               # or
+                                        \A\n?                   # the beginning of the doc
+                                )
author	Jason A. Donenfeld <Jason@zx2c4.com>	2013-05-28 04:39:43 (JST)
committer	Jason A. Donenfeld <Jason@zx2c4.com>	2013-05-28 04:54:16 (JST)
commit	8149be213f1c8f52b0dbe6c213f6073af57fa954 (patch)
tree	e4d0315f53022bb7335f782ad394d8e7602f1b52 /filters
parent	dcbc0438b2543a733858d62170f3110a89edbed6 (diff)
download	cgit-8149be213f1c8f52b0dbe6c213f6073af57fa954.zip cgit-8149be213f1c8f52b0dbe6c213f6073af57fa954.tar.gz

diff --git a/filters/about-formatting.sh b/filters/about-formatting.sh new file mode 100755 index 0000000..313a4e6 --- /dev/null +++ b/filters/about-formatting.sh
@@ -0,0 +1,27 @@
	1	#!/bin/sh
	2
	3	# This may be used with the about-filter or repo.about-filter setting in cgitrc.
	4	# It passes formatting of about pages to differing programs, depending on the usage.
	5
	6	# Markdown support requires perl.
	7	# RestructuredText support requires python and docutils.
	8	# Man page support requires groff.
	9
	10	# The following environment variables can be used to retrieve the configuration
	11	# of the repository for which this script is called:
	12	# CGIT_REPO_URL ( = repo.url setting )
	13	# CGIT_REPO_NAME ( = repo.name setting )
	14	# CGIT_REPO_PATH ( = repo.path setting )
	15	# CGIT_REPO_OWNER ( = repo.owner setting )
	16	# CGIT_REPO_DEFBRANCH ( = repo.defbranch setting )
	17	# CGIT_REPO_SECTION ( = section setting )
	18	# CGIT_REPO_CLONE_URL ( = repo.clone-url setting )
	19
	20	cd "$(dirname $0)/html-converters/"
	21	case "$(tr '[:upper:]' '[:lower:]' <<<"$1")" in
	22	.md\|.mkd) exec ./md2html; ;;
	23	*.rst) exec ./rst2html; ;;
	24	*.[1-9]) exec ./man2html; ;;
	25	.htm\|.html) exec cat; ;;
	26	.txt\|) exec ./txt2html; ;;
	27	esac


diff --git a/filters/html-converters/man2html b/filters/html-converters/man2html new file mode 100755 index 0000000..1b28437 --- /dev/null +++ b/filters/html-converters/man2html
@@ -0,0 +1,5 @@
	1	#!/bin/sh
	2	echo "<div style=\"font-family: monospace\">"
	3	groff -mandoc -T html -P -r -P -l \| egrep -v '(<html>\|<head>\|<meta\|<title>\|</title>\|</head>\|<body>\|</body>\|</html>\|<!DOCTYPE\|"http://www.w3.org)'
	4	echo "</div>"
	5


diff --git a/filters/html-converters/md2html b/filters/html-converters/md2html new file mode 100755 index 0000000..5cab749 --- /dev/null +++ b/filters/html-converters/md2html
@@ -0,0 +1,2 @@
	1	#!/bin/sh
	2	exec "$(dirname "$0")/resources/markdown.pl"


diff --git a/filters/html-converters/resources/markdown.pl b/filters/html-converters/resources/markdown.pl new file mode 100755 index 0000000..abec173 --- /dev/null +++ b/filters/html-converters/resources/markdown.pl
@@ -0,0 +1,1731 @@
	1	#!/usr/bin/perl
	2
	3	#
	4	# Markdown -- A text-to-HTML conversion tool for web writers
	5	#
	6	# Copyright (c) 2004 John Gruber
	7	# <http://daringfireball.net/projects/markdown/>
	8	#
	9
	10
	11	package Markdown;
	12	require 5.006_000;
	13	use strict;
	14	use warnings;
	15
	16	use Digest::MD5 qw(md5_hex);
	17	use vars qw($VERSION);
	18	$VERSION = '1.0.1';
	19	# Tue 14 Dec 2004
	20
	21	## NOTE: originally marked "Disabled; causes problems under Perl 5.6.1", but the two lines below are in fact enabled:
	22	use utf8;
	23	binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
	24
	25
	26	#
	27	# Global default settings:
	28	#
	29	my $g_empty_element_suffix = " />"; # Change to ">" for HTML output
	30	my $g_tab_width = 4;
	31
	32
	33	#
	34	# Globals:
	35	#
	36
	37	# Regex to match balanced [brackets]. See Friedl's
	38	# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
	39	my $g_nested_brackets;
	40	$g_nested_brackets = qr{
	41	(?> # Atomic matching
	42	[^\[\]]+ # Anything other than brackets
	43	\|
	44	\[
	45	(??{ $g_nested_brackets }) # Recursive set of nested brackets
	46	\]
	47	)*
	48	}x;
	49
	50
	51	# Table of hash values for escaped characters:
	52	my %g_escape_table;
	53	foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
	54	$g_escape_table{$char} = md5_hex($char);
	55	}
	56
	57
	58	# Global hashes, used by various utility routines
	59	my %g_urls;
	60	my %g_titles;
	61	my %g_html_blocks;
	62
	63	# Used to track when we're inside an ordered or unordered list
	64	# (see _ProcessListItems() for details):
	65	my $g_list_level = 0;
	66
	67
	68	#### Blosxom plug-in interface ##########################################
	69
	70	# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
	71	# which posts Markdown should process, using a "meta-markup: markdown"
	72	# header. If it's set to 0 (the default), Markdown will process all
	73	# entries.
	74	my $g_blosxom_use_meta = 0;
	75
	76	sub start { 1; }
	77	sub story {
	78	my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
	79
	80	if ( (! $g_blosxom_use_meta) or
	81	(defined($meta::markup) and ($meta::markup =~ /^\smarkdown\s$/i))
	82	){
	83	$$body_ref = Markdown($$body_ref);
	84	}
	85	1;
	86	}
	87
	88
	89	#### Movable Type plug-in interface #####################################
	90	eval {require MT}; # Test to see if we're running in MT.
	91	unless ($@) {
	92	require MT;
	93	import MT;
	94	require MT::Template::Context;
	95	import MT::Template::Context;
	96
	97	eval {require MT::Plugin}; # Test to see if we're running >= MT 3.0.
	98	unless ($@) {
	99	require MT::Plugin;
	100	import MT::Plugin;
	101	my $plugin = new MT::Plugin({
	102	name => "Markdown",
	103	description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
	104	doc_link => 'http://daringfireball.net/projects/markdown/'
	105	});
	106	MT->add_plugin( $plugin );
	107	}
	108
	109	MT::Template::Context->add_container_tag(MarkdownOptions => sub {
	110	my $ctx = shift;
	111	my $args = shift;
	112	my $builder = $ctx->stash('builder');
	113	my $tokens = $ctx->stash('tokens');
	114
	115	if (defined ($args->{'output'}) ) {
	116	$ctx->stash('markdown_output', lc $args->{'output'});
	117	}
	118
	119	defined (my $str = $builder->build($ctx, $tokens) )
	120	or return $ctx->error($builder->errstr);
	121	$str; # return value
	122	});
	123
	124	MT->add_text_filter('markdown' => {
	125	label => 'Markdown',
	126	docs => 'http://daringfireball.net/projects/markdown/',
	127	on_format => sub {
	128	my $text = shift;
	129	my $ctx = shift;
	130	my $raw = 0;
	131	if (defined $ctx) {
	132	my $output = $ctx->stash('markdown_output');
	133	if (defined $output && $output =~ m/^html/i) {
	134	$g_empty_element_suffix = ">";
	135	$ctx->stash('markdown_output', '');
	136	}
	137	elsif (defined $output && $output eq 'raw') {
	138	$raw = 1;
	139	$ctx->stash('markdown_output', '');
	140	}
	141	else {
	142	$raw = 0;
	143	$g_empty_element_suffix = " />";
	144	}
	145	}
	146	$text = $raw ? $text : Markdown($text);
	147	$text;
	148	},
	149	});
	150
	151	# If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
	152	my $smartypants;
	153
	154	{
	155	no warnings "once";
	156	$smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
	157	}
	158
	159	if ($smartypants) {
	160	MT->add_text_filter('markdown_with_smartypants' => {
	161	label => 'Markdown With SmartyPants',
	162	docs => 'http://daringfireball.net/projects/markdown/',
	163	on_format => sub {
	164	my $text = shift;
	165	my $ctx = shift;
	166	if (defined $ctx) {
	167	my $output = $ctx->stash('markdown_output');
	168	if (defined $output && $output eq 'html') {
	169	$g_empty_element_suffix = ">";
	170	}
	171	else {
	172	$g_empty_element_suffix = " />";
	173	}
	174	}
	175	$text = Markdown($text);
	176	$text = $smartypants->($text, '1');
	177	},
	178	});
	179	}
	180	}
	181	else {
	182	#### BBEdit/command-line text filter interface ##########################
	183	# Needs to be hidden from MT (and Blosxom when running in static mode).
	184
	185	# We're only using $blosxom::version once; tell Perl not to warn us:
	186	no warnings 'once';
	187	unless ( defined($blosxom::version) ) {
	188	use warnings;
	189
	190	#### Check for command-line switches: #################
	191	my %cli_opts;
	192	use Getopt::Long;
	193	Getopt::Long::Configure('pass_through');
	194	GetOptions(\%cli_opts,
	195	'version',
	196	'shortversion',
	197	'html4tags',
	198	);
	199	if ($cli_opts{'version'}) { # Version info
	200	print "\nThis is Markdown, version $VERSION.\n";
	201	print "Copyright 2004 John Gruber\n";
	202	print "http://daringfireball.net/projects/markdown/\n\n";
	203	exit 0;
	204	}
	205	if ($cli_opts{'shortversion'}) { # Just the version number string.
	206	print $VERSION;
	207	exit 0;
	208	}
	209	if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
	210	$g_empty_element_suffix = ">";
	211	}
	212
	213
	214	#### Process incoming text: ###########################
	215	my $text;
	216	{
	217	local $/; # Slurp the whole file
	218	$text = <>;
	219	}
	220	print <<'EOT';
	221	<style>
	222	.markdown-body {
	223	font-size: 14px;
	224	line-height: 1.6;
	225	overflow: hidden;
	226	}
	227	.markdown-body>*:first-child {
	228	margin-top: 0 !important;
	229	}
	230	.markdown-body>*:last-child {
	231	margin-bottom: 0 !important;
	232	}
	233	.markdown-body a.absent {
	234	color: #c00;
	235	}
	236	.markdown-body a.anchor {
	237	display: block;
	238	padding-left: 30px;
	239	margin-left: -30px;
	240	cursor: pointer;
	241	position: absolute;
	242	top: 0;
	243	left: 0;
	244	bottom: 0;
	245	}
	246	.markdown-body h1, .markdown-body h2, .markdown-body h3, .markdown-body h4, .markdown-body h5, .markdown-body h6 {
	247	margin: 20px 0 10px;
	248	padding: 0;
	249	font-weight: bold;
	250	-webkit-font-smoothing: antialiased;
	251	cursor: text;
	252	position: relative;
	253	}
	254	.markdown-body h1 .mini-icon-link, .markdown-body h2 .mini-icon-link, .markdown-body h3 .mini-icon-link, .markdown-body h4 .mini-icon-link, .markdown-body h5 .mini-icon-link, .markdown-body h6 .mini-icon-link {
	255	display: none;
	256	color: #000;
	257	}
	258	.markdown-body h1:hover a.anchor, .markdown-body h2:hover a.anchor, .markdown-body h3:hover a.anchor, .markdown-body h4:hover a.anchor, .markdown-body h5:hover a.anchor, .markdown-body h6:hover a.anchor {
	259	text-decoration: none;
	260	line-height: 1;
	261	padding-left: 0;
	262	margin-left: -22px;
	263	top: 15%}
	264	.markdown-body h1:hover a.anchor .mini-icon-link, .markdown-body h2:hover a.anchor .mini-icon-link, .markdown-body h3:hover a.anchor .mini-icon-link, .markdown-body h4:hover a.anchor .mini-icon-link, .markdown-body h5:hover a.anchor .mini-icon-link, .markdown-body h6:hover a.anchor .mini-icon-link {
	265	display: inline-block;
	266	}
	267	.markdown-body h1 tt, .markdown-body h1 code, .markdown-body h2 tt, .markdown-body h2 code, .markdown-body h3 tt, .markdown-body h3 code, .markdown-body h4 tt, .markdown-body h4 code, .markdown-body h5 tt, .markdown-body h5 code, .markdown-body h6 tt, .markdown-body h6 code {
	268	font-size: inherit;
	269	}
	270	.markdown-body h1 {
	271	font-size: 28px;
	272	color: #000;
	273	}
	274	.markdown-body h2 {
	275	font-size: 24px;
	276	border-bottom: 1px solid #ccc;
	277	color: #000;
	278	}
	279	.markdown-body h3 {
	280	font-size: 18px;
	281	}
	282	.markdown-body h4 {
	283	font-size: 16px;
	284	}
	285	.markdown-body h5 {
	286	font-size: 14px;
	287	}
	288	.markdown-body h6 {
	289	color: #777;
	290	font-size: 14px;
	291	}
	292	.markdown-body p, .markdown-body blockquote, .markdown-body ul, .markdown-body ol, .markdown-body dl, .markdown-body table, .markdown-body pre {
	293	margin: 15px 0;
	294	}
	295	.markdown-body hr {
	296	background: transparent url("/dirty-shade.png") repeat-x 0 0;
	297	border: 0 none;
	298	color: #ccc;
	299	height: 4px;
	300	padding: 0;
	301	}
	302	.markdown-body>h2:first-child, .markdown-body>h1:first-child, .markdown-body>h1:first-child+h2, .markdown-body>h3:first-child, .markdown-body>h4:first-child, .markdown-body>h5:first-child, .markdown-body>h6:first-child {
	303	margin-top: 0;
	304	padding-top: 0;
	305	}
	306	.markdown-body a:first-child h1, .markdown-body a:first-child h2, .markdown-body a:first-child h3, .markdown-body a:first-child h4, .markdown-body a:first-child h5, .markdown-body a:first-child h6 {
	307	margin-top: 0;
	308	padding-top: 0;
	309	}
	310	.markdown-body h1+p, .markdown-body h2+p, .markdown-body h3+p, .markdown-body h4+p, .markdown-body h5+p, .markdown-body h6+p {
	311	margin-top: 0;
	312	}
	313	.markdown-body li p.first {
	314	display: inline-block;
	315	}
	316	.markdown-body ul, .markdown-body ol {
	317	padding-left: 30px;
	318	}
	319	.markdown-body ul.no-list, .markdown-body ol.no-list {
	320	list-style-type: none;
	321	padding: 0;
	322	}
	323	.markdown-body ul li>:first-child, .markdown-body ul li ul:first-of-type, .markdown-body ul li ol:first-of-type, .markdown-body ol li>:first-child, .markdown-body ol li ul:first-of-type, .markdown-body ol li ol:first-of-type {
	324	margin-top: 0px;
	325	}
	326	.markdown-body ul li p:last-of-type, .markdown-body ol li p:last-of-type {
	327	margin-bottom: 0;
	328	}
	329	.markdown-body ul ul, .markdown-body ul ol, .markdown-body ol ol, .markdown-body ol ul {
	330	margin-bottom: 0;
	331	}
	332	.markdown-body dl {
	333	padding: 0;
	334	}
	335	.markdown-body dl dt {
	336	font-size: 14px;
	337	font-weight: bold;
	338	font-style: italic;
	339	padding: 0;
	340	margin: 15px 0 5px;
	341	}
	342	.markdown-body dl dt:first-child {
	343	padding: 0;
	344	}
	345	.markdown-body dl dt>:first-child {
	346	margin-top: 0px;
	347	}
	348	.markdown-body dl dt>:last-child {
	349	margin-bottom: 0px;
	350	}
	351	.markdown-body dl dd {
	352	margin: 0 0 15px;
	353	padding: 0 15px;
	354	}
	355	.markdown-body dl dd>:first-child {
	356	margin-top: 0px;
	357	}
	358	.markdown-body dl dd>:last-child {
	359	margin-bottom: 0px;
	360	}
	361	.markdown-body blockquote {
	362	border-left: 4px solid #DDD;
	363	padding: 0 15px;
	364	color: #777;
	365	}
	366	.markdown-body blockquote>:first-child {
	367	margin-top: 0px;
	368	}
	369	.markdown-body blockquote>:last-child {
	370	margin-bottom: 0px;
	371	}
	372	.markdown-body table th {
	373	font-weight: bold;
	374	}
	375	.markdown-body table th, .markdown-body table td {
	376	border: 1px solid #ccc;
	377	padding: 6px 13px;
	378	}
	379	.markdown-body table tr {
	380	border-top: 1px solid #ccc;
	381	background-color: #fff;
	382	}
	383	.markdown-body table tr:nth-child(2n) {
	384	background-color: #f8f8f8;
	385	}
	386	.markdown-body img {
	387	max-width: 100%;
	388	-moz-box-sizing: border-box;
	389	box-sizing: border-box;
	390	}
	391	.markdown-body span.frame {
	392	display: block;
	393	overflow: hidden;
	394	}
	395	.markdown-body span.frame>span {
	396	border: 1px solid #ddd;
	397	display: block;
	398	float: left;
	399	overflow: hidden;
	400	margin: 13px 0 0;
	401	padding: 7px;
	402	width: auto;
	403	}
	404	.markdown-body span.frame span img {
	405	display: block;
	406	float: left;
	407	}
	408	.markdown-body span.frame span span {
	409	clear: both;
	410	color: #333;
	411	display: block;
	412	padding: 5px 0 0;
	413	}
	414	.markdown-body span.align-center {
	415	display: block;
	416	overflow: hidden;
	417	clear: both;
	418	}
	419	.markdown-body span.align-center>span {
	420	display: block;
	421	overflow: hidden;
	422	margin: 13px auto 0;
	423	text-align: center;
	424	}
	425	.markdown-body span.align-center span img {
	426	margin: 0 auto;
	427	text-align: center;
	428	}
	429	.markdown-body span.align-right {
	430	display: block;
	431	overflow: hidden;
	432	clear: both;
	433	}
	434	.markdown-body span.align-right>span {
	435	display: block;
	436	overflow: hidden;
	437	margin: 13px 0 0;
	438	text-align: right;
	439	}
	440	.markdown-body span.align-right span img {
	441	margin: 0;
	442	text-align: right;
	443	}
	444	.markdown-body span.float-left {
	445	display: block;
	446	margin-right: 13px;
	447	overflow: hidden;
	448	float: left;
	449	}
	450	.markdown-body span.float-left span {
	451	margin: 13px 0 0;
	452	}
	453	.markdown-body span.float-right {
	454	display: block;
	455	margin-left: 13px;
	456	overflow: hidden;
	457	float: right;
	458	}
	459	.markdown-body span.float-right>span {
	460	display: block;
	461	overflow: hidden;
	462	margin: 13px auto 0;
	463	text-align: right;
	464	}
	465	.markdown-body code, .markdown-body tt {
	466	margin: 0 2px;
	467	padding: 0px 5px;
	468	border: 1px solid #eaeaea;
	469	background-color: #f8f8f8;
	470	border-radius: 3px;
	471	}
	472	.markdown-body code {
	473	white-space: nowrap;
	474	}
	475	.markdown-body pre>code {
	476	margin: 0;
	477	padding: 0;
	478	white-space: pre;
	479	border: none;
	480	background: transparent;
	481	}
	482	.markdown-body .highlight pre, .markdown-body pre {
	483	background-color: #f8f8f8;
	484	border: 1px solid #ccc;
	485	font-size: 13px;
	486	line-height: 19px;
	487	overflow: auto;
	488	padding: 6px 10px;
	489	border-radius: 3px;
	490	}
	491	.markdown-body pre code, .markdown-body pre tt {
	492	margin: 0;
	493	padding: 0;
	494	background-color: transparent;
	495	border: none;
	496	}
	497	</style>
	498	EOT
	499	print "<div class='markdown-body'>";
	500	print Markdown($text);
	501	print "</div>";
	502	}
	503	}
	504
	505
	506
	507	sub Markdown {
	508	#
	509	# Main function. The order in which other subs are called here is
	510	# essential. Link and image substitutions need to happen before
	511	# _EscapeSpecialChars(), so that any *'s or _'s in the <a>
	512	# and <img> tags get encoded.
	513	#
	514	my $text = shift;
	515
	516	# Clear the global hashes. If we don't clear these, you get conflicts
	517	# from other articles when generating a page which contains more than
	518	# one article (e.g. an index page that shows the N most recent
	519	# articles):
	520	%g_urls = ();
	521	%g_titles = ();
	522	%g_html_blocks = ();
	523
	524
	525	# Standardize line endings:
	526	$text =~ s{\r\n}{\n}g; # DOS to Unix
	527	$text =~ s{\r}{\n}g; # Mac to Unix
	528
	529	# Make sure $text ends with a couple of newlines:
	530	$text .= "\n\n";
	531
	532	# Convert all tabs to spaces.
	533	$text = _Detab($text);
	534
	535	# Strip any lines consisting only of spaces and tabs.
	536	# This makes subsequent regexen easier to write, because we can
	537	# match consecutive blank lines with /\n+/ instead of something
	538	# contorted like /[ \t]*\n+/ .
	539	$text =~ s/^[ \t]+$//mg;
	540
	541	# Turn block-level HTML blocks into hash entries
	542	$text = _HashHTMLBlocks($text);
	543
	544	# Strip link definitions, store in hashes.
	545	$text = _StripLinkDefinitions($text);
	546
	547	$text = _RunBlockGamut($text);
	548
	549	$text = _UnescapeSpecialChars($text);
	550
	551	return $text . "\n";
	552	}
	553
	554
	555	sub _StripLinkDefinitions {
	556	#
	557	# Strips link definitions from text, stores the URLs and titles in
	558	# hash references.
	559	#
	560	my $text = shift;
	561	my $less_than_tab = $g_tab_width - 1;
	562
	563	# Link defs are in the form: ^[id]: url "optional title"
	564	while ($text =~ s{
	565	^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
	566	[ \t]*
	567	\n? # maybe one newline
	568	[ \t]*
	569	<?(\S+?)>? # url = $2
	570	[ \t]*
	571	\n? # maybe one newline
	572	[ \t]*
	573	(?:
	574	(?<=\s) # lookbehind for whitespace
	575	["(]
	576	(.+?) # title = $3
	577	[")]
	578	[ \t]*
	579	)? # title is optional
	580	(?:\n+\|\Z)
	581	}
	582	{}mx) {
	583	$g_urls{lc $1} = _EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
	584	if ($3) {
	585	$g_titles{lc $1} = $3;
	586	$g_titles{lc $1} =~ s/"/"/g;
	587	}
	588	}
	589
	590	return $text;
	591	}
	592
	593
	594	sub _HashHTMLBlocks {
	595	my $text = shift;
	596	my $less_than_tab = $g_tab_width - 1;
	597
	598	# Hashify HTML blocks:
	599	# We only want to do this for block-level HTML tags, such as headers,
	600	# lists, and tables. That's because we still want to wrap <p>s around
	601	# "paragraphs" that are wrapped in non-block-level tags, such as anchors,
	602	# phrase emphasis, and spans. The list of tags we're looking for is
	603	# hard-coded:
	604	my $block_tags_a = qr/p\|div\|h[1-6]\|blockquote\|pre\|table\|dl\|ol\|ul\|script\|noscript\|form\|fieldset\|iframe\|math\|ins\|del/;
	605	my $block_tags_b = qr/p\|div\|h[1-6]\|blockquote\|pre\|table\|dl\|ol\|ul\|script\|noscript\|form\|fieldset\|iframe\|math/;
	606
	607	# First, look for nested blocks, e.g.:
	608	# <div>
	609	# <div>
	610	# tags for inner block must be indented.
	611	# </div>
	612	# </div>
	613	#
	614	# The outermost tags must start at the left margin for this to match, and
	615	# the inner nested divs must be indented.
	616	# We need to do this before the next, more liberal match, because the next
	617	# match will start at the first `<div>` and stop at the first `</div>`.
	618	$text =~ s{
	619	( # save in $1
	620	^ # start of line (with /m)
	621	<($block_tags_a) # start tag = $2
	622	\b # word break
	623	(.\n)? # any number of lines, minimally matching
	624	</\2> # the matching end tag
	625	[ \t]* # trailing spaces/tabs
	626	(?=\n+\|\Z) # followed by a newline or end of document
	627	)
	628	}{
	629	my $key = md5_hex($1);
	630	$g_html_blocks{$key} = $1;
	631	"\n\n" . $key . "\n\n";
	632	}egmx;
	633
	634
	635	#
	636	# Now match more liberally, simply from `\n<tag>` to `</tag>\n`
	637	#
	638	$text =~ s{
	639	( # save in $1
	640	^ # start of line (with /m)
	641	<($block_tags_b) # start tag = $2
	642	\b # word break
	643	(.\n)? # any number of lines, minimally matching
	644	.*</\2> # the matching end tag
	645	[ \t]* # trailing spaces/tabs
	646	(?=\n+\|\Z) # followed by a newline or end of document
	647	)
	648	}{
	649	my $key = md5_hex($1);
	650	$g_html_blocks{$key} = $1;
	651	"\n\n" . $key . "\n\n";
	652	}egmx;
	653	# Special case just for <hr />. It was easier to make a special case than
	654	# to make the other regex more complicated.
	655	$text =~ s{
	656	(?:
	657	(?<=\n\n) # Starting after a blank line
	658	\| # or
	659	\A\n? # the beginning of the doc
	660	)
	661	( # save in $1
	662	[ ]{0,$less_than_tab}
	663	<(hr) # start tag = $2
	664	\b # word break
	665	([^<>])*? #
	666	/?> # the matching end tag
	667	[ \t]*
	668	(?=\n{2,}\|\Z) # followed by a blank line or end of document
	669	)
	670	}{
	671	my $key = md5_hex($1);
	672	$g_html_blocks{$key} = $1;
	673	"\n\n" . $key . "\n\n";
	674	}egx;
	675
	676	# Special case for standalone HTML comments:
	677	$text =~ s{
	678	(?:
	679	(?<=\n\n) # Starting after a blank line
	680	\| # or
	681	\A\n? # the beginning of the doc
	682	)