changelog shortlog tags branches changeset files revisions annotate raw help

Mercurial > hg > werc / bin/contrib/markdown.pl

changeset 672: cd59fedffc2e
parent: c4b07f81f234
author: sl
date: Fri, 02 Dec 2016 20:35:58 -0500
permissions: -rwxr-xr-x
description: etc/initrc: change default markdown formatter from markdown.pl (no modern perl on plan 9) to md2html.awk (works everywhere)
1 #!/usr/bin/env perl
2 #
3 # Markdown -- A text-to-HTML conversion tool for web writers
4 #
5 # Copyright (c) 2004 John Gruber
6 # <http://daringfireball.net/projects/markdown/>
7 #
8 package Markdown;
9 require 5.006_000;
10 use strict;
11 use warnings;
12 
13 use Digest::MD5 qw(md5_hex);
14 use vars qw($VERSION);
15 $VERSION = '1.0.1';
16 # Tue 14 Dec 2004
17 
18 ## Disabled; causes problems under Perl 5.6.1:
19 # use utf8;
20 # binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
21 
22 
23 #
24 # Global default settings:
25 #
26 my $g_empty_element_suffix = " />"; # Change to ">" for HTML output
27 my $g_tab_width = 4;
28 
29 
30 #
31 # Globals:
32 #
33 
34 # Regex to match balanced [brackets]. See Friedl's
35 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
36 my $g_nested_brackets;
37 $g_nested_brackets = qr{
38  (?> # Atomic matching
39  [^\[\]]+ # Anything other than brackets
40  |
41  \[
42  (??{ $g_nested_brackets }) # Recursive set of nested brackets
43  \]
44  )*
45 }x;
46 
47 
48 # Table of hash values for escaped characters:
49 my %g_escape_table;
50 foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
51  $g_escape_table{$char} = md5_hex($char);
52 }
53 
54 
55 # Global hashes, used by various utility routines
56 my %g_urls;
57 my %g_titles;
58 my %g_html_blocks;
59 
60 # Used to track when we're inside an ordered or unordered list
61 # (see _ProcessListItems() for details):
62 my $g_list_level = 0;
63 
64 
65 #### Blosxom plug-in interface ##########################################
66 
67 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
68 # which posts Markdown should process, using a "meta-markup: markdown"
69 # header. If it's set to 0 (the default), Markdown will process all
70 # entries.
71 my $g_blosxom_use_meta = 0;
72 
73 sub start { 1; }
74 sub story {
75  my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
76 
77  if ( (! $g_blosxom_use_meta) or
78  (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
79  ){
80  $$body_ref = Markdown($$body_ref);
81  }
82  1;
83 }
84 
85 
86 #### Movable Type plug-in interface #####################################
87 eval {require MT}; # Test to see if we're running in MT.
88 unless ($@) {
89  require MT;
90  import MT;
91  require MT::Template::Context;
92  import MT::Template::Context;
93 
94  eval {require MT::Plugin}; # Test to see if we're running >= MT 3.0.
95  unless ($@) {
96  require MT::Plugin;
97  import MT::Plugin;
98  my $plugin = new MT::Plugin({
99  name => "Markdown",
100  description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
101  doc_link => 'http://daringfireball.net/projects/markdown/'
102  });
103  MT->add_plugin( $plugin );
104  }
105 
106  MT::Template::Context->add_container_tag(MarkdownOptions => sub {
107  my $ctx = shift;
108  my $args = shift;
109  my $builder = $ctx->stash('builder');
110  my $tokens = $ctx->stash('tokens');
111 
112  if (defined ($args->{'output'}) ) {
113  $ctx->stash('markdown_output', lc $args->{'output'});
114  }
115 
116  defined (my $str = $builder->build($ctx, $tokens) )
117  or return $ctx->error($builder->errstr);
118  $str; # return value
119  });
120 
121  MT->add_text_filter('markdown' => {
122  label => 'Markdown',
123  docs => 'http://daringfireball.net/projects/markdown/',
124  on_format => sub {
125  my $text = shift;
126  my $ctx = shift;
127  my $raw = 0;
128  if (defined $ctx) {
129  my $output = $ctx->stash('markdown_output');
130  if (defined $output && $output =~ m/^html/i) {
131  $g_empty_element_suffix = ">";
132  $ctx->stash('markdown_output', '');
133  }
134  elsif (defined $output && $output eq 'raw') {
135  $raw = 1;
136  $ctx->stash('markdown_output', '');
137  }
138  else {
139  $raw = 0;
140  $g_empty_element_suffix = " />";
141  }
142  }
143  $text = $raw ? $text : Markdown($text);
144  $text;
145  },
146  });
147 
148  # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
149  my $smartypants;
150 
151  {
152  no warnings "once";
153  $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
154  }
155 
156  if ($smartypants) {
157  MT->add_text_filter('markdown_with_smartypants' => {
158  label => 'Markdown With SmartyPants',
159  docs => 'http://daringfireball.net/projects/markdown/',
160  on_format => sub {
161  my $text = shift;
162  my $ctx = shift;
163  if (defined $ctx) {
164  my $output = $ctx->stash('markdown_output');
165  if (defined $output && $output eq 'html') {
166  $g_empty_element_suffix = ">";
167  }
168  else {
169  $g_empty_element_suffix = " />";
170  }
171  }
172  $text = Markdown($text);
173  $text = $smartypants->($text, '1');
174  },
175  });
176  }
177 }
178 else {
179 #### BBEdit/command-line text filter interface ##########################
180 # Needs to be hidden from MT (and Blosxom when running in static mode).
181 
182  # We're only using $blosxom::version once; tell Perl not to warn us:
183  no warnings 'once';
184  unless ( defined($blosxom::version) ) {
185  use warnings;
186 
187  #### Check for command-line switches: #################
188  my %cli_opts;
189  use Getopt::Long;
190  Getopt::Long::Configure('pass_through');
191  GetOptions(\%cli_opts,
192  'version',
193  'shortversion',
194  'html4tags',
195  );
196  if ($cli_opts{'version'}) { # Version info
197  print "\nThis is Markdown, version $VERSION.\n";
198  print "Copyright 2004 John Gruber\n";
199  print "http://daringfireball.net/projects/markdown/\n\n";
200  exit 0;
201  }
202  if ($cli_opts{'shortversion'}) { # Just the version number string.
203  print $VERSION;
204  exit 0;
205  }
206  if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
207  $g_empty_element_suffix = ">";
208  }
209 
210 
211  #### Process incoming text: ###########################
212  my $text;
213  {
214  local $/; # Slurp the whole file
215  $text = <>;
216  }
217  print Markdown($text);
218  }
219 }
220 
221 
222 
223 sub Markdown {
224 #
225 # Main function. The order in which other subs are called here is
226 # essential. Link and image substitutions need to happen before
227 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
228 # and <img> tags get encoded.
229 #
230  my $text = shift;
231 
232  # Clear the global hashes. If we don't clear these, you get conflicts
233  # from other articles when generating a page which contains more than
234  # one article (e.g. an index page that shows the N most recent
235  # articles):
236  %g_urls = ();
237  %g_titles = ();
238  %g_html_blocks = ();
239 
240 
241  # Standardize line endings:
242  $text =~ s{\r\n}{\n}g; # DOS to Unix
243  $text =~ s{\r}{\n}g; # Mac to Unix
244 
245  # Make sure $text ends with a couple of newlines:
246  $text .= "\n\n";
247 
248  # Convert all tabs to spaces.
249  $text = _Detab($text);
250 
251  # Strip any lines consisting only of spaces and tabs.
252  # This makes subsequent regexen easier to write, because we can
253  # match consecutive blank lines with /\n+/ instead of something
254  # contorted like /[ \t]*\n+/ .
255  $text =~ s/^[ \t]+$//mg;
256 
257  # Turn block-level HTML blocks into hash entries
258  $text = _HashHTMLBlocks($text);
259 
260  # Strip link definitions, store in hashes.
261  $text = _StripLinkDefinitions($text);
262 
263  $text = _RunBlockGamut($text);
264 
265  $text = _UnescapeSpecialChars($text);
266 
267  return $text . "\n";
268 }
269 
270 
271 sub _StripLinkDefinitions {
272 #
273 # Strips link definitions from text, stores the URLs and titles in
274 # hash references.
275 #
276  my $text = shift;
277  my $less_than_tab = $g_tab_width - 1;
278 
279  # Link defs are in the form: ^[id]: url "optional title"
280  while ($text =~ s{
281  ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
282  [ \t]*
283  \n? # maybe *one* newline
284  [ \t]*
285  <?(\S+?)>? # url = $2
286  [ \t]*
287  \n? # maybe one newline
288  [ \t]*
289  (?:
290  (?<=\s) # lookbehind for whitespace
291  ["(]
292  (.+?) # title = $3
293  [")]
294  [ \t]*
295  )? # title is optional
296  (?:\n+|\Z)
297  }
298  {}mx) {
299  $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
300  if ($3) {
301  $g_titles{lc $1} = $3;
302  $g_titles{lc $1} =~ s/"/&quot;/g;
303  }
304  }
305 
306  return $text;
307 }
308 
309 
310 sub _HashHTMLBlocks {
311  my $text = shift;
312  my $less_than_tab = $g_tab_width - 1;
313 
314  # Hashify HTML blocks:
315  # We only want to do this for block-level HTML tags, such as headers,
316  # lists, and tables. That's because we still want to wrap <p>s around
317  # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
318  # phrase emphasis, and spans. The list of tags we're looking for is
319  # hard-coded:
320  my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
321  my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
322 
323  # First, look for nested blocks, e.g.:
324  # <div>
325  # <div>
326  # tags for inner block must be indented.
327  # </div>
328  # </div>
329  #
330  # The outermost tags must start at the left margin for this to match, and
331  # the inner nested divs must be indented.
332  # We need to do this before the next, more liberal match, because the next
333  # match will start at the first `<div>` and stop at the first `</div>`.
334  $text =~ s{
335  ( # save in $1
336  ^ # start of line (with /m)
337  <($block_tags_a) # start tag = $2
338  \b # word break
339  (.*\n)*? # any number of lines, minimally matching
340  </\2> # the matching end tag
341  [ \t]* # trailing spaces/tabs
342  (?=\n+|\Z) # followed by a newline or end of document
343  )
344  }{
345  my $key = md5_hex($1);
346  $g_html_blocks{$key} = $1;
347  "\n\n" . $key . "\n\n";
348  }egmx;
349 
350 
351  #
352  # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
353  #
354  $text =~ s{
355  ( # save in $1
356  ^ # start of line (with /m)
357  <($block_tags_b) # start tag = $2
358  \b # word break
359  (.*\n)*? # any number of lines, minimally matching
360  .*</\2> # the matching end tag
361  [ \t]* # trailing spaces/tabs
362  (?=\n+|\Z) # followed by a newline or end of document
363  )
364  }{
365  my $key = md5_hex($1);
366  $g_html_blocks{$key} = $1;
367  "\n\n" . $key . "\n\n";
368  }egmx;
369  # Special case just for <hr />. It was easier to make a special case than
370  # to make the other regex more complicated.
371  $text =~ s{
372  (?:
373  (?<=\n\n) # Starting after a blank line
374  | # or
375  \A\n? # the beginning of the doc
376  )
377  ( # save in $1
378  [ ]{0,$less_than_tab}
379  <(hr) # start tag = $2
380  \b # word break
381  ([^<>])*? #
382  /?> # the matching end tag
383  [ \t]*
384  (?=\n{2,}|\Z) # followed by a blank line or end of document
385  )
386  }{
387  my $key = md5_hex($1);
388  $g_html_blocks{$key} = $1;
389  "\n\n" . $key . "\n\n";
390  }egx;
391 
392  # Special case for standalone HTML comments:
393  $text =~ s{
394  (?:
395  (?<=\n\n) # Starting after a blank line
396  | # or
397  \A\n? # the beginning of the doc
398  )
399  ( # save in $1
400  [ ]{0,$less_than_tab}
401  (?s:
402  <!
403  (--.*?--\s*)+
404  >
405  )
406  [ \t]*
407  (?=\n{2,}|\Z) # followed by a blank line or end of document
408  )
409  }{
410  my $key = md5_hex($1);
411  $g_html_blocks{$key} = $1;
412  "\n\n" . $key . "\n\n";
413  }egx;
414 
415 
416  return $text;
417 }
418 
419 
420 sub _RunBlockGamut {
421 #
422 # These are all the transformations that form block-level
423 # tags like paragraphs, headers, and list items.
424 #
425  my $text = shift;
426 
427  $text = _DoHeaders($text);
428 
429  # Do Horizontal Rules:
430  $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
431  $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
432  $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
433 
434  $text = _DoLists($text);
435 
436  $text = _DoCodeBlocks($text);
437 
438  $text = _DoBlockQuotes($text);
439 
440  # We already ran _HashHTMLBlocks() before, in Markdown(), but that
441  # was to escape raw HTML in the original Markdown source. This time,
442  # we're escaping the markup we've just created, so that we don't wrap
443  # <p> tags around block-level tags.
444  $text = _HashHTMLBlocks($text);
445 
446  $text = _FormParagraphs($text);
447 
448  return $text;
449 }
450 
451 
452 sub _RunSpanGamut {
453 #
454 # These are all the transformations that occur *within* block-level
455 # tags like paragraphs, headers, and list items.
456 #
457  my $text = shift;
458 
459  $text = _DoCodeSpans($text);
460 
461  $text = _EscapeSpecialChars($text);
462 
463  # Process anchor and image tags. Images must come first,
464  # because ![foo][f] looks like an anchor.
465  $text = _DoImages($text);
466  $text = _DoAnchors($text);
467 
468  # Make links out of things like `<http://example.com/>`
469  # Must come after _DoAnchors(), because you can use < and >
470  # delimiters in inline links like [this](<url>).
471  $text = _DoAutoLinks($text);
472 
473  $text = _EncodeAmpsAndAngles($text);
474 
475  $text = _DoItalicsAndBold($text);
476 
477  # Do hard breaks:
478  $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;
479 
480  return $text;
481 }
482 
483 
484 sub _EscapeSpecialChars {
485  my $text = shift;
486  my $tokens ||= _TokenizeHTML($text);
487 
488  $text = ''; # rebuild $text from the tokens
489 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
490 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
491 
492  foreach my $cur_token (@$tokens) {
493  if ($cur_token->[0] eq "tag") {
494  # Within tags, encode * and _ so they don't conflict
495  # with their use in Markdown for italics and strong.
496  # We're replacing each such character with its
497  # corresponding MD5 checksum value; this is likely
498  # overkill, but it should prevent us from colliding
499  # with the escape values by accident.
500  $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx;
501  $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx;
502  $text .= $cur_token->[1];
503  } else {
504  my $t = $cur_token->[1];
505  $t = _EncodeBackslashEscapes($t);
506  $text .= $t;
507  }
508  }
509  return $text;
510 }
511 
512 
513 sub _DoAnchors {
514 #
515 # Turn Markdown link shortcuts into XHTML <a> tags.
516 #
517  my $text = shift;
518 
519  #
520  # First, handle reference-style links: [link text] [id]
521  #
522  $text =~ s{
523  ( # wrap whole match in $1
524  \[
525  ($g_nested_brackets) # link text = $2
526  \]
527 
528  [ ]? # one optional space
529  (?:\n[ ]*)? # one optional newline followed by spaces
530 
531  \[
532  (.*?) # id = $3
533  \]
534  )
535  }{
536  my $result;
537  my $whole_match = $1;
538  my $link_text = $2;
539  my $link_id = lc $3;
540 
541  if ($link_id eq "") {
542  $link_id = lc $link_text; # for shortcut links like [this][].
543  }
544 
545  if (defined $g_urls{$link_id}) {
546  my $url = $g_urls{$link_id};
547  $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
548  $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
549  $result = "<a href=\"$url\"";
550  if ( defined $g_titles{$link_id} ) {
551  my $title = $g_titles{$link_id};
552  $title =~ s! \* !$g_escape_table{'*'}!gx;
553  $title =~ s! _ !$g_escape_table{'_'}!gx;
554  $result .= " title=\"$title\"";
555  }
556  $result .= ">$link_text</a>";
557  }
558  else {
559  $result = $whole_match;
560  }
561  $result;
562  }xsge;
563 
564  #
565  # Next, inline-style links: [link text](url "optional title")
566  #
567  $text =~ s{
568  ( # wrap whole match in $1
569  \[
570  ($g_nested_brackets) # link text = $2
571  \]
572  \( # literal paren
573  [ \t]*
574  <?(.*?)>? # href = $3
575  [ \t]*
576  ( # $4
577  (['"]) # quote char = $5
578  (.*?) # Title = $6
579  \5 # matching quote
580  )? # title is optional
581  \)
582  )
583  }{
584  my $result;
585  my $whole_match = $1;
586  my $link_text = $2;
587  my $url = $3;
588  my $title = $6;
589 
590  $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
591  $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
592  $result = "<a href=\"$url\"";
593 
594  if (defined $title) {
595  $title =~ s/"/&quot;/g;
596  $title =~ s! \* !$g_escape_table{'*'}!gx;
597  $title =~ s! _ !$g_escape_table{'_'}!gx;
598  $result .= " title=\"$title\"";
599  }
600 
601  $result .= ">$link_text</a>";
602 
603  $result;
604  }xsge;
605 
606  return $text;
607 }
608 
609 
610 sub _DoImages {
611 #
612 # Turn Markdown image shortcuts into <img> tags.
613 #
614  my $text = shift;
615 
616  #
617  # First, handle reference-style labeled images: ![alt text][id]
618  #
619  $text =~ s{
620  ( # wrap whole match in $1
621  !\[
622  (.*?) # alt text = $2
623  \]
624 
625  [ ]? # one optional space
626  (?:\n[ ]*)? # one optional newline followed by spaces
627 
628  \[
629  (.*?) # id = $3
630  \]
631 
632  )
633  }{
634  my $result;
635  my $whole_match = $1;
636  my $alt_text = $2;
637  my $link_id = lc $3;
638 
639  if ($link_id eq "") {
640  $link_id = lc $alt_text; # for shortcut links like ![this][].
641  }
642 
643  $alt_text =~ s/"/&quot;/g;
644  if (defined $g_urls{$link_id}) {
645  my $url = $g_urls{$link_id};
646  $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
647  $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
648  $result = "<img src=\"$url\" alt=\"$alt_text\"";
649  if (defined $g_titles{$link_id}) {
650  my $title = $g_titles{$link_id};
651  $title =~ s! \* !$g_escape_table{'*'}!gx;
652  $title =~ s! _ !$g_escape_table{'_'}!gx;
653  $result .= " title=\"$title\"";
654  }
655  $result .= $g_empty_element_suffix;
656  }
657  else {
658  # If there's no such link ID, leave intact:
659  $result = $whole_match;
660  }
661 
662  $result;
663  }xsge;
664 
665  #
666  # Next, handle inline images: ![alt text](url "optional title")
667  # Don't forget: encode * and _
668 
669  $text =~ s{
670  ( # wrap whole match in $1
671  !\[
672  (.*?) # alt text = $2
673  \]
674  \( # literal paren
675  [ \t]*
676  <?(\S+?)>? # src url = $3
677  [ \t]*
678  ( # $4
679  (['"]) # quote char = $5
680  (.*?) # title = $6
681  \5 # matching quote
682  [ \t]*
683  )? # title is optional
684  \)
685  )
686  }{
687  my $result;
688  my $whole_match = $1;
689  my $alt_text = $2;
690  my $url = $3;
691  my $title = '';
692  if (defined($6)) {
693  $title = $6;
694  }
695 
696  $alt_text =~ s/"/&quot;/g;
697  $title =~ s/"/&quot;/g;
698  $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
699  $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
700  $result = "<img src=\"$url\" alt=\"$alt_text\"";
701  if (defined $title) {
702  $title =~ s! \* !$g_escape_table{'*'}!gx;
703  $title =~ s! _ !$g_escape_table{'_'}!gx;
704  $result .= " title=\"$title\"";
705  }
706  $result .= $g_empty_element_suffix;
707 
708  $result;
709  }xsge;
710 
711  return $text;
712 }
713 
714 
715 sub _DoHeaders {
716  my $text = shift;
717 
718  # Setext-style headers:
719  # Header 1
720  # ========
721  #
722  # Header 2
723  # --------
724  #
725  $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
726  "<h1>" . _RunSpanGamut($1) . "</h1>\n\n";
727  }egmx;
728 
729  $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
730  "<h2>" . _RunSpanGamut($1) . "</h2>\n\n";
731  }egmx;
732 
733 
734  # atx-style headers:
735  # # Header 1
736  # ## Header 2
737  # ## Header 2 with closing hashes ##
738  # ...
739  # ###### Header 6
740  #
741  $text =~ s{
742  ^(\#{1,6}) # $1 = string of #'s
743  [ \t]*
744  (.+?) # $2 = Header text
745  [ \t]*
746  \#* # optional closing #'s (not counted)
747  \n+
748  }{
749  my $h_level = length($1);
750  "<h$h_level>" . _RunSpanGamut($2) . "</h$h_level>\n\n";
751  }egmx;
752 
753  return $text;
754 }
755 
756 
757 sub _DoLists {
758 #
759 # Form HTML ordered (numbered) and unordered (bulleted) lists.
760 #
761  my $text = shift;
762  my $less_than_tab = $g_tab_width - 1;
763 
764  # Re-usable patterns to match list item bullets and number markers:
765  my $marker_ul = qr/[*+-]/;
766  my $marker_ol = qr/\d+[.]/;
767  my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
768 
769  # Re-usable pattern to match any entirel ul or ol list:
770  my $whole_list = qr{
771  ( # $1 = whole list
772  ( # $2
773  [ ]{0,$less_than_tab}
774  (${marker_any}) # $3 = first list item marker
775  [ \t]+
776  )
777  (?s:.+?)
778  ( # $4
779  \z
780  |
781  \n{2,}
782  (?=\S)
783  (?! # Negative lookahead for another list item marker
784  [ \t]*
785  ${marker_any}[ \t]+
786  )
787  )
788  )
789  }mx;
790 
791  # We use a different prefix before nested lists than top-level lists.
792  # See extended comment in _ProcessListItems().
793  #
794  # Note: There's a bit of duplication here. My original implementation
795  # created a scalar regex pattern as the conditional result of the test on
796  # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
797  # substitution once, using the scalar as the pattern. This worked,
798  # everywhere except when running under MT on my hosting account at Pair
799  # Networks. There, this caused all rebuilds to be killed by the reaper (or
800  # perhaps they crashed, but that seems incredibly unlikely given that the
801  # same script on the same server ran fine *except* under MT. I've spent
802  # more time trying to figure out why this is happening than I'd like to
803  # admit. My only guess, backed up by the fact that this workaround works,
804  # is that Perl optimizes the substition when it can figure out that the
805  # pattern will never change, and when this optimization isn't on, we run
806  # afoul of the reaper. Thus, the slightly redundant code to that uses two
807  # static s/// patterns rather than one conditional pattern.
808 
809  if ($g_list_level) {
810  $text =~ s{
811  ^
812  $whole_list
813  }{
814  my $list = $1;
815  my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
816  # Turn double returns into triple returns, so that we can make a
817  # paragraph for the last item in a list, if necessary:
818  $list =~ s/\n{2,}/\n\n\n/g;
819  my $result = _ProcessListItems($list, $marker_any);
820  $result = "<$list_type>\n" . $result . "</$list_type>\n";
821  $result;
822  }egmx;
823  }
824  else {
825  $text =~ s{
826  (?:(?<=\n\n)|\A\n?)
827  $whole_list
828  }{
829  my $list = $1;
830  my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
831  # Turn double returns into triple returns, so that we can make a
832  # paragraph for the last item in a list, if necessary:
833  $list =~ s/\n{2,}/\n\n\n/g;
834  my $result = _ProcessListItems($list, $marker_any);
835  $result = "<$list_type>\n" . $result . "</$list_type>\n";
836  $result;
837  }egmx;
838  }
839 
840 
841  return $text;
842 }
843 
844 
845 sub _ProcessListItems {
846 #
847 # Process the contents of a single ordered or unordered list, splitting it
848 # into individual list items.
849 #
850 
851  my $list_str = shift;
852  my $marker_any = shift;
853 
854 
855  # The $g_list_level global keeps track of when we're inside a list.
856  # Each time we enter a list, we increment it; when we leave a list,
857  # we decrement. If it's zero, we're not in a list anymore.
858  #
859  # We do this because when we're not inside a list, we want to treat
860  # something like this:
861  #
862  # I recommend upgrading to version
863  # 8. Oops, now this line is treated
864  # as a sub-list.
865  #
866  # As a single paragraph, despite the fact that the second line starts
867  # with a digit-period-space sequence.
868  #
869  # Whereas when we're inside a list (or sub-list), that line will be
870  # treated as the start of a sub-list. What a kludge, huh? This is
871  # an aspect of Markdown's syntax that's hard to parse perfectly
872  # without resorting to mind-reading. Perhaps the solution is to
873  # change the syntax rules such that sub-lists must start with a
874  # starting cardinal number; e.g. "1." or "a.".
875 
876  $g_list_level++;
877 
878  # trim trailing blank lines:
879  $list_str =~ s/\n{2,}\z/\n/;
880 
881 
882  $list_str =~ s{
883  (\n)? # leading line = $1
884  (^[ \t]*) # leading whitespace = $2
885  ($marker_any) [ \t]+ # list marker = $3
886  ((?s:.+?) # list item text = $4
887  (\n{1,2}))
888  (?= \n* (\z | \2 ($marker_any) [ \t]+))
889  }{
890  my $item = $4;
891  my $leading_line = $1;
892  my $leading_space = $2;
893 
894  if ($leading_line or ($item =~ m/\n{2,}/)) {
895  $item = _RunBlockGamut(_Outdent($item));
896  }
897  else {
898  # Recursion for sub-lists:
899  $item = _DoLists(_Outdent($item));
900  chomp $item;
901  $item = _RunSpanGamut($item);
902  }
903 
904  "<li>" . $item . "</li>\n";
905  }egmx;
906 
907  $g_list_level--;
908  return $list_str;
909 }
910 
911 
912 
913 sub _DoCodeBlocks {
914 #
915 # Process Markdown `<pre><code>` blocks.
916 #
917 
918  my $text = shift;
919 
920  $text =~ s{
921  (?:\n\n|\A)
922  ( # $1 = the code block -- one or more lines, starting with a space/tab
923  (?:
924  (?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces
925  .*\n+
926  )+
927  )
928  ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
929  }{
930  my $codeblock = $1;
931  my $result; # return value
932 
933  $codeblock = _EncodeCode(_Outdent($codeblock));
934  $codeblock = _Detab($codeblock);
935  $codeblock =~ s/\A\n+//; # trim leading newlines
936  $codeblock =~ s/\s+\z//; # trim trailing whitespace
937 
938  $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
939 
940  $result;
941  }egmx;
942 
943  return $text;
944 }
945 
946 
947 sub _DoCodeSpans {
948 #
949 # * Backtick quotes are used for <code></code> spans.
950 #
951 # * You can use multiple backticks as the delimiters if you want to
952 # include literal backticks in the code span. So, this input:
953 #
954 # Just type ``foo `bar` baz`` at the prompt.
955 #
956 # Will translate to:
957 #
958 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
959 #
960 # There's no arbitrary limit to the number of backticks you
961 # can use as delimters. If you need three consecutive backticks
962 # in your code, use four for delimiters, etc.
963 #
964 # * You can use spaces to get literal backticks at the edges:
965 #
966 # ... type `` `bar` `` ...
967 #
968 # Turns to:
969 #
970 # ... type <code>`bar`</code> ...
971 #
972 
973  my $text = shift;
974 
975  $text =~ s@
976  (`+) # $1 = Opening run of `
977  (.+?) # $2 = The code block
978  (?<!`)
979  \1 # Matching closer
980  (?!`)
981  @
982  my $c = "$2";
983  $c =~ s/^[ \t]*//g; # leading whitespace
984  $c =~ s/[ \t]*$//g; # trailing whitespace
985  $c = _EncodeCode($c);
986  "<code>$c</code>";
987  @egsx;
988 
989  return $text;
990 }
991 
992 
993 sub _EncodeCode {
994 #
995 # Encode/escape certain characters inside Markdown code runs.
996 # The point is that in code, these characters are literals,
997 # and lose their special Markdown meanings.
998 #
999  local $_ = shift;
1000 
1001  # Encode all ampersands; HTML entities are not
1002  # entities within a Markdown code span.
1003  s/&/&amp;/g;
1004 
1005  # Encode $'s, but only if we're running under Blosxom.
1006  # (Blosxom interpolates Perl variables in article bodies.)
1007  {
1008  no warnings 'once';
1009  if (defined($blosxom::version)) {
1010  s/\$/&#036;/g;
1011  }
1012  }
1013 
1014 
1015  # Do the angle bracket song and dance:
1016  s! < !&lt;!gx;
1017  s! > !&gt;!gx;
1018 
1019  # Now, escape characters that are magic in Markdown:
1020  s! \* !$g_escape_table{'*'}!gx;
1021  s! _ !$g_escape_table{'_'}!gx;
1022  s! { !$g_escape_table{'{'}!gx;
1023  s! } !$g_escape_table{'}'}!gx;
1024  s! \[ !$g_escape_table{'['}!gx;
1025  s! \] !$g_escape_table{']'}!gx;
1026  s! \\ !$g_escape_table{'\\'}!gx;
1027 
1028  return $_;
1029 }
1030 
1031 
1032 sub _DoItalicsAndBold {
1033  my $text = shift;
1034 
1035  # <strong> must go first:
1036  $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1037  {<strong>$2</strong>}gsx;
1038 
1039  $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
1040  {<em>$2</em>}gsx;
1041 
1042  return $text;
1043 }
1044 
1045 
1046 sub _DoBlockQuotes {
1047  my $text = shift;
1048 
1049  $text =~ s{
1050  ( # Wrap whole match in $1
1051  (
1052  ^[ \t]*>[ \t]? # '>' at the start of a line
1053  .+\n # rest of the first line
1054  (.+\n)* # subsequent consecutive lines
1055  \n* # blanks
1056  )+
1057  )
1058  }{
1059  my $bq = $1;
1060  $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
1061  $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
1062  $bq = _RunBlockGamut($bq); # recurse
1063 
1064  $bq =~ s/^/ /g;
1065  # These leading spaces screw with <pre> content, so we need to fix that:
1066  $bq =~ s{
1067  (\s*<pre>.+?</pre>)
1068  }{
1069  my $pre = $1;
1070  $pre =~ s/^ //mg;
1071  $pre;
1072  }egsx;
1073 
1074  "<blockquote>\n$bq\n</blockquote>\n\n";
1075  }egmx;
1076 
1077 
1078  return $text;
1079 }
1080 
1081 
1082 sub _FormParagraphs {
1083 #
1084 # Params:
1085 # $text - string to process with html <p> tags
1086 #
1087  my $text = shift;
1088 
1089  # Strip leading and trailing lines:
1090  $text =~ s/\A\n+//;
1091  $text =~ s/\n+\z//;
1092 
1093  my @grafs = split(/\n{2,}/, $text);
1094 
1095  #
1096  # Wrap <p> tags.
1097  #
1098  foreach (@grafs) {
1099  unless (defined( $g_html_blocks{$_} )) {
1100  $_ = _RunSpanGamut($_);
1101  s/^([ \t]*)/<p>/;
1102  $_ .= "</p>";
1103  }
1104  }
1105 
1106  #
1107  # Unhashify HTML blocks
1108  #
1109  foreach (@grafs) {
1110  if (defined( $g_html_blocks{$_} )) {
1111  $_ = $g_html_blocks{$_};
1112  }
1113  }
1114 
1115  return join "\n\n", @grafs;
1116 }
1117 
1118 
1119 sub _EncodeAmpsAndAngles {
1120 # Smart processing for ampersands and angle brackets that need to be encoded.
1121 
1122  my $text = shift;
1123 
1124  # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1125  # http://bumppo.net/projects/amputator/
1126  $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
1127 
1128  # Encode naked <'s
1129  $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
1130 
1131  return $text;
1132 }
1133 
1134 
1135 sub _EncodeBackslashEscapes {
1136 #
1137 # Parameter: String.
1138 # Returns: The string, with after processing the following backslash
1139 # escape sequences.
1140 #
1141  local $_ = shift;
1142 
1143  s! \\\\ !$g_escape_table{'\\'}!gx; # Must process escaped backslashes first.
1144  s! \\` !$g_escape_table{'`'}!gx;
1145  s! \\\* !$g_escape_table{'*'}!gx;
1146  s! \\_ !$g_escape_table{'_'}!gx;
1147  s! \\\{ !$g_escape_table{'{'}!gx;
1148  s! \\\} !$g_escape_table{'}'}!gx;
1149  s! \\\[ !$g_escape_table{'['}!gx;
1150  s! \\\] !$g_escape_table{']'}!gx;
1151  s! \\\( !$g_escape_table{'('}!gx;
1152  s! \\\) !$g_escape_table{')'}!gx;
1153  s! \\> !$g_escape_table{'>'}!gx;
1154  s! \\\# !$g_escape_table{'#'}!gx;
1155  s! \\\+ !$g_escape_table{'+'}!gx;
1156  s! \\\- !$g_escape_table{'-'}!gx;
1157  s! \\\. !$g_escape_table{'.'}!gx;
1158  s{ \\! }{$g_escape_table{'!'}}gx;
1159 
1160  return $_;
1161 }
1162 
1163 
1164 sub _DoAutoLinks {
1165  my $text = shift;
1166 
1167  $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1168 
1169  # Email addresses: <address@domain.foo>
1170  $text =~ s{
1171  <
1172  (?:mailto:)?
1173  (
1174  [-.\w]+
1175  \@
1176  [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1177  )
1178  >
1179  }{
1180  _EncodeEmailAddress( _UnescapeSpecialChars($1) );
1181  }egix;
1182 
1183  return $text;
1184 }
1185 
1186 
1187 sub _EncodeEmailAddress {
1188 #
1189 # Input: an email address, e.g. "foo@example.com"
1190 #
1191 # Output: the email address as a mailto link, with each character
1192 # of the address encoded as either a decimal or hex entity, in
1193 # the hopes of foiling most address harvesting spam bots. E.g.:
1194 #
1195 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1196 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1197 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1198 #
1199 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1200 # mailing list: <http://tinyurl.com/yu7ue>
1201 #
1202 
1203  my $addr = shift;
1204 
1205  srand;
1206  my @encode = (
1207  sub { '&#' . ord(shift) . ';' },
1208  sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1209  sub { shift },
1210  );
1211 
1212  $addr = "mailto:" . $addr;
1213 
1214  $addr =~ s{(.)}{
1215  my $char = $1;
1216  if ( $char eq '@' ) {
1217  # this *must* be encoded. I insist.
1218  $char = $encode[int rand 1]->($char);
1219  } elsif ( $char ne ':' ) {
1220  # leave ':' alone (to spot mailto: later)
1221  my $r = rand;
1222  # roughly 10% raw, 45% hex, 45% dec
1223  $char = (
1224  $r > .9 ? $encode[2]->($char) :
1225  $r < .45 ? $encode[1]->($char) :
1226  $encode[0]->($char)
1227  );
1228  }
1229  $char;
1230  }gex;
1231 
1232  $addr = qq{<a href="$addr">$addr</a>};
1233  $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1234 
1235  return $addr;
1236 }
1237 
1238 
1239 sub _UnescapeSpecialChars {
1240 #
1241 # Swap back in all the special characters we've hidden.
1242 #
1243  my $text = shift;
1244 
1245  while( my($char, $hash) = each(%g_escape_table) ) {
1246  $text =~ s/$hash/$char/g;
1247  }
1248  return $text;
1249 }
1250 
1251 
1252 sub _TokenizeHTML {
1253 #
1254 # Parameter: String containing HTML markup.
1255 # Returns: Reference to an array of the tokens comprising the input
1256 # string. Each token is either a tag (possibly with nested,
1257 # tags contained therein, such as <a href="<MTFoo>">, or a
1258 # run of text between tags. Each element of the array is a
1259 # two-element array; the first is either 'tag' or 'text';
1260 # the second is the actual value.
1261 #
1262 #
1263 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1264 # <http://www.bradchoate.com/past/mtregex.php>
1265 #
1266 
1267  my $str = shift;
1268  my $pos = 0;
1269  my $len = length $str;
1270  my @tokens;
1271 
1272  my $depth = 6;
1273  my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
1274  my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
1275  (?s: <\? .*? \?> ) | # processing instruction
1276  $nested_tags/ix; # nested tags
1277 
1278  while ($str =~ m/($match)/g) {
1279  my $whole_tag = $1;
1280  my $sec_start = pos $str;
1281  my $tag_start = $sec_start - length $whole_tag;
1282  if ($pos < $tag_start) {
1283  push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1284  }
1285  push @tokens, ['tag', $whole_tag];
1286  $pos = pos $str;
1287  }
1288  push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1289  \@tokens;
1290 }
1291 
1292 
1293 sub _Outdent {
1294 #
1295 # Remove one level of line-leading tabs or spaces
1296 #
1297  my $text = shift;
1298 
1299  $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
1300  return $text;
1301 }
1302 
1303 
1304 sub _Detab {
1305 #
1306 # Cribbed from a post by Bart Lateur:
1307 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1308 #
1309  my $text = shift;
1310 
1311  $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
1312  return $text;
1313 }
1314 
1315 
1316 1;
1317 
1318 __END__
1319 
1320 
1321 =pod
1322 
1323 =head1 NAME
1324 
1325 B<Markdown>
1326 
1327 
1328 =head1 SYNOPSIS
1329 
1330 B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ]
1331  [ I<file> ... ]
1332 
1333 
1334 =head1 DESCRIPTION
1335 
1336 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1337 easy-to-write structured text format into HTML. Markdown's text format
1338 is most similar to that of plain text email, and supports features such
1339 as headers, *emphasis*, code blocks, blockquotes, and links.
1340 
1341 Markdown's syntax is designed not as a generic markup language, but
1342 specifically to serve as a front-end to (X)HTML. You can use span-level
1343 HTML tags anywhere in a Markdown document, and you can use block level
1344 HTML tags (like <div> and <table> as well).
1345 
1346 For more information about Markdown's syntax, see:
1347 
1348  http://daringfireball.net/projects/markdown/
1349 
1350 
1351 =head1 OPTIONS
1352 
1353 Use "--" to end switch parsing. For example, to open a file named "-z", use:
1354 
1355  Markdown.pl -- -z
1356 
1357 =over 4
1358 
1359 
1360 =item B<--html4tags>
1361 
1362 Use HTML 4 style for empty element tags, e.g.:
1363 
1364  <br>
1365 
1366 instead of Markdown's default XHTML style tags, e.g.:
1367 
1368  <br />
1369 
1370 
1371 =item B<-v>, B<--version>
1372 
1373 Display Markdown's version number and copyright information.
1374 
1375 
1376 =item B<-s>, B<--shortversion>
1377 
1378 Display the short-form version number.
1379 
1380 
1381 =back
1382 
1383 
1384 
1385 =head1 BUGS
1386 
1387 To file bug reports or feature requests (other than topics listed in the
1388 Caveats section above) please send email to:
1389 
1390  support@daringfireball.net
1391 
1392 Please include with your report: (1) the example input; (2) the output
1393 you expected; (3) the output Markdown actually produced.
1394 
1395 
1396 =head1 VERSION HISTORY
1397 
1398 See the readme file for detailed release notes for this version.
1399 
1400 1.0.1 - 14 Dec 2004
1401 
1402 1.0 - 28 Aug 2004
1403 
1404 
1405 =head1 AUTHOR
1406 
1407  John Gruber
1408  http://daringfireball.net
1409 
1410  PHP port and other contributions by Michel Fortin
1411  http://michelf.com
1412 
1413 
1414 =head1 COPYRIGHT AND LICENSE
1415 
1416 Copyright (c) 2003-2004 John Gruber
1417 <http://daringfireball.net/>
1418 All rights reserved.
1419 
1420 Redistribution and use in source and binary forms, with or without
1421 modification, are permitted provided that the following conditions are
1422 met:
1423 
1424 * Redistributions of source code must retain the above copyright notice,
1425  this list of conditions and the following disclaimer.
1426 
1427 * Redistributions in binary form must reproduce the above copyright
1428  notice, this list of conditions and the following disclaimer in the
1429  documentation and/or other materials provided with the distribution.
1430 
1431 * Neither the name "Markdown" nor the names of its contributors may
1432  be used to endorse or promote products derived from this software
1433  without specific prior written permission.
1434 
1435 This software is provided by the copyright holders and contributors "as
1436 is" and any express or implied warranties, including, but not limited
1437 to, the implied warranties of merchantability and fitness for a
1438 particular purpose are disclaimed. In no event shall the copyright owner
1439 or contributors be liable for any direct, indirect, incidental, special,
1440 exemplary, or consequential damages (including, but not limited to,
1441 procurement of substitute goods or services; loss of use, data, or
1442 profits; or business interruption) however caused and on any theory of
1443 liability, whether in contract, strict liability, or tort (including
1444 negligence or otherwise) arising in any way out of the use of this
1445 software, even if advised of the possibility of such damage.
1446 
1447 =cut