#!/usr/bin/ruby -w
# encoding: UTF-8
# == Synopsis
#
# canonical_to_html_alternative: generates HTML file from canonical plain text
#
# == Usage
#
# canonical_to_html_alternative [FILE]
#
# FILE: The canonical document to read (defaults to stdin)

require 'cgi'

# Input stream for the conversion. ARGF reads the files named on the command
# line, or standard input when none are given. Use the DATA line instead to
# run against the sample document embedded after __END__.
# io = DATA
io = ARGF

# True when the interpreter's major version number is 1.
legacy_ruby = RUBY_VERSION.split('.').first == "1"

# Pattern that recognises URLs in plain text. Capture group 1 spans the whole
# link: a known scheme followed by "://" (or "mailto:"), then a run of URL
# characters that must end in a character from a narrower set, so trailing
# punctuation such as '.', ',' or ')' is left outside the link.
$link_regexp =
  if legacy_ruby
    # ASCII-only character ranges.
    # This can be dropped when going 10.9+ (verify that first)
    Regexp.new('(((https?|mid|cid|message|s?ftp|ftps|file|smb|afp|nfs|(x-)?man(-page)?|gopher):\/\/|mailto:)[-:@!a-z0-9_.,~%*+\/?=&()#;]*[@a-z0-9_~%+\/=&(])')
  else
    # [:word:] works better with UTF-8
    Regexp.new('(((https?|mid|cid|message|s?ftp|ftps|file|smb|afp|nfs|(x-)?man(-page)?|gopher):\/\/|mailto:)[-:@![:word:].,~%*+\/?=&()#;]*[@[:word:]~%+\/=&(])')
  end

# Wraps every link found by $link_regexp in a pair of plain-text marker
# strings and returns the resulting copy of +text+.
#
# The markers contain no HTML-special characters, so they pass through
# escapeHTML unchanged. This lets links such as <https://freron.com> be
# located before escaping and turned into anchors afterwards by linkify.
def prepare_linkify(text)
  opening = '93C67E4B9D2B44E7'
  closing = 'FC756B349A13DC5C'
  # Capture group 1 of $link_regexp holds the complete link.
  text.gsub($link_regexp) { "#{opening}#{Regexp.last_match(1)}#{closing}" }
end

# Replaces each marker-delimited span in +text+ (as inserted by
# prepare_linkify) with an HTML anchor whose href and visible label are both
# the text found between the markers. Returns the resulting copy of +text+.
def linkify(text)
  text.gsub(/93C67E4B9D2B44E7(.*?)FC756B349A13DC5C/) do
    target = Regexp.last_match(1)
    "<a href=\"#{target}\">#{target}</a>"
  end
end

# Main conversion: reads the canonical text line by line from io and prints
# HTML. Consecutive non-blank lines at one quote depth are collected into a
# single <p>, joined by <br>; every leading '>' adds one <blockquote> level.

# Markup written once for every quote level that is entered or left.
quote_open = "<blockquote>"
quote_close = "</blockquote>"

print '<div class="plaintext">'

previous_depth = 0
depth = 0
pending = "" # HTML of the paragraph currently being collected

while (raw = io.gets)
  # Canonicalization: always end a line with a newline.
  raw += "\n" unless raw == "" || raw[-1, 1] == "\n"
  blank = (raw == "\n")

  # The quote depth is the number of leading '>' characters.
  depth = raw.match('>*')[0].size
  change = depth - previous_depth

  # A blank line or a change of quote depth ends the current paragraph.
  # A blank line with no paragraph to end is rendered as a line break.
  if change != 0 || blank
    if pending != ""
      print '<p dir="auto">' + pending + "</p>\n"
      # print "\n" # sundown style
      pending = ""
    elsif blank
      print "<br>" # sundown style
      # print "<br />" # cmark style
    end
  end

  # Close or open one blockquote per level of difference.
  if change < 0
    (-change).times { print quote_close }
  elsif change > 0
    change.times { print quote_open }
  end

  unless blank
    # In the canonical format a leading '>' can be space-stuffed, and quote
    # markers can be followed by a space. In both cases exactly one space
    # is dropped from the content.
    drop_space = raw.match('^ +>') || raw.match('^>+ ')
    content_start = drop_space ? depth + 1 : depth

    pending += "<br>\n" unless pending == "" # sundown-style
    # pending += "<br />\n" unless pending == "" # cmark-style

    # Links are marked before escaping and become anchors afterwards;
    # the ..-2 range end strips the trailing newline.
    pending += linkify(CGI::escapeHTML(prepare_linkify(raw[content_start..-2])))
  end

  previous_depth = depth
end

# Flush the last paragraph and close any blockquotes that are still open.
print '<p dir="auto">' + pending + "</p>\n" unless pending == ""
depth.times { print quote_close }
puts '</div>'

__END__
Unquoted line.
> 1 level
>> 2 levels
 > Space-stuffed. Not quoted. Must be encoded using &gt;.
  > Also space-stuffed.
This is an easy link to spot: https://example.com
This link <https://example.com> is more tricky, because of the use of <>.
And it's even worse if punctuation is involved: <https://example.com>.
Two links on one line: <https://example.com>, and <https://example.com>.
