#!/usr/bin/ruby -w
# encoding: UTF-8
# == Synopsis
#
# canonical_to_html: generates HTML file from canonical plain text
#
# == Usage
#
# canonical_to_html [--no-links] [FILE]
#
# --no-links: Do not turn URLs into HTML links
# FILE: The canonical document to read (defaults to stdin)

require 'cgi'

# '--no-links' disables link generation. The flag is removed from ARGV so
# that it is not left behind as an input file name for ARGF.
no_links = ARGV.include?('--no-links')
ARGV.delete('--no-links')

# io = DATA
io = ARGF

is_old_ruby = RUBY_VERSION.split('.')[0] == "1"

# Source of the URL pattern. Group 1 spans the whole URL: a known scheme
# followed by '://' (or 'mailto:'), any run of URL characters, and a final
# character that excludes trailing punctuation such as '.', ',' and ')'.
link_pattern =
  if is_old_ruby
    # This can be dropped when going 10.9+ (verify that first)
    '(((https?|mid|cid|message|s?ftp|ftps|file|smb|afp|nfs|(x-)?man(-page)?|gopher):\/\/|mailto:)[-:@!a-z0-9_.,~%*+\/?=&()#;]*[@a-z0-9_~%+\/=&(])'
  else
    # Works better with UTF-8
    '(((https?|mid|cid|message|s?ftp|ftps|file|smb|afp|nfs|(x-)?man(-page)?|gopher):\/\/|mailto:)[-:@![:word:].,~%*+\/?=&()#;]*[@[:word:]~%+\/=&(])'
  end

# Compiled case-insensitively: URI schemes are case-insensitive, and without
# this flag the old-Ruby pattern (a-z only) would cut a URL off at its first
# uppercase letter.
$link_regexp = Regexp.new(link_pattern, Regexp::IGNORECASE)

# Wraps every URL found by $link_regexp in a pair of plain-text markers.
#
# This runs before CGI::escapeHTML so that links such as
# <https://freron.com> can still be recognised afterwards: the markers
# contain no characters that HTML escaping changes, and linkify later turns
# each marked span into an anchor element.
#
# text:: the raw (unescaped) line of text
# Returns a copy of text with the first capture group of every match
# enclosed in the start and end markers.
def prepare_linkify(text)
  text.gsub($link_regexp) do
    "93C67E4B9D2B44E7#{Regexp.last_match(1)}FC756B349A13DC5C"
  end
end

# Turns every marker-delimited span into an HTML anchor whose href and
# visible text are both the marked content.
#
# text:: a string in which links are enclosed in the 93C67E4B9D2B44E7 /
#        FC756B349A13DC5C marker pair
# Returns a copy of text with each marked span replaced by an <a> element;
# text outside the markers is left untouched.
def linkify(text)
  text.gsub(/93C67E4B9D2B44E7(.*?)FC756B349A13DC5C/) do
    target = Regexp.last_match(1)
    "<a href=\"#{target}\">#{target}</a>"
  end
end

# Markup written each time the quote depth grows or shrinks by one level.
block_prefix = "<blockquote>"
block_suffix = "</blockquote>"

#print '<div class="plaintext" dir="auto">' # Doesn't work if dir(ection) changes in the middle of the text...
print '<div class="plaintext">'

# Quote depth (number of leading '>' characters) of the previous and the
# current line. new_quote_level is read again after the loop to close any
# blockquotes that are still open.
old_quote_level = 0
new_quote_level = 0
io.each_line do |line|
  new_quote_level = line[/\A>*/].size
  depth_change = new_quote_level - old_quote_level

  # Open or close one blockquote per level of change.
  if depth_change > 0
    print block_prefix * depth_change
  elsif depth_change < 0
    print block_suffix * depth_change.abs
  end

  # In the canonical format a leading '>' can be space-stuffed, and quote
  # markers can be followed by a space. In both cases exactly one space is
  # dropped from the output.
  space_stuffed = line =~ /^ +>/
  space_after_quotes = line =~ /^>+ /
  content_start = new_quote_level + ((space_stuffed || space_after_quotes) ? 1 : 0)
  content = line[content_start..-1]

  print '<div dir="auto">'
  if no_links
    print CGI::escapeHTML(content)
  else
    # Mark links first, escape, then convert the marks into anchors.
    print linkify(CGI::escapeHTML(prepare_linkify(content)))
  end
  # print rather than puts: there must be no line feed after the closing div.
  print '</div>'

  old_quote_level = new_quote_level
end

# Close the blockquotes left open by the last line.
print block_suffix * new_quote_level
puts '</div>'

__END__
Unquoted line.
> 1 level
>> 2 levels
 > Space-stuffed. Not quoted. Must be encoded using &gt;.
  > Also space-stuffed.
This is an easy link to spot: https://example.com
This link <https://example.com> is more tricky, because of the use of <>.
And it's even worse if punctuation is involved: <https://example.com>.
Two links on one line: <https://example.com>, and <https://example.com>.
