From 6107e954040443d1fc4344b440b229d70fe75166 Mon Sep 17 00:00:00 2001 From: kibigo! Date: Wed, 28 Jun 2017 00:27:44 -0700 Subject: Backend YAML Processing + Profile Metadata on Static Pages --- app/lib/frontmatter_handler.rb | 242 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 app/lib/frontmatter_handler.rb (limited to 'app/lib/frontmatter_handler.rb') diff --git a/app/lib/frontmatter_handler.rb b/app/lib/frontmatter_handler.rb new file mode 100644 index 000000000..19007cf4e --- /dev/null +++ b/app/lib/frontmatter_handler.rb @@ -0,0 +1,242 @@ +# frozen_string_literal: true + +require 'singleton' + +# See also `app/javascript/features/account/util/bio_metadata.js`. + +class FrontmatterHandler + include Singleton + + # CONVENIENCE FUNCTIONS # + + def self.unirex(str) + Regexp.new str, false, 'um' + end + def self.rexstr(exp) + '(?:' + exp.source + ')' + end + + # CHARACTER CLASSES # + + DOCUMENT_START = /^/ + DOCUMENT_END = /$/ + ALLOWED_CHAR = # c-printable` in the YAML 1.2 spec. + /[\t\n\r\u{20}-\u{7e}\u{85}\u{a0}-\u{d7ff}\u{e000}-\u{fffd}\u{10000}-\u{10ffff}]/u + WHITE_SPACE = /[ \t]/ + INDENTATION = / */ + LINE_BREAK = /\r?\n|\r|/ + ESCAPE_CHAR = /[0abt\tnvfre "\/\\N_LP]/ + HEXADECIMAL_CHARS = /[0-9a-fA-F]/ + INDICATOR = /[-?:,\[\]{}&#*!|>'"%@`]/ + FLOW_CHAR = /[,\[\]{}]/ + + # NEGATED CHARACTER CLASSES # + + NOT_WHITE_SPACE = unirex '(?!' + rexstr(WHITE_SPACE) + ').' + NOT_LINE_BREAK = unirex '(?!' + rexstr(LINE_BREAK) + ').' + NOT_INDICATOR = unirex '(?!' + rexstr(INDICATOR) + ').' + NOT_FLOW_CHAR = unirex '(?!' + rexstr(FLOW_CHAR) + ').' + + # BASIC CONSTRUCTS # + + ANY_WHITE_SPACE = unirex rexstr(WHITE_SPACE) + '*' + ANY_ALLOWED_CHARS = unirex rexstr(ALLOWED_CHAR) + '*' + NEW_LINE = unirex( + rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK) + ) + SOME_NEW_LINES = unirex( + '(?:' + rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK) + ')+' + ) + POSSIBLE_STARTS = unirex( + rexstr(DOCUMENT_START) + rexstr(/]*>/) + '?' + ) + POSSIBLE_ENDS = unirex( + rexstr(SOME_NEW_LINES) + '|' + + rexstr(DOCUMENT_END) + '|' + + rexstr(/<\/p>/) + ) + CHARACTER_ESCAPE = unirex( + rexstr(/\\/) + + '(?:' + + rexstr(ESCAPE_CHAR) + '|' + + rexstr(/x/) + rexstr(HEXADECIMAL_CHARS) + '{2}' + '|' + + rexstr(/u/) + rexstr(HEXADECIMAL_CHARS) + '{4}' + '|' + + rexstr(/U/) + rexstr(HEXADECIMAL_CHARS) + '{8}' + + ')' + ) + ESCAPED_CHAR = unirex( + rexstr(/(?!["\\])/) + rexstr(NOT_LINE_BREAK) + '|' + + rexstr(CHARACTER_ESCAPE) + ) + ANY_ESCAPED_CHARS = unirex( + rexstr(ESCAPED_CHAR) + '*' + ) + ESCAPED_APOS = unirex( + '(?=' + rexstr(NOT_LINE_BREAK) + ')' + rexstr(/[^']|''/) + ) + ANY_ESCAPED_APOS = unirex( + rexstr(ESCAPED_APOS) + '*' + ) + FIRST_KEY_CHAR = unirex( + '(?=' + rexstr(NOT_LINE_BREAK) + ')' + + '(?=' + rexstr(NOT_WHITE_SPACE) + ')' + + rexstr(NOT_INDICATOR) + '|' + + rexstr(/[?:-]/) + + '(?=' + rexstr(NOT_LINE_BREAK) + ')' + + '(?=' + rexstr(NOT_WHITE_SPACE) + ')' + + '(?=' + rexstr(NOT_FLOW_CHAR) + ')' + ) + FIRST_VALUE_CHAR = unirex( + '(?=' + rexstr(NOT_LINE_BREAK) + ')' + + '(?=' + rexstr(NOT_WHITE_SPACE) + ')' + + rexstr(NOT_INDICATOR) + '|' + + rexstr(/[?:-]/) + + '(?=' + rexstr(NOT_LINE_BREAK) + ')' + + '(?=' + rexstr(NOT_WHITE_SPACE) + ')' + # Flow indicators are allowed in values. + ) + LATER_KEY_CHAR = unirex( + rexstr(WHITE_SPACE) + '|' + + '(?=' + rexstr(NOT_LINE_BREAK) + ')' + + '(?=' + rexstr(NOT_WHITE_SPACE) + ')' + + '(?=' + rexstr(NOT_FLOW_CHAR) + ')' + + rexstr(/[^:#]#?/) + '|' + + rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')' + ) + LATER_VALUE_CHAR = unirex( + rexstr(WHITE_SPACE) + '|' + + '(?=' + rexstr(NOT_LINE_BREAK) + ')' + + '(?=' + rexstr(NOT_WHITE_SPACE) + ')' + + # Flow indicators are allowed in values. + rexstr(/[^:#]#?/) + '|' + + rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')' + ) + + # YAML CONSTRUCTS # + + YAML_START = unirex( + rexstr(ANY_WHITE_SPACE) + rexstr(/---/) + ) + YAML_END = unirex( + rexstr(ANY_WHITE_SPACE) + rexstr(/(?:---|\.\.\.)/) + ) + YAML_LOOKAHEAD = unirex( + '(?=' + + rexstr(YAML_START) + + rexstr(ANY_ALLOWED_CHARS) + rexstr(NEW_LINE) + + rexstr(YAML_END) + rexstr(POSSIBLE_ENDS) + + ')' + ) + YAML_DOUBLE_QUOTE = unirex( + rexstr(/"/) + rexstr(ANY_ESCAPED_CHARS) + rexstr(/"/) + ) + YAML_SINGLE_QUOTE = unirex( + rexstr(/'/) + rexstr(ANY_ESCAPED_APOS) + rexstr(/'/) + ) + YAML_SIMPLE_KEY = unirex( + rexstr(FIRST_KEY_CHAR) + rexstr(LATER_KEY_CHAR) + '*' + ) + YAML_SIMPLE_VALUE = unirex( + rexstr(FIRST_VALUE_CHAR) + rexstr(LATER_VALUE_CHAR) + '*' + ) + YAML_KEY = unirex( + rexstr(YAML_DOUBLE_QUOTE) + '|' + + rexstr(YAML_SINGLE_QUOTE) + '|' + + rexstr(YAML_SIMPLE_KEY) + ) + YAML_VALUE = unirex( + rexstr(YAML_DOUBLE_QUOTE) + '|' + + rexstr(YAML_SINGLE_QUOTE) + '|' + + rexstr(YAML_SIMPLE_VALUE) + ) + YAML_SEPARATOR = unirex( + rexstr(ANY_WHITE_SPACE) + + ':' + rexstr(WHITE_SPACE) + + rexstr(ANY_WHITE_SPACE) + ) + YAML_LINE = unirex( + '(' + rexstr(YAML_KEY) + ')' + + rexstr(YAML_SEPARATOR) + + '(' + rexstr(YAML_VALUE) + ')' + ) + + # FRONTMATTER REGEX # + + YAML_FRONTMATTER = unirex( + rexstr(POSSIBLE_STARTS) + + rexstr(YAML_LOOKAHEAD) + + rexstr(YAML_START) + rexstr(SOME_NEW_LINES) + + '(?:' + + '(' + rexstr(INDENTATION) + ')' + + rexstr(YAML_LINE) + rexstr(SOME_NEW_LINES) + + '(?:' + + '\\1' + rexstr(YAML_LINE) + rexstr(SOME_NEW_LINES) + + '){0,4}' + + ')?' + + rexstr(YAML_END) + rexstr(POSSIBLE_ENDS) + ) + + # SEARCHES # + + FIND_YAML_LINES = unirex( + rexstr(NEW_LINE) + rexstr(INDENTATION) + rexstr(YAML_LINE) + ) + + # STRING PROCESSING # + + def process_string(str) + case str[0] + when '"' + str[1..-2] + .gsub(/\\0/, "\u{00}") + .gsub(/\\a/, "\u{07}") + .gsub(/\\b/, "\u{08}") + .gsub(/\\t/, "\u{09}") + .gsub(/\\n/, "\u{0a}") + .gsub(/\\v/, "\u{0b}") + .gsub(/\\f/, "\u{0c}") + .gsub(/\\r/, "\u{0d}") + .gsub(/\\e/, "\u{1b}") + .gsub(/\\ /, "\u{20}") + .gsub(/\\"/, "\u{22}") + .gsub(/\\\//, "\u{2f}") + .gsub(/\\\\/, "\u{5c}") + .gsub(/\\N/, "\u{85}") + .gsub(/\\_/, "\u{a0}") + .gsub(/\\L/, "\u{2028}") + .gsub(/\\P/, "\u{2029}") + .gsub(/\\x([0-9a-fA-F]{2})/mu) {|s| $1.to_i.chr Encoding::UTF_8} + .gsub(/\\u([0-9a-fA-F]{4})/mu) {|s| $1.to_i.chr Encoding::UTF_8} + .gsub(/\\U([0-9a-fA-F]{8})/mu) {|s| $1.to_i.chr Encoding::UTF_8} + when "'" + str[1..-2].gsub(/''/, "'") + else + str + end + end + + # BIO PROCESSING # + + def process_bio content + result = { + text: content.gsub(/"/, '"').gsub(/'/, "'"), + metadata: [] + } + yaml = YAML_FRONTMATTER.match(result[:text]) + return result unless yaml + yaml = yaml[0] + start = YAML_START =~ result[:text] + ending = start + yaml.length - (YAML_START =~ yaml) + result[:text][start..ending - 1] = '' + metadata = nil + index = 0 + while metadata = FIND_YAML_LINES.match(yaml, index) do + index = metadata.end(0) + result[:metadata].push [ + process_string(metadata[1]), process_string(metadata[2]) + ] + end + return result + end + +end -- cgit