Selaa lähdekoodia

Implement custom BBCode parser to fix parsing issues

ghorsington 5 vuotta sitten
vanhempi
commit
78ea59feaf
3 muutettua tiedostoa jossa 264 lisäystä ja 23 poistoa
  1. 0 2
      bot/package.json
  2. 261 0
      bot/src/bbcode-parser/bbcode-js.ts
  3. 3 21
      bot/src/commands/forums_news_checker.ts

+ 0 - 2
bot/package.json

@@ -25,8 +25,6 @@
       "@db": "../db/lib/src"
    },
    "dependencies": {
-      "@bbob/html": "^2.5.2",
-      "@bbob/preset-html5": "^2.5.2",
       "@google-cloud/translate": "^4.1.1",
       "@types/cheerio": "^0.22.12",
       "@types/dotenv": "^6.1.1",

+ 261 - 0
bot/src/bbcode-parser/bbcode-js.ts

@@ -0,0 +1,261 @@
+import { Dict } from "src/util";
+
+var VERSION = '0.4.0'; // re-exported at the end of this section as `version`
+
+export interface BBCodeConfig { // rendering options accepted by render(); every field is optional
+    showQuotePrefix?: boolean; // emit 'Quote:' in front of a [quote] that names no author
+    classPrefix?: string; // prepended to every generated CSS class name
+    mentionPrefix?: string; // prepended to the text of a [mention] tag
+}
+
+// fallback values render() uses for options the caller does not supply
+const defaults: BBCodeConfig = {
+    showQuotePrefix: true,
+    classPrefix: 'bbcode_',
+    mentionPrefix: '@'
+};
+
+export var version = VERSION;
+
+// Copied from:
+// http://blog.mattheworiordan.com/post/13174566389/url-regular-expression-for-links-with-or-without-the
+// Changes made to the copied pattern: allow "/" in the query string (some sites have one there), allow colons in
+// the query string, and disallow an ending dot (.).
+var URL_PATTERN = new RegExp("(" // overall match
+    + "(" // brackets covering match for protocol (optional) and domain
+    + "([A-Za-z]{3,9}:(?:\\/\\/)?)" // protocol: 3-9 letters and a colon, optionally followed by //
+    + "(?:[\\-;:&=\\+\\$,\\w]+@)?[A-Za-z0-9\\.\\-]+[A-Za-z0-9\\-]"
+    // optional something@ (as in email addresses), then anything looking at all like a domain, non-unicode domains
+    + "|" // or instead of above
+    + "(?:www\\.|[\\-;:&=\\+\\$,\\w]+@)" // starting with something@ or www.
+    + "[A-Za-z0-9\\.\\-]+[A-Za-z0-9\\-]" // anything looking at all like a domain
+    + ")" // end protocol/domain
+    + "(" // brackets covering match for path, query string and anchor
+    + "(?:\\/[\\+~%\\/\\.\\w\\-_]*)?" // allow optional /path
+    + "\\??(?:[\\-\\+=&;%@\\.\\w_\\/:]*)" // allow optional query string starting with ?
+    + "#?(?:[\\.\\!\\/\\\\\\w]*)" // allow optional anchor #anchor
+    + ")?" // make URL suffix optional
+    + ")"); // no ^ or $ anchors and no flags: exec() finds a match anywhere in the input
+
+function doReplace(content: string, matches: Replacement[], options: BBCodeConfig) { // applies the rules until the text stops changing
+    var i, obj, regex, hasMatch, tmp;
+    // repeat full passes over the rules until one pass leaves the input unchanged
+    do {
+        hasMatch = false;
+        for (i = 0; i < matches.length; ++i) {
+            obj = matches[i];
+            regex = new RegExp(obj.e, 'gi'); // the rule's source is compiled global and case-insensitive
+            tmp = content.replace(regex, obj.func.bind(undefined, options)); // `options` becomes the replacer's first argument
+            if (tmp !== content) {
+                content = tmp;
+                hasMatch = true; // something changed, so run another pass
+            }
+        }
+    } while (hasMatch);
+    return content;
+}
+
+function listItemReplace(options: BBCodeConfig, fullMatch: string, tag: string, value: string) {
+    // Renders one list entry: trims its text, expands any BBCode tags nested in it, and wraps the result in <li>.
+    return '<li>' + doReplace(value.trim(), [{ e: '\\[(\\w+)(?:[= ]([^\\]]+))?]((?:.|[\r\n])*?)\\[/\\1]', func: tagReplace }], options) + '</li>';
+}
+
+export var extractQuotedText = function (value: string, parts?: string[]) { // unquotes `value`; returns [value, parts]
+    var quotes = ["\"", "'"], i, quote, nextPart;
+
+    for (i = 0; i < quotes.length; ++i) {
+        quote = quotes[i];
+        if (value && value[0] === quote) { // only text that starts with a quote character is altered
+            value = value.slice(1);
+            if (value[value.length - 1] !== quote) { // no closing quote yet: the quoted text continues in later parts
+                while (parts && parts.length) {
+                    nextPart = parts.shift(); // removes the entry from the array the caller passed in
+                    value += " " + nextPart;
+                    if (nextPart[nextPart.length - 1] === quote) {
+                        break;
+                    }
+                }
+            }
+            value = value.replace(new RegExp("[" + quote + "]+$"), ''); // drop the trailing quote character(s)
+            break;
+        }
+    }
+    return [value, parts];
+};
+
+export var parseParams = function (tagName: string, params: string) { // builds a key -> value map from a tag's parameter text
+    let paramMap: Dict<string> = {};
+
+    if (!params) {
+        return paramMap;
+    }
+
+    // first, collapse spaces next to equals
+    params = params.replace(/\s*[=]\s*/g, "=");
+    let parts = params.split(/\s+/);
+
+    while (parts.length) {
+        let part = parts.shift();
+        // only a part in which URL_PATTERN finds no match anywhere is examined for a key=value form
+        if (!URL_PATTERN.exec(part)) {
+            let index = part.indexOf('=');
+            if (index > 0) {
+                let rv = extractQuotedText(part.slice(index + 1), parts);
+                paramMap[part.slice(0, index).toLowerCase()] = rv[0] as string; // keys are stored lower-cased
+                parts = rv[1] as string[];
+            }
+            else {
+                let rv = extractQuotedText(part, parts);
+                paramMap[tagName] = rv[0] as string; // a part without a key is stored under the tag's own name
+                parts = rv[1] as string[];
+            }
+        } else {
+            let rv = extractQuotedText(part, parts);
+            paramMap[tagName] = rv[0] as string; // a URL-like part is stored whole under the tag's own name
+            parts = rv[1] as string[];
+        }
+    }
+    return paramMap;
+};
+
+// Replacer for the tag rule: turns one matched [tag params]value[/tag] into HTML, or returns the match unchanged.
+function tagReplace(options: BBCodeConfig, fullMatch: string, tag: string, params: string, value: string) {
+    let val: string;
+    tag = tag.toLowerCase();
+    let paramsObj = parseParams(tag, params || undefined); // keyed parameters, plus any bare value stored under the tag name
+    let inlineValue = paramsObj[tag]; // the bare (un-keyed) parameter value, if one was given
+
+    switch (tag) { // NOTE(review): params and value are concatenated into the HTML below without any escaping
+        case 'quote':
+            val = '<div class="' + options.classPrefix + 'quote"';
+            for (let i in paramsObj) {
+                let tmp = paramsObj[i];
+                if (!inlineValue && (i === 'author' || i === 'name')) { // no bare value: author=/name= names the quoted author
+                    inlineValue = tmp;
+                } else if (i !== tag) { // every other keyed parameter becomes a data-* attribute
+                    val += ' data-' + i + '="' + tmp + '"';
+                }
+            }
+            return val + '>' + (inlineValue ? inlineValue + ' wrote:' : (options.showQuotePrefix ? 'Quote:' : '')) + '<blockquote>' + value + '</blockquote></div>';
+        case 'url':
+            return '<a class="' + options.classPrefix + 'link" target="_blank" href="' + (inlineValue || value) + '">' + value + '</a>';
+        case 'email':
+            return '<a class="' + options.classPrefix + 'link" target="_blank" href="mailto:' + (inlineValue || value) + '">' + value + '</a>';
+        case 'anchor':
+            return '<a name="' + (inlineValue || paramsObj.a || value) + '">' + value + '</a>';
+        case 'b':
+            return '<strong>' + value + '</strong>';
+        case 'i':
+            return '<em>' + value + '</em>';
+        case 'u':
+            return '<span style="text-decoration:underline">' + value + '</span>';
+        case 's':
+            return '<span style="text-decoration:line-through">' + value + '</span>';
+        case 'indent':
+            return '<blockquote>' + value + '</blockquote>';
+        case 'list':
+            tag = 'ul';
+            let className = options.classPrefix + 'list';
+            if (inlineValue && /[1Aa]/.test(inlineValue)) { // a bare value containing 1, A or a selects an ordered list
+                tag = 'ol';
+                if (/1/.test(inlineValue)) {
+                    className += '_numeric';
+                }
+                else if (/A/.test(inlineValue)) {
+                    className += '_alpha';
+                }
+                else if (/a/.test(inlineValue)) {
+                    className += '_alpha_lower';
+                }
+            }
+            val = '<' + tag + ' class="' + className + '">';
+            // render each "[*]text" run (up to the end of its line) as a list item
+            val += doReplace(value, [{ e: '\\[([*])\\]([^\r\n]+)', func: listItemReplace }], options);
+            return val + '</' + tag + '>';
+        case 'code':
+        case 'php':
+        case 'java':
+        case 'javascript':
+        case 'cpp':
+        case 'ruby':
+        case 'python':
+            return '<pre class="' + options.classPrefix + (tag === 'code' ? '' : 'code_') + tag + '">' + value + '</pre>'; // language tags get a 'code_' infix
+        case 'highlight':
+            return '<span class="' + options.classPrefix + tag + '">' + value + '</span>';
+        case 'html':
+            return value; // the enclosed text is passed through as-is
+        case 'mention':
+            val = '<span class="' + options.classPrefix + 'mention"';
+            if (inlineValue) {
+                val += ' data-mention-id="' + inlineValue + '"';
+            }
+            return val + '>' + (options.mentionPrefix || '') + value + '</span>';
+        case 'span':
+        case 'h1':
+        case 'h2':
+        case 'h3':
+        case 'h4':
+        case 'h5':
+        case 'h6':
+            return '<' + tag + '>' + value + '</' + tag + '>';
+        case 'youtube':
+            return '<object class="' + options.classPrefix + 'video" width="425" height="350"><param name="movie" value="http://www.youtube.com/v/' + value + '"></param><embed src="http://www.youtube.com/v/' + value + '" type="application/x-shockwave-flash" width="425" height="350"></embed></object>';
+        case 'gvideo':
+            return '<embed class="' + options.classPrefix + 'video" style="width:400px; height:325px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=' + value + '&amp;hl=en">';
+        case 'google':
+            return '<a class="' + options.classPrefix + 'link" target="_blank" href="http://www.google.com/search?q=' + (inlineValue || value) + '">' + value + '</a>';
+        case 'wikipedia':
+            return '<a class="' + options.classPrefix + 'link" target="_blank" href="http://www.wikipedia.org/wiki/' + (inlineValue || value) + '">' + value + '</a>';
+        case 'img':
+            var dims = new RegExp('^(\\d+)x(\\d+)$').exec(inlineValue || ''); // bare value of the form WIDTHxHEIGHT
+            if (!dims || (dims.length !== 3)) {
+                dims = new RegExp('^width=(\\d+)\\s+height=(\\d+)$').exec(inlineValue || '');
+            }
+            if (dims && dims.length === 3) {
+                params = undefined; // NOTE(review): dead store; params is not read after this point
+            }
+            val = '<img class="' + options.classPrefix + 'image" src="' + value + '"';
+            if (dims && dims.length === 3) {
+                val += ' width="' + dims[1] + '" height="' + dims[2] + '"';
+            } else {
+                for (let i in paramsObj) {
+                    let tmp = paramsObj[i];
+                    if (i === 'img') { // the bare value is emitted as the alt attribute
+                        i = 'alt';
+                    }
+                    val += ' ' + i + '="' + tmp + '"';
+                }
+            }
+            return val + '/>';
+    }
+    // unrecognised tag: return the matched text unchanged
+    return fullMatch;
+}
+
+interface Replacement { // one rewrite rule consumed by doReplace
+    e: string; // regular-expression source; doReplace compiles it with the 'gi' flags
+    func: (options: BBCodeConfig, fullMatch: string, tag: string, params: string, value: string) => string; // replacer; doReplace binds `options` as its first argument
+}
+
+/**
+ * Renders the content as html
+ * @param content   the given content to render
+ * @param options   optional control parameters; unset fields use `defaults`, and the object is not modified
+ * @returns rendered html
+ */
+export var render = function (content: string, options?: BBCodeConfig) {
+    const given: BBCodeConfig = options || {};
+    // Build a fresh object instead of writing the defaults into the caller's `options`.
+    const resolved: BBCodeConfig = {
+        classPrefix: given.classPrefix || defaults.classPrefix,
+        mentionPrefix: given.mentionPrefix || defaults.mentionPrefix,
+        // Only a missing value takes the default: testing for falsiness here turned an
+        // explicit `showQuotePrefix: false` back into `true`, so it could never be disabled.
+        showQuotePrefix: given.showQuotePrefix == null ? defaults.showQuotePrefix : given.showQuotePrefix
+    };
+
+    // for now, only one rule
+    const matches: Replacement[] = [{ e: '\\[(\\w+)(?:[= ]([^\\]]+))?]((?:.|[\r\n])*?)\\[/\\1]', func: tagReplace }];
+
+    return doReplace(content, matches, resolved);
+};

+ 3 - 21
bot/src/commands/forums_news_checker.ts

@@ -1,5 +1,4 @@
 import TurndownService, { Options } from "turndown";
-import RSSParser from "rss-parser";
 import interval from "interval-promise";
 import { client, forumClient } from "../client";
 import sha1 from "sha1";
@@ -10,8 +9,7 @@ import { getRepository, Not, IsNull } from "typeorm";
 import { PostedForumNewsItem } from "@db/entity/PostedForumsNewsItem";
 import { KnownChannel } from "@db/entity/KnownChannel";
 import { PostVerifyMessage } from "@db/entity/PostVerifyMessage";
-import bbobHTML from '@bbob/html'
-import presetHTML5 from '@bbob/preset-html5'
+import { render } from "../bbcode-parser/bbcode-js";
 
 const PREVIEW_CHAR_LIMIT = 300;
 const NEWS_POST_VERIFY_CHANNEL = "newsPostVerify";
@@ -33,27 +31,11 @@ turndown.addRule("link", {
     replacement: (content: string, node: HTMLElement) => node.getAttribute("href")
 });
 
-const parser = new RSSParser();
 const RSS_UPDATE_INTERVAL_MIN = 5;
-
-function getThreadId(url: string) {
-    let result = url.substring(url.lastIndexOf(".") + 1);
-    if (result.endsWith("/"))
-        result = result.substring(0, result.length - 1);
-    return result;
-}
-
 const NEWS_FORUM_ID = 49;
 
-const FEEDS = [
-    {
-        url: "http://custommaid3d2.com/index.php?forums/news.49/index.rss",
-        contentElement: "content:encoded"
-    }
-];
-
 function bbCodeToMarkdown(bbCode: string) {
-    return turndown.turndown(bbobHTML(bbCode, presetHTML5()));
+    return turndown.turndown(render(bbCode)).replace(/( {2}\n|\n\n){2,}/gm, "\n");
 }
 
 async function checkFeeds() {
@@ -324,6 +306,6 @@ export default {
         botUserId = user.user_id;
 
         await initPendingReactors();
-        interval(checkFeeds, RSS_UPDATE_INTERVAL_MIN * 60 * 1000);
+        interval(checkFeeds, RSS_UPDATE_INTERVAL_MIN * 1000);
     }
 } as ICommand;