Bläddra i källkod

Switch from node-html-parser to cheerio

ghorsington 5 år sedan
förälder
incheckning
82b0af0c2e

+ 2 - 1
bot/package.json

@@ -28,6 +28,7 @@
       "@bbob/html": "^2.5.2",
       "@bbob/preset-html5": "^2.5.2",
       "@google-cloud/translate": "^4.1.1",
+      "@types/cheerio": "^0.22.12",
       "@types/dotenv": "^6.1.1",
       "@types/lowdb": "^1.0.9",
       "@types/request-promise-native": "^1.0.16",
@@ -35,6 +36,7 @@
       "@types/turndown": "^5.0.0",
       "@types/xml2js": "^0.4.4",
       "axios": "^0.19.0",
+      "cheerio": "^1.0.0-rc.3",
       "discord.js": "^11.4.2",
       "dotenv": "^8.0.0",
       "html2bbcode": "^1.2.6",
@@ -42,7 +44,6 @@
       "jimp": "^0.5.4",
       "lowdb": "^1.0.0",
       "module-alias": "^2.2.0",
-      "node-html-parser": "^1.1.16",
       "opencv4nodejs": "^4.9.0",
       "pg": "^7.11.0",
       "reflect-metadata": "^0.1.10",

+ 5 - 11
bot/src/commands/aggregators/com3d2_updates.ts

@@ -1,9 +1,9 @@
-import * as html from "node-html-parser";
 import request from "request-promise-native";
 import { Response } from "request";
 import { IAggregator, INewsItem } from "./aggregator";
 import { getRepository } from "typeorm";
 import { AggroNewsItem } from "@db/entity/AggroNewsItem";
+import cheerio from "cheerio";
 
 const updatePage = "http://com3d2.jp/update/";
 const changeLogPattern = /\[\s*([^\s\]]+)\s*\]\s*((・.*)\s+)+/gim;
@@ -36,22 +36,16 @@ async function aggregate() {
         if(mainPageRes.statusCode != 200)
             return;
         
-        let rootNode = html.parse(mainPageRes.body, {
-                pre: true,
-                script: false,
-                style: false
-        });
-
-        if(!(rootNode instanceof html.HTMLElement))
-            return;
+        let rootNode = cheerio.load(mainPageRes.body);
 
-        let readme = rootNode.querySelector("div.readme");
+        let readme = rootNode("div.readme");
 
         if(!readme) {
             console.log("[COM3D2 JP UPDATE] Failed to find listing!");
+            return [];
         }
 
-        let latestVersionChangelog = changeLogPattern.exec(readme.text);
+        let latestVersionChangelog = changeLogPattern.exec(readme.text());
 
         if(!latestVersionChangelog)
             return [];

+ 11 - 25
bot/src/commands/aggregators/com3d2_world.ts

@@ -1,9 +1,9 @@
-import * as html from "node-html-parser";
 import request from "request-promise-native";
 import { Response } from "request";
 import { INewsItem, IAggregator } from "./aggregator";
 import { getRepository } from "typeorm";
 import { AggroNewsItem } from "@db/entity/AggroNewsItem";
+import cheerio from "cheerio";
 
 const kissDiaryRoot = "https://com3d2.world/r18/notices.php";
 const FEED_NAME = "com3d2-world-notices";
@@ -28,16 +28,9 @@ async function aggregate() {
         if(mainPageRes.statusCode != 200)
             return [];
 
-        let rootNode = html.parse(mainPageRes.body, {
-                pre: true,
-                script: false,
-                style: false
-        });
-
-        if(!(rootNode instanceof html.HTMLElement))
-            return;
+        let rootNode = cheerio.load(mainPageRes.body);
 
-        let diaryEntries = rootNode.querySelectorAll("div.frame a");
+        let diaryEntries = rootNode("div.frame a");
 
         if(!diaryEntries) {
             console.log("[COM3D2 WORLD BLOG] Failed to find listing!");
@@ -46,11 +39,11 @@ async function aggregate() {
         let result : INewsItem[] = [];
         let latestEntry = lastPost.newsId;
 
-        for(let a of diaryEntries) {
-            if(!a.rawAttributes.id)
+        for(let a of diaryEntries.get() as CheerioElement[]) {
+            if(!a.attribs.id)
                 continue;
             
-            let id = +a.rawAttributes.id;
+            let id = +a.attribs.id;
 
             if(id <= lastPost.newsId)
                 continue;
@@ -63,25 +56,18 @@ async function aggregate() {
             if(res.statusCode != 200)
                 continue;
 
-            let node = html.parse(res.body, {
-                pre: true,
-                script: false,
-                style: false
-            });
-
-            if(!(node instanceof html.HTMLElement))
-                continue;
+            let node = cheerio.load(res.body);
 
-            let title = node.querySelector("div.frame div.notice_title th");
-            let contents = node.querySelectorAll("div.frame div")[1];
+            let title = node("div.frame div.notice_title th");
+            let contents = node("div.frame div").get(1);
 
             result.push({
                 newsId: id,
                 feedId: FEED_NAME,
                 link: diaryLink,
-                title: title.text,
+                title: title.text(),
                 author: "com3d2.world",
-                contents: contents.outerHTML,
+                contents: cheerio.html(contents),
                 embedColor: 0xa39869
             });
         }

+ 15 - 32
bot/src/commands/aggregators/kiss_diary.ts

@@ -1,9 +1,9 @@
-import * as html from "node-html-parser";
 import request from "request-promise-native";
 import { Response } from "request";
 import { INewsItem, IAggregator } from "./aggregator";
 import { getRepository } from "typeorm";
 import { AggroNewsItem } from "@db/entity/AggroNewsItem";
+import cheerio from "cheerio";
 
 const urlPattern = /diary\.php\?no=(\d+)/i;
 const kissDiaryRoot = "http://www.kisskiss.tv/kiss";
@@ -29,26 +29,19 @@ async function aggregate() {
         if(mainPageRes.statusCode != 200)
             return [];
 
-        let rootNode = html.parse(mainPageRes.body, {
-                pre: true,
-                script: false,
-                style: false
-        });
-
-        if(!(rootNode instanceof html.HTMLElement))
-            return;
+        let rootNode = cheerio.load(mainPageRes.body);
 
-        let diaryEntries = rootNode.querySelectorAll("div.blog_frame_middle ul.disc li a");
+        let diaryEntries = rootNode("div.blog_frame_middle ul.disc li a");
 
-        if(!diaryEntries) {
+        if(diaryEntries.length == 0) {
             console.log("[KISS DIARY] Failed to find listing!");
         }
 
         let result : INewsItem[] = [];
         let latestEntry = lastPost.newsId;
 
-        for(let a of diaryEntries) {
-            let matches = urlPattern.exec(a.rawAttributes.href);
+        for(let a of diaryEntries.get() as CheerioElement[]) {
+            let matches = urlPattern.exec(a.attribs.href);
             if(!matches)
                 continue;
             
@@ -60,36 +53,26 @@ async function aggregate() {
             if(id > latestEntry)
                 latestEntry = id;
 
-            let diaryLink = `${kissDiaryRoot}/${a.rawAttributes.href}`;
+            let diaryLink = `${kissDiaryRoot}/${a.attribs.href}`;
             let res = await request(diaryLink, {resolveWithFullResponse: true}) as Response;
+
             if(res.statusCode != 200)
                 continue;
 
-            let node = html.parse(res.body, {
-                pre: true,
-                script: false,
-                style: false
-            });
-
-            if(!(node instanceof html.HTMLElement))
-                continue;
+            let node = cheerio.load(res.body);
 
-            let title = node.querySelector("table.blog_frame_top tr td a");
-            let contents = node.querySelector("div.blog_frame_middle");
-            let bottomFrame = contents.querySelector("div.blog_data");
-            if(bottomFrame) {
-                let child = contents.childNodes[0];
-                if(child instanceof html.HTMLElement)
-                    child.removeChild(bottomFrame);
-            }
+            let title = node("table.blog_frame_top tr td a");
+            let contents = node("div.blog_frame_middle");
+            let bottomFrame = contents.find("div.blog_data");
+            bottomFrame.remove();
 
             result.push({
                 newsId: id,
                 feedId: FEED_NAME,
                 link: diaryLink,
-                title: title.text,
+                title: title.text(),
                 author: "KISS BLOG",
-                contents: contents.innerHTML,
+                contents: contents.html(),
                 embedColor: 0xf4c100,
                 needsTranslation: true
             });

+ 1 - 1
bot/src/commands/news_aggregator.ts

@@ -289,6 +289,6 @@ export default {
 
         await initPendingReactors();
         initAggregators();
-        interval(checkFeeds, UPDATE_INTERVAL * 60 * 1000);
+        interval(checkFeeds, UPDATE_INTERVAL * 1000);
     }
 } as ICommand;