From 8ad75415b13b91b9cf7fc69879f0d198db4d3379 Mon Sep 17 00:00:00 2001 From: chaoyinYang Date: Fri, 11 Oct 2024 15:46:59 +0800 Subject: [PATCH] update parser emis --- emis/parser.js | 16 ++++++++++++++++ .../{emis.2023-04-17.csv => emis.2024-10-11.csv} | 13 ++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) rename emis/test/{emis.2023-04-17.csv => emis.2024-10-11.csv} (62%) diff --git a/emis/parser.js b/emis/parser.js index 0e8c3d0e..35de33b7 100755 --- a/emis/parser.js +++ b/emis/parser.js @@ -15,6 +15,7 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { let path = parsedUrl.pathname; // uncomment this line if you need parameters let param = parsedUrl.query || {}; + let match; // use console.error for debuging // console.error(parsedUrl); @@ -33,11 +34,21 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { result.rtype = 'REPORT'; result.mime = 'HTML'; result.unitid = param.doc_id; + } else if ((match = /^\/v2\/documents\/report\/([0-9]+)$/i.exec(path)) != null) { + // https://www.emis.com/v2/documents/report/834568862?keyword=anduril + result.rtype = 'REPORT'; + result.mime = 'HTML'; + result.unitid = match[1]; } else if (/^\/php\/companies\/index$/i.test(path)) { // https://www.emis.com/php/companies/index?pc=HK&cmpy=9737982 result.rtype = 'RECORD'; result.mime = 'HTML'; result.unitid = param.cmpy; + } else if ((match = /^\/v2\/companies\/profile\/[a-zA-Z0-9]+\/([0-9]+)$/i.exec(path)) != null) { + // https://www.emis.com/v2/companies/profile/US/14330970 + result.rtype = 'RECORD'; + result.mime = 'HTML'; + result.unitid = match[1]; } else if (/^\/php\/companies\/index\/keystatsbox$/i.test(path) && param.excel === '1') { // https://www.emis.com/php/companies/index/keystatsbox?pc=HK&cmpy=9737982&hideValues=&currency=HKD&display_units=3&excel=1&tbl=keystats-page-table-exchange result.rtype = 'DATASET'; @@ -47,6 +58,11 @@ module.exports = new Parser(function analyseEC(parsedUrl, ec) { // https://www.emis.com/php/search/searchv2 
result.rtype = 'SEARCH'; result.mime = 'HTML'; + } else if ((match = /^\/v2\/documents\/([0-9]+)$/i.exec(path)) != null) { + // https://www.emis.com/v2/documents/837338451 + result.rtype = 'ARTICLE'; + result.mime = 'HTML'; + result.unitid = match[1]; } return result; diff --git a/emis/test/emis.2023-04-17.csv b/emis/test/emis.2024-10-11.csv similarity index 62% rename from emis/test/emis.2023-04-17.csv rename to emis/test/emis.2024-10-11.csv index e51862a9..f915d9c6 100644 --- a/emis/test/emis.2023-04-17.csv +++ b/emis/test/emis.2024-10-11.csv @@ -1,7 +1,10 @@ out-unitid;out-rtype;out-mime;in-url -;SEARCH;HTML;https://www.emis.com/php/search/searchv2 -766831747;REPORT;PDF;https://www.emis.com/php/search/docpdf?doc_id=766831747 -9737982;RECORD;HTML;https://www.emis.com/php/companies/index?pc=HK&cmpy=9737982 -9737982;DATASET;XLS;https://www.emis.com/php/companies/index/keystatsbox?pc=HK&cmpy=9737982&hideValues=&currency=HKD&display_units=3&excel=1&tbl=keystats-page-table-exchange +837338451;ARTICLE;HTML;https://www.emis.com/v2/documents/837338451 +834568862;REPORT;HTML;https://www.emis.com/v2/documents/report/834568862?keyword=anduril +14330970;RECORD;HTML;https://www.emis.com/v2/companies/profile/US/14330970 +719516695;REPORT;HTML;https://www.emis.com/php/search/pdf2html?pc=BR&doc_id=719516695&type=1 719516695;REPORT;PDF;https://www.emis.com/php/search/docpdf?pc=BR&sv=EMIS&doc_id=719516695 -719516695;REPORT;HTML;https://www.emis.com/php/search/pdf2html?pc=BR&doc_id=719516695&type=1 \ No newline at end of file +9737982;DATASET;XLS;https://www.emis.com/php/companies/index/keystatsbox?pc=HK&cmpy=9737982&hideValues=&currency=HKD&display_units=3&excel=1&tbl=keystats-page-table-exchange +9737982;RECORD;HTML;https://www.emis.com/php/companies/index?pc=HK&cmpy=9737982 +766831747;REPORT;PDF;https://www.emis.com/php/search/docpdf?doc_id=766831747 +;SEARCH;HTML;https://www.emis.com/php/search/searchv2 \ No newline at end of file