From 708140f4a852aab226093c62c30ef30d788ef58d Mon Sep 17 00:00:00 2001 From: denkiwakame Date: Tue, 28 May 2024 13:10:21 +0900 Subject: [PATCH 1/3] wip --- src/js/parsers.js | 107 ++++++++++++++++++++++++++++++++++++++++++++++ src/js/popup.js | 94 +++------------------------------------- 2 files changed, 113 insertions(+), 88 deletions(-) create mode 100644 src/js/parsers.js diff --git a/src/js/parsers.js b/src/js/parsers.js new file mode 100644 index 0000000..c4fe269 --- /dev/null +++ b/src/js/parsers.js @@ -0,0 +1,107 @@ +// MIT License +// Copyright (c) 2024 denkiwakame + +class URLParser { + constructor() { + this.parsers = []; + } + + addParser(domain, handler) { + this.parsers.push({ domain, handler }); + } + + async parse(url) { + for (let { domain, handler } of this.parsers) { + if (url?.startsWith(domain)) return handler(url); + } + throw new Error('No perser found for the given URL'); + } +} + +const openReviewParser = async (url) => { + const id = new URLSearchParams(new URL(url).search).get('id'); + console.error(id); + + const res = await fetch(url); + const html = await res.text(); + const parser = new DOMParser(); + const xml = parser.parseFromString(html, 'text/html'); + console.error(xml); + + const authorsArray = Array.from( + xml.querySelectorAll('meta[name="citation_author"]'), + (author) => author.getAttribute('content') + ); + const authors = authorsArray.length ? authorsArray : ['Anonymous']; + + const paperTitle = xml + .querySelector('meta[name="citation_title"]') + .getAttribute('content'); + + const abst = xml + .querySelector('meta[name="citation_abstract"]') + .getAttribute('content'); + + const date = xml + .querySelector('meta[name="citation_online_date"]') + .getAttribute('content'); + // -> ISO 8601 date string + const published = new Date(date).toISOString().split('T')[0]; + const comment = 'none'; + + return { + id: id, + title: paperTitle, + abst: abst, + authors: authors, + url: url, + published: published, + comment: comment, + publisher: 'OpenReview', + }; +}; + +const arXivParser = async (url) => { + const ARXIV_API = 'http://export.arxiv.org/api/query/search_query'; + // ref: https://info.arxiv.org/help/arxiv_identifier.html + // e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063) + const parseArXivId = (str) => str.match(/(\d+\.\d+$)|((\w|-)+\/\d+$)/)?.[0]; + + const paperId = parseArXivId(url); + const res = await fetch(ARXIV_API + '?id_list=' + paperId.toString()); + if (res.status != 200) { + console.error('arXiv API request failed'); + return; + } + const data = await res.text(); // TODO: error handling + console.log(res.status); + const xmlData = new window.DOMParser().parseFromString(data, 'text/xml'); + console.log(xmlData); + + const entry = xmlData.querySelector('entry'); + const id = parseArXivId(entry.querySelector('id')?.textContent); + const paperTitle = entry.querySelector('title').textContent; + const abst = entry.querySelector('summary').textContent; + const authors = Array.from(entry.querySelectorAll('author')).map((author) => { + return author.textContent.trim(); + }); + const published = entry.querySelector('published').textContent; + const comment = entry.querySelector('comment')?.textContent ?? 'none'; + + return { + id: id, + title: paperTitle, + abst: abst, + authors: authors, + url: url, + published: published, + comment: comment, + publisher: 'arXiv', + }; +}; + +const urlParser = new URLParser(); +urlParser.addParser('https://openreview.net/', openReviewParser); +urlParser.addParser('https://arxiv.org', arXivParser); + +export default urlParser; diff --git a/src/js/popup.js b/src/js/popup.js index a975497..47e1b0e 100644 --- a/src/js/popup.js +++ b/src/js/popup.js @@ -7,11 +7,12 @@ import Icons from 'uikit/dist/js/uikit-icons'; import Mustache from 'mustache'; import NotionClient from './notion.js'; import thenChrome from 'then-chrome'; +import urlParser from './parsers.js'; UIKit.use(Icons); const TEST_URL = 'https://arxiv.org/abs/2308.04079'; -const ARXIV_API = 'http://export.arxiv.org/api/query/search_query'; + class UI { constructor() { this.setupProgressBar(); @@ -97,13 +98,11 @@ class UI { return url && url.split('.').pop() === 'pdf'; } async getPaperInfo(url) { - if (this.isArxivUrl(url)) return this.getArXivInfo(url); - if (this.isOpenReviewUrl(url)) return this.getOpenReviewInfo(url); + this.showProgressBar(); + const data = await urlParser.parse(url); + this.setFormContents(data.title, data.abst, data.comment, data.authors); + return data; } - // ref: https://info.arxiv.org/help/arxiv_identifier.html - // e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063) - parseArXivId = (str) => str.match(/(\d+\.\d+$)|((\w|-)+\/\d+$)/)?.[0]; - setFormContents(paperTitle, abst, comment, authors) { document.getElementById('js-title').value = paperTitle; document.getElementById('js-abst').value = abst; @@ -118,87 +117,6 @@ class UI { }); } - async getArXivInfo(url) { - this.showProgressBar(); - const paperId = this.parseArXivId(url); - - const res = await fetch(ARXIV_API + '?id_list=' + paperId.toString()); - if (res.status != 200) { - console.error('arXiv API request failed'); - return; - } - const data = await res.text(); // TODO: error handling - console.log(res.status); - const xmlData = new window.DOMParser().parseFromString(data, 'text/xml'); - console.log(xmlData); - - const entry = xmlData.querySelector('entry'); - const id = this.parseArXivId(entry.querySelector('id')?.textContent); - const paperTitle = entry.querySelector('title').textContent; - const abst = entry.querySelector('summary').textContent; - const authors = Array.from(entry.querySelectorAll('author')).map( - (author) => { - return author.textContent.trim(); - } - ); - const published = entry.querySelector('published').textContent; - const comment = entry.querySelector('comment')?.textContent ?? 'none'; - this.setFormContents(paperTitle, abst, comment, authors); - return { - id: id, - title: paperTitle, - abst: abst, - authors: authors, - url: url, - published: published, - comment: comment, - publisher: 'arXiv', - }; - } - - async getOpenReviewInfo(url) { - this.showProgressBar(); - const id = new URLSearchParams(new URL(url).search).get('id'); - - const res = await fetch(url); - const html = await res.text(); - const parser = new DOMParser(); - const xml = parser.parseFromString(html, 'text/html'); - - const authorsArray = Array.from( - xml.querySelectorAll('meta[name="citation_author"]'), - (author) => author.getAttribute('content') - ); - const authors = authorsArray.length ? authorsArray : ['Anonymous']; - - const paperTitle = xml - .querySelector('meta[name="citation_title"]') - .getAttribute('content'); - - const abst = xml - .querySelector('meta[name="citation_abstract"]') - .getAttribute('content'); - - const date = xml - .querySelector('meta[name="citation_publication_date"]') - .getAttribute('content'); - // -> ISO 8601 date string - const published = new Date(date).toISOString().split('T')[0]; - const comment = 'none'; - - this.setFormContents(paperTitle, abst, comment, authors); - return { - id: id, - title: paperTitle, - abst: abst, - authors: authors, - url: url, - published: published, - comment: comment, - publisher: 'OpenReview', - }; - } - renderMessage(type, message, overwrite = false) { // type: warning, danger, success, primary const template = `

{{message}}

`; From df7d0bfd49c8672fd3413028c6ad76839e3b330a Mon Sep 17 00:00:00 2001 From: denkiwakame Date: Tue, 28 May 2024 14:27:07 +0900 Subject: [PATCH 2/3] add ACL anthology URL to allow CORS from chrome extension --- manifest.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/manifest.json b/manifest.json index 69cc628..95a4a34 100644 --- a/manifest.json +++ b/manifest.json @@ -20,7 +20,8 @@ "host_permissions": [ "*://api.notion.com/*", "*://www.notion.so/*", - "*://openreview.net/*" + "*://openreview.net/*", + "*://aclanthology.org/*" ], "content_security_policy": { "extension_pages": "script-src 'self'; object-src 'self'" From 191924b6359d05d5cb094c798420c760319d6132 Mon Sep 17 00:00:00 2001 From: denkiwakame Date: Tue, 28 May 2024 14:27:46 +0900 Subject: [PATCH 3/3] add ACL anthology parser --- src/js/parsers.js | 96 +++++++++++++++++++++++++++++++++-------------- src/js/popup.js | 1 + 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/src/js/parsers.js b/src/js/parsers.js index c4fe269..27e221c 100644 --- a/src/js/parsers.js +++ b/src/js/parsers.js @@ -18,15 +18,51 @@ class URLParser { } } +const arXivParser = async (url) => { + const ARXIV_API = 'http://export.arxiv.org/api/query/search_query'; + // ref: https://info.arxiv.org/help/arxiv_identifier.html + // e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063) + const parseArXivId = (str) => str.match(/(\d+\.\d+$)|((\w|-)+\/\d+$)/)?.[0]; + + const paperId = parseArXivId(url); + const res = await fetch(ARXIV_API + '?id_list=' + paperId.toString()); + if (res.status != 200) { + console.error('arXiv API request failed'); + return; + } + const data = await res.text(); // TODO: error handling + console.log(res.status); + const xmlData = new window.DOMParser().parseFromString(data, 'text/xml'); + console.log(xmlData); + + const entry = xmlData.querySelector('entry'); + const id = parseArXivId(entry.querySelector('id')?.textContent); + const paperTitle = entry.querySelector('title').textContent; + const abst = entry.querySelector('summary').textContent; + const authors = Array.from(entry.querySelectorAll('author')).map((author) => { + return author.textContent.trim(); + }); + const published = entry.querySelector('published').textContent; + const comment = entry.querySelector('comment')?.textContent ?? 'none'; + + return { + id: id, + title: paperTitle, + abst: abst, + authors: authors, + url: url, + published: published, + comment: comment, + publisher: 'arXiv', + }; +}; + const openReviewParser = async (url) => { const id = new URLSearchParams(new URL(url).search).get('id'); - console.error(id); - const res = await fetch(url); const html = await res.text(); const parser = new DOMParser(); const xml = parser.parseFromString(html, 'text/html'); - console.error(xml); const authorsArray = Array.from( xml.querySelectorAll('meta[name="citation_author"]'), @@ -61,33 +97,36 @@ const openReviewParser = async (url) => { }; }; -const arXivParser = async (url) => { - const ARXIV_API = 'http://export.arxiv.org/api/query/search_query'; - // ref: https://info.arxiv.org/help/arxiv_identifier.html - // e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063) - const parseArXivId = (str) => str.match(/(\d+\.\d+$)|((\w|-)+\/\d+$)/)?.[0]; +const aclAnthologyParser = async (url) => { + const res = await fetch(url); + const html = await res.text(); + const parser = new DOMParser(); + const xml = parser.parseFromString(html, 'text/html'); - const paperId = parseArXivId(url); - const res = await fetch(ARXIV_API + '?id_list=' + paperId.toString()); - if (res.status != 200) { - console.error('arXiv API request failed'); - return; - } - const data = await res.text(); // TODO: error handling - console.log(res.status); - const xmlData = new window.DOMParser().parseFromString(data, 'text/xml'); - console.log(xmlData); + const id = xml + .querySelector('meta[name="citation_doi"]') + .getAttribute('content'); + const authors = Array.from( + xml.querySelectorAll('meta[name="citation_author"]'), + (author) => author.getAttribute('content') + ); - const entry = xmlData.querySelector('entry'); - const id = parseArXivId(entry.querySelector('id')?.textContent); - const paperTitle = entry.querySelector('title').textContent; - const abst = entry.querySelector('summary').textContent; - const authors = Array.from(entry.querySelectorAll('author')).map((author) => { - return author.textContent.trim(); - }); - const published = entry.querySelector('published').textContent; - const comment = entry.querySelector('comment')?.textContent ?? 'none'; + const paperTitle = xml + .querySelector('meta[name="citation_title"]') + .getAttribute('content'); + const abst = 'none'; + const date = xml + .querySelector('meta[name="citation_publication_date"]') + .getAttribute('content'); + // -> ISO 8601 date string + const published = new Date(date).toISOString().split('T')[0]; + const publisher = xml + .querySelectorAll('.acl-paper-details dd')[6] + .textContent.replaceAll('\n', ''); + const comment = xml + .querySelector('meta[name="citation_pdf_url"]') + .getAttribute('content'); return { id: id, title: paperTitle, @@ -96,12 +135,13 @@ const arXivParser = async (url) => { url: url, published: published, comment: comment, - publisher: 'arXiv', + publisher: publisher, }; }; const urlParser = new URLParser(); urlParser.addParser('https://openreview.net/', openReviewParser); urlParser.addParser('https://arxiv.org', arXivParser); +urlParser.addParser('https://aclanthology.org', aclAnthologyParser); export default urlParser; diff --git a/src/js/popup.js b/src/js/popup.js index 47e1b0e..efad053 100644 --- a/src/js/popup.js +++ b/src/js/popup.js @@ -12,6 +12,7 @@ import urlParser from './parsers.js'; UIKit.use(Icons); const TEST_URL = 'https://arxiv.org/abs/2308.04079'; +// const TEST_URL = 'https://aclanthology.org/2023.ijcnlp-main.1/'; class UI { constructor() {