Skip to content

Support multiple parsers #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
"host_permissions": [
"*://api.notion.com/*",
"*://www.notion.so/*",
"*://openreview.net/*"
"*://openreview.net/*",
"*://aclanthology.org/*"
],
"content_security_policy": {
"extension_pages": "script-src 'self'; object-src 'self'"
Expand Down
147 changes: 147 additions & 0 deletions src/js/parsers.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
// MIT License
// Copyright (c) 2024 denkiwakame <denkivvakame@gmail.com>

/**
 * Registry that dispatches a paper URL to the first registered handler whose
 * URL prefix matches the beginning of that URL.
 */
class URLParser {
  constructor() {
    // Ordered list of { domain, handler } entries; earlier registrations win.
    this.parsers = [];
  }

  /**
   * Registers a handler for every URL that starts with `domain`.
   * @param {string} domain - URL prefix to match, e.g. 'https://arxiv.org'.
   * @param {(url: string) => any} handler - Invoked with the URL on a match.
   */
  addParser(domain, handler) {
    this.parsers.push({ domain, handler });
  }

  /**
   * Runs the first handler whose prefix matches `url` and resolves with its
   * result.
   * @param {string} url - URL of the paper page.
   * @returns {Promise<any>} Whatever the matching handler returns.
   * @throws {Error} When `url` is not a string or no registered prefix matches.
   */
  async parse(url) {
    // A non-string URL can never match a prefix; report it as "no parser"
    // instead of failing on a missing startsWith method.
    if (typeof url === 'string') {
      for (const { domain, handler } of this.parsers) {
        if (url.startsWith(domain)) return handler(url);
      }
    }
    throw new Error('No parser found for the given URL');
  }
}

/**
 * Fetches the metadata of an arXiv paper through the arXiv export API.
 *
 * @param {string} url - arXiv page URL (abs/pdf page, optionally versioned).
 * @returns {Promise<{id: string, title: string, abst: string,
 *   authors: string[], url: string, published: string, comment: string,
 *   publisher: string}>} Metadata read from the first Atom entry.
 * @throws {Error} When no identifier can be read from `url`, the API answers
 *   with a status other than 200, or the response contains no entry.
 */
const arXivParser = async (url) => {
  const ARXIV_API = 'http://export.arxiv.org/api/query/search_query';
  // ref: https://info.arxiv.org/help/arxiv_identifier.html
  // e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063)
  // The tail after the capture group tolerates a version suffix ("v2"), a
  // ".pdf" extension, a trailing slash and a query string or fragment, so the
  // version-less identifier is captured from page URLs and from the versioned
  // <id> element of the API response alike.
  const ARXIV_ID = /(\d+\.\d+|[\w.-]+\/\d+)(?:v\d+)?(?:\.pdf)?\/?(?:[?#].*)?$/;
  const parseArXivId = (str) => str?.match(ARXIV_ID)?.[1];

  const paperId = parseArXivId(url);
  if (!paperId) {
    throw new Error('Could not find an arXiv identifier in the given URL');
  }

  const res = await fetch(`${ARXIV_API}?id_list=${paperId}`);
  if (res.status !== 200) {
    console.error('arXiv API request failed');
    // Reject instead of resolving with undefined, which callers would
    // dereference immediately.
    throw new Error(`arXiv API request failed with status ${res.status}`);
  }
  const data = await res.text();
  const xmlData = new DOMParser().parseFromString(data, 'text/xml');

  const entry = xmlData.querySelector('entry');
  if (!entry) {
    throw new Error(`arXiv API returned no entry for ${paperId}`);
  }

  // Fall back to the identifier taken from the URL when <id> is unusable.
  const id = parseArXivId(entry.querySelector('id')?.textContent) ?? paperId;
  const title = entry.querySelector('title').textContent;
  const abst = entry.querySelector('summary').textContent;
  // Prefer the <name> child so that other children of <author> (such as an
  // affiliation) are not glued onto the author's name.
  const authors = Array.from(entry.querySelectorAll('author'), (author) =>
    (author.querySelector('name') ?? author).textContent.trim()
  );
  const published = entry.querySelector('published').textContent;
  const comment = entry.querySelector('comment')?.textContent ?? 'none';

  return {
    id,
    title,
    abst,
    authors,
    url,
    published,
    comment,
    publisher: 'arXiv',
  };
};

/**
 * Scrapes the citation <meta> tags of an OpenReview forum page.
 *
 * @param {string} url - OpenReview page URL carrying the paper in `?id=`.
 * @returns {Promise<{id: (string|null), title: string, abst: string,
 *   authors: string[], url: string, published: string, comment: string,
 *   publisher: string}>} Paper metadata; `published` is a YYYY-MM-DD string.
 * @throws {Error} When the page cannot be fetched, or the title or the
 *   publication date is missing or unparsable.
 */
const openReviewParser = async (url) => {
  const id = new URL(url).searchParams.get('id');
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`OpenReview request failed with status ${res.status}`);
  }
  const html = await res.text();
  const doc = new DOMParser().parseFromString(html, 'text/html');

  // Content of <meta name="..."> or null when the tag is absent.
  const metaContent = (name) =>
    doc.querySelector(`meta[name="${name}"]`)?.getAttribute('content') ?? null;

  // Converts the meta date into an ISO 8601 calendar date (YYYY-MM-DD).
  const toIsoDate = (raw) => {
    if (raw == null) {
      throw new Error('OpenReview page has no publication date metadata');
    }
    // Plain calendar dates such as "2024/01/16" are reformatted textually:
    // Date parses that form as local time and toISOString() then converts it
    // to UTC, which moves the date back one day east of UTC.
    const parts = /^\s*(\d{4})[\/-](\d{1,2})(?:[\/-](\d{1,2}))?\s*$/.exec(raw);
    if (parts) {
      const [, year, month, day = '1'] = parts;
      const monthNo = Number(month);
      const dayNo = Number(day);
      if (monthNo >= 1 && monthNo <= 12 && dayNo >= 1 && dayNo <= 31) {
        return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
      }
    }
    const parsed = new Date(raw);
    if (Number.isNaN(parsed.getTime())) {
      throw new Error(`Unrecognized publication date: ${raw}`);
    }
    return parsed.toISOString().split('T')[0];
  };

  const listedAuthors = Array.from(
    doc.querySelectorAll('meta[name="citation_author"]'),
    (author) => author.getAttribute('content')
  );
  // Pages without author meta tags are reported as anonymous.
  const authors = listedAuthors.length ? listedAuthors : ['Anonymous'];

  const title = metaContent('citation_title');
  if (title === null) {
    throw new Error('OpenReview page has no citation_title metadata');
  }
  const abst = metaContent('citation_abstract') ?? 'none';
  const published = toIsoDate(metaContent('citation_online_date'));

  return {
    id,
    title,
    abst,
    authors,
    url,
    published,
    comment: 'none',
    publisher: 'OpenReview',
  };
};

/**
 * Scrapes the citation <meta> tags and the detail list of an ACL Anthology
 * paper page.
 *
 * @param {string} url - ACL Anthology paper page URL.
 * @returns {Promise<{id: string, title: string, abst: string,
 *   authors: string[], url: string, published: string, comment: string,
 *   publisher: string}>} Paper metadata; `published` is a YYYY-MM-DD string
 *   and `comment` carries the PDF URL when the page advertises one.
 * @throws {Error} When the page cannot be fetched, or the title or the
 *   publication date is missing or unparsable.
 */
const aclAnthologyParser = async (url) => {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`ACL Anthology request failed with status ${res.status}`);
  }
  const html = await res.text();
  const doc = new DOMParser().parseFromString(html, 'text/html');

  // Content of <meta name="..."> or null when the tag is absent.
  const metaContent = (name) =>
    doc.querySelector(`meta[name="${name}"]`)?.getAttribute('content') ?? null;

  // Converts the meta date into an ISO 8601 calendar date (YYYY-MM-DD).
  const toIsoDate = (raw) => {
    if (raw == null) {
      throw new Error('ACL Anthology page has no publication date metadata');
    }
    // Plain calendar dates such as "2023/11" or "2023/11/01" are reformatted
    // textually: Date parses those forms as local time and toISOString() then
    // converts to UTC, which moves the date back one day east of UTC.
    const parts = /^\s*(\d{4})[\/-](\d{1,2})(?:[\/-](\d{1,2}))?\s*$/.exec(raw);
    if (parts) {
      const [, year, month, day = '1'] = parts;
      const monthNo = Number(month);
      const dayNo = Number(day);
      if (monthNo >= 1 && monthNo <= 12 && dayNo >= 1 && dayNo <= 31) {
        return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
      }
    }
    const parsed = new Date(raw);
    if (Number.isNaN(parsed.getTime())) {
      throw new Error(`Unrecognized publication date: ${raw}`);
    }
    return parsed.toISOString().split('T')[0];
  };

  // Not every page carries a DOI; fall back to the last path segment of the
  // URL (e.g. "2023.ijcnlp-main.1"), then to the URL itself.
  const lastPathSegment = new URL(url).pathname.split('/').filter(Boolean).pop();
  const id = metaContent('citation_doi') ?? lastPathSegment ?? url;

  const authors = Array.from(
    doc.querySelectorAll('meta[name="citation_author"]'),
    (author) => author.getAttribute('content')
  );

  const title = metaContent('citation_title');
  if (title === null) {
    throw new Error('ACL Anthology page has no citation_title metadata');
  }

  const abst = 'none';
  const published = toIsoDate(metaContent('citation_publication_date'));

  // Prefer the <dd> that follows a "Publisher" <dt> label, because the row
  // position changes with the optional rows a page shows.
  // NOTE(review): assumes the detail list labels that row with a <dt> whose
  // text starts with "Publisher" — confirm against live pages. The original
  // positional lookup (seventh <dd>) is kept as the fallback.
  const publisherLabel = Array.from(
    doc.querySelectorAll('.acl-paper-details dt')
  ).find((label) => label.textContent.trim().startsWith('Publisher'));
  const publisherNode =
    publisherLabel?.nextElementSibling ??
    doc.querySelectorAll('.acl-paper-details dd')[6];
  const publisher =
    publisherNode?.textContent.replaceAll('\n', '').trim() || 'ACL Anthology';

  const comment = metaContent('citation_pdf_url') ?? 'none';

  return {
    id,
    title,
    abst,
    authors,
    url,
    published,
    comment,
    publisher,
  };
};

// Shared registry used by the popup. Handlers are matched by plain URL prefix,
// so every prefix ends with "/" to keep a look-alike host such as
// "https://arxiv.org.example.com" from matching the arXiv entry.
const urlParser = new URLParser();
urlParser.addParser('https://openreview.net/', openReviewParser);
urlParser.addParser('https://arxiv.org/', arXivParser);
urlParser.addParser('https://aclanthology.org/', aclAnthologyParser);

export default urlParser;
95 changes: 7 additions & 88 deletions src/js/popup.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ import Icons from 'uikit/dist/js/uikit-icons';
import Mustache from 'mustache';
import NotionClient from './notion.js';
import thenChrome from 'then-chrome';
import urlParser from './parsers.js';

UIKit.use(Icons);

const TEST_URL = 'https://arxiv.org/abs/2308.04079';
const ARXIV_API = 'http://export.arxiv.org/api/query/search_query';
// const TEST_URL = 'https://aclanthology.org/2023.ijcnlp-main.1/';

class UI {
constructor() {
this.setupProgressBar();
Expand Down Expand Up @@ -97,13 +99,11 @@ class UI {
return url && url.split('.').pop() === 'pdf';
}
async getPaperInfo(url) {
if (this.isArxivUrl(url)) return this.getArXivInfo(url);
if (this.isOpenReviewUrl(url)) return this.getOpenReviewInfo(url);
this.showProgressBar();
const data = await urlParser.parse(url);
this.setFormContents(data.title, data.abst, data.comment, data.authors);
return data;
}
// Extracts an arXiv identifier from the end of `str`.
// ref: https://info.arxiv.org/help/arxiv_identifier.html
// e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063)
// Both alternatives are anchored to the end of the string, so input that ends
// in anything else (for example a version suffix such as "v1") yields
// undefined. `str` itself must be a string; a nullish value throws.
parseArXivId = (str) => str.match(/(\d+\.\d+$)|((\w|-)+\/\d+$)/)?.[0];

setFormContents(paperTitle, abst, comment, authors) {
document.getElementById('js-title').value = paperTitle;
document.getElementById('js-abst').value = abst;
Expand All @@ -118,87 +118,6 @@ class UI {
});
}

/**
 * Looks up an arXiv paper through the arXiv API, fills the popup form with the
 * result and returns the collected metadata.
 * @param {string} url - arXiv page URL; its identifier is read by parseArXivId.
 * @returns {Promise<object|undefined>} Object with id, title, abst, authors,
 *   url, published, comment and publisher; undefined when the API answers
 *   with a status other than 200.
 */
async getArXivInfo(url) {
this.showProgressBar();
const paperId = this.parseArXivId(url);

// paperId is undefined when no identifier was found, in which case
// toString() throws a TypeError here.
const res = await fetch(ARXIV_API + '?id_list=' + paperId.toString());
if (res.status != 200) {
console.error('arXiv API request failed');
return;
}
const data = await res.text(); // TODO: error handling
console.log(res.status);
const xmlData = new window.DOMParser().parseFromString(data, 'text/xml');
console.log(xmlData);

// Only the first <entry> of the response is read.
const entry = xmlData.querySelector('entry');
const id = this.parseArXivId(entry.querySelector('id')?.textContent);
const paperTitle = entry.querySelector('title').textContent;
const abst = entry.querySelector('summary').textContent;
// Uses the full text content of each <author> element, trimmed.
const authors = Array.from(entry.querySelectorAll('author')).map(
(author) => {
return author.textContent.trim();
}
);
const published = entry.querySelector('published').textContent;
// 'none' stands in when the entry has no comment element.
const comment = entry.querySelector('comment')?.textContent ?? 'none';
this.setFormContents(paperTitle, abst, comment, authors);
return {
id: id,
title: paperTitle,
abst: abst,
authors: authors,
url: url,
published: published,
comment: comment,
publisher: 'arXiv',
};
}

/**
 * Fetches an OpenReview page, reads its citation <meta> tags, fills the popup
 * form with the result and returns the collected metadata.
 * @param {string} url - OpenReview page URL; the paper id is its `id` query
 *   parameter.
 * @returns {Promise<object>} Object with id, title, abst, authors, url,
 *   published (YYYY-MM-DD), comment and publisher.
 */
async getOpenReviewInfo(url) {
this.showProgressBar();
const id = new URLSearchParams(new URL(url).search).get('id');

// The response status is not checked; the body is parsed as HTML as-is.
const res = await fetch(url);
const html = await res.text();
const parser = new DOMParser();
const xml = parser.parseFromString(html, 'text/html');

const authorsArray = Array.from(
xml.querySelectorAll('meta[name="citation_author"]'),
(author) => author.getAttribute('content')
);
// Falls back to a single 'Anonymous' author when no author tags exist.
const authors = authorsArray.length ? authorsArray : ['Anonymous'];

// Each lookup below throws a TypeError when its <meta> tag is missing.
const paperTitle = xml
.querySelector('meta[name="citation_title"]')
.getAttribute('content');

const abst = xml
.querySelector('meta[name="citation_abstract"]')
.getAttribute('content');

const date = xml
.querySelector('meta[name="citation_publication_date"]')
.getAttribute('content');
// -> ISO 8601 date string
// Keeps only the date part of the UTC timestamp from toISOString().
const published = new Date(date).toISOString().split('T')[0];
const comment = 'none';

this.setFormContents(paperTitle, abst, comment, authors);
return {
id: id,
title: paperTitle,
abst: abst,
authors: authors,
url: url,
published: published,
comment: comment,
publisher: 'OpenReview',
};
}

renderMessage(type, message, overwrite = false) {
// type: warning, danger, success, primary
const template = `<div class="uk-alert-{{type}}" uk-alert><a class="uk-alert-close" uk-close></a><p>{{message}}</p></div>`;
Expand Down
Loading