Skip to content

Commit 629d49e

Browse files
committed
First release of the CRediT contribution statement parser
0 parents  commit 629d49e

File tree

12 files changed

+3322
-0
lines changed

12 files changed

+3322
-0
lines changed

CITATION.cff

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
cff-version: 1.0.0
2+
title: "Credit Contribution Parser"
3+
message: "If you use this software, please cite it as below"
4+
authors:
5+
- given-names: "Elsevier"
6+
url: "https://github.com/elsevierlabs-os/credit-contribution-parser"
7+
license: "MIT"
8+
month: "06"
9+
year: "2024"
10+
journal: "Github repository"

CODEOWNERS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
* @j-fisher-els
2+
* @elsevier-kristy

CreditParser/CreditParser.py

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.

CreditParser/Preprocess_XML.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Databricks notebook source
import html
import re
import traceback
import unicodedata

import pyspark.sql.functions as f
import pyspark.sql.types as t
from bs4 import BeautifulSoup
7+
8+
# COMMAND ----------
9+
10+
"""
Table contains the following info
Publication ID
Author Info: Array[{"author ID": "value", "given name", "initials", "last name"}]
xml: full text xml
"""
# Load the source corpus: one row per publication, carrying its identifiers,
# author metadata and the full-text article XML.
# NOTE(review): `table` is the implicit Databricks/Spark-session global —
# presumably returns a Spark DataFrame; confirm against the notebook runtime.
cs_unstructured_corpus = table("database.table")
17+
18+
# COMMAND ----------
19+
20+
21+
22+
def get_credit_statement(xml):
    """
    Extract the CRediT contribution statement paragraphs from a full-text XML article.

    Finds a heading in the article body matching the CRediT / author-contribution
    regex, then collects the ``<para view="all">`` elements of the enclosing
    section — normally two levels above the matched text node, or three levels up
    when the closer ancestor holds no paragraphs (some credit statements are
    broken up into multiple paragraphs).

    This presumes a full-text schema based on Elsevier's Journal Article DTD
    (see https://www.elsevier.com/researcher/author/policies-and-guidelines/elsevier-xml-dtds-and-transport-schemas)
    and that the article authors have labeled the section with a heading
    matching the regex.

    Parameters
    ----------
    xml : str
        Full-text article XML.

    Returns
    -------
    list[str]
        Paragraph texts, with non-breaking spaces normalized to plain spaces
        and colons stripped.

    Raises
    ------
    AttributeError
        When no heading matches the regex (``find`` returns ``None``), as in
        the original control flow; callers wrap this via ``try_cs``.
    """
    soup = BeautifulSoup(xml, 'xml')
    section_header_regex = re.compile(
        'CRediT|[Aa]uthor(ship)? ([Cc]ontribut|[Ss]tatement)|([Cc]ontribution|[Cc]redit) [Ss]tatement'
    )
    heading = soup.body.find(string=section_header_regex)
    # The section usually sits two parents above the matched heading text;
    # when that ancestor contains no paragraphs, the statement lives one
    # level further up.
    credit_statement_sec = heading.parent.parent
    paras = credit_statement_sec.find_all('para', {"view": "all"})
    if not paras:
        credit_statement_sec = credit_statement_sec.parent
        paras = credit_statement_sec.find_all('para', {"view": "all"})
    # \xa0 (non-breaking space) shows up from the XML encoding; normalize it.
    credit_statement = [item.text.replace(u'\xa0', u' ') for item in paras]
    credit_statement = [cs.replace(':', '') for cs in credit_statement]
    return credit_statement
44+
45+
def try_cs(xml):
    """
    Safe wrapper around ``get_credit_statement`` for use as a Spark UDF.

    Returns the extracted credit-statement paragraphs, or — when extraction
    fails for any reason — the formatted traceback string, so the failure
    mode lands in the output table instead of aborting the whole job.

    Parameters
    ----------
    xml : str
        Full-text article XML.

    Returns
    -------
    list[str] | str
        Paragraph list on success; a traceback string on failure.
        NOTE(review): the string case does not match the UDF's declared
        ``ArrayType(StringType())`` schema — see registration site.
    """
    try:
        return get_credit_statement(xml)
    except Exception as e:
        # Surface the full traceback as the row value for later debugging.
        return ''.join(traceback.format_exception(None, e, e.__traceback__))
52+
53+
54+
def get_author_group(xml):
    """
    Return the serialized ``<author-group>`` element of a full-text XML article.

    The element is stringified, Unicode-normalized (NFKD) and HTML-unescaped.
    When the document has no author-group, the result is the string "None"
    (str of the missing tag), matching the original behavior.
    """
    parsed = BeautifulSoup(xml, 'xml')
    group_tag = parsed.find('author-group')
    normalized = unicodedata.normalize("NFKD", str(group_tag))
    return html.unescape(normalized)
59+
60+
# COMMAND ----------
61+
62+
# collect credit statement section from xml
# NOTE(review): try_cs returns a plain str (the traceback) on failure, while
# the UDF declares ArrayType(StringType()) — failed rows will not round-trip
# cleanly through this schema; confirm the intended failure representation.
cs_udf = f.udf(try_cs, t.ArrayType(t.StringType()))
cs_unstructured_corpus = cs_unstructured_corpus.withColumn('credit_statement', cs_udf(cs_unstructured_corpus.xml))

# collect author information from xml
ag_udf = f.udf(get_author_group, t.StringType())
cs_unstructured_corpus = cs_unstructured_corpus.withColumn('author_group', ag_udf(cs_unstructured_corpus.xml))

# Keep only the identifier, author metadata, and the two extracted columns,
# then overwrite the output Delta table.
cs_unstructured_corpus = cs_unstructured_corpus.select(['PII', 'Au', 'credit_statement', 'author_group'])
cs_unstructured_corpus.write.mode("overwrite").format("delta").saveAsTable("database.table")

0 commit comments

Comments
 (0)