# Databricks notebook source
import html
import re
import traceback
import unicodedata

import pyspark.sql.functions as f
import pyspark.sql.types as t
from bs4 import BeautifulSoup

# COMMAND ----------

"""
Table contains the following info:
  Publication ID
  Author Info: Array[{"author ID": "value", "given name", "initials", "last name"}]
  xml: full text xml
"""
# NOTE(review): `table` is the Databricks notebook global (equivalent to
# spark.table) — resolves "database.table" against the current metastore.
cs_unstructured_corpus = table("database.table")

# COMMAND ----------
def get_credit_statement(xml):
    """
    Extract the CRediT (author-contribution) statement paragraphs from a
    full-text article XML string.

    Finds a section heading matching the CRediT regex, then collects the
    section's <para view="all"> elements, walking one extra level up the
    tree when the immediate grandparent holds no paragraphs (some credit
    statements are split across multiple paragraphs in a deeper section).

    This presumes a full-text schema based on Elsevier's Journal Article DTD
    (see https://www.elsevier.com/researcher/author/policies-and-guidelines/elsevier-xml-dtds-and-transport-schemas)
    and that the article authors labeled the section with a matching heading.

    Parameters
    ----------
    xml : str
        Full-text article XML.

    Returns
    -------
    list[str]
        Paragraph texts with non-breaking spaces normalized and colons
        stripped.

    Raises
    ------
    AttributeError
        When no heading matches the regex (``find`` returns ``None``);
        the caller (``try_cs``) converts this into a traceback string.
    """
    soup = BeautifulSoup(xml, 'xml')
    section_header_regex = re.compile(
        'CRediT|[Aa]uthor(ship)? ([Cc]ontribut|[Ss]tatement)|([Cc]ontribution|[Cc]redit) [Ss]tatement'
    )

    # Raises AttributeError when nothing matches — intentional, see docstring.
    heading = soup.body.find(string=section_header_regex)

    # Collect the parent section two levels up from the matched heading text.
    section = heading.parent.parent
    paras = section.find_all('para', {"view": "all"})
    if not paras:
        # Paragraphs live one level higher in some documents.
        section = heading.parent.parent.parent
        paras = section.find_all('para', {"view": "all"})

    # \xa0 (non-breaking space) shows up as a weird encoding artifact.
    credit_statement = [item.text.replace(u'\xa0', u' ') for item in paras]
    return [cs.replace(':', '') for cs in credit_statement]
def try_cs(xml):
    """
    Safe wrapper around ``get_credit_statement`` for use as a Spark UDF.

    Returns the extracted credit-statement paragraphs on success. On any
    failure, returns the formatted traceback so the error can be inspected
    directly in the output table instead of killing the Spark job.

    Parameters
    ----------
    xml : str
        Full-text article XML.

    Returns
    -------
    list[str]
        Credit-statement paragraphs, or a single-element list holding the
        formatted traceback.
    """
    try:
        return get_credit_statement(xml)
    except Exception as e:
        # Wrap in a list so the error path conforms to the UDF's declared
        # ArrayType(StringType()) schema — a bare str would not.
        return [''.join(traceback.format_exception(None, e, e.__traceback__))]
def get_author_group(xml):
    """
    Return the <author-group> element of the article XML as a cleaned string.

    The element is stringified, Unicode-normalized (NFKD) and HTML-unescaped.
    When the tag is absent, ``find`` yields ``None`` and the literal string
    ``"None"`` is returned.
    """
    tag = BeautifulSoup(xml, 'xml').find('author-group')
    normalized = unicodedata.normalize("NFKD", str(tag))
    return html.unescape(normalized)
# COMMAND ----------

# Register the extraction UDFs, derive the credit-statement and author-group
# columns from the raw xml, then persist the slimmed-down table.
cs_udf = f.udf(try_cs, t.ArrayType(t.StringType()))
ag_udf = f.udf(get_author_group, t.StringType())

cs_unstructured_corpus = (
    cs_unstructured_corpus
    .withColumn('credit_statement', cs_udf(f.col('xml')))
    .withColumn('author_group', ag_udf(f.col('xml')))
    .select('PII', 'Au', 'credit_statement', 'author_group')
)

cs_unstructured_corpus.write.mode("overwrite").format("delta").saveAsTable("database.table")