# Databricks notebook source
import html
import re
import traceback
import unicodedata

import pyspark.sql.functions as f
import pyspark.sql.types as t
from bs4 import BeautifulSoup

# COMMAND ----------

"""
Table contains the following info:
  Publication ID
  Author Info: Array[{"author ID": "value", "given name", "initials", "last name"}]
  xml: full text xml
"""
# NOTE(review): `table` is the Databricks notebook global (equivalent to
# spark.table) — resolves "database.table" against the current metastore.
cs_unstructured_corpus = table("database.table")

# COMMAND ----------
def get_credit_statement(xml):
    """
    Extract the CRediT (author-contribution) statement paragraphs from a
    full-text article XML string.

    Finds a section heading matching the CRediT regex, then collects the
    section's <para view="all"> elements, walking one extra level up the
    tree when the immediate grandparent holds no paragraphs (some credit
    statements are split across multiple paragraphs in a deeper section).

    This presumes a full-text schema based on Elsevier's Journal Article DTD
    (see https://www.elsevier.com/researcher/author/policies-and-guidelines/elsevier-xml-dtds-and-transport-schemas)
    and that the article authors labeled the section with a matching heading.

    Parameters
    ----------
    xml : str
        Full-text article XML.

    Returns
    -------
    list[str]
        Paragraph texts with non-breaking spaces normalized and colons
        stripped.

    Raises
    ------
    AttributeError
        When no heading matches the regex (``find`` returns ``None``);
        the caller (``try_cs``) converts this into a traceback string.
    """
    soup = BeautifulSoup(xml, 'xml')
    section_header_regex = re.compile(
        'CRediT|[Aa]uthor(ship)? ([Cc]ontribut|[Ss]tatement)|([Cc]ontribution|[Cc]redit) [Ss]tatement'
    )

    # Raises AttributeError when nothing matches — intentional, see docstring.
    heading = soup.body.find(string=section_header_regex)

    # Collect the parent section two levels up from the matched heading text.
    section = heading.parent.parent
    paras = section.find_all('para', {"view": "all"})
    if not paras:
        # Paragraphs live one level higher in some documents.
        section = heading.parent.parent.parent
        paras = section.find_all('para', {"view": "all"})

    # \xa0 (non-breaking space) shows up as a weird encoding artifact.
    credit_statement = [item.text.replace(u'\xa0', u' ') for item in paras]
    return [cs.replace(':', '') for cs in credit_statement]
def try_cs(xml):
    """
    Safe wrapper around ``get_credit_statement`` for use as a Spark UDF.

    Returns the extracted credit-statement paragraphs on success. On any
    failure, returns the formatted traceback so the error can be inspected
    directly in the output table instead of killing the Spark job.

    Parameters
    ----------
    xml : str
        Full-text article XML.

    Returns
    -------
    list[str]
        Credit-statement paragraphs, or a single-element list holding the
        formatted traceback.
    """
    try:
        return get_credit_statement(xml)
    except Exception as e:
        # Wrap in a list so the error path conforms to the UDF's declared
        # ArrayType(StringType()) schema — a bare str would not.
        return [''.join(traceback.format_exception(None, e, e.__traceback__))]
def get_author_group(xml):
    """
    Return the <author-group> element of the article XML as a cleaned string.

    The element is stringified, Unicode-normalized (NFKD) and HTML-unescaped.
    When the tag is absent, ``find`` yields ``None`` and the literal string
    ``"None"`` is returned.
    """
    tag = BeautifulSoup(xml, 'xml').find('author-group')
    normalized = unicodedata.normalize("NFKD", str(tag))
    return html.unescape(normalized)
# COMMAND ----------

# Register the extraction UDFs, derive the credit-statement and author-group
# columns from the raw xml, then persist the slimmed-down table.
cs_udf = f.udf(try_cs, t.ArrayType(t.StringType()))
ag_udf = f.udf(get_author_group, t.StringType())

cs_unstructured_corpus = (
    cs_unstructured_corpus
    .withColumn('credit_statement', cs_udf(f.col('xml')))
    .withColumn('author_group', ag_udf(f.col('xml')))
    .select('PII', 'Au', 'credit_statement', 'author_group')
)

cs_unstructured_corpus.write.mode("overwrite").format("delta").saveAsTable("database.table")