-
Notifications
You must be signed in to change notification settings - Fork 204
Open
Description
https://github.com/lantunes/skipatom
Intended usage:
# Proposed usage for the requested "skipatom" preset.
from pymatgen.core.composition import Composition
# Two example compositions to featurize.
compositions = [Composition("Al2O3"), Composition("CeCoGe3")]
from matminer.featurizers.composition.composite import ElementProperty
# NOTE: "skipatom" is the preset being requested; the from_preset() quoted
# below does not handle this name and raises ValueError for it.
ep = ElementProperty.from_preset("skipatom")
# featurize_many is not defined in the quoted ElementProperty class --
# presumably inherited from BaseFeaturizer (TODO confirm).
comp_fingerprints = ep.featurize_many(compositions)
I think the existing `MatscholarElementData` class below would be a good example to follow for a new SkipAtom data class:
matminer/matminer/utils/data.py
Lines 386 to 411 in 886524a
class MatscholarElementData(AbstractData):
    """
    Lookup of per-element word-embedding vectors.

    The embeddings were generated using NLP + Neural Network techniques on
    more than 3 million scientific abstracts. The values exposed here are
    simply learned representations of the elements, taken from:

    Tshitoyan, V., Dagdelen, J., Weston, L. et al. Unsupervised word embeddings
    capture latent knowledge from materials science literature. Nature 571,
    95–98 (2019). https://doi.org/10.1038/s41586-019-1335-8

    Attributes:
        prop_names (list of str): property labels "embedding 1" through
            "embedding 200", one per embedding component.
        all_element_data (dict): maps each key of the loaded JSON file to a
            dict of {property label: embedding component}.
    """

    def __init__(self):
        embedding_path = os.path.join(module_dir, "data_files/matscholar_els.json")
        with open(embedding_path) as handle:
            raw_vectors = json.load(handle)

        # Component labels are 1-based.
        self.prop_names = [f"embedding {idx}" for idx in range(1, 201)]
        # Pair every stored vector with the component labels.
        self.all_element_data = {
            symbol: dict(zip(self.prop_names, vector)) for symbol, vector in raw_vectors.items()
        }

    def get_elemental_property(self, elem, property_name):
        """
        Return one embedding component for an element.

        Args:
            elem: element to look up; converted with str() before the lookup.
            property_name (str): one of the labels in prop_names.

        Returns:
            The stored value for that element and property.
        """
        per_element = self.all_element_data[str(elem)]
        return per_element[property_name]
The new data class would then need to be incorporated as a preset into `ElementProperty`:
matminer/matminer/featurizers/composition/composite.py
Lines 18 to 244 in 886524a
class ElementProperty(BaseFeaturizer):
    """
    Class to calculate elemental property attributes.

    To initialize quickly, use the from_preset() method.

    Features: Based on the statistics of the data_source chosen, computed
    by element stoichiometry. The format generally is:

    "{data source} {statistic} {property}"

    For example:

    "PymatgenData range X"  # Range of electronegativity from Pymatgen data

    For a list of all statistics, see the PropertyStats documentation; for a
    list of all attributes available for a given data_source, see the
    documentation for the data sources (e.g., PymatgenData, MagpieData,
    MatscholarElementData, etc.).

    Args:
        data_source (AbstractData or str): source from which to retrieve
            element property data (or use str for preset: "pymatgen",
            "magpie", "deml", "matscholar_el", or "megnet_el")
        features (list of strings): List of elemental properties to use
            (these must be supported by data_source)
        stats (list of strings): a list of weighted statistics to compute
            for each property (see PropertyStats for available stats)
    """

    def __init__(self, data_source, features, stats):
        # Known preset strings map to a data class that is instantiated here;
        # any other value is stored unchanged as the data source.
        preset_sources = {
            "pymatgen": PymatgenData,
            "magpie": MagpieData,
            "deml": DemlData,
            "matscholar_el": MatscholarElementData,
            "megnet_el": MEGNetElementData,
        }
        # The isinstance guard avoids hashing arbitrary data-source objects.
        if isinstance(data_source, str) and data_source in preset_sources:
            self.data_source = preset_sources[data_source]()
        else:
            self.data_source = data_source

        self.features = features
        self.stats = stats
        # Initialize stats computer
        self.pstats = PropertyStats()

    @classmethod
    def from_preset(cls, preset_name):
        """
        Return ElementProperty from a preset string

        Args:
            preset_name: (str) can be one of "magpie", "deml", "matminer",
                "matscholar_el", or "megnet_el".

        Returns:
            ElementProperty based on the preset name.

        Raises:
            ValueError: if preset_name is not one of the names listed above.
        """
        if preset_name == "magpie":
            data_source = "magpie"
            features = [
                "Number",
                "MendeleevNumber",
                "AtomicWeight",
                "MeltingT",
                "Column",
                "Row",
                "CovalentRadius",
                "Electronegativity",
                "NsValence",
                "NpValence",
                "NdValence",
                "NfValence",
                "NValence",
                "NsUnfilled",
                "NpUnfilled",
                "NdUnfilled",
                "NfUnfilled",
                "NUnfilled",
                "GSvolume_pa",
                "GSbandgap",
                "GSmagmom",
                "SpaceGroupNumber",
            ]
            stats = ["minimum", "maximum", "range", "mean", "avg_dev", "mode"]
        elif preset_name == "deml":
            data_source = "deml"
            stats = ["minimum", "maximum", "range", "mean", "std_dev"]
            features = [
                "atom_num",
                "atom_mass",
                "row_num",
                "col_num",
                "atom_radius",
                "molar_vol",
                "heat_fusion",
                "melting_point",
                "boiling_point",
                "heat_cap",
                "first_ioniz",
                "electronegativity",
                "electric_pol",
                "GGAU_Etot",
                "mus_fere",
                "FERE correction",
            ]
        elif preset_name == "matminer":
            data_source = "pymatgen"
            stats = ["minimum", "maximum", "range", "mean", "std_dev"]
            features = [
                "X",
                "row",
                "group",
                "block",
                "atomic_mass",
                "atomic_radius",
                "mendeleev_no",
                "electrical_resistivity",
                "velocity_of_sound",
                "thermal_conductivity",
                "melting_point",
                "bulk_modulus",
                "coefficient_of_linear_thermal_expansion",
            ]
        elif preset_name == "matscholar_el":
            # Build the data object once and hand the instance to __init__,
            # so the data is not loaded a second time from the preset string.
            data_source = MatscholarElementData()
            stats = ["minimum", "maximum", "range", "mean", "std_dev"]
            features = data_source.prop_names
        elif preset_name == "megnet_el":
            # Same single-instance approach as for "matscholar_el".
            data_source = MEGNetElementData()
            stats = ["minimum", "maximum", "range", "mean", "std_dev"]
            features = data_source.prop_names
        else:
            raise ValueError("Invalid preset_name specified!")

        return cls(data_source, features, stats)

    def featurize(self, comp):
        """
        Get elemental property attributes

        Args:
            comp: Pymatgen composition object

        Returns:
            all_attributes: Specified property statistics of features, as a
                flat list ordered feature by feature and, within each
                feature, stat by stat (the same order as feature_labels()).
        """
        # Get the element names and fractions
        elements, fractions = zip(*comp.element_composition.items())

        all_attributes = []
        for attr in self.features:
            # One property value per element, aligned with `fractions`.
            elem_data = [self.data_source.get_elemental_property(e, attr) for e in elements]
            all_attributes.extend(self.pstats.calc_stat(elem_data, stat, fractions) for stat in self.stats)
        return all_attributes

    def feature_labels(self):
        """
        Return one label per computed feature.

        Returns:
            list of str: labels of the form "{data source class name} {stat}
                {property}", ordered feature by feature and, within each
                feature, stat by stat.
        """
        src = self.data_source.__class__.__name__
        return [f"{src} {stat} {attr}" for attr in self.features for stat in self.stats]

    def citations(self):
        """
        Return BibTeX citations for the data source in use.

        Returns:
            list of str: a single-entry list for the known data sources, or
                an empty list for any other data source.
        """
        source_name = self.data_source.__class__.__name__
        if source_name == "MagpieData":
            citation = [
                "@article{ward_agrawal_choudary_wolverton_2016, title={A general-purpose "
                "machine learning framework for predicting properties of inorganic materials}, "
                "volume={2}, DOI={10.1038/npjcompumats.2017.28}, number={1}, journal={npj "
                "Computational Materials}, author={Ward, Logan and Agrawal, Ankit and Choudhary, "
                "Alok and Wolverton, Christopher}, year={2016}}"
            ]
        elif source_name == "DemlData":
            citation = [
                "@article{deml_ohayre_wolverton_stevanovic_2016, title={Predicting density "
                "functional theory total energies and enthalpies of formation of metal-nonmetal "
                "compounds by linear regression}, volume={47}, DOI={10.1002/chin.201644254}, "
                "number={44}, journal={ChemInform}, author={Deml, Ann M. and Ohayre, Ryan and "
                "Wolverton, Chris and Stevanovic, Vladan}, year={2016}}"
            ]
        elif source_name == "PymatgenData":
            citation = [
                "@article{Ong2013, author = {Ong, Shyue Ping and Richards, William Davidson and Jain, Anubhav and Hautier, "
                "Geoffroy and Kocher, Michael and Cholia, Shreyas and Gunter, Dan and Chevrier, Vincent L. and Persson, "
                "Kristin A. and Ceder, Gerbrand}, doi = {10.1016/j.commatsci.2012.10.028}, issn = {09270256}, "
                "journal = {Computational Materials Science}, month = {feb}, pages = {314--319}, "
                "publisher = {Elsevier B.V.}, title = {{Python Materials Genomics (pymatgen): A robust, open-source python "
                "library for materials analysis}}, url = {http://linkinghub.elsevier.com/retrieve/pii/S0927025612006295}, "
                "volume = {68}, year = {2013} } "
            ]
        elif source_name == "MEGNetElementData":
            # TODO: Cite MEGNet publication (not preprint) once released!
            citation = [
                "@ARTICLE{2018arXiv181205055C,"
                "author = {{Chen}, Chi and {Ye}, Weike and {Zuo}, Yunxing and {Zheng}, Chen and {Ong}, Shyue Ping},"
                "title = '{Graph Networks as a Universal Machine Learning Framework for Molecules and Crystals}',"
                "journal = {arXiv e-prints},"
                "keywords = {Condensed Matter - Materials Science, Physics - Computational Physics},"
                "year = '2018',"
                "month = 'Dec',"
                "eid = {arXiv:1812.05055},"
                "pages = {arXiv:1812.05055},"
                "archivePrefix = {arXiv},"
                "eprint = {1812.05055},"
                "primaryClass = {cond-mat.mtrl-sci},"
                r"adsurl = {https://ui.adsabs.harvard.edu/\#abs/2018arXiv181205055C},"
                "adsnote = {Provided by the SAO/NASA Astrophysics Data System}}"
            ]
        elif source_name == "MatscholarElementData":
            # Reference given in the MatscholarElementData docstring; the
            # author list is abbreviated there, hence "and others".
            citation = [
                "@article{tshitoyan_dagdelen_weston_2019, title={Unsupervised word embeddings "
                "capture latent knowledge from materials science literature}, volume={571}, "
                "DOI={10.1038/s41586-019-1335-8}, journal={Nature}, pages={95--98}, "
                "author={Tshitoyan, V. and Dagdelen, J. and Weston, L. and others}, year={2019}}"
            ]
        else:
            citation = []
        return citation

    def implementors(self):
        """Return the names of the people who implemented this featurizer."""
        return ["Jiming Chen", "Logan Ward", "Anubhav Jain", "Alex Dunn"]
Metadata
Metadata
Assignees
Labels
No labels