From 7674e374cbd639d284512e0bb5aa0618a852bdf9 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Fri, 21 Apr 2023 01:55:25 +0100 Subject: [PATCH 01/10] Drop unexpected columns when writing PDB dataframes #124 --- biopandas/pdb/pandas_pdb.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index 9e713c7..c25773d 100644 --- a/biopandas/pdb/pandas_pdb.py +++ b/biopandas/pdb/pandas_pdb.py @@ -610,6 +610,9 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): dfs = {r: self.df[r].copy() for r in records if not self.df[r].empty} + # Drop unexpected columns + dfs = {k: v[[pdb_df_columns]] for k, v in dfs.items() if k in {"ATOM", "HETATM"}} + for r in dfs: for col in pdb_records[r]: dfs[r][col["id"]] = dfs[r][col["id"]].apply(col["strf"]) From 0757e3955206425169ca44600effdee4b4d0e5d2 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Mon, 24 Jul 2023 14:48:43 -1000 Subject: [PATCH 02/10] change list to set --- biopandas/pdb/pandas_pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index c25773d..7fe1bb7 100644 --- a/biopandas/pdb/pandas_pdb.py +++ b/biopandas/pdb/pandas_pdb.py @@ -611,7 +611,7 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): dfs = {r: self.df[r].copy() for r in records if not self.df[r].empty} # Drop unexpected columns - dfs = {k: v[[pdb_df_columns]] for k, v in dfs.items() if k in {"ATOM", "HETATM"}} + dfs = {k: v[list(pdb_df_columns)] for k, v in dfs.items() if k in {"ATOM", "HETATM"}} for r in dfs: for col in pdb_records[r]: From 9d0de4f73ab28602f825732fe72fae50ac2ccffe Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Mon, 31 Jul 2023 13:58:35 -0400 Subject: [PATCH 03/10] make column subsetting more robust --- biopandas/pdb/pandas_pdb.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index
7fe1bb7..5fd4ffe 100644 --- a/biopandas/pdb/pandas_pdb.py +++ b/biopandas/pdb/pandas_pdb.py @@ -611,7 +611,10 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): dfs = {r: self.df[r].copy() for r in records if not self.df[r].empty} # Drop unexpected columns - dfs = {k: v[list(pdb_df_columns)] for k, v in dfs.items() if k in {"ATOM", "HETATM"}} + for k, v in dfs.items(): + if k in {"ATOM", "HETATM"}}: + overlap_columns = set(pdb_df_columns).intersection(set(df.columns)) + dfs[k] = v[list(overlap_columns)] for r in dfs: for col in pdb_records[r]: From 0c2cc06cdf19e1a03b37c92a911354a4d0b903de Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Mon, 31 Jul 2023 14:05:34 -0400 Subject: [PATCH 04/10] Fix syntax error --- biopandas/pdb/pandas_pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index 5fd4ffe..b9b535c 100644 --- a/biopandas/pdb/pandas_pdb.py +++ b/biopandas/pdb/pandas_pdb.py @@ -612,7 +612,7 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): # Drop unexpected columns for k, v in dfs.items(): - if k in {"ATOM", "HETATM"}}: + if k in {"ATOM", "HETATM"}: overlap_columns = set(pdb_df_columns).intersection(set(df.columns)) dfs[k] = v[list(overlap_columns)] From d32838b4921cfd1409e593d4b3757bdbf3fdb29c Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Mon, 31 Jul 2023 14:15:10 -0400 Subject: [PATCH 05/10] Fix syntax error --- biopandas/pdb/pandas_pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index b9b535c..8998fd6 100644 --- a/biopandas/pdb/pandas_pdb.py +++ b/biopandas/pdb/pandas_pdb.py @@ -613,7 +613,7 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): # Drop unexpected columns for k, v in dfs.items(): if k in {"ATOM", "HETATM"}: - overlap_columns = set(pdb_df_columns).intersection(set(df.columns)) + overlap_columns = 
set(pdb_df_columns).intersection(set(v.columns)) dfs[k] = v[list(overlap_columns)] for r in dfs: From 564cf0831936f5d91d8dee8a2f0954f174981926 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Mon, 31 Jul 2023 14:30:11 -0400 Subject: [PATCH 06/10] sort by atom number --- biopandas/pdb/pandas_pdb.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index 8998fd6..74e9186 100644 --- a/biopandas/pdb/pandas_pdb.py +++ b/biopandas/pdb/pandas_pdb.py @@ -638,6 +638,11 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): else: dfs[r]["OUT"] = dfs[r]["OUT"] + dfs[r][c] + if "line_idx" in dfs.columns: + sort_column = "line_idx" + else: + sort_column = "atom_number" + if pd_version < LooseVersion("0.17.0"): warn( "You are using an old pandas version (< 0.17)" @@ -646,7 +651,7 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): " installation to a more recent version.", DeprecationWarning, ) - dfs.sort(columns="line_idx", inplace=True) + dfs.sort(columns=sort_column, inplace=True) elif pd_version < LooseVersion("0.23.0"): df = pd.concat(dfs) @@ -654,7 +659,7 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): else: df = pd.concat(dfs, sort=False) - df.sort_values(by="line_idx", inplace=True) + df.sort_values(by=sort_column, inplace=True) with openf(path, w_mode) as f: From 5252d3c310e85874a658ee05ad67d1ed22fdbd3d Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 1 Aug 2023 08:09:42 -0400 Subject: [PATCH 07/10] syntax error, drop legacy pandas support --- biopandas/pdb/pandas_pdb.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index 74e9186..69e2fce 100644 --- a/biopandas/pdb/pandas_pdb.py +++ b/biopandas/pdb/pandas_pdb.py @@ -638,27 +638,17 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): else: 
dfs[r]["OUT"] = dfs[r]["OUT"] + dfs[r][c] - if "line_idx" in dfs.columns: - sort_column = "line_idx" - else: - sort_column = "atom_number" - - if pd_version < LooseVersion("0.17.0"): - warn( - "You are using an old pandas version (< 0.17)" - " that relies on the old sorting syntax." - " Please consider updating your pandas" - " installation to a more recent version.", - DeprecationWarning, - ) - dfs.sort(columns=sort_column, inplace=True) - - elif pd_version < LooseVersion("0.23.0"): + + if pd_version < LooseVersion("0.23.0"): df = pd.concat(dfs) - else: df = pd.concat(dfs, sort=False) + if "line_idx" in dfs.columns: + sort_column = "line_idx" + else: + sort_column = "atom_number" + df.sort_values(by=sort_column, inplace=True) with openf(path, w_mode) as f: From f2d0929bfa4c1884a87a8e9d0618f2ed19d7eb4a Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 1 Aug 2023 08:16:30 -0400 Subject: [PATCH 08/10] fix: typo --- biopandas/pdb/pandas_pdb.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index 69e2fce..175676f 100644 --- a/biopandas/pdb/pandas_pdb.py +++ b/biopandas/pdb/pandas_pdb.py @@ -638,13 +638,12 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): else: dfs[r]["OUT"] = dfs[r]["OUT"] + dfs[r][c] - if pd_version < LooseVersion("0.23.0"): df = pd.concat(dfs) else: df = pd.concat(dfs, sort=False) - if "line_idx" in dfs.columns: + if "line_idx" in df.columns: sort_column = "line_idx" else: sort_column = "atom_number" From affb5b6b353078931dda31f8f8e9d82781f59b73 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 1 Aug 2023 13:44:03 -0400 Subject: [PATCH 09/10] simplify writing dfs with added columns --- biopandas/pdb/pandas_pdb.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/biopandas/pdb/pandas_pdb.py b/biopandas/pdb/pandas_pdb.py index 175676f..d4129ef 100644 --- a/biopandas/pdb/pandas_pdb.py +++ 
b/biopandas/pdb/pandas_pdb.py @@ -610,12 +610,6 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): dfs = {r: self.df[r].copy() for r in records if not self.df[r].empty} - # Drop unexpected columns - for k, v in dfs.items(): - if k in {"ATOM", "HETATM"}: - overlap_columns = set(pdb_df_columns).intersection(set(v.columns)) - dfs[k] = v[list(overlap_columns)] - for r in dfs: for col in pdb_records[r]: dfs[r][col["id"]] = dfs[r][col["id"]].apply(col["strf"]) @@ -628,7 +622,7 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): for idx in range(dfs[r][c].values.shape[0]): if len(dfs[r][c].values[idx]) > 8: dfs[r][c].values[idx] = str(dfs[r][c].values[idx]).strip() - if c in {"line_idx", "OUT"}: + if c in {"line_idx", "OUT", "model_id"}: pass elif r in {"ATOM", "HETATM"} and c not in pdb_df_columns: warn( @@ -643,12 +637,7 @@ def to_pdb(self, path, records=None, gz=False, append_newline=True): else: df = pd.concat(dfs, sort=False) - if "line_idx" in df.columns: - sort_column = "line_idx" - else: - sort_column = "atom_number" - - df.sort_values(by=sort_column, inplace=True) + df.sort_values(by="line_idx", inplace=True) with openf(path, w_mode) as f: From 86cf6dcd050b8fdf1cafc1173d8b791e7a11af18 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 1 Aug 2023 13:52:35 -0400 Subject: [PATCH 10/10] add test --- biopandas/pdb/tests/test_write_pdb.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/biopandas/pdb/tests/test_write_pdb.py b/biopandas/pdb/tests/test_write_pdb.py index a726336..f8b258f 100644 --- a/biopandas/pdb/tests/test_write_pdb.py +++ b/biopandas/pdb/tests/test_write_pdb.py @@ -71,3 +71,14 @@ def test_anisou(): f1 = f.read() os.remove(OUTFILE) assert f1 == four_eiy + +def test_write_with_model_id(): + """Test writing a dataframe with a model ID column added.""" + ppdb = PandasPdb() + ppdb.read_pdb(TESTDATA_FILENAME) + df.label_models() + ppdb.to_pdb(path=OUTFILE, records=None) + with open(OUTFILE, "r") as 
 f: + f1 = f.read() + os.remove(OUTFILE) + assert f1 == f2 \ No newline at end of file