10
10
import umap
11
11
import logging
12
12
from flowsom import flowsom as flowsom
13
+ import seaborn as sb
13
14
import tempfile
15
+ import matplotlib
16
+ matplotlib .use ('Agg' )
14
17
import matplotlib .pyplot as plt
15
18
tmp = tempfile .NamedTemporaryFile ()
16
19
sc .settings .autoshow = False
17
20
sc .settings .set_figure_params (dpi = 300 , facecolor = 'white' ,
18
21
figsize = (10 , 10 ))
19
22
sc .settings .verbosity = 0
20
23
warnings .filterwarnings ("ignore" , category = FutureWarning )
21
- import matplotlib
22
- matplotlib .use ('Agg' )
23
24
24
25
25
26
class Cytophenograph :
@@ -46,9 +47,10 @@ def __init__(self, info_file, input_folder, output_folder, k_coef, marker_list,
46
47
self .marker_array = None
47
48
self .anndata_list = []
48
49
self .outfig = None
49
- self .fileformat = "pdf"
50
+ self .fileformat = "pdf" # insert svg to change figure format
50
51
self .log = logging .getLogger ()
51
52
self .log .setLevel (logging .INFO )
53
+ self .dpi = 100
52
54
format = logging .Formatter ("%(asctime)s %(threadName)-11s %(levelname)-10s %(message)s" )
53
55
#
54
56
ch = logging .StreamHandler (sys .stdout )
@@ -129,40 +131,36 @@ def concatenate_dataframe(self,info_file, csv_list):
129
131
for df in pandas_df_list ]):
130
132
try :
131
133
for i in range (len (pandas_df_list )):
132
- # print(pandas_df_list[i].index[0][:-2]) sample id derivated
133
134
# save column with Sample name in list
134
135
Sample_list = info_file ["Sample" ].tolist ()
135
136
# check if Sample name are in the anndata index
136
137
if pandas_df_list [i ].index [0 ][:- 2 ] in Sample_list :
137
138
ann_tmp = anndata .AnnData (pandas_df_list [i ])
138
139
ann_tmp .obs ['Sample' ] = pandas_df_list [i ].index [0 ][:- 2 ]
139
140
#
140
- cell_type = info_file ['Cell_type' ].loc [info_file ['Sample' ]== pandas_df_list [i ].index [0 ][:- 2 ]]
141
- # ann_tmp.obs['Cell_type'] = cell_type.to_string().split(" ")[-1]
142
- ann_tmp .obs ['Cell_type' ] = '' .join (e for e in cell_type .to_string () if e .isalnum ())
141
+ cell_type = info_file ['Cell_type' ].loc [info_file ['Sample' ] == pandas_df_list [i ].index [0 ][:- 2 ]]
142
+ ann_tmp .obs ['Cell_type' ] = '' .join (e for e in cell_type .to_string ().split (" " )[- 1 ] if e .isalnum ())
143
143
#
144
- exp = info_file ['EXP' ].loc [info_file ['Sample' ]== pandas_df_list [i ].index [0 ][:- 2 ]]
145
- # ann_tmp.obs['EXP'] = exp.to_string().split(" ")[-1]
146
- ann_tmp .obs ['EXP' ] = '' .join (e for e in exp .to_string () if e .isalnum ())
144
+ exp = info_file ['EXP' ].loc [info_file ['Sample' ] == pandas_df_list [i ].index [0 ][:- 2 ]]
145
+ ann_tmp .obs ['EXP' ] = '' .join (e for e in exp .to_string ().split (" " )[- 1 ] if e .isalnum ())
147
146
#
148
- id = info_file ['ID' ].loc [info_file ['Sample' ]== pandas_df_list [i ].index [0 ][:- 2 ]]
149
- # ann_tmp.obs['ID'] = id.to_string().split(" ")[-1]
150
- ann_tmp .obs ['ID' ] = '' .join (e for e in id .to_string () if e .isalnum ())
147
+ id = info_file ['ID' ].loc [info_file ['Sample' ] == pandas_df_list [i ].index [0 ][:- 2 ]]
148
+ ann_tmp .obs ['ID' ] = '' .join (e for e in id .to_string ().split (" " )[- 1 ] if e .isalnum ())
151
149
#
152
150
time_point = info_file ['Time_point' ].loc [info_file ['Sample' ] == pandas_df_list [i ].index [0 ][:- 2 ]]
153
- ann_tmp .obs ['Time_point' ] = time_point .to_string ().split (" " )[- 1 ]
154
- ann_tmp .obs ['Time_point' ] = '' .join (e for e in time_point .to_string () if e .isalnum ())
151
+ # ann_tmp.obs['Time_point'] = time_point.to_string().split(" ")[-1]
152
+ ann_tmp .obs ['Time_point' ] = '' .join (e for e in time_point .to_string (). split ( " " )[ - 1 ] if e .isalnum ())
155
153
#
154
+
156
155
condition = info_file ['Condition' ].loc [info_file ['Sample' ] == pandas_df_list [i ].index [0 ][:- 2 ]]
157
- # ann_tmp.obs['Condition'] = condition.to_string().split(" ")[-1]
158
- ann_tmp .obs ['Condition' ] = '' .join (e for e in condition .to_string () if e .isalnum ())
156
+ ann_tmp .obs ['Condition' ] = '' .join (e for e in condition .to_string ().split (" " )[- 1 ] if e .isalnum ())
159
157
#
160
158
count = info_file ['Count' ].loc [info_file ['Sample' ] == pandas_df_list [i ].index [0 ][:- 2 ]]
161
- # ann_tmp.obs['Count'] = count.to_string().split(" ")[-1]
162
- ann_tmp .obs ['Count' ] = '' .join (e for e in count .to_string () if e .isalnum ())
159
+ ann_tmp .obs ['Count' ] = '' .join (e for e in count .to_string ().split (" " )[- 1 ] if e .isalnum ())
163
160
self .anndata_list .append (ann_tmp )
164
161
else :
165
- self .log .error ("Error, this file {0} is not in the column Sample of Infofile. \n Please check sample name and Infofile" .format (pandas_df_list [i ].index [0 ][:- 2 ]))
162
+ self .log .error ("Error, this file {0} is not in the column Sample of Infofile. "
163
+ "\n Please check sample name and Infofile" .format (pandas_df_list [i ].index [0 ][:- 2 ]))
166
164
sys .exit (1 )
167
165
tmp = self .anndata_list [0 ]
168
166
self .anndata_list .pop (0 )
@@ -172,11 +170,12 @@ def concatenate_dataframe(self,info_file, csv_list):
172
170
else :
173
171
self .adata = tmp .concatenate (self .anndata_list )
174
172
self .adata .layers ['raw_value' ] = self .adata .X
175
- except ( ValueError , Exception ) :
173
+ except Exception as e :
176
174
self .log .error ("Error. Please check Info File Header or CSV header." )
175
+ self .log .error ("Exception - {0}\n " .format (str (e )))
177
176
sys .exit (1 )
178
177
else :
179
- self .log .error ("Error. Please check Info File Header or CSV header." )
178
+ self .log .error ("Error. Please check Info File Header or CSV header." , exc_info = True )
180
179
sys .exit (1 )
181
180
self .tmp_df = pd .DataFrame (self .adata .X , index = self .adata .obs .index )
182
181
self .tmp_df .columns = self .adata .var_names
@@ -250,10 +249,6 @@ def plot_umap(self):
250
249
palette = self .palette , legend_fontoutline = 2 , show = False , add_outline = False , frameon = False ,
251
250
legend_loc = 'on data' , title = "UMAP Plot" ,return_fig = False ,
252
251
s = 50 , save = "_legend_on_data." .join (["" .join ([str (self .tool ), "_cluster" ]), self .fileformat ]))
253
- sc .pl .umap (self .adata_subset , color = "pheno_leiden" ,
254
- palette = self .palette , legend_fontoutline = 2 , show = False , add_outline = False , frameon = False ,
255
- legend_loc = 'on data' , title = "UMAP Plot" ,return_fig = False ,
256
- s = 50 , save = "_legend_on_data." .join (["" .join ([str (self .tool ), "_cluster" ]), 'svg' ]))
257
252
sc .pl .correlation_matrix (self .adata_subset , "pheno_leiden" , show = False ,
258
253
save = "." .join ([self .tool , self .fileformat ]))
259
254
for _ in list (self .adata_subset .var_names .unique ()):
@@ -272,17 +267,29 @@ def matrixplot(self):
272
267
dendrogram = True , vmin = - 2 , vmax = 2 , cmap = 'RdBu_r' , layer = "scaled" ,
273
268
show = False , swap_axes = False , return_fig = False ,
274
269
save = "." .join (["matrixplot_mean_z_score" , self .fileformat ]))
275
- sc .pl .matrixplot (self .adata_subset , list (self .adata_subset .var_names ), "pheno_leiden" ,
276
- dendrogram = True , vmin = - 2 , vmax = 2 , cmap = 'RdBu_r' , layer = "scaled" ,
277
- show = False , swap_axes = False , return_fig = False ,
278
- save = "." .join (["matrixplot_mean_z_score" , 'svg' ]))
279
270
sc .pl .matrixplot (self .adata_subset , list (self .adata_subset .var_names ), "pheno_leiden" ,
280
271
dendrogram = True , cmap = 'Blues' , standard_scale = 'var' ,
281
272
colorbar_title = 'column scaled\n expression' , layer = "scaled" ,
282
273
swap_axes = False , return_fig = False ,
283
274
show = False ,
284
275
save = "." .join (["matrixplot_column_scaled_expression" , self .fileformat ]))
285
276
277
def plotdist(self):
    """
    Save one histogram per marker of ``self.adata`` in a single figure.

    The figure is written to
    ``<self.outfig>/MarkerHistograms.<self.fileformat>`` at ``self.dpi``.

    Returns:
        None
    """
    # DataFrame.hist() hands back an array of Axes (one per column) rather
    # than a single Axes, so the owning Figure is taken from the first one.
    axes = self.adata.to_df().hist(bins=25, figsize=(20, 15))
    first_ax = axes.flat[0] if hasattr(axes, "flat") else axes
    fig = first_ax.get_figure()
    # 'transparent' was previously misspelled; facecolor is still passed
    # explicitly so the figure background stays white.
    fig.savefig("/".join([self.outfig, ".".join(["MarkerHistograms", self.fileformat])]),
                dpi=self.dpi, bbox_inches='tight', facecolor='white', transparent=True,
                format=self.fileformat)
    # NOTE: a seaborn pair plot of the markers ("MarkerPairPlot") was drafted
    # here but is currently disabled.
+
286
293
def plot_frequency (self ):
287
294
"""
288
295
@@ -296,16 +303,9 @@ def plot_frequency(self):
296
303
ax1 .set_ylabel ("Cluster" )
297
304
ax1 .grid (False )
298
305
ax1 .legend (bbox_to_anchor = (1.2 , 1.0 ))
299
- if self .fileformat == "pdf" :
300
- fig .savefig ("/" .join ([self .outfig , "ClusterFrequencyNormalized.pdf" ]),
301
- dpi = 100 , bbox_inches = 'tight' ,
302
- format = self .fileformat )
303
- fig .savefig ("/" .join ([self .outfig , "ClusterFrequencyNormalized.svg" ]),
304
- dpi = 100 , bbox_inches = 'tight' ,
305
- format = 'svg' )
306
- else :
307
- fig .savefig ("/" .join ([self .outfig , "ClusterFrequencyNormalized.svg" ]),
308
- dpi = fig .dpi , bbox_inches = 'tight' ,format = self .fileformat )
306
+ fig .savefig ("/" .join ([self .outfig , "." .join (["ClusterFrequencyNormalized" , self .fileformat ])]),
307
+ dpi = self .dpi , bbox_inches = 'tight' ,
308
+ format = self .fileformat )
309
309
fig , (ax2 ) = plt .subplots (1 , 1 , figsize = (17 / 2.54 , 17 / 2.54 ))
310
310
ax2 = self .adata_subset .obs .groupby ("pheno_leiden" )["Sample" ].value_counts (normalize = False ).unstack ().plot .barh (stacked = True ,
311
311
legend = False ,
@@ -315,13 +315,8 @@ def plot_frequency(self):
315
315
ax2 .set_ylabel ("Cluster" )
316
316
ax2 .grid (False )
317
317
ax2 .legend (bbox_to_anchor = (1.2 , 1.0 ))
318
- if self .fileformat == "pdf" :
319
- fig .savefig ("/" .join ([self .outfig , "ClusterFrequencyNotNormalized.pdf" ]),
320
- dpi = fig .dpi , bbox_inches = 'tight' ,
321
- format = self .fileformat )
322
- else :
323
- fig .savefig ("/" .join ([self .outfig , "ClusterFrequencyNotNormalized.svg" ]),
324
- dpi = fig .dpi , bbox_inches = 'tight' ,
318
+ fig .savefig ("/" .join ([self .outfig , "." .join (["ClusterFrequencyNotNormalized" , self .fileformat ])]),
319
+ dpi = self .dpi , bbox_inches = 'tight' ,
325
320
format = self .fileformat )
326
321
327
322
def runphenograph (self ):
@@ -331,7 +326,8 @@ def runphenograph(self):
331
326
"""
332
327
self .log .info ("Part2: Phenograph Clustering" )
333
328
self .log .info ("Markers used for Phenograph clustering:" )
334
- self .adata_subset = self .adata [:, self .markertoinclude ].copy ()
329
+ self .adata_subset = self .adata [:,
330
+ self .markertoinclude ].copy ()
335
331
self .log .info (self .adata_subset .var_names )
336
332
self .log .info ("Markers excluded for Phenograph clustering:" )
337
333
self .log .info (self .marker_array )
@@ -351,10 +347,11 @@ def runphenograph(self):
351
347
self .embedding = self .runumap ()
352
348
self .adata .obsm ['X_umap' ] = self .embedding
353
349
self .adata_subset .obsm ['X_umap' ] = self .embedding
354
- self .tmp_df = pd .DataFrame (self .adata .X , columns = self .adata .var_names , index = self . adata . obs . index )
350
+ self .tmp_df = pd .DataFrame (self .adata .X , columns = self .adata .var_names )
355
351
self .tmp_df ['UMAP_1' ] = self .embedding [:, 0 ]
356
352
self .tmp_df ['UMAP_2' ] = self .embedding [:, 1 ]
357
- self .tmp_df ['Cluster_Phenograph' ] = pd .DataFrame (self .adata .obs ['Phenograph_cluster' ])
353
+ self .tmp_df ['Cluster_Phenograph' ] = self .adata_subset .obs ['pheno_leiden' ]
354
+ # self.plotdist()
358
355
self .plot_umap ()
359
356
self .plot_frequency ()
360
357
self .matrixplot ()
@@ -389,10 +386,11 @@ def runparc(self):
389
386
self .embedding = self .runumap ()
390
387
self .adata .obsm ['X_umap' ] = self .embedding
391
388
self .adata_subset .obsm ['X_umap' ] = self .embedding
392
- self .tmp_df = pd .DataFrame (self .adata .X , columns = self .adata .var_names , index = self . adata . obs . index )
389
+ self .tmp_df = pd .DataFrame (self .adata .X , columns = self .adata .var_names )
393
390
self .tmp_df ['UMAP_1' ] = self .embedding [:, 0 ]
394
391
self .tmp_df ['UMAP_2' ] = self .embedding [:, 1 ]
395
- self .tmp_df ['Cluster_Parc' ] = pd .DataFrame (self .adata .obs ['Parc_cluster' ])
392
+ self .tmp_df ['Cluster_Parc' ] = self .adata_subset .obs ['pheno_leiden' ]
393
+ # self.plotdist()
396
394
self .plot_umap ()
397
395
self .plot_frequency ()
398
396
self .matrixplot ()
@@ -447,10 +445,13 @@ def runflowsom(self):
447
445
self .embedding = self .runumap ()
448
446
self .adata .obsm ['X_umap' ] = self .embedding
449
447
self .adata_subset .obsm ['X_umap' ] = self .embedding
450
- self .tmp_df = pd .DataFrame (self .adata .X , columns = self .adata .var_names , index = self .adata .obs .index )
448
+ self .embedding = self .runumap ()
449
+ self .adata .obsm ['X_umap' ] = self .embedding
450
+ self .adata_subset .obsm ['X_umap' ] = self .embedding
451
+ self .tmp_df = pd .DataFrame (self .adata .X , columns = self .adata .var_names )
451
452
self .tmp_df ['UMAP_1' ] = self .embedding [:, 0 ]
452
453
self .tmp_df ['UMAP_2' ] = self .embedding [:, 1 ]
453
- self . tmp_df [ 'Cluster_Flowsom' ] = pd . DataFrame ( self .adata . obs [ 'Cluster_Flowsom' ] )
454
+ # self.plotdist( )
454
455
self .plot_umap ()
455
456
self .plot_frequency ()
456
457
self .matrixplot ()
0 commit comments