1 '''
2 Importing pandasTools enables several features that allow for using RDKit molecules as columns of a Pandas dataframe.
3 If the dataframe contains molecules in a text format in one of its columns (e.g. SMILES), as in this example:
4 >>> from rdkit.Chem import PandasTools
5 >>> import pandas as pd
6 >>> import os
7 >>> from rdkit import RDConfig
8 >>> antibiotics = pd.DataFrame(columns=['Name','Smiles'])
9 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C','Name':'Penicilline G'}, ignore_index=True)#Penicilline G
10 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O','Name':'Tetracycline'}, ignore_index=True)#Tetracycline
11 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)O)C','Name':'Ampicilline'}, ignore_index=True)#Ampicilline
12 >>> print([str(x) for x in antibiotics.columns])
13 ['Name', 'Smiles']
14 >>> print(antibiotics)
15 Name Smiles
16 0 Penicilline G CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
17 1 Tetracycline CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...
18 2 Ampicilline CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
19
20 a new column can be created holding the respective RDKit molecule objects. The fingerprint can be included to accelerate substructure searches on the dataframe.
21
22 >>> PandasTools.AddMoleculeColumnToFrame(antibiotics,'Smiles','Molecule',includeFingerprints=True)
23 >>> print([str(x) for x in antibiotics.columns])
24 ['Name', 'Smiles', 'Molecule']
25
26 A substructure filter can be applied on the dataframe using the RDKit molecule column, because the ">=" operator has been modified to work as a substructure check.
27 Thus, the antibiotics containing the beta-lactam ring "C1C(=O)NC1" can be obtained by
28
29 >>> beta_lactam = Chem.MolFromSmiles('C1C(=O)NC1')
30 >>> beta_lactam_antibiotics = antibiotics[antibiotics['Molecule'] >= beta_lactam]
31 >>> print(beta_lactam_antibiotics[['Name','Smiles']])
32 Name Smiles
33 0 Penicilline G CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
34 2 Ampicilline CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
35
36
37 It is also possible to load an SDF file into a dataframe.
38
39 >>> sdfFile = os.path.join(RDConfig.RDDataDir,'NCI/first_200.props.sdf')
40 >>> frame = PandasTools.LoadSDF(sdfFile,smilesName='SMILES',molColName='Molecule',includeFingerprints=True)
41 >>> frame.info # doctest: +SKIP
42 <bound method DataFrame.info of <class 'pandas.core.frame.DataFrame'>
43 Int64Index: 200 entries, 0 to 199
44 Data columns:
45 AMW 200 non-null values
46 CLOGP 200 non-null values
47 CP 200 non-null values
48 CR 200 non-null values
49 DAYLIGHT.FPG 200 non-null values
50 DAYLIGHT_CLOGP 200 non-null values
51 FP 200 non-null values
52 ID 200 non-null values
53 ISM 200 non-null values
54 LIPINSKI_VIOLATIONS 200 non-null values
55 NUM_HACCEPTORS 200 non-null values
56 NUM_HDONORS 200 non-null values
57 NUM_HETEROATOMS 200 non-null values
58 NUM_LIPINSKIHACCEPTORS 200 non-null values
59 NUM_LIPINSKIHDONORS 200 non-null values
60 NUM_RINGS 200 non-null values
61 NUM_ROTATABLEBONDS 200 non-null values
62 P1 30 non-null values
63 SMILES 200 non-null values
64 Molecule 200 non-null values
65 dtypes: object(20)>
66
67 In order to support rendering the molecules as images in the HTML export of the dataframe, the __str__ method is monkey-patched to return a base64 encoded PNG:
68 >>> molX = Chem.MolFromSmiles('Fc1cNc2ccccc12')
69 >>> print(molX) # doctest: +SKIP
70 <img src="data:image/png;base64,..." alt="Mol"/>
71 This can be reverted using the ChangeMoleculeRendering method
72 >>> ChangeMoleculeRendering(renderer='String')
73 >>> print(molX) # doctest: +SKIP
74 <rdkit.Chem.rdchem.Mol object at 0x10d179440>
75 >>> ChangeMoleculeRendering(renderer='PNG')
76 >>> print(molX) # doctest: +SKIP
77 <img src="data:image/png;base64,..." alt="Mol"/>
78 '''
79 from __future__ import print_function
80
81 from base64 import b64encode
82 import types,copy
83
84 from rdkit.six import BytesIO, string_types
85 from rdkit import Chem
86 from rdkit.Chem import Draw
87
# Optional pandas support. Any failure below prints a traceback (or a short
# message for outdated pandas versions) and leaves ``pd`` set to None, which
# the __main__ section of this module checks before running the doctests.
import sys  # needed up front: the version message below is written to sys.stderr

try:
    import pandas as pd
    try:
        v = pd.__version__.split('.')
    except AttributeError:
        # fall back to pd.version.version when __version__ is not available
        v = pd.version.version.split('.')

    if v[0] == '0' and int(v[1]) < 10:
        print("Pandas version %s not compatible with tests" % v, file=sys.stderr)
        pd = None
    else:
        # Raise pandas' display limits (presumably so that long cell contents
        # such as base64 encoded images are not truncated -- TODO confirm).
        # NOTE(review): this relies on the private registry
        # pd.core.config._registered_options; if a pandas release does not
        # provide it, the resulting error is caught below and pandas support
        # is switched off (pd = None).
        registered = pd.core.config._registered_options
        if 'display.width' in registered:
            pd.set_option('display.width', 1000000000)
        if 'display.max_rows' in registered:
            pd.set_option('display.max_rows', 1000000000)
        elif 'display.height' in registered:
            pd.set_option('display.height', 1000000000)
        if 'display.max_colwidth' in registered:
            pd.set_option('display.max_colwidth', 1000000000)

        # keep a reference to the unpatched renderer so it can be restored later
        defPandasRendering = pd.core.frame.DataFrame.to_html
except Exception:
    # Covers a missing pandas installation (ImportError) as well as any other
    # problem during setup: report it, but keep the module importable.
    import traceback
    traceback.print_exc()
    pd = None

# Module-wide rendering settings.
highlightSubstructures = True   # read by the ">=" substructure check to decide whether matched atoms are remembered
molRepresentation = 'png'
molSize = (200, 200)
125
126
128 '''
129 Patches the default escaping of HTML control characters to allow molecule images to be rendered in dataframes
130 '''
131 formatter = pd.core.format.DataFrameFormatter(self,buf=None,columns=None,col_space=None,colSpace=None,header=True,index=True,
132 na_rep='NaN',formatters=None,float_format=None,sparsify=None,index_names=True,
133 justify = None, force_unicode=None,bold_rows=True,classes=None,escape=False)
134 formatter.to_html()
135 html = formatter.buf.getvalue()
136 return html
137
139 '''Ensure inheritance of patched to_html in "head" subframe
140 '''
141 df = self[:n]
142 df.to_html = types.MethodType(patchPandasHTMLrepr,df)
143 df.head = types.MethodType(patchPandasHeadMethod,df)
144 return df
145
147 """displayhook function for PIL Images, rendered as PNG"""
148 import pandas as pd
149 bio = BytesIO()
150 x.save(bio,format='PNG')
151 s = b64encode(bio.getvalue()).decode('ascii')
152 pd.set_option('display.max_columns',len(s)+1000)
153 pd.set_option('display.max_rows',len(s)+1000)
154 if len(s)+100 > pd.get_option("display.max_colwidth"):
155 pd.set_option("display.max_colwidth",len(s)+1000)
156 return s
157
173
174 from rdkit import DataStructs
175
# Select the fingerprint function used to pre-screen substructure searches.
# Both variants share the call signature _fingerprinter(mol, isQuery).
try:
    from rdkit.Avalon import pyAvalonTools as pyAvalonTools
except ImportError:
    def _fingerprinter(x, y):
        """Return the RDKit pattern fingerprint (2048 bits) of molecule x.

        The second argument is accepted for signature compatibility with the
        Avalon based variant and is ignored here.
        """
        return Chem.PatternFingerprint(x, fpSize=2048)
else:
    def _fingerprinter(x, y):
        """Return the Avalon fingerprint of molecule x using the avalonSSSBits flags.

        y is forwarded as the isQuery argument of GetAvalonFP.
        """
        return pyAvalonTools.GetAvalonFP(x, isQuery=y, bitFlags=pyAvalonTools.avalonSSSBits)
181
183 """Allows for substructure check using the >= operator (X has substructure Y -> X >= Y) by
184 monkey-patching the __ge__ function
185 This has the effect that the pandas/numpy rowfilter can be used for substructure filtering (filtered = dframe[dframe['RDKitColumn'] >= SubstructureMolecule])
186 """
187 if x is None or y is None: return False
188 if hasattr(x,'_substructfp'):
189 if not hasattr(y,'_substructfp'):
190 y._substructfp=_fingerprinter(y,True)
191 if not DataStructs.AllProbeBitsMatch(y._substructfp,x._substructfp):
192 return False
193 match = x.GetSubstructMatch(y)
194 if match:
195 if highlightSubstructures:
196 x.__sssAtoms=list(match)
197 else:
198 x.__sssAtoms=[]
199 return True
200 else:
201 return False
202
203
204 Chem.Mol.__ge__ = _molge
205
217
218
221
222
223 Chem.Mol.__str__ = PrintAsBase64PNGString
224
226 '''Precomputes fingerprints and stores results in molecule objects to accelerate substructure matching
227 '''
228
229 if m is not None:
230 m._substructfp=_fingerprinter(m,False)
231 return m
232
234 '''Changes the default dataframe rendering to not escape HTML characters, thus allowing rendered images in all dataframes.
235 IMPORTANT: THIS IS A GLOBAL CHANGE THAT WILL AFFECT THE COMPLETE PYTHON SESSION. If you want to change the rendering only
236 for a single dataframe use the "ChangeMoleculeRendering" method instead.
237 '''
238 if images:
239 pd.core.frame.DataFrame.to_html = patchPandasHTMLrepr
240 else:
241 pd.core.frame.DataFrame.to_html = defPandasRendering
242
243
245 '''Converts the molecules contained in "smilesCol" to RDKit molecules and appends them to the dataframe "frame" using the specified column name.
246 If desired, a fingerprint can be computed and stored with the molecule objects to accelerate substructure matching
247 '''
248 if not includeFingerprints:
249 frame[molCol]=frame[smilesCol].map(Chem.MolFromSmiles)
250 else:
251 frame[molCol]=frame[smilesCol].map(lambda smiles: _MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
252 RenderImagesInAllDataFrames(images=True)
253
254
255
256
258 '''Allows to change the rendering of the molecules between base64 PNG images and string representations.
259 This serves two purposes: First it allows to avoid the generation of images if this is not desired and, secondly, it allows to enable image rendering for
260 a newly created dataframe that already contains molecules, without having to rerun the time-consuming AddMoleculeColumnToFrame. Note: this is necessary because some pandas methods, e.g. head(),
261 return a new dataframe instance that uses the default pandas rendering (thus not drawing images for molecules) instead of the monkey-patched one.
262 '''
263 if renderer == 'String':
264 Chem.Mol.__str__ = PrintDefaultMolRep
265 else:
266 Chem.Mol.__str__ = PrintAsBase64PNGString
267 if frame is not None:
268 frame.to_html = types.MethodType(patchPandasHTMLrepr,frame)
269
def LoadSDF(filename, idName='ID', molColName='ROMol', includeFingerprints=False,
            isomericSmiles=False, smilesName=None, embedProps=False):
    """Read a file in SDF format and return its contents as a pandas DataFrame.

    Arguments:
      - filename: path of the SD file (a name ending in ".gz" is opened with
        gzip), or an already opened file-like object accepted by
        Chem.ForwardSDMolSupplier. Objects passed in by the caller are left open.
      - idName: column that receives the molecule title (the "_Name" property)
        for molecules that have one.
      - molColName: column that receives the molecule objects.
      - includeFingerprints: if true, every molecule is passed through
        _MolPlusFingerprint before it is stored.
      - isomericSmiles: forwarded to Chem.MolToSmiles when smilesName is given.
      - smilesName: if not None, name of a column that is filled with SMILES
        generated from the molecules.
      - embedProps: if false (default), the SD properties are removed from the
        molecule objects after they have been copied into the row; if true they
        stay on the molecules as well.

    Records for which the supplier yields None are skipped. The index of the
    returned frame is the position of each kept record in the file, so gaps in
    the index mark skipped records.

    Side effect: calls RenderImagesInAllDataFrames(images=True).
    """
    if isinstance(filename, string_types):
        if filename.lower().endswith(".gz"):
            import gzip
            f = gzip.open(filename, "rb")
        else:
            f = open(filename, 'rb')
        close = f.close
    else:
        f = filename
        close = None  # the caller owns a file object that was passed in

    records = []
    indices = []
    try:
        for i, mol in enumerate(Chem.ForwardSDMolSupplier(f)):
            if mol is None:
                continue
            propNames = list(mol.GetPropNames())
            row = dict((k, mol.GetProp(k)) for k in propNames)
            if not embedProps:
                for prop in propNames:
                    mol.ClearProp(prop)
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                row[smilesName] = Chem.MolToSmiles(mol, isomericSmiles=isomericSmiles)
            if includeFingerprints:
                row[molColName] = _MolPlusFingerprint(mol)
            else:
                row[molColName] = mol
            records.append(row)
            indices.append(i)
    finally:
        # release a handle opened here even when reading fails half-way
        if close is not None:
            close()

    RenderImagesInAllDataFrames(images=True)
    return pd.DataFrame(records, index=indices)
304
305 from rdkit.Chem import SDWriter
306
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
    '''Write an SD file for the molecules in the dataframe.

    Arguments:
      - df: dataframe holding the molecules in column molColName.
      - out: target handed to SDWriter; a file name ending in ".gz" is opened
        with gzip first (and closed again at the end).
      - molColName: name of the column containing the molecules.
      - idName: column used as molecule title ("_Name"). The special value
        "RowID" uses the dataframe row key instead. None leaves the title as is.
      - properties: names of the columns to export as SD tags;
        "properties=list(df.columns)" would export all columns.
      - allNumeric: if true, all columns with a floating point or integer dtype
        are exported in addition. The user has to make sure that the correct
        data type is assigned to the columns.

    The molecule column and the idName column are never exported as SD tags,
    and every property name is exported only once.
    '''
    close = None
    if isinstance(out, string_types):
        if out.lower().endswith(".gz"):
            import gzip
            out = gzip.open(out, "wb")
            close = out.close

    try:
        candidates = [] if properties is None else list(properties)
        if allNumeric:
            # np.floating / np.integer match every float / integer width; the
            # builtin float / int only match float64 / the platform integer on
            # current numpy versions.
            candidates.extend(col for col in df.dtypes.keys()
                              if (np.issubdtype(df.dtypes[col], np.floating) or
                                  np.issubdtype(df.dtypes[col], np.integer)))

        # Drop the molecule/title columns and repeated names, keeping the order.
        exported = []
        for name in candidates:
            if name == molColName or name == idName or name in exported:
                continue
            exported.append(name)

        writer = SDWriter(out)
        try:
            writer.SetProps(exported)
            for rowKey, row in df.iterrows():
                # Build a separate molecule object so that the title and tags
                # set below are not attached to the object stored in the frame
                # (assumes Chem.Mol(mol) copies -- TODO confirm).
                mol = Chem.Mol(row[molColName])

                if idName is not None:
                    if idName == 'RowID':
                        mol.SetProp('_Name', str(rowKey))
                    else:
                        mol.SetProp('_Name', str(row[idName]))

                for p in exported:
                    cell_value = row[p]
                    if isinstance(cell_value, (float, np.floating)):
                        # fixed-point formatting avoids exponent notation;
                        # strip the zero padding but keep one digit after "."
                        s = '{:f}'.format(cell_value).rstrip("0")
                        if s[-1] == ".":
                            s += "0"
                        mol.SetProp(p, s)
                    else:
                        mol.SetProp(p, str(cell_value))
                writer.write(mol)
        finally:
            writer.close()
    finally:
        # only a gzip handle opened here is closed
        if close is not None:
            close()
355
356 _saltRemover = None
366
368 '''
369 Saves smi file. SMILES are generated from column with RDKit molecules. Column with names is optional.
370 '''
371 w = Chem.SmilesWriter(outFile, isomericSmiles=isomericSmiles)
372 if NamesCol != '':
373 for m,n in zip(frame[molCol], map(str,frame[NamesCol])):
374 m.SetProp('_Name',n)
375 w.write(m)
376 w.close()
377 else:
378 for m in frame[molCol]:
379 w.write(m)
380 w.close()
381
382 import numpy as np
383 import os
384 from rdkit.six.moves import cStringIO as StringIO
385
387 """
388 Saves pandas DataFrame as a xlsx file with embedded images.
389 It maps numpy data types to excel cell types:
390 int, float -> number
391 datetime -> datetime
392 object -> string (limited to 32k character - xlsx limitations)
393
394 Cells with compound images are a bit larger than images due to excel.
395 Column width weirdness explained (from xlsxwriter docs):
396 The width corresponds to the column width value that is specified in Excel.
397 It is approximately equal to the length of a string in the default font of Calibri 11.
398 Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
399 This feature is only available at runtime from within Excel.
400 """
401
402 import xlsxwriter
403
404 cols = list(frame.columns)
405 cols.remove(molCol)
406 dataTypes = dict(frame.dtypes)
407
408 workbook = xlsxwriter.Workbook(outFile)
409 worksheet = workbook.add_worksheet()
410 worksheet.set_column('A:A', size[0]/6.)
411
412
413 c2 = 1
414 for x in cols:
415 worksheet.write_string(0, c2, x)
416 c2 += 1
417
418 c = 1
419 for index, row in frame.iterrows():
420 image_data = StringIO()
421 img = Draw.MolToImage(row[molCol], size=size)
422 img.save(image_data, format='PNG')
423
424 worksheet.set_row(c, height=size[1])
425 worksheet.insert_image(c, 0, "f", {'image_data': image_data})
426
427 c2 = 1
428 for x in cols:
429 if str(dataTypes[x]) == "object":
430 worksheet.write_string(c, c2, str(row[x])[:32000])
431 elif ('float' in str(dataTypes[x])) or ('int' in str(dataTypes[x])):
432 if (row[x] != np.nan) or (row[x] != np.inf):
433 worksheet.write_number(c, c2, row[x])
434 elif 'datetime' in str(dataTypes[x]):
435 worksheet.write_datetime(c, c2, row[x])
436 c2 += 1
437 c += 1
438
439 workbook.close()
440 image_data.close()
441
442
444 '''
445 Draw grid image of mols in pandas DataFrame.
446 '''
447 if legendsCol:
448 if legendsCol == frame.index.name:
449 img = Draw.MolsToGridImage(frame[column], legends=list(map(str, list(frame.index))), **kwargs)
450 else:
451 img = Draw.MolsToGridImage(frame[column], legends=list(map(str, list(frame[legendsCol]))), **kwargs)
452 else:
453 img = Draw.MolsToGridImage(frame[column], **kwargs)
454 return img
455
456 from rdkit.Chem.Scaffolds import MurckoScaffold
457
458 -def AddMurckoToFrame(frame, molCol = 'ROMol', MurckoCol = 'Murcko_SMILES', Generic = False):
466
467
468 from rdkit.Chem import AllChem
469
478
480 '''
481 Aligns molecules in molCol to scaffolds in scaffoldCol
482 '''
483 frame[molCol] = frame.apply(lambda x: AlignMol(x[molCol],x[scaffoldCol]), axis=1)
484
485
# Run the module doctests when executed as a script. The tests are skipped
# (with a message on stderr) when pandas is unavailable or older than 0.10.
if __name__ == "__main__":
    import sys
    if pd is None:
        print("pandas installation not found, skipping tests", file=sys.stderr)
    else:
        try:
            v = pd.__version__.split('.')
        except AttributeError:
            # fall back to pd.version.version when __version__ is not available
            v = pd.version.version.split('.')

        if v[0] == '0' and int(v[1]) < 10:
            print("pandas installation >=0.10 not found, skipping tests",
                  file=sys.stderr)
        else:
            import doctest
            results = doctest.testmod(
                optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
            failed = results[0]
            if failed:
                # use the number of failed examples as exit status
                sys.exit(failed)
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538