1
1
from argparse import ArgumentParser , ArgumentTypeError
2
+ import ast
2
3
from base64 import b64encode , b64decode
3
4
import copy
4
5
from functools import wraps
10
11
import re
11
12
import subprocess
12
13
import sys
14
+ import textwrap
13
15
import tokenize
14
16
from typing import (
15
17
Any ,
20
22
List ,
21
23
Literal ,
22
24
Optional ,
25
+ Set ,
23
26
Tuple ,
24
27
Union ,
25
28
)
26
29
from typing_inspect import get_args as typing_inspect_get_args , get_origin as typing_inspect_get_origin
30
+ import warnings
27
31
28
32
if sys .version_info >= (3 , 10 ):
29
33
from types import UnionType
@@ -184,7 +188,6 @@ def tokenize_source(obj: object) -> Generator:
184
188
"""Returns a generator for the tokens of the object's source code."""
185
189
source = inspect .getsource (obj )
186
190
token_generator = tokenize .generate_tokens (StringIO (source ).readline )
187
-
188
191
return token_generator
189
192
190
193
@@ -204,21 +207,65 @@ def source_line_to_tokens(obj: object) -> Dict[int, List[Dict[str, Union[str, in
204
207
"""Gets a dictionary mapping from line number to a dictionary of tokens on that line for an object's source code."""
205
208
line_to_tokens = {}
206
209
for token_type , token , (start_line , start_column ), (end_line , end_column ), line in tokenize_source (obj ):
207
- line_to_tokens .setdefault (start_line , []).append (
208
- {
209
- "token_type" : token_type ,
210
- "token" : token ,
211
- "start_line" : start_line ,
212
- "start_column" : start_column ,
213
- "end_line" : end_line ,
214
- "end_column" : end_column ,
215
- "line" : line ,
216
- }
217
- )
210
+ line_to_tokens .setdefault (start_line , []).append ({
211
+ 'token_type' : token_type ,
212
+ 'token' : token ,
213
+ 'start_line' : start_line ,
214
+ 'start_column' : start_column ,
215
+ 'end_line' : end_line ,
216
+ 'end_column' : end_column ,
217
+ 'line' : line
218
+ })
218
219
219
220
return line_to_tokens
220
221
221
222
223
def get_subsequent_assign_lines(cls: type) -> Set[int]:
    """For all multiline assign statements, get the line numbers after the first line of the assignment.

    Only assignments that are direct children of the class body are considered.
    The returned line numbers are 1-indexed relative to the text returned by
    ``inspect.getsource(cls)``, so the class definition line itself is line 1.

    :param cls: The class whose source code is inspected.
    :return: The set of line numbers occupied by the second and later lines of multiline
             assign statements. An empty set is returned (and a warning is issued) if the
             source code cannot be parsed as a single class definition.
    """
    # Set up warning message (defined first so that every failure path can use it)
    parse_warning = (
        "Could not parse class source code to extract comments. "
        "Comments in the help string may be incorrect."
    )

    # Get source code of class
    source = inspect.getsource(cls)

    # Parse source code using ast (with an if statement to avoid indentation errors).
    # The wrapper adds exactly one line, so every ast line number is one larger
    # than the corresponding line number in the class source.
    source = f"if True:\n{textwrap.indent(source, ' ')}"

    # A parsing failure only degrades the help text, so warn instead of crashing
    # (IndentationError is a subclass of SyntaxError)
    try:
        module_body = ast.parse(source).body
    except SyntaxError:
        warnings.warn(parse_warning)
        return set()

    # Check for the expected shape: an if statement containing a single class definition
    if_node = module_body[0] if module_body else None
    if (
        not isinstance(if_node, ast.If)
        or len(if_node.body) != 1
        or not isinstance(if_node.body[0], ast.ClassDef)
    ):
        warnings.warn(parse_warning)
        return set()

    # Extract class definition
    cls_node = if_node.body[0]

    # Get line numbers of assign statements
    assign_lines: Set[int] = set()
    for node in cls_node.body:
        if not isinstance(node, (ast.Assign, ast.AnnAssign)):
            continue

        # Skip nodes without end position information
        end_lineno = getattr(node, "end_lineno", None)
        if end_lineno is None:
            continue

        # Get line numbers of the assign statement excluding the first line.
        # Shifting by minus 1 for the if statement turns (lineno + 1, end_lineno + 1)
        # into (lineno, end_lineno); the range is empty for single line statements.
        assign_lines.update(range(node.lineno, end_lineno))

    return assign_lines
267
+
268
+
222
269
def get_class_variables (cls : type ) -> Dict [str , Dict [str , str ]]:
223
270
"""Returns a dictionary mapping class variables to their additional information (currently just comments)."""
224
271
# Get mapping from line number to tokens
@@ -227,12 +274,19 @@ def get_class_variables(cls: type) -> Dict[str, Dict[str, str]]:
227
274
# Get class variable column number
228
275
class_variable_column = get_class_column (cls )
229
276
277
+ # For all multiline assign statements, get the line numbers after the first line of the assignment
278
+ # This is used to avoid identifying comments in multiline assign statements
279
+ subsequent_assign_lines = get_subsequent_assign_lines (cls )
280
+
230
281
# Extract class variables
231
282
class_variable = None
232
283
variable_to_comment = {}
233
- for tokens in line_to_tokens .values ():
234
- for i , token in enumerate (tokens ):
284
+ for line , tokens in line_to_tokens .items ():
285
+ # Skip assign lines after the first line of multiline assign statements
286
+ if line in subsequent_assign_lines :
287
+ continue
235
288
289
+ for i , token in enumerate (tokens ):
236
290
# Skip whitespace
237
291
if token ["token" ].strip () == "" :
238
292
continue
@@ -244,8 +298,21 @@ def get_class_variables(cls: type) -> Dict[str, Dict[str, str]]:
244
298
and token ["token" ][:1 ] in {'"' , "'" }
245
299
):
246
300
sep = " " if variable_to_comment [class_variable ]["comment" ] else ""
301
+
302
+ # Identify the quote character (single or double)
247
303
quote_char = token ["token" ][:1 ]
248
- variable_to_comment [class_variable ]["comment" ] += sep + token ["token" ].strip (quote_char ).strip ()
304
+
305
+ # Identify the number of quote characters at the start of the string
306
+ num_quote_chars = len (token ["token" ]) - len (token ["token" ].lstrip (quote_char ))
307
+
308
+ # Remove the number of quote characters at the start of the string and the end of the string
309
+ token ["token" ] = token ["token" ][num_quote_chars :- num_quote_chars ]
310
+
311
+ # Remove the unicode escape sequences (e.g. "\"")
312
+ token ["token" ] = bytes (token ["token" ], encoding = 'ascii' ).decode ('unicode-escape' )
313
+
314
+ # Add the token to the comment, stripping whitespace
315
+ variable_to_comment [class_variable ]["comment" ] += sep + token ["token" ].strip ()
249
316
250
317
# Match class variable
251
318
class_variable = None
0 commit comments