yapf/yapflib/comment_splicer.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354

# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Comment splicer for lib2to3 trees.

The lib2to3 syntax tree produced by the parser holds comments and whitespace in
prefix attributes of nodes, rather than nodes themselves. This module provides
functionality to splice comments out of prefixes and into nodes of their own,
making them easier to process.

  SpliceComments(): the main function exported by this module.
"""

from lib2to3 import pygram
from lib2to3 import pytree
from lib2to3.pgen2 import token

from yapf.yapflib import pytree_utils


def SpliceComments(tree):
  """Given a pytree, splice comments into nodes of their own right.

  Extract comments from the prefixes where they are housed after parsing.
  The prefixes that previously housed the comments become empty.

  The tree is first annotated with child indentation (see _AnnotateIndents) and
  then walked recursively. For every leaf whose prefix, ignoring leading
  whitespace, starts with '#', the prefix is cleared and its comment text is
  turned into COMMENT leaves (wrapped in simple_stmt nodes when standalone)
  that are inserted next to a suitable node. Where they are inserted depends on
  the kind of leaf that carried the prefix: NEWLINE, DEDENT, or anything else.

  Args:
    tree: a pytree.Node - the tree to work on. The tree is modified by this
        function.
  """
  # The previous leaf node encountered in the traversal.
  # This is a list because Python 2.x doesn't have 'nonlocal' :)
  prev_leaf = [None]
  _AnnotateIndents(tree)

  def _VisitNodeRec(node):
    """Splice the comment prefixes of every leaf below node, depth first."""
    # This loop may insert into node.children, so we'll iterate over a copy.
    for child in node.children[:]:
      if isinstance(child, pytree.Node):
        # Nodes don't have prefixes.
        _VisitNodeRec(child)
      else:
        if child.prefix.lstrip().startswith('#'):
          # We have a comment prefix in this child, so splicing is needed.
          comment_prefix = child.prefix
          # The prefix ends on the child's own line, so back up by the number
          # of newlines it contains to get the line on which it starts.
          comment_lineno = child.lineno - comment_prefix.count('\n')
          comment_column = child.column

          # Remember the leading indentation of this prefix and clear it.
          # Mopping up the prefix is important because we may go over this same
          # child in the next iteration...
          # NOTE(review): child_prefix and prefix_indent are not read again
          # anywhere in this function; only the clearing of child.prefix below
          # has an effect.
          child_prefix = child.prefix.lstrip('\n')
          prefix_indent = child_prefix[:child_prefix.find('#')]
          if '\n' in prefix_indent:
            prefix_indent = prefix_indent[prefix_indent.rfind('\n') + 1:]
          child.prefix = ''

          if child.type == token.NEWLINE:
            # If the prefix was on a NEWLINE leaf, it's part of the line so it
            # will be inserted after the previously encountered leaf.
            # We can't just insert it before the NEWLINE node, because as a
            # result of the way pytrees are organized, this node can be under
            # an inappropriate parent.
            # Move the column back from the NEWLINE leaf by the length of the
            # prefix without its leading whitespace.
            comment_column -= len(comment_prefix.lstrip())
            pytree_utils.InsertNodesAfter(
                _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False), prev_leaf[0])
          elif child.type == token.DEDENT:
            # Comment prefixes on DEDENT nodes also deserve special treatment,
            # because their final placement depends on their prefix.
            # We'll look for an ancestor of this child with a matching
            # indentation, and insert the comment before it if the ancestor is
            # on a DEDENT node and after it otherwise.
            #
            # lib2to3 places comments that should be separated into the same
            # DEDENT node. For example, "comment 1" and "comment 2" will be
            # combined.
            #
            #   def _():
            #     for x in y:
            #       pass
            #       # comment 1
            #
            #     # comment 2
            #     pass
            #
            # In this case, we need to split them up ourselves.

            # Split into groups of comments at decreasing levels of indentation
            # Each group is a (column, indent string, list of lines) tuple. A
            # new group starts at the first comment line and whenever a comment
            # line begins at a smaller column than the current group.
            comment_groups = []
            comment_column = None
            for cmt in comment_prefix.split('\n'):
              col = cmt.find('#')
              if col < 0:
                if comment_column is None:
                  # Skip empty lines at the top of the first comment group
                  comment_lineno += 1
                  continue
              elif comment_column is None or col < comment_column:
                comment_column = col
                comment_indent = cmt[:comment_column]
                comment_groups.append((comment_column, comment_indent, []))
              # Lines without a '#' that follow a comment, and comment lines at
              # the same or a deeper column, stay with the current group.
              comment_groups[-1][-1].append(cmt)

            # Insert a node for each group
            for comment_column, comment_indent, comment_group in comment_groups:
              ancestor_at_indent = _FindAncestorAtIndent(child, comment_indent)
              if ancestor_at_indent.type == token.DEDENT:
                InsertNodes = pytree_utils.InsertNodesBefore  # pylint: disable=invalid-name
              else:
                InsertNodes = pytree_utils.InsertNodesAfter  # pylint: disable=invalid-name
              InsertNodes(
                  _CreateCommentsFromPrefix(
                      '\n'.join(comment_group) + '\n',
                      comment_lineno,
                      comment_column,
                      standalone=True), ancestor_at_indent)
              # The next group starts right after the lines of this one.
              comment_lineno += len(comment_group)
          else:
            # Otherwise there are two cases.
            #
            # 1. The comment is on its own line
            # 2. The comment is part of an expression.
            #
            # Unfortunately, it's fairly difficult to distinguish between the
            # two in lib2to3 trees. The algorithm here is to determine whether
            # child is the first leaf in the statement it belongs to. If it is,
            # then the comment (which is a prefix) belongs on a separate line.
            # If it is not, it means the comment is buried deep in the statement
            # and is part of some expression.
            stmt_parent = _FindStmtParent(child)

            # NEWLINE leaves are skipped; both other branches end with a break,
            # so only the first non-NEWLINE leaf of the statement is examined.
            for leaf_in_parent in stmt_parent.leaves():
              if leaf_in_parent.type == token.NEWLINE:
                continue
              elif id(leaf_in_parent) == id(child):
                # This comment stands on its own line, and it has to be inserted
                # into the appropriate parent. We'll have to find a suitable
                # parent to insert into. See comments above
                # _STANDALONE_LINE_NODES for more details.
                node_with_line_parent = _FindNodeWithStandaloneLineParent(child)
                pytree_utils.InsertNodesBefore(
                    _CreateCommentsFromPrefix(
                        comment_prefix, comment_lineno, 0, standalone=True),
                    node_with_line_parent)
                break
              else:
                # child is not the first leaf of its statement, so the comment
                # sits inside the statement.
                if comment_lineno == prev_leaf[0].lineno:
                  # The prefix starts on the line of the previous leaf: its
                  # first line, if not empty, becomes a COMMENT leaf inserted
                  # right after that leaf, and only the remaining lines are
                  # handled below.
                  comment_lines = comment_prefix.splitlines()
                  value = comment_lines[0].lstrip()
                  if value.rstrip('\n'):
                    # Column = end of the previous leaf plus the whitespace
                    # that precedes the comment text on that line.
                    comment_column = prev_leaf[0].column
                    comment_column += len(prev_leaf[0].value)
                    comment_column += (
                        len(comment_lines[0]) - len(comment_lines[0].lstrip()))
                    comment_leaf = pytree.Leaf(
                        type=token.COMMENT,
                        value=value.rstrip('\n'),
                        context=('', (comment_lineno, comment_column)))
                    pytree_utils.InsertNodesAfter([comment_leaf], prev_leaf[0])
                    comment_prefix = '\n'.join(comment_lines[1:])
                    comment_lineno += 1

                # rindex is where the last line of the right-stripped prefix
                # starts; the column used for the comments is the amount of
                # leading whitespace from that point on.
                rindex = (0 if '\n' not in comment_prefix.rstrip() else
                          comment_prefix.rstrip().rindex('\n') + 1)
                comment_column = (
                    len(comment_prefix[rindex:]) - len(
                        comment_prefix[rindex:].lstrip()))
                comments = _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False)
                pytree_utils.InsertNodesBefore(comments, child)
                break

        # Every leaf, with or without a comment prefix, becomes the previous
        # leaf for the ones visited after it.
        prev_leaf[0] = child

  _VisitNodeRec(tree)


def _CreateCommentsFromPrefix(comment_prefix,
                              comment_lineno,
                              comment_column,
                              standalone=False):
  """Create pytree nodes to represent the given comment prefix.

  The prefix may consist of multiple comment blocks, i.e. runs of consecutive
  lines whose first non-whitespace character is '#'. Each block gets its own
  COMMENT leaf; every line between two blocks is skipped.

  Args:
    comment_prefix: (unicode) the text of the comment from the node's prefix.
    comment_lineno: (int) the line number of the first line of the prefix.
    comment_column: (int) the column recorded on every created leaf.
    standalone: (bool) determines if the comment is standalone or not.

  Returns:
    A list with one entry per comment block, in the order the blocks appear.
    Each entry is a COMMENT leaf or, if standalone is true, a simple_stmt node
    wrapping that leaf. The value of a leaf is the lines of its block, each
    stripped of surrounding whitespace and joined with newlines. The line
    number of a leaf is the line of the last line of its block.
  """
  comments = []

  lines = comment_prefix.split('\n')
  index = 0
  while index < len(lines):
    # Collect one run of consecutive comment lines.
    comment_block = []
    while index < len(lines) and lines[index].lstrip().startswith('#'):
      comment_block.append(lines[index].strip())
      index += 1

    if comment_block:
      # index points one past the block, so this is the block's last line.
      new_lineno = comment_lineno + index - 1
      comment_leaf = pytree.Leaf(
          type=token.COMMENT,
          value='\n'.join(comment_block),
          context=('', (new_lineno, comment_column)))
      comment_node = comment_leaf if not standalone else pytree.Node(
          pygram.python_symbols.simple_stmt, [comment_leaf])
      comments.append(comment_node)

    # Skip everything up to the next comment line. Skipping blank lines only
    # would never advance past a non-blank line that is not a comment, and the
    # outer loop would then never terminate.
    while index < len(lines) and not lines[index].lstrip().startswith('#'):
      index += 1

  return comments


# A "standalone line node" is a tree node that must begin a new line in Python
# source; it can never come after a ';' or a ':'. Nodes such as 'expr_stmt' are
# parents of other nodes too, but they may appear later in a line. The set below
# names the standalone line nodes of the grammar. It is intended to become
# exhaustive *eventually*; entries are added as new corner cases of the parse
# tree are discovered.
#
# A standalone comment is one that sits on a line of its own rather than sharing
# a line with code. When such a comment is spliced in, it has to be inserted
# under an appropriate parent of the node it is attached to: the first
# "standalone line node" found in that node's parent chain.
_STANDALONE_LINE_NODES = frozenset((
    'file_input',
    'suite',
    'if_stmt',
    'while_stmt',
    'for_stmt',
    'try_stmt',
    'with_stmt',
    'funcdef',
    'classdef',
    'decorated',
))


def _FindNodeWithStandaloneLineParent(node):
  """Return node, or its closest ancestor, that sits under a standalone line.

  The parent chain is climbed, starting at node itself, until a node is reached
  whose parent is named in _STANDALONE_LINE_NODES.

  Arguments:
    node: node to start from

  Returns:
    The first node on the way up (possibly node itself) whose parent is a
    'standalone line' node.
  """
  candidate = node
  # The climb ends at the latest just below 'file_input', which is the root
  # node of any pytree and is itself a standalone line node.
  while pytree_utils.NodeName(candidate.parent) not in _STANDALONE_LINE_NODES:
    candidate = candidate.parent
  return candidate


# "Statement nodes" are standalone statements. They do not have to start a new
# line. The set is every standalone line node plus 'simple_stmt'.
_STATEMENT_NODES = _STANDALONE_LINE_NODES | frozenset(('simple_stmt',))


def _FindStmtParent(node):
  """Return the closest statement node at or above node.

  Arguments:
    node: node to start from

  Returns:
    node itself if its name is in _STATEMENT_NODES, otherwise the nearest
    ancestor whose name is.
  """
  current = node
  while pytree_utils.NodeName(current) not in _STATEMENT_NODES:
    current = current.parent
  return current


def _FindAncestorAtIndent(node, indent):
  """Locate the node, at or above node, that matches the given indentation.

  Arguments:
    node: node to start from. This must not be the tree root.
    indent: indentation string for the ancestor we're looking for.
        See _AnnotateIndents for more details.

  Returns:
    The first node on the way up (possibly node itself) whose parent carries a
    child-indent annotation that indent starts with. If there is none, the node
    directly below the tree root is returned.
  """
  candidate = node
  # Stop climbing once the parent is the tree root: nowhere else to go.
  while candidate.parent.parent is not None:
    parent_indent = pytree_utils.GetNodeAnnotation(
        candidate.parent, pytree_utils.Annotation.CHILD_INDENT)
    # A prefix match is used instead of equality because comments may be
    # improperly indented (say by three spaces where the surrounding statements
    # use zero, two or four), and those should not travel all the way up to the
    # root.
    if parent_indent is not None and indent.startswith(parent_indent):
      break
    candidate = candidate.parent
  return candidate


def _AnnotateIndents(tree):
  """Record on each node the indentation used by its children.

  The child_indent annotation of a node is the indentation string (for example
  "  ") of its children, taken from the value of the node's INDENT child. The
  root of the tree is annotated with the empty string.

  Arguments:
    tree: root of a pytree. The pytree is modified to add annotations to nodes.

  Raises:
    RuntimeError: if the tree is malformed.
  """
  indent_key = pytree_utils.Annotation.CHILD_INDENT

  # Only the root has no parent; its children are not indented.
  if tree.parent is None:
    pytree_utils.SetNodeAnnotation(tree, indent_key, '')

  for subtree in tree.children:
    if subtree.type == token.INDENT:
      known_indent = pytree_utils.GetNodeAnnotation(tree, indent_key)
      # An indent recorded earlier for this node must agree with this one.
      if known_indent is not None and known_indent != subtree.value:
        raise RuntimeError('inconsistent indentation for child',
                           (tree, subtree))
      pytree_utils.SetNodeAnnotation(tree, indent_key, subtree.value)
    _AnnotateIndents(subtree)