Package epydoc :: Package markup :: Module epytext
[hide private]
[frames] | no frames]

Source Code for Module epydoc.markup.epytext

   1  # 
   2  # epytext.py: epydoc formatted docstring parsing 
   3  # Edward Loper 
   4  # 
   5  # Created [04/10/01 12:00 AM] 
   6  # $Id: epytext.py 1803 2008-02-27 00:32:35Z edloper $ 
   7  # 
   8   
   9  """ 
  10  Parser for epytext strings.  Epytext is a lightweight markup whose 
  11  primary intended application is Python documentation strings.  This 
  12  parser converts Epytext strings to a simple DOM-like representation 
  13  (encoded as a tree of L{Element} objects and strings).  Epytext 
  14  strings can contain the following X{structural blocks}: 
  15   
  16      - X{epytext}: The top-level element of the DOM tree. 
  17      - X{para}: A paragraph of text.  Paragraphs contain no newlines,  
  18        and all spaces are soft. 
  19      - X{section}: A section or subsection. 
  20      - X{field}: A tagged field.  These fields provide information 
  21        about specific aspects of a Python object, such as the 
  22        description of a function's parameter, or the author of a 
  23        module. 
  24      - X{literalblock}: A block of literal text.  This text should be 
  25        displayed as it would be displayed in plaintext.  The 
  26        parser removes the appropriate amount of leading whitespace  
  27        from each line in the literal block. 
  28      - X{doctestblock}: A block containing sample python code, 
  29        formatted according to the specifications of the C{doctest} 
  30        module. 
  31      - X{ulist}: An unordered list. 
  32      - X{olist}: An ordered list. 
  33      - X{li}: A list item.  This tag is used both for unordered list 
  34        items and for ordered list items. 
  35   
  36  Additionally, the following X{inline regions} may be used within 
  37  C{para} blocks: 
  38       
  39      - X{code}:   Source code and identifiers. 
  40      - X{math}:   Mathematical expressions. 
  41      - X{index}:  A term which should be included in an index, if one 
  42                   is generated. 
  43      - X{italic}: Italicized text. 
  44      - X{bold}:   Bold-faced text. 
  45      - X{uri}:    A Universal Resource Indicator (URI) or Universal 
  46                   Resource Locator (URL) 
  47      - X{link}:   A Python identifier which should be hyperlinked to 
  48                   the named object's documentation, when possible. 
  49   
  50  The returned DOM tree will conform to the following Document Type 
  51  Description:: 
  52   
  53     <!ENTITY % colorized '(code | math | index | italic | 
  54                            bold | uri | link | symbol)*'> 
  55   
  56     <!ELEMENT epytext ((para | literalblock | doctestblock | 
  57                        section | ulist | olist)*, fieldlist?)> 
  58   
  59     <!ELEMENT para (#PCDATA | %colorized;)*> 
  60   
  61     <!ELEMENT section (para | literalblock | doctestblock | 
  62                        section | ulist | olist)+> 
  63   
  64     <!ELEMENT fieldlist (field+)> 
  65     <!ELEMENT field (tag, arg?, (para | literalblock | doctestblock | 
  66                                  ulist | olist)+)> 
  67     <!ELEMENT tag (#PCDATA)> 
  68     <!ELEMENT arg (#PCDATA)> 
  69      
  70     <!ELEMENT literalblock (#PCDATA | %colorized;)*> 
  71     <!ELEMENT doctestblock (#PCDATA)> 
  72   
  73     <!ELEMENT ulist (li+)> 
  74     <!ELEMENT olist (li+)> 
  75     <!ELEMENT li (para | literalblock | doctestblock | ulist | olist)+> 
  76     <!ATTLIST li bullet NMTOKEN #IMPLIED> 
  77     <!ATTLIST olist start NMTOKEN #IMPLIED> 
  78   
  79     <!ELEMENT uri     (name, target)> 
  80     <!ELEMENT link    (name, target)> 
  81     <!ELEMENT name    (#PCDATA | %colorized;)*> 
  82     <!ELEMENT target  (#PCDATA)> 
  83      
  84     <!ELEMENT code    (#PCDATA | %colorized;)*> 
  85     <!ELEMENT math    (#PCDATA | %colorized;)*> 
  86     <!ELEMENT italic  (#PCDATA | %colorized;)*> 
  87     <!ELEMENT bold    (#PCDATA | %colorized;)*> 
  88     <!ELEMENT indexed (#PCDATA | %colorized;)*> 
  89     <!ATTLIST code style CDATA #IMPLIED> 
  90   
  91     <!ELEMENT symbol (#PCDATA)> 
  92   
  93  @var SYMBOLS: A list of the escape symbols that are supported 
  94        by epydoc.  Currently the following symbols are supported: 
  95  <<<SYMBOLS>>> 
  96  """ 
  97  # Note: the symbol list is appended to the docstring automatically, 
  98  # below. 
  99   
 100  __docformat__ = 'epytext en' 
 101   
 102  # Code organization.. 
 103  #   1. parse() 
 104  #   2. tokenize() 
 105  #   3. colorize() 
 106  #   4. helpers 
 107  #   5. testing 
 108   
 109  import re, string, types, sys, os.path 
 110  from epydoc.markup import * 
 111  from epydoc.util import wordwrap, plaintext_to_html, plaintext_to_latex 
 112  from epydoc.markup.doctest import doctest_to_html, doctest_to_latex 
 113   
 114  ################################################## 
 115  ## DOM-Like Encoding 
 116  ################################################## 
 117   
class Element:
    """
    A very simple DOM-like representation for parsed epytext
    documents.  Each epytext document is encoded as a tree whose nodes
    are L{Element} objects, and whose leaves are C{string}s.  Each
    node is marked by a I{tag} and zero or more I{attributes}.  Each
    attribute is a mapping from a string key to a string value.
    """
    def __init__(self, tag, *children, **attribs):
        """
        Create a new element.

        @param tag: The tag (element type) of the new element.
        @param children: The element's children, in document order;
            each is either a string or another L{Element}.
        @param attribs: The element's attributes, given as keyword
            arguments.
        """
        self.tag = tag
        """A string tag indicating the type of this element.
        @type: C{string}"""

        # Copy into a list so that callers can append to / splice the
        # children in place.
        self.children = list(children)
        """A list of the children of this element.
        @type: C{list} of (C{string} or C{Element})"""

        self.attribs = attribs
        """A dictionary mapping attribute names to attribute values
        for this element.
        @type: C{dict} from C{string} to C{string}"""

    def __str__(self):
        """
        Return a string representation of this element, using XML
        notation.
        @bug: Doesn't escape '<' or '&' or '>'.
        """
        attribs = ''.join([' %s=%r' % t for t in self.attribs.items()])
        return ('<%s%s>' % (self.tag, attribs) +
                ''.join([str(child) for child in self.children]) +
                '</%s>' % self.tag)

    def __repr__(self):
        """
        Return a constructor-style representation of this element,
        e.g. C{Element('para', 'text', inline=True)}.  The tag,
        children and attribute values are all formatted with C{repr},
        so the tag appears quoted like any other string argument.
        """
        attribs = ''.join([', %s=%r' % t for t in self.attribs.items()])
        args = ''.join([', %r' % c for c in self.children])
        return 'Element(%r%s%s)' % (self.tag, args, attribs)
155 156 ################################################## 157 ## Constants 158 ################################################## 159 160 # The possible heading underline characters, listed in order of 161 # heading depth. 162 _HEADING_CHARS = "=-~" 163 164 # Escape codes. These should be needed very rarely. 165 _ESCAPES = {'lb':'{', 'rb': '}'} 166 167 # Symbols. These can be generated via S{...} escapes. 168 SYMBOLS = [ 169 # Arrows 170 '<-', '->', '^', 'v', 171 172 # Greek letters 173 'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 174 'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu', 175 'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma', 176 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega', 177 'Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon', 'Zeta', 178 'Eta', 'Theta', 'Iota', 'Kappa', 'Lambda', 'Mu', 179 'Nu', 'Xi', 'Omicron', 'Pi', 'Rho', 'Sigma', 180 'Tau', 'Upsilon', 'Phi', 'Chi', 'Psi', 'Omega', 181 182 # HTML character entities 183 'larr', 'rarr', 'uarr', 'darr', 'harr', 'crarr', 184 'lArr', 'rArr', 'uArr', 'dArr', 'hArr', 185 'copy', 'times', 'forall', 'exist', 'part', 186 'empty', 'isin', 'notin', 'ni', 'prod', 'sum', 187 'prop', 'infin', 'ang', 'and', 'or', 'cap', 'cup', 188 'int', 'there4', 'sim', 'cong', 'asymp', 'ne', 189 'equiv', 'le', 'ge', 'sub', 'sup', 'nsub', 190 'sube', 'supe', 'oplus', 'otimes', 'perp', 191 192 # Alternate (long) names 193 'infinity', 'integral', 'product', 194 '>=', '<=', 195 ] 196 # Convert to a dictionary, for quick lookup 197 _SYMBOLS = {} 198 for symbol in SYMBOLS: _SYMBOLS[symbol] = 1 199 200 # Add symbols to the docstring. 201 symblist = ' ' 202 symblist += ';\n '.join([' - C{E{S}{%s}}=S{%s}' % (symbol, symbol) 203 for symbol in SYMBOLS]) 204 __doc__ = __doc__.replace('<<<SYMBOLS>>>', symblist) 205 del symbol, symblist 206 207 # Tags for colorizing text. 
# Maps the single capital letter that introduces an inline markup
# region (e.g. the "C" in C{...}) to the tag of the DOM element that
# is generated for that region.
_COLORIZING_TAGS = {
    'C': 'code',
    'M': 'math',
    'X': 'indexed',
    'I': 'italic',
    'B': 'bold',
    'U': 'uri',
    'L': 'link',       # A Python identifier that should be linked to
    'E': 'escape',     # escapes characters or creates symbols
    'S': 'symbol',
    'G': 'graph',
    }

# Which tags can use "link syntax" (e.g., U{Python<www.python.org>})?
_LINK_COLORIZING_TAGS = ['link', 'uri']

##################################################
## Structuring (Top Level)
##################################################

def parse(str, errors = None):
    """
    Return a DOM tree encoding the contents of an epytext string.  Any
    errors generated during parsing will be stored in C{errors}.

    @param str: The epytext string to parse.
    @type str: C{string}
    @param errors: A list where any errors generated during parsing
        will be stored.  If no list is specified, then fatal errors
        will generate exceptions, and non-fatal errors will be
        ignored.
    @type errors: C{list} of L{ParseError}
    @return: a DOM tree encoding the contents of an epytext string;
        or C{None} if an C{errors} list was given and a fatal error
        was encountered.
    @rtype: C{Element}
    @raise ParseError: If C{errors} is C{None} and a fatal error is
        encountered while parsing.
    """
    # Initialize errors list.  When the caller did not supply one, we
    # collect errors privately and raise the first fatal one at the end.
    raise_on_error = errors is None
    if raise_on_error:
        errors = []

    # Preprocess the string: normalize CRLF line endings, and expand
    # tabs (default tab size).
    str = str.replace('\r\n', '\n')
    str = str.expandtabs()

    # Tokenize the input string.
    tokens = _tokenize(str, errors)

    # Have we encountered a field yet?
    encountered_field = False

    # Create a document to hold the epytext.
    doc = Element('epytext')

    # Maintain two parallel stacks: one contains DOM elements, and
    # gives the ancestors of the current block.  The other contains
    # indentation values, and gives the indentation of the
    # corresponding DOM elements.  An indentation of "None" reflects
    # an unknown indentation.  However, the indentation must be
    # greater than, or greater than or equal to, the indentation of
    # the prior element (depending on what type of DOM element it
    # corresponds to).  No 2 consecutive indent_stack values will
    # ever be "None."  Use initial dummy elements in the stack, so we
    # don't have to worry about bounds checking.
    stack = [None, doc]
    indent_stack = [-1, None]

    for token in tokens:
        # Uncomment this for debugging:
        #print ('%s: %s\n%s: %s\n' %
        #       (''.join(['%-11s' % (t and t.tag) for t in stack]),
        #        token.tag, ''.join(['%-11s' % i for i in indent_stack]),
        #        token.indent))

        # Pop any completed blocks off the stack.
        _pop_completed_blocks(token, stack, indent_stack)

        # If Token has type PARA, colorize and add the new paragraph
        if token.tag == Token.PARA:
            _add_para(doc, token, stack, indent_stack, errors)

        # If Token has type HEADING, add the new section
        elif token.tag == Token.HEADING:
            _add_section(doc, token, stack, indent_stack, errors)

        # If Token has type LBLOCK or DTBLOCK, add the new literal
        # block or doctest block directly to the current parent.
        elif token.tag in (Token.LBLOCK, Token.DTBLOCK):
            stack[-1].children.append(token.to_dom(doc))

        # If Token has type BULLET, add the new list/list item/field
        elif token.tag == Token.BULLET:
            _add_list(doc, token, stack, indent_stack, errors)

        else:
            # Raise explicitly (rather than `assert 0`) so that the
            # check is not stripped when running with -O.
            raise AssertionError('Unknown token type: %s' % token.tag)

        # Check if the DOM element we just added was a field..
        if stack[-1].tag == 'field':
            encountered_field = True
        elif encountered_field and len(stack) <= 3:
            estr = ("Fields must be the final elements in an "+
                    "epytext string.")
            errors.append(StructuringError(estr, token.startline))

    # Graphs use inline markup (G{...}) but are really block-level
    # elements; so "raise" any graphs we generated.  This is a bit of
    # a hack, but the alternative is to define a new markup for
    # block-level elements, which I'd rather not do.  (See sourceforge
    # bug #1673017.)
    #
    # Note: _raise_graphs may splice new siblings into doc.children
    # while we iterate over it; the iteration order of the original
    # code is kept unchanged here.
    for child in doc.children:
        _raise_graphs(child, doc)

    # If there was a fatal error, then signal it!  Raise the first
    # *fatal* error: errors[0] may be a non-fatal warning that just
    # happened to be recorded earlier.
    fatal_errors = [e for e in errors if e.is_fatal()]
    if fatal_errors:
        if raise_on_error:
            raise fatal_errors[0]
        return None

    # Return the top-level epytext DOM element.
    return doc
336
def _raise_graphs(tree, parent):
    """
    Move any C{graph} elements that are direct children of C{tree} up
    into C{parent}, after first doing the same (recursively) for each
    of C{tree}'s element children.  Elements whose tag is block-level
    (section, fieldlist, field, ulist, olist, li) keep their graph
    children; any other element is split around each graph child, so
    that C{parent} ends up containing the left piece, the graph, and
    the right piece in place of the original element.

    @param tree: The element to process.
    @param parent: The element whose C{children} list contains C{tree}.
    """
    # Recurse to children.  NB: the recursive call may splice new
    # elements (including graphs) into tree.children while this loop
    # is running; iterating over the live list is what lets us see
    # them, so do not iterate over a copy here.
    found_graph = False
    for node in tree.children:
        if not isinstance(node, Element):
            continue
        _raise_graphs(node, tree)
        if node.tag == 'graph':
            found_graph = True

    # Nothing to raise, or the graph already lives in a block-level
    # container.
    if not found_graph:
        return
    if tree.tag in ('section', 'fieldlist', 'field', 'ulist', 'olist', 'li'):
        return

    offset = 0                          # position within current piece
    slot = parent.children.index(tree)  # position of piece in parent
    # This loop walks the *original* children list; `tree` is rebound
    # below to the right-hand piece, which holds the same remaining
    # children, so `offset` always indexes into the current piece.
    for node in tree.children:
        if not (isinstance(node, Element) and node.tag == 'graph'):
            offset += 1
            continue

        # We found a graph: splice it into the parent.
        before = Element(tree.tag, *tree.children[:offset], **tree.attribs)
        after = Element(tree.tag, *tree.children[offset+1:], **tree.attribs)
        parent.children[slot:slot+1] = [before, node, after]

        # Continue with the right-hand piece.
        offset = 0
        slot += 2
        tree = parent.children[slot]
363
def _pop_completed_blocks(token, stack, indent_stack):
    """
    Pop any completed blocks off the stack.  This includes any
    blocks that we have dedented past, as well as any list item
    blocks that we've dedented to.  The top element on the stack
    should only be a list if we're about to start a new list
    item (i.e., if the next token is a bullet).

    @param token: The token that is about to be added.  Its
        C{indent}, C{tag} and (for bullets) C{contents} are read.
    @param stack: The stack of open DOM elements; modified in place.
        Its two bottom entries are never popped.
    @param indent_stack: The indentations parallel to C{stack}
        (C{None} for "not yet known"); modified in place.
    """
    indent = token.indent
    if indent is None:
        # Unknown indentation: we can't tell what has been completed.
        return

    is_bullet = (token.tag == 'bullet')
    while len(stack) > 2:
        top_tag = stack[-1].tag

        # Dedent past a block.  If the top block's own indentation is
        # still unknown, compare against its parent's instead.
        if indent_stack[-1] is not None:
            dedented = indent < indent_stack[-1]
        else:
            dedented = indent < indent_stack[-2]

        complete = (
            dedented
            # Dedent to a list item, if it is followed by another list
            # item with the same indentation.
            or (is_bullet and indent == indent_stack[-2] and
                top_tag in ('li', 'field'))
            # End of a list (no more list items available): the next
            # token is not a bullet, or it is a field tag.
            or (top_tag in ('ulist', 'olist') and
                (not is_bullet or token.contents[-1] == ':')))

        # Pop the block, if it's complete.  Otherwise, we're done.
        if not complete:
            return
        stack.pop()
        indent_stack.pop()
395
def _add_para(doc, para_token, stack, indent_stack, errors):
    """
    Colorize the given paragraph, and add it to the DOM tree.

    @param doc: The top-level document element (passed through to
        the colorizer).
    @param para_token: The paragraph token to add.
    @param stack: The stack of open DOM elements; the paragraph is
        appended to the children of the top element.
    @param indent_stack: The indentations parallel to C{stack}.  If
        the top entry is C{None} (unknown), it is set to the
        paragraph's indentation.
    @param errors: A list to which a L{StructuringError} is appended
        if the paragraph's indentation does not match its parent's.
    """
    # Check indentation, and update the parent's indentation
    # when appropriate.
    if indent_stack[-1] is None:
        indent_stack[-1] = para_token.indent

    if para_token.indent != indent_stack[-1]:
        estr = "Improper paragraph indentation."
        errors.append(StructuringError(estr, para_token.startline))
        return

    # Colorize the paragraph and add it.
    para = _colorize(doc, para_token, errors)
    if para_token.inline:
        para.attribs['inline'] = True
    stack[-1].children.append(para)
411
def _add_section(doc, heading_token, stack, indent_stack, errors):
    """
    Add a new section to the DOM tree, with the given heading.

    @param doc: The top-level document element (passed through to
        the colorizer).
    @param heading_token: The heading token; its C{level} selects
        how deep in the section hierarchy the new section is placed.
    @param stack: The stack of open DOM elements.  It is truncated to
        the heading's level, and the new section is then pushed.
    @param indent_stack: The indentations parallel to C{stack}; kept
        in step with it.  The new section's indentation is recorded
        as C{None} (unknown).
    @param errors: A list to which L{StructuringError}s are appended
        for improper indentation, headings that are not at the top
        level, and underline characters that skip a heading level.
    """
    if indent_stack[-1] is None:
        indent_stack[-1] = heading_token.indent
    elif indent_stack[-1] != heading_token.indent:
        estr = "Improper heading indentation."
        errors.append(StructuringError(estr, heading_token.startline))

    # Check for errors.  Everything open above the document element
    # must itself be a section; report at most one error for this.
    for ancestor in stack[2:]:
        if ancestor.tag != "section":
            estr = "Headings must occur at the top level."
            errors.append(StructuringError(estr, heading_token.startline))
            break

    # The new section's depth may exceed the current depth by at most
    # one level.
    depth = heading_token.level + 2
    if depth > len(stack):
        estr = "Wrong underline character for heading."
        errors.append(StructuringError(estr, heading_token.startline))

    # Pop the appropriate number of headings so we're at the
    # correct level.
    del stack[depth:]
    del indent_stack[depth:]

    # Colorize the heading
    head = _colorize(doc, heading_token, errors, 'heading')

    # Add the section's and heading's DOM elements.
    sec = Element("section")
    stack[-1].children.append(sec)
    stack.append(sec)
    sec.children.append(head)
    indent_stack.append(None)
444
445 -def _add_list(doc, bullet_token, stack, indent_stack, errors):
446 """ 447 Add a new list item or field to the DOM tree, with the given 448 bullet or field tag. When necessary, create the associated 449 list. 450 """ 451 # Determine what type of bullet it is. 452 if bullet_token.contents[-1] == '-': 453 list_type = 'ulist' 454 elif bullet_token.contents[-1] == '.': 455 list_type = 'olist' 456 elif bullet_token.contents[-1] == ':': 457 list_type = 'fieldlist' 458 else: 459 raise AssertionError('Bad Bullet: %r' % bullet_token.contents) 460 461 # Is this a new list? 462 newlist = 0 463 if stack[-1].tag != list_type: 464 newlist = 1 465 elif list_type == 'olist' and stack[-1].tag == 'olist': 466 old_listitem = stack[-1].children[-1] 467 old_bullet = old_listitem.attribs.get("bullet").split('.')[:-1] 468 new_bullet = bullet_token.contents.split('.')[:-1] 469 if (new_bullet[:-1] != old_bullet[:-1] or 470 int(new_bullet[-1]) != int(old_bullet[-1])+1): 471 newlist = 1 472 473 # Create the new list. 474 if newlist: 475 if stack[-1].tag is 'fieldlist': 476 # The new list item is not a field list item (since this 477 # is a new list); but it's indented the same as the field 478 # list. This either means that they forgot to indent the 479 # list, or they are trying to put something after the 480 # field list. The first one seems more likely, so we'll 481 # just warn about that (to avoid confusion). 482 estr = "Lists must be indented." 483 errors.append(StructuringError(estr, bullet_token.startline)) 484 if stack[-1].tag in ('ulist', 'olist', 'fieldlist'): 485 stack.pop() 486 indent_stack.pop() 487 488 if (list_type != 'fieldlist' and indent_stack[-1] is not None and 489 bullet_token.indent == indent_stack[-1]): 490 # Ignore this error if there's text on the same line as 491