- """Defines a function for lexing a comment, `lex`."""
-
- from typing import (
- Iterator,
- List,
- Optional,
- )
- from .custom_assert import Assert
- from .peaker import Peaker
- from .token import Token, TokenType
- from .config import (
- get_config,
- )
-
- # These convenience functions take an optional string
- # because the peaker could return None when at the end
- # of the stream.
-
-
- def _is_space(char):
- # type: (Optional[str]) -> bool
- return char == ' '
-
-
- def _is_newline(char):
- # type: (Optional[str]) -> bool
- return char == '\n'
-
-
- def _is_colon(char):
- # type: (Optional[str]) -> bool
- return char == ':'
-
-
- def _is_hash(char):
- # type: (Optional[str]) -> bool
- return char == '#'
-
-
- def _is_separator(char):
- # type: (Optional[str]) -> bool
- """Check whether if `char` is a separator other than newline or space.
-
- Args:
- char: The character to check.
-
- Returns:
- true if `char` is a separator other than newline or space.
-
- """
    if char is None:
        return False
    return char.isspace() and not (_is_space(char) or _is_newline(char))


def _is_lparen(char):
    # type: (Optional[str]) -> bool
    return char == '('


def _is_rparen(char):
    # type: (Optional[str]) -> bool
    return char == ')'


def _is_hyphen(char):
    # type: (Optional[str]) -> bool
    return char == '-'


def _is_word(char):
    # type: (str) -> bool
    return not any([
        _is_space(char),
        _is_newline(char),
        _is_colon(char),
        _is_separator(char),
        _is_hash(char),
        _is_lparen(char),
        _is_rparen(char),
    ])


def lex(program):
    # type: (str) -> Iterator[Token]
    """Create a stream of tokens from the string.

    Args:
        program: The program to lex, as a string.

    Yields:
        Tokens lexed from the string.

    """
    extra = ''  # Extra characters which are pulled but unused from a check.
    peaker = Peaker((x for x in program or []))  # the stream
    line_number = 0
    # Get the configuration, which determines how many spaces
    # count as a single indent.
    config = get_config()

    while peaker.has_next():
        # Each of the following conditions must move the stream
        # forward and -- excepting separators -- yield a token.
        if _is_space(peaker.peak()):
            spaces = ''.join(peaker.take_while(_is_space))
            for _ in range(len(spaces) // config.indentation):
                yield Token(' ' * 4, TokenType.INDENT, line_number)
        elif _is_newline(peaker.peak()):
            value = peaker.next()
            yield Token(value, TokenType.NEWLINE, line_number)
            line_number += 1
        elif _is_colon(peaker.peak()):
            value = peaker.next()
            yield Token(value, TokenType.COLON, line_number)
        elif _is_separator(peaker.peak()):
            peaker.take_while(_is_separator)
        elif _is_hash(peaker.peak()):
            value = peaker.next()
            yield Token(value, TokenType.HASH, line_number)
        elif _is_lparen(peaker.peak()):
            value = peaker.next()
            yield Token(value, TokenType.LPAREN, line_number)
        elif _is_rparen(peaker.peak()):
            value = peaker.next()
            yield Token(value, TokenType.RPAREN, line_number)
        elif _is_hyphen(peaker.peak()):
            value = ''.join(peaker.take_while(_is_word))
            if value.count('-') == len(value):
                yield Token(value, TokenType.HEADER, line_number)
            else:
                yield Token(value, TokenType.WORD, line_number)
        else:
            value = ''.join(peaker.take_while(_is_word))
            if extra != '':
                value = extra + value
                extra = ''
            Assert(
                len(value) > 0,
                "There should be non-special characters.",
            )
            yield Token(value, TokenType.WORD, line_number)

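
# A minimal usage sketch (illustrative only, not part of the module;
# assumes the default configuration, in which four spaces count as
# one indent):
#
#     tokens = list(lex('Args:\n    x: A number.\n'))
#     [token.token_type for token in tokens]
#     # => [WORD, COLON, NEWLINE, INDENT, WORD, COLON,
#     #     WORD, WORD, NEWLINE]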
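# Maps words which introduce docstring sections to their token types.
# The capitalized entries (e.g. 'Args', 'Returns') match Google- and
# numpy-style section headers, while the lowercase entries (e.g.
# 'param', 'rtype') match Sphinx-style fields.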
KEYWORDS = {
    'Args': TokenType.ARGUMENTS,
    'Arguments': TokenType.ARGUMENTS,
    'Yields': TokenType.YIELDS,
    'Raises': TokenType.RAISES,
    'Returns': TokenType.RETURNS,
    'noqa': TokenType.NOQA,
    'param': TokenType.ARGUMENTS,
    'parameter': TokenType.ARGUMENTS,
    'Parameters': TokenType.ARGUMENTS,
    'arg': TokenType.ARGUMENTS,
    'argument': TokenType.ARGUMENTS,
    'key': TokenType.VARIABLES,
    'keyword': TokenType.VARIABLES,
    'var': TokenType.VARIABLES,
    'ivar': TokenType.VARIABLES,
    'cvar': TokenType.VARIABLES,
    'type': TokenType.ARGUMENT_TYPE,
    'vartype': TokenType.VARIABLE_TYPE,
    'raises': TokenType.RAISES,
    'yield': TokenType.YIELDS,
    'yields': TokenType.YIELDS,
    'ytype': TokenType.YIELD_TYPE,
    'return': TokenType.RETURNS,
    'returns': TokenType.RETURNS,
    'rtype': TokenType.RETURN_TYPE,
    'Other': TokenType.OTHER,
    'Receives': TokenType.RECEIVES,
    'Warns': TokenType.WARNS,
    'Warnings': TokenType.WARNS,
    'See': TokenType.SEE,
    'Also': TokenType.ALSO,
    'Notes': TokenType.NOTES,
    'Examples': TokenType.EXAMPLES,
    'References': TokenType.REFERENCES,
}


def condense(tokens):
    # type: (Iterator[Token]) -> List[Token]
    """Condense the stream of tokens into a list consumable by CYK.

    This serves two purposes:

    1. It minimizes the lookup table used in the CYK algorithm.
       (The CYK algorithm is a dynamic-programming algorithm, with one
       dimension of the two-dimensional lookup table being determined
       by the number of tokens.)

    2. It applies more discriminating token types to the tokens
       identified by lex.  Eventually, this will be moved into the
       lex function.

    Args:
        tokens: The stream of tokens from the lex function.

    Returns:
        A list of tokens which have been condensed into as small a
        representation as possible.

    """
    ret = list()  # type: List[Token]
    try:
        curr = next(tokens)
    except StopIteration:
        return ret

    if curr.value in KEYWORDS:
        curr.token_type = KEYWORDS[curr.value]

    encountered_noqa = False

    for token in tokens:
        if token.token_type == TokenType.WORD and token.value in KEYWORDS:
            ret.append(curr)
            if token.value == 'noqa':
                encountered_noqa = True
            curr = Token(
                token.value,
                KEYWORDS[token.value],
                token.line_number,
            )
        elif token.token_type == TokenType.WORD:
            if curr.token_type == TokenType.WORD and not encountered_noqa:
                curr.value += ' {}'.format(token.value)
            else:
                ret.append(curr)
                curr = token
        elif token.token_type == TokenType.NEWLINE:
            ret.append(curr)
            curr = token
            encountered_noqa = False
        else:
            ret.append(curr)
            curr = token

    ret.append(curr)

    return ret
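

# A minimal sketch of the full pipeline (illustrative only; assumes
# the default configuration):
#
#     condensed = condense(lex('Args:\n    x: A number.\n'))
#
# Here 'Args' is re-tagged from WORD to TokenType.ARGUMENTS via
# KEYWORDS, and the adjacent words 'A' and 'number.' are merged into
# a single WORD token with the value 'A number.'.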