# Source code for pymorphy2.tagset

# -*- coding: utf-8 -*-
"""
Utils for working with grammatical tags.
"""
from __future__ import absolute_import, unicode_literals
import collections

try:
    from sys import intern
except ImportError:
    # python 2.x has builtin ``intern`` function
    pass

# a bit of *heavy* magic...
class _select_grammeme_from(object):
    """
    Descriptor object for accessing grammemes of certain classes
    (e.g. number or voice).

    ``grammeme_set`` is the set of grammemes that belong to the category.
    Reading the attribute on an instance returns the grammeme from
    ``instance.grammemes`` that is also in ``grammeme_set`` (or None if
    there is no such grammeme). Reading it on the class returns
    the descriptor itself.
    """
    def __init__(self, grammeme_set):
        self.grammeme_set = grammeme_set

        # ... are descriptors not magical enough?

        # In order to fight typos, raise an exception
        # if a result is compared to a grammeme which
        # is not in a set of allowed grammemes.

        # The text type of a string literal: ``unicode`` when the module
        # enables unicode_literals under Python 2.x, ``str`` otherwise.
        _str = type("unicode string")

        class TypedGrammeme(_str):
            def __eq__(self, other):
                if other is None:
                    return False
                if other not in grammeme_set:
                    # ``other`` is wrapped in a 1-tuple so that a tuple
                    # operand is formatted as a value instead of being
                    # unpacked as %-arguments (which raised TypeError).
                    raise ValueError(
                        "'%s' is not a valid grammeme for this attribute." % (other,)
                    )
                return _str.__eq__(self, other)

            def __ne__(self, other):
                return not self.__eq__(other)

            def __hash__(self):
                # __eq__ is overridden, so __hash__ has to be restored
                # explicitly to keep instances hashable under Python 3.x.
                return _str.__hash__(self)

        self.TypedGrammeme = TypedGrammeme

    def __get__(self, instance, owner):
        if instance is None:
            # Class-level access (e.g. ``Tag.POS``): there are no grammemes
            # to select from, so follow the usual descriptor convention
            # and return the descriptor itself instead of failing on
            # ``None.grammemes``.
            return self

        grammemes = self.grammeme_set & instance.grammemes

        if not grammemes:
            # XXX: type checks are not enforced in this case
            return None

        res = next(iter(grammemes))
        return self.TypedGrammeme(res) if owner.typed_grammemes else res


# Design notes: Tag objects are immutable, but the tag class is mutable.
class OpencorporaTag(object):
    """
    Wrapper class for OpenCorpora.org tags.

    .. warning::

        In order to work properly, the class has to be globally
        initialized with actual grammemes (using _init_grammemes method).

        Pymorphy2 initializes it when loading a dictionary;
        it may be not a good idea to use this class directly.
        If possible, use ``morph_analyzer.TagClass`` instead.

    Example::

        >>> from pymorphy2 import MorphAnalyzer
        >>> morph = MorphAnalyzer()
        >>> Tag = morph.TagClass # get an initialized Tag class
        >>> tag = Tag('VERB,perf,tran plur,impr,excl')
        >>> tag
        OpencorporaTag('VERB,perf,tran plur,impr,excl')

    Tag instances have attributes for accessing grammemes::

        >>> print(tag.POS)
        VERB
        >>> print(tag.number)
        plur
        >>> print(tag.case)
        None

    Available attributes are: POS, animacy, aspect, case, gender, involvement,
    mood, number, person, tense, transitivity and voice.

    You may check if a grammeme is in tag or if all grammemes
    from a given set are in tag::

        >>> 'perf' in tag
        True
        >>> 'nomn' in tag
        False
        >>> 'Geox' in tag
        False
        >>> set(['VERB', 'perf']) in tag
        True
        >>> set(['VERB', 'perf', 'sing']) in tag
        False

    In order to fight typos, for unknown grammemes an exception is raised::

        >>> 'foobar' in tag
        Traceback (most recent call last):
        ...
        ValueError: Grammeme is unknown: foobar
        >>> set(['NOUN', 'foo', 'bar']) in tag
        Traceback (most recent call last):
        ...
        ValueError: Grammemes are unknown: {'bar', 'foo'}

    This also works for attributes::

        >>> tag.POS == 'plur'
        Traceback (most recent call last):
        ...
        ValueError: 'plur' is not a valid grammeme for this attribute.

    """

    __slots__ = ['_grammemes_tuple', '_grammemes_cache', '_str', '_POS']

    # Grammeme categories
    # (see http://opencorpora.org/dict.php?act=gram for a full set)
    # -------------------------------------------------------------

    PARTS_OF_SPEECH = frozenset([
        'NOUN',  # noun
        'ADJF',  # adjective (full)
        'ADJS',  # adjective (short)
        'COMP',  # comparative
        'VERB',  # verb (personal form)
        'INFN',  # verb (infinitive)
        'PRTF',  # participle (full)
        'PRTS',  # participle (short)
        'GRND',  # adverbial participle (gerund)
        'NUMR',  # numeral
        'ADVB',  # adverb
        'NPRO',  # noun-like pronoun
        'PRED',  # predicative
        'PREP',  # preposition
        'CONJ',  # conjunction
        'PRCL',  # particle
        'INTJ',  # interjection
    ])

    ANIMACY = frozenset([
        'anim',  # animate
        'inan',  # inanimate
    ])

    GENDERS = frozenset([
        'masc',  # masculine
        'femn',  # feminine
        'neut',  # neuter
    ])

    NUMBERS = frozenset([
        'sing',  # singular
        'plur',  # plural
    ])

    CASES = frozenset([
        'nomn',  # nominative
        'gent',  # genitive
        'datv',  # dative
        'accs',  # accusative
        'ablt',  # instrumental
        'loct',  # prepositional
        'voct',  # vocative
        'gen1',  # first genitive
        'gen2',  # second genitive (partitive)
        'acc2',  # second accusative
        'loc1',  # first prepositional
        'loc2',  # second prepositional (locative)
    ])

    ASPECTS = frozenset([
        'perf',  # perfective
        'impf',  # imperfective
    ])

    TRANSITIVITY = frozenset([
        'tran',  # transitive
        'intr',  # intransitive
    ])

    PERSONS = frozenset([
        '1per',  # 1st person
        '2per',  # 2nd person
        '3per',  # 3rd person
    ])

    TENSES = frozenset([
        'pres',  # present tense
        'past',  # past tense
        'futr',  # future tense
    ])

    MOODS = frozenset([
        'indc',  # indicative mood
        'impr',  # imperative mood
    ])

    VOICES = frozenset([
        'actv',  # active voice
        'pssv',  # passive voice
    ])

    INVOLVEMENT = frozenset([
        'incl',  # the speaker is included in the action
        'excl',  # the speaker is not included in the action
    ])

    # Set this to False (as a class attribute) to disable strict
    # grammeme type checking for tag.POS, tag.voice, etc. attributes.
    # Without type checks comparisons are about 2x faster.
    typed_grammemes = True

    # Tag format identifier
    # (compatible with https://github.com/kmike/russian-tagsets)
    # ----------------------------------------------------------
    FORMAT = 'opencorpora-int'

    # Helper attributes for inflection/declension routines
    # ----------------------------------------------------
    _NON_PRODUCTIVE_GRAMMEMES = set(['NUMR', 'NPRO', 'PRED', 'PREP',
                                     'CONJ', 'PRCL', 'INTJ', 'Apro'])
    _EXTRA_INCOMPATIBLE = {  # XXX: is it a good idea to have these rules?
        'plur': set(['GNdr']),
        # XXX: how to use rules from OpenCorpora?
        # (they have "lexeme/form" separation)
    }
    # The three attributes below are placeholders; _init_grammemes
    # replaces them with data built from the dictionary.
    _GRAMMEME_INDICES = collections.defaultdict(int)
    _GRAMMEME_INCOMPATIBLE = collections.defaultdict(set)
    KNOWN_GRAMMEMES = set()

    # Fixed results used by numeral_agreement_grammemes.
    _NUMERAL_AGREEMENT_GRAMMEMES = (
        set(['sing', 'nomn']),
        set(['sing', 'gent']),
        set(['plur', 'nomn']),
        set(['plur', 'gent']),
    )

    def __init__(self, tag):
        """
        Create a tag from a tag string, e.g. 'VERB,perf,tran plur,impr,excl'.

        ValueError is raised if the string contains grammemes that are
        not in KNOWN_GRAMMEMES; RuntimeError is raised if the class
        was not initialized with _init_grammemes.
        """
        self._str = tag

        # XXX: we lose information about which grammemes
        # belong to lexeme and which belong to form
        # (but this information seems useless for pymorphy2).

        # Hacks for better memory usage:
        # - store grammemes in a tuple and build a set only when needed;
        # - use byte strings for grammemes under Python 2.x;
        # - grammemes are interned.
        grammemes = tag.replace(' ', ',', 1).split(',')
        grammemes_tuple = tuple([intern(str(g)) for g in grammemes])

        self._assert_grammemes_are_known(set(grammemes_tuple))

        self._grammemes_tuple = grammemes_tuple
        self._POS = self._grammemes_tuple[0]
        self._grammemes_cache = None

    @property
    def grammemes(self):
        """ A frozenset with grammemes for this tag. """
        if self._grammemes_cache is None:
            self._grammemes_cache = frozenset(self._grammemes_tuple)
        return self._grammemes_cache

    # attributes for grammeme categories
    POS = _select_grammeme_from(PARTS_OF_SPEECH)
    animacy = _select_grammeme_from(ANIMACY)
    aspect = _select_grammeme_from(ASPECTS)
    case = _select_grammeme_from(CASES)
    gender = _select_grammeme_from(GENDERS)
    involvement = _select_grammeme_from(INVOLVEMENT)
    mood = _select_grammeme_from(MOODS)
    number = _select_grammeme_from(NUMBERS)
    person = _select_grammeme_from(PERSONS)
    tense = _select_grammeme_from(TENSES)
    transitivity = _select_grammeme_from(TRANSITIVITY)
    voice = _select_grammeme_from(VOICES)

    def __contains__(self, grammeme):
        """
        Check if a grammeme (or every grammeme of a set/frozenset)
        is in this tag. ValueError is raised for unknown grammemes.
        """

        # {'NOUN', 'sing'} in tag
        if isinstance(grammeme, (set, frozenset)):
            if grammeme <= self.grammemes:
                return True
            # validate only on a miss: grammemes of a tag are known already
            self._assert_grammemes_are_known(grammeme)
            return False

        # 'NOUN' in tag
        if grammeme in self.grammemes:
            return True
        else:
            if not self.grammeme_is_known(grammeme):
                raise ValueError("Grammeme is unknown: %s" % grammeme)
            return False

    # FIXME: __repr__ and __str__ always return unicode,
    # but they should return a byte string under Python 2.x.
    def __str__(self):
        return self._str

    def __repr__(self):
        return "OpencorporaTag('%s')" % self

    # Comparisons work on grammeme tuples. For operands that are not
    # tags NotImplemented is returned, so that Python can try the
    # reflected operation or fall back to its default behaviour
    # instead of failing with AttributeError on ``other._grammemes_tuple``.

    def __eq__(self, other):
        if not isinstance(other, OpencorporaTag):
            return NotImplemented
        return self._grammemes_tuple == other._grammemes_tuple

    def __ne__(self, other):
        if not isinstance(other, OpencorporaTag):
            return NotImplemented
        return self._grammemes_tuple != other._grammemes_tuple

    def __lt__(self, other):
        if not isinstance(other, OpencorporaTag):
            return NotImplemented
        return self._grammemes_tuple < other._grammemes_tuple

    def __gt__(self, other):
        if not isinstance(other, OpencorporaTag):
            return NotImplemented
        return self._grammemes_tuple > other._grammemes_tuple

    def __hash__(self):
        return hash(self._grammemes_tuple)

    def __len__(self):
        return len(self._grammemes_tuple)

    def __reduce__(self):
        # pickle support: a tag is re-created from its string
        return self.__class__, (self._str,), None

    def is_productive(self):
        """ Return True if the tag has no grammemes from _NON_PRODUCTIVE_GRAMMEMES. """
        return not self.grammemes & self._NON_PRODUCTIVE_GRAMMEMES

    def _is_unknown(self):
        """ Return True if the first grammeme of the tag is not a part of speech. """
        return self._POS not in self.PARTS_OF_SPEECH

    @classmethod
    def grammeme_is_known(cls, grammeme):
        """
        Return True if ``grammeme`` is in KNOWN_GRAMMEMES.
        RuntimeError is raised if the class is not initialized.
        """
        cls._assert_grammemes_initialized()
        return grammeme in cls.KNOWN_GRAMMEMES

    @classmethod
    def _assert_grammemes_are_known(cls, grammemes):
        """
        Raise ValueError if the ``grammemes`` set has unknown grammemes
        (RuntimeError if the class is not initialized).
        """
        if not grammemes <= cls.KNOWN_GRAMMEMES:
            cls._assert_grammemes_initialized()
            unknown = grammemes - cls.KNOWN_GRAMMEMES
            # sorted, so that the message is deterministic
            unknown_repr = ", ".join(["'%s'" % g for g in sorted(unknown)])
            raise ValueError("Grammemes are unknown: {%s}" % unknown_repr)

    @classmethod
    def _assert_grammemes_initialized(cls):
        """ Raise RuntimeError if KNOWN_GRAMMEMES is empty. """
        if not cls.KNOWN_GRAMMEMES:
            msg = "The class was not properly initialized."
            raise RuntimeError(msg)

    def updated_grammemes(self, required):
        """
        Return a new set of grammemes with ``required`` grammemes added
        and incompatible grammemes removed.

        ``required`` must be a set or a frozenset; ValueError is raised
        if it contains an unknown grammeme.
        """
        new_grammemes = self.grammemes | required
        for grammeme in required:
            if not self.grammeme_is_known(grammeme):
                raise ValueError("Unknown grammeme: %s" % grammeme)
            new_grammemes -= self._GRAMMEME_INCOMPATIBLE[grammeme]
        return new_grammemes

    @classmethod
    def _init_grammemes(cls, dict_grammemes):
        """
        Initialize various class attributes with grammeme
        information obtained from XML dictionary.

        ``dict_grammemes`` is a list of tuples::

            [
                (name, parent, alias, description),
                ...
            ]

        KNOWN_GRAMMEMES, _GRAMMEME_INDICES and _GRAMMEME_INCOMPATIBLE
        are rebuilt from scratch and rebound on ``cls``; containers
        shared with other classes are not modified in place.
        """
        gr = dict((name, parent) for (name, parent, alias, description) in dict_grammemes)
        cls.KNOWN_GRAMMEMES = set(gr.keys())

        # figure out parents & children
        children = collections.defaultdict(set)
        for name, parent, alias, description in dict_grammemes:
            if parent:
                children[parent].add(name)
            if gr.get(parent, None):  # parent's parent
                children[gr[parent]].add(name)

        # expand EXTRA_INCOMPATIBLE (into a local copy, in order not to
        # accumulate data from earlier calls in the class-level sets)
        extra_incompatible = {}
        for grammeme, g_set in cls._EXTRA_INCOMPATIBLE.items():
            expanded = set(g_set)
            for g in g_set:
                expanded.update(children[g])
            extra_incompatible[grammeme] = expanded

        # fill GRAMMEME_INDICES and GRAMMEME_INCOMPATIBLE
        indices = collections.defaultdict(int)
        incompatible_map = collections.defaultdict(set)
        for index, (name, parent, alias, description) in enumerate(dict_grammemes):
            indices[name] = index
            # a grammeme is incompatible with its siblings
            # and with grammemes from the extra rules
            incompatible = extra_incompatible.get(name, set())
            incompatible = (incompatible | children[parent]) - set([name])

            incompatible_map[name] = frozenset(incompatible)

        cls._GRAMMEME_INDICES = indices
        cls._GRAMMEME_INCOMPATIBLE = incompatible_map

    # XXX: do we still need these methods?
    @classmethod
    def _from_internal_tag(cls, tag):
        """ Return tag string given internal tag string """
        return tag

    @classmethod
    def _from_internal_grammeme(cls, grammeme):
        """ Return grammeme given internal grammeme """
        return grammeme

    def numeral_agreement_grammemes(self, num):
        """
        Return a set with number and case grammemes selected
        for the integer ``num`` (presumably the grammemes a word
        with this tag should have in order to agree with ``num``).

        An empty set is returned if tag's POS is not NOUN, ADJF or PRTF.
        The result is always a new set that is safe to modify.
        """
        # 0: num ends with 1 (but not with 11);
        # 1: num ends with 2, 3 or 4 (but not with 12, 13 or 14);
        # 2: everything else.
        if (num % 10 == 1) and (num % 100 != 11):
            index = 0
        elif (num % 10 >= 2) and (num % 10 <= 4) and (num % 100 < 10 or num % 100 >= 20):
            index = 1
        else:
            index = 2

        if self.POS not in ('NOUN', 'ADJF', 'PRTF'):
            return set([])

        # NOTE(review): for a NOUN tag without a case grammeme self.case
        # is None and None ends up in the result - confirm it is intended.
        if self.POS == 'NOUN' and self.case != 'nomn':
            if index == 0:
                grammemes = set(['sing', self.case])
            elif self.case == 'accs' and index == 2:
                grammemes = self._NUMERAL_AGREEMENT_GRAMMEMES[3]
            else:
                grammemes = set(['plur', self.case])
        elif index == 0:
            grammemes = self._NUMERAL_AGREEMENT_GRAMMEMES[0]
        elif self.POS == 'NOUN' and index == 1:
            grammemes = self._NUMERAL_AGREEMENT_GRAMMEMES[1]
        elif self.POS in ('ADJF', 'PRTF') and self.gender == 'femn' and index == 1:
            grammemes = self._NUMERAL_AGREEMENT_GRAMMEMES[2]
        else:
            grammemes = self._NUMERAL_AGREEMENT_GRAMMEMES[3]

        # return a copy: sets from _NUMERAL_AGREEMENT_GRAMMEMES are shared
        # by all tags and must not be exposed to modification by callers
        return set(grammemes)


class CyrillicOpencorporaTag(OpencorporaTag):
    """
    Tag class that uses Cyrillic tag names.

    .. warning::

        This class is experimental and incomplete, do not use
        it because it may be removed in future!
    """

    FORMAT = 'opencorpora-ext'

    # internal grammeme name -> alias; replaced by _init_alias_map
    _GRAMMEME_ALIAS_MAP = dict()

    @classmethod
    def _from_internal_tag(cls, tag):
        """
        Return tag string given internal tag string: each grammeme
        that has a non-empty alias is replaced with the alias.
        """
        alias_map = cls._GRAMMEME_ALIAS_MAP
        # Grammemes are translated one by one (the space and the commas
        # of the tag are preserved); a substring replacement over the whole
        # tag could change a part of a longer grammeme or an alias
        # that was already substituted.
        return ' '.join(
            ','.join(alias_map.get(g) or g for g in part.split(','))
            for part in tag.split(' ')
        )

    @classmethod
    def _from_internal_grammeme(cls, grammeme):
        """
        Return the alias for an internal grammeme; the grammeme itself
        is returned if it has no alias or the alias is empty
        (the same rule as in _from_internal_tag).
        """
        return cls._GRAMMEME_ALIAS_MAP.get(grammeme) or grammeme

    @classmethod
    def _init_grammemes(cls, dict_grammemes):
        """
        Initialize various class attributes with grammeme
        information obtained from XML dictionary.

        ``dict_grammemes`` is a list of
        ``(name, parent, alias, description)`` tuples.
        """
        cls._init_alias_map(dict_grammemes)

        # Give this class its own containers before delegating, so that
        # the parent implementation does not write into dictionaries
        # that belong to the parent class.
        cls._GRAMMEME_INDICES = collections.defaultdict(int)
        cls._GRAMMEME_INCOMPATIBLE = collections.defaultdict(set)
        super(CyrillicOpencorporaTag, cls)._init_grammemes(dict_grammemes)

        translate = cls._from_internal_grammeme

        # re-key the data built by the parent class using aliases
        indices = collections.defaultdict(int)
        for name, idx in cls._GRAMMEME_INDICES.items():
            indices[translate(name)] = idx
        cls._GRAMMEME_INDICES = indices

        incompatible = collections.defaultdict(set)
        for name, value in cls._GRAMMEME_INCOMPATIBLE.items():
            incompatible[translate(name)] = set([translate(gr) for gr in value])
        cls._GRAMMEME_INCOMPATIBLE = incompatible

        cls._NON_PRODUCTIVE_GRAMMEMES = set([
            translate(gr) for gr in cls._NON_PRODUCTIVE_GRAMMEMES
        ])

    @classmethod
    def _init_alias_map(cls, dict_grammemes):
        """
        Rebuild _GRAMMEME_ALIAS_MAP from ``dict_grammemes``.

        A fresh dict is bound on ``cls``, so aliases from earlier calls
        do not survive and a dict shared with other classes is not changed.
        """
        cls._GRAMMEME_ALIAS_MAP = dict(
            (name, alias) for name, parent, alias, description in dict_grammemes
        )


# Tag classes available by their format identifier
# (the ``FORMAT`` class attribute).
registry = {}

for tag_type in (CyrillicOpencorporaTag, OpencorporaTag):
    registry[tag_type.FORMAT] = tag_type
# Source code for pymorphy2.tagset
#
# Project Versions
#
# On this page
#
# Browse
#
# Source code for pymorphy2.tagset
#
# Project Versions
#
# RTD Search
#
# On this page
#
# Quick search
#
# Browse