# -*- coding: utf-8 -*-
"""
Analyzer units for unknown words with hyphens
---------------------------------------------
"""

from __future__ import absolute_import, unicode_literals, division

from pymorphy2.units.base import BaseAnalyzerUnit, AnalogyAnalizerUnit
from pymorphy2.units.utils import (add_parse_if_not_seen, add_tag_if_not_seen,
                                   with_suffix, without_fixed_suffix,
                                   with_prefix, without_fixed_prefix,
                                   replace_methods_stack)


class HyphenSeparatedParticleAnalyzer(AnalogyAnalizerUnit):
"""
Parse the word by analyzing it without
a particle after a hyphen.
Example: смотри-ка -> смотри + "-ка".
.. note::
This analyzer doesn't remove particles from the result
so for normalization you may need to handle
particles at tokenization level.
"""
terminal = True
ESTIMATE_DECAY = 0.9
# XXX: maybe the code can be made faster by compiling this list to a DAWG?
PARTICLES_AFTER_HYPHEN = [
"-то", "-ка", "-таки", "-де", "-тко", "-тка", "-с", "-ста"
]

    def parse(self, word, word_lower, seen_parses):
result = []
for unsuffixed_word, particle in self.possible_splits(word_lower):
method = (self, particle)
for fixed_word, tag, normal_form, estimate, methods_stack in self.morph.parse(unsuffixed_word):
parse = (
fixed_word+particle,
tag,
normal_form+particle,
estimate*self.ESTIMATE_DECAY,
methods_stack+(method,)
)
add_parse_if_not_seen(parse, result, seen_parses)
            # If a word ends with one of the particles,
            # it can't end with another one.
            break

        return result

    def tag(self, word, word_lower, seen_tags):
result = []
for unsuffixed_word, particle in self.possible_splits(word_lower):
result.extend(self.morph.tag(unsuffixed_word))
            # If a word ends with one of the particles,
            # it can't end with another one.
            break

        return result

    def possible_splits(self, word):
if '-' not in word:
return
for particle in self.PARTICLES_AFTER_HYPHEN:
if not word.endswith(particle):
continue
unsuffixed_word = word[:-len(particle)]
if not unsuffixed_word:
continue
yield unsuffixed_word, particle
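
    # For example, possible_splits("смотри-ка") yields a single split
    # ('смотри', '-ка'), while a bare particle such as "-ка" yields
    # nothing, because the word without the particle would be empty.
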
def normalizer(self, form, this_method):
particle = this_method[1]
normal_form = yield without_fixed_suffix(form, len(particle))
yield with_suffix(normal_form, particle)

    def lexemizer(self, form, this_method):
particle = this_method[1]
lexeme = yield without_fixed_suffix(form, len(particle))
yield [with_suffix(f, particle) for f in lexeme]
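
# A rough usage sketch (assumes a standard MorphAnalyzer with dictionaries
# installed; the exact parse list depends on the dictionary version):
#
#   >>> import pymorphy2
#   >>> morph = pymorphy2.MorphAnalyzer()
#   >>> p = morph.parse('смотри-ка')[0]
#   >>> p.word, p.normal_form    # the particle is kept on the normal form
#   ('смотри-ка', 'смотреть-ка')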


class HyphenAdverbAnalyzer(BaseAnalyzerUnit):
"""
Detect adverbs that starts with "по-".
Example: по-западному
"""
terminal = True
ESTIMATE_DECAY = 0.7

    def __init__(self, morph):
super(HyphenAdverbAnalyzer, self).__init__(morph)
self._tag = self.morph.TagClass('ADVB')

    def parse(self, word, word_lower, seen_parses):
if not self.should_parse(word_lower):
return []
parse = (
word_lower, self._tag, word_lower,
self.ESTIMATE_DECAY,
((self, word),)
)
seen_parses.add(parse)
return [parse]

    def tag(self, word, word_lower, seen_tags):
if not self.should_parse(word_lower) or self._tag in seen_tags:
return []
seen_tags.add(self._tag)
return [self._tag]

    def should_parse(self, word):
        if len(word) < 5 or not word.startswith('по-'):
            return False

        tags = self.morph.tag(word[3:])
        # ``set(...) in tag`` is a subset check: it is true when the tag
        # contains all three grammemes, i.e. when the remainder parses as
        # a singular dative adjective ("по-западному" -> "западному").
        return any(set(['ADJF', 'sing', 'datv']) in tag for tag in tags)

    def normalized(self, form):
        return form

    def get_lexeme(self, form):
        return [form]
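
# A rough usage sketch (assumes dictionaries are installed; the exact
# repr may vary across pymorphy2 versions):
#
#   >>> import pymorphy2
#   >>> morph = pymorphy2.MorphAnalyzer()
#   >>> morph.tag('по-западному')    # handled by HyphenAdverbAnalyzer
#   [OpencorporaTag('ADVB')]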


class HyphenatedWordsAnalyzer(BaseAnalyzerUnit):
"""
Parse the word by parsing its hyphen-separated parts.
Examples:
* интернет-магазин -> "интернет-" + магазин
* человек-гора -> человек + гора
"""
terminal = True
ESTIMATE_DECAY = 0.75
    # Grammemes whose variants are close enough to be considered the same
    # when aligning the lexeme forms of the two word parts.
    _CONSIDER_THE_SAME = {
        'V-oy': 'V-ey',
        'gen1': 'gent',
        'loc1': 'loct',
        # 'acc1': 'accs',
    }  # TODO: add more grammemes

    def __init__(self, morph):
super(HyphenatedWordsAnalyzer, self).__init__(morph)
Tag = morph.TagClass
self._FEATURE_GRAMMEMES = (Tag.PARTS_OF_SPEECH | Tag.NUMBERS |
Tag.CASES | Tag.PERSONS | Tag.TENSES)

    def parse(self, word, word_lower, seen_parses):
if not self._should_parse(word_lower):
return []
left, right = word_lower.split('-', 1)
left_parses = self.morph.parse(left)
right_parses = self.morph.parse(right)
result = self._parse_as_variable_both(left_parses, right_parses, seen_parses)
        # We copy `seen_parses` to preserve parses even if similar parses
        # were observed at the previous step (they may have different lexemes).
_seen = seen_parses.copy()
result.extend(self._parse_as_fixed_left(right_parses, _seen, left))
seen_parses.update(_seen)
return result

    def _parse_as_fixed_left(self, right_parses, seen, left):
        """
        Assume that the left part is an uninflected prefix.

        Examples: интернет-магазин, воздушно-капельный
        """
result = []
for fixed_word, tag, normal_form, estimate, right_methods in right_parses:
if tag._is_unknown():
continue
new_methods_stack = ((self, left, right_methods),)
parse = (
'-'.join((left, fixed_word)),
tag,
'-'.join((left, normal_form)),
estimate * self.ESTIMATE_DECAY,
new_methods_stack
)
result.append(parse)
# add_parse_if_not_seen(parse, result, seen_left)
return result
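
    # For example, for "интернет-магазин" the left part "интернет" is kept
    # as-is, and every known parse of "магазин" (e.g. NOUN,inan,masc sing,nomn)
    # becomes a parse of the whole word with the same tag and a decayed
    # estimate.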

    def _parse_as_variable_both(self, left_parses, right_parses, seen):
        """
        If the left and right parts can be parsed the same way,
        it may be the case that both parts should be inflected.

        Examples: человек-гора, команд-участниц, компания-производитель
        """
result = []
right_features = [self._similarity_features(p[1]) for p in right_parses]
# FIXME: quadratic algorithm
for left_parse in left_parses:
left_tag = left_parse[1]
if left_tag._is_unknown():
continue
left_feat = self._similarity_features(left_tag)
for parse_index, right_parse in enumerate(right_parses):
right_feat = right_features[parse_index]
if left_feat != right_feat:
continue
left_methods = left_parse[4]
right_methods = right_parse[4]
new_methods_stack = ((self, left_methods, right_methods),)
# tag
parse = (
'-'.join((left_parse[0], right_parse[0])), # word
left_tag,
'-'.join((left_parse[2], right_parse[2])), # normal form
left_parse[3] * self.ESTIMATE_DECAY,
new_methods_stack
)
result.append(parse)
# add_parse_if_not_seen(parse, result, seen_right)
return result
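
    # For example, "человек" and "гора" share the feature set
    # {NOUN, sing, nomn} (animacy and gender are not compared), so
    # "человек-гора" gets parses with both parts inflected; when the
    # feature sets of the two parts never match, this step yields nothing.
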
def _similarity_features(self, tag):
""" :type tag: pymorphy2.tagset.OpencorporaTag """
return replace_grammemes(
tag.grammemes & self._FEATURE_GRAMMEMES,
{'gen1': 'gent', 'loc1': 'loct'}
)
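
    # For example, a tag with grammemes {NOUN, anim, masc, sing, gen1}
    # yields the feature set {NOUN, sing, gent}: animacy and gender are
    # dropped, and the variant genitive 'gen1' is normalized to 'gent'.
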
def tag(self, word, word_lower, seen_tags):
result = []
# TODO: do not use self.parse
for p in self.parse(word, word_lower, set()):
add_tag_if_not_seen(p[1], result, seen_tags)
return result

    def _should_parse(self, word):
if '-' not in word:
return False
word_stripped = word.strip('-')
if word_stripped != word:
            # don't handle words that start or end with a hyphen
return False
if word_stripped.count('-') != 1:
# require exactly 1 hyphen, in the middle of the word
return False
if self.dict.prediction_prefixes.prefixes(word):
# such words should really be parsed by KnownPrefixAnalyzer
return False
return True
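
    # For example, "человек-гора" passes all of the checks above, while
    # "-гора" and "гора-" (leading/trailing hyphen), "а-б-в" (more than one
    # hyphen) and words starting with a known prediction prefix (these are
    # left to KnownPrefixAnalyzer) are rejected.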

    def normalized(self, form):
        return next(self._iter_lexeme(form))

    def get_lexeme(self, form):
        return list(self._iter_lexeme(form))

    def _iter_lexeme(self, form):
methods_stack = form[4]
assert len(methods_stack) == 1
this_method, left_methods, right_methods = methods_stack[0]
assert this_method is self
if self._fixed_left_method_was_used(left_methods):
# Form is obtained by parsing right part,
# assuming that left part is an uninflected prefix.
# Lexeme can be calculated from the right part in this case:
prefix = left_methods + '-'
right_form = without_fixed_prefix(
replace_methods_stack(form, right_methods),
len(prefix)
)
base_analyzer = right_methods[-1][0]
lexeme = base_analyzer.get_lexeme(right_form)
return (
replace_methods_stack(
with_prefix(f, prefix),
((this_method, left_methods, f[4]),)
)
for f in lexeme
)
else:
# Form is obtained by parsing both parts.
# Compute lexemes for left and right parts,
# then merge them.
left_form = self._without_right_part(
replace_methods_stack(form, left_methods)
)
right_form = self._without_left_part(
replace_methods_stack(form, right_methods)
)
left_lexeme = left_methods[-1][0].get_lexeme(left_form)
right_lexeme = right_methods[-1][0].get_lexeme(right_form)
return self._merge_lexemes(left_lexeme, right_lexeme)

    def _merge_lexemes(self, left_lexeme, right_lexeme):
for left, right in self._align_lexeme_forms(left_lexeme, right_lexeme):
word = '-'.join((left[0], right[0]))
tag = left[1]
normal_form = '-'.join((left[2], right[2]))
estimate = (left[3] + right[3]) / 2
method_stack = ((self, left[4], right[4]), )
yield (word, tag, normal_form, estimate, method_stack)

    def _align_lexeme_forms(self, left_lexeme, right_lexeme):
# FIXME: quadratic algorithm
for right in right_lexeme:
min_dist, closest = 1e6, None
gr_right = replace_grammemes(right[1].grammemes, self._CONSIDER_THE_SAME)
for left in left_lexeme:
gr_left = replace_grammemes(left[1].grammemes, self._CONSIDER_THE_SAME)
dist = len(gr_left ^ gr_right)
if dist < min_dist:
min_dist = dist
closest = left
yield closest, right
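
    # Each right-lexeme form is thus paired with the left-lexeme form whose
    # grammemes differ the least (smallest symmetric difference); e.g. a
    # plural-genitive form of the right part picks the plural-genitive form
    # of the left part when one exists.
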
@classmethod
def _without_right_part(cls, form):
word, tag, normal_form, estimate, methods_stack = form
return (word[:word.index('-')], tag, normal_form[:normal_form.index('-')],
estimate, methods_stack)

    @classmethod
def _without_left_part(cls, form):
word, tag, normal_form, estimate, methods_stack = form
return (word[word.index('-')+1:], tag, normal_form[normal_form.index('-')+1:],
estimate, methods_stack)

    @classmethod
def _fixed_left_method_was_used(cls, left_methods):
return not isinstance(left_methods, tuple)
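
# A rough usage sketch (assumes dictionaries are installed; scores and
# parse order depend on the dictionary version):
#
#   >>> import pymorphy2
#   >>> morph = pymorphy2.MorphAnalyzer()
#   >>> morph.parse('человек-гора')[0].normal_form   # both parts normalized
#   'человек-гора'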


def replace_grammemes(grammemes, replaces):
grammemes = set(grammemes)
for gr, replace in replaces.items():
if gr in grammemes:
grammemes.remove(gr)
grammemes.add(replace)
return grammemes
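
# For example:
#   replace_grammemes({'NOUN', 'sing', 'gen1'}, {'gen1': 'gent'})
#   returns {'NOUN', 'sing', 'gent'}.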