# -*- coding: utf-8 -*-
"""
Analogy analyzer units
----------------------
This module provides analyzer units that analyze unknown words by looking
at how similar known words are analyzed.
"""
from __future__ import absolute_import, unicode_literals, division
import operator
from pymorphy2.units.base import AnalogyAnalizerUnit
from pymorphy2.units.by_lookup import DictionaryAnalyzer
from pymorphy2.units.utils import (
add_parse_if_not_seen,
add_tag_if_not_seen,
without_fixed_prefix,
with_prefix
)
from pymorphy2.utils import word_splits
from pymorphy2.dawg import PrefixMatcher
_cnt_getter = operator.itemgetter(3)
class _PrefixAnalyzer(AnalogyAnalizerUnit):
    """
    Shared base for prefix-based analogy units.

    Both helpers are two-step coroutines: they first yield the form with
    the fixed prefix stripped, receive the caller's answer back via
    ``send()``, and finally yield that answer with the prefix restored.
    """

    def normalizer(self, form, this_method):
        # ``this_method`` is the (unit, prefix) pair from the methods stack.
        prefix = this_method[1]
        stripped_form = without_fixed_prefix(form, len(prefix))
        normal_form = yield stripped_form
        yield with_prefix(normal_form, prefix)

    def lexemizer(self, form, this_method):
        prefix = this_method[1]
        stripped_form = without_fixed_prefix(form, len(prefix))
        lexeme = yield stripped_form
        yield [with_prefix(item, prefix) for item in lexeme]
class KnownPrefixAnalyzer(_PrefixAnalyzer):
    """
    Parse the word by checking if it starts with a known prefix
    and parsing the remainder.

    Example: псевдокошка -> (псевдо) + кошка.
    """
    _repr_skip_value_params = ['known_prefixes']

    def __init__(self, known_prefixes, score_multiplier=0.75,
                 min_remainder_length=3):
        # ``min_remainder_length`` guards against degenerate splits where
        # almost the whole word is consumed by the prefix.
        self.known_prefixes = known_prefixes
        self.score_multiplier = score_multiplier
        self.min_remainder_length = min_remainder_length

    def init(self, morph):
        super(KnownPrefixAnalyzer, self).init(morph)
        self.get_prefixes = PrefixMatcher(self.known_prefixes).prefixes

    def parse(self, word, word_lower, seen_parses):
        found = []
        for prefix, remainder in self.possible_splits(word_lower):
            method = (self, prefix)
            for fixed_word, tag, normal_form, score, methods_stack \
                    in self.morph.parse(remainder):
                # analogy only makes sense for open (productive) classes
                if not tag.is_productive():
                    continue
                candidate = (
                    prefix + fixed_word,
                    tag,
                    prefix + normal_form,
                    score * self.score_multiplier,
                    methods_stack + (method,),
                )
                add_parse_if_not_seen(candidate, found, seen_parses)
        return found

    def tag(self, word, word_lower, seen_tags):
        found = []
        for _prefix, remainder in self.possible_splits(word_lower):
            productive = (t for t in self.morph.tag(remainder)
                          if t.is_productive())
            for tag in productive:
                add_tag_if_not_seen(tag, found, seen_tags)
        return found

    def possible_splits(self, word):
        # try longer prefixes first
        for prefix in sorted(self.get_prefixes(word), key=len, reverse=True):
            remainder = word[len(prefix):]
            if len(remainder) >= self.min_remainder_length:
                yield prefix, remainder
class UnknownPrefixAnalyzer(_PrefixAnalyzer):
    """
    Parse the word by parsing only the word suffix
    (with restrictions on prefix & suffix lengths).

    Example: байткод -> (байт) + код
    """
    def __init__(self, score_multiplier=0.5):
        self.score_multiplier = score_multiplier

    def init(self, morph):
        # FIX: the original called ``super(AnalogyAnalizerUnit, self)``,
        # which skips ``AnalogyAnalizerUnit.init`` in the MRO entirely.
        # Pass this class itself, as every sibling unit does.
        super(UnknownPrefixAnalyzer, self).init(morph)
        self.dict_analyzer = DictionaryAnalyzer()
        self.dict_analyzer.init(morph)

    def parse(self, word, word_lower, seen_parses):
        """Return parses for all valid (unknown-prefix, remainder) splits."""
        result = []
        for prefix, unprefixed_word in word_splits(word_lower):
            method = (self, prefix)
            parses = self.dict_analyzer.parse(unprefixed_word,
                                              unprefixed_word,
                                              seen_parses)
            for fixed_word, tag, normal_form, score, methods_stack in parses:
                # only productive tags can be extrapolated by analogy
                if not tag.is_productive():
                    continue
                parse = (
                    prefix + fixed_word,
                    tag,
                    prefix + normal_form,
                    score * self.score_multiplier,
                    methods_stack + (method,),
                )
                add_parse_if_not_seen(parse, result, seen_parses)
        return result

    def tag(self, word, word_lower, seen_tags):
        """Return productive tags for all valid word splits."""
        result = []
        for _, unprefixed_word in word_splits(word_lower):
            tags = self.dict_analyzer.tag(unprefixed_word,
                                          unprefixed_word,
                                          seen_tags)
            for tag in tags:
                if not tag.is_productive():
                    continue
                add_tag_if_not_seen(tag, result, seen_tags)
        return result
class KnownSuffixAnalyzer(AnalogyAnalizerUnit):
    """
    Parse the word by checking how the words with similar suffixes
    are parsed.

    Example: бутявкать -> ...вкать
    """

    class FakeDictionary(DictionaryAnalyzer):
        """ This is just a DictionaryAnalyzer with different __repr__ """
        pass

    def __init__(self, score_multiplier=0.5, min_word_length=4):
        # words shorter than this are not worth a suffix-based guess
        self.min_word_length = min_word_length
        self.score_multiplier = score_multiplier

    def init(self, morph):
        super(KnownSuffixAnalyzer, self).init(morph)
        # (prefix_id, prefix) pairs in reversed enumeration order,
        # so longer/later paradigm prefixes are tried first
        self._paradigm_prefixes = list(reversed(list(enumerate(self.dict.paradigm_prefixes))))
        # candidate suffix lengths, longest first
        self._prediction_splits = list(reversed(range(1, self._max_suffix_length()+1)))
        # a DictionaryAnalyzer clone used only on the methods stack
        self.fake_dict = self.FakeDictionary()
        self.fake_dict.init(morph)

    def _max_suffix_length(self):
        # dictionary metadata location changed between dict format versions
        try:
            return self.dict.meta['compile_options']['max_suffix_length']
        except KeyError:
            # dicts v2.4 support
            return self.dict.meta['prediction_options']['max_suffix_length']

    def parse(self, word, word_lower, seen_parses):
        """Return scored parses built from known words with similar suffixes.

        Scores are occurrence counts normalized per paradigm prefix and
        multiplied by ``self.score_multiplier``; the result is sorted by
        score, descending.
        """
        result = []
        if len(word) < self.min_word_length:
            return result

        # smoothing; XXX: isn't max_cnt better?
        # or maybe use a proper discounting?
        # one counter per paradigm prefix, used as a score denominator
        total_counts = [1] * len(self._paradigm_prefixes)

        for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word_lower):
            for i in self._prediction_splits:
                # XXX: this should be counted once, not for each prefix
                word_start, word_end = word_lower[:-i], word_lower[-i:]
                # suffix lookup tolerant to е/ё-style character substitutes
                para_data = suffixes_dawg.similar_items(word_end, self.morph.char_substitutes)

                for fixed_suffix, parses in para_data:
                    fixed_word = word_start + fixed_suffix

                    for cnt, para_id, idx in parses:
                        tag = self.dict.build_tag_info(para_id, idx)

                        # skip non-productive tags
                        if not tag.is_productive():
                            continue
                        total_counts[prefix_id] += cnt

                        # avoid duplicate parses
                        reduced_parse = fixed_word, tag, para_id
                        if reduced_parse in seen_parses:
                            continue
                        seen_parses.add(reduced_parse)

                        # ok, build the result
                        normal_form = self.dict.build_normal_form(para_id, idx, fixed_word)
                        methods = (
                            (self.fake_dict, fixed_word, para_id, idx),
                            (self, fixed_suffix),
                        )
                        parse = (cnt, fixed_word, tag, normal_form, prefix_id, methods)
                        result.append(parse)

                # stop shortening the suffix once something matched
                if total_counts[prefix_id] > 1:
                    break

        # convert raw counts to normalized scores
        result = [
            (fixed_word, tag, normal_form, cnt/total_counts[prefix_id] * self.score_multiplier, methods_stack)
            for (cnt, fixed_word, tag, normal_form, prefix_id, methods_stack) in result
        ]
        result.sort(key=_cnt_getter, reverse=True)
        return result

    def tag(self, word, word_lower, seen_tags):
        """Return productive tags of known words with similar suffixes,
        most frequent first."""
        # XXX: the result order may be different from
        # ``self.parse(...)``.

        result = []
        if len(word) < self.min_word_length:
            return result

        for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word_lower):
            for i in self._prediction_splits:

                # XXX: end should be counted once, not for each prefix
                end = word_lower[-i:]
                para_data = suffixes_dawg.similar_items(end, self.morph.char_substitutes)
                found = False

                for fixed_suffix, parses in para_data:
                    for cnt, para_id, idx in parses:

                        tag = self.dict.build_tag_info(para_id, idx)

                        if not tag.is_productive():
                            continue

                        found = True
                        if tag in seen_tags:
                            continue
                        seen_tags.add(tag)
                        result.append((cnt, tag))

                # stop shortening the suffix once a productive tag matched
                if found:
                    break

        result.sort(reverse=True)
        return [tag for cnt, tag in result]

    def _possible_prefixes(self, word):
        # yield (prefix_id, prefix, dawg) for each paradigm prefix of ``word``
        for prefix_id, prefix in self._paradigm_prefixes:
            if not word.startswith(prefix):
                continue
            suffixes_dawg = self.dict.prediction_suffixes_dawgs[prefix_id]
            yield prefix_id, prefix, suffixes_dawg