""" Pyphen ====== Pure Python module to hyphenate text, inspired by Ruby's Text::Hyphen. """ import re from importlib import resources from pathlib import Path VERSION = __version__ = '0.17.2' __all__ = ('LANGUAGES', 'Pyphen', 'language_fallback') # cache of per-file HyphDict objects hdcache = {} # precompile some stuff parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub parse = re.compile(r'(\d?)(\D?)').findall ignored = ( '%', '#', 'LEFTHYPHENMIN', 'RIGHTHYPHENMIN', 'COMPOUNDLEFTHYPHENMIN', 'COMPOUNDRIGHTHYPHENMIN') #: Dict of languages including codes as keys and dictionary Path as values. LANGUAGES = {} try: dictionaries = resources.files('pyphen.dictionaries') except TypeError: dictionaries = Path(__file__).parent / 'dictionaries' for path in sorted(dictionaries.iterdir()): if path.suffix == '.dic': name = path.name[5:-4] LANGUAGES[name] = path short_name = name.split('_')[0] if short_name not in LANGUAGES: LANGUAGES[short_name] = path LANGUAGES_LOWERCASE = {name.lower(): name for name in LANGUAGES} def language_fallback(language): """Get a fallback language available in our dictionaries. http://www.unicode.org/reports/tr35/#Locale_Inheritance We use the normal truncation inheritance. This function needs aliases including scripts for languages with multiple regions available. """ parts = language.replace('-', '_').lower().split('_') while parts: language = '_'.join(parts) if language in LANGUAGES_LOWERCASE: return LANGUAGES_LOWERCASE[language] parts.pop() class AlternativeParser: """Parser of nonstandard hyphen pattern alternative. The instance returns a special int with data about the current position in the pattern when called with an odd value. """ def __init__(self, pattern, alternative): alternative = alternative.split(',') self.change = alternative[0] self.index = int(alternative[1]) self.cut = int(alternative[2]) if pattern.startswith('.'): self.index += 1 def __call__(self, value): self.index -= 1 value = int(value) if value & 1: return DataInt(value, (self.change, self.index, self.cut)) else: return value class DataInt(int): """``int`` with some other data can be stuck to in a ``data`` attribute.""" def __new__(cls, value, data=None, reference=None): """Create a new ``DataInt``. Call with ``reference=dataint_object`` to use the data from another ``DataInt``. """ obj = int.__new__(cls, value) if reference and isinstance(reference, DataInt): obj.data = reference.data else: obj.data = data return obj class HyphDict: """Hyphenation patterns.""" def __init__(self, path): """Read a ``hyph_*.dic`` and parse its patterns. :param path: Path of hyph_*.dic to read """ self.patterns = {} # see "man 4 hunspell", iscii-devanagari is not supported by python with path.open('rb') as fd: encoding = fd.readline().decode() if encoding.lower() == 'microsoft-cp1251': encoding = 'cp1251' for pattern in path.read_text(encoding).split('\n')[1:]: pattern = pattern.strip() if not pattern or pattern.startswith(ignored): continue # replace ^^hh with the real character pattern = parse_hex( lambda match: chr(int(match.group(1), 16)), pattern) # read nonstandard hyphen alternatives if '/' in pattern and '=' in pattern: pattern, alternative = pattern.split('/', 1) factory = AlternativeParser(pattern, alternative) else: factory = int tags, values = zip(*[ (string, factory(i or '0')) for i, string in parse(pattern)]) # if only zeros, skip this pattern if max(values) == 0: continue # chop zeros from beginning and end, and store start offset start, end = 0, len(values) while not values[start]: start += 1 while not values[end - 1]: end -= 1 self.patterns[''.join(tags)] = start, values[start:end] self.cache = {} self.maxlen = max(len(key) for key in self.patterns) def positions(self, word): """Get a list of positions where the word can be hyphenated. :param word: unicode string of the word to hyphenate E.g. for the dutch word 'lettergrepen' this method returns ``[3, 6, 9]``. Each position is a ``DataInt`` with a data attribute. If the data attribute is not ``None``, it contains a tuple with information about nonstandard hyphenation at that point: ``(change, index, cut)``. change a string like ``'ff=f'``, that describes how hyphenation should take place. index where to substitute the change, counting from the current point cut how many characters to remove while substituting the nonstandard hyphenation """ word = word.lower() points = self.cache.get(word) if points is None: pointed_word = f'.{word}.' references = [0] * (len(pointed_word) + 1) for i in range(len(pointed_word) - 1): stop = min(i + self.maxlen, len(pointed_word)) + 1 for j in range(i + 1, stop): pattern = self.patterns.get(pointed_word[i:j]) if not pattern: continue offset, values = pattern slice_ = slice(i + offset, i + offset + len(values)) references[slice_] = map(max, values, references[slice_]) self.cache[word] = points = [ DataInt(i - 1, reference=reference) for i, reference in enumerate(references) if reference % 2] return points class Pyphen: """Hyphenation class, with methods to hyphenate strings in various ways.""" def __init__(self, filename=None, lang=None, left=2, right=2, cache=True): """Create an hyphenation instance for given lang or filename. :param filename: filename or Path of hyph_*.dic to read :param lang: lang of the included dict to use if no filename is given :param left: minimum number of characters of the first syllabe :param right: minimum number of characters of the last syllabe :param cache: if ``True``, use cached copy of the hyphenation patterns """ self.left, self.right = left, right path = Path(filename) if filename else LANGUAGES[language_fallback(lang)] if not cache or path not in hdcache: hdcache[path] = HyphDict(path) self.hd = hdcache[path] def positions(self, word): """Get a list of positions where the word can be hyphenated. :param word: unicode string of the word to hyphenate See also ``HyphDict.positions``. The points that are too far to the left or right are removed. """ right = len(word) - self.right return [i for i in self.hd.positions(word) if self.left <= i <= right] def iterate(self, word): """Iterate over all hyphenation possibilities, the longest first. :param word: unicode string of the word to hyphenate """ for position in reversed(self.positions(word)): if position.data: # get the nonstandard hyphenation data change, index, cut = position.data index += position if word.isupper(): change = change.upper() c1, c2 = change.split('=') yield word[:index] + c1, c2 + word[index + cut:] else: yield word[:position], word[position:] def wrap(self, word, width, hyphen='-'): """Get the longest possible first part and the last part of a word. :param word: unicode string of the word to hyphenate :param width: maximum length of the first part :param hyphen: unicode string used as hyphen character The first part has the hyphen already attached. Returns ``None`` if there is no hyphenation point before ``width``, or if the word could not be hyphenated. """ width -= len(hyphen) for w1, w2 in self.iterate(word): if len(w1) <= width: return w1 + hyphen, w2 def inserted(self, word, hyphen='-'): """Get the word as a string with all the possible hyphens inserted. :param word: unicode string of the word to hyphenate :param hyphen: unicode string used as hyphen character E.g. for the dutch word ``'lettergrepen'``, this method returns the unicode string ``'let-ter-gre-pen'``. The hyphen string to use can be given as the second parameter, that defaults to ``'-'``. """ letters = list(word) for position in reversed(self.positions(word)): if position.data: # get the nonstandard hyphenation data change, index, cut = position.data index += position if word.isupper(): change = change.upper() letters[index:index + cut] = change.replace('=', hyphen) else: letters.insert(position, hyphen) return ''.join(letters) __call__ = iterate