feat: add comprehensive GitHub workflow and development tools
This commit is contained in:
297
app/.venv/Lib/site-packages/pyphen/__init__.py
Normal file
297
app/.venv/Lib/site-packages/pyphen/__init__.py
Normal file
@@ -0,0 +1,297 @@
|
||||
"""
|
||||
|
||||
Pyphen
|
||||
======
|
||||
|
||||
Pure Python module to hyphenate text, inspired by Ruby's Text::Hyphen.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
from importlib import resources
|
||||
from pathlib import Path
|
||||
|
||||
VERSION = __version__ = '0.17.2'

__all__ = ('LANGUAGES', 'Pyphen', 'language_fallback')

# Module-wide cache mapping a dictionary path to its parsed HyphDict.
hdcache = {}

# Helpers compiled once at import time and used while parsing patterns:
# - parse_hex substitutes "^^hh" escapes (two lowercase hex digits),
# - parse splits a pattern into (optional digit, optional non-digit) pairs.
parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub
parse = re.compile(r'(\d?)(\D?)').findall

# Dictionary lines starting with one of these prefixes are skipped.
ignored = (
    '%', '#', 'LEFTHYPHENMIN', 'RIGHTHYPHENMIN',
    'COMPOUNDLEFTHYPHENMIN', 'COMPOUNDRIGHTHYPHENMIN')

#: Dict of languages including codes as keys and dictionary Path as values.
LANGUAGES = {}

try:
    dictionaries = resources.files('pyphen.dictionaries')
except TypeError:
    # Fall back to the "dictionaries" folder next to this module.
    dictionaries = Path(__file__).parent / 'dictionaries'

for path in sorted(dictionaries.iterdir()):
    if path.suffix != '.dic':
        continue
    # Drop the 5-character "hyph_" prefix and the 4-character ".dic" suffix.
    name = path.name[len('hyph_'):-len('.dic')]
    LANGUAGES[name] = path
    # The first dictionary seen for a bare language code (the part before
    # the first underscore) also serves that short code.
    short_name = name.split('_')[0]
    LANGUAGES.setdefault(short_name, path)

# Lowercased language code -> code as spelled in LANGUAGES.
LANGUAGES_LOWERCASE = {code.lower(): code for code in LANGUAGES}
|
||||
|
||||
|
||||
def language_fallback(language):
    """Get a fallback language available in our dictionaries.

    http://www.unicode.org/reports/tr35/#Locale_Inheritance

    We use the normal truncation inheritance. This function needs aliases
    including scripts for languages with multiple regions available.

    :param language: language code; ``-`` and ``_`` are both accepted as
        separators and the lookup is case-insensitive
    :returns: the matching key of ``LANGUAGES``, or ``None`` when no prefix
        of the code matches

    """
    subtags = language.replace('-', '_').lower().split('_')
    # Try the full code first, then drop trailing subtags one at a time.
    for length in range(len(subtags), 0, -1):
        candidate = '_'.join(subtags[:length])
        if candidate in LANGUAGES_LOWERCASE:
            return LANGUAGES_LOWERCASE[candidate]
    return None
|
||||
|
||||
|
||||
class AlternativeParser:
    """Parser of nonstandard hyphen pattern alternative.

    The instance returns a special int with data about the current position in
    the pattern when called with an odd value.

    """

    def __init__(self, pattern, alternative):
        """Split ``alternative`` ("change,index,cut") into its three fields.

        :param pattern: pattern text the alternative belongs to
        :param alternative: comma-separated ``change,index,cut`` string

        """
        fields = alternative.split(',')
        self.change = fields[0]
        self.index = int(fields[1])
        self.cut = int(fields[2])
        # A pattern starting with "." shifts the index by one.
        if pattern.startswith('.'):
            self.index += 1

    def __call__(self, value):
        """Convert ``value`` to an int, attaching data when it is odd.

        Every call decrements ``self.index`` by one before the conversion.

        """
        self.index -= 1
        number = int(value)
        if number % 2 == 0:
            return number
        return DataInt(number, (self.change, self.index, self.cut))
|
||||
|
||||
|
||||
class DataInt(int):
    """``int`` with some other data can be stuck to in a ``data`` attribute."""

    def __new__(cls, value, data=None, reference=None):
        """Create a new ``DataInt``.

        Call with ``reference=dataint_object`` to use the data from another
        ``DataInt``.

        :param value: integer value of the new object
        :param data: payload stored in ``data`` when no ``DataInt``
            reference is given
        :param reference: when this is a ``DataInt``, its ``data`` is used
            and ``data`` is ignored; any other value is ignored

        """
        obj = int.__new__(cls, value)
        # Only the type matters here: testing the truthiness of ``reference``
        # would wrongly discard the data of a ``DataInt`` whose value is 0.
        if isinstance(reference, DataInt):
            obj.data = reference.data
        else:
            obj.data = data
        return obj
|
||||
|
||||
|
||||
class HyphDict:
    """Hyphenation patterns."""

    def __init__(self, path):
        """Read a ``hyph_*.dic`` and parse its patterns.

        :param path: Path of hyph_*.dic to read

        The first line of the file is used as the name of its encoding; the
        remaining lines are parsed as patterns.

        """
        self.patterns = {}

        # see "man 4 hunspell", iscii-devanagari is not supported by python
        with path.open('rb') as fd:
            # Strip the line terminator, otherwise the alias comparison
            # below can never match.
            encoding = fd.readline().decode().strip()
        if encoding.lower() == 'microsoft-cp1251':
            encoding = 'cp1251'

        for pattern in path.read_text(encoding).split('\n')[1:]:
            pattern = pattern.strip()
            if not pattern or pattern.startswith(ignored):
                continue

            # replace ^^hh with the real character
            pattern = parse_hex(
                lambda match: chr(int(match.group(1), 16)), pattern)

            # read nonstandard hyphen alternatives
            if '/' in pattern and '=' in pattern:
                pattern, alternative = pattern.split('/', 1)
                factory = AlternativeParser(pattern, alternative)
            else:
                factory = int

            # "parse" yields (digit, text) pairs; a missing digit counts as 0
            tags, values = zip(*[
                (string, factory(i or '0')) for i, string in parse(pattern)])

            # if only zeros, skip this pattern
            if max(values) == 0:
                continue

            # chop zeros from beginning and end, and store start offset
            start, end = 0, len(values)
            while not values[start]:
                start += 1
            while not values[end - 1]:
                end -= 1

            self.patterns[''.join(tags)] = start, values[start:end]

        # per-word memo of the positions computed by ``positions``
        self.cache = {}
        # length of the longest pattern key; 0 when the file holds no usable
        # pattern, so such a dictionary loads and never hyphenates
        self.maxlen = max(map(len, self.patterns), default=0)

    def positions(self, word):
        """Get a list of positions where the word can be hyphenated.

        :param word: unicode string of the word to hyphenate

        E.g. for the dutch word 'lettergrepen' this method returns ``[3, 6,
        9]``.

        Each position is a ``DataInt`` with a data attribute.

        If the data attribute is not ``None``, it contains a tuple with
        information about nonstandard hyphenation at that point: ``(change,
        index, cut)``.

        change
          a string like ``'ff=f'``, that describes how hyphenation should
          take place.

        index
          where to substitute the change, counting from the current point

        cut
          how many characters to remove while substituting the nonstandard
          hyphenation

        """
        word = word.lower()
        points = self.cache.get(word)
        if points is None:
            # dots mark the word boundaries for patterns anchored with "."
            pointed_word = f'.{word}.'
            references = [0] * (len(pointed_word) + 1)

            # look up every substring no longer than the longest pattern
            for i in range(len(pointed_word) - 1):
                stop = min(i + self.maxlen, len(pointed_word)) + 1
                for j in range(i + 1, stop):
                    pattern = self.patterns.get(pointed_word[i:j])
                    if not pattern:
                        continue
                    offset, values = pattern
                    slice_ = slice(i + offset, i + offset + len(values))
                    # keep the highest value seen for each slot
                    references[slice_] = map(max, values, references[slice_])

            # odd values mark hyphenation points; "i - 1" removes the shift
            # introduced by the leading dot
            self.cache[word] = points = [
                DataInt(i - 1, reference=reference)
                for i, reference in enumerate(references) if reference % 2]
        return points
|
||||
|
||||
|
||||
class Pyphen:
    """Hyphenation class, with methods to hyphenate strings in various ways."""

    def __init__(self, filename=None, lang=None, left=2, right=2, cache=True):
        """Create an hyphenation instance for given lang or filename.

        :param filename: filename or Path of hyph_*.dic to read
        :param lang: lang of the included dict to use if no filename is given
        :param left: minimum number of characters of the first syllable
        :param right: minimum number of characters of the last syllable
        :param cache: if ``True``, use cached copy of the hyphenation patterns
        :raises KeyError: if no dictionary matches ``lang``

        """
        self.left, self.right = left, right
        if filename:
            path = Path(filename)
        else:
            fallback = language_fallback(lang)
            if fallback is None:
                # Same exception type as the plain dict lookup would raise,
                # but with a message naming the language that was asked for.
                raise KeyError(
                    f'No hyphenation dictionary found for language {lang!r}')
            path = LANGUAGES[fallback]
        # With ``cache=False`` the dictionary is always parsed again and the
        # shared cache entry is replaced.
        if not cache or path not in hdcache:
            hdcache[path] = HyphDict(path)
        self.hd = hdcache[path]

    def positions(self, word):
        """Get a list of positions where the word can be hyphenated.

        :param word: unicode string of the word to hyphenate

        See also ``HyphDict.positions``. The points that are too far to the
        left or right are removed.

        """
        right = len(word) - self.right
        return [i for i in self.hd.positions(word) if self.left <= i <= right]

    def iterate(self, word):
        """Iterate over all hyphenation possibilities, the longest first.

        :param word: unicode string of the word to hyphenate

        Yields ``(first_part, last_part)`` tuples.

        """
        for position in reversed(self.positions(word)):
            if position.data:
                # get the nonstandard hyphenation data
                change, index, cut = position.data
                index += position
                if word.isupper():
                    change = change.upper()
                c1, c2 = change.split('=')
                yield word[:index] + c1, c2 + word[index + cut:]
            else:
                yield word[:position], word[position:]

    def wrap(self, word, width, hyphen='-'):
        """Get the longest possible first part and the last part of a word.

        :param word: unicode string of the word to hyphenate
        :param width: maximum length of the first part
        :param hyphen: unicode string used as hyphen character

        The first part has the hyphen already attached.

        Returns ``None`` if there is no hyphenation point before ``width``, or
        if the word could not be hyphenated.

        """
        # reserve room for the hyphen appended to the first part
        width -= len(hyphen)
        for w1, w2 in self.iterate(word):
            if len(w1) <= width:
                return w1 + hyphen, w2
        return None

    def inserted(self, word, hyphen='-'):
        """Get the word as a string with all the possible hyphens inserted.

        :param word: unicode string of the word to hyphenate
        :param hyphen: unicode string used as hyphen character

        E.g. for the dutch word ``'lettergrepen'``, this method returns the
        unicode string ``'let-ter-gre-pen'``. The hyphen string to use can be
        given as the second parameter, that defaults to ``'-'``.

        """
        letters = list(word)
        # work from the end so earlier positions stay valid after insertions
        for position in reversed(self.positions(word)):
            if position.data:
                # get the nonstandard hyphenation data
                change, index, cut = position.data
                index += position
                if word.isupper():
                    change = change.upper()
                letters[index:index + cut] = change.replace('=', hyphen)
            else:
                letters.insert(position, hyphen)

        return ''.join(letters)

    __call__ = iterate
|
||||
Reference in New Issue
Block a user