Source code for persian.core
"""Core Persian language processing utilities."""
from __future__ import annotations
import urllib.parse
from .constants import (
AR_DIACRITIC_REMOVAL_TABLE,
AR_DIACRITICS_MAPPING,
AR_DIGITS,
AR_TO_FA_CHARS_TABLE,
AR_TO_FA_DIGITS_TABLE,
DE_YII_PATTERN,
EN_TO_FA_DIGITS_TABLE,
EN_TO_FA_KEYBOARD_TABLE,
FA_DIGITS,
FA_TO_EN_DIGITS_TABLE,
MI_PATTERN,
)
# Ordered sequence of (old, new) replacement pairs; `_multiple_replace`
# iterates it and applies each pair in turn.
MappingType = tuple[tuple[str, str], ...]
def _validate_string_input(input_str: str, param_name: str = "input_str") -> None:
"""Validate that the provided input is a non-None string."""
if input_str is None:
raise ValueError(f"{param_name} cannot be None")
if not isinstance(input_str, str):
raise TypeError(f"{param_name} must be str, got {type(input_str).__name__}")
def _multiple_replace(mapping: MappingType, text: str) -> str:
"""Replace all mapping keys within a text sequentially (legacy helper)."""
result = text
for old, new in mapping:
result = result.replace(old, new)
return result
[docs]
def convert_en_numbers(input_str: str) -> str:
    """Replace English digits in a text with their Persian counterparts.

    The input is validated and then passed through
    ``EN_TO_FA_DIGITS_TABLE`` with ``str.translate``.

    Args:
        input_str: Text that may contain English digits.

    Returns:
        A new string in which English digits are written as Persian digits.

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.

    Examples:
        >>> convert_en_numbers("Phone 123")
        'Phone ۱۲۳'
    """
    _validate_string_input(input_str)
    converted = input_str.translate(EN_TO_FA_DIGITS_TABLE)
    return converted
[docs]
def convert_en_characters(input_str: str) -> str:
    """Map text typed on an English keyboard layout to Persian characters.

    The input is validated and then passed through
    ``EN_TO_FA_KEYBOARD_TABLE`` with ``str.translate``.

    Args:
        input_str: Text typed using an English keyboard layout.

    Returns:
        A new string containing the mapped Persian characters.

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.

    Examples:
        >>> convert_en_characters("sghl")
        'سلام'
    """
    _validate_string_input(input_str)
    converted = input_str.translate(EN_TO_FA_KEYBOARD_TABLE)
    return converted
[docs]
def convert_ar_numbers(input_str: str) -> str:
    """Replace Arabic digits in a text with their Persian counterparts.

    The input is validated and then passed through
    ``AR_TO_FA_DIGITS_TABLE`` with ``str.translate``.

    Args:
        input_str: Text that may contain Arabic digits.

    Returns:
        A new string in which Arabic digits are written as Persian digits.

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.

    Examples:
        >>> convert_ar_numbers("٣٤٥")
        '۳۴۵'
    """
    _validate_string_input(input_str)
    converted = input_str.translate(AR_TO_FA_DIGITS_TABLE)
    return converted
[docs]
def convert_fa_numbers(input_str: str) -> str:
    """Replace Persian digits in a text with English digits.

    The input is validated and then passed through
    ``FA_TO_EN_DIGITS_TABLE`` with ``str.translate``.

    Args:
        input_str: Text that may contain Persian digits.

    Returns:
        A new string in which Persian digits are written as English digits.

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.

    Examples:
        >>> convert_fa_numbers("۱۲۳")
        '123'
    """
    _validate_string_input(input_str)
    converted = input_str.translate(FA_TO_EN_DIGITS_TABLE)
    return converted
[docs]
def convert_ar_characters(input_str: str) -> str:
    """Replace Arabic characters in a text with their Persian equivalents.

    Two passes are made, in this order: the sequential replacements listed
    in ``AR_DIACRITICS_MAPPING``, then a ``str.translate`` pass with
    ``AR_TO_FA_CHARS_TABLE``.

    Args:
        input_str: Text that may contain Arabic characters.

    Returns:
        A new string with Arabic characters replaced by Persian ones.

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.

    Examples:
        >>> convert_ar_characters("علي")
        'علی'
    """
    _validate_string_input(input_str)
    return _multiple_replace(AR_DIACRITICS_MAPPING, input_str).translate(
        AR_TO_FA_CHARS_TABLE
    )
[docs]
def convert_fa_spaces(input_value: str) -> str:
    """Swap improper spaces around Persian affixes for zero-width non-joiners.

    ``MI_PATTERN`` is applied first and ``DE_YII_PATTERN`` second; both use
    the same replacement template.

    Args:
        input_value: Persian text that may include incorrect spaces.

    Returns:
        Text in which the matched affix boundaries use the half-space
        character (U+200C) instead of a regular space.

    Raises:
        TypeError: If `input_value` is not a string.
        ValueError: If `input_value` is None.

    Examples:
        >>> convert_fa_spaces("می روم به خانه")
        'می‌روم به خانه'
    """
    _validate_string_input(input_value, "input_value")
    # Replacement template: capture group 2, a ZWNJ (U+200C), then capture
    # group 4 of whichever pattern matched.
    zwnj_join = "\\2\u200c\\4"
    for pattern in (MI_PATTERN, DE_YII_PATTERN):
        input_value = pattern.sub(zwnj_join, input_value)
    return input_value
[docs]
def decode_url(input_str: str) -> str:
    """Turn percent-encoded sequences in a URL back into readable characters.

    The input is validated and then decoded with ``urllib.parse.unquote``
    using that function's default settings.

    Args:
        input_str: URL containing percent-encoded segments.

    Returns:
        The decoded URL string.

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.

    Examples:
        >>> decode_url("https://example/%D8%B5%D9%81%D8%AD%D9%87")
        'https://example/صفحه'
    """
    _validate_string_input(input_str)
    decoded = urllib.parse.unquote(input_str)
    return decoded
[docs]
def normalize_persian(
    input_str: str,
    *,
    convert_numbers: bool = True,
    convert_characters: bool = True,
    fix_spacing: bool = True,
) -> str:
    """Normalize Persian text by running the enabled conversion steps.

    The steps always run in the same order: digits, then characters, then
    spacing. A step is skipped when its flag is falsy.

    Args:
        input_str: Text to normalize.
        convert_numbers: Whether to convert Arabic digits to Persian.
        convert_characters: Whether to convert Arabic characters to Persian.
        fix_spacing: Whether to fix improper spaces with ZWNJ.

    Returns:
        The normalized Persian text.

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.

    Examples:
        >>> normalize_persian("سلام ٣٤٥ می آیم")
        'سلام ۳۴۵ می‌آیم'
    """
    _validate_string_input(input_str)
    # (flag, transform) pairs, listed in the order they must be applied.
    pipeline = (
        (convert_numbers, convert_ar_numbers),
        (convert_characters, convert_ar_characters),
        (fix_spacing, convert_fa_spaces),
    )
    normalized = input_str
    for enabled, transform in pipeline:
        if enabled:
            normalized = transform(normalized)
    return normalized
[docs]
def contains_persian_digits(input_str: str) -> bool:
    """Report whether any character of the text is a member of ``FA_DIGITS``.

    Args:
        input_str: Text to inspect.

    Returns:
        True as soon as one Persian digit is found, otherwise False
        (including for an empty string).

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.
    """
    _validate_string_input(input_str)
    for symbol in input_str:
        if symbol in FA_DIGITS:
            return True
    return False
[docs]
def contains_arabic_digits(input_str: str) -> bool:
    """Report whether any character of the text is a member of ``AR_DIGITS``.

    Args:
        input_str: Text to inspect.

    Returns:
        True as soon as one Arabic digit is found, otherwise False
        (including for an empty string).

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.
    """
    _validate_string_input(input_str)
    for symbol in input_str:
        if symbol in AR_DIGITS:
            return True
    return False
[docs]
def is_persian_text(input_str: str) -> bool:
    """Report whether the text has any character in the U+0600-U+06FF range.

    The check is a plain code-point range test on each character; it does
    not distinguish Persian from other text written with characters in
    that range.

    Args:
        input_str: Text to inspect.

    Returns:
        True if at least one character falls within U+0600-U+06FF
        (inclusive), otherwise False (including for an empty string).

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.
    """
    _validate_string_input(input_str)
    range_start, range_end = "\u0600", "\u06ff"
    for symbol in input_str:
        if range_start <= symbol <= range_end:
            return True
    return False
[docs]
def remove_arabic_diacritics(input_str: str) -> str:
    """Strip Arabic diacritic (tashkeel) marks from a text.

    The input is validated and then passed through
    ``AR_DIACRITIC_REMOVAL_TABLE`` with ``str.translate``.

    Args:
        input_str: Text that may include combining Arabic diacritics.

    Returns:
        A new string without Arabic diacritics.

    Raises:
        TypeError: If `input_str` is not a string.
        ValueError: If `input_str` is None.
    """
    _validate_string_input(input_str)
    stripped = input_str.translate(AR_DIACRITIC_REMOVAL_TABLE)
    return stripped
# Public API of this module: the names exported by `from ... import *`.
# The underscore-prefixed helpers are intentionally left out.
__all__ = [
    "convert_en_numbers",
    "convert_en_characters",
    "convert_ar_numbers",
    "convert_fa_numbers",
    "convert_ar_characters",
    "convert_fa_spaces",
    "decode_url",
    "normalize_persian",
    "contains_persian_digits",
    "contains_arabic_digits",
    "is_persian_text",
    "remove_arabic_diacritics",
]