Module adash.string_util

Expand source code
import re
import unicodedata
from typing import Any


def replace_all(s: str, obj: dict) -> str:
    """Apply several regex substitutions to a string, one after another.

    Each key of ``obj`` is used as a regular-expression pattern and its
    value as the replacement text. The substitutions run in the mapping's
    iteration order, and each one works on the output of the previous one.

    Args:
        s: Text to transform.
        obj: Mapping of regex pattern -> replacement string.

    Returns:
        The text after every substitution has been applied.

    Example:
        >>> _obj = {"円": ".", "銭": ""}
        >>> replace_all("3円00銭", _obj)
        '3.00'
        >>> _obj = {"[△▲]": "-", "[,、]": ""}
        >>> replace_all('▲12,345', _obj)
        '-12345'
        >>> replace_all('△12、345', _obj)
        '-12345'
    """
    result = s
    for pattern, replacement in obj.items():
        result = re.sub(pattern, replacement, result)
    return result


def to_half_string(s: str) -> str:
    """Return ``s`` in Unicode NFKC normalization form.

    NFKC folds compatibility characters, so full-width digits and Latin
    letters come back as their ASCII (half-width) equivalents.

    Example:
        >>> to_half_string('１２３ＸＹＺ')
        '123XYZ'
    """
    normalized = unicodedata.normalize("NFKC", s)
    return normalized


def to_number(s: str, default: Any = float('NAN')) -> Any:
    """Convert a numeric string, possibly in Japanese notation, to a number.

    The input is NFKC-normalized with ``to_half_string``; then ``△``, ``▲``
    and ``Δ`` are turned into a minus sign, ``,``, ``、``, ``銭`` and ``%``
    are removed, and ``円`` becomes the decimal point.

    Args:
        s: Text to convert.
        default: Value returned when the cleaned text is not a number.

    Returns:
        An ``int`` when the value is integral, otherwise a ``float``;
        ``default`` when the cleaned text cannot be parsed.

    Example:
        >>> to_number('△12,345')
        -12345
        >>> to_number('12,345')
        12345
        >>> to_number('12円34銭')
        12.34
        >>> to_number('98%')
        98
        >>> to_number('9007199254740993')
        9007199254740993
        >>> to_number('abc')
        nan
        >>> to_number('abc', '123')
        '123'
    """
    rep_dict = {
        '[△▲Δ]': '-',
        '[,、銭%]': '',
        '[円]': '.'
    }
    s = to_half_string(s)
    s = replace_all(s, rep_dict)
    # Parse plain integers directly: going through float() first would
    # silently round integers that need more than 53 bits of precision.
    try:
        return int(s)
    except ValueError:
        pass
    try:
        value = float(s)
    except ValueError:
        return default
    # Integral floats such as '12.' or '1e3' are still reported as int.
    if value.is_integer():
        return int(value)
    return value


def to_date(s: str) -> Any:
    """Normalize a date string to the ``YYYY-MM-DD`` form.

    Accepts ``年/月/日`` notation, ``/`` or ``-`` separated dates, a bare
    8-digit ``YYYYMMDD`` string, and Japanese era years (明治, 大正, 昭和,
    平成, 令和 or their one-character abbreviations; ``元`` is read as
    year 1). The input is NFKC-normalized and whitespace is removed first.

    Args:
        s: Date text to normalize.

    Returns:
        The date as ``'YYYY-MM-DD'`` with month and day zero-padded to two
        digits, or ``None`` when no year-month-day sequence is found.

    Example:
        >>> to_date('2018年5月27日')
        '2018-05-27'
        >>> to_date('令和 元年 5月12日')
        '2019-05-12'
        >>> to_date('大3年5月27日')
        '1914-05-27'
        >>> to_date('20211010')
        '2021-10-10'
        >>> to_date('令和年5月1日') is None
        True
    """
    # Era names are swapped for ASCII, digit-free placeholders so that the
    # leading non-digit group of the final regex captures them.
    meiji = 'meiji'
    taisyou = 'taisyou'
    syouwa = 'syouwa'
    heisei = 'heisei'
    reiwa = 'reiwa'
    # Gregorian year corresponding to year 1 of each era.
    era_first_year = {
        meiji: 1868,
        taisyou: 1912,
        syouwa: 1926,
        heisei: 1989,
        reiwa: 2019,
    }
    s = to_half_string(s)
    s = replace_all(s, {
        '[年月/]': '-',
        r'[日\s]': '',
        '[元]': '1',
        r'(明治|明)': meiji,
        r'(大正|大)': taisyou,
        r'(昭和|昭)': syouwa,
        r'(平成|平)': heisei,
        r'(令和|令)': reiwa,
    })
    # Insert hyphens when the text is exactly eight digits (YYYYMMDD).
    if re.match(r'^\d{8}$', s):
        s = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', s)
    # Require at least one digit per field: with empty fields allowed, an
    # era without a year reached int('') and raised ValueError, and inputs
    # with missing parts produced malformed results such as '-05-'.
    m = re.match(r'(\D*)(\d+)-(\d+)-(\d+)', s)
    if m is None:
        return None
    era, year, month, day = m.groups()
    first_year = era_first_year.get(era)
    if first_year is not None:
        # Era year 1 is the era's first Gregorian year, hence the -1.
        year = str(int(year) + first_year - 1)
    return f'{year}-{month.zfill(2)}-{day.zfill(2)}'


def split_uppercase(s: str) -> list:
    """Split an UpperCamelCase identifier into its words.

    A word is a run of one or more uppercase ASCII letters followed by any
    number of lowercase ASCII letters or digits. Characters that are not
    part of such a word (for example a leading lowercase run) are dropped.

    Example:
        >>> split_uppercase('NextAccumulatedQ2Duration')
        ['Next', 'Accumulated', 'Q2', 'Duration']
    """
    word_pattern = r'[A-Z]+[a-z0-9]*'
    return [hit.group(0) for hit in re.finditer(word_pattern, s)]

Functions

def replace_all(s: str, obj: dict) ‑> str

複数のreplace

Example

>>> _obj = {"円": ".", "銭": ""}
>>> replace_all("3円00銭", _obj)
'3.00'
>>> _obj = {"[△▲]": "-", "[,、]": ""}
>>> replace_all('▲12,345', _obj)
'-12345'
>>> replace_all('△12、345', _obj)
'-12345'
Expand source code
def replace_all(s: str, obj: dict) -> str:
    """Apply several regex substitutions to a string, one after another.

    Each key of ``obj`` is used as a regular-expression pattern and its
    value as the replacement text. The substitutions run in the mapping's
    iteration order, and each one works on the output of the previous one.

    Args:
        s: Text to transform.
        obj: Mapping of regex pattern -> replacement string.

    Returns:
        The text after every substitution has been applied.

    Example:
        >>> _obj = {"円": ".", "銭": ""}
        >>> replace_all("3円00銭", _obj)
        '3.00'
        >>> _obj = {"[△▲]": "-", "[,、]": ""}
        >>> replace_all('▲12,345', _obj)
        '-12345'
        >>> replace_all('△12、345', _obj)
        '-12345'
    """
    result = s
    for pattern, replacement in obj.items():
        result = re.sub(pattern, replacement, result)
    return result
def split_uppercase(s: str) ‑> list

UpperCaseを分割

Example

>>> split_uppercase('NextAccumulatedQ2Duration')
['Next', 'Accumulated', 'Q2', 'Duration']
Expand source code
def split_uppercase(s: str) -> list:
    """Split an UpperCamelCase identifier into its words.

    A word is a run of one or more uppercase ASCII letters followed by any
    number of lowercase ASCII letters or digits. Characters that are not
    part of such a word (for example a leading lowercase run) are dropped.

    Example:
        >>> split_uppercase('NextAccumulatedQ2Duration')
        ['Next', 'Accumulated', 'Q2', 'Duration']
    """
    word_pattern = r'[A-Z]+[a-z0-9]*'
    return [hit.group(0) for hit in re.finditer(word_pattern, s)]
def to_date(s: str) ‑> Any

日付表記を統一する

Example

>>> to_date('2018年5月27日')
'2018-05-27'
>>> to_date('令和 元年 5月12日')
'2019-05-12'
>>> to_date('大3年5月27日')
'1914-05-27'
>>> to_date('20211010')
'2021-10-10'
Expand source code
def to_date(s: str) -> Any:
    """Normalize a date string to the ``YYYY-MM-DD`` form.

    Accepts ``年/月/日`` notation, ``/`` or ``-`` separated dates, a bare
    8-digit ``YYYYMMDD`` string, and Japanese era years (明治, 大正, 昭和,
    平成, 令和 or their one-character abbreviations; ``元`` is read as
    year 1). The input is NFKC-normalized and whitespace is removed first.

    Args:
        s: Date text to normalize.

    Returns:
        The date as ``'YYYY-MM-DD'`` with month and day zero-padded to two
        digits, or ``None`` when no year-month-day sequence is found.

    Example:
        >>> to_date('2018年5月27日')
        '2018-05-27'
        >>> to_date('令和 元年 5月12日')
        '2019-05-12'
        >>> to_date('大3年5月27日')
        '1914-05-27'
        >>> to_date('20211010')
        '2021-10-10'
        >>> to_date('令和年5月1日') is None
        True
    """
    # Era names are swapped for ASCII, digit-free placeholders so that the
    # leading non-digit group of the final regex captures them.
    meiji = 'meiji'
    taisyou = 'taisyou'
    syouwa = 'syouwa'
    heisei = 'heisei'
    reiwa = 'reiwa'
    # Gregorian year corresponding to year 1 of each era.
    era_first_year = {
        meiji: 1868,
        taisyou: 1912,
        syouwa: 1926,
        heisei: 1989,
        reiwa: 2019,
    }
    s = to_half_string(s)
    s = replace_all(s, {
        '[年月/]': '-',
        r'[日\s]': '',
        '[元]': '1',
        r'(明治|明)': meiji,
        r'(大正|大)': taisyou,
        r'(昭和|昭)': syouwa,
        r'(平成|平)': heisei,
        r'(令和|令)': reiwa,
    })
    # Insert hyphens when the text is exactly eight digits (YYYYMMDD).
    if re.match(r'^\d{8}$', s):
        s = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', s)
    # Require at least one digit per field: with empty fields allowed, an
    # era without a year reached int('') and raised ValueError, and inputs
    # with missing parts produced malformed results such as '-05-'.
    m = re.match(r'(\D*)(\d+)-(\d+)-(\d+)', s)
    if m is None:
        return None
    era, year, month, day = m.groups()
    first_year = era_first_year.get(era)
    if first_year is not None:
        # Era year 1 is the era's first Gregorian year, hence the -1.
        year = str(int(year) + first_year - 1)
    return f'{year}-{month.zfill(2)}-{day.zfill(2)}'
def to_half_string(s: str) ‑> str

Normalize a string to Unicode NFKC form (full-width digits and letters become half-width).

Example

>>> to_half_string('１２３ＸＹＺ')
'123XYZ'
Expand source code
def to_half_string(s: str) -> str:
    """Return ``s`` in Unicode NFKC normalization form.

    NFKC folds compatibility characters, so full-width digits and Latin
    letters come back as their ASCII (half-width) equivalents.

    Example:
        >>> to_half_string('１２３ＸＹＺ')
        '123XYZ'
    """
    normalized = unicodedata.normalize("NFKC", s)
    return normalized
def to_number(s: str, default: Any = nan) ‑> Any

文字列を数値化

Example

>>> to_number('△12,345')
-12345
>>> to_number('12,345')
12345
>>> to_number('12円34銭')
12.34
>>> to_number('98%')
98
>>> to_number('abc')
nan
>>> to_number('abc', '123')
'123'
Expand source code
def to_number(s: str, default: Any = float('NAN')) -> Any:
    """Convert a numeric string, possibly in Japanese notation, to a number.

    The input is NFKC-normalized with ``to_half_string``; then ``△``, ``▲``
    and ``Δ`` are turned into a minus sign, ``,``, ``、``, ``銭`` and ``%``
    are removed, and ``円`` becomes the decimal point.

    Args:
        s: Text to convert.
        default: Value returned when the cleaned text is not a number.

    Returns:
        An ``int`` when the value is integral, otherwise a ``float``;
        ``default`` when the cleaned text cannot be parsed.

    Example:
        >>> to_number('△12,345')
        -12345
        >>> to_number('12,345')
        12345
        >>> to_number('12円34銭')
        12.34
        >>> to_number('98%')
        98
        >>> to_number('9007199254740993')
        9007199254740993
        >>> to_number('abc')
        nan
        >>> to_number('abc', '123')
        '123'
    """
    rep_dict = {
        '[△▲Δ]': '-',
        '[,、銭%]': '',
        '[円]': '.'
    }
    s = to_half_string(s)
    s = replace_all(s, rep_dict)
    # Parse plain integers directly: going through float() first would
    # silently round integers that need more than 53 bits of precision.
    try:
        return int(s)
    except ValueError:
        pass
    try:
        value = float(s)
    except ValueError:
        return default
    # Integral floats such as '12.' or '1e3' are still reported as int.
    if value.is_integer():
        return int(value)
    return value