Metadata-Version: 2.4
Name: nuki
Version: 0.1.9
Summary: For personal use; not general-purpose
Requires-Python: >=3.12
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: patchright>=1.40
Requires-Dist: playwright>=1.40
Requires-Dist: selectolax>=0.3
Requires-Dist: pyarrow>=14.0
Requires-Dist: camoufox>=0.4
Requires-Dist: loguru>=0.7
Requires-Dist: tqdm>=4.66

# nuki

For personal use; not general-purpose.

## Installation

`uv add nuki`  
`uv run patchright install chromium`  
`uv run camoufox fetch`
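
`patchright install chromium` downloads the Chromium build that Patchright drives, and `camoufox fetch` downloads the Camoufox browser binary; both are one-time setup steps after installing the package.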

## Usage Examples

### Scraping

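A straight crawl: collect prefecture links from the index, follow them to individual classroom pages, and append one CSV row per classroom. Failed navigations are recorded in `csv/failed.csv` instead of aborting the run.
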
```python
from nuki import wrap_page
from nuki.browser import patchright_page
from nuki.utils import append_csv, from_here, save_log, write_bytes

here = from_here(__file__)          # path helper rooted at this file's directory
save_log(here('log/scraping.log'))  # also write log output to a file

with patchright_page() as page:
    p = wrap_page(page)

    p.goto('https://www.foobarbaz1.jp')
    pref_urls = p.ss('li.item > ul > li > a').urls

    classroom_urls = []
    for i, url in enumerate(pref_urls, 1):
        print(f'pref_urls {i}/{len(pref_urls)}')
        if not p.goto(url):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue
        classroom_urls.extend(p.ss('.school-area h4 a').urls)

    for i, url in enumerate(classroom_urls, 1):
        print(f'classroom_urls {i}/{len(classroom_urls)}')
        if not p.goto(url):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue
        th_grep = p.ss('th').re  # regex lookup over the page's <th> cells
        append_csv(here('csv/scrape.csv'), {
            'id': i,
            'URL': page.url,
            '教室名': p.s('h1 .text01').text,  # classroom name
            '住所': p.s('.item .mapText').text,  # address
            '電話番号': p.s('.item .phoneNumber').text,  # phone number
            'HP': th_grep.s(r'ホームページ').next('td').s('a').url,  # homepage link
            '営業時間': th_grep.s(r'営業時間').next('td').text,  # business hours
            '定休日': th_grep.s(r'定休日').next('td').text,  # closed days
        })
        p.s('.school-map').screenshot(here(f'media/{i}-screenshot.png'))
        if (img_url := p.s('.school-area img').src):
            if (res := p.goto(img_url)) and res.ok:
                write_bytes(here(f'media/{i}-img.jpg'), res.body())
```
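
The `p.ss('th').re` chain is the label-lookup idiom these examples lean on: collect the `<th>` cells, pick the one whose text matches a regex, then read the adjacent `<td>` with `.next('td')`. The later examples do the same over `<dt>`/`<dd>` definition lists.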

### Scraping (saving full HTML pages)

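Same crawl shape, but instead of extracting fields on the spot, each page is saved whole to `html/` for offline parsing; `camoufox_page()` swaps in the Camoufox browser in place of Patchright's Chromium.
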
```python
from nuki import wrap_page
from nuki.browser import camoufox_page
from nuki.utils import append_csv, from_here, hash_name, save_log, write_text

here = from_here(__file__)
save_log(here('log/scraping.log'))

with camoufox_page() as page:
    p = wrap_page(page)

    p.goto('https://www.foobarbaz1.jp')
    item_urls = p.ss('ul.items > li > a').urls

    for i, url in enumerate(item_urls, 1):
        print(f'item_urls {i}/{len(item_urls)}')
        if not p.goto(url):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
            continue
        file_name = f'{hash_name(url)}.html'
        if not write_text(here('html') / file_name, p.html(with_url=True, with_saved_at=True)):
            append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'write_text'})
            continue
```
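
`hash_name(url)` gives each page a stable file name derived from its URL, and `with_url=True` / `with_saved_at=True` embed the source URL and save time into the stored HTML, which is what lets the extraction examples below recover `p.url` from a local file.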

### Extracting data from local HTML & writing Parquet

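Extraction runs without a browser: `parse_html` loads a saved file into a selectolax parser, and `wrap_parser` gives it the same `s` / `ss` / `.re` interface as a live page. Rows are collected in memory and written once as Parquet.
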
```python
from nuki import wrap_parser
from nuki.utils import from_here, parse_html, save_log, write_parquet

here = from_here(__file__)
save_log(here('log/extract.log'))

results = []
for i, file_path in enumerate(here('html').glob('*.html'), 1):
    print(f'html {i}')
    if not (parser := parse_html(file_path)):
        continue
    p = wrap_parser(parser)
    dts = p.ss('dt').re
    results.append({
        'URL': p.url,
        'file_name': file_path.name,
        '教室名': p.s('h1 .text02').text,  # classroom name
        '住所': p.s('.item .mapText').text,  # address
        '所在地': dts.s(r'所在地').next('dd').text,  # location
        '交通': dts.s(r'交通').next('dd').text,  # nearest transit
        '物件番号': dts.s(r'物件番号').next('dd').text,  # property number
    })
write_parquet(here('parquet/extract.parquet'), results)
```
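
To sanity-check the output, the file can be read back with pyarrow, which nuki already depends on. A minimal sketch using only the standard pyarrow API (nothing here is part of nuki):

```python
import pyarrow.parquet as pq

# Read the file written by write_parquet and print a quick summary.
table = pq.read_table('parquet/extract.parquet')
print(table.num_rows, 'rows')
print(table.column_names)
print(table.to_pylist()[:3])  # first three rows as plain dicts
```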

### Extracting data from local HTML & writing Parquet (parallel)

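The same extraction fanned out across workers with `pool_map`. `extract` accepts the path as a plain `str`, matching its type hint, and returns `None` on parse failure, which `main` filters out.
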
```python
from pathlib import Path

from nuki import wrap_parser
from nuki.utils import from_here, glob_paths, parse_html, pool_map, write_parquet

def main():
    here = from_here(__file__)
    html_paths = glob_paths(here('html'), '*.html')
    results = [r for r in pool_map(extract, html_paths) if r]  # drop files that failed to parse
    write_parquet(here('parquet/extract.parquet'), results)

def extract(file_path: str) -> dict | None:
    if not (parser := parse_html(Path(file_path))):
        return None
    p = wrap_parser(parser)
    dts = p.ss('dt').re
    return {
        'URL': p.url,
        'file_path': file_path,
        '教室名': p.s('h1 .text02').text,  # classroom name
        '住所': p.s('.item .mapText').text,  # address
        '所在地': dts.s(r'所在地').next('dd').text,  # location
        '交通': dts.s(r'交通').next('dd').text,  # nearest transit
        '価格': dts.s(r'価格').next('dd').text,  # price
        '設備・条件': dts.s(r'設備').next('dd').text,  # facilities / terms
        '備考': dts.s(r'備考').next('dd').text,  # remarks
    }

if __name__ == '__main__':
    main()
```
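
If `pool_map` is backed by a process pool, the `if __name__ == '__main__':` guard matters on spawn-based platforms (Windows, and macOS by default): each worker re-imports the module, and without the guard it would re-run `main()` and try to spawn pools recursively.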

## License

[MIT](./LICENSE)

