CITATION.cff
CONTRIBUTING.md
HISTORY.md
LICENSE
MANIFEST.in
README.md
pyproject.toml
tests/__init__.py
tests/baseline_tests.py
tests/cli_tests.py
tests/conftest.py
tests/deduplication_tests.py
tests/downloads_tests.py
tests/feeds_tests.py
tests/filters_tests.py
tests/json_metadata_tests.py
tests/metadata_tests.py
tests/sitemaps_tests.py
tests/spider_tests.py
tests/unit_tests.py
tests/xml_tei_tests.py
tests/resources/apache.html
tests/resources/exotic_tags.html
tests/resources/exotic_tags_tei.html
tests/resources/feed.json
tests/resources/feed1.atom
tests/resources/feed2.rss
tests/resources/http_sample.html
tests/resources/httpbin_sample.html
tests/resources/list-discard.txt
tests/resources/list-process.txt
tests/resources/mozilla.org.firefox.developer.html
tests/resources/newsettings.cfg
tests/resources/redundant-urls.txt
tests/resources/scam.html
tests/resources/sitemap-hreflang.xml
tests/resources/sitemap.xml
tests/resources/sitemap.xml.gz
tests/resources/sitemap2.xml
tests/resources/utf8.html
tests/resources/webpage.html.gz
tests/resources/zerolength.cfg
trafilatura/__init__.py
trafilatura/baseline.py
trafilatura/cli.py
trafilatura/cli_utils.py
trafilatura/core.py
trafilatura/deduplication.py
trafilatura/downloads.py
trafilatura/external.py
trafilatura/feeds.py
trafilatura/htmlprocessing.py
trafilatura/json_metadata.py
trafilatura/main_extractor.py
trafilatura/meta.py
trafilatura/metadata.py
trafilatura/py.typed
trafilatura/readability_lxml.py
trafilatura/settings.cfg
trafilatura/settings.py
trafilatura/sitemaps.py
trafilatura/spider.py
trafilatura/utils.py
trafilatura/xml.py
trafilatura/xpaths.py
trafilatura.egg-info/PKG-INFO
trafilatura.egg-info/SOURCES.txt
trafilatura.egg-info/dependency_links.txt
trafilatura.egg-info/entry_points.txt
trafilatura.egg-info/requires.txt
trafilatura.egg-info/top_level.txt
trafilatura/data/tei_corpus.dtd