# pyproject.toml

[project]
name = "datatrove"
version = "0.4.0"  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description = "HuggingFace library to process and filter large amounts of webdata"
readme = "README.md"
authors = [
    { name = "HuggingFace Inc.", email = "guilherme@huggingface.co" },
]
license = { text = "Apache-2.0" }
# Trove classifiers; the Python versions listed here start at the
# `requires-python` floor declared below.
classifiers = [
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
keywords = ["data", "machine", "learning", "processing"]
requires-python = ">=3.10.0"
# Core requirements, always installed. Feature-specific packages live in
# the optional-dependencies extras instead.
dependencies = [
    "dill>=0.3.0",
    "fsspec>=2023.12.2",
    "huggingface-hub>=0.17.0",
    "humanize",
    "loguru>=0.7.0",
    "multiprocess",
    "numpy>=2.0.0",
    "tqdm",
]

[project.optional-dependencies]
# Each key below is an installable extra; several extras pull in other
# extras of this same package via "datatrove[<extra>]".
cli = [
    "rich",
]
io = [
    "faust-cchardet",
    "pyarrow",
    "python-magic",
    "warcio",
    "datasets>=3.1.0",
    "orjson",
    "zstandard",
]
s3 = [
    "s3fs>=2023.12.2",
]
processing = [
    "fasttext-numpy2-wheel",
    "nltk",
    "inscriptis",
    # "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
    "tldextract",
    "trafilatura>=1.8.0,<1.12.0",
    "tokenizers",
    "ftfy",
    "fasteners",
    "regex",
    "xxhash",
    "kenlm",
    "pyahocorasick",
]
decont = [
    "lighteval>=0.3.0",
]
multilingual = [
    "spacy[ja]>=3.8",
    "stanza",
    "pyvi",  # vietnamese
    "pythainlp",  # thai
    "jieba",  # chinese
    "indic-nlp-library",  # indic languages
    "kiwipiepy",  # korean
    # urduhack has keras and tensorflow as dependencies and requires a specific version to work...
    "urduhack",
    "tensorflow>=2.16",
    "khmer-nltk",  # khmer
    "laonlp",  # lao
    "botok",  # tibetan languages
    "pyidaungsu-numpy2",  # burmese
]
quality = [
    "ruff>=0.1.5",
]
testing = [
    "datatrove[cli]",
    "datatrove[io]",
    "datatrove[processing]",
    "datatrove[multilingual]",
    "datatrove[s3]",
    # Lighteval doesn't support numpy>=2.0.0, so the decont extra is left out:
    # "datatrove[decont]",
    # Flask doesn't declare its werkzeug dependency correctly, which causes issues;
    # requiring flask>=3.1 (which currently works) avoids them.
    "flask>=3.1.0",
    "pytest",
    "pytest-timeout",
    "pytest-xdist",
    "moto[s3,server]",
    # NOTE(review): "spacy[ja]>=3.8" already arrives through datatrove[multilingual]
    # above, so this entry looks redundant; confirm before removing.
    "spacy[ja]",
]
all = [
    "datatrove[quality]",
    "datatrove[testing]",
]
dev = [
    "datatrove[all]",
]

# Project links published with the package metadata.
[project.urls]
Repository = "https://github.com/huggingface/datatrove"

# Console scripts: each key is the command name created at install time,
# each value is the "module:function" entry point that the command invokes.
[project.scripts]
check_dataset = "datatrove.tools.check_dataset:main"
merge_stats = "datatrove.tools.merge_stats:main"
launch_pickled_pipeline = "datatrove.tools.launch_pickled_pipeline:main"
failed_logs = "datatrove.tools.failed_logs:main"
inspect_data = "datatrove.tools.inspect_data:main"
jobs_status = "datatrove.tools.jobs_status:main"

[build-system]
# This file relies on the [project] table and on [tool.setuptools] settings,
# which setuptools only reads from pyproject.toml starting with 61.0. The
# floor makes builds against an older, pre-installed setuptools fail with a
# clear requirement error rather than producing a package without metadata.
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# Automatic package discovery is restricted to the "src" directory.
[tool.setuptools.packages.find]
where = ["src"]

# Non-Python files to bundle with the `datatrove` package: everything
# matching the "assets/*" glob.
[tool.setuptools.package-data]
datatrove = ["assets/*"]

[tool.ruff]
line-length = 119

[tool.ruff.lint]
# Rule selectors that are enabled.
select = [
    "C",
    "E",
    "F",
    "I",
    "W",
    "NPY201",  # numpy 2.0.0
]
# Rules from the selection above that are turned off everywhere.
ignore = [
    "C901",  # `function_name` is too complex
    "E501",  # line length violation
]

[tool.ruff.lint.per-file-ignores]
# F401 ("module imported but unused") is not reported for __init__.py files.
"__init__.py" = ["F401"]

[tool.ruff.lint.isort]
# Import-sorting settings: `datatrove` is treated as first-party, and two
# blank lines follow each import block.
known-first-party = ["datatrove"]
lines-after-imports = 2