Skip to content

Commit 64b3fe1

Browse files
authored
feat: basic read_huggingface functionality (#4996)
## Changes Made

Adds `daft.read_huggingface`

## Related Issues

#2841

## Checklist

- [x] Documented in API Docs (if applicable)
- [x] Documented in User Guide (if applicable)
- [x] If adding a new documentation page, doc is added to `docs/mkdocs.yml` navigation
- [x] Documentation builds and is formatted properly (tag @/ccmao1130 for docs review)
1 parent e36f52f commit 64b3fe1

File tree

16 files changed

+213
-34
lines changed

16 files changed

+213
-34
lines changed

daft/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def refresh_logger() -> None:
9090
read_lance,
9191
read_video_frames,
9292
read_warc,
93+
read_huggingface,
9394
)
9495
from daft.series import Series
9596
from daft.session import (
@@ -206,6 +207,7 @@ def refresh_logger() -> None:
206207
"read_csv",
207208
"read_deltalake",
208209
"read_hudi",
210+
"read_huggingface",
209211
"read_iceberg",
210212
"read_json",
211213
"read_lance",

daft/daft/__init__.pyi

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,22 @@ class UnityConfig:
703703
"""Replaces values if provided, returning a new UnityConfig."""
704704
...
705705

706+
class HuggingFaceConfig:
707+
"""I/O configuration for accessing Hugging Face datasets.
708+
709+
Args:
710+
token (str, optional): Your Hugging Face access token, generated from https://huggingface.co/settings/tokens.
711+
anonymous (bool, optional): Whether or not to use "anonymous mode", which will access Hugging Face without any credentials. Defaults to False.
712+
"""
713+
714+
token: str | None
715+
anonymous: bool
716+
717+
def __init__(self, token: str | None = None, anonymous: bool | None = None): ...
718+
def replace(self, token: str | None = None, anonymous: bool | None = None) -> HuggingFaceConfig:
719+
"""Replaces values if provided, returning a new HuggingFaceConfig."""
720+
...
721+
706722
class IOConfig:
707723
"""Configuration for the native I/O layer, e.g. credentials for accessing cloud storage systems."""
708724

@@ -711,6 +727,7 @@ class IOConfig:
711727
gcs: GCSConfig
712728
http: HTTPConfig
713729
unity: UnityConfig
730+
hf: HuggingFaceConfig
714731

715732
def __init__(
716733
self,
@@ -719,6 +736,7 @@ class IOConfig:
719736
gcs: GCSConfig | None = None,
720737
http: HTTPConfig | None = None,
721738
unity: UnityConfig | None = None,
739+
hf: HuggingFaceConfig | None = None,
722740
): ...
723741
def replace(
724742
self,
@@ -727,6 +745,7 @@ class IOConfig:
727745
gcs: GCSConfig | None = None,
728746
http: HTTPConfig | None = None,
729747
unity: UnityConfig | None = None,
748+
hf: HuggingFaceConfig | None = None,
730749
) -> IOConfig:
731750
"""Replaces values if provided, returning a new IOConfig."""
732751
...

daft/io/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
S3Config,
99
S3Credentials,
1010
UnityConfig,
11+
HuggingFaceConfig,
1112
)
1213
from daft.io._csv import read_csv
1314
from daft.io.delta_lake._deltalake import read_deltalake
@@ -18,6 +19,7 @@
1819
from daft.io._parquet import read_parquet
1920
from daft.io._sql import read_sql
2021
from daft.io._warc import read_warc
22+
from daft.io._huggingface import read_huggingface
2123
from daft.io._range import _range
2224
from daft.io.catalog import DataCatalogTable, DataCatalogType
2325
from daft.io.file_path import from_glob_path
@@ -34,6 +36,7 @@
3436
"DataSourceTask",
3537
"GCSConfig",
3638
"HTTPConfig",
39+
"HuggingFaceConfig",
3740
"IOConfig",
3841
"S3Config",
3942
"S3Credentials",
@@ -43,6 +46,7 @@
4346
"read_csv",
4447
"read_deltalake",
4548
"read_hudi",
49+
"read_huggingface",
4650
"read_iceberg",
4751
"read_json",
4852
"read_lance",

daft/io/_huggingface.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
from daft.api_annotations import PublicAPI
6+
7+
from ._parquet import read_parquet
8+
9+
if TYPE_CHECKING:
10+
from daft.daft import IOConfig
11+
from daft.dataframe import DataFrame
12+
13+
14+
@PublicAPI
15+
def read_huggingface(repo: str, io_config: IOConfig | None = None) -> DataFrame:
16+
"""Create a DataFrame from a Hugging Face dataset.
17+
18+
Currently supports all public datasets and all private Parquet datasets. See [the Hugging Face docs](https://huggingface.co/docs/dataset-viewer/en/parquet) for more details.
19+
20+
Args:
21+
repo (str): Repository to read, in the form `username/dataset_name`.
22+
io_config (IOConfig, optional): Config to use when reading data, e.g. `IOConfig(hf=HuggingFaceConfig(token=...))` to supply a Hugging Face access token.
23+
"""
24+
return read_parquet(f"hf://datasets/{repo}", io_config=io_config)

daft/udf/generator.py

Whitespace-only changes.

docs/connectors/huggingface.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ For authenticated datasets:
4545
=== "🐍 Python"
4646

4747
```python
48-
from daft.io import IOConfig, HTTPConfig
48+
from daft.io import IOConfig, HuggingFaceConfig
4949

50-
io_config = IoConfig(http=HTTPConfig(bearer_token="your_token"))
50+
io_config = IOConfig(hf=HuggingFaceConfig(token="your_token"))
5151
df = daft.read_parquet("hf://datasets/username/dataset_name", io_config=io_config)
5252
```
5353

requirements-dev.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@ docker
77
# Pinned aiohttp due to Ray connection issue in aiohttp==3.12.6
88
aiohttp==3.12.4
99

10-
# Pinned requests due to docker-py issue: https://github.com/docker/docker-py/issues/3256
11-
requests<2.32.0
12-
1310
# Pinned httpx due to unitycatalog-python issue: https://github.com/unitycatalog/unitycatalog-python/issues/9
1411
httpx==0.27.2
1512

@@ -38,6 +35,7 @@ Pillow==10.4.0
3835
opencv-python==4.10.0.84
3936
tiktoken==0.9.0
4037
duckdb==1.1.2
38+
datasets==4.0.0
4139

4240
# TQDM
4341
tqdm

src/common/io-config/src/config.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@ use std::fmt::{Display, Formatter};
22

33
use serde::{Deserialize, Serialize};
44

5-
use crate::{unity::UnityConfig, AzureConfig, GCSConfig, HTTPConfig, S3Config};
5+
use crate::{
6+
huggingface::HuggingFaceConfig, unity::UnityConfig, AzureConfig, GCSConfig, HTTPConfig,
7+
S3Config,
8+
};
69
#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
710
pub struct IOConfig {
811
pub s3: S3Config,
912
pub azure: AzureConfig,
1013
pub gcs: GCSConfig,
1114
pub http: HTTPConfig,
1215
pub unity: UnityConfig,
16+
pub hf: HuggingFaceConfig,
1317
}
1418

1519
impl IOConfig {
@@ -36,6 +40,10 @@ impl IOConfig {
3640
"Unity config = {{ {} }}",
3741
self.unity.multiline_display().join(", ")
3842
));
43+
res.push(format!(
44+
"Hugging Face config = {{ {} }}",
45+
self.hf.multiline_display().join(", ")
46+
));
3947
res
4048
}
4149
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
use std::fmt::{Display, Formatter};
2+
3+
use serde::{Deserialize, Serialize};
4+
5+
use crate::ObfuscatedString;
6+
7+
#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
8+
pub struct HuggingFaceConfig {
9+
pub token: Option<ObfuscatedString>,
10+
pub anonymous: bool,
11+
}
12+
13+
impl HuggingFaceConfig {
14+
pub fn multiline_display(&self) -> Vec<String> {
15+
let mut res = vec![];
16+
if let Some(token) = &self.token {
17+
res.push(format!("Token = {token}"));
18+
}
19+
res.push(format!("Anonymous = {}", self.anonymous));
20+
res
21+
}
22+
}
23+
24+
impl Display for HuggingFaceConfig {
25+
fn fmt(&self, f: &mut Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
26+
write!(
27+
f,
28+
"HuggingFaceConfig\n{}",
29+
self.multiline_display().join("\n")
30+
)
31+
}
32+
}

src/common/io-config/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ mod azure;
77
mod config;
88
mod gcs;
99
mod http;
10+
mod huggingface;
1011
mod s3;
1112
mod unity;
1213

@@ -23,6 +24,7 @@ pub use crate::{
2324
config::IOConfig,
2425
gcs::GCSConfig,
2526
http::HTTPConfig,
27+
huggingface::HuggingFaceConfig,
2628
s3::{S3Config, S3Credentials},
2729
unity::UnityConfig,
2830
};

0 commit comments

Comments
 (0)