CSV folder example
Example 1: directory structure
- bucket_root
  - nested_folder/
    - 2021-05-03/ -> Date Partition
      - training_data/ -> Sub Folder
        - data.csv
    - 2021-05-04/ -> Date Partition
      - training_sample/ -> Sub Folder
        - data.csv
- Python
from featurestore import Client, CSVFolder
# Connect to the feature store and log in
client = Client("ip:port")
client.auth.login()

# Create the project that will hold the feature set
project = client.projects.create("demo")

# Describe the CSV folder source: root_folder is the top-level location,
# filter_pattern selects which sub-folders beneath it are read
csv_folder = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/training.*",
)

# Extract the schema from the source
csv_folder_schema = client.extract_schema_from_source(csv_folder)

# Register the feature set in the project
my_feature_set = project.feature_sets.register(
    csv_folder_schema,
    "feature_set_name",
    primary_key=["key_name"],
)

# Ingest the data from the CSV folder source
my_feature_set.ingest(csv_folder)

# Retrieve the feature set and download its data
ref = my_feature_set.retrieve()
ref.download()
Example 2: directory structure
- bucket_root
  - nested_folder/
    - California/
      - 2021-05-03/ -> Date Partition
        - training_data/ -> Sub Folder
          - data.csv
    - Arizona/
      - 2021-05-04/ -> Date Partition
        - training_sample/ -> Sub Folder
          - data.csv
    - Texas/
      - 2021-05-04/ -> Date Partition
        - training_sample/ -> Sub Folder
          - data.csv
- Python
from featurestore import Client, CSVFolder
# Connect to the feature store and log in
client = Client("ip:port")
client.auth.login()

# Create the project that will hold the feature set
project = client.projects.create("demo")

# Describe the CSV folder source. filter_pattern is a regular expression
# applied below root_folder; here it has one path segment for the state
# folder and one for the date partition before the training* sub-folder.
csv_folder_source = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/.*/training.*",  # To ingest from all states
)

# Extract the schema from the source
csv_folder_schema = client.extract_schema_from_source(csv_folder_source)

# Note
# To ingest only from California, use filter_pattern = "California/.*/training.*"
# To ingest only from California and Arizona, use filter_pattern = "(Arizona|California)/.*/training.*"

# Register the feature set in the project
my_feature_set = project.feature_sets.register(
    csv_folder_schema,
    "feature_set_name",
    primary_key=["key_name"],
)

# Ingest the data; the source to read from must be passed explicitly
my_feature_set.ingest(csv_folder_source)

# Retrieve the feature set and download its data
ref = my_feature_set.retrieve()
ref.download()
Example 3: directory structure (no date folder)
- bucket_root
  - nested_folder/
    - California/
      - training_data/ -> Sub Folder
        - data.csv
    - Arizona/
      - training_sample/ -> Sub Folder
        - data.csv
    - Texas/
      - training_sample/ -> Sub Folder
        - data.csv
- Python
from featurestore import Client, CSVFolder
# Connect to the feature store and log in
client = Client("ip:port")
client.auth.login()

# Create the project that will hold the feature set
project = client.projects.create("demo")

# Describe the CSV folder source. filter_pattern is a regular expression
# applied below root_folder; with no date partition level, a single path
# segment (the state folder) precedes the training* sub-folder.
csv_folder_source = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/training.*",  # To ingest from all states
)

# Extract the schema from the source
csv_folder_schema = client.extract_schema_from_source(csv_folder_source)

# Note
# To ingest only from California, use filter_pattern = "California/training.*"
# To ingest only from California and Arizona, use filter_pattern = "(Arizona|California)/training.*"

# Register the feature set in the project
my_feature_set = project.feature_sets.register(
    csv_folder_schema,
    "feature_set_name",
    primary_key=["key_name"],
)

# Ingest the data; the source to read from must be passed explicitly
my_feature_set.ingest(csv_folder_source)

# Retrieve the feature set and download its data
ref = my_feature_set.retrieve()
ref.download()
Feedback
- Submit and view feedback for this page
- Send feedback about H2O Feature Store to cloud-feedback@h2o.ai