CSV folder example
Example 1: directory structure
- bucket_root
  - nested_folder/
    - 2021-05-03/ -> Date Partition
      - training_data/ -> Sub Folder
        - data.csv
    - 2021-05-04/ -> Date Partition
      - training_sample/ -> Sub Folder
        - data.csv
- Python
- Scala
from featurestore import Client, CSVFolder
# Connect to the feature store and log in
client = Client("ip:port")
client.auth.login()

# Create the project that will hold the feature set
project = client.projects.create("demo")

# Describe the CSV folder source: a root folder plus a regular expression
# that selects the sub-folders to read
training_source = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/training.*",
)

# Derive the schema from the source and register the feature set with it
schema = client.extract_schema_from_source(training_source)
feature_set = project.feature_sets.register(
    schema,
    "feature_set_name",
    primary_key=["key_name"],
)

# Ingest the source data into the feature set
feature_set.ingest(training_source)

# Retrieve the feature set and download it
reference = feature_set.retrieve()
reference.download()
import ai.h2o.featurestore.Client
import ai.h2o.featurestore.core.sources.CSVFolder
// Initialise feature store client
val client = Client("url")
client.auth.login()
// Set project specifics
val project = client.projects.create("demo")
// Create the csv folder source: filterPattern is a regular expression that
// selects which sub-folders under rootFolder are read
val csvFolder = CSVFolder(
  rootFolder = "s3a://feature-store-test-data/nested_folder",
  filterPattern = ".*/training.*"
)
val csvFolderSchema = client.extractSchemaFromSource(csvFolder)
// Register the feature set
val myFeatureSet = project.featureSets.register(csvFolderSchema, "featureSetName", primaryKey = Seq("keyName"))
// Ingest to cache
myFeatureSet.ingest(csvFolder)
// Retrieve feature set
val ref = myFeatureSet.retrieve()
ref.download()
Example 2: directory structure
bucket_root
  nested_folder/
    California
      - 2021-05-03/ -> Date Partition
        - training_data/ -> Sub Folder
          - data.csv
    Arizona
      - 2021-05-04/ -> Date Partition
        - training_sample/ -> Sub Folder
          - data.csv
    Texas
      - 2021-05-04/ -> Date Partition
        - training_sample/ -> Sub Folder
          - data.csv
- Python
- Scala
from featurestore import Client, CSVFolder
# Initialise feature store client
client = Client("ip:port")
client.auth.login()
# Set project specifics
project = client.projects.create("demo")
# Create the csv folder source
csv_folder_source = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/.*/training.*",  # To ingest from all states
)
csv_folder_schema = client.extract_schema_from_source(csv_folder_source)
# Note
# To ingest only from California, then filter_pattern = "California/.*/training.*"
# To ingest only from California & Arizona, then filter_pattern = "(Arizona|California)/.*/training.*"
# Register the feature set
my_feature_set = project.feature_sets.register(csv_folder_schema, "feature_set_name", primary_key=["key_name"])
# Ingest to cache from the csv folder source created above
my_feature_set.ingest(csv_folder_source)
# Retrieve feature set
ref = my_feature_set.retrieve()
ref.download()
import ai.h2o.featurestore.Client
import ai.h2o.featurestore.core.sources.CSVFolder
// Initialise feature store client
val client = Client("url")
client.auth.setAuthToken(...)
// Set project specifics
val project = client.projects.create("demo")
// Create the csv folder source
val csvFolderSource = CSVFolder(
  rootFolder = "s3a://feature-store-test-data/nested_folder",
  filterPattern = ".*/.*/training.*" // To ingest from all states
)
val csvFolderSchema = client.extractSchemaFromSource(csvFolderSource)
// Note
// To ingest only from California, then filterPattern = "California/.*/training.*"
// To ingest only from California & Arizona, then filterPattern = "(Arizona|California)/.*/training.*"
// Register the feature set
val myFeatureSet = project.featureSets.register(csvFolderSchema, "featureSetName", primaryKey = Seq("keyName"))
// Ingest to cache from the csv folder source created above
myFeatureSet.ingest(csvFolderSource)
// Retrieve feature set
val ref = myFeatureSet.retrieve()
ref.download()
Example 3: directory structure (no date folder)
bucket_root
  nested_folder/
    California
      - training_data/ -> Sub Folder
        - data.csv
    Arizona
      - training_sample/ -> Sub Folder
        - data.csv
    Texas
      - training_sample/ -> Sub Folder
        - data.csv
- Python
- Scala
from featurestore import Client, CSVFolder
# Initialise feature store client
client = Client("ip:port")
client.auth.login()
# Set project specifics
project = client.projects.create("demo")
# Create the csv folder source
csv_folder_source = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/training.*",  # To ingest from all states
)
csv_folder_schema = client.extract_schema_from_source(csv_folder_source)
# Note
# To ingest only from California, then filter_pattern = "California/training.*"
# To ingest only from California & Arizona, then filter_pattern = "(Arizona|California)/training.*"
# Register the feature set
my_feature_set = project.feature_sets.register(csv_folder_schema, "feature_set_name", primary_key=["key_name"])
# Ingest to cache from the csv folder source created above
my_feature_set.ingest(csv_folder_source)
# Retrieve feature set
ref = my_feature_set.retrieve()
ref.download()
import ai.h2o.featurestore.Client
import ai.h2o.featurestore.core.sources.CSVFolder
// Initialise feature store client
val client = Client("url")
client.auth.setAuthToken(...)
// Set project specifics
val project = client.projects.create("demo")
// Create the csv folder source
val csvFolderSource = CSVFolder(
  rootFolder = "s3a://feature-store-test-data/nested_folder",
  filterPattern = ".*/training.*" // To ingest from all states
)
val csvFolderSchema = client.extractSchemaFromSource(csvFolderSource)
// Note
// To ingest only from California, then filterPattern = "California/training.*"
// To ingest only from California & Arizona, then filterPattern = "(Arizona|California)/training.*"
// Register the feature set
val myFeatureSet = project.featureSets.register(csvFolderSchema, "featureSetName", primaryKey = Seq("keyName"))
// Ingest to cache from the csv folder source created above
myFeatureSet.ingest(csvFolderSource)
// Retrieve feature set
val ref = myFeatureSet.retrieve()
ref.download()
Feedback
- Submit and view feedback for this page
- Send feedback about H2O Feature Store to cloud-feedback@h2o.ai