Skip to main content
Version: 1.2.0

CSV folder example

Example 1: directory structure

  • bucket_root
    • nested_folder/
      • 2021-05-03/ -> Date Partition
        • training_data/ -> Sub Folder
          • data.csv
      • 2021-05-04/ -> Date Partition
        • training_sample/ -> Sub Folder
          • data.csv
from featurestore import Client, CSVFolder

# Connect to the feature store and authenticate
client = Client("ip:port")
client.auth.login()

# Create the project that will own the feature set
project = client.projects.create("demo")

# Describe the CSV folder source: filter_pattern selects the training*
# sub folders found under each date partition of root_folder
csv_folder = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/training.*",
)
csv_folder_schema = client.extract_schema_from_source(csv_folder)

# Register the feature set using the schema extracted from the source
my_feature_set = project.feature_sets.register(
    csv_folder_schema,
    "feature_set_name",
    primary_key=["key_name"],
)

# Ingest the source data to cache
my_feature_set.ingest(csv_folder)

# Retrieve the feature set and download it
ref = my_feature_set.retrieve()
ref.download()

Example 2: directory structure

  • bucket_root

    • nested_folder/

      • California

        • 2021-05-03/ -> Date Partition
          • training_data/ -> Sub Folder
            • data.csv
      • Arizona

        • 2021-05-04/ -> Date Partition
          • training_sample/ -> Sub Folder
            • data.csv
      • Texas

        • 2021-05-04/ -> Date Partition
          • training_sample/ -> Sub Folder
            • data.csv
from featurestore import Client, CSVFolder

# Initialise feature store client and authenticate
client = Client("ip:port")
client.auth.login()

# Set project specifics
project = client.projects.create("demo")

# Create the csv folder source.
# In the pattern below the first ".*" stands for the state folder and the
# second ".*" for the date partition folder.
csv_folder_source = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/.*/training.*",  # To ingest from all states
)
csv_folder_schema = client.extract_schema_from_source(csv_folder_source)
# Note
# To ingest only from California, use filter_pattern = "California/.*/training.*"
# To ingest only from California & Arizona, use filter_pattern = "(Arizona|California)/.*/training.*"

# Register the feature set
my_feature_set = project.feature_sets.register(
    csv_folder_schema,
    "feature_set_name",
    primary_key=["key_name"],
)

# Ingest to cache (the source to read from must be passed to ingest)
my_feature_set.ingest(csv_folder_source)

# Retrieve feature set
ref = my_feature_set.retrieve()
ref.download()

Example 3: directory structure (no date folder)

  • bucket_root

    • nested_folder/

      • California

        • training_data/ -> Sub Folder
          • data.csv
      • Arizona

        • training_sample/ -> Sub Folder
          • data.csv
      • Texas

        • training_sample/ -> Sub Folder
          • data.csv
from featurestore import Client, CSVFolder

# Initialise feature store client and authenticate
client = Client("ip:port")
client.auth.login()

# Set project specifics
project = client.projects.create("demo")

# Create the csv folder source.
# There is no date partition level in this layout, so the single leading
# ".*" in the pattern below stands for the state folder.
csv_folder_source = CSVFolder(
    root_folder="s3a://feature-store-test-data/nested_folder",
    filter_pattern=".*/training.*",  # To ingest from all states
)
csv_folder_schema = client.extract_schema_from_source(csv_folder_source)
# Note
# To ingest only from California, use filter_pattern = "California/training.*"
# To ingest only from California & Arizona, use filter_pattern = "(Arizona|California)/training.*"

# Register the feature set
my_feature_set = project.feature_sets.register(
    csv_folder_schema,
    "feature_set_name",
    primary_key=["key_name"],
)

# Ingest to cache (the source to read from must be passed to ingest)
my_feature_set.ingest(csv_folder_source)

# Retrieve feature set
ref = my_feature_set.retrieve()
ref.download()

Feedback