-
Notifications
You must be signed in to change notification settings - Fork 33
Description
Code:
import logging
from ludwig.api import LudwigModel
from ludwig.datasets import agnews

# Load the AG News dataset as pandas DataFrames.
# split=True returns (train_df, test_df, unused) — the third element is discarded.
train_df, test_df, _ = agnews.load(split=True)

# Preview the first five rows (result is informational only; not assigned).
train_df.head(5)

# Ludwig declarative config: fine-tune a pretrained BLOOM-3b text encoder
# to classify news titles into categories.
config = {
    "input_features": [
        {
            "name": "title",  # The name of the input column
            "type": "text",  # Data type of the input column
            "encoder": {
                "type": "auto_transformer",  # The model architecture to use
                "pretrained_model_name_or_path": "bigscience/bloom-3b",
                "trainable": True,  # Fine-tune the encoder weights, not just the head
            },
        },
    ],
    "output_features": [
        {
            "name": "class",
            "type": "category",
        }
    ],
    "trainer": {
        "learning_rate": 0.00001,
        "epochs": 3,  # We'll train for three epochs. Training longer might give
        # better performance.
    },
    "backend": {
        "type": "ray",  # Distribute training on a Ray cluster
        "trainer": {
            "strategy": "fsdp",  # fsdp distributed strategy for using a multi-GPU cluster
        }
    }
}

# Build the model and train on the training DataFrame; Ludwig handles the
# validation/test split internally (see the dataset statistics it prints).
model = LudwigModel(config, logging_level=logging.INFO)
train_stats, preprocessed_data, output_directory = model.train(dataset=train_df)
Error
2023-04-12 02:50:23,717 WARNING read_api.py:330 -- .repartition(n) to increase the number of dataset blocks.
Parquet Files Sample: 100%|██████████| 1/1 [00:00<00:00, 2.12it/s]
Parquet Files Sample: 0%| | 0/1 [00:00<?, ?it/s]
(_sample_piece pid=2075, ip=172.31.47.162) 2023-04-12 02:50:24,737 INFO worker.py:772 -- Task failed with retryable exception: TaskID(45b3d0fcab720f49ffffffffffffffffffffffff01000000).
(_sample_piece pid=2075, ip=172.31.47.162) Traceback (most recent call last):
(_sample_piece pid=2075, ip=172.31.47.162) File "python/ray/_raylet.pyx", line 857, in ray._raylet.execute_task
(_sample_piece pid=2075, ip=172.31.47.162) File "python/ray/_raylet.pyx", line 861, in ray._raylet.execute_task
(_sample_piece pid=2075, ip=172.31.47.162) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/data/datasource/parquet_datasource.py", line 461, in _sample_piece
(_sample_piece pid=2075, ip=172.31.47.162) piece = piece.subset(row_group_ids=[0])
(_sample_piece pid=2075, ip=172.31.47.162) File "pyarrow/_dataset_parquet.pyx", line 424, in pyarrow._dataset_parquet.ParquetFileFragment.subset
(_sample_piece pid=2075, ip=172.31.47.162) File "pyarrow/error.pxi", line 143, in pyarrow.lib.pyarrow_internal_check_status
(_sample_piece pid=2075, ip=172.31.47.162) File "pyarrow/_fs.pyx", line 1179, in pyarrow._fs._cb_open_input_file
(_sample_piece pid=2075, ip=172.31.47.162) File "/home/ray/anaconda3/lib/python3.8/site-packages/pyarrow/fs.py", line 394, in open_input_file
(_sample_piece pid=2075, ip=172.31.47.162) raise FileNotFoundError(path)
(_sample_piece pid=2075, ip=172.31.47.162) FileNotFoundError: /home/ray/1e10f286d91711edbbf702820dcb34a8.validation.parquet/part.00000000.parquet
2023-04-12 02:50:25,232 WARNING read_api.py:330 -- .repartition(n) to increase the number of dataset blocks.
Parquet Files Sample: 100%|██████████| 1/1 [00:01<00:00, 1.49s/it]
Parquet Files Sample: 0%| | 0/1 [00:00<?, ?it/s]
(_sample_piece pid=2075, ip=172.31.47.162) 2023-04-12 02:50:25,243 INFO worker.py:772 -- Task failed with retryable exception: TaskID(06f28617326374dbffffffffffffffffffffffff01000000).
(_sample_piece pid=2075, ip=172.31.47.162) Traceback (most recent call last):
(_sample_piece pid=2075, ip=172.31.47.162) File "python/ray/_raylet.pyx", line 857, in ray._raylet.execute_task
(_sample_piece pid=2075, ip=172.31.47.162) File "python/ray/_raylet.pyx", line 861, in ray._raylet.execute_task
(_sample_piece pid=2075, ip=172.31.47.162) File "/home/ray/anaconda3/lib/python3.8/site-packages/ray/data/datasource/parquet_datasource.py", line 461, in _sample_piece
(_sample_piece pid=2075, ip=172.31.47.162) piece = piece.subset(row_group_ids=[0])
(_sample_piece pid=2075, ip=172.31.47.162) File "pyarrow/_dataset_parquet.pyx", line 424, in pyarrow._dataset_parquet.ParquetFileFragment.subset
(_sample_piece pid=2075, ip=172.31.47.162) File "pyarrow/error.pxi", line 143, in pyarrow.lib.pyarrow_internal_check_status
(_sample_piece pid=2075, ip=172.31.47.162) File "pyarrow/_fs.pyx", line 1179, in pyarrow._fs._cb_open_input_file
(_sample_piece pid=2075, ip=172.31.47.162) File "/home/ray/anaconda3/lib/python3.8/site-packages/pyarrow/fs.py", line 394, in open_input_file
(_sample_piece pid=2075, ip=172.31.47.162) raise FileNotFoundError(path)
(_sample_piece pid=2075, ip=172.31.47.162) FileNotFoundError: /home/ray/1e10f286d91711edbbf702820dcb34a8.test.parquet/part.00000000.parquet
2023-04-12 02:50:26,238 WARNING read_api.py:330 -- .repartition(n) to increase the number of dataset blocks.
Parquet Files Sample: 100%|██████████| 1/1 [00:00<00:00, 1.00it/s]
Dataset Statistics
╒════════════╤═══════════════╤════════════════════╕
│ Dataset │ Size (Rows) │ Size (In Memory) │
╞════════════╪═══════════════╪════════════════════╡
│ Training │ 80626 │ 12.44 Mb │
├────────────┼───────────────┼────────────────────┤
│ Validation │ 11383 │ 1.76 Mb │
├────────────┼───────────────┼────────────────────┤
│ Test │ 22890 │ 3.53 Mb │
╘════════════╧═══════════════╧════════════════════╛
╒═══════╕
│ MODEL │
╘═══════╛
After this, the BLOOM model downloads successfully, but training never uses the resources of the Ray cluster.