@@ -1 +1,3 @@
The column in the dataset containing the expected output.

For classification, this needs to be an integer column containing the class label.
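As an illustrative aside (the column and file names below are made up, not mandated by this PR), a dataset satisfying this requirement could be prepared like so:

import pandas as pd

# Illustrative classification dataset: the answer column ("label" here) holds
# integer class labels, as the tooltip above requires.
df = pd.DataFrame(
    {
        "instruction": ["Classify the sentiment.", "Classify the sentiment."],
        "input": ["The movie was great!", "Worst purchase I ever made."],
        "label": [1, 0],  # integers, e.g. 0/1 for a binary task
    }
)
df.to_csv("classification_train.csv", index=False)  # csv, pq, and parquet are accepted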
1 change: 1 addition & 0 deletions documentation/docs/tooltips/experiments/_num-classes.mdx
@@ -0,0 +1 @@
The number of possible classes for the classification task. For binary classification, a single class should be selected (i.e., set this value to 1).
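A minimal sketch of the same idea against the new dataset config added in this PR; the import path is inferred from the sibling config modules (the new file's name is not shown below) and the values are illustrative:

# Import path is an assumption based on the other python_configs modules.
from llm_studio.python_configs.text_causal_classification_modeling_config import (
    ConfigNLPCausalClassificationDataset,
)

# Binary task: a single class, i.e. one output logit (pairs with BinaryCrossEntropyLoss).
binary_dataset_cfg = ConfigNLPCausalClassificationDataset(num_classes=1)

# Multi-class task: one logit per class (pairs with CrossEntropyLoss).
multiclass_dataset_cfg = ConfigNLPCausalClassificationDataset(num_classes=4)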
6 changes: 5 additions & 1 deletion documentation/docs/tooltips/experiments/_problem-type.mdx
@@ -2,4 +2,8 @@ Defines the problem type of the experiment, which also defines the settings H2O

- Causal Language Modeling: Used to fine-tune large language models

- Sequence To Sequence Modeling: Used to fine-tune large sequence to sequence models
- Rlhf Language Modeling: Used to fine-tune large language models using RLHF (Reinforcement Learning from Human Feedback)

- Sequence To Sequence Modeling: Used to fine-tune large sequence to sequence models

- Causal Classification Modeling: Used to fine-tune causal language models for classification tasks
1 change: 1 addition & 0 deletions llm_studio/app_utils/config.py
@@ -61,6 +61,7 @@ def get_size(x):
"text_causal_language_modeling_config",
"text_rlhf_language_modeling_config",
"text_sequence_to_sequence_modeling_config",
"text_causal_classification_modeling_config",
],
"problem_categories": ["text"],
"dataset_keys": [
13 changes: 12 additions & 1 deletion llm_studio/app_utils/hugging_face_utils.py
@@ -122,8 +122,19 @@ def publish_model_to_hugging_face(
repo_id=repo_id, repo_type="model", commit_message="Upload model card"
)

# push config to hub
api = huggingface_hub.HfApi()

# push classification head to hub
if cfg.type == "causal_classification":
api.upload_file(
path_or_fileobj=f"{path_to_experiment}/classification_head.pth",
path_in_repo="classification_head.pth",
repo_id=repo_id,
repo_type="model",
commit_message="Upload classification_head.pth",
)

# push config to hub
api.upload_file(
path_or_fileobj=os.path.join(path_to_experiment, "cfg.yaml"),
path_in_repo="cfg.yaml",
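As a hedged aside, a consumer of such a repo could later fetch the uploaded head with the standard huggingface_hub download API; the repo id below is hypothetical, and the file is assumed to contain a serialized state dict:

import torch
from huggingface_hub import hf_hub_download

repo_id = "my-org/my-causal-classification-model"  # hypothetical repo id

head_path = hf_hub_download(repo_id=repo_id, filename="classification_head.pth")
head_state = torch.load(head_path, map_location="cpu")  # assumed to be a state dict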
@@ -0,0 +1,186 @@
import os
from dataclasses import dataclass, field
from typing import Any, Tuple

from llm_studio.python_configs.base import DefaultConfigProblemBase
from llm_studio.python_configs.text_causal_language_modeling_config import (
ConfigNLPAugmentation,
ConfigNLPCausalLMArchitecture,
ConfigNLPCausalLMDataset,
ConfigNLPCausalLMEnvironment,
ConfigNLPCausalLMLogging,
ConfigNLPCausalLMPrediction,
ConfigNLPCausalLMTokenizer,
ConfigNLPCausalLMTraining,
)
from llm_studio.src import possible_values
from llm_studio.src.losses import text_causal_classification_modeling_losses
from llm_studio.src.metrics import text_causal_classification_modeling_metrics
from llm_studio.src.models import text_causal_classification_modeling_model
from llm_studio.src.utils.modeling_utils import generate_experiment_name


@dataclass
class ConfigNLPCausalClassificationDataset(ConfigNLPCausalLMDataset):
system_column: str = "None"
prompt_column: Tuple[str, ...] = ("instruction", "input")
answer_column: str = "label"
num_classes: int = 1
parent_id_column: str = "None"

text_system_start: str = ""
text_prompt_start: str = ""
text_answer_separator: str = ""

add_eos_token_to_system: bool = False
add_eos_token_to_prompt: bool = False
add_eos_token_to_answer: bool = False

_allowed_file_extensions: Tuple[str, ...] = ("csv", "pq", "parquet")

def __post_init__(self):
self.prompt_column = (
tuple(
self.prompt_column,
)
if isinstance(self.prompt_column, str)
else tuple(self.prompt_column)
)
super().__post_init__()

self._possible_values["num_classes"] = (1, 100, 1)

self._visibility["personalize"] = -1
self._visibility["chatbot_name"] = -1
self._visibility["chatbot_author"] = -1
self._visibility["mask_prompt_labels"] = -1
self._visibility["add_eos_token_to_answer"] = -1


@dataclass
class ConfigNLPCausalClassificationTraining(ConfigNLPCausalLMTraining):
loss_class: Any = text_causal_classification_modeling_losses.Losses
loss_function: str = "BinaryCrossEntropyLoss"

learning_rate: float = 0.0001
differential_learning_rate_layers: Tuple[str, ...] = ("classification_head",)
differential_learning_rate: float = 0.00001

def __post_init__(self):
super().__post_init__()
self._possible_values["loss_function"] = self.loss_class.names()

self._possible_values[
"differential_learning_rate_layers"
] = possible_values.String(
values=("backbone", "embed", "classification_head"),
allow_custom=False,
placeholder="Select optional layers...",
)


@dataclass
class ConfigNLPCausalClassificationTokenizer(ConfigNLPCausalLMTokenizer):
max_length_prompt: int = 512
max_length: int = 512

def __post_init__(self):
super().__post_init__()

self._visibility["max_length_answer"] = -1


@dataclass
class ConfigNLPCausalClassificationArchitecture(ConfigNLPCausalLMArchitecture):
model_class: Any = text_causal_classification_modeling_model.Model

def __post_init__(self):
super().__post_init__()


@dataclass
class ConfigNLPCausalClassificationPrediction(ConfigNLPCausalLMPrediction):
metric_class: Any = text_causal_classification_modeling_metrics.Metrics
metric: str = "AUC"

def __post_init__(self):
super().__post_init__()
self._possible_values["metric"] = self.metric_class.names()

for k in [
"min_length_inference",
"max_length_inference",
"do_sample",
"num_beams",
"temperature",
"repetition_penalty",
"stop_tokens",
"top_k",
"top_p",
]:
self._visibility[k] = -1


@dataclass
class ConfigNLPCausalClassificationEnvironment(ConfigNLPCausalLMEnvironment):
_model_card_template: str = "text_causal_classification_model_card_template.md"
_summary_card_template: str = (
"text_causal_classification_experiment_summary_card_template.md"
)

def __post_init__(self):
super().__post_init__()


@dataclass
class ConfigProblemBase(DefaultConfigProblemBase):
output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
experiment_name: str = field(default_factory=generate_experiment_name)
_parent_experiment: str = ""
llm_backbone: str = "h2oai/h2ogpt-4096-llama2-7b"
type: str = "causal_classification"

dataset: ConfigNLPCausalClassificationDataset = field(
default_factory=ConfigNLPCausalClassificationDataset
)
tokenizer: ConfigNLPCausalClassificationTokenizer = field(
default_factory=ConfigNLPCausalClassificationTokenizer
)
architecture: ConfigNLPCausalClassificationArchitecture = field(
default_factory=ConfigNLPCausalClassificationArchitecture
)
training: ConfigNLPCausalClassificationTraining = field(
default_factory=ConfigNLPCausalClassificationTraining
)
augmentation: ConfigNLPAugmentation = field(default_factory=ConfigNLPAugmentation)
prediction: ConfigNLPCausalClassificationPrediction = field(
default_factory=ConfigNLPCausalClassificationPrediction
)
environment: ConfigNLPCausalClassificationEnvironment = field(
default_factory=ConfigNLPCausalClassificationEnvironment
)
logging: ConfigNLPCausalLMLogging = field(default_factory=ConfigNLPCausalLMLogging)

def __post_init__(self):
super().__post_init__()

self._visibility["output_directory"] = -1

self._possible_values["llm_backbone"] = possible_values.String(
values=(
"h2oai/h2ogpt-4096-llama2-70b",
"h2oai/h2ogpt-4096-llama2-70b-chat",
"h2oai/h2ogpt-4096-llama2-13b",
"h2oai/h2ogpt-4096-llama2-13b-chat",
"h2oai/h2ogpt-4096-llama2-7b",
"h2oai/h2ogpt-4096-llama2-7b-chat",
"tiiuae/falcon-40b",
"tiiuae/falcon-7b",
"openlm-research/open_llama_13b",
"openlm-research/open_llama_7b",
"openlm-research/open_llama_3b",
"EleutherAI/gpt-j-6B",
"facebook/opt-125m",
),
allow_custom=True,
)
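A brief usage sketch of the new problem config; the import path is inferred from the sibling config files (the new file's name is not shown above) and the attribute values are illustrative:

# Import path is an assumption based on the other python_configs modules.
from llm_studio.python_configs.text_causal_classification_modeling_config import (
    ConfigProblemBase,
)

cfg = ConfigProblemBase(llm_backbone="h2oai/h2ogpt-4096-llama2-7b")
cfg.dataset.num_classes = 3
cfg.training.loss_function = "CrossEntropyLoss"  # one of cfg.training.loss_class.names()

print(cfg.type)                   # "causal_classification"
print(cfg.dataset.answer_column)  # "label" by default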
@@ -407,6 +407,7 @@ class ConfigProblemBase(DefaultConfigProblemBase):
experiment_name: str = field(default_factory=generate_experiment_name)
_parent_experiment: str = ""
llm_backbone: str = "h2oai/h2ogpt-4096-llama2-7b"
type: str = "causal_lm"

dataset: ConfigNLPCausalLMDataset = field(default_factory=ConfigNLPCausalLMDataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
@@ -178,6 +178,7 @@ class ConfigProblemBase(DefaultConfigProblemBase):
_parent_experiment: str = ""
llm_backbone: str = "h2oai/h2ogpt-4096-llama2-7b-chat"
reward_model: str = "OpenAssistant/reward-model-deberta-v3-large-v2"
type: str = "rlhf"

dataset: ConfigRLHFLMDataset = field(default_factory=ConfigRLHFLMDataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
@@ -42,7 +42,6 @@ def __post_init__(self):

self._visibility["limit_chained_samples"] = -1
self._visibility["mask_prompt_labels"] = -1
self._visibility["dataset_class"] = -1


@dataclass
@@ -75,6 +74,7 @@ class ConfigProblemBase(DefaultConfigProblemBase):
experiment_name: str = field(default_factory=generate_experiment_name)
_parent_experiment: str = ""
llm_backbone: str = "t5-small"
type: str = "seq2seq"

dataset: ConfigNLPSeq2SeqDataset = field(default_factory=ConfigNLPSeq2SeqDataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
15 changes: 14 additions & 1 deletion llm_studio/src/datasets/text_causal_language_modeling_ds.py
@@ -31,6 +31,9 @@ def __init__(self, df: pd.DataFrame, cfg: Any, mode: str = "train"):
self.tokenizer = get_tokenizer(self.cfg)
self.conversation_chain_handler = ConversationChainHandler(self.df, cfg)

if cfg.type == "causal_classification":
self.answers_int = df[cfg.dataset.answer_column].astype(int).values.tolist()

def __len__(self) -> int:
return len(self.conversation_chain_handler)

@@ -107,6 +110,10 @@ def __getitem__(self, idx: int) -> Dict:
sample["labels"][: len(system_encoding)] = -100
if sample["prompt_input_ids"][0] != self.tokenizer.pad_token_id:
sample["prompt_input_ids"][: len(system_encoding)] = system_encoding

if self.cfg.type == "causal_classification":
sample["class_label"] = self.answers_int[idx]

return sample

@staticmethod
@@ -254,7 +261,10 @@ def clean_output(
return output

def postprocess_output(self, cfg, df: pd.DataFrame, output: Dict) -> Dict:
if not cfg.prediction.metric == "Perplexity":
if (
not cfg.prediction.metric == "Perplexity"
and not cfg.type == "causal_classification"
):
output = self.clean_output(output, cfg)

output["target_text"] = self.conversation_chain_handler.answers
@@ -297,6 +307,9 @@ def format_output(
if "predicted_text" in output.keys():
output["predicted_text"] = np.array(output["predicted_text"])

if "logits" in output.keys():
output["logits"] = np.array(output["logits"].float())

if isinstance(cfg.dataset.prompt_column, tuple):
for col in cfg.dataset.prompt_column:
output[col] = df.loc[end_conversation_ids, col].values
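For reference, the label handling added above reduces to a single cast-and-index step; a tiny standalone sketch mirroring it (the "label" column name is just the config default):

import pandas as pd

# Mirrors self.answers_int in the dataset __init__ above: the answer column is
# cast to int once, then indexed per sample as sample["class_label"].
df = pd.DataFrame({"instruction": ["great movie", "terrible movie"], "label": ["1", "0"]})
answers_int = df["label"].astype(int).values.tolist()
assert answers_int == [1, 0]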
@@ -0,0 +1,53 @@
import logging
from typing import Any, KeysView

from torch import nn

__all__ = ["Losses"]


logger = logging.getLogger(__name__)


class CrossEntropyLoss(nn.Module):
def __init__(self, cfg: Any):
super().__init__()
self.cfg = cfg
self.loss_fn = nn.CrossEntropyLoss()

def forward(self, logits, labels):
return self.loss_fn(logits, labels.reshape(-1).long())


class BinaryCrossEntropyLoss(nn.Module):
def __init__(self, cfg: Any):
super().__init__()
self.cfg = cfg
self.loss_fn = nn.BCEWithLogitsLoss()

def forward(self, logits, labels):
return self.loss_fn(logits, labels)


class Losses:
"""Losses factory."""

_losses = {
"CrossEntropyLoss": CrossEntropyLoss,
"BinaryCrossEntropyLoss": BinaryCrossEntropyLoss,
}

@classmethod
def names(cls) -> KeysView:
return cls._losses.keys()

@classmethod
def get(cls, name: str) -> Any:
"""Access to Losses.

Args:
name: losses name
Returns:
A class to build the Losses
"""
return cls._losses.get(name, CrossEntropyLoss)
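A short usage sketch of this factory; cfg is only stored by these loss classes, so None is passed here purely for illustration:

import torch

from llm_studio.src.losses.text_causal_classification_modeling_losses import Losses

loss_fn = Losses.get("BinaryCrossEntropyLoss")(cfg=None)

logits = torch.tensor([[0.2], [-1.3]])  # one logit per sample (num_classes=1)
labels = torch.tensor([[1.0], [0.0]])   # float targets for BCEWithLogitsLoss
print(loss_fn(logits, labels))          # scalar loss tensor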