@@ -1 +1,3 @@
The column in the dataset containing the expected output.

For classification, this needs to be an integer column containing the class label.
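As an illustrative aside (the column and file names below are made up, not mandated by this PR), a dataset satisfying this requirement could be prepared like so:

import pandas as pd

# Illustrative classification dataset: the answer column ("label" here) holds
# integer class labels, as the tooltip above requires.
df = pd.DataFrame(
    {
        "instruction": ["Classify the sentiment.", "Classify the sentiment."],
        "input": ["The movie was great!", "Worst purchase I ever made."],
        "label": [1, 0],  # integers, e.g. 0/1 for a binary task
    }
)
df.to_csv("classification_train.csv", index=False)  # csv, pq, and parquet are accepted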
1 change: 1 addition & 0 deletions documentation/docs/tooltips/experiments/_num-classes.mdx
@@ -0,0 +1 @@
The number of possible classes for the classification task. For binary classification, a single class should be selected (i.e., set this value to 1).
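A minimal sketch of the same idea against the new dataset config added in this PR; the import path is inferred from the sibling config modules (the new file's name is not shown below) and the values are illustrative:

# Import path is an assumption based on the other python_configs modules.
from llm_studio.python_configs.text_causal_classification_modeling_config import (
    ConfigNLPCausalClassificationDataset,
)

# Binary task: a single class, i.e. one output logit (pairs with BinaryCrossEntropyLoss).
binary_dataset_cfg = ConfigNLPCausalClassificationDataset(num_classes=1)

# Multi-class task: one logit per class (pairs with CrossEntropyLoss).
multiclass_dataset_cfg = ConfigNLPCausalClassificationDataset(num_classes=4)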
6 changes: 5 additions & 1 deletion documentation/docs/tooltips/experiments/_problem-type.mdx
@@ -2,4 +2,8 @@ Defines the problem type of the experiment, which also defines the settings H2O

- Causal Language Modeling: Used to fine-tune large language models

- Sequence To Sequence Modeling: Used to fine-tune large sequence to sequence models
- Rlhf Language Modeling: Used to fine-tune large language models using RLHF (Reinforcement Learning from Human Feedback)

- Sequence To Sequence Modeling: Used to fine-tune large sequence to sequence models

- Causal Classification Modeling: Used to fine-tune causal language models for classification tasks
1 change: 1 addition & 0 deletions llm_studio/app_utils/config.py
@@ -61,6 +61,7 @@ def get_size(x):
"text_causal_language_modeling_config",
"text_rlhf_language_modeling_config",
"text_sequence_to_sequence_modeling_config",
"text_causal_classification_modeling_config",
],
"problem_categories": ["text"],
"dataset_keys": [
13 changes: 12 additions & 1 deletion llm_studio/app_utils/hugging_face_utils.py
@@ -122,8 +122,19 @@ def publish_model_to_hugging_face(
repo_id=repo_id, repo_type="model", commit_message="Upload model card"
)

# push config to hub
api = huggingface_hub.HfApi()

# push classification head to hub
if cfg.type == "causal_classification":
api.upload_file(
path_or_fileobj=f"{path_to_experiment}/classification_head.pth",
path_in_repo="classification_head.pth",
repo_id=repo_id,
repo_type="model",
commit_message="Upload classification_head.pth",
)

# push config to hub
api.upload_file(
path_or_fileobj=os.path.join(path_to_experiment, "cfg.yaml"),
path_in_repo="cfg.yaml",
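As a hedged aside, a consumer of such a repo could later fetch the uploaded head with the standard huggingface_hub download API; the repo id below is hypothetical, and the file is assumed to contain a serialized state dict:

import torch
from huggingface_hub import hf_hub_download

repo_id = "my-org/my-causal-classification-model"  # hypothetical repo id

head_path = hf_hub_download(repo_id=repo_id, filename="classification_head.pth")
head_state = torch.load(head_path, map_location="cpu")  # assumed to be a state dict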
@@ -0,0 +1,186 @@
import os
from dataclasses import dataclass, field
from typing import Any, Tuple

from llm_studio.python_configs.base import DefaultConfigProblemBase
from llm_studio.python_configs.text_causal_language_modeling_config import (
ConfigNLPAugmentation,
ConfigNLPCausalLMArchitecture,
ConfigNLPCausalLMDataset,
ConfigNLPCausalLMEnvironment,
ConfigNLPCausalLMLogging,
ConfigNLPCausalLMPrediction,
ConfigNLPCausalLMTokenizer,
ConfigNLPCausalLMTraining,
)
from llm_studio.src import possible_values
from llm_studio.src.losses import text_causal_classification_modeling_losses
from llm_studio.src.metrics import text_causal_classification_modeling_metrics
from llm_studio.src.models import text_causal_classification_modeling_model
from llm_studio.src.utils.modeling_utils import generate_experiment_name


@dataclass
class ConfigNLPCausalClassificationDataset(ConfigNLPCausalLMDataset):
system_column: str = "None"
prompt_column: Tuple[str, ...] = ("instruction", "input")
answer_column: str = "label"
num_classes: int = 1
parent_id_column: str = "None"

text_system_start: str = ""
text_prompt_start: str = ""
text_answer_separator: str = ""

add_eos_token_to_system: bool = False
add_eos_token_to_prompt: bool = False
add_eos_token_to_answer: bool = False

_allowed_file_extensions: Tuple[str, ...] = ("csv", "pq", "parquet")

def __post_init__(self):
self.prompt_column = (
tuple(
self.prompt_column,
)
if isinstance(self.prompt_column, str)
else tuple(self.prompt_column)
)
super().__post_init__()

self._possible_values["num_classes"] = (1, 100, 1)

self._visibility["personalize"] = -1
self._visibility["chatbot_name"] = -1
self._visibility["chatbot_author"] = -1
self._visibility["mask_prompt_labels"] = -1
self._visibility["add_eos_token_to_answer"] = -1


@dataclass
class ConfigNLPCausalClassificationTraining(ConfigNLPCausalLMTraining):
loss_class: Any = text_causal_classification_modeling_losses.Losses
loss_function: str = "BinaryCrossEntropyLoss"

learning_rate: float = 0.0001
differential_learning_rate_layers: Tuple[str, ...] = ("classification_head",)
differential_learning_rate: float = 0.00001

def __post_init__(self):
super().__post_init__()
self._possible_values["loss_function"] = self.loss_class.names()

self._possible_values[
"differential_learning_rate_layers"
] = possible_values.String(
values=("backbone", "embed", "classification_head"),
allow_custom=False,
placeholder="Select optional layers...",
)


@dataclass
class ConfigNLPCausalClassificationTokenizer(ConfigNLPCausalLMTokenizer):
max_length_prompt: int = 512
max_length: int = 512

def __post_init__(self):
super().__post_init__()

self._visibility["max_length_answer"] = -1


@dataclass
class ConfigNLPCausalClassificationArchitecture(ConfigNLPCausalLMArchitecture):
model_class: Any = text_causal_classification_modeling_model.Model

def __post_init__(self):
super().__post_init__()


@dataclass
class ConfigNLPCausalClassificationPrediction(ConfigNLPCausalLMPrediction):
metric_class: Any = text_causal_classification_modeling_metrics.Metrics
metric: str = "AUC"

def __post_init__(self):
super().__post_init__()
self._possible_values["metric"] = self.metric_class.names()

for k in [
"min_length_inference",
"max_length_inference",
"do_sample",
"num_beams",
"temperature",
"repetition_penalty",
"stop_tokens",
"top_k",
"top_p",
]:
self._visibility[k] = -1


@dataclass
class ConfigNLPCausalClassificationEnvironment(ConfigNLPCausalLMEnvironment):
_model_card_template: str = "text_causal_classification_model_card_template.md"
_summary_card_template: str = (
"text_causal_classification_experiment_summary_card_template.md"
)

def __post_init__(self):
super().__post_init__()


@dataclass
class ConfigProblemBase(DefaultConfigProblemBase):
output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
experiment_name: str = field(default_factory=generate_experiment_name)
_parent_experiment: str = ""
llm_backbone: str = "h2oai/h2ogpt-4096-llama2-7b"
type: str = "causal_classification"

dataset: ConfigNLPCausalClassificationDataset = field(
default_factory=ConfigNLPCausalClassificationDataset
)
tokenizer: ConfigNLPCausalClassificationTokenizer = field(
default_factory=ConfigNLPCausalClassificationTokenizer
)
architecture: ConfigNLPCausalClassificationArchitecture = field(
default_factory=ConfigNLPCausalClassificationArchitecture
)
training: ConfigNLPCausalClassificationTraining = field(
default_factory=ConfigNLPCausalClassificationTraining
)
augmentation: ConfigNLPAugmentation = field(default_factory=ConfigNLPAugmentation)
prediction: ConfigNLPCausalClassificationPrediction = field(
default_factory=ConfigNLPCausalClassificationPrediction
)
environment: ConfigNLPCausalClassificationEnvironment = field(
default_factory=ConfigNLPCausalClassificationEnvironment
)
logging: ConfigNLPCausalLMLogging = field(default_factory=ConfigNLPCausalLMLogging)

def __post_init__(self):
super().__post_init__()

self._visibility["output_directory"] = -1

self._possible_values["llm_backbone"] = possible_values.String(
values=(
"h2oai/h2ogpt-4096-llama2-70b",
"h2oai/h2ogpt-4096-llama2-70b-chat",
"h2oai/h2ogpt-4096-llama2-13b",
"h2oai/h2ogpt-4096-llama2-13b-chat",
"h2oai/h2ogpt-4096-llama2-7b",
"h2oai/h2ogpt-4096-llama2-7b-chat",
"tiiuae/falcon-40b",
"tiiuae/falcon-7b",
"openlm-research/open_llama_13b",
"openlm-research/open_llama_7b",
"openlm-research/open_llama_3b",
"EleutherAI/gpt-j-6B",
"facebook/opt-125m",
),
allow_custom=True,
)
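A brief usage sketch of the new problem config; the import path is inferred from the sibling config files (the new file's name is not shown above) and the attribute values are illustrative:

# Import path is an assumption based on the other python_configs modules.
from llm_studio.python_configs.text_causal_classification_modeling_config import (
    ConfigProblemBase,
)

cfg = ConfigProblemBase(llm_backbone="h2oai/h2ogpt-4096-llama2-7b")
cfg.dataset.num_classes = 3
cfg.training.loss_function = "CrossEntropyLoss"  # one of cfg.training.loss_class.names()

print(cfg.type)                   # "causal_classification"
print(cfg.dataset.answer_column)  # "label" by default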
@@ -407,6 +407,7 @@ class ConfigProblemBase(DefaultConfigProblemBase):
experiment_name: str = field(default_factory=generate_experiment_name)
_parent_experiment: str = ""
llm_backbone: str = "h2oai/h2ogpt-4096-llama2-7b"
type: str = "causal_lm"

dataset: ConfigNLPCausalLMDataset = field(default_factory=ConfigNLPCausalLMDataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
@@ -178,6 +178,7 @@ class ConfigProblemBase(DefaultConfigProblemBase):
_parent_experiment: str = ""
llm_backbone: str = "h2oai/h2ogpt-4096-llama2-7b-chat"
reward_model: str = "OpenAssistant/reward-model-deberta-v3-large-v2"
type: str = "rlhf"

dataset: ConfigRLHFLMDataset = field(default_factory=ConfigRLHFLMDataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
@@ -42,7 +42,6 @@ def __post_init__(self):

self._visibility["limit_chained_samples"] = -1
self._visibility["mask_prompt_labels"] = -1
self._visibility["dataset_class"] = -1


@dataclass
@@ -75,6 +74,7 @@ class ConfigProblemBase(DefaultConfigProblemBase):
experiment_name: str = field(default_factory=generate_experiment_name)
_parent_experiment: str = ""
llm_backbone: str = "t5-small"
type: str = "seq2seq"

dataset: ConfigNLPSeq2SeqDataset = field(default_factory=ConfigNLPSeq2SeqDataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
15 changes: 14 additions & 1 deletion llm_studio/src/datasets/text_causal_language_modeling_ds.py
@@ -31,6 +31,9 @@ def __init__(self, df: pd.DataFrame, cfg: Any, mode: str = "train"):
self.tokenizer = get_tokenizer(self.cfg)
self.conversation_chain_handler = ConversationChainHandler(self.df, cfg)

if cfg.type == "causal_classification":
self.answers_int = df[cfg.dataset.answer_column].astype(int).values.tolist()

def __len__(self) -> int:
return len(self.conversation_chain_handler)

@@ -107,6 +110,10 @@ def __getitem__(self, idx: int) -> Dict:
sample["labels"][: len(system_encoding)] = -100
if sample["prompt_input_ids"][0] != self.tokenizer.pad_token_id:
sample["prompt_input_ids"][: len(system_encoding)] = system_encoding

if self.cfg.type == "causal_classification":
sample["class_label"] = self.answers_int[idx]

return sample

@staticmethod
@@ -254,7 +261,10 @@ def clean_output(
return output

def postprocess_output(self, cfg, df: pd.DataFrame, output: Dict) -> Dict:
if not cfg.prediction.metric == "Perplexity":
if (
not cfg.prediction.metric == "Perplexity"
and not cfg.type == "causal_classification"
):
output = self.clean_output(output, cfg)

output["target_text"] = self.conversation_chain_handler.answers
@@ -297,6 +307,9 @@ def format_output(
if "predicted_text" in output.keys():
output["predicted_text"] = np.array(output["predicted_text"])

if "logits" in output.keys():
output["logits"] = np.array(output["logits"].float())

if isinstance(cfg.dataset.prompt_column, tuple):
for col in cfg.dataset.prompt_column:
output[col] = df.loc[end_conversation_ids, col].values
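For reference, the label handling added above reduces to a single cast-and-index step; a tiny standalone sketch mirroring it (the "label" column name is just the config default):

import pandas as pd

# Mirrors self.answers_int in the dataset __init__ above: the answer column is
# cast to int once, then indexed per sample as sample["class_label"].
df = pd.DataFrame({"instruction": ["great movie", "terrible movie"], "label": ["1", "0"]})
answers_int = df["label"].astype(int).values.tolist()
assert answers_int == [1, 0]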
@@ -0,0 +1,53 @@
import logging
from typing import Any, KeysView

from torch import nn

__all__ = ["Losses"]


logger = logging.getLogger(__name__)


class CrossEntropyLoss(nn.Module):
def __init__(self, cfg: Any):
super().__init__()
self.cfg = cfg
self.loss_fn = nn.CrossEntropyLoss()

def forward(self, logits, labels):
return self.loss_fn(logits, labels.reshape(-1).long())


class BinaryCrossEntropyLoss(nn.Module):
def __init__(self, cfg: Any):
super().__init__()
self.cfg = cfg
self.loss_fn = nn.BCEWithLogitsLoss()

def forward(self, logits, labels):
return self.loss_fn(logits, labels)


class Losses:
"""Losses factory."""

_losses = {
"CrossEntropyLoss": CrossEntropyLoss,
"BinaryCrossEntropyLoss": BinaryCrossEntropyLoss,
}

@classmethod
def names(cls) -> KeysView:
return cls._losses.keys()

@classmethod
def get(cls, name: str) -> Any:
"""Access to Losses.

Args:
name: losses name
Returns:
A class to build the Losses
"""
return cls._losses.get(name, CrossEntropyLoss)
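A short usage sketch of this factory; cfg is only stored by these loss classes, so None is passed here purely for illustration:

import torch

from llm_studio.src.losses.text_causal_classification_modeling_losses import Losses

loss_fn = Losses.get("BinaryCrossEntropyLoss")(cfg=None)

logits = torch.tensor([[0.2], [-1.3]])  # one logit per sample (num_classes=1)
labels = torch.tensor([[1.0], [0.0]])   # float targets for BCEWithLogitsLoss
print(loss_fn(logits, labels))          # scalar loss tensor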