googleapis
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎bigframes/ml/core.py
Lines changed: 6 additions & 0 deletions b/‎bigframes/ml/core.py
Lines changed: 6 additions & 0 deletions
diff --git a/‎bigframes/ml/decomposition.py
Lines changed: 166 additions & 1 deletion b/‎bigframes/ml/decomposition.py
Lines changed: 166 additions & 1 deletion
diff --git a/‎bigframes/ml/loader.py
Lines changed: 2 additions & 0 deletions b/‎bigframes/ml/loader.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎bigframes/ml/sql.py
Lines changed: 5 additions & 0 deletions b/‎bigframes/ml/sql.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎owlbot.py
Lines changed: 7 additions & 0 deletions b/‎owlbot.py
Lines changed: 7 additions & 0 deletions
diff --git a/‎samples/snippets/create_multiple_timeseries_forecasting_model_test.py
Lines changed: 70 additions & 5 deletions b/‎samples/snippets/create_multiple_timeseries_forecasting_model_test.py
Lines changed: 70 additions & 5 deletions
diff --git a/‎scratch/.gitignore
Lines changed: 2 additions & 0 deletions b/‎scratch/.gitignore
Lines changed: 2 additions & 0 deletions
diff --git a/‎tests/data/ratings.jsonl
Lines changed: 20 additions & 0 deletions b/‎tests/data/ratings.jsonl
Lines changed: 20 additions & 0 deletions
diff --git a/‎tests/data/ratings_schema.json
Lines changed: 17 additions & 0 deletions b/‎tests/data/ratings_schema.json
Lines changed: 17 additions & 0 deletions
@@ -60,5 +60,6 @@ coverage.xml
 system_tests/local_test_setup
 
 # Make sure a generated file isn't accidentally committed.
+demo.ipynb
 pylintrc
 pylintrc.test
@@ -117,6 +117,12 @@ def model(self) -> bigquery.Model:
         """Get the BQML model associated with this wrapper"""
         return self._model
 
+    def recommend(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
+        """Apply the SQL generator's ``ml_recommend`` builder to ``input_data``.
+
+        Args:
+            input_data (bpd.DataFrame):
+                Rows forwarded unchanged to ``_apply_ml_tvf``.
+
+        Returns:
+            bpd.DataFrame: Whatever ``_apply_ml_tvf`` returns for this input.
+        """
+        # The builder is passed uncalled; it is not invoked in this method.
+        sql_builder = self._model_manipulation_sql_generator.ml_recommend
+        return self._apply_ml_tvf(input_data, sql_builder)
+
     def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
         return self._apply_ml_tvf(
             input_data,
 
@@ -19,6 +19,7 @@
 
 from typing import List, Literal, Optional, Union
 
+import bigframes_vendored.sklearn.decomposition._mf
 import bigframes_vendored.sklearn.decomposition._pca
 from google.cloud import bigquery
 
@@ -27,7 +28,15 @@
 import bigframes.pandas as bpd
 import bigframes.session
 
-_BQML_PARAMS_MAPPING = {"svd_solver": "pcaSolver"}
+# Maps Python-side parameter/attribute names (keys) to camelCase names (values).
+# It is passed to ``utils.retrieve_params_from_bq_model`` when an estimator is
+# rebuilt from an existing BigQuery model; presumably the values are the option
+# names BigQuery reports -- TODO confirm against that helper.
+_BQML_PARAMS_MAPPING = {
+    "svd_solver": "pcaSolver",
+    "feedback_type": "feedbackType",
+    "num_factors": "numFactors",
+    "user_col": "userColumn",
+    "item_col": "itemColumn",
+    "_input_label_columns": "inputLabelColumns",
+    "l2_reg": "l2Regularization",
+}
 
 
 @log_adapter.class_logger
@@ -197,3 +206,159 @@ def score(
 
         # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
         return self._bqml_model.evaluate()
+
+
+@log_adapter.class_logger
+class MatrixFactorization(
+    base.UnsupervisedTrainablePredictor,
+    bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization,
+):
+    # The class docstring is taken as-is from the vendored class.
+    __doc__ = bigframes_vendored.sklearn.decomposition._mf.MatrixFactorization.__doc__
+
+    def __init__(
+        self,
+        *,
+        feedback_type: Literal["explicit", "implicit"] = "explicit",
+        num_factors: int,
+        user_col: str,
+        item_col: str,
+        rating_col: str = "rating",
+        # TODO: Add support for hyperparameter tuning.
+        l2_reg: float = 1.0,
+    ):
+        """Validate the hyperparameters and store them on the estimator.
+
+        Args:
+            feedback_type (str, default "explicit"):
+                Either "explicit" or "implicit"; compared case-insensitively.
+            num_factors (int):
+                Must be a positive integer. Sent to BQML as ``num_factors``.
+            user_col (str):
+                Sent to BQML as ``user_col``.
+            item_col (str):
+                Sent to BQML as ``item_col``.
+            rating_col (str, default "rating"):
+                Sent to BQML as ``rating_col``.
+            l2_reg (float or int, default 1.0):
+                Sent to BQML as ``l2_reg``.
+
+        Raises:
+            TypeError: If an argument is not of the expected type.
+            ValueError: If ``feedback_type`` is not a supported value, or
+                ``num_factors`` is not positive.
+        """
+        # Check the type first so a non-str value gets the same kind of error
+        # as the other parameters instead of an AttributeError from ``.lower()``.
+        if not isinstance(feedback_type, str):
+            raise TypeError(
+                f"Expected feedback_type to be a str, but got {type(feedback_type)}."
+            )
+
+        feedback_type = feedback_type.lower()  # type: ignore
+        if feedback_type not in ("explicit", "implicit"):
+            raise ValueError("Expected feedback_type to be `explicit` or `implicit`.")
+
+        self.feedback_type = feedback_type
+
+        if not isinstance(num_factors, int):
+            raise TypeError(
+                f"Expected num_factors to be an int, but got {type(num_factors)}."
+            )
+
+        # Zero is rejected as well: the error message promises a positive integer.
+        if num_factors <= 0:
+            raise ValueError(
+                f"Expected num_factors to be a positive integer, but got {num_factors}."
+            )
+
+        self.num_factors = num_factors
+
+        if not isinstance(user_col, str):
+            raise TypeError(f"Expected user_col to be a str, but got {type(user_col)}.")
+
+        self.user_col = user_col
+
+        if not isinstance(item_col, str):
+            raise TypeError(f"Expected item_col to be a str, but got {type(item_col)}.")
+
+        self.item_col = item_col
+
+        if not isinstance(rating_col, str):
+            raise TypeError(
+                f"Expected rating_col to be a str, but got {type(rating_col)}."
+            )
+
+        # Stored as a one-element list; the ``rating_col`` property reads it back.
+        self._input_label_columns = [rating_col]
+
+        if not isinstance(l2_reg, (float, int)):
+            raise TypeError(
+                f"Expected l2_reg to be a float or int, but got {type(l2_reg)}."
+            )
+
+        self.l2_reg = l2_reg
+        # No BQML model exists until ``_fit`` or ``_from_bq`` sets one.
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+
+    @property
+    def rating_col(self) -> str:
+        """str: The rating column name. Defaults to 'rating'."""
+        return self._input_label_columns[0]
+
+    @classmethod
+    def _from_bq(
+        cls, session: bigframes.session.Session, bq_model: bigquery.Model
+    ) -> MatrixFactorization:
+        """Build an estimator that wraps an existing BigQuery model.
+
+        Args:
+            session (bigframes.session.Session):
+                Session used to create the ``core.BqmlModel`` wrapper.
+            bq_model (bigquery.Model):
+                Model whose ``model_type`` must be "MATRIX_FACTORIZATION".
+
+        Returns:
+            MatrixFactorization: Estimator whose ``_bqml_model`` is already set.
+        """
+        assert bq_model.model_type == "MATRIX_FACTORIZATION"
+
+        kwargs = utils.retrieve_params_from_bq_model(
+            cls, bq_model, _BQML_PARAMS_MAPPING
+        )
+
+        model = cls(**kwargs)
+        model._bqml_model = core.BqmlModel(session, bq_model)
+        return model
+
+    @property
+    def _bqml_options(self) -> dict:
+        """The model options as they will be set for BQML"""
+        options: dict = {
+            "model_type": "matrix_factorization",
+            "feedback_type": self.feedback_type,
+            "user_col": self.user_col,
+            "item_col": self.item_col,
+            "rating_col": self.rating_col,
+            "l2_reg": self.l2_reg,
+        }
+
+        # Only emitted when set, in case the attribute was cleared after init.
+        if self.num_factors is not None:
+            options["num_factors"] = self.num_factors
+
+        return options
+
+    def _fit(
+        self,
+        X: utils.ArrayType,
+        y=None,
+        transforms: Optional[List[str]] = None,
+    ) -> MatrixFactorization:
+        """Create the BQML model from ``X`` using the current options.
+
+        Args:
+            X (utils.ArrayType):
+                Training data; converted to a DataFrame before model creation.
+            y:
+                Must be ``None``; a label column is not supported.
+            transforms (Optional[List[str]]):
+                Forwarded unchanged to the model factory's ``create_model``.
+
+        Returns:
+            MatrixFactorization: This estimator, with ``_bqml_model`` set.
+
+        Raises:
+            ValueError: If ``y`` is not ``None``.
+        """
+        if y is not None:
+            raise ValueError(
+                "Label column not supported for Matrix Factorization model but y was not `None`"
+            )
+
+        (X,) = utils.batch_convert_to_dataframe(X)
+
+        self._bqml_model = self._bqml_model_factory.create_model(
+            X_train=X,
+            transforms=transforms,
+            options=self._bqml_options,
+        )
+        return self
+
+    def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
+        """Run the fitted model's ``recommend`` on ``X``.
+
+        Args:
+            X (utils.ArrayType):
+                Input converted to a DataFrame in the fitted model's session.
+
+        Returns:
+            bpd.DataFrame: Result of the BQML model's ``recommend`` call.
+
+        Raises:
+            RuntimeError: If no model has been fitted or loaded yet.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before recommend")
+
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
+
+        return self._bqml_model.recommend(X)
+
+    def to_gbq(self, model_name: str, replace: bool = False) -> MatrixFactorization:
+        """Save the model to BigQuery.
+
+        Args:
+            model_name (str):
+                The name of the model.
+            replace (bool, default False):
+                Determine whether to replace if the model already exists. Default to False.
+
+        Returns:
+            MatrixFactorization: Saved model."""
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before it can be saved")
+
+        new_model = self._bqml_model.copy(model_name, replace)
+        return new_model.session.read_gbq_model(model_name)
+
+    def score(
+        self,
+        X=None,
+        y=None,
+    ) -> bpd.DataFrame:
+        """Return the fitted model's evaluation result.
+
+        Args:
+            X:
+                Accepted but currently ignored (see the TODO below).
+            y:
+                Accepted but not used.
+
+        Returns:
+            bpd.DataFrame: Result of the BQML model's ``evaluate`` call.
+
+        Raises:
+            RuntimeError: If no model has been fitted or loaded yet.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
+        return self._bqml_model.evaluate()
@@ -42,6 +42,7 @@
         "LINEAR_REGRESSION": linear_model.LinearRegression,
         "LOGISTIC_REGRESSION": linear_model.LogisticRegression,
         "KMEANS": cluster.KMeans,
+        "MATRIX_FACTORIZATION": decomposition.MatrixFactorization,
         "PCA": decomposition.PCA,
         "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor,
         "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier,
@@ -80,6 +81,7 @@
 def from_bq(
     session: bigframes.session.Session, bq_model: bigquery.Model
 ) -> Union[
+    decomposition.MatrixFactorization,
     decomposition.PCA,
     cluster.KMeans,
     linear_model.LinearRegression,
 
@@ -299,6 +299,11 @@ def alter_model(
         return "\n".join(parts)
 
     # ML prediction TVFs
+    def ml_recommend(self, source_sql: str) -> str:
+        """Encode ML.RECOMMEND for BQML
+
+        Args:
+            source_sql (str):
+                Query placed, parenthesized, as the second argument of ML.RECOMMEND.
+
+        Returns:
+            str: ``SELECT * FROM ML.RECOMMEND(MODEL <model ref>, (<source_sql>))``.
+        """
+        model_ref = self._model_ref_sql()
+        return f"""SELECT * FROM ML.RECOMMEND(MODEL {model_ref},
+  ({source_sql}))"""
+
     def ml_predict(self, source_sql: str) -> str:
         """Encode ML.PREDICT for BQML"""
         return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()},
 
@@ -64,6 +64,13 @@
 # Fixup files
 # ----------------------------------------------------------------------------
 
+# Add demo.ipynb (scratch space for experimentation) to .gitignore, right
+# after the "generated file" comment line that the pattern below matches.
+# The assert requires ``s.replace`` to return exactly 1.
+assert 1 == s.replace(
+    [".gitignore"],
+    re.escape("# Make sure a generated file isn't accidentally committed.\n"),
+    "# Make sure a generated file isn't accidentally committed.\ndemo.ipynb\n",
+)
+
 # Encourage sharring all relevant versions in bug reports.
 assert 1 == s.replace(  # bug_report.md
     [".github/ISSUE_TEMPLATE/bug_report.md"],
 
@@ -73,26 +73,91 @@ def test_multiple_timeseries_forecasting_model(random_model_id: str) -> None:
     from bigframes.ml import forecasting
     import bigframes.pandas as bpd
 
+    model = forecasting.ARIMAPlus(
+        # To reduce the query runtime with the compromise of a potential slight
+        # drop in model quality, you could decrease the value of the
+        # auto_arima_max_order. This shrinks the search space of hyperparameter
+        # tuning in the auto.ARIMA algorithm.
+        auto_arima_max_order=5,
+    )
+
     df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips")
 
+    # This query creates twelve time series models, one for each of the twelve
+    # Citi Bike start stations in the input data. If you remove this row
+    # filter, there would be 600+ time series to forecast.
+    df = df[df["start_station_name"].str.contains("Central Park")]
+
     features = bpd.DataFrame(
         {
-            "num_trips": df.starttime,
+            "start_station_name": df["start_station_name"],
+            "num_trips": df["starttime"],
             "date": df["starttime"].dt.date,
         }
     )
-    num_trips = features.groupby(["date"], as_index=False).count()
-    model = forecasting.ARIMAPlus()
+    num_trips = features.groupby(
+        ["start_station_name", "date"],
+        as_index=False,
+    ).count()
 
     X = num_trips["date"].to_frame()
     y = num_trips["num_trips"].to_frame()
 
-    model.fit(X, y)
+    model.fit(
+        X,
+        y,
+        # The input data that you want to get forecasts for,
+        # in this case the Citi Bike station, as represented by the
+        # start_station_name column.
+        id_col=num_trips["start_station_name"].to_frame(),
+    )
+
     # The model.fit() call above created a temporary model.
     # Use the to_gbq() method to write to a permanent location.
-
     model.to_gbq(
         your_model_id,  # For example: "bqml_tutorial.nyc_citibike_arima_model",
         replace=True,
     )
     # [END bigquery_dataframes_bqml_arima_multiple_step_3_fit]
+
+    # [START bigquery_dataframes_bqml_arima_multiple_step_4_evaluate]
+    # Evaluate the time series models by using the summary() function. The summary()
+    # function shows you the evaluation metrics of all the candidate models evaluated
+    # during the process of automatic hyperparameter tuning.
+    summary = model.summary()
+    print(summary.peek())
+
+    # Expected output:
+    #    start_station_name                  non_seasonal_p  non_seasonal_d   non_seasonal_q  has_drift  log_likelihood           AIC     variance ...
+    # 1         Central Park West & W 72 St               0               1                5      False    -1966.449243   3944.898487  1215.689281 ...
+    # 8            Central Park W & W 96 St               0               0                5      False     -274.459923    562.919847   655.776577 ...
+    # 9        Central Park West & W 102 St               0               0                0      False     -226.639918    457.279835    258.83582 ...
+    # 11        Central Park West & W 76 St               1               1                2      False    -1700.456924   3408.913848   383.254161 ...
+    # 4   Grand Army Plaza & Central Park S               0               1                5      False    -5507.553498  11027.106996   624.138741 ...
+    # [END bigquery_dataframes_bqml_arima_multiple_step_4_evaluate]
+
+    # [START bigquery_dataframes_bqml_arima_multiple_step_5_coefficients]
+    coef = model.coef_
+    print(coef.peek())
+
+    # Expected output:
+    #    start_station_name                                              ar_coefficients                                   ma_coefficients intercept_or_drift
+    # 5    Central Park West & W 68 St                                                [] [-0.41014089  0.21979212 -0.59854213 -0.251438...                0.0
+    # 6         Central Park S & 6 Ave                                                [] [-0.71488957 -0.36835772  0.61008532  0.183290...                0.0
+    # 0    Central Park West & W 85 St                                                [] [-0.39270166 -0.74494638  0.76432596  0.489146...                0.0
+    # 3    W 82 St & Central Park West                         [-0.50219511 -0.64820817]             [-0.20665325  0.67683137 -0.68108631]                0.0
+    # 11  W 106 St & Central Park West [-0.70442887 -0.66885553 -0.25030325 -0.34160669]                                                []                0.0
+    # [END bigquery_dataframes_bqml_arima_multiple_step_5_coefficients]
+
+    # [START bigquery_dataframes_bqml_arima_multiple_step_6_forecast]
+    prediction = model.predict(horizon=3, confidence_level=0.9)
+
+    print(prediction.peek())
+    # Expected output:
+    #            forecast_timestamp                             start_station_name  forecast_value  standard_error  confidence_level ...
+    # 4   2016-10-01 00:00:00+00:00                         Central Park S & 6 Ave      302.377201       32.572948               0.9 ...
+    # 14  2016-10-02 00:00:00+00:00  Central Park North & Adam Clayton Powell Blvd      263.917567       45.284082               0.9 ...
+    # 1   2016-09-25 00:00:00+00:00                    Central Park West & W 85 St      189.574706       39.874856               0.9 ...
+    # 20  2016-10-02 00:00:00+00:00                    Central Park West & W 72 St      175.474862       40.940794               0.9 ...
+    # 12  2016-10-01 00:00:00+00:00                   W 106 St & Central Park West        63.88163       18.088868               0.9 ...
+    # [END bigquery_dataframes_bqml_arima_multiple_step_6_forecast]
@@ -0,0 +1,2 @@
+# Ignore all files in this directory.
+*
@@ -0,0 +1,20 @@
+{"user_id": 1, "item_id": 2, "rating": 4.0}
+{"user_id": 1, "item_id": 5, "rating": 3.0}
+{"user_id": 2, "item_id": 1, "rating": 5.0}
+{"user_id": 2, "item_id": 3, "rating": 2.0}
+{"user_id": 3, "item_id": 4, "rating": 4.5}
+{"user_id": 3, "item_id": 7, "rating": 3.5}
+{"user_id": 4, "item_id": 2, "rating": 1.0}
+{"user_id": 4, "item_id": 8, "rating": 5.0}
+{"user_id": 5, "item_id": 3, "rating": 4.0}
+{"user_id": 5, "item_id": 9, "rating": 2.5}
+{"user_id": 6, "item_id": 1, "rating": 3.0}
+{"user_id": 6, "item_id": 6, "rating": 4.5}
+{"user_id": 7, "item_id": 5, "rating": 5.0}
+{"user_id": 7, "item_id": 10, "rating": 1.5}
+{"user_id": 8, "item_id": 4, "rating": 2.0}
+{"user_id": 8, "item_id": 7, "rating": 4.0}
+{"user_id": 9, "item_id": 2, "rating": 3.5}
+{"user_id": 9, "item_id": 9, "rating": 5.0}
+{"user_id": 10, "item_id": 3, "rating": 4.5}
+{"user_id": 10, "item_id": 8, "rating": 2.5}
@@ -0,0 +1,17 @@
+[
+    {
+      "mode": "NULLABLE",
+      "name": "user_id",
+      "type": "STRING"
+    },
+    {
+      "mode": "NULLABLE",
+      "name": "item_id",
+      "type": "INT64"
+    },
+    {
+      "mode": "NULLABLE",
+      "name": "rating",
+      "type": "FLOAT"
+    }
+]
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Ignore all files in this directory.`
	`2`	`+*`