Skip to content

Commit 14b9cc6

Browse files
author
briochh
committed
More checks to help catch misalignment at parameter-apply time
1 parent d4ec97d commit 14b9cc6

File tree

4 files changed

+48
-37
lines changed

4 files changed

+48
-37
lines changed

autotest/pst_from_tests.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,12 @@ def freyberg_prior_build_test(tmp_path):
555555
par_name_base="welflux_grid",
556556
zone_array=m.bas6.ibound.array,
557557
geostruct=geostruct, lower_bound=0.25, upper_bound=1.75)
558+
pf.add_parameters(filenames=well_mfiles,
559+
par_type="grid", index_cols=[0, 1, 2], use_cols=3,
560+
par_name_base="welflux_grid",
561+
zone_array=m.bas6.ibound.array,
562+
use_rows=(1, 3, 4),
563+
geostruct=geostruct, lower_bound=0.25, upper_bound=1.75)
558564
# global constant across all files
559565
pf.add_parameters(filenames=well_mfiles,
560566
par_type="constant",

pyemu/utils/geostats.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1383,7 +1383,7 @@ def _calc_factors_org(
13831383
if self.interp_data is None:
13841384
self.interp_data = df
13851385
else:
1386-
self.interp_data = self.interp_data.append(df)
1386+
self.interp_data = pd.concat([self.interp_data, df])
13871387
# correct for negative kriging factors, if requested
13881388
if remove_negative_factors == True:
13891389
self._remove_neg_factors()

pyemu/utils/helpers.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2047,11 +2047,12 @@ def _process_chunk_list_files(chunk, i, df):
20472047
print("process", i, " processed ", len(chunk), "process_list_file calls")
20482048

20492049

2050-
def _list_index_caster(x,add1):
2050+
def _list_index_caster(x, add1):
20512051
vals = []
20522052
for xx in x:
20532053
if xx:
2054-
if xx.strip().isdigit() or (xx.strip()[0] == '-' and xx.strip()[1:].isdigit()):
2054+
if (xx.strip().isdigit() or
2055+
(xx.strip()[0] == '-' and xx.strip()[1:].isdigit())):
20552056
vals.append(add1 + int(xx))
20562057
else:
20572058
try:
@@ -2061,11 +2062,12 @@ def _list_index_caster(x,add1):
20612062

20622063
return tuple(vals)
20632064

2064-
def _list_index_splitter_and_caster(x,add1):
2065-
return _list_index_caster(x.strip("()").replace('\'','').split(","),add1)
20662065

2067-
def _process_list_file(model_file, df):
2066+
def _list_index_splitter_and_caster(x, add1):
2067+
return _list_index_caster(x.strip("()").replace('\'', '').split(","), add1)
2068+
20682069

2070+
def _process_list_file(model_file, df):
20692071
# print("processing model file:", model_file)
20702072
df_mf = df.loc[df.model_file == model_file, :].copy()
20712073
# read data stored in org (mults act on this)
@@ -2110,7 +2112,7 @@ def _process_list_file(model_file, df):
21102112
# index_cols can be from header str
21112113
header = 0
21122114
hheader = True
2113-
elif isinstance(index_col_eg, int):
2115+
elif isinstance(index_col_eg, (int, np.integer)):
21142116
# index_cols are column numbers in input file
21152117
header = None
21162118
hheader = None
@@ -2169,14 +2171,13 @@ def _process_list_file(model_file, df):
21692171
common_idx = (
21702172
new_df.index.intersection(mlts.index).sort_values().drop_duplicates()
21712173
)
2172-
if common_idx.shape[0] == 0:
2173-
raise Exception("error: common_idx is empty")
21742174
mlt_cols = [str(col) for col in mlt.use_cols]
2175-
assert len(common_idx) * len(mlt_cols) == mlt.chkpar, (
2176-
"probable miss-alignment in tpl indices and original file:\n"
2177-
f"mult idx[:10] : {mlts.index.values.tolist()[:10]}\n"
2178-
f"org file idx[:10]: {new_df.index.value[:10]}\n"
2179-
f"n common: {len(common_idx)}, n cols: {len(mlt_cols)}"
2175+
assert len(common_idx) == mlt.chkpar, (
2176+
"Probable miss-alignment in tpl indices and original file:\n"
2177+
f"mult idx[:10] : {mlts.index.sort_values().tolist()[:10]}\n"
2178+
f"org file idx[:10]: {new_df.index.sort_values().to_list()[:10]}\n"
2179+
f"n common: {len(common_idx)}, n cols: {len(mlt_cols)}, "
2180+
f"expected: {mlt.chkpar}."
21802181
)
21812182
operator = mlt.operator
21822183
if operator == "*" or operator.lower()[0] == "m":

pyemu/utils/pst_from.py

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def parfile_relations(self):
219219
if x is not None
220220
else lb_max["lbound"]
221221
)
222-
pr["zero_based"] = self.zero_based
222+
pr["zero_based"] = self.zero_based # todo -- chase this out if going to file specific zero based def
223223
return pr
224224

225225
def _generic_get_xy(self, args, **kwargs):
@@ -968,7 +968,7 @@ def _par_prep(
968968
sep = " "
969969
if rel_filepath.suffix.lower() == ".csv":
970970
sep = ","
971-
if df.columns.is_integer():
971+
if pd.api.types.is_integer_dtype(df.columns): # df.columns.is_integer(): # really!???
972972
hheader = False
973973
else:
974974
hheader = df.columns
@@ -1912,7 +1912,9 @@ def add_parameters(
19121912
par_style = par_style[0]
19131913
if par_style not in ["m", "d", "a"]:
19141914
self.logger.lraise(
1915-
"add_parameters(): unrecognized 'style': {0}, should be either 'm'/'mult'/'multiplier', 'a'/'add'/'addend' or 'd'/'direct'".format(
1915+
"add_parameters(): unrecognized 'style': {0}, "
1916+
"should be either 'm'/'mult'/'multiplier', "
1917+
"'a'/'add'/'addend' or 'd'/'direct'".format(
19161918
par_style
19171919
)
19181920
)
@@ -2138,7 +2140,7 @@ def add_parameters(
21382140

21392141
pp_filename = None # setup placeholder variables
21402142
fac_filename = None
2141-
2143+
nxs = None
21422144
# Process model parameter files to produce appropriate pest pars
21432145
if index_cols is not None: # Assume list/tabular type input files
21442146
# ensure inputs are provided for all required cols
@@ -2167,7 +2169,7 @@ def add_parameters(
21672169
par_type.startswith("grid") or par_type.startswith("p")
21682170
) and geostruct is not None:
21692171
get_xy = self.get_xy
2170-
df = write_list_tpl(
2172+
df, nxs = write_list_tpl(
21712173
filenames,
21722174
dfs,
21732175
par_name_base,
@@ -2189,6 +2191,7 @@ def add_parameters(
21892191
fill_value=initial_value,
21902192
logger=self.logger,
21912193
)
2194+
nxs = {fname: nx for fname, nx in zip(filenames, nxs)}
21922195
assert (
21932196
np.mod(len(df), len(use_cols)) == 0.0
21942197
), "Parameter dataframe wrong shape for number of cols {0}" "".format(
@@ -2273,14 +2276,13 @@ def add_parameters(
22732276
structured = True
22742277
for mod_file, ar in file_dict.items():
22752278
orgdata = ar.shape
2276-
if spatial_reference_type=='vertex':
2279+
if spatial_reference_type == 'vertex':
22772280
assert orgdata[0] == spatial_reference.ncpl, (
22782281
"Spatial reference ncpl not equal to original data ncpl for\n"
22792282
+ os.path.join(
22802283
*os.path.split(self.original_file_d)[1:], mod_file
22812284
)
22822285
)
2283-
22842286
else:
22852287
assert orgdata[0] == spatial_reference.nrow, (
22862288
"Spatial reference nrow not equal to original data nrow for\n"
@@ -2643,7 +2645,7 @@ def add_parameters(
26432645
zone_filename = zone_filename.name
26442646

26452647
relate_parfiles = []
2646-
for mod_file in file_dict.keys():
2648+
for mod_file, pdf in file_dict.items():
26472649
mult_dict = {
26482650
"org_file": Path(self.original_file_d.name, mod_file.name),
26492651
"model_file": mod_file,
@@ -2655,8 +2657,9 @@ def add_parameters(
26552657
"upper_bound": ult_ubound,
26562658
"lower_bound": ult_lbound,
26572659
"operator": par_style,
2658-
"chkpar": len(df)
26592660
}
2661+
if nxs:
2662+
mult_dict["chkpar"] = nxs[mod_file]
26602663
if par_style in ["m", "a"]:
26612664
mult_dict["mlt_file"] = Path(self.mult_file_d.name, mlt_filename)
26622665

@@ -3094,7 +3097,7 @@ def write_list_tpl(
30943097
# get dataframe with autogenerated parnames based on `name`, `index_cols`,
30953098
# `use_cols`, `suffix` and `par_type`
30963099
if par_style == "d":
3097-
df_tpl = _write_direct_df_tpl(
3100+
df_tpl, nxs = _write_direct_df_tpl(
30983101
filenames[0],
30993102
tpl_filename,
31003103
dfs[0],
@@ -3130,8 +3133,8 @@ def write_list_tpl(
31303133
par_fill_value=fill_value,
31313134
par_style=par_style,
31323135
)
3133-
idxs = [df.loc[:, index_cols].values.tolist() for df in dfs]
3134-
use_rows = _get_use_rows(
3136+
idxs = [[tuple(s) for s in df.loc[:, index_cols].values] for df in dfs]
3137+
use_rows, nxs = _get_use_rows(
31353138
df_tpl, idxs, use_rows, zero_based, tpl_filename, logger=logger
31363139
)
31373140
df_tpl = df_tpl.loc[use_rows, :] # direct pars done in direct function
@@ -3227,7 +3230,7 @@ def write_list_tpl(
32273230
df_par.loc[:, "tpl_filename"] = tpl_filename
32283231
df_par.loc[:, "input_filename"] = input_filename
32293232
df_par.loc[:, "parval1"] = parval
3230-
return df_par
3233+
return df_par, nxs
32313234

32323235

32333236
def _write_direct_df_tpl(
@@ -3311,8 +3314,8 @@ def _write_direct_df_tpl(
33113314
init_df=df,
33123315
init_fname=in_filename,
33133316
)
3314-
idxs = df.loc[:, index_cols].values.tolist()
3315-
use_rows = _get_use_rows(
3317+
idxs = [tuple(s) for s in df.loc[:, index_cols].values]
3318+
use_rows, nxs = _get_use_rows(
33163319
df_ti, [idxs], use_rows, zero_based, tpl_filename, logger=logger
33173320
)
33183321
df_ti = df_ti.loc[use_rows]
@@ -3325,7 +3328,7 @@ def _write_direct_df_tpl(
33253328
pyemu.helpers._write_df_tpl(
33263329
tpl_filename, direct_tpl_df, index=False, header=header, headerlines=headerlines
33273330
)
3328-
return df_ti
3331+
return df_ti, nxs
33293332

33303333

33313334
def _get_use_rows(tpldf, idxcolvals, use_rows, zero_based, fnme, logger=None):
@@ -3345,19 +3348,23 @@ def _get_use_rows(tpldf, idxcolvals, use_rows, zero_based, fnme, logger=None):
33453348
"""
33463349
if use_rows is None:
33473350
use_rows = tpldf.index
3348-
return use_rows
3351+
nxs = [len(set(idx)) for idx in idxcolvals]
3352+
return use_rows, nxs
33493353
if np.ndim(use_rows) == 0:
33503354
use_rows = [use_rows]
33513355
if np.ndim(use_rows) == 1: # assume we have collection of int that describe iloc
33523356
use_rows = [idx[i] for i in use_rows for idx in idxcolvals]
3357+
else:
3358+
use_rows = [tuple(r) for r in use_rows]
3359+
nxs = [len(set(use_rows).intersection(idx)) for idx in idxcolvals]
3360+
orig_use_rows = use_rows.copy()
33533361
if not zero_based: # assume passed indicies are 1 based
33543362
use_rows = [
3355-
tuple([i - 1 if isinstance(i, int) else i for i in r])
3363+
tuple([i - 1 if isinstance(i, (int, np.integer)) else i for i in r])
33563364
if not isinstance(r, str)
33573365
else r
33583366
for r in use_rows
33593367
]
3360-
orig_use_rows = use_rows
33613368
use_rows = set(use_rows)
33623369
sel = tpldf.sidx.isin(use_rows) | tpldf.idx_strs.isin(use_rows)
33633370
if not sel.any(): # use_rows must be ints
@@ -3387,7 +3394,7 @@ def _get_use_rows(tpldf, idxcolvals, use_rows, zero_based, fnme, logger=None):
33873394
else:
33883395
warnings.warn(msg, PyemuWarning)
33893396
use_rows = tpldf.index
3390-
return use_rows
3397+
return use_rows, nxs
33913398

33923399

33933400
def _get_index_strfmt(index_cols):
@@ -3590,7 +3597,6 @@ def _get_tpl_or_ins_df(
35903597
Private method to auto-generate parameter or obs names from tabular
35913598
model files (input or output) read into pandas dataframes
35923599
Args:
3593-
filenames (`str` or `list` of str`): filenames
35943600
dfs (`pandas.DataFrame` or `list`): DataFrames (can be list of DataFrames)
35953601
to set up parameters or observations
35963602
name (`str`): Parameter name or Observation name prefix
@@ -3633,8 +3639,6 @@ def _get_tpl_or_ins_df(
36333639

36343640
# work out the union of indices across all dfs
36353641
if typ != "obs":
3636-
3637-
36383642
sidx = set()
36393643
for df in dfs:
36403644
# looses ordering

0 commit comments

Comments (0)