Library APIs

Statsmodels

Run a wild cluster bootstrap based on an object of class 'statsmodels.regression.linear_model.OLS'

Parameters:

Name	Type	Description	Default
`model`	`OLS`	A statsmodels regression object	required
`B`	`int`	The number of bootstrap iterations to run	required
`cluster`	`Union[None, np.ndarray, pd.Series, pd.DataFrame]`	If None (default), a 'heteroskedastic' wild bootstrap is run. For a wild cluster bootstrap, pass a one-dimensional numpy array, a pandas Series or a pandas DataFrame containing the clustering variable.	`None`
`param`	`Union[str, None]`	The name of the coefficient to test, given as a single string. If None, every coefficient of the model is tested. Defaults to None.	`None`
`weights_type`	`str`	The type of bootstrap weights. Either 'rademacher', 'mammen', 'webb' or 'normal'. Defaults to 'rademacher'.	`'rademacher'`
`impose_null`	`bool`	Should the null hypothesis be imposed on the bootstrap dgp, or not? Defaults to True.	`True`
`bootstrap_type`	`str`	The bootstrap type to run, given as a single string. Either '11', '31', '13' or '33'. Defaults to '11'.	`'11'`
`seed`	`Union[str, None]`	Option to provide a random seed. Defaults to None.	`None`

Raises:

Type	Description
`Exception`	Raises if `param` is not a string

Returns:

Type	Description
`pd.DataFrame`	A DataFrame indexed by `param`, holding the wild (cluster) bootstrapped test statistic and p-value for each tested parameter.
`pd.DataFrame`

Example:

>>> from wildboottest.wildboottest import wildboottest
>>> import statsmodels.api as sm
>>> import numpy as np
>>> import pandas as pd

>>> np.random.seed(12312312)
>>> N = 1000
>>> k = 10
>>> G = 10
>>> X = np.random.normal(0, 1, N * k).reshape((N,k))
>>> X = pd.DataFrame(X)
>>> X.rename(columns = {0:"X1"}, inplace = True)
>>> beta = np.random.normal(0,1,k)
>>> beta[0] = 0.005
>>> u = np.random.normal(0,1,N)
>>> Y = 1 + X @ beta + u
>>> cluster = np.random.choice(list(range(0,G)), N)
>>> model = sm.OLS(Y, X)
>>> wildboottest(model, param = "X1", B = 9999)
>>> wildboottest(model, param = "X1", cluster = cluster, B = 9999)
>>> wildboottest(model, cluster = cluster, B = 9999)

Source code in wildboottest\wildboottest.py

def wildboottest(model: 'OLS',
                 B: int,
                 cluster: Union[np.ndarray, pd.Series, pd.DataFrame, None] = None,
                 param: Union[str, None] = None,
                 weights_type: str = 'rademacher',
                 impose_null: bool = True,
                 bootstrap_type: str = '11',
                 seed: Union[str, None] = None,
                 adj: bool = True,
                 cluster_adj: bool = True,
                 show: bool = True) -> pd.DataFrame:
    """Run a wild (cluster) bootstrap based on an object of class 'statsmodels.regression.linear_model.OLS'

    For every tested coefficient the null hypothesis ``beta = 0`` is tested
    with a two-tailed bootstrap p-value.

    Args:
        model (OLS): A statsmodels regression object. Its ``exog``, ``endog``
            and ``data.xnames`` attributes are read.
        B (int): The number of bootstrap iterations to run
        cluster (Union[None, np.ndarray, pd.Series, pd.DataFrame], optional): If None (default),
            a 'heteroskedastic' wild bootstrap is run. For a wild cluster bootstrap, requires a
            numpy array of dimension one, a pandas Series or DataFrame, containing the
            clustering variable.
        param (Union[str, None], optional): The name of the coefficient to test, as a single
            string. If None, every coefficient in ``model.data.xnames`` is tested.
            Defaults to None.
        weights_type (str, optional): The type of bootstrap weights. Either 'rademacher',
            'mammen', 'webb' or 'normal'. Defaults to 'rademacher'.
        impose_null (bool, optional): Should the null hypothesis be imposed on the bootstrap
            dgp, or not? Defaults to True.
        bootstrap_type (str, optional): The bootstrap type to be run, as a single string.
            Either '11', '31', '13' or '33'. Defaults to '11'.
        seed (Union[str, None], optional): Option to provide a random seed; forwarded
            unchanged to the bootstrap classes. Defaults to None.
        adj (bool, optional): Forwarded to ``WildboottestCL.get_scores``; only used when
            ``cluster`` is given. Presumably toggles a small-sample adjustment -- confirm
            against ``WildboottestCL``. Defaults to True.
        cluster_adj (bool, optional): Forwarded to ``WildboottestCL.get_scores``; only used
            when ``cluster`` is given. Presumably toggles a cluster-level small-sample
            adjustment -- confirm against ``WildboottestCL``. Defaults to True.
        show (bool, optional): If True, print the result table as markdown before
            returning it. Defaults to True.

    Raises:
        TypeError: If `param` is neither None nor a string. (``TypeError`` is a subclass
            of ``Exception``, which earlier versions raised directly.)
        ValueError: If `param` is a string that is not among the model's regressor names.

    Returns:
        pd.DataFrame: One row per tested parameter, indexed by 'param', with the
        columns 'statistic' and 'p-value'.

    Example:

        >>> from wildboottest.wildboottest import wildboottest
        >>> import statsmodels.api as sm
        >>> import numpy as np
        >>> import pandas as pd

        >>> np.random.seed(12312312)
        >>> N = 1000
        >>> k = 10
        >>> G = 10
        >>> X = np.random.normal(0, 1, N * k).reshape((N,k))
        >>> X = pd.DataFrame(X)
        >>> X.rename(columns = {0:"X1"}, inplace = True)
        >>> beta = np.random.normal(0,1,k)
        >>> beta[0] = 0.005
        >>> u = np.random.normal(0,1,N)
        >>> Y = 1 + X @ beta + u
        >>> cluster = np.random.choice(list(range(0,G)), N)
        >>> model = sm.OLS(Y, X)
        >>> wildboottest(model, param = "X1", B = 9999)
        >>> wildboottest(model, param = "X1", cluster = cluster, B = 9999)
        >>> wildboottest(model, cluster = cluster, B = 9999)
    """

    # does model.exog already exclude missing values?
    X = model.exog
    # interestingly, the dependent variable is called 'endogeneous'
    Y = model.endog

    xnames = list(model.data.xnames)

    # Validate `param` before any bootstrap work so that a typo fails fast
    # with a message naming the valid choices.
    if param is None:
        params = xnames
    elif isinstance(param, str):
        if param not in xnames:
            raise ValueError(
                f"`param` '{param}' is not a parameter of the model; "
                f"available parameters: {xnames}"
            )
        params = [param]
    else:
        raise TypeError("`param` not correctly specified")

    def _bootstrap_one(name):
        """Bootstrap a single coefficient; return (p-value, t-stat, enumeration flag)."""
        # R selects the tested coefficient; the hypothesis is R @ beta = r = 0.
        R = np.zeros(len(xnames))
        R[xnames.index(name)] = 1
        r = 0

        # is it possible to fetch the clustering variables from the pre-processed data
        # frame, e.g. with 'excluding' observations with missings etc

        if cluster is None:
            boot = WildboottestHC(X=X, Y=Y, R=R, r=r, B=B, seed=seed)
            boot.get_adjustments(bootstrap_type=bootstrap_type)
            boot.get_uhat(impose_null=impose_null)
            boot.get_tboot(weights_type=weights_type)
            boot.get_tstat()
            boot.get_pvalue(pval_type="two-tailed")
            enumeration_flag = False
        else:
            # NOTE(review): `r` is not forwarded here -- presumably WildboottestCL
            # defaults to r = 0; confirm against its constructor.
            boot = WildboottestCL(X=X, Y=Y, cluster=cluster,
                                  R=R, B=B, seed=seed)
            boot.get_scores(bootstrap_type=bootstrap_type, impose_null=impose_null,
                            adj=adj, cluster_adj=cluster_adj)
            _, _, enumeration_flag = boot.get_weights(weights_type=weights_type)
            boot.get_numer()
            boot.get_denom()
            boot.get_tboot()
            boot.get_vcov()
            boot.get_tstat()
            boot.get_pvalue(pval_type="two-tailed")

        return boot.pvalue, boot.t_stat, enumeration_flag

    pvalues = []
    tstats = []
    # Initialised up front so the check below is well-defined even when there
    # is nothing to test (model without regressors).
    full_enumeration_warn = False

    for name in params:
        pvalue, tstat, enumeration_flag = _bootstrap_one(name)
        pvalues.append(pvalue)
        tstats.append(tstat)
        full_enumeration_warn = full_enumeration_warn or enumeration_flag

    if full_enumeration_warn:
        warnings.warn("2^G < the number of boot iterations, setting full_enumeration to True.")

    res = {
        'param': params,
        'statistic': tstats,
        'p-value': pvalues
    }

    res_df = pd.DataFrame(res).set_index('param')

    if show:
        print(res_df.to_markdown(floatfmt=".3f"))

    return res_df