Library APIs

Statsmodels

Run a wild cluster bootstrap based on an object of class 'statsmodels.regression.linear_model.OLS'

Parameters:

Name Type Description Default
model OLS

A statsmodels regression object

required
B int

The number of bootstrap iterations to run

required
cluster Union[None, np.ndarray, pd.Series, pd.DataFrame]

If None (default), a 'heteroskedastic' wild bootstrap is run. For a wild cluster bootstrap, requires a one-dimensional numpy array, a pandas Series or a pandas DataFrame containing the clustering variable.

None
param Union[str, None]

The name of the test parameter of interest, given as a single string. If None, every regressor of the model is tested in turn. Defaults to None.

None
weights_type str

The type of bootstrap weights. Either 'rademacher', 'mammen', 'webb' or 'normal'. Defaults to 'rademacher'.

'rademacher'
impose_null bool

Should the null hypothesis be imposed on the bootstrap dgp, or not? Defaults to True.

True
bootstrap_type str

A two-character string that selects the bootstrap type to be run. Either '11', '31', '13' or '33'. Defaults to '11'.

'11'
seed Union[str, None]

Option to provide a random seed. Defaults to None.

None

Raises:

Type Description
Exception

Raised if param is neither None nor a string.

Returns:

Type Description
pd.DataFrame

pd.DataFrame: A data frame indexed by parameter name, containing the test statistic(s) and the wild (cluster) bootstrapped p-value(s).

pd.DataFrame

Example:

>>> from wildboottest.wildboottest import wildboottest
>>> import statsmodels.api as sm
>>> import numpy as np
>>> import pandas as pd

>>> np.random.seed(12312312)
>>> N = 1000
>>> k = 10
>>> G = 10
>>> X = np.random.normal(0, 1, N * k).reshape((N,k))
>>> X = pd.DataFrame(X)
>>> X.rename(columns = {0:"X1"}, inplace = True)
>>> beta = np.random.normal(0,1,k)
>>> beta[0] = 0.005
>>> u = np.random.normal(0,1,N)
>>> Y = 1 + X @ beta + u
>>> cluster = np.random.choice(list(range(0,G)), N)
>>> model = sm.OLS(Y, X)
>>> wildboottest(model, param = "X1", B = 9999)
>>> wildboottest(model, param = "X1", cluster = cluster, B = 9999)
>>> wildboottest(model, cluster = cluster, B = 9999)
Source code in wildboottest\wildboottest.py
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
def wildboottest(model : 'OLS',
                 B: int,
                 cluster : Union[np.ndarray, pd.Series, pd.DataFrame, None] = None,
                 param : Union[str, None] = None,
                 weights_type: str = 'rademacher',
                 impose_null: bool = True,
                 bootstrap_type: str = '11',
                 seed: Union[str, None] = None,
                 adj: bool = True,
                 cluster_adj: bool = True,
                 show: bool = True) -> pd.DataFrame:
  """Run a wild (cluster) bootstrap based on an object of class 'statsmodels.regression.linear_model.OLS'

  For every tested regressor, the null hypothesis "coefficient == 0" is tested
  with a two-tailed bootstrap p-value. Model weights are not used.

  Args:
      model (OLS): A statsmodels regression object. Its `exog`, `endog` and
           `data.xnames` attributes are read.
      B (int): The number of bootstrap iterations to run
      cluster (Union[None, np.ndarray, pd.Series, pd.DataFrame], optional): If None (default), a 'heteroskedastic' wild bootstrap
           is run. For a wild cluster bootstrap, requires a one-dimensional numpy array, a pandas Series or DataFrame, containing the clustering variable.
      param (Union[str, None], optional): Name of the regressor to test, as a single string. If None (default),
           every regressor in the model is tested in turn.
      weights_type (str, optional): The type of bootstrap weights. Either 'rademacher', 'mammen', 'webb' or 'normal'.
           Defaults to 'rademacher'.
      impose_null (bool, optional): Should the null hypothesis be imposed on the bootstrap dgp, or not?
           Defaults to True.
      bootstrap_type (str, optional): Two-character string selecting the bootstrap type
           to be run. Either '11', '31', '13' or '33'. Defaults to '11'.
      seed (Union[str, None], optional): Option to provide a random seed; passed through unchanged to the
           bootstrap class. Defaults to None.
      adj (bool, optional): Forwarded to `WildboottestCL.get_scores`; only used when `cluster` is given.
           Presumably toggles a small-sample adjustment (confirm against that class). Defaults to True.
      cluster_adj (bool, optional): Forwarded to `WildboottestCL.get_scores`; only used when `cluster` is given.
           Presumably toggles a cluster adjustment (confirm against that class). Defaults to True.
      show (bool, optional): If True, print the result table as markdown. Defaults to True.

  Raises:
      TypeError: If `param` is neither None nor a string.
      ValueError: If `param` is a string that is not one of the model's regressor names.

  Returns:
      pd.DataFrame: One row per tested parameter (index 'param') with the columns
      'statistic' (test statistic) and 'p-value' (bootstrapped p-value).

  Example: 

      >>> from wildboottest.wildboottest import wildboottest
      >>> import statsmodels.api as sm
      >>> import numpy as np
      >>> import pandas as pd

      >>> np.random.seed(12312312)
      >>> N = 1000
      >>> k = 10
      >>> G = 10
      >>> X = np.random.normal(0, 1, N * k).reshape((N,k))
      >>> X = pd.DataFrame(X)
      >>> X.rename(columns = {0:"X1"}, inplace = True)
      >>> beta = np.random.normal(0,1,k)
      >>> beta[0] = 0.005
      >>> u = np.random.normal(0,1,N)
      >>> Y = 1 + X @ beta + u
      >>> cluster = np.random.choice(list(range(0,G)), N)
      >>> model = sm.OLS(Y, X)
      >>> wildboottest(model, param = "X1", B = 9999)
      >>> wildboottest(model, param = "X1", cluster = cluster, B = 9999)
      >>> wildboottest(model, cluster = cluster, B = 9999)
  """

  # does model.exog already exclude missing values?
  X = model.exog
  # interestingly, the dependent variable is called 'endogeneous'
  Y = model.endog

  xnames = list(model.data.xnames)

  # Validate `param` up front so a bad argument fails fast with a clear
  # message instead of an opaque "'x' is not in list" from list.index().
  if param is None:
    tested = xnames
  elif isinstance(param, str):
    if param not in xnames:
      raise ValueError(
        f"`param` '{param}' is not a regressor of the model. "
        f"Available names: {xnames}"
      )
    tested = [param]
  else:
    raise TypeError("`param` not correctly specified")

  def run_single_test(name):
    """Bootstrap the test 'coefficient of `name` == 0'; return (t_stat, pvalue, warn)."""

    # Selector vector: picks out the single coefficient under test.
    R = np.zeros(len(xnames))
    R[xnames.index(name)] = 1
    # Just test for beta=0
    r = 0

    # is it possible to fetch the clustering variables from the pre-processed data
    # frame, e.g. with 'excluding' observations with missings etc

    if cluster is None:

      boot = WildboottestHC(X = X, Y = Y, R = R, r = r, B = B, seed = seed)
      boot.get_adjustments(bootstrap_type = bootstrap_type)
      boot.get_uhat(impose_null = impose_null)
      boot.get_tboot(weights_type = weights_type)
      boot.get_tstat()
      boot.get_pvalue(pval_type = "two-tailed")
      enumeration_warn = False

    else:

      # NOTE(review): `r` is not passed here, unlike in the HC path above --
      # confirm that WildboottestCL defaults to testing against 0.
      boot = WildboottestCL(X = X, Y = Y, cluster = cluster,
                            R = R, B = B, seed = seed)
      boot.get_scores(bootstrap_type = bootstrap_type, impose_null = impose_null,
                      adj = adj, cluster_adj = cluster_adj)
      _, _, enumeration_warn = boot.get_weights(weights_type = weights_type)
      boot.get_numer()
      boot.get_denom()
      boot.get_tboot()
      boot.get_vcov()
      boot.get_tstat()
      boot.get_pvalue(pval_type = "two-tailed")

    return boot.t_stat, boot.pvalue, enumeration_warn

  tstats = []
  pvalues = []
  # Initialised so the flag is always bound, even if there is nothing to test.
  full_enumeration_warn = False

  for name in tested:
    t_stat, pvalue, enumeration_warn = run_single_test(name)
    tstats.append(t_stat)
    pvalues.append(pvalue)
    full_enumeration_warn = full_enumeration_warn or bool(enumeration_warn)

  # Warn once for the whole call rather than once per tested parameter.
  if full_enumeration_warn:
    warnings.warn("2^G < the number of boot iterations, setting full_enumeration to True.")

  res = {
    'param': tested,
    'statistic': tstats,
    'p-value': pvalues
  }

  res_df = pd.DataFrame(res).set_index('param')

  if show:
    print(res_df.to_markdown(floatfmt=".3f"))

  return res_df