diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml old mode 100644 new mode 100755 index 2e513ed1..5647cf48 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -30,7 +30,6 @@ matrix: os: [ubuntu-latest] environment-file: - - ci/39-latest.yaml - ci/310-min.yaml - ci/310-latest.yaml - ci/311-latest.yaml diff --git a/ci/310-latest.yaml b/ci/310-latest.yaml old mode 100644 new mode 100755 index c5b78a9d..e00035ad --- a/ci/310-latest.yaml +++ b/ci/310-latest.yaml @@ -4,7 +4,7 @@ channels: dependencies: - python=3.10 # required - - libpysal + - libpysal>=4.12.1 - numpy>=1.3 - pandas>=1.0 - scipy>=0.11 diff --git a/ci/310-min.yaml b/ci/310-min.yaml old mode 100644 new mode 100755 index ab4be12e..f7c5d939 --- a/ci/310-min.yaml +++ b/ci/310-min.yaml @@ -4,7 +4,7 @@ channels: dependencies: - python=3.10 # required - - libpysal + - libpysal>=4.12.1 - numpy>=1.3 - pandas>=1.0 - scipy>=0.11 diff --git a/ci/311-latest.yaml b/ci/311-latest.yaml old mode 100644 new mode 100755 index bc295d06..cb27a6da --- a/ci/311-latest.yaml +++ b/ci/311-latest.yaml @@ -4,7 +4,7 @@ channels: dependencies: - python=3.11 # required - - libpysal + - libpysal>=4.12.1 - numpy>=1.3 - pandas>=1.0 - scipy>=0.11 diff --git a/ci/312-latest.yaml b/ci/312-latest.yaml old mode 100644 new mode 100755 index 0e9984b9..7ab733e7 --- a/ci/312-latest.yaml +++ b/ci/312-latest.yaml @@ -4,7 +4,7 @@ channels: dependencies: - python=3.12 # required - - libpysal + - libpysal>=4.12.1 - numpy>=1.3 - pandas>=1.0 - scipy>=0.11 diff --git a/ci/39-latest.yaml b/ci/39-latest.yaml deleted file mode 100644 index fc5eb3b3..00000000 --- a/ci/39-latest.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: test -channels: - - conda-forge -dependencies: - - python=3.9 - # required - - libpysal - - numpy>=1.3 - - pandas>=1.0 - - scipy>=0.11 - # testing/formatting - - codecov - - pytest - - pytest-cov - - pytest-xdist - # optional - - bokeh>=0.11.1 - - folium>=0.2.1 - - geopandas>=0.2 - - geojson>=1.3.2 - - matplotlib>=1.5.1 - - mplleaflet>=0.0.5 - - numba - - numexpr - - scikit-learn>=0.17.1 - - seaborn>=0.7.0 - - statsmodels>=0.6.1 diff --git a/docs/_static/references.bib b/docs/_static/references.bib index c591767f..b4f57970 100644 --- a/docs/_static/references.bib +++ b/docs/_static/references.bib @@ -8,13 +8,13 @@ @article{Anselin2021, Author = {Anselin, Luc and Amaral, Pedro}, - Journal = {}, - Number = {}, - Pages = {}, + Journal = {Journal of Geographical Systems}, + Number = {26}, + Pages = {209–234}, Title = {Endogenous Spatial Regimes}, - Url = {https://www.researchgate.net/publication/353411566_Endogenous_Spatial_Regimes}, + Url = {https://doi.org/10.1007/s10109-023-00411-2}, Volume = {}, - Year = {2021}} + Year = {2024}} @article{McMillen1992, Abstract = {ABSTRACT. Commonly-employed spatial autocorrelation models imply heteroskedastic errors, but heteroskedasticity causes probit to be inconsistent. This paper proposes and illustrates the use of two categories of estimators for probit models with spatial autocorrelation. One category is based on the EM algorithm, and requires repeated application of a maximum-likelihood estimator. The other category, which can be applied to models derived using the spatial expansion method, only requires weighted least squares.}, diff --git a/docs/api.rst b/docs/api.rst old mode 100644 new mode 100755 index 77c232e1..03f3e93d --- a/docs/api.rst +++ b/docs/api.rst @@ -7,6 +7,15 @@ API reference .. _models_api: +Classic Models +-------------- + +.. autosummary:: + :toctree: generated/ + + spreg.OLS + spreg.TSLS + Spatial Regression Models ------------------------- @@ -15,10 +24,10 @@ These are the standard spatial regression models supported by the `spreg` packag .. autosummary:: :toctree: generated/ - spreg.OLS + spreg.GM_Lag spreg.ML_Lag + spreg.GMM_Error spreg.ML_Error - spreg.GM_Lag spreg.GM_Error spreg.GM_Error_Het spreg.GM_Error_Hom @@ -28,11 +37,11 @@ These are the standard spatial regression models supported by the `spreg` packag spreg.GM_Endog_Error spreg.GM_Endog_Error_Het spreg.GM_Endog_Error_Hom - spreg.TSLS - spreg.ThreeSLS + spreg.NSLX Discrete Choice Models ---------------------- + .. autosummary:: :toctree: generated/ @@ -60,6 +69,8 @@ Regimes models are variants of spatial regression models which allow for structu spreg.GM_Endog_Error_Regimes spreg.GM_Endog_Error_Hom_Regimes spreg.GM_Endog_Error_Het_Regimes + spreg.OLS_Endog_Regimes + spreg.GM_Lag_Endog_Regimes spreg.Skater_reg Seemingly-Unrelated Regressions @@ -107,6 +118,7 @@ Diagnostic tests are useful for identifying model fit, sufficiency, and specific spreg.akaike spreg.schwarz spreg.condition_index + spreg.dwh spreg.jarque_bera spreg.breusch_pagan spreg.white diff --git a/spreg/__init__.py b/spreg/__init__.py index 64afa0bd..cdd68390 100755 --- a/spreg/__init__.py +++ b/spreg/__init__.py @@ -1,3 +1,4 @@ + import contextlib from importlib.metadata import PackageNotFoundError, version @@ -17,6 +18,7 @@ from .ml_error_regimes import * from .ml_lag import * from .ml_lag_regimes import * +from .nslx import * from .ols import * from .ols_regimes import * from .panel_fe import * @@ -24,6 +26,7 @@ from .probit import * from .regimes import * from .skater_reg import * +#from .spsearch import * from .sp_panels import * from .sputils import * from .sur import * diff --git a/spreg/diagnostics.py b/spreg/diagnostics.py index 442d3a24..2811c179 100755 --- a/spreg/diagnostics.py +++ b/spreg/diagnostics.py @@ -1,8 +1,7 @@ """ -Diagnostics for regression estimations. - +Diagnostics for regression estimations. + """ - __author__ = ( "Luc Anselin luc.anselin@asu.edu, Nicholas Malizia nicholas.malizia@asu.edu " ) @@ -39,7 +38,7 @@ ] -def f_stat(reg, df=0): +def f_stat(reg,df=0): """ Calculates the f-statistic and associated p-value for multiple coefficient constraints :cite:`Greene2003`. @@ -50,7 +49,7 @@ def f_stat(reg, df=0): ---------- reg : regression object output instance from a regression model - df : number of coefficient constraints + df : number of coefficient constraints (zero constraint for last df coefficients in betas) Returns @@ -101,15 +100,15 @@ def f_stat(reg, df=0): utu = reg.utu # (scalar) residual sum of squares # default case, all coefficients if df == 0: - r = k - 1 + r = k-1 predy = reg.predy # (array) vector of predicted values (n x 1) mean_y = reg.mean_y # (scalar) mean of dependent observations U = np.sum((predy - mean_y) ** 2) - else: # F test on last df coefficients + else: # F test on last df coefficients y = reg.y r = df - x0 = reg.x[:, :-r] - olsr = BaseOLS(y, x0) # constrained regression + x0 = reg.x[:,:-r] + olsr = BaseOLS(y,x0) # constrained regression rtr = olsr.utu U = rtr - utu fStat = (U / r) / (utu / (n - k)) @@ -1386,10 +1385,17 @@ def likratiotest(reg0, reg1): raise Exception("Missing or improper log-likelihoods in regression objects") if likr < 0.0: # always enforces positive likelihood ratio likr = -likr - pvalue = stats.chisqprob(likr, 1) - likratio = {"likr": likr, "df": 1, "p-value": pvalue} - return likratio + # generalize to multiple parameters, e.g., spatial Durbin + df = reg1.k - reg0.k + if not(df > 0): + df = 1 + + #pvalue = stats.chisqprob(likr, 1) + pvalue = stats.chisqprob(likr, df) + #likratio = {"likr": likr, "df": 1, "p-value": pvalue} + likratio = {"likr": likr, "df": df, "p-value": pvalue} + return likratio def dwh(reg): """ @@ -1408,23 +1414,23 @@ def dwh(reg): and associated p-value """ - n = reg.n - ny = reg.yend.shape[1] # number of endogenous variables + n = reg.n + ny = reg.yend.shape[1] # number of endogenous variables qq = reg.h # all exogenous and instruments xx = reg.z # all exogenous and endogenous # get predicted values for endogenous variables on all instruments - py = np.zeros((n, ny)) + py = np.zeros((n,ny)) for i in range(ny): - yy = reg.yend[:, i].reshape(n, 1) - ols1 = BaseOLS(y=yy, x=qq) + yy = reg.yend[:, i].reshape(n,1) + ols1 = BaseOLS(y=yy,x=qq) yp = ols1.predy - py[0:n, i] = yp.flatten() + py[0:n,i] = yp.flatten() nxq = sphstack(xx, py) # F-test in augmented regression ols2 = BaseOLS(y=reg.y, x=nxq) dwh = f_stat(ols2, df=ny) - return dwh - + return dwh + def _test(): import doctest diff --git a/spreg/diagnostics_sp.py b/spreg/diagnostics_sp.py index 2a5df83a..01660d49 100644 --- a/spreg/diagnostics_sp.py +++ b/spreg/diagnostics_sp.py @@ -1,7 +1,6 @@ """ Spatial diagnostics module """ - __author__ = "Luc Anselin lanselin@gmail.com, Daniel Arribas-Bel darribas@asu.edu, Pedro Amaral pedrovma@gmail.com" from .utils import spdot @@ -163,20 +162,10 @@ class LMtests: def __init__(self, ols, w, tests=["all"]): cache = spDcache(ols, w) if tests == ["all"]: - tests = [ - "lme", - "lml", - "rlme", - "rlml", - "sarma", - "lmwx", - "lmspdurbin", - "rlmwx", - "rlmdurlag", - "lmslxerr", - ] # added back in for access + tests = ["lme", "lml", "rlme", "rlml", "sarma", "lmwx", "lmspdurbin", "rlmwx", + "rlmdurlag", "lmslxerr"] # added back in for access if any(test in ["lme", "lmslxerr"] for test in tests): - # if "lme" in tests: + #if "lme" in tests: self.lme = lmErr(ols, w, cache) if any(test in ["lml", "rlmwx"] for test in tests): self.lml = lmLag(ols, w, cache) @@ -186,8 +175,8 @@ def __init__(self, ols, w, tests=["all"]): self.rlml = rlmLag(ols, w, cache) if "sarma" in tests: self.sarma = lmSarma(ols, w, cache) - # if any(test in ["lmwx", "rlmdurlag", "lmslxerr"] for test in tests): - if any(test in ["lmwx", "rlmdurlag", "lmslxerr"] for test in tests): + #if any(test in ["lmwx", "rlmdurlag", "lmslxerr"] for test in tests): + if any(test in ["lmwx", "rlmdurlag","lmslxerr"] for test in tests): self.lmwx = lm_wx(ols, w) if any(test in ["lmspdurbin", "rlmdurlag", "rlmwx"] for test in tests): self.lmspdurbin = lm_spdurbin(ols, w) @@ -195,10 +184,9 @@ def __init__(self, ols, w, tests=["all"]): self.rlmwx = rlm_wx(ols, self.lmspdurbin, self.lml) if "rlmdurlag" in tests: self.rlmdurlag = rlm_durlag(self.lmspdurbin, self.lmwx) - if "lmslxerr" in tests: # currently removed - LA added back in for access + if "lmslxerr" in tests: #currently removed - LA added back in for access self.lmslxerr = lm_slxerr(ols, self.lme, self.lmwx) - class MoranRes: """ Moran's I for spatial autocorrelation in residuals from OLS regression @@ -300,7 +288,7 @@ def __init__(self, ols, w, z=False): class AKtest: - """ + r""" Moran's I test of spatial autocorrelation for IV estimation. Implemented following the original reference :cite:`Anselin1997` @@ -453,7 +441,7 @@ def __init__(self, iv, w, case="nosp"): class spDcache: - """ + r""" Helper class to compute reusable pieces in the spatial diagnostics module ... @@ -576,7 +564,7 @@ def lmErr(reg, w, spDcache): Pair of statistic and p-value for the LM error test. """ - lm = spDcache.utwuDs**2 / spDcache.t + lm = spDcache.utwuDs ** 2 / spDcache.t pval = chisqprob(lm, 1) return (lm[0][0], pval[0][0]) @@ -601,7 +589,7 @@ def lmLag(ols, w, spDcache): Pair of statistic and p-value for the LM lag test. """ - lm = spDcache.utwyDs**2 / (ols.n * spDcache.j) + lm = spDcache.utwyDs ** 2 / (ols.n * spDcache.j) pval = chisqprob(lm, 1) return (lm[0][0], pval[0][0]) @@ -684,12 +672,11 @@ def lmSarma(ols, w, spDcache): """ first = (spDcache.utwyDs - spDcache.utwuDs) ** 2 / (w.n * spDcache.j - spDcache.t) - secnd = spDcache.utwuDs**2 / spDcache.t + secnd = spDcache.utwuDs ** 2 / spDcache.t lm = first + secnd pval = chisqprob(lm, 2) return (lm[0][0], pval[0][0]) - def lm_wx(reg, w): """ LM test for WX. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`. @@ -711,7 +698,7 @@ def lm_wx(reg, w): # preliminaries # set up X1 (constant) and X (no constant) as x1 and xx x1 = reg.x - xx = x1[:, 1:] + xx = x1[:,1:] # WX wx = w.sparse * xx # end of preliminaries @@ -728,11 +715,10 @@ def lm_wx(reg, w): rsg1 = (xpwpu.T @ xqxi) @ xpwpu rsgam = rsg1[0][0] / reg.sig2n pval = chisqprob(rsgam, (reg.k - 1)) - rsgamma = (rsgam, pval) - return rsgamma - + rsgamma = (rsgam,pval) + return(rsgamma) -def lm_spdurbin(reg, w): +def lm_spdurbin(reg,w): """ Joint test for SDM. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`. @@ -753,7 +739,7 @@ def lm_spdurbin(reg, w): # preliminaries # set up X1 (constant) and X (no constant) as x1 and xx x1 = reg.x - xx = x1[:, 1:] + xx = x1[:,1:] k = x1.shape[1] # WX wx = w.sparse * xx @@ -771,23 +757,21 @@ def lm_spdurbin(reg, w): pp = w.trcWtW_WW # end of preliminaries # J_11: block matrix with X1'X1 and n/2sig2n - jj1a = np.hstack((reg.xtx, np.zeros((k, 1)))) - jj1b = np.hstack( - (np.zeros((1, k)), np.array([reg.n / (2.0 * reg.sig2n)]).reshape(1, 1)) - ) - jj11 = np.vstack((jj1a, jj1b)) + jj1a = np.hstack((reg.xtx,np.zeros((k,1)))) + jj1b = np.hstack((np.zeros((1,k)),np.array([reg.n/(2.0*reg.sig2n)]).reshape(1,1))) + jj11 = np.vstack((jj1a,jj1b)) # J_12: matrix with k-1 rows X1'WX1b and X1'WX, and 1 row of zeros jj12a = np.hstack((x1.T @ wxb, x1.T @ wx)) - jj12 = np.vstack((jj12a, np.zeros((1, k)))) + jj12 = np.vstack((jj12a,np.zeros((1,k)))) # J_22 matrix with diagonal elements b'X1'W'WX1b + T.sig2n and X'W'WX # and off-diagonal element b'X1'W'WX jj22a = wxb.T @ wxb + pp * reg.sig2n - jj22a = jj22a.reshape(1, 1) - wxbtwx = (wxb.T @ wx).reshape(1, k - 1) - jj22b = np.hstack((jj22a, wxbtwx)) + jj22a = jj22a.reshape(1,1) + wxbtwx = (wxb.T @ wx).reshape(1,k-1) + jj22b = np.hstack((jj22a,wxbtwx)) wxtwx = wx.T @ wx - jj22c = np.hstack((wxbtwx.T, wxtwx)) - jj22 = np.vstack((jj22b, jj22c)) + jj22c = np.hstack((wxbtwx.T,wxtwx)) + jj22 = np.vstack((jj22b,jj22c)) # J^22 (the inverse) from J^22 = (J_22 - J_21.J_11^-1.J_12)^-1 jj11i = la.inv(jj11) j121121 = (jj12.T @ jj11i) @ jj12 @@ -796,15 +780,14 @@ def lm_spdurbin(reg, w): # rescale by sig2n jj22i = jj22i * reg.sig2n # statistic - dd = np.vstack((drho, dgam)) + dd = np.vstack((drho,dgam)) rsjoint = (dd.T @ jj22i) @ dd rsjoint = rsjoint[0][0] pval = chisqprob(rsjoint, k) rsrhogam = (rsjoint, pval) - return rsrhogam - + return(rsrhogam) -def rlm_wx(reg, lmspdurbin, lmlag): +def rlm_wx(reg,lmspdurbin,lmlag): """ Robust LM WX test. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`. @@ -825,12 +808,11 @@ def rlm_wx(reg, lmspdurbin, lmlag): """ # robust gamma = rsjoint - rsrho rsgams = lmspdurbin[0] - lmlag[0] - pval = chisqprob(rsgams, (reg.k - 1)) + pval = chisqprob(rsgams,(reg.k - 1)) rsgamstar = (rsgams, pval) - return rsgamstar - + return(rsgamstar) -def rlm_durlag(lmspdurbin, lmwx): +def rlm_durlag(lmspdurbin,lmwx): """ Robust LM Lag - SDM. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`. @@ -849,12 +831,11 @@ def rlm_durlag(lmspdurbin, lmwx): # robust rho = rsjoint - rsgam rsrhos = lmspdurbin[0] - lmwx[0] - pval = chisqprob(rsrhos, 1) + pval = chisqprob(rsrhos,1) rsrhostar = (rsrhos, pval) - return rsrhostar + return(rsrhostar) - -def lm_slxerr(reg, lmerr, lmwx): +def lm_slxerr(reg,lmerr,lmwx): """ Joint test for Error and WX. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`. @@ -873,10 +854,9 @@ def lm_slxerr(reg, lmerr, lmwx): Pair of statistic and p-value for the Joint test for Error and WX. """ rslamgam = lmerr[0] + lmwx[0] - pval = chisqprob(rslamgam, reg.k) - rslamgamma = (rslamgam, pval) - return rslamgamma - + pval = chisqprob(rslamgam,reg.k) + rslamgamma = (rslamgam,pval) + return(rslamgamma) def get_mI(reg, w, spDcache): """ @@ -912,8 +892,8 @@ def get_vI(ols, w, ei, spDcache): B = spDcache.AB[1] trB = np.sum(B.diagonal()) * 4.0 - vi = (w.n**2 / (w.s0**2 * (w.n - ols.k) * (w.n - ols.k + 2.0))) * ( - w.s1 + 2.0 * trA2 - trB - ((2.0 * (spDcache.trA**2)) / (w.n - ols.k)) + vi = (w.n ** 2 / (w.s0 ** 2 * (w.n - ols.k) * (w.n - ols.k + 2.0))) * ( + w.s1 + 2.0 * trA2 - trB - ((2.0 * (spDcache.trA ** 2)) / (w.n - ols.k)) ) return vi @@ -931,13 +911,15 @@ def get_zI(I, ei, vi): Returns two-sided p-values as provided in the GeoDa family """ - z = abs((I - ei) / np.sqrt(vi)) - pval = norm.sf(z) * 2.0 + #z = abs((I - ei) / np.sqrt(vi)) + z = (I - ei) / np.sqrt(vi) + #pval = norm.sf(z) * 2.0 + pval = norm.sf(abs(z)) * 2.0 return (z, pval) def akTest(iv, w, spDcache): - """ + r""" Computes AK-test for the general case (end. reg. + sp. lag) Parameters @@ -967,7 +949,7 @@ def akTest(iv, w, spDcache): a = np.dot(etwz, np.dot(iv.varb, etwz.T)) s12 = (w.s0 / w.n) ** 2 phi2 = (spDcache.t + (4.0 / iv.sig2n) * a) / (s12 * w.n) - ak = w.n * mi**2 / phi2 + ak = w.n * mi ** 2 / phi2 pval = chisqprob(ak, 1) return (mi, ak[0][0], pval[0][0]) diff --git a/spreg/error_sp_het.py b/spreg/error_sp_het.py index b8bf01e2..5d385952 100755 --- a/spreg/error_sp_het.py +++ b/spreg/error_sp_het.py @@ -25,6 +25,7 @@ class BaseGM_Error_Het(RegressionPropsY): + """ GMM method for a spatial error model with heteroskedasticity (note: no consistency checks, diagnostics or constant added); based on @@ -109,9 +110,8 @@ class BaseGM_Error_Het(RegressionPropsY): [ 0.4118 0.168 ]] """ - def __init__( - self, y, x, w, max_iter=1, epsilon=0.00001, step1c=False, hard_bound=False - ): + def __init__(self, y, x, w, max_iter=1, epsilon=0.00001, step1c=False, hard_bound=False): + self.step1c = step1c # 1a. OLS --> \tilde{betas} ols = OLS.BaseOLS(y=y, x=x) @@ -153,15 +153,10 @@ def __init__( self.iter_stop = UTILS.iter_msg(self.iteration, max_iter) if hard_bound: if abs(lambda3) >= 0.99: - raise Exception( - "Spatial error parameter was outside the bounds of -0.99 and 0.99" - ) + raise Exception("Spatial error parameter was outside the bounds of -0.99 and 0.99") else: if abs(lambda3) >= 0.99: - set_warn( - self, - "Spatial error parameter was outside the bounds of -0.99 and 0.99", - ) + set_warn(self, "Spatial error parameter was outside the bounds of -0.99 and 0.99") sigma = get_psi_sigma(w, self.u, lambda3) vc3 = get_vc_het(w, wA1, sigma) @@ -376,40 +371,33 @@ def __init__( latex=False, hard_bound=False, ): + n = USER.check_arrays(y, x) y, name_y = USER.check_y(y, n, name_y) w = USER.check_weights(w, y, w_required=True, slx_lags=slx_lags) x_constant, name_x, warn = USER.check_constant(x, name_x) - name_x = USER.set_name_x( - name_x, x_constant - ) # initialize in case None, includes constant + name_x = USER.set_name_x(name_x, x_constant) # initialize in case None, includes constant set_warn(self, warn) self.title = "GM SPATIALLY WEIGHTED LEAST SQUARES (HET)" - if slx_lags > 0: - # lag_x = get_lags(w, x_constant[:, 1:], slx_lags) - # x_constant = np.hstack((x_constant, lag_x)) - # name_x += USER.set_name_spatial_lags(name_x, slx_lags) - # name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # exclude constant + if slx_lags >0: + #lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + #x_constant = np.hstack((x_constant, lag_x)) +# name_x += USER.set_name_spatial_lags(name_x, slx_lags) + #name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # exclude constant - x_constant, name_x = USER.flex_wx( - w, - x=x_constant, - name_x=name_x, - constant=True, - slx_lags=slx_lags, - slx_vars=slx_vars, - ) + x_constant,name_x = USER.flex_wx(w,x=x_constant,name_x=name_x,constant=True, + slx_lags=slx_lags,slx_vars=slx_vars) self.title += " WITH SLX (SLX-Error)" # OLD - # if slx_lags >0: - # lag_x = get_lags(w, x_constant[:, 1:], slx_lags) - # x_constant = np.hstack((x_constant, lag_x)) - # name_x += USER.set_name_spatial_lags(name_x, slx_lags) - # name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # no constant - # self.title += " WITH SLX (SLX-Error)" + #if slx_lags >0: + #lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + #x_constant = np.hstack((x_constant, lag_x)) +# name_x += USER.set_name_spatial_lags(name_x, slx_lags) + #name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # no constant + #self.title += " WITH SLX (SLX-Error)" BaseGM_Error_Het.__init__( self, @@ -419,23 +407,25 @@ def __init__( max_iter=max_iter, step1c=step1c, epsilon=epsilon, - hard_bound=hard_bound, + hard_bound = hard_bound ) + self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) - # self.name_x = USER.set_name_x(name_x, x_constant) +# self.name_x = USER.set_name_x(name_x, x_constant) self.name_x = name_x # constant already included self.name_x.append("lambda") self.name_w = USER.set_name_w(name_w, w) - self.output = pd.DataFrame(self.name_x, columns=["var_names"]) - self.output["var_type"] = ["x"] * (len(self.name_x) - 1) + ["lambda"] - self.output["regime"], self.output["equation"] = (0, 0) + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * (len(self.name_x)-1) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) self.other_top = _summary_iteration(self) output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class BaseGM_Endog_Error_Het(RegressionPropsY): + """ GMM method for a spatial error model with heteroskedasticity and endogenous variables (note: no consistency checks, diagnostics or constant @@ -560,6 +550,7 @@ def __init__( inv_method="power_exp", hard_bound=False, ): + self.step1c = step1c # 1a. reg --> \tilde{betas} tsls = TSLS.BaseTSLS(y=y, x=x, yend=yend, q=q) @@ -599,11 +590,11 @@ def __init__( lambda2 = lambda1 # Forcing the 1st step lambda to be in the range [-0.9, 0.9] to avoid perfect collinearity in step 2 in case of SLX-Error or GNS models - # if lambda2 > 0.9: + #if lambda2 > 0.9: # lambda_old = 0.9 - # elif lambda2 < -0.9: + #elif lambda2 < -0.9: # lambda_old = -0.9 - # else: + #else: lambda_old = lambda2 self.iteration, eps = 0, 1 @@ -639,11 +630,11 @@ def __init__( moments_i = UTILS._moments2eqs(wA1, w, self.u) lambda3 = UTILS.optim_moments(moments_i, vc2) - # if abs(lambda3) <= 0.9: + #if abs(lambda3) <= 0.9: # pass - # elif lambda3 > 0.9: + #elif lambda3 > 0.9: # lambda3 = 0.9 - # elif lambda3 < -0.9: + #elif lambda3 < -0.9: # lambda3 = -0.9 eps = abs(lambda3 - lambda_old) @@ -653,24 +644,14 @@ def __init__( self.iter_stop = UTILS.iter_msg(self.iteration, max_iter) if hard_bound: if abs(lambda3) >= 0.99: - raise Exception( - "Spatial error parameter was outside the bounds of -0.99 and 0.99" - ) + raise Exception("Spatial error parameter was outside the bounds of -0.99 and 0.99") if abs(tsls_s.betas[-1]) >= 0.99: - raise Exception( - "Spatial lag parameter was outside the bounds of -0.99 and 0.99" - ) + raise Exception("Spatial lag parameter was outside the bounds of -0.99 and 0.99") else: if abs(lambda3) >= 0.99: - set_warn( - self, - "Spatial error parameter was outside the bounds of -0.99 and 0.99", - ) + set_warn(self, "Spatial error parameter was outside the bounds of -0.99 and 0.99") if abs(tsls_s.betas[-1]) >= 0.99: - set_warn( - self, - "Spatial lag parameter was outside the bounds of -0.99 and 0.99", - ) + set_warn(self, "Spatial lag parameter was outside the bounds of -0.99 and 0.99") zs = UTILS.get_spFilter(w, lambda3, self.z) P = get_P_hat(self, tsls.hthi, zs) @@ -682,6 +663,7 @@ def __init__( class GM_Endog_Error_Het(BaseGM_Endog_Error_Het): + """ GMM method for a spatial error model with heteroskedasticity and endogenous variables, with results and diagnostics; based on @@ -707,7 +689,7 @@ class GM_Endog_Error_Het(BaseGM_Endog_Error_Het): Number of spatial lags of X to include in the model specification. If slx_lags>0, the specification becomes of the SLX-Error type. slx_vars : either "All" (default) or list of booleans to select x variables - to be lagged + to be lagged max_iter : int Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`. Note: epsilon provides an additional @@ -934,37 +916,32 @@ def __init__( name_w=None, name_ds=None, latex=False, - hard_bound=False, + hard_bound=False ): + n = USER.check_arrays(y, x, yend, q) y, name_y = USER.check_y(y, n, name_y) w = USER.check_weights(w, y, w_required=True, slx_lags=slx_lags) yend, q, name_yend, name_q = USER.check_endog([yend, q], [name_yend, name_q]) x_constant, name_x, warn = USER.check_constant(x, name_x) - name_x = USER.set_name_x( - name_x, x_constant - ) # initialize in case None, includes constant + name_x = USER.set_name_x(name_x, x_constant) # initialize in case None, includes constant set_warn(self, warn) self.title = "GM SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HET)" - if slx_lags > 0: - x_constant, name_x = USER.flex_wx( - w, - x=x_constant, - name_x=name_x, - constant=True, - slx_lags=slx_lags, - slx_vars=slx_vars, - ) + if slx_lags >0: - self.title += " WITH SLX (SLX-Error)" + x_constant,name_x = USER.flex_wx(w,x=x_constant,name_x=name_x,constant=True, + slx_lags=slx_lags,slx_vars=slx_vars) + + self.title += " WITH SLX (SLX-Error)" # OLD - # if slx_lags > 0: - # lag_x = get_lags(w, x_constant[:, 1:], slx_lags) - # x_constant = np.hstack((x_constant, lag_x)) - # name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # no constant - # self.title += " WITH SLX (SLX-Error)" + #if slx_lags > 0: + #lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + #x_constant = np.hstack((x_constant, lag_x)) + #name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # no constant + #self.title += " WITH SLX (SLX-Error)" + BaseGM_Endog_Error_Het.__init__( self, @@ -981,7 +958,7 @@ def __init__( ) self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) - # self.name_x = USER.set_name_x(name_x, x_constant) +# self.name_x = USER.set_name_x(name_x, x_constant) self.name_x = name_x # constant already included self.name_yend = USER.set_name_yend(name_yend, yend) self.name_z = self.name_x + self.name_yend @@ -989,16 +966,16 @@ def __init__( self.name_q = USER.set_name_q(name_q, q) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.name_w = USER.set_name_w(name_w, w) - self.output = pd.DataFrame(self.name_z, columns=["var_names"]) - self.output["var_type"] = ( - ["x"] * len(self.name_x) + ["yend"] * len(self.name_yend) + ["lambda"] - ) - self.output["regime"], self.output["equation"] = (0, 0) + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * len(self.name_yend) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) self.other_top = _summary_iteration(self) output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class BaseGM_Combo_Het(BaseGM_Endog_Error_Het): + """ GMM method for a spatial lag and error model with heteroskedasticity and endogenous variables (note: no consistency checks, diagnostics or constant @@ -1153,6 +1130,7 @@ def __init__( inv_method="power_exp", hard_bound=False, ): + BaseGM_Endog_Error_Het.__init__( self, y=y, @@ -1169,6 +1147,7 @@ def __init__( class GM_Combo_Het(BaseGM_Combo_Het): + """ GMM method for a spatial lag and error model with heteroskedasticity and endogenous variables, with results and diagnostics; based on @@ -1452,30 +1431,27 @@ def __init__( latex=False, hard_bound=False, ): + n = USER.check_arrays(y, x, yend, q) y, name_y = USER.check_y(y, n, name_y) w = USER.check_weights(w, y, w_required=True, slx_lags=slx_lags) yend, q, name_yend, name_q = USER.check_endog([yend, q], [name_yend, name_q]) x_constant, name_x, warn = USER.check_constant(x, name_x) - name_x = USER.set_name_x( - name_x, x_constant - ) # initialize in case None, includes constant + name_x = USER.set_name_x(name_x, x_constant) # initialize in case None, includes constant set_warn(self, warn) if slx_lags > 0: - yend2, q2, wx = set_endog( - y, x_constant[:, 1:], w, yend, q, w_lags, lag_q, slx_lags, slx_vars - ) + yend2, q2, wx = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q, slx_lags,slx_vars) x_constant = np.hstack((x_constant, wx)) else: yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) # OLS - # if slx_lags == 0: - # yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) - # else: - # yend2, q2, wx = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q, slx_lags) - # x_constant = np.hstack((x_constant, wx)) + #if slx_lags == 0: + #yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) + #else: + #yend2, q2, wx = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q, slx_lags) + #x_constant = np.hstack((x_constant, wx)) BaseGM_Combo_Het.__init__( self, @@ -1500,33 +1476,31 @@ def __init__( self.title = "SPATIALLY WEIGHTED 2SLS- GM-COMBO MODEL (HET)" if slx_lags > 0: # adjust for flexwx - if isinstance(slx_vars, list): # slx_vars has True,False - if len(slx_vars) != x.shape[1]: + if (isinstance(slx_vars,list)): # slx_vars has True,False + if len(slx_vars) != x.shape[1] : raise Exception("slx_vars incompatible with x column dimensions") else: # use slx_vars to extract proper columns workname = name_x[1:] kx = len(workname) - vv = list(compress(workname, slx_vars)) + vv = list(compress(workname,slx_vars)) name_x += USER.set_name_spatial_lags(vv, slx_lags) wkx = slx_vars.count(True) else: kx = len(name_x) - 1 wkx = kx - name_x += USER.set_name_spatial_lags( - name_x[1:], slx_lags - ) # exclude constant + name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # exclude constant self.title += " WITH SLX (GNSM)" # OLD - # if slx_lags > 0: - # name_x += USER.set_name_spatial_lags(name_x, slx_lags) - # name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # no constant - # self.title += " WITH SLX (GNSM)" + #if slx_lags > 0: +# name_x += USER.set_name_spatial_lags(name_x, slx_lags) + #name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # no constant + #self.title += " WITH SLX (GNSM)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) - # self.name_x = USER.set_name_x(name_x, x_constant) - self.name_x = name_x # constant already included +# self.name_x = USER.set_name_x(name_x, x_constant) + self.name_x = name_x # constant already included self.name_yend = USER.set_name_yend(name_yend, yend) self.name_yend.append(USER.set_name_yend_sp(self.name_y)) self.name_z = self.name_x + self.name_yend @@ -1536,52 +1510,39 @@ def __init__( if slx_lags > 0: # need to remove all but last SLX variables from name_x self.name_x0 = [] self.name_x0.append(self.name_x[0]) # constant - if isinstance(slx_vars, list): # boolean list passed + if (isinstance(slx_vars,list)): # boolean list passed # x variables that were not lagged - self.name_x0.extend( - list(compress(self.name_x[1:], [not i for i in slx_vars])) - ) + self.name_x0.extend(list(compress(self.name_x[1:],[not i for i in slx_vars]))) # last wkx variables self.name_x0.extend(self.name_x[-wkx:]) + else: - okx = int( - (self.k - self.yend.shape[1] - 1) / (slx_lags + 1) - ) # number of original exogenous vars + okx = int((self.k - self.yend.shape[1] - 1) / (slx_lags + 1)) # number of original exogenous vars self.name_x0.extend(self.name_x[-okx:]) - self.name_q.extend( - USER.set_name_q_sp(self.name_x0, w_lags, self.name_q, lag_q) - ) + self.name_q.extend(USER.set_name_q_sp(self.name_x0, w_lags, self.name_q, lag_q)) - # var_types = ['x'] * (kx + 1) + ['wx'] * kx * slx_lags + ['yend'] * (len(self.name_yend) - 1) + ['rho'] - var_types = ( - ["x"] * (kx + 1) - + ["wx"] * wkx * slx_lags - + ["yend"] * (len(self.name_yend) - 1) - + ["rho", "lambda"] - ) + #var_types = ['x'] * (kx + 1) + ['wx'] * kx * slx_lags + ['yend'] * (len(self.name_yend) - 1) + ['rho'] + var_types = ['x'] * (kx + 1) + ['wx'] * wkx * slx_lags + ['yend'] * (len(self.name_yend) - 1) + ['rho','lambda'] else: - self.name_q.extend( - USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q) - ) - var_types = ( - ["x"] * len(self.name_x) - + ["yend"] * (len(self.name_yend) - 1) - + ["rho", "lambda"] - ) + self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) + var_types = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend) - 1) + ['rho','lambda'] + + - # self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) + #self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.name_w = USER.set_name_w(name_w, w) - self.output = pd.DataFrame(self.name_z, columns=["var_names"]) + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + + self.output['var_type'] = var_types - self.output["var_type"] = var_types + #self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend)-1) + ['rho', 'lambda'] - # self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend)-1) + ['rho', 'lambda'] - - self.output["regime"], self.output["equation"] = (0, 0) + self.output['regime'], self.output['equation'] = (0, 0) self.other_top = _spat_pseudo_r2(self) self.other_top += _summary_iteration(self) output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) @@ -1611,7 +1572,7 @@ def get_psi_sigma(w, u, lamb): def get_vc_het(w, wA1, E): - """ + r""" Computes the VC matrix Psi based on lambda as in Arraiz et al :cite:`Arraiz2010`: ..math:: @@ -1748,6 +1709,7 @@ def get_a1a2(w, wA1, reg, lambdapar, P, zs, inv_method, filt): def get_vc_het_tsls( w, wA1, reg, lambdapar, P, zs, inv_method, filt=True, save_a1a2=False ): + sigma = get_psi_sigma(w, reg.u, lambdapar) vc1 = get_vc_het(w, wA1, sigma) a1, a2 = get_a1a2(w, wA1, reg, lambdapar, P, zs, inv_method, filt) @@ -1831,9 +1793,9 @@ def _test(): import numpy as np import libpysal - db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r") + db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') y = np.array(db.by_col("HOVAL")) - y = np.reshape(y, (49, 1)) + y = np.reshape(y, (49,1)) X = [] X.append(db.by_col("INC")) X = np.array(X).T @@ -1845,23 +1807,11 @@ def _test(): q = np.array(q).T w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) - w.transform = "r" + w.transform = 'r' # reg = GM_Error_Het(y, X, w=w, name_x=['inc'], name_y='hoval', name_ds='columbus', vm=True) # reg = GM_Endog_Error_Het(y, X, yd, q, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], # name_q=['discbd'], name_ds='columbus',vm=True) - reg = GM_Combo_Het( - y, - X, - yd, - q, - w=w, - step1c=True, - name_x=["inc"], - name_y="hoval", - name_yend=["crime"], - name_q=["discbd"], - name_ds="columbus", - vm=True, - ) + reg = GM_Combo_Het(y, X, yd, q, w=w, step1c=True, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus', + vm=True) print(reg.output) - print(reg.summary) + print(reg.summary) \ No newline at end of file diff --git a/spreg/error_sp_hom.py b/spreg/error_sp_hom.py index 6f2f4381..b63908f5 100644 --- a/spreg/error_sp_hom.py +++ b/spreg/error_sp_hom.py @@ -1543,7 +1543,7 @@ def moments_hom(w, wA1, wA2, u): def get_vc_hom(w, wA1, wA2, reg, lambdapar, z_s=None, for_omegaOLS=False): - """ + r""" VC matrix \psi of Spatial error with homoscedasticity. As in Anselin (2011) :cite:`Anselin2011` (p. 20) ... diff --git a/spreg/ml_lag.py b/spreg/ml_lag.py index db968128..8fecd2fb 100755 --- a/spreg/ml_lag.py +++ b/spreg/ml_lag.py @@ -16,13 +16,7 @@ from . import diagnostics as DIAG from . import user_output as USER import pandas as pd -from .output import ( - output, - _nonspat_top, - _spat_diag_out, - _spat_pseudo_r2, - _summary_impacts, -) +from .output import output, _nonspat_top, _spat_diag_out, _spat_pseudo_r2, _summary_impacts from .w_utils import symmetrize from libpysal import weights @@ -37,6 +31,7 @@ class BaseML_Lag(RegressionPropsY, RegressionPropsVM): + """ ML estimation of the spatial lag model (note no consistency checks, diagnostics or constants added) :cite:`Anselin1988` @@ -200,9 +195,9 @@ def __init__(self, y, x, w, slx_lags=0, method="full", epsilon=0.0000001): ylag = weights.lag_spatial(w, y) # b0, b1, e0 and e1 - # now set in ML_Lag - # if slx_lags>0: - # self.x = np.hstack((self.x, get_lags(w, self.x[:, 1:], slx_lags))) +# now set in ML_Lag +# if slx_lags>0: +# self.x = np.hstack((self.x, get_lags(w, self.x[:, 1:], slx_lags))) self.n, self.k = self.x.shape xtx = spdot(self.x.T, self.x) @@ -224,19 +219,19 @@ def __init__(self, y, x, w, slx_lags=0, method="full", epsilon=0.0000001): bounds=(-1.0, 1.0), args=(self.n, e0, e1, W), method="bounded", - options={"xatol": epsilon}, + options={'xatol': epsilon}, ) elif methodML == "LU": I = sp.identity(w.n) Wsp = w.sparse # moved here - W = Wsp # .tocsc() + W = Wsp#.tocsc() res = minimize_scalar( lag_c_loglik_sp, 0.0, bounds=(-1.0, 1.0), args=(self.n, e0, e1, I, Wsp), method="bounded", - options={"xatol": epsilon}, + options={'xatol': epsilon}, ) elif methodML == "ORD": # check on symmetry structure @@ -254,7 +249,7 @@ def __init__(self, y, x, w, slx_lags=0, method="full", epsilon=0.0000001): bounds=(-1.0, 1.0), args=(self.n, e0, e1, evals), method="bounded", - options={"xatol": epsilon}, + options={'xatol': epsilon}, ) else: # program will crash, need to catch @@ -313,7 +308,7 @@ def __init__(self, y, x, w, slx_lags=0, method="full", epsilon=0.0000001): (xTwpy / self.sig2, tr2 + tr3 + wpyTwpy / self.sig2, tr1 / self.sig2) ) v3 = np.vstack( - (np.zeros((self.k, 1)), tr1 / self.sig2, self.n / (2.0 * self.sig2**2)) + (np.zeros((self.k, 1)), tr1 / self.sig2, self.n / (2.0 * self.sig2 ** 2)) ) v = np.hstack((v1, v2, v3)) @@ -323,6 +318,7 @@ def __init__(self, y, x, w, slx_lags=0, method="full", epsilon=0.0000001): class ML_Lag(BaseML_Lag): + """ ML estimation of the spatial lag model with all results and diagnostics; :cite:`Anselin1988` @@ -351,7 +347,7 @@ class ML_Lag(BaseML_Lag): Include average direct impact (ADI), average indirect impact (AII), and average total impact (ATI) in summary results. Options are 'simple', 'full', 'power', 'all' or None. - See sputils._spmultiplier for more information. + See sputils.spmultiplier for more information. vm : boolean if True, include variance-covariance matrix in summary results @@ -619,56 +615,38 @@ def __init__( y, name_y = USER.check_y(y, n, name_y) w = USER.check_weights(w, y, w_required=True, slx_lags=slx_lags) x_constant, name_x, warn = USER.check_constant(x, name_x) - name_x = USER.set_name_x( - name_x, x_constant - ) # needs to be initialized for none, now with constant + name_x = USER.set_name_x(name_x, x_constant) # needs to be initialized for none, now with constant set_warn(self, warn) method = method.upper() # using flex_wx kx = len(name_x) if slx_lags > 0: - x_constant, name_x = USER.flex_wx( - w, - x=x_constant, - name_x=name_x, - constant=True, - slx_lags=slx_lags, - slx_vars=slx_vars, - ) - if isinstance(slx_vars, list): + x_constant,name_x = USER.flex_wx(w,x=x_constant,name_x=name_x,constant=True, + slx_lags=slx_lags,slx_vars=slx_vars) + if isinstance(slx_vars,list): kw = slx_vars.count(True) if kw < kx - 1: - spat_diag = False # no common factor test + spat_diag = False # no common factor test else: - kw = kx - 1 + kw = kx-1 + BaseML_Lag.__init__( - self, - y=y, - x=x_constant, - w=w, - slx_lags=slx_lags, - method=method, - epsilon=epsilon, + self, y=y, x=x_constant, w=w, slx_lags=slx_lags, method=method, epsilon=epsilon ) # increase by 1 to have correct aic and sc, include rho in count self.k += 1 - if slx_lags > 0: - # kx = len(name_x) - # name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # exclude constant - - self.title = ( - "MAXIMUM LIKELIHOOD SPATIAL LAG WITH SLX - SPATIAL DURBIN MODEL" - + " (METHOD = " - + method - + ")" - ) - # var_types = ['x'] * kx + ['wx'] * (kx-1) * slx_lags + ['rho'] - var_types = ["x"] * kx + ["wx"] * (kw) * slx_lags + ["rho"] + if slx_lags>0: + # kx = len(name_x) + # name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # exclude constant + + self.title = "MAXIMUM LIKELIHOOD SPATIAL LAG WITH SLX - SPATIAL DURBIN MODEL" + " (METHOD = " + method + ")" +# var_types = ['x'] * kx + ['wx'] * (kx-1) * slx_lags + ['rho'] + var_types = ['x'] * kx + ['wx'] * (kw) * slx_lags + ['rho'] else: self.title = "MAXIMUM LIKELIHOOD SPATIAL LAG" + " (METHOD = " + method + ")" - var_types = ["x"] * len(name_x) + ["rho"] + var_types = ['x'] * len(name_x) + ['rho'] self.slx_lags = slx_lags self.slx_vars = slx_vars self.name_ds = USER.set_name_ds(name_ds) @@ -679,25 +657,22 @@ def __init__( self.name_w = USER.set_name_w(name_w, w) self.aic = DIAG.akaike(reg=self) self.schwarz = DIAG.schwarz(reg=self) - self.output = pd.DataFrame(self.name_x, columns=["var_names"]) - self.output["var_type"] = var_types - self.output["regime"], self.output["equation"] = (0, 0) + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = var_types + self.output['regime'], self.output['equation'] = (0, 0) self.other_top = _spat_pseudo_r2(self) self.other_top += _nonspat_top(self, ml=True) diag_out = None - if spat_diag and slx_lags == 1: - diag_out = _spat_diag_out(self, w, "yend", ml=True) + if spat_diag and slx_lags==1: + diag_out = _spat_diag_out(self, w, 'yend', ml=True) if spat_impacts: - self.sp_multipliers, impacts_str = _summary_impacts( - self, w, spat_impacts, slx_lags, slx_vars - ) + self.sp_multipliers, impacts_str = _summary_impacts(self, w, spat_impacts, slx_lags,slx_vars) try: diag_out += impacts_str except TypeError: diag_out = impacts_str output(reg=self, vm=vm, robust=False, other_end=diag_out, latex=latex) - def lag_c_loglik(rho, n, e0, e1, W): # concentrated log-lik for lag model, no constants, brute force er = e0 - rho * e1 @@ -748,7 +723,6 @@ def _test(): doctest.testmod() np.set_printoptions(suppress=start_suppress) - if __name__ == "__main__": _test() diff --git a/spreg/ml_lag_regimes.py b/spreg/ml_lag_regimes.py index bfaa695c..c77e8dd1 100644 --- a/spreg/ml_lag_regimes.py +++ b/spreg/ml_lag_regimes.py @@ -12,19 +12,14 @@ from .ml_lag import BaseML_Lag from .utils import set_warn, get_lags import pandas as pd -from .output import ( - output, - _nonspat_top, - _spat_diag_out, - _spat_pseudo_r2, - _summary_impacts, -) +from .output import output, _nonspat_top, _spat_diag_out, _spat_pseudo_r2, _summary_impacts __all__ = ["ML_Lag_Regimes"] class ML_Lag_Regimes(BaseML_Lag, REGI.Regimes_Frame): + """ ML estimation of the spatial lag model with regimes (note no consistency checks, diagnostics or constants added) :cite:`Anselin1988`. @@ -74,7 +69,7 @@ class ML_Lag_Regimes(BaseML_Lag, REGI.Regimes_Frame): Include average direct impact (ADI), average indirect impact (AII), and average total impact (ATI) in summary results. Options are 'simple', 'full', 'power', 'all' or None. - See sputils._spmultiplier for more information. + See sputils.spmultiplier for more information. cores : boolean Specifies if multiprocessing is to be used Default: no multiprocessing, cores = False @@ -337,6 +332,7 @@ def __init__( name_regimes=None, latex=False, ): + n = USER.check_arrays(y, x) y, name_y = USER.check_y(y, n, name_y) w = USER.check_weights(w, y, w_required=True, slx_lags=slx_lags) @@ -353,9 +349,7 @@ def __init__( name_x += USER.set_name_spatial_lags(name_x, slx_lags) kwx = lag_x.shape[1] - self.name_x_r = USER.set_name_x(name_x, x_constant) + [ - USER.set_name_yend_sp(name_y) - ] + self.name_x_r = USER.set_name_x(name_x, x_constant) + [USER.set_name_yend_sp(name_y)] self.method = method self.epsilon = epsilon regimes, name_regimes = USER.check_reg_list(regimes, name_regimes, n) @@ -427,7 +421,7 @@ def __init__( constant_regi, cols2regi=cols2regi[:-1], names=name_x, - rlist=True, + rlist=True ) self.name_x.append("_Global_" + USER.set_name_yend_sp(name_y)) BaseML_Lag.__init__(self, y=y, x=x, w=w, method=method, epsilon=epsilon) @@ -438,46 +432,31 @@ def __init__( self.aic = DIAG.akaike(reg=self) self.schwarz = DIAG.schwarz(reg=self) self.regime_lag_sep = regime_lag_sep - self.output = pd.DataFrame(self.name_x, columns=["var_names"]) - self.output["regime"] = x_rlist + ["_Global"] - self.output["var_type"] = ["x"] * (len(self.name_x) - 1) + ["rho"] - self.output["equation"] = 0 + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['regime'] = x_rlist + ['_Global'] + self.output['var_type'] = ['x'] * (len(self.name_x) - 1) + ['rho'] + self.output['equation'] = 0 self.slx_lags = slx_lags diag_out = None if slx_lags > 0: - self.title = ( - "MAXIMUM LIKELIHOOD SPATIAL DURBIN - REGIMES" - + " (METHOD = " - + method - + ")" - ) - fixed_wx = cols2regi[-(kwx + 1) : -1].count(False) + self.title = ("MAXIMUM LIKELIHOOD SPATIAL DURBIN - REGIMES"+ " (METHOD = "+ method+ ")") + fixed_wx = cols2regi[-(kwx+1):-1].count(False) kwx = kwx - fixed_wx if kwx > 0: for m in self.regimes_set: - r_output = self.output[ - (self.output["regime"] == str(m)) - & (self.output["var_type"] == "x") - ] + r_output = self.output[(self.output['regime'] == str(m)) & (self.output['var_type'] == 'x')] wx_index = r_output.index[-kwx:] - self.output.loc[wx_index, "var_type"] = "wx" + self.output.loc[wx_index, 'var_type'] = 'wx' if fixed_wx > 0: - f_wx_index = self.output.index[-(fixed_wx + 1) : -1] - self.output.loc[f_wx_index, "var_type"] = "wx" + f_wx_index = self.output.index[-(fixed_wx+1):-1] + self.output.loc[f_wx_index, 'var_type'] = 'wx' if spat_diag and slx_lags == 1: - diag_out = _spat_diag_out(self, w, "yend", ml=True) + diag_out = _spat_diag_out(self, w, 'yend', ml=True) else: - self.title = ( - "MAXIMUM LIKELIHOOD SPATIAL LAG - REGIMES" - + " (METHOD = " - + method - + ")" - ) - + self.title = ("MAXIMUM LIKELIHOOD SPATIAL LAG - REGIMES"+ " (METHOD = "+ method+ ")") + if spat_impacts: - self.sp_multipliers, impacts_str = _summary_impacts( - self, w, spat_impacts, slx_lags, regimes=True - ) + self.sp_multipliers, impacts_str = _summary_impacts(self, w, spat_impacts, slx_lags, regimes=True) try: diag_out += impacts_str except TypeError: @@ -508,7 +487,7 @@ def ML_Lag_Regimes_Multi( name_ds, latex, ): - # pool = mp.Pool(cores) + #pool = mp.Pool(cores) results_p = {} """ for r in self.regimes_set: @@ -585,9 +564,7 @@ def ML_Lag_Regimes_Multi( results = {} self.name_y, self.name_x = [], [] counter = 0 - self.output = pd.DataFrame( - columns=["var_names", "var_type", "regime", "equation"] - ) + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -603,13 +580,21 @@ def ML_Lag_Regimes_Multi( (counter * self.kr) : ((counter + 1) * self.kr), (counter * self.kr) : ((counter + 1) * self.kr), ] = results[r].vm - self.betas[(counter * self.kr) : ((counter + 1) * self.kr),] = results[ - r - ].betas - self.u[regi_ids[r],] = results[r].u - self.predy[regi_ids[r],] = results[r].predy - self.predy_e[regi_ids[r],] = results[r].predy_e - self.e_pred[regi_ids[r],] = results[r].e_pred + self.betas[ + (counter * self.kr) : ((counter + 1) * self.kr), + ] = results[r].betas + self.u[ + regi_ids[r], + ] = results[r].u + self.predy[ + regi_ids[r], + ] = results[r].predy + self.predy_e[ + regi_ids[r], + ] = results[r].predy_e + self.e_pred[ + regi_ids[r], + ] = results[r].e_pred self.name_y += results[r].name_y self.name_x += results[r].name_x results[r].other_top = _spat_pseudo_r2(results[r]) @@ -617,26 +602,17 @@ def ML_Lag_Regimes_Multi( results[r].other_mid = "" if slx_lags > 0: kx = (len(results[r].name_x) - 1) // (slx_lags + 1) - var_types = ["x"] * (kx + 1) + ["wx"] * kx * slx_lags + ["rho"] + var_types = ['x'] * (kx + 1) + ['wx'] * kx * slx_lags + ['rho'] else: - var_types = ["x"] * (len(results[r].name_x) - 1) + ["rho"] - results[r].output = pd.DataFrame( - { - "var_names": results[r].name_x, - "var_type": var_types, - "regime": r, - "equation": r, - } - ) + var_types = ['x'] * (len(results[r].name_x) - 1) + ['rho'] + results[r].output = pd.DataFrame({'var_names': results[r].name_x, + 'var_type': var_types, + 'regime': r, 'equation': r}) self.output = pd.concat([self.output, results[r].output], ignore_index=True) if spat_diag and slx_lags == 1: - results[r].other_mid += _spat_diag_out( - results[r], None, "yend", ml=True - ) + results[r].other_mid += _spat_diag_out(results[r], None, 'yend', ml=True) if spat_impacts: - results[r].sp_multipliers, impacts_str = _summary_impacts( - results[r], results[r].w, spat_impacts, slx_lags - ) + results[r].sp_multipliers, impacts_str = _summary_impacts(results[r], results[r].w, spat_impacts, slx_lags) results[r].other_mid += impacts_str counter += 1 self.multi = results @@ -663,21 +639,9 @@ def _work( x_r = x[regi_ids[r]] model = BaseML_Lag(y_r, x_r, w_r, method=method, epsilon=epsilon) if slx_lags == 0: - model.title = ( - "MAXIMUM LIKELIHOOD SPATIAL LAG - REGIME " - + str(r) - + " (METHOD = " - + method - + ")" - ) + model.title = ("MAXIMUM LIKELIHOOD SPATIAL LAG - REGIME "+ str(r)+ " (METHOD = "+ method+ ")") else: - model.title = ( - "MAXIMUM LIKELIHOOD SPATIAL DURBIN - REGIME " - + str(r) - + " (METHOD = " - + method - + ")" - ) + model.title = ("MAXIMUM LIKELIHOOD SPATIAL DURBIN - REGIME "+ str(r)+ " (METHOD = "+ method+ ")") model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] diff --git a/spreg/nslx.py b/spreg/nslx.py new file mode 100755 index 00000000..b31c210a --- /dev/null +++ b/spreg/nslx.py @@ -0,0 +1,581 @@ +""" +Estimation of Nonlinear SLX Model +""" + +__author__ = "Luc Anselin lanselin@gmail.com, \ + Pedro V. Amaral pedrovma@gmail.com" + +import numpy as np +import pandas as pd +from scipy.sparse import coo_array,csr_array +from scipy.optimize import minimize +from libpysal.cg import KDTree +from . import user_output as USER +import numpy.linalg as la +from .utils import make_wnslx, RegressionPropsY, set_warn +from .output import output, _nslx_out +from .diagnostics import log_likelihood,akaike,schwarz +from itertools import compress + +__all__ = ["NSLX"] + + +class BaseNSLX(RegressionPropsY): + ''' + Estimation of the nonlinear SLX model (note no consistency + checks, diagnostics or constants added) - inverse distance + power function and negative exponential distance function supported + + Parameters + ---------- + y : n by 1 numpy array with dependent variable + x : n by k array with explanatory variables, includes constant + xw : n by h array with selected columns of X that will have the + W(alpha) transformation applied to them + w : list of sparse CSR arrays to use in lag transformation, if + list has a single element, same weights applied for all + transform : tuple of transformations, either "exponential" or "power" + when same transformation applies to all, tuple is a single + element tuple (default is "power") + var_flag : flag for analytical computation of variance-covariance matrix + passed from NSLX + verbose : option for nonlinear optimization, either False (default) or + True + options : options specific to scipy minimize, such as {"disp":True} + (see scipy minimize docs) + + Attributes + ---------- + y : n by 1 numpy array with dependent variable + x : n by k array with explanatory variables, includes constant + xw : n by h array with selected columns of X that will have the + W(alpha) transformation applied to them + w : list of sparse CSR arrays to use in lag transformation, if + list has a single element, same weights applied for all + n : number of observations + k : number of explanatory variables in X (includes constant) + transform : tuple of transformations, either "power" or "exponential" + when same transformation applies to all, tuple is a single + element tuple (default is "power") + verbose : option for nonlinear optimization, either False (default) or + True + options : options specific to scipy minimize, such as {"disp":True} + (see scipy minimize docs) + betas : numpy array with parameter estimates + utu : sum of squared residuals + ihess : inverse of Hessian matrix + sign : estimate of residual variance (divided by n) + sig2 : same as sign + vm : coefficient variance-covariance matrix (sign x ihess) + predy : vector of predicted values + u : vector of residuals + + ''' + + def __init__(self,y,x,xw,w,transform,var_flag,verbose,options): + self.y = y + self.x = x + self.xw = xw + self.n = self.x.shape[0] + h = self.xw.shape[1] + kk = self.x.shape[1] + self.k = kk + h + + self.w = w + self.transform = transform + self.verbose = verbose + self.options = options + + h = self.xw.shape[1] + + xty = self.x.T @ self.y + xtx = self.x.T @ self.x + b0 = la.solve(xtx,xty) + + alpha0 = np.ones((h,1)) # initial value + g0 = np.vstack((b0,alpha0)) + gamma0 = g0.flatten() + gradflag=0 + + ssmin = minimize(nslxobj,gamma0, + args=(self.y,self.x,self.xw,self.w,self.transform,self.verbose), + options=self.options) + + self.betas = ssmin.x + self.utu = ssmin.fun + self.sign = self.utu / self.n + self.sig2 = self.sign + + # convergence criteria + self.success = ssmin.success + self.status = ssmin.status + self.message = ssmin.message + self.nit = ssmin.nit + + # compute predy + b= self.betas[0:-h] + alpha = self.betas[-h:] + + wx = nlmod(alpha,xw,w,transform,gradflag=0) + wxs = np.sum(wx,axis=1).reshape(-1,1) + xb = x @ b.reshape(-1,1) + predy = xb + wxs + self.predy = predy + self.u = self.y - self.predy + + # compute variance from gradients + if var_flag: + vb = np.zeros((self.k,self.k)) + xtxi = la.inv(xtx) + vb[:kk,:kk] = xtxi + wax = nlmod(alpha,xw,w,transform,gradflag=1) + waxtwax = wax.T @ wax + try: + waxtwaxi = la.inv(waxtwax) + except: + raise Exception("Singular variance matrix for nonlinear part") + vb[-h:,-h:] = waxtwaxi + self.vm = vb * self.sig2 + elif var_flag == 0: + self.ihess = ssmin.hess_inv + self.vm = self.ihess * self.sig2 + + + + + + +class NSLX(BaseNSLX): + + ''' + Estimation of the nonlinear SLX model - inverse distance + power function and negative exponential distance function supported + Includes output of all results. + + Parameters + ---------- + y : numpy.ndarray or pandas.Series + nx1 array for dependent variable + x : numpy.ndarray or pandas object + Two dimensional array with n rows and one column for each + independent (exogenous) variable, excluding the constant + coords : an n by 2 array or a selection of two columns from a data frame + params : a list of tuples containing the two parameters for the construction + of the distance weights and the transformation: + (k,distance_upper_bound,transformation) + if the list consists of a single element, the same parameters are + applied to all transformations + default is [(10,np.inf,"exponential")] for 10 knn neighbors, variable + bandwidth and exponential transformation + (see make_wnslx in UTILS) + distance_metric: metric for distance computations, either "Euclidean" (default) or "Arc" + (for decimal lat-lon degrees) + leafsize : parameter used to creat KDTree, default is 30 + slx_vars : list with True,False for selection of X variables to which SLX should be applied + default is "All" + var_flag : flag for variance computation, default = 1 - uses analytical derivation, + = 0 - uses numerical approximation with inverse hessian + conv_flag : flag for convergence diagnostics, default = 0 for no diagnostics + = 1 - prints our minimize convergence summary + verbose : boolean for intermediate results in nonlinear optimization, default is False + options : options specific to scipy minimize, such as {"disp":True} + (see scipy minimize docs) + vm : boolean + if True, include variance-covariance matrix in summary + results + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_coords : list of strings + Names of coordinate variables used in distance matrix + name_ds : string + Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format + + Attributes + ---------- + output : dataframe + regression results pandas dataframe + summary : string + Summary of regression results and diagnostics (note: use in + conjunction with the print command) + y : n by 1 numpy array with dependent variable + x : n by k array with explanatory variables, includes constant + xw : n by h array with selected columns of X that will have the + W(alpha) transformation applied to them + w : list of sparse CSR arrays to use in lag transformation, if + list has a single element, same weights applied for all + n : number of observations + k : number of explanatory variables in X (includes constant) + transform : tuple of transformations, either "power" or "exponential" + when same transformation applies to all, tuple is a single + element tuple (default is "power") + verbose : option for nonlinear optimization, either False (default) or + True + options : options specific to scipy minimize, such as {"disp":True} + (see scipy minimize docs) + betas : numpy array with parameter estimates + utu : sum of squared residuals + ihess : inverse of Hessian matrix + sign : estimate of residual variance (divided by n) + sig2 : same as sign + vm : coefficient variance-covariance matrix (sign x ihess) + predy : vector of predicted values + u : vector of residuals + ll : float + Log likelihood + aic : float + Akaike information criterion + schwarz : float + Schwarz information criterion + name_x : variable names for explanatory variables + name_ds : data set name + name_y : name of dependent variable + title : output header + + Example + -------- + >>> import numpy as np + >>> import geopandas as gpd + >>> from libpysal.examples import load_example, get_path + >>> import spreg + + Open data on Chicago census tract SDOH variables from libpysal examples using geopandas. + If you don't have chicagoSDOH installed, you can do so by running `load_example('chicagoSDOH')`. + + >>> dfs = gpd.read_file(get_path('Chi-SDOH.shp')) + + For this example, we will use the 'HIS_ct' column (economic hardship index) as the + dependent variable and the 'Blk14P', 'Hisp14P', and 'EP_NOHSDP' columns as independent + variables. The coordinates "COORD_X" and "COORD_Y" will be used to construct the + spatial weights matrix. + + >>> y = dfs[['HIS_ct']] + >>> x = dfs[['Blk14P','Hisp14P','EP_NOHSDP']] + >>> coords = dfs[["COORD_X","COORD_Y"]] + + For the regression, we set var_flag = 1 to obtain the analytical standard errors. + + >>> reg = spreg.NSLX(y, x, coords, var_flag=1) + + We can easily obtain a full summary of all the results nicely formatted and + ready to be printed: + + >>> print(reg.summary) + REGRESSION RESULTS + ------------------ + + SUMMARY OF OUTPUT: NONLINEAR SLX + -------------------------------- + Data set : unknown + Dependent Variable : HIS_ct Number of Observations: 791 + Mean dependent var : 39.7301 Number of Variables : 7 + S.D. dependent var : 13.8098 Degrees of Freedom : 784 + Sigma-square : 32.287 Sum squared residual : 25538.9 + S.E. of regression : 5.682 Log likelihood : -2496.609 + Schwarz criterion : 5039.931 Akaike info criterion : 5007.218 + Coordinates : COORD_X, COORD_Y Distance metric : Euclidean + + ------------------------------------------------------------------------------------ + Variable Coefficient Std.Error z-Statistic Probability + ------------------------------------------------------------------------------------ + CONSTANT 17.90865 0.43461 41.20635 0.00000 + Blk14P 0.17910 0.00806 22.21475 0.00000 + Hisp14P 0.05818 0.01525 3.81435 0.00014 + EP_NOHSDP 0.65462 0.02750 23.80062 0.00000 + We_Blk14P 17.16669 1.70331 10.07841 0.00000 + We_Hisp14P 88.30447 81337.66263 0.00109 0.99913 + We_EP_NOHSDP 10.02114 0.36436 27.50353 0.00000 + ------------------------------------------------------------------------------------ + Transformation: exponential + KNN: 10 + Distance upper bound: inf + ================================ END OF REPORT ===================================== + + ''' + + def __init__( + self, + y, + x, + coords, + params = [(10,np.inf,"exponential")], + distance_metric = "Euclidean", + leafsize = 30, + slx_vars="All", + var_flag=1, + conv_flag=0, + verbose = False, + options = None, + vm = False, + name_y=None, + name_x=None, + name_ds=None, + name_coords=None, + latex=False, + ): + + n = USER.check_arrays(y, x) + y, name_y = USER.check_y(y, n, name_y) + + coords,name_coords = USER.check_coords(coords,name_coords) + x_constant, name_x, warn = USER.check_constant(x, name_x) + + name_x = USER.set_name_x(name_x, x_constant) # needs to be initialized for none, now with constant + set_warn(self, warn) + + g = len(params) + + transform = [ i[2] for i in params ] + + xw = x_constant[:,1:] + + xw_name = name_x[1:] + if isinstance(slx_vars,list): # not W(a)X for all X + + xw = xw[:,slx_vars] + xw_name = list(compress(xw_name,slx_vars)) + + if g > 1: + + if g == len(slx_vars): # params does not have correct form + params = list(compress(params,slx_vars)) + transform = list(compress(transform,slx_vars)) + elif g != len(xw_name): + raise Exception("Mismatch dimension params and slx_vars") + + if g == 1: + cp = transform[0][0] + xw_name = [ "W"+ cp + "_"+ i for i in xw_name] + elif len(transform) > 1: + if len(transform) != len(xw_name): + raise Exception("Dimension mismatch") + else: + xw_name = ["W" + i[0][0] + "_" + j for i,j in zip(transform,xw_name)] + + self.name_x = name_x + xw_name + + # build up w list + w = [0 for i in range(len(params))] + for i in range(len(params)): + w[i] = make_wnslx(coords,params[i],leafsize=leafsize,distance_metric=distance_metric) + + BaseNSLX.__init__(self,y=y,x=x_constant,xw=xw,w=w,transform=transform, + var_flag=var_flag, + verbose=verbose,options=options) + + if conv_flag: + print("Convergence Characteristics") + print("Success :",self.success) + print("Status :",self.status) + print("Message :",self.message) + print("Iterations :",self.nit) + + self.ll = log_likelihood(self) + self.aic = akaike(self) + self.schwarz = schwarz(self) + + self.distance_metric = distance_metric + self.knn = [ i[0] for i in params ] + self.d_upper_bound = [ i[1] for i in params ] + self.name_ds = USER.set_name_ds(name_ds) + self.name_y = USER.set_name_y(name_y) + self.name_coords = name_coords + self.title = "NONLINEAR SLX" + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * len(name_x) + ['wx'] * len(xw_name) + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _nslx_out(self, "top") + self.other_mid = _nslx_out(self, "mid") + output(reg=self, vm=vm, latex=latex) + + + +def nslxobj(gamma0,y,x,xw,w,transform,verbose): + ''' + Objective function for minimize call, computes the sum of squared residuals in + the nonlinear SLX model. Note that all parameters other than gamma0 are passed + in an arg parameter to the minimize function + + Parameters + ---------- + gamma0 : current parameter estimates, consists of beta (for X) and alpha (for WX) + needs to be a flattened array + y : n by 1 vector with observations on the dependent variable + x : n by k matrix with observations on X, must include constant vector + xw : n by h matrix with columns of X that will be spatially lagged + w : list of sparse CSR weights for the X column transformations; if same weights + are used for all columns, must be a single element list + transform : list of transformations (either "power" or "exponential") to be applied to each + column of xw; if same transformation is applied to all, must be a single element + list + verbose : verbose option, whether or not the intermediate parameter values and residual sum + of squares are printed out + + Returns + ------- + res2 : sum of squared residuals + + ''' + n = xw.shape[0] + h = xw.shape[1] + if verbose: + print("gamma0",gamma0) + b0 = gamma0[0:-h] + alpha0 = gamma0[-h:] + # create WX + wx = nlmod(alpha0,xw,w,transform,gradflag=0) + wxs = np.sum(wx,axis=1).reshape(-1,1) + xb = x @ b0.reshape(-1,1) + res = y - xb - wxs + res2 = res.T @ res + if verbose: + print("res2",res2) + + return res2 + +def nlmod(alpha,xw,w,transform,gradflag=0): + ''' + Constructs the matrix of spatially lagged X variables W(a)X (for gradflag = 0) and + the gradient matrix d(W(x))/d(a) (for gradflag = 1), for possibly different + alpha parameters, different weights and different transformations (the transformations + must match the weights and are not checked for compatibility). Calls nltransform for + each relevant column of X. This allows the possibility that not all columns of X are + used, defined by slx_vars. + + Parameters + ---------- + alpha : array with alpha parameters, same number as relevant columns in X + must be flattened (not a vector) + xw : matrix with relevant columns of X to be lagged + w : a list containing the weights matrix (as sparse CSR) for each column + if the same weights matrix are used for all columns, w is a single element list, + otherwise, the number of weights must match the number of relevant columns in X and + the number of elements in the transform tuple + transform : a tuple with the transformations for each relevant column in X, either "power" or "exponential" + if the same transformation is used for all columns, transform is a single element tuple + (transform,), otherwise, the number of elements in the tuple must match the number of relevant + columns in X and the number of elements in the weights matrix tuple + the transformation must match the type of weights, but this is not checked + gradflag : flag for computation of gradient matrix, = 0 by default (actual function) + = 1 - computes matrix of first partial derivatives with respect to alpha + + Returns + ------- + wx : matrix with spatially lagged X variables + + ''' + # alpha must be flattened + h = len(alpha) # number of parameters + if xw.shape[1] != h: + raise Exception("Incompatible dimensions") + g = len(w) # number of weights + if len(transform) != g: + raise Exception("Incompatible dimensions") + + # initialize matrix of spatial lags + n = xw.shape[0] + wx = np.zeros((n,h)) + + for i in range(h): + + if g == 1: + walpha = _nltransform(alpha[i],w[0],transform[0],gradflag=gradflag) # element of single element tuple + elif g > 1: + walpha = _nltransform(alpha[i],w[i],transform[i],gradflag=gradflag) + else: + raise Exception("Operation not supported") + wx[:,i] = walpha @ xw[:,i] + return wx + + +def _nltransform(a,w,transform,gradflag=0): + ''' + Constructs the transformed CSR sparse array for power and exponential transformation + for a given alpha parameter, input array and transformation. Note that the alpha parameters + are positive, but are used as negative powers in the exponential transformation. + + Parameters + ---------- + a : alpha coefficient as a (positive) scalar + w : CSR sparse array with weights + transform : transformation, either "power" or "exponential" + gradflag : flag for gradient computation, 0 = function, default + 1 = gradient + + Returns + ------- + walpha : CSR sparse array with transformed weights + + ''' + walpha = w.copy() + + if transform.lower() == "exponential": + + wdata = walpha + awdata = wdata * a + + if gradflag == 0: + + awdata = -awdata + np.exp(awdata.data, out= awdata.data) + walpha = awdata + + elif gradflag == 1: + + ww = -awdata + np.log(awdata.data, out = awdata.data) + wln = awdata + w1 = wln.multiply(wdata) + np.exp(ww.data, out=ww.data) + wgrad = ww.multiply(w1) + wgrad = -wgrad + walpha = wgrad + + elif transform.lower() == "power": + + if gradflag == 0: + walpha = walpha.power(a) + elif gradflag == 1: + walpha = walpha.power(a) + ww = w.copy() + np.log(ww.data, out = ww.data) + wgrad = walpha.multiply(ww) + walpha = wgrad + + + else: + raise Exception("Transformation not supported") + + return walpha + + + +def _test(): + import doctest + + # the following line could be used to define an alternative to the '' flag + # doctest.BLANKLINE_MARKER = 'something better than ' + start_suppress = np.get_printoptions()["suppress"] + np.set_printoptions(suppress=True) + doctest.testmod() + np.set_printoptions(suppress=start_suppress) + + +if __name__ == "__main__": + _test() + + from libpysal.examples import load_example, get_path + import geopandas as gpd + import spreg + + dfs = gpd.read_file(load_example('columbus').get_path("columbus.shp")) + dfs['geometry'] = gpd.points_from_xy(dfs['X'], dfs['Y']) # Transforming polygons to points + y = dfs[["CRIME"]] + x = dfs[["INC","HOVAL","DISCBD"]] + reg = spreg.NSLX(y, x, dfs['geometry']) + print(reg.output) + print(reg.summary) + diff --git a/spreg/ols.py b/spreg/ols.py index 850d1ec1..50237661 100644 --- a/spreg/ols.py +++ b/spreg/ols.py @@ -8,12 +8,13 @@ from . import robust as ROBUST from .utils import spdot, RegressionPropsY, RegressionPropsVM, set_warn, get_lags import pandas as pd -from libpysal import weights # needed for check on kernel weights in slx +from libpysal import weights # needed for check on kernel weights in slx __all__ = ["OLS"] class BaseOLS(RegressionPropsY, RegressionPropsVM): + """ Ordinary least squares (OLS) (note: no consistency checks, diagnostics or constant added) @@ -437,8 +438,8 @@ def __init__( w=None, robust=None, gwk=None, - slx_lags=0, - slx_vars="All", + slx_lags = 0, + slx_vars = "All", sig2n_k=True, nonspat_diag=True, spat_diag=False, @@ -453,42 +454,34 @@ def __init__( name_ds=None, latex=False, ): + n = USER.check_arrays(y, x) y, name_y = USER.check_y(y, n, name_y) USER.check_robust(robust, gwk) - if robust == "hac" and spat_diag: - set_warn( - self, - "Spatial diagnostics are not available for HAC estimation. Hence, spatial diagnostics have been disabled for this model.", - ) - spat_diag = False + spat_diag, warn = USER.check_spat_diag(spat_diag=spat_diag, w=w, robust=robust, slx_lags=slx_lags) + set_warn(self, warn) + if robust in ["hac", "white"] and white_test: - set_warn( - self, - "White test not available when standard errors are estimated by HAC or White correction.", - ) - white_test = False + set_warn( + self, + "White test not available when standard errors are estimated by HAC or White correction.", + ) + white_test = False x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) self.name_x = USER.set_name_x(name_x, x_constant) - + if spat_diag or moran: - w = USER.check_weights(w, y, slx_lags=slx_lags, w_required=True) + w = USER.check_weights(w, y, slx_lags=slx_lags, w_required=True, allow_wk=True) else: - w = USER.check_weights(w, y, slx_lags=slx_lags) - if slx_lags > 0: - # lag_x = get_lags(w, x_constant[:, 1:], slx_lags) - # x_constant = np.hstack((x_constant, lag_x)) - # self.name_x += USER.set_name_spatial_lags(self.name_x[1:], slx_lags) - x_constant, self.name_x = USER.flex_wx( - w, - x=x_constant, - name_x=self.name_x, - constant=True, - slx_lags=slx_lags, - slx_vars=slx_vars, - ) + w = USER.check_weights(w, y, slx_lags=slx_lags, allow_wk=True) + if slx_lags >0: +# lag_x = get_lags(w, x_constant[:, 1:], slx_lags) +# x_constant = np.hstack((x_constant, lag_x)) +# self.name_x += USER.set_name_spatial_lags(self.name_x[1:], slx_lags) + x_constant,self.name_x = USER.flex_wx(w,x=x_constant,name_x=self.name_x,constant=True, + slx_lags=slx_lags,slx_vars=slx_vars) BaseOLS.__init__( self, y=y, x=x_constant, robust=robust, gwk=gwk, sig2n_k=sig2n_k @@ -502,24 +495,21 @@ def __init__( self.robust = USER.set_robust(robust) self.name_w = USER.set_name_w(name_w, w) self.name_gwk = USER.set_name_w(name_gwk, gwk) - self.output = pd.DataFrame(self.name_x, columns=["var_names"]) - self.output["var_type"] = ["x"] * len(self.name_x) - self.output["regime"], self.output["equation"] = (0, 0) - self.other_top, self.other_mid, other_end = ( - "", - "", - "", - ) # strings where function-specific diag. are stored + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top, self.other_mid, other_end = ("", "", "") # strings where function-specific diag. are stored if nonspat_diag: self.other_mid += _nonspat_mid(self, white_test=white_test) self.other_top += _nonspat_top(self) if vif: self.other_mid += _summary_vif(self) if spat_diag: - other_end += _spat_diag_out(self, w, "ols", moran=moran) + other_end += _spat_diag_out(self, w, 'ols', moran=moran) output(reg=self, vm=vm, robust=robust, other_end=other_end, latex=latex) + def _test(): import doctest diff --git a/spreg/ols_regimes.py b/spreg/ols_regimes.py index 4376ef42..5a6027a5 100755 --- a/spreg/ols_regimes.py +++ b/spreg/ols_regimes.py @@ -9,20 +9,12 @@ import pandas as pd from . import regimes as REGI from . import user_output as USER -from .utils import ( - set_warn, - RegressionProps_basic, - spdot, - RegressionPropsY, - get_lags, - optim_k, -) +from .utils import set_warn, RegressionProps_basic, spdot, RegressionPropsY, get_lags, optim_k from .ols import BaseOLS from .robust import hac_multi from .output import output, _spat_diag_out, _nonspat_mid, _nonspat_top from .skater_reg import Skater_reg - class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): """ Ordinary least squares with results and diagnostics. @@ -358,9 +350,9 @@ class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): ------------------------------------------------------------------------------------ Variable Coefficient Std.Error t-Statistic Probability ------------------------------------------------------------------------------------ - 0_CONSTANT 0.3964290 0.2481634 1.5974512 0.1103544 - 0_PS90 0.6558330 0.0966268 6.7872800 0.0000000 - 0_UE90 0.4870394 0.0362863 13.4221336 0.0000000 + 0_CONSTANT 0.39643 0.24816 1.59745 0.11035 + 0_PS90 0.65583 0.09663 6.78728 0.00000 + 0_UE90 0.48704 0.03629 13.42213 0.00000 ------------------------------------------------------------------------------------ Regimes variable: SOUTH @@ -377,50 +369,52 @@ class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): ------------------------------------------------------------------------------------ Variable Coefficient Std.Error t-Statistic Probability ------------------------------------------------------------------------------------ - 1_CONSTANT 5.5983500 0.4689456 11.9381640 0.0000000 - 1_PS90 1.1621045 0.2166740 5.3633790 0.0000001 - 1_UE90 0.5316389 0.0594565 8.9416422 0.0000000 + 1_CONSTANT 5.59835 0.46895 11.93816 0.00000 + 1_PS90 1.16210 0.21667 5.36338 0.00000 + 1_UE90 0.53164 0.05946 8.94164 0.00000 ------------------------------------------------------------------------------------ Regimes variable: SOUTH ------------------------------------------------------------------------------------ + GLOBAL DIAGNOSTICS REGIMES DIAGNOSTICS - CHOW TEST VARIABLE DF VALUE PROB - CONSTANT 1 96.129 0.0000 - PS90 1 4.554 0.0328 - UE90 1 0.410 0.5220 - Global test 3 680.960 0.0000 + CONSTANT 1 96.129 0.0000 + PS90 1 4.554 0.0328 + UE90 1 0.410 0.5220 + Global test 3 680.960 0.0000 ================================ END OF REPORT ===================================== """ def __init__( - self, - y, - x, - regimes, - w=None, - robust=None, - gwk=None, - slx_lags=0, - sig2n_k=True, - nonspat_diag=True, - spat_diag=False, - moran=False, - white_test=False, - vm=False, - constant_regi="many", - cols2regi="all", - regime_err_sep=True, - cores=False, - name_y=None, - name_x=None, - name_regimes=None, - name_w=None, - name_gwk=None, - name_ds=None, - latex=False, + self, + y, + x, + regimes, + w=None, + robust=None, + gwk=None, + slx_lags=0, + sig2n_k=True, + nonspat_diag=True, + spat_diag=False, + moran=False, + white_test=False, + vm=False, + constant_regi="many", + cols2regi="all", + regime_err_sep=True, + cores=False, + name_y=None, + name_x=None, + name_regimes=None, + name_w=None, + name_gwk=None, + name_ds=None, + latex=False ): + n = USER.check_arrays(y, x) y, name_y = USER.check_y(y, n, name_y) USER.check_robust(robust, gwk) @@ -431,12 +425,8 @@ def __init__( "Error by regimes is not available for HAC estimation. The error by regimes has been disabled for this model.", ) regime_err_sep = False - if spat_diag: - set_warn( - self, - "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", - ) - spat_diag = False + spat_diag, warn = USER.check_spat_diag(spat_diag=spat_diag, w=w, robust=robust, slx_lags=slx_lags) + set_warn(self, warn) if robust in ["hac", "white"] and white_test: set_warn( self, @@ -447,9 +437,9 @@ def __init__( x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) name_x = USER.set_name_x(name_x, x_constant, constant=True) if spat_diag or moran: - w = USER.check_weights(w, y, slx_lags=slx_lags, w_required=True) + w = USER.check_weights(w, y, slx_lags=slx_lags, w_required=True, allow_wk=True) else: - w = USER.check_weights(w, y, slx_lags=slx_lags) + w = USER.check_weights(w, y, slx_lags=slx_lags, allow_wk=True) if slx_lags > 0: lag_x = get_lags(w, x_constant, slx_lags) x_constant = np.hstack((x_constant, lag_x)) @@ -476,9 +466,9 @@ def __init__( USER.check_regimes(self.regimes_set, self.n, x_constant.shape[1]) self.regime_err_sep = regime_err_sep if ( - regime_err_sep == True - and set(cols2regi) == set([True]) - and constant_regi == "many" + regime_err_sep == True + and set(cols2regi) == set([True]) + and constant_regi == "many" ): self.y = y regi_ids = dict( @@ -499,17 +489,18 @@ def __init__( name_x, moran, white_test, - latex, + latex ) else: x, self.name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi, cols2regi, name_x, rlist=True ) - self.output = pd.DataFrame(self.name_x, columns=["var_names"]) - self.output["var_type"] = ["x"] * len(self.name_x) - self.output["regime"] = x_rlist - self.output["equation"] = 0 + self.output = pd.DataFrame(self.name_x, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + self.output['regime'] = x_rlist + self.output['equation'] = 0 BaseOLS.__init__(self, y=y, x=x, robust=robust, gwk=gwk, sig2n_k=sig2n_k) if regime_err_sep == True and robust == None: @@ -535,37 +526,31 @@ def __init__( self.title = "ORDINARY LEAST SQUARES WITH SLX - REGIMES" self.robust = USER.set_robust(robust) self.chow = REGI.Chow(self) - self.other_top, self.other_mid, other_end = ( - "", - "", - "", - ) # strings where function-specific diag. are stored + self.other_top, self.other_mid, other_end = ("", "", "") # strings where function-specific diag. are stored if nonspat_diag: self.other_mid += _nonspat_mid(self, white_test=white_test) self.other_top += _nonspat_top(self) if spat_diag: - other_end += _spat_diag_out( - self, w, "ols", moran=moran - ) # Must decide what to do with W. + other_end += _spat_diag_out(self, w, 'ols', moran=moran) #Must decide what to do with W. output(reg=self, vm=vm, robust=robust, other_end=other_end, latex=latex) def _ols_regimes_multi( - self, - x, - w, - regi_ids, - cores, - gwk, - slx_lags, - sig2n_k, - robust, - nonspat_diag, - spat_diag, - vm, - name_x, - moran, - white_test, - latex, + self, + x, + w, + regi_ids, + cores, + gwk, + slx_lags, + sig2n_k, + robust, + nonspat_diag, + spat_diag, + vm, + name_x, + moran, + white_test, + latex ): results_p = {} """ @@ -598,7 +583,7 @@ def _ols_regimes_multi( name_x, self.name_w, self.name_regimes, - slx_lags, + slx_lags ), ) else: @@ -616,7 +601,7 @@ def _ols_regimes_multi( name_x, self.name_w, self.name_regimes, - slx_lags, + slx_lags ) ) self.kryd = 0 @@ -639,9 +624,7 @@ def _ols_regimes_multi( results = {} self.name_y, self.name_x = [], [] counter = 0 - self.output = pd.DataFrame( - columns=["var_names", "var_type", "regime", "equation"] - ) + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -655,30 +638,23 @@ def _ols_regimes_multi( results[r] = results_p[r].get() self.vm[ - (counter * self.kr) : ((counter + 1) * self.kr), - (counter * self.kr) : ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), ] = results[r].vm - self.betas[(counter * self.kr) : ((counter + 1) * self.kr),] = results[ - r - ].betas - self.u[regi_ids[r],] = results[r].u - self.predy[regi_ids[r],] = results[r].predy + self.betas[ + (counter * self.kr): ((counter + 1) * self.kr), + ] = results[r].betas + self.u[ + regi_ids[r], + ] = results[r].u + self.predy[ + regi_ids[r], + ] = results[r].predy self.name_y += results[r].name_y self.name_x += results[r].name_x - self.output = pd.concat( - [ - self.output, - pd.DataFrame( - { - "var_names": results[r].name_x, - "var_type": ["x"] * len(results[r].name_x), - "regime": r, - "equation": r, - } - ), - ], - ignore_index=True, - ) + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x, + 'var_type': ['x']*len(results[r].name_x), + 'regime': r, 'equation': r})], ignore_index=True) results[r].other_top, results[r].other_mid = ("", "") if nonspat_diag: results[r].other_mid += _nonspat_mid(results[r], white_test=white_test) @@ -692,7 +668,7 @@ def _ols_regimes_multi( other_end = "" if spat_diag: self._get_spat_diag_props(x_constant, sig2n_k) - # other_end += _spat_diag_out(self, w, 'ols', moran=moran) Need to consider W before implementing + #other_end += _spat_diag_out(self, w, 'ols', moran=moran) Need to consider W before implementing output(reg=self, vm=vm, robust=robust, other_end=other_end, latex=latex) def _get_spat_diag_props(self, x, sig2n_k): @@ -706,19 +682,7 @@ def _get_spat_diag_props(self, x, sig2n_k): def _work( - y, - x, - w, - regi_ids, - r, - robust, - sig2n_k, - name_ds, - name_y, - name_x, - name_w, - name_regimes, - slx_lags, + y, x, w, regi_ids, r, robust, sig2n_k, name_ds, name_y, name_x, name_w, name_regimes, slx_lags ): y_r = y[regi_ids[r]] x_r = x[regi_ids[r]] @@ -745,64 +709,366 @@ def _work( class OLS_Endog_Regimes(OLS_Regimes): - def __init__(self, y, x, w, n_clusters=None, quorum=-np.inf, trace=True, **kwargs): + """ + Ordinary least squares with endogenous regimes. Based on the function skater_reg as shown in :cite:`Anselin2021`. + + Parameters + ---------- + y : numpy.ndarray or pandas.Series + nx1 array for dependent variable + x : numpy.ndarray or pandas object + Two dimensional array with n rows and one column for each + independent (exogenous) variable, excluding the constant + w : pysal W object + Spatial weights object (required if running spatial + diagnostics) + n_clusters : int + Number of clusters to be used in the endogenous regimes. + If None (default), the number of clusters will be chosen + according to the function utils.optim_k using a method + adapted from Mojena (1977)'s Rule Two + quorum : int + Minimum number of observations in a cluster to be considered + Must be at least larger than the number of variables in x + Default value is 30 or 10*k, whichever is larger. + trace : boolean + Sets whether to store intermediate results of the clustering + Hard-coded to True if n_clusters is None + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_w : string + Name of weights matrix for use in output + name_ds : string + Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format + **kwargs : additional keyword arguments depending on the specific model + + Attributes + ---------- + output : dataframe + regression results pandas dataframe + summary : string + Summary of regression results and diagnostics (note: use in + conjunction with the print command) + betas : array + kx1 array of estimated coefficients + u : array + nx1 array of residuals + predy : array + nx1 array of predicted y values + n : integer + Number of observations + k : integer + Number of variables for which coefficients are estimated + (including the constant) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + y : array + nx1 array for dependent variable + x : array + Two dimensional array with n rows and one column for each + independent (exogenous) variable, including the constant + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + robust : string + Adjustment for robust standard errors + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + mean_y : float + Mean of dependent variable + std_y : float + Standard deviation of dependent variable + vm : array + Variance covariance matrix (kxk) + r2 : float + R squared + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + ar2 : float + Adjusted R squared + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + utu : float + Sum of squared residuals + sig2 : float + Sigma squared used in computations + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + sig2ML : float + Sigma squared (maximum likelihood) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + f_stat : tuple + Statistic (float), p-value (float) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + logll : float + Log likelihood + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + aic : float + Akaike information criterion + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + schwarz : float + Schwarz information criterion + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + std_err : array + 1xk array of standard errors of the betas + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + t_stat : list of tuples + t statistic; each tuple contains the pair (statistic, + p-value), where each is a float + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + mulColli : float + Multicollinearity condition number + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + jarque_bera : dictionary + 'jb': Jarque-Bera statistic (float); 'pvalue': p-value + (float); 'df': degrees of freedom (int) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + breusch_pagan : dictionary + 'bp': Breusch-Pagan statistic (float); 'pvalue': p-value + (float); 'df': degrees of freedom (int) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + koenker_bassett: dictionary + 'kb': Koenker-Bassett statistic (float); 'pvalue': p-value (float); + 'df': degrees of freedom (int). Only available in dictionary + 'multi' when multiple regressions (see 'multi' below for details). + white : dictionary + 'wh': White statistic (float); 'pvalue': p-value (float); + 'df': degrees of freedom (int). + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + lm_error : tuple + Lagrange multiplier test for spatial error model; tuple + contains the pair (statistic, p-value), where each is a + float. Only available in dictionary 'multi' when multiple + regressions (see 'multi' below for details) + lm_lag : tuple + Lagrange multiplier test for spatial lag model; tuple + contains the pair (statistic, p-value), where each is a + float. Only available in dictionary 'multi' when multiple + regressions (see 'multi' below for details) + rlm_error : tuple + Robust lagrange multiplier test for spatial error model; + tuple contains the pair (statistic, p-value), where each + is a float. Only available in dictionary 'multi' when multiple + regressions (see 'multi' below for details) + rlm_lag : tuple + Robust lagrange multiplier test for spatial lag model; + tuple contains the pair (statistic, p-value), where each + is a float. Only available in dictionary 'multi' when + multiple regressions (see 'multi' below for details) + lm_sarma : tuple + Lagrange multiplier test for spatial SARMA model; tuple + contains the pair (statistic, p-value), where each is a + float. Only available in dictionary 'multi' when multiple + regressions (see 'multi' below for details) + moran_res : tuple + Moran's I for the residuals; tuple containing the triple + (Moran's I, standardized Moran's I, p-value) + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_w : string + Name of weights matrix for use in output + name_gwk : string + Name of kernel weights matrix for use in output + name_ds : string + Name of dataset for use in output + name_regimes : string + Name of regime variable for use in the output + title : string + Name of the regression method used. + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + sig2n : float + Sigma squared (computed with n in the denominator) + sig2n_k : float + Sigma squared (computed with n-k in the denominator) + xtx : float + :math:`X'X`. Only available in dictionary 'multi' when multiple + regressions (see 'multi' below for details) + xtxi : float + :math:`(X'X)^{-1}`. Only available in dictionary 'multi' when multiple + regressions (see 'multi' below for details) + regimes : list + List of n values with the mapping of each observation to + a regime. Assumed to be aligned with 'x'. + constant_regi : string + Ignored if regimes=False. Constant option for regimes. + Switcher controlling the constant term setup. It may take + the following values: + + * 'one': a vector of ones is appended to x and held constant across regimes. + + * 'many': a vector of ones is appended to x and considered different per regime. + cols2regi : list + Ignored if regimes=False. Argument indicating whether each + column of x should be considered as different per regime + or held constant across regimes (False). + If a list, k booleans indicating for each variable the + option (True if one per regime, False to be held constant). + If 'all', all the variables vary by regime. + regime_err_sep: boolean + If True, a separate regression is run for each regime. + kr : int + Number of variables/columns to be "regimized" or subject + to change by regime. These will result in one parameter + estimate by regime for each variable (i.e. nr parameters per + variable) + kf : int + Number of variables/columns to be considered fixed or + global across regimes and hence only obtain one parameter + estimate. + nr : int + Number of different regimes in the 'regimes' list. + multi : dictionary + Only available when multiple regressions are estimated, + i.e. when regime_err_sep=True and no variable is fixed + across regimes. + Contains all attributes of each individual regression. + SSR : list + list with the total sum of squared residuals for the model + considering all regimes for each of steps of number of regimes + considered, starting with the solution with 2 regimes. + clusters : int + Number of clusters considered in the endogenous regimes + _trace : list + List of dictionaries with the clustering results for each + number of clusters tested. Only available if n_clusters is + None or trace=True. + + Examples + -------- + >>> import libpysal + >>> import numpy as np + >>> np.set_printoptions(legacy='1.25') #to avoid printing issues with numpy floats + >>> import geopandas as gpd + >>> from spreg import OLS_Endog_Regimes + + Open data on Baltimore house sales price and characteristics in Baltimore + from libpysal examples using geopandas. + + >>> db = gpd.read_file(libpysal.examples.get_path('baltim.shp')) + + We will create a weights matrix based on contiguity. + + >>> w = libpysal.weights.Queen.from_dataframe(db, use_index=True) + >>> w.transform = "r" + + For this example, we will use the 'PRICE' column as the dependent variable and + the 'NROOM', 'AGE', and 'SQFT' columns as independent variables. + At this point, we will let the model choose the number of clusters. + + >>> olsr = OLS_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w, name_w="baltim_q.gal") + + The function `print(olsr.summary)` can be used to visualize the results of the regression. + + Alternatively, we can check individual attributes: + >>> olsr.betas + array([[26.24209866], + [ 2.40329959], + [-0.24183707], + [ 0.45714794], + [19.84817747], + [ 5.12117483], + [-0.65466516], + [ 1.10034154]]) + >>> olsr.SSR + [68840.74965798721, 62741.55717492997] + >>> olsr.clusters + array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int32) + + We will now set the number of clusters to 2 and run the regression again. + + >>> olsr = OLS_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w, n_clusters=2, name_w="baltim_q.gal") + + The function `print(olsr.summary)` can be used to visualize the results of the regression. + + Alternatively, we can check individual attributes as before: + >>> olsr.betas + array([[26.24209866], + [ 2.40329959], + [-0.24183707], + [ 0.45714794], + [19.84817747], + [ 5.12117483], + [-0.65466516], + [ 1.10034154]]) + >>> olsr.SSR + [68840.74965798721] + >>> olsr.clusters + array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int32) + + """ + + def __init__( + self, y, x, w, n_clusters=None, quorum=-1, trace=True, name_y=None, name_x=None, **kwargs): + n = USER.check_arrays(y, x) y, name_y = USER.check_y(y, n, name_y) w = USER.check_weights(w, y, w_required=True) - + x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) + set_warn(self, warn) # Standardize the variables - x_std = (x - np.mean(x, axis=0)) / np.std(x, axis=0) + x_std = (x_constant - np.mean(x_constant, axis=0)) / np.std(x_constant, axis=0) + + if quorum < 0: + quorum = np.max([(x_constant.shape[1]+1)*10, 30]) if not n_clusters: - if quorum < 0: - quorum = np.max([(x.shape[1] + 1) * 10, 30]) - n_clusters_opt = x.shape[0] * 0.70 // quorum + n_clusters_opt = x_constant.shape[0]*0.70//quorum if n_clusters_opt < 2: raise ValueError( - "The combination of the values of `N` and `quorum` is not compatible with regimes estimation." - ) - sk_reg_results = Skater_reg().fit( - n_clusters_opt, - w, - x_std, - {"reg": BaseOLS, "y": y, "x": x}, - quorum=quorum, - trace=True, - ) - n_clusters = optim_k( - [ - sk_reg_results._trace[i][1][2] - for i in range(1, len(sk_reg_results._trace)) - ] - ) - self.clusters = sk_reg_results._trace[n_clusters - 1][0] + "The combination of the values of `N` and `quorum` is not compatible with regimes estimation.") + sk_reg_results = Skater_reg().fit(n_clusters_opt, w, x_std, {'reg':BaseOLS,'y':y,'x':x_constant}, quorum=quorum, trace=True) + n_clusters = optim_k([sk_reg_results._trace[i][1][2] for i in range(1, len(sk_reg_results._trace))]) + self.clusters = sk_reg_results._trace[n_clusters-1][0] else: try: # Call the Skater_reg method based on OLS - sk_reg_results = Skater_reg().fit( - n_clusters, - w, - x_std, - {"reg": BaseOLS, "y": y, "x": x}, - quorum=quorum, - trace=trace, - ) + sk_reg_results = Skater_reg().fit(n_clusters, w, x_std, {'reg':BaseOLS,'y':y,'x':x_constant}, quorum=quorum, trace=trace) self.clusters = sk_reg_results.current_labels_ except Exception as e: if str(e) == "one or more input arrays have more columns than rows": - raise ValueError( - "One or more input ended up with more variables than observations. Please check your setting for `quorum`." - ) + raise ValueError("One or more input ended up with more variables than observations. Please check your setting for `quorum`.") else: print("An error occurred:", e) self._trace = sk_reg_results._trace self.SSR = [self._trace[i][1][2] for i in range(1, len(self._trace))] - OLS_Regimes.__init__( - self, y, x, regimes=self.clusters, w=w, name_regimes="Skater_reg", **kwargs - ) - + OLS_Regimes.__init__(self, y, x_constant, regimes=self.clusters, w=w, name_regimes='Skater_reg', name_y=name_y, name_x=name_x, **kwargs) def _test(): import doctest @@ -821,15 +1087,15 @@ def _test(): db = libpysal.io.open(libpysal.examples.get_path("NAT.dbf"), "r") y_var = "HR90" - y = np.array(db.by_col(y_var)).reshape(-1, 1) - x_var = ["PS90", "UE90"] + y = np.array(db.by_col(y_var)).reshape(-1,1) + x_var = ['PS90','UE90'] x = np.array([db.by_col(name) for name in x_var]).T r_var = "SOUTH" regimes = db.by_col(r_var) w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("NAT.shp")) w.transform = "r" olsr = OLS_Regimes( - y, + y, x, regimes, w=w, @@ -844,7 +1110,7 @@ def _test(): cols2regi=[True, True], sig2n_k=False, white_test=True, - # robust="white" + #robust="white" ) print(olsr.output) print(olsr.summary) diff --git a/spreg/output.py b/spreg/output.py index 8c4e4b41..4683da5b 100755 --- a/spreg/output.py +++ b/spreg/output.py @@ -8,7 +8,7 @@ from . import diagnostics as diagnostics from . import diagnostics_tsls as diagnostics_tsls from . import diagnostics_sp as diagnostics_sp -from .sputils import _sp_effects, _spmultiplier +from .sputils import _sp_effects, spmultiplier __all__ = [] @@ -16,10 +16,9 @@ ############### Primary functions for running summary diagnostics ############# ############################################################################### - def output(reg, vm, other_end=False, robust=False, latex=False): strSummary = output_start(reg) - for eq in reg.output["equation"].unique(): + for eq in reg.output['equation'].unique(): try: reg.multi[eq].__summary = {} strSummary, reg.multi[eq] = out_part_top(strSummary, reg, eq) @@ -29,32 +28,24 @@ def output(reg, vm, other_end=False, robust=False, latex=False): strSummary, reg = out_part_middle(strSummary, reg, robust, m=eq, latex=latex) strSummary, reg = out_part_end(strSummary, reg, vm, other_end, m=eq) reg.summary = strSummary - reg.output.sort_values(by=["equation", "regime"], inplace=True) - reg.output.drop(["var_type", "regime", "equation"], axis=1, inplace=True) - + reg.output.sort_values(by=['equation', 'regime'], inplace=True) + reg.output.drop(['var_type', 'regime', 'equation'], axis=1, inplace=True) def output_start(reg): reg.__summary = {} strSummary = "REGRESSION RESULTS\n" strSummary += "------------------\n" - reg.output = reg.output.assign( - coefficients=[None] * len(reg.output), - std_err=[None] * len(reg.output), - zt_stat=[None] * len(reg.output), - prob=[None] * len(reg.output), - ) + reg.output = reg.output.assign(coefficients=[None] * len(reg.output), std_err=[None] * len(reg.output), + zt_stat=[None] * len(reg.output), prob=[None] * len(reg.output)) return strSummary - def out_part_top(strSummary, reg, m): # Top part of summary output. # m = None for single models, m = 1,2,3... for multiple equation models if m == None: _reg = reg # _reg = local object with regression results else: - _reg = reg.multi[ - m - ] # _reg = local object with equation specific regression results + _reg = reg.multi[m] # _reg = local object with equation specific regression results title = "\nSUMMARY OF OUTPUT: " + _reg.title + "\n" strSummary += title strSummary += "-" * (len(title) - 2) + "\n" @@ -85,7 +76,7 @@ def out_part_top(strSummary, reg, m): ) _reg.std_err = diagnostics.se_betas(_reg) - if "OLS" in reg.__class__.__name__: + if 'OLS' in reg.__class__.__name__: _reg.t_stat = diagnostics.t_stat(_reg) _reg.r2 = diagnostics.r2(_reg) _reg.ar2 = diagnostics.ar2(_reg) @@ -99,8 +90,9 @@ def out_part_top(strSummary, reg, m): else: _reg.z_stat = diagnostics.t_stat(_reg, z_stat=True) - _reg.pr2 = diagnostics_tsls.pr2_aspatial(_reg) - strSummary += "%-20s:%12.4f\n" % ("Pseudo R-squared", _reg.pr2) + if not 'NSLX' in reg.__class__.__name__: + _reg.pr2 = diagnostics_tsls.pr2_aspatial(_reg) + strSummary += "%-20s:%12.4f\n" % ("Pseudo R-squared", _reg.pr2) _reg.__summary["summary_zt"] = "z" try: # Adding additional top part if there is one @@ -110,28 +102,21 @@ def out_part_top(strSummary, reg, m): return (strSummary, _reg) - def out_part_middle(strSummary, reg, robust, m=None, latex=False): # Middle part of summary output. # m = None for single models, m = 1,2,3... for multiple equation models - if m == None: - _reg = reg # _reg = local object with regression results - m = reg.output["equation"].unique()[0] + if m==None: + _reg = reg #_reg = local object with regression results + m = reg.output['equation'].unique()[0] else: - _reg = reg.multi[ - m - ] # _reg = local object with equation specific regression results - coefs = pd.DataFrame(_reg.betas, columns=["coefficients"]) - coefs["std_err"] = pd.DataFrame(_reg.std_err) + _reg = reg.multi[m] #_reg = local object with equation specific regression results + coefs = pd.DataFrame(_reg.betas, columns=['coefficients']) + coefs['std_err'] = pd.DataFrame(_reg.std_err) try: - coefs = pd.concat( - [coefs, pd.DataFrame(_reg.z_stat, columns=["zt_stat", "prob"])], axis=1 - ) + coefs = pd.concat([coefs, pd.DataFrame(_reg.z_stat, columns=['zt_stat', 'prob'])], axis=1) except AttributeError: - coefs = pd.concat( - [coefs, pd.DataFrame(_reg.t_stat, columns=["zt_stat", "prob"])], axis=1 - ) - coefs.index = reg.output[reg.output["equation"] == m].index + coefs = pd.concat([coefs, pd.DataFrame(_reg.t_stat, columns=['zt_stat', 'prob'])], axis=1) + coefs.index = reg.output[reg.output['equation'] == m].index reg.output.update(coefs) strSummary += "\n" if robust: @@ -140,54 +125,41 @@ def out_part_middle(strSummary, reg, robust, m=None, latex=False): elif robust == "hac": strSummary += "HAC Standard Errors; Kernel Weights: " + _reg.name_gwk + "\n" elif robust == "ogmm": - strSummary += "Optimal GMM used to estimate the coefficients and the variance-covariance matrix\n" + strSummary += "Optimal GMM used to estimate the coefficients and the variance-covariance matrix\n" strSummary += "------------------------------------------------------------------------------------\n" - - m_output = reg.output[reg.output["equation"] == m] + + m_output = reg.output[reg.output['equation'] == m] if latex: - df_1 = m_output.iloc[np.lexsort((m_output.index, m_output["regime"]))] - df_2 = df_1.loc[:, ["var_names", "coefficients", "std_err", "zt_stat", "prob"]] - df_2 = df_2.set_axis( - [ - "Variable", - "Coefficient", - "Std.Error", - _reg.__summary["summary_zt"] + "-Statistic", - "Prob.", - ], - axis="columns", - copy=False, - ) - cols = df_2.columns.difference(["Variable"]) + df_1 = m_output.iloc[np.lexsort((m_output.index, m_output['regime']))] + df_2 = df_1.loc[:, ['var_names', 'coefficients', 'std_err', 'zt_stat', 'prob']] + df_2 = df_2.set_axis(['Variable', 'Coefficient', 'Std.Error', _reg.__summary['summary_zt']+'-Statistic', 'Prob.'], axis='columns', copy=False) + cols = df_2.columns.difference(['Variable']) df_2[cols] = df_2[cols].astype(float).map(lambda x: "%12.5f" % x) - df_2["Variable"] = ( - df_2["Variable"].str.replace("_", "\_").str.replace("%", "\%") - ) - df_inlatex = df_2.style.hide(axis="index").to_latex(hrules=True) + df_2['Variable'] = df_2['Variable'].str.replace("_", r"\_").str.replace("%", r"\%") + df_inlatex = df_2.style.hide(axis='index').to_latex(hrules=True) strSummary += df_inlatex strSummary += "------------------------------------------------------------------------------------\n" - else: + else: strSummary += ( - " Variable Coefficient Std.Error %1s-Statistic Probability\n" - % (_reg.__summary["summary_zt"]) + " Variable Coefficient Std.Error %1s-Statistic Probability\n" + % (_reg.__summary["summary_zt"]) ) strSummary += "------------------------------------------------------------------------------------\n" - for row in m_output.iloc[ - np.lexsort((m_output.index, m_output["regime"])) - ].itertuples(): + for row in m_output.iloc[np.lexsort((m_output.index, m_output['regime']))].itertuples(): try: strSummary += "%20s %12.5f %12.5f %12.5f %12.5f\n" % ( row.var_names, row.coefficients, row.std_err, row.zt_stat, - row.prob, + row.prob + ) + except TypeError: # special case for models that do not have inference on the lambda term + strSummary += "%20s %12.5f \n" % ( + row.var_names, + row.coefficients ) - except ( - TypeError - ): # special case for models that do not have inference on the lambda term - strSummary += "%20s %12.5f \n" % (row.var_names, row.coefficients) strSummary += "------------------------------------------------------------------------------------\n" try: # Adding info on instruments if they are present @@ -211,10 +183,8 @@ def out_part_middle(strSummary, reg, robust, m=None, latex=False): pass try: # Adding info on regimes if they are present - strSummary += "Regimes variable: %s\n" % _reg.name_regimes - strSummary += _summary_chow( - _reg - ) # If local regimes present, compute Chow test. + strSummary += ("Regimes variable: %s\n" % _reg.name_regimes) + strSummary += _summary_chow(_reg) # If local regimes present, compute Chow test. except: pass @@ -231,7 +201,6 @@ def out_part_middle(strSummary, reg, robust, m=None, latex=False): return (strSummary, reg) - def out_part_end(strSummary, reg, vm, other_end, m=None): if m is not None: strSummary += "------------------------------------------------------------------------------------\n" @@ -251,7 +220,6 @@ def out_part_end(strSummary, reg, vm, other_end, m=None): strSummary += "================================ END OF REPORT =====================================" return (strSummary, reg) - def _summary_chow(reg): sum_text = "\nREGIMES DIAGNOSTICS - CHOW TEST" name_x_r = reg.name_x_r @@ -264,38 +232,23 @@ def _summary_chow(reg): if reg.constant_regi == "many": names_chow = ["CONSTANT"] + names_chow - if "lambda" in reg.output.var_type.values: - if reg.output.var_type.value_counts()["lambda"] > 1: + if 'lambda' in reg.output.var_type.values: + if reg.output.var_type.value_counts()['lambda'] > 1: names_chow += ["lambda"] reg.output_chow = pd.DataFrame() - reg.output_chow["var_names"] = names_chow - reg.output_chow["df"] = reg.nr - 1 - reg.output_chow = pd.concat( - [reg.output_chow, pd.DataFrame(regi, columns=["value", "prob"])], axis=1 - ) - reg.output_chow = pd.concat( - [ - reg.output_chow, - pd.DataFrame( - [ - { - "var_names": "Global test", - "df": reg.kr * (reg.nr - 1), - "value": joint[0], - "prob": joint[1], - } - ] - ), - ], - ignore_index=True, - ) + reg.output_chow['var_names'] = names_chow + reg.output_chow['df'] = reg.nr - 1 + reg.output_chow = pd.concat([reg.output_chow, pd.DataFrame(regi, columns=['value', 'prob'])], axis=1) + reg.output_chow = pd.concat([reg.output_chow, pd.DataFrame([{'var_names': 'Global test', + 'df': reg.kr * (reg.nr - 1), + 'value': joint[0], 'prob': joint[1]}])], ignore_index=True) for row in reg.output_chow.itertuples(): sum_text += "%20s %2d %12.3f %9.4f\n" % ( row.var_names, row.df, row.value, - row.prob, + row.prob ) return sum_text @@ -306,8 +259,7 @@ def _spat_diag_out(reg, w, type, moran=False, ml=False): cache = diagnostics_sp.spDcache(reg, w) if type == "yend": strSummary += ( - "TEST DF VALUE PROB\n" - ) + "TEST DF VALUE PROB\n") if not ml: mi, ak, ak_p = diagnostics_sp.akTest(reg, w, cache) reg.ak_test = ak, ak_p @@ -317,33 +269,19 @@ def _spat_diag_out(reg, w, type, moran=False, ml=False): reg.ak_test[0], reg.ak_test[1], ) - if any(reg.output["var_type"] == "rho"): + if any(reg.output['var_type'] == 'rho'): # no common factor test if slx_vars is not "All" - if reg.slx_lags == 1 and not any(reg.output["var_type"] == "yend"): - if not hasattr(reg, "slx_vars") or not isinstance(reg.slx_vars, list): - wx_indices = reg.output[ - (reg.output["var_type"] == "wx") - & (reg.output["regime"] != "_Global") - ].index + if reg.slx_lags == 1 and not any(reg.output['var_type'] == 'yend'): + if not hasattr(reg, 'slx_vars') or not isinstance(reg.slx_vars, list): + wx_indices = reg.output[(reg.output['var_type'] == 'wx') & (reg.output['regime'] != '_Global')].index x_indices = [] - for m in reg.output["regime"].unique(): - x_indices.extend( - reg.output[ - (reg.output["regime"] == m) - & (reg.output["var_type"] == "x") - ].index[1:] - ) - vm_indices = ( - x_indices - + wx_indices.tolist() - + reg.output[reg.output["var_type"] == "rho"].index.tolist() - ) - cft, cft_p = diagnostics_sp.comfac_test( - reg.rho, - reg.betas[x_indices], - reg.betas[wx_indices], - reg.vm[vm_indices, :][:, vm_indices], - ) + for m in reg.output['regime'].unique(): + x_indices.extend(reg.output[(reg.output['regime'] == m) & (reg.output['var_type'] == 'x')].index[1:]) + vm_indices = x_indices + wx_indices.tolist() + reg.output[reg.output['var_type'] == 'rho'].index.tolist() + cft, cft_p = diagnostics_sp.comfac_test(reg.rho, + reg.betas[x_indices], + reg.betas[wx_indices], + reg.vm[vm_indices, :][:, vm_indices]) reg.cfh_test = cft, cft_p strSummary += "%-27s %2d %12.3f %9.4f\n" % ( "Common Factor Hypothesis Test", @@ -362,21 +300,15 @@ def _spat_diag_out(reg, w, type, moran=False, ml=False): strSummary += ( "TEST MI/DF VALUE PROB\n" ) - lm_tests = diagnostics_sp.LMtests( - reg, w, tests=["lme", "lml", "rlme", "rlml", "sarma"] - ) + lm_tests = diagnostics_sp.LMtests(reg, w, tests=["lme", "lml", "rlme", "rlml", "sarma"]) if reg.slx_lags == 0: try: - lm_tests2 = diagnostics_sp.LMtests( - reg, - w, - tests=["lmwx", "lmspdurbin", "rlmdurlag", "rlmwx", "lmslxerr"], - ) + lm_tests2 = diagnostics_sp.LMtests(reg, w, tests=["lmwx", "lmspdurbin", "rlmdurlag", "rlmwx","lmslxerr"]) reg.lm_wx = lm_tests2.lmwx reg.lm_spdurbin = lm_tests2.lmspdurbin reg.rlm_wx = lm_tests2.rlmwx reg.rlm_durlag = lm_tests2.rlmdurlag - reg.lm_slxerr = lm_tests2.lmslxerr # currently removed. - LA reinstated + reg.lm_slxerr = lm_tests2.lmslxerr #currently removed. - LA reinstated koley_bera = True except: koley_bera = False @@ -385,6 +317,7 @@ def _spat_diag_out(reg, w, type, moran=False, ml=False): reg.rlm_error = lm_tests.rlme reg.rlm_lag = lm_tests.rlml reg.lm_sarma = lm_tests.sarma + if moran: moran_res = diagnostics_sp.MoranRes(reg, w, z=True) @@ -426,16 +359,18 @@ def _spat_diag_out(reg, w, type, moran=False, ml=False): reg.lm_sarma[1], ) if reg.slx_lags == 0 and koley_bera: - strSummary += "\n- Spatial Durbin -\nTEST DF VALUE PROB\n" + strSummary += ( + "\n- Spatial Durbin -\nTEST DF VALUE PROB\n" + ) strSummary += "%-27s %2d %12.3f %9.4f\n" % ( "LM test for WX", - reg.k - 1, + reg.k-1, reg.lm_wx[0], reg.lm_wx[1], ) strSummary += "%-27s %2d %12.3f %9.4f\n" % ( "Robust LM WX test", - reg.k - 1, + reg.k-1, reg.rlm_wx[0], reg.rlm_wx[1], ) @@ -457,19 +392,18 @@ def _spat_diag_out(reg, w, type, moran=False, ml=False): reg.lm_spdurbin[0], reg.lm_spdurbin[1], ) - # strSummary += ( + #strSummary += ( # "\n- Spatial Error and WX -\nTEST DF VALUE PROB\n" - # ) - # strSummary += "%-27s %2d %12.3f %9.4f\n\n" % ( + #) + #strSummary += "%-27s %2d %12.3f %9.4f\n\n" % ( # "Joint test for Error and WX", # reg.k, # reg.lm_slxerr[0], # reg.lm_slxerr[1], - # ) + #) return strSummary - def _nonspat_top(reg, ml=False): if not ml: reg.sig2ML = reg.sig2n @@ -479,11 +413,7 @@ def _nonspat_top(reg, ml=False): reg.schwarz = diagnostics.schwarz(reg) strSummary = "%-20s:%12.6g %-22s:%12.4f\n" % ( - "Sum squared residual", - reg.utu, - "F-statistic", - reg.f_stat[0], - ) + "Sum squared residual", reg.utu, "F-statistic", reg.f_stat[0],) strSummary += "%-20s:%12.3f %-22s:%12.4g\n" % ( "Sigma-square", reg.sig2, @@ -528,7 +458,6 @@ def _nonspat_top(reg, ml=False): return strSummary - def _nonspat_mid(reg, white_test=False): # compute diagnostics reg.mulColli = diagnostics.condition_index(reg) @@ -584,16 +513,14 @@ def _nonspat_mid(reg, white_test=False): pass return strSummary - def _spat_pseudo_r2(reg): if np.abs(reg.rho) < 1: reg.pr2_e = diagnostics_tsls.pr2_spatial(reg) strSummary = "%-20s: %5.4f\n" % ("Spatial Pseudo R-squared", reg.pr2_e) else: - strSummary = "Spatial Pseudo R-squared: omitted due to rho outside the boundary (-1, 1).\n" + strSummary = "Spatial Pseudo R-squared: omitted due to rho outside the boundary (-1, 1).\n" return strSummary - def _summary_vm(reg): strVM = "\n" strVM += "COEFFICIENTS VARIANCE MATRIX\n" @@ -613,7 +540,6 @@ def _summary_vm(reg): strVM += "\n" return strVM - def _summary_iteration(reg): """Reports the number of iterations computed and the type of estimator used for hom and het models.""" try: @@ -633,7 +559,7 @@ def _summary_iteration(reg): step1c, ) except: - pass + pass try: txt = txt[:-1] + " %-22s:%12s" % ( @@ -645,11 +571,10 @@ def _summary_iteration(reg): return txt - -def _summary_impacts(reg, w, spat_impacts, slx_lags=0, slx_vars="All", regimes=False): +def _summary_impacts(reg, w, spat_impacts, slx_lags=0, slx_vars="All",regimes=False): """ Spatial direct, indirect and total effects in spatial lag model. - Uses multipliers computed by sputils._spmultipliers. + Uses multipliers computed by sputils.spmultipliers. Attributes ---------- @@ -672,16 +597,14 @@ def _summary_impacts(reg, w, spat_impacts, slx_lags=0, slx_vars="All", regimes=F except AttributeError: spat_impacts = [x.lower() for x in spat_impacts] - # variables = reg.output.query("var_type in ['x', 'yend'] and index != 0") # excludes constant - variables = reg.output.query( - "var_type == 'x' and index != 0" - ) # excludes constant and endogenous variables + #variables = reg.output.query("var_type in ['x', 'yend'] and index != 0") # excludes constant + variables = reg.output.query("var_type == 'x' and index != 0") # excludes constant and endogenous variables if regimes: - variables = variables[~variables["var_names"].str.endswith("_CONSTANT")] + variables = variables[~variables['var_names'].str.endswith('_CONSTANT')] variables_index = variables.index - if slx_lags == 0: + if slx_lags==0: strSummary = "\nSPATIAL LAG MODEL IMPACTS\n" else: strSummary = "\nSPATIAL DURBIN MODEL IMPACTS\n" @@ -695,35 +618,20 @@ def _summary_impacts(reg, w, spat_impacts, slx_lags=0, slx_vars="All", regimes=F sp_multipliers = {} for i in spat_impacts: - spmult = _spmultiplier( - w, reg.rho, method=i - ) # computes the multipliers, slx_lags not needed - + spmult = spmultiplier(w, reg.rho, method=i) # computes the multipliers, slx_lags not needed + strSummary += spmult["warn"] - btot, bdir, bind = _sp_effects( - reg, variables, spmult, slx_lags, slx_vars - ) # computes the impacts, needs slx_lags - sp_multipliers[spmult["method"]] = ( - spmult["adi"], - spmult["aii"].item(), - spmult["ati"].item(), - ) + btot, bdir, bind = _sp_effects(reg, variables, spmult, slx_lags,slx_vars) # computes the impacts, needs slx_lags + sp_multipliers[spmult["method"]] = spmult['adi'], spmult['aii'].item(), spmult['ati'].item() strSummary += "Impacts computed using the '" + spmult["method"] + "' method.\n" - strSummary += ( - " Variable Direct Indirect Total\n" - ) + strSummary += " Variable Direct Indirect Total\n" for i in range(len(variables)): strSummary += "%20s %12.4f %12.4f %12.4f\n" % ( - variables["var_names"][variables_index[i]], - bdir[i][0], - bind[i][0], - btot[i][0], - ) + variables['var_names'][variables_index[i]], bdir[i][0], bind[i][0], btot[i][0]) return sp_multipliers, strSummary - def _summary_vif(reg): """ Summary of variance inflation factors for the model. @@ -740,16 +648,12 @@ def _summary_vif(reg): vif = diagnostics.vif(reg) strSummary = "\nVARIANCE INFLATION FACTOR\n" strSummary += " Variable VIF Tolerance\n" - for i in range(len(reg.name_x) - 1): + for i in range(len(reg.name_x)-1): i += 1 strSummary += "%20s %12.4f %12.4f\n" % ( - reg.name_x[i], - vif[i][0], - vif[i][1], - ) + reg.name_x[i], vif[i][0], vif[i][1]) return strSummary - def _summary_dwh(reg): """ Summary of Durbin-Wu-Hausman test on endogeneity of variables. @@ -764,11 +668,60 @@ def _summary_dwh(reg): """ strSummary = "\nREGRESSION DIAGNOSTICS\n" - strSummary += "TEST DF VALUE PROB\n" + strSummary += ( + "TEST DF VALUE PROB\n") strSummary += "%-27s %2d %12.3f %9.4f\n" % ( - "Durbin-Wu-Hausman test", - reg.yend.shape[1], - reg.dwh[0], - reg.dwh[1], - ) + "Durbin-Wu-Hausman test",reg.yend.shape[1],reg.dwh[0],reg.dwh[1]) return strSummary + +def _nslx_out(reg, section): + """ + Summary of the NSLX model. + + Parameters + ---------- + reg: spreg regression object + section: string, "top" for top part of summary, "mid" for middle part + + Returns + ------- + strSummary: string with NSLX model summary specifics + + """ + if section == "top": + strSummary = "%-20s:%12.3f %-22s:%12.6g\n" % ( + "Sigma-square", reg.sign, "Sum squared residual", reg.utu,) + strSummary += "%-20s:%12.3f %-22s:%12.3f\n" % ( + "S.E. of regression", np.sqrt(reg.sig2), "Log likelihood", + reg.ll,) + strSummary += "%-20s:%12.3f %-22s:%12.3f\n" % ( + "Schwarz criterion", reg.schwarz, "Akaike info criterion", + reg.aic,) + if len(''.join(reg.name_coords)) <= 12: + strSummary += "%-20s:%12s %-22s:%12s\n" % ( + "Coordinates", reg.name_coords, "Distance metric", reg.distance_metric,) + else: + max_spaces = 27 + if isinstance(reg.name_coords, list): + name_coords = ', '.join(reg.name_coords) if len(reg.name_coords) > 1 else reg.name_coords[0] + else: + name_coords = reg.name_coords # It's already a string + if len(name_coords) > max_spaces: + name_coords = name_coords[:max_spaces-4] + "..." + space_between = (max_spaces - len(name_coords)) * ' ' + strSummary += "%-20s: %12s%s%-22s:%12s\n" % ("Coordinates", name_coords, space_between, "Distance metric", reg.distance_metric) + + if section == "mid": + strSummary = "" + if len(reg.transform) == 1: + strSummary += "Transformation: " + reg.transform[0] + "\n" + strSummary += "KNN: " + str(reg.knn[0]) + "\n" + strSummary += "Distance upper bound: " + str(reg.d_upper_bound[0]) + "\n" + else: + param = [reg.transform, reg.knn, reg.d_upper_bound] + text = ["Transformation: ", "KNN: ", "Distance upper bound: "] + for i in range(3): + param_i = ', '.join(map(str, param[i])) + text_wrapper = TW.TextWrapper(width=76, subsequent_indent=" ") + strSummary += text[i] + text_wrapper.fill(param_i) + "\n" + return strSummary \ No newline at end of file diff --git a/spreg/skater_reg.py b/spreg/skater_reg.py index 03ea6f31..2b203fbf 100755 --- a/spreg/skater_reg.py +++ b/spreg/skater_reg.py @@ -308,8 +308,7 @@ def fit( print("cut made {}...".format(best_deletion)) if best_deletion.score > prev_score: raise ValueError( - ( - "The score increased with the number of clusters. " + ("The score increased with the number of clusters. " "Please check your data.\nquorum: {}; n_clusters: {}" ).format(quorum, n_clusters) ) @@ -378,10 +377,7 @@ def score_spreg( } trees_scores = {} - if ( - data_reg["reg"].__name__ == "GM_Lag" - or data_reg["reg"].__name__ == "BaseGM_Lag" - ): + if data_reg["reg"].__name__ == "GM_Lag" or data_reg["reg"].__name__ == "BaseGM_Lag": try: x = np.hstack((np.ones((data_reg["x"].shape[0], 1)), data_reg["x"])) reg = TSLS_Regimes( @@ -389,8 +385,7 @@ def score_spreg( x=x, yend=data_reg["yend"], q=data_reg["q"], - regimes=all_labels, - ) + regimes=all_labels,) except: x = _const_x(data_reg["x"]) reg = TSLS_Regimes( @@ -398,10 +393,10 @@ def score_spreg( x=x, yend=data_reg["yend"], q=data_reg["q"], - regimes=all_labels, - ) + regimes=all_labels,) score = np.dot(reg.u.T, reg.u)[0][0] else: + for l in set_labels: x = data_reg["x"][all_labels == l] if np.linalg.matrix_rank(x) < x.shape[1]: @@ -428,18 +423,12 @@ def score_spreg( try: x = np.hstack((np.ones((x.shape[0], 1)), x)) reg = data_reg["reg"]( - y=data_reg["y"][all_labels == l], - x=x, - w=w_regi_i, - **kargs, + y=data_reg["y"][all_labels == l], x=x, w=w_regi_i, **kargs ) except np.linalg.LinAlgError: x = _const_x(x) reg = data_reg["reg"]( - y=data_reg["y"][all_labels == l], - x=x, - w=w_regi_i, - **kargs, + y=data_reg["y"][all_labels == l], x=x, w=w_regi_i, **kargs ) trees_scores[l] = np.dot(reg.u.T, reg.u)[0][0] score = sum(trees_scores.values()) @@ -513,7 +502,7 @@ def score_stats( data_reg["y"][all_labels == l], x, **kargs ).fit() - trees_scores[l] = np.sum(reg.resid**2) + trees_scores[l] = np.sum(reg.resid ** 2) score = sum(trees_scores.values()) else: part_scores, score, trees_scores = self._data_reg_none( @@ -535,10 +524,10 @@ def _prep_score(self, all_labels, current_tree, current_labels): return labels, subtree_quorums def _data_reg_none(self, data, all_labels, l, set_labels): - assert data.shape[0] == len(all_labels), ( - "Length of label array ({}) does not match " "length of data ({})! ".format( - all_labels.shape[0], data.shape[0] - ) + assert data.shape[0] == len( + all_labels + ), "Length of label array ({}) does not match " "length of data ({})! ".format( + all_labels.shape[0], data.shape[0] ) part_scores = [ self.reduction( @@ -556,15 +545,8 @@ def _data_reg_none(self, data, all_labels, l, set_labels): def _prep_lag(self, data_reg): # if the model is a spatial lag, add the lagged dependent variable to the model - data_reg["yend"], data_reg["q"] = set_endog( - data_reg["y"], - data_reg["x"][:, 1:], - data_reg["w"], - yend=None, - q=None, - w_lags=1, - lag_q=True, - ) + data_reg['yend'], data_reg['q'] = set_endog(data_reg["y"], data_reg["x"][:, 1:], data_reg["w"], yend=None, + q=None, w_lags=1, lag_q=True) return data_reg def find_cut( @@ -636,10 +618,7 @@ def tqdm(noop, desc=""): best_d_score = -np.inf try: - if ( - data_reg["reg"].__name__ == "GM_Lag" - or data_reg["reg"].__name__ == "BaseGM_Lag" - ): + if data_reg["reg"].__name__ == "GM_Lag" or data_reg["reg"].__name__ == "BaseGM_Lag": data_reg = self._prep_lag(data_reg) except: pass @@ -696,9 +675,9 @@ def tqdm(noop, desc=""): best_d_score = d_score try: for i in set(current_labels): - best_scores[local_labels[current_list.index(i)]] = ( - trees_scores[i] - ) + best_scores[ + local_labels[current_list.index(i)] + ] = trees_scores[i] for i in new_trees_scores: best_scores[i] = new_trees_scores[i] except: @@ -714,4 +693,4 @@ def tqdm(noop, desc=""): def _const_x(x): x = x[:, np.ptp(x, axis=0) != 0] x = np.hstack((np.ones((x.shape[0], 1)), x)) - return x + return x \ No newline at end of file diff --git a/spreg/sputils.py b/spreg/sputils.py index 124bbb94..c7be313c 100755 --- a/spreg/sputils.py +++ b/spreg/sputils.py @@ -2,8 +2,12 @@ import numpy.linalg as la import scipy.sparse as SP import pandas as pd +import geopandas as gpd from scipy.sparse import linalg as SPla from itertools import compress +import libpysal.weights as weights +from libpysal import graph +from scipy import sparse def spdot(a, b, array_out=True): @@ -273,8 +277,8 @@ def spisfinite(a): return np.isfinite(a.sum()) -def _spmultiplier(w, rho, method="simple", mtol=0.00000001): - """ " +def spmultiplier(w, rho, method="simple", mtol=0.00000001): + """" Spatial Lag Multiplier Calculation Follows Kim, Phipps and Anselin (2003) (simple), and LeSage and Pace (2009) (full, power) @@ -294,7 +298,7 @@ def _spmultiplier(w, rho, method="simple", mtol=0.00000001): pow = powers used in power approximation (otherwise 0) """ - multipliers = {"ati": 1.0, "adi": 1.0, "aii": 1.0, "method": method, "warn": ""} + multipliers = {"ati": 1.0, "adi": 1.0, "aii": 1.0, "method": method, "warn": ''} multipliers["pow"] = 0 multipliers["ati"] = 1.0 / (1.0 - rho) n = w.n @@ -303,12 +307,12 @@ def _spmultiplier(w, rho, method="simple", mtol=0.00000001): elif method == "full": wf = w.full()[0] id0 = np.identity(n) - irw0 = id0 - rho * wf + irw0 = (id0 - rho * wf) invirw0 = np.linalg.inv(irw0) adii0 = np.sum(np.diag(invirw0)) multipliers["adi"] = adii0 / n elif method == "power": - ws3 = w.to_sparse(fmt="csr") + ws3 = w.to_sparse(fmt='csr') rhop = rho ww = ws3 pow = 1 @@ -325,18 +329,15 @@ def _spmultiplier(w, rho, method="simple", mtol=0.00000001): multipliers["adi"] = adi.item() multipliers["pow"] = pow else: - multipliers["warn"] = ( - "Method '" + method + "' not supported for spatial impacts.\n" - ) - multipliers["method"] = "simple" + multipliers["warn"] = "Method '"+method+"' not supported for spatial impacts.\n" + multipliers["method"] ='simple' multipliers["aii"] = multipliers["ati"] - multipliers["adi"] - return multipliers - + return (multipliers) -def _sp_effects(reg, variables, spmult, slx_lags=0, slx_vars="All"): +def _sp_effects(reg, variables, spmult, slx_lags=0,slx_vars="All"): """ Calculate spatial lag, direct and indirect effects - + Attributes ---------- reg : regression object @@ -352,23 +353,21 @@ def _sp_effects(reg, variables, spmult, slx_lags=0, slx_vars="All"): bdir : direct effects bind : indirect effects """ - + variables_x_index = variables.index - m1 = spmult["ati"] + m1 = spmult['ati'] btot = m1 * reg.betas[variables_x_index] - m2 = spmult["adi"] + m2 = spmult['adi'] bdir = m2 * reg.betas[variables_x_index] - # Assumes all SLX effects are indirect effects. + # Assumes all SLX effects are indirect effects. if slx_lags > 0: if reg.output.regime.nunique() > 1: - btot_idx = pd.Series(btot.flatten(), index=variables_x_index) - wchunk_size = len( - variables.query("regime == @reg.output.regime.iloc[0]") - ) # Number of exogenous variables in each regime + btot_idx = pd.Series(btot.flatten(), index=variables_x_index) + wchunk_size = len(variables.query("regime == @reg.output.regime.iloc[0]")) #Number of exogenous variables in each regime for i in range(slx_lags): - chunk_indices = variables_x_index + (i + 1) * wchunk_size + chunk_indices = variables_x_index + (i+1) * wchunk_size bmult = m1 * reg.betas[chunk_indices] btot_idx[variables_x_index] += bmult.flatten() btot = btot_idx.to_numpy().reshape(btot.shape) @@ -376,14 +375,12 @@ def _sp_effects(reg, variables, spmult, slx_lags=0, slx_vars="All"): else: variables_wx = reg.output.query("var_type == 'wx'") variables_wx_index = variables_wx.index - if hasattr(reg, "slx_vars") and isinstance(slx_vars, list): - flexwx_indices = list( - compress(variables_x_index, slx_vars) - ) # indices of x variables in wx + if hasattr(reg, 'slx_vars') and isinstance(slx_vars,list): + flexwx_indices = list(compress(variables_x_index,slx_vars)) # indices of x variables in wx else: - flexwx_indices = variables_x_index # all x variables + flexwx_indices = variables_x_index # all x variables xind = [h - 1 for h in flexwx_indices] - wchunk_size = len(variables_wx_index) // slx_lags + wchunk_size = len(variables_wx_index)//slx_lags for i in range(slx_lags): start_idx = i * wchunk_size end_idx = start_idx + wchunk_size @@ -393,11 +390,101 @@ def _sp_effects(reg, variables, spmult, slx_lags=0, slx_vars="All"): bind = btot - bdir else: - m3 = spmult["aii"] + m3 = spmult['aii'] bind = m3 * reg.betas[variables_x_index] return btot, bdir, bind +def i_multipliers(w,coef=0.0,model='lag',id=None): + ''' + Creates pandas DataFrame with spatial multipliers with direct effects + (diagonal), effect of neighbors (row sum) and effect on neighbors + (column sum) for each observation + + Parameters + ---------- + w : spatial weights, either sparse CSR, PySAL weights object + or full array + coef : spatial coefficient, default is 0.0; non-zero values for + spatial autoregressive coefficient or parameter in nonlinear + SLX -- coef has no effect on slx and kernel models + model : type of spatial model, default is 'lag' for spatial lag or + spatial Durbin model; other options are "slx" (contiguity + weights without parameters), "kernel" (kernel weights), + "power" (nonlinear SLX power model) and "exponential" (nonlinear + SLX exponential model) + id : pandas Series with ID variable for each observation, default is none, + which creates a vector with sequence numbers and assigns variable name + "ID"; otherwise variable name is extracted from Series columns. + + ''' + + if sparse.issparse(w): # sparse kernel or dist fraction + n = w.shape[0] + edirect = np.zeros((n,1)) + ww = w.copy() + if model == 'power': + ww = ww.power(coef) + elif model == 'exponential': + + wdata = ww.data # uses data attribute of CSR to apply transformation + wexp = np.exp(wdata * -coef) + ww.data = wexp + else: + raise Exception("Model not supported") + + else: # pysal weights + if isinstance(w,weights.W): + n = w.n + wf = w.full()[0] + elif isinstance(w, graph.Graph): + w = w.to_W() + n = w.n + wf = w.full()[0] + elif type(w) == np.ndarray: + n = w.shape[0] + wf = w + else: + raise Exception("Weights object not supported") + + edirect = np.zeros((n,1)) + + if model == 'slx': + ww = wf + elif model == 'kernel': + edirect = np.diag(wf).copy().reshape(-1,1) + ww = wf.copy() + np.fill_diagonal(ww,0) + elif model == 'lag': + + id0 = np.identity(n) + irw0 = (id0 - coef * wf) + invirw0 = np.linalg.inv(irw0) + edirect = np.diag(invirw0).copy().reshape(-1,1) + ww = invirw0.copy() + np.fill_diagonal(ww,0) + else: + raise Exception("Model not supported") + + eofNbrs = ww.sum(axis=1).reshape(-1,1) + eonNbrs = ww.sum(axis=0).reshape(-1,1) + + if (isinstance(id,pd.core.frame.DataFrame)) or (isinstance(id,gpd.geoseries.GeoSeries)): + pdid = id # already a data frame + elif isinstance(id,np.ndarray): + pdid = pd.DataFrame(id,columns=["ID"]) + elif id == None: + id = np.arange(n,dtype='int32').reshape(-1,1) + pdid = pd.DataFrame(id,columns=["ID"]) + else: + raise Exception("ID format not supported") + + impacts = np.concatenate((edirect,eofNbrs,eonNbrs),axis=1) + dfimpacts = pd.DataFrame(impacts,columns=["Direct","EofNbrs","EonNbrs"]) + dfimpacts = pd.concat([pdid,dfimpacts],axis=1) + + return dfimpacts + def _test(): import doctest diff --git a/spreg/tests/test_nslx.py b/spreg/tests/test_nslx.py new file mode 100644 index 00000000..4789e1b4 --- /dev/null +++ b/spreg/tests/test_nslx.py @@ -0,0 +1,34 @@ +import unittest +import numpy as np +import libpysal +import spreg +import geopandas as gpd +RTOL = 1e-04 + +class TestNSLX(unittest.TestCase): + def setUp(self): + csdoh = libpysal.examples.load_example('chicagoSDOH') + dfs = gpd.read_file(csdoh.get_path('Chi-SDOH.shp')) + self.y = dfs[['HIS_ct']] + self.x = dfs[['Blk14P','Hisp14P','EP_NOHSDP']] + self.coords = dfs[["COORD_X","COORD_Y"]] + + def test_nslx_slxvars(self): + reg = spreg.NSLX(self.y, self.x, self.coords, var_flag=1, + slx_vars=[False,False,True], params=[(6,np.inf,"exponential")]) + np.testing.assert_allclose(reg.betas, + np.array([17.878828, 0.180593, 0.056209, 0.647127, 6.969201]), rtol=RTOL) + vm = np.array([[ 1.91361545e-01, -2.09518978e-03, -2.89344531e-03, 1.50324352e-04, + 0.00000000e+00], + [-2.09518978e-03, 6.58549881e-05, 9.80509736e-05, -1.50773218e-04, + 0.00000000e+00], + [-2.89344531e-03, 9.80509736e-05, 2.35720689e-04, -3.57313408e-04, + 0.00000000e+00], + [ 1.50324352e-04, -1.50773218e-04, -3.57313408e-04, 7.66414008e-04, + 0.00000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 3.41278119e-02]]) + np.testing.assert_allclose(reg.vm, vm,RTOL) + +if __name__ == '__main__': + unittest.main() diff --git a/spreg/twosls.py b/spreg/twosls.py index e2868fd8..1bed3bed 100644 --- a/spreg/twosls.py +++ b/spreg/twosls.py @@ -4,14 +4,7 @@ from . import user_output as USER from . import diagnostics as DIAG from .output import output, _spat_diag_out, _summary_dwh -from .utils import ( - spdot, - sphstack, - RegressionPropsY, - RegressionPropsVM, - set_warn, - get_lags, -) +from .utils import spdot, sphstack, RegressionPropsY, RegressionPropsVM, set_warn, get_lags import pandas as pd __author__ = "Luc Anselin lanselin@gmail.com, Pedro Amaral pedrovma@gmail.com, David C. Folch david.folch@asu.edu, Jing Yao jingyao@asu.edu" @@ -19,6 +12,7 @@ class BaseTSLS(RegressionPropsY, RegressionPropsVM): + """ Two stage least squares (2SLS) (note: no consistency checks, diagnostics or constant added) @@ -140,6 +134,7 @@ class BaseTSLS(RegressionPropsY, RegressionPropsVM): def __init__( self, y, x, yend, q=None, h=None, robust=None, gwk=None, sig2n_k=False ): + if issubclass(type(q), np.ndarray) and issubclass(type(h), np.ndarray): raise Exception("Please do not provide 'q' and 'h' together") if q is None and h is None: @@ -162,13 +157,23 @@ def __init__( # k = number of exogenous variables and endogenous variables self.k = z.shape[1] hth = spdot(h.T, h) - hthi = la.inv(hth) + + try: + hthi = la.inv(hth) + except: + raise Exception("H'H singular - no inverse") + zth = spdot(z.T, h) hty = spdot(h.T, y) factor_1 = np.dot(zth, hthi) factor_2 = np.dot(factor_1, zth.T) # this one needs to be in cache to be used in AK - varb = la.inv(factor_2) + + try: + varb = la.inv(factor_2) + except: + raise Exception("Singular matrix Z'H(H'H)^-1H'Z - endogenous variable(s) may be part of X") + factor_3 = np.dot(varb, factor_1) betas = np.dot(factor_3, hty) self.betas = betas @@ -261,7 +266,7 @@ class TSLS(BaseTSLS): If True, then use n-k to estimate sigma^2. If False, use n. spat_diag : boolean If True, then compute Anselin-Kelejian test (requires w) - nonspat_diag : boolean + nonspat_diag : boolean If True, then compute non-spatial diagnostics vm : boolean If True, include variance-covariance matrix in summary @@ -468,32 +473,22 @@ def __init__( name_ds=None, latex=False, ): + n = USER.check_arrays(y, x, yend, q) y, name_y = USER.check_y(y, n, name_y) yend, q, name_yend, name_q = USER.check_endog([yend, q], [name_yend, name_q]) USER.check_robust(robust, gwk) - if robust == "hac" and spat_diag: - set_warn( - self, - "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", - ) - spat_diag = False - + spat_diag, warn = USER.check_spat_diag(spat_diag=spat_diag, w=w, robust=robust, slx_lags=slx_lags) + set_warn(self, warn) x_constant, name_x, warn = USER.check_constant(x, name_x) self.name_x = USER.set_name_x(name_x, x_constant) - w = USER.check_weights(w, y, slx_lags=slx_lags, w_required=spat_diag) - if slx_lags > 0: - # lag_x = get_lags(w, x_constant[:, 1:], slx_lags) - # x_constant = np.hstack((x_constant, lag_x)) - # self.name_x += USER.set_name_spatial_lags(self.name_x[1:], slx_lags) - x_constant, self.name_x = USER.flex_wx( - w, - x=x_constant, - name_x=self.name_x, - constant=True, - slx_lags=slx_lags, - slx_vars=slx_vars, - ) + w = USER.check_weights(w, y, slx_lags=slx_lags, w_required=spat_diag) + if slx_lags>0: +# lag_x = get_lags(w, x_constant[:, 1:], slx_lags) +# x_constant = np.hstack((x_constant, lag_x)) +# self.name_x += USER.set_name_spatial_lags(self.name_x[1:], slx_lags) + x_constant,self.name_x = USER.flex_wx(w,x=x_constant,name_x=self.name_x,constant=True, + slx_lags=slx_lags,slx_vars=slx_vars) set_warn(self, warn) BaseTSLS.__init__( @@ -518,21 +513,20 @@ def __init__( self.robust = USER.set_robust(robust) self.name_w = USER.set_name_w(name_w, w) self.name_gwk = USER.set_name_w(name_gwk, gwk) - self.output = pd.DataFrame(self.name_x + self.name_yend, columns=["var_names"]) - self.output["var_type"] = ["x"] * len(self.name_x) + ["yend"] * len( - self.name_yend - ) - self.output["regime"], self.output["equation"] = (0, 0) + self.output = pd.DataFrame(self.name_x + self.name_yend, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * len(self.name_yend) + self.output['regime'], self.output['equation'] = (0, 0) diag_out = "" if nonspat_diag: self.dwh = DIAG.dwh(self) sum_dwh = _summary_dwh(self) diag_out += sum_dwh if spat_diag: - diag_out += _spat_diag_out(self, w, "yend") + diag_out += _spat_diag_out(self, w, 'yend') - output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) + output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) def _test(): import doctest diff --git a/spreg/twosls_regimes.py b/spreg/twosls_regimes.py index 820df45c..fc97889a 100644 --- a/spreg/twosls_regimes.py +++ b/spreg/twosls_regimes.py @@ -16,6 +16,7 @@ class TSLS_Regimes(BaseTSLS, REGI.Regimes_Frame): + """ Two stage least squares (2SLS) with regimes. @@ -365,6 +366,7 @@ def __init__( summ=True, latex=False, ): + n = USER.check_arrays(y, x) y, name_y = USER.check_y(y, n, name_y) USER.check_robust(robust, gwk) @@ -376,12 +378,8 @@ def __init__( "Error by regimes is not available for HAC estimation. The error by regimes has been disabled for this model.", ) regime_err_sep = False - if spat_diag: - set_warn( - self, - "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", - ) - spat_diag = False + spat_diag, warn = USER.check_spat_diag(spat_diag=spat_diag, w=w, robust=robust, slx_lags=slx_lags) + set_warn(self, warn) x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) @@ -436,7 +434,7 @@ def __init__( name_yend, name_q, summ, - latex, + latex ) else: q, self.name_q = REGI.Regimes_Frame.__init__( @@ -449,7 +447,7 @@ def __init__( constant_regi, cols2regi=cols2regi, names=name_x, - rlist=True, + rlist=True ) yend, self.name_yend, yend_rlist = REGI.Regimes_Frame.__init__( self, @@ -459,16 +457,13 @@ def __init__( cols2regi=cols2regi, yend=True, names=name_yend, - rlist=True, - ) - self.output = pd.DataFrame( - self.name_x + self.name_yend, columns=["var_names"] + rlist=True ) - self.output["var_type"] = ["x"] * len(self.name_x) + ["yend"] * len( - self.name_yend - ) - self.output["regime"] = x_rlist + yend_rlist - self.output["equation"] = 0 + self.output = pd.DataFrame(self.name_x+self.name_yend, + columns=['var_names']) + self.output['var_type'] = ['x']*len(self.name_x)+['yend']*len(self.name_yend) + self.output['regime'] = x_rlist+yend_rlist + self.output['equation'] = 0 BaseTSLS.__init__( self, y=y, x=x, yend=yend, q=q, robust=robust, gwk=gwk, sig2n_k=sig2n_k @@ -487,11 +482,12 @@ def __init__( self.robust = USER.set_robust(robust) if summ: if spat_diag: - diag_out = _spat_diag_out(self, w, "yend") + diag_out = _spat_diag_out(self, w, 'yend') else: diag_out = None output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) + def _tsls_regimes_multi( self, x, @@ -510,7 +506,7 @@ def _tsls_regimes_multi( name_yend, name_q, summ, - latex, + latex ): results_p = {} """ @@ -547,7 +543,7 @@ def _tsls_regimes_multi( name_q, self.name_w, self.name_regimes, - slx_lags, + slx_lags ), ) else: @@ -569,7 +565,7 @@ def _tsls_regimes_multi( name_q, self.name_w, self.name_regimes, - slx_lags, + slx_lags ) ) @@ -600,9 +596,7 @@ def _tsls_regimes_multi( self.name_h, ) = ([], [], [], [], [], []) counter = 0 - self.output = pd.DataFrame( - columns=["var_names", "var_type", "regime", "equation"] - ) + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -619,32 +613,24 @@ def _tsls_regimes_multi( (counter * self.kr) : ((counter + 1) * self.kr), (counter * self.kr) : ((counter + 1) * self.kr), ] = results[r].vm - self.betas[(counter * self.kr) : ((counter + 1) * self.kr),] = results[ - r - ].betas - self.u[regi_ids[r],] = results[r].u - self.predy[regi_ids[r],] = results[r].predy + self.betas[ + (counter * self.kr) : ((counter + 1) * self.kr), + ] = results[r].betas + self.u[ + regi_ids[r], + ] = results[r].u + self.predy[ + regi_ids[r], + ] = results[r].predy self.name_y += results[r].name_y self.name_x += results[r].name_x self.name_yend += results[r].name_yend self.name_q += results[r].name_q self.name_z += results[r].name_z self.name_h += results[r].name_h - self.output = pd.concat( - [ - self.output, - pd.DataFrame( - { - "var_names": results[r].name_x + results[r].name_yend, - "var_type": ["x"] * len(results[r].name_x) - + ["yend"] * len(results[r].name_yend), - "regime": r, - "equation": r, - } - ), - ], - ignore_index=True, - ) + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x+results[r].name_yend, + 'var_type': ['x']*len(results[r].name_x)+['yend']*len(results[r].name_yend), + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.multi = results @@ -660,11 +646,11 @@ def _tsls_regimes_multi( self.chow = REGI.Chow(self) if spat_diag: self._get_spat_diag_props(results, regi_ids, x_constant, yend, q) - diag_out = _spat_diag_out(self, w, "yend") + diag_out = _spat_diag_out(self, w, 'yend') else: diag_out = None if summ: - self.output.sort_values(by="regime", inplace=True) + self.output.sort_values(by='regime', inplace=True) output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) def _get_spat_diag_props(self, results, regi_ids, x, yend, q): @@ -738,9 +724,9 @@ def _work( def _optimal_weight(reg, sig2n_k, warn=True): try: - Hu = reg.h.toarray() * reg.u**2 + Hu = reg.h.toarray() * reg.u ** 2 except: - Hu = reg.h * reg.u**2 + Hu = reg.h * reg.u ** 2 if sig2n_k: S = spdot(reg.h.T, Hu, array_out=True) / (reg.n - reg.k) else: @@ -756,7 +742,7 @@ def _optimal_weight(reg, sig2n_k, warn=True): else: vm = fac2 * reg.n RegressionProps_basic(reg, betas=betas, vm=vm, sig2=False) - # reg.title += " (Optimal-Weighted GMM)" + #reg.title += " (Optimal-Weighted GMM)" if warn: set_warn( reg, "Residuals treated as homoskedastic for the purpose of diagnostics." @@ -799,7 +785,7 @@ def _test(): yd, q, regimes, - w=w, + w = w, constant_regi="many", spat_diag=True, name_y=y_var, @@ -807,11 +793,15 @@ def _test(): name_yend=yd_var, name_q=q_var, name_regimes=r_var, - # cols2regi=[False, True, True, False], + #cols2regi=[False, True, True, False], sig2n_k=False, - regime_err_sep=True, - # robust = 'hac', - vm=False, + regime_err_sep = True, + #robust = 'hac', + vm = False ) print(tslsr.output) print(tslsr.summary) + + + + diff --git a/spreg/twosls_sp.py b/spreg/twosls_sp.py index 6c123421..e5abe017 100755 --- a/spreg/twosls_sp.py +++ b/spreg/twosls_sp.py @@ -171,28 +171,31 @@ class BaseGM_Lag(TSLS.BaseTSLS): """ def __init__( - self, - y, - x, - yend=None, - q=None, - w=None, - w_lags=1, - slx_lags=0, - slx_vars="All", - lag_q=True, - robust=None, - gwk=None, - sig2n_k=False, + self, + y, + x, + yend=None, + q=None, + w=None, + w_lags=1, + slx_lags=0, + slx_vars="All", + lag_q=True, + robust=None, + gwk=None, + sig2n_k=False, ): + + + if slx_lags > 0: - yend2, q2, wx = set_endog( - y, x[:, 1:], w, yend, q, w_lags, lag_q, slx_lags, slx_vars - ) + yend2, q2, wx = set_endog(y, x[:, 1:], w, yend, q, w_lags, lag_q, slx_lags,slx_vars) x = np.hstack((x, wx)) else: yend2, q2 = set_endog(y, x[:, 1:], w, yend, q, w_lags, lag_q) + + TSLS.BaseTSLS.__init__( self, y=y, x=x, yend=yend2, q=q2, robust=robust, gwk=gwk, sig2n_k=sig2n_k ) @@ -248,7 +251,7 @@ class GM_Lag(BaseGM_Lag): Include average direct impact (ADI), average indirect impact (AII), and average total impact (ATI) in summary results. Options are 'simple', 'full', 'power', 'all' or None. - See sputils._spmultiplier for more information. + See sputils.spmultiplier for more information. vm : boolean If True, include variance-covariance matrix in summary results @@ -501,67 +504,61 @@ class GM_Lag(BaseGM_Lag): """ def __init__( - self, - y, - x, - yend=None, - q=None, - w=None, - w_lags=1, - lag_q=True, - slx_lags=0, - slx_vars="All", - robust=None, - gwk=None, - sig2n_k=False, - spat_diag=True, - spat_impacts="simple", - vm=False, - name_y=None, - name_x=None, - name_yend=None, - name_q=None, - name_w=None, - name_gwk=None, - name_ds=None, - latex=False, - hard_bound=False, + self, + y, + x, + yend=None, + q=None, + w=None, + w_lags=1, + lag_q=True, + slx_lags=0, + slx_vars="All", + robust=None, + gwk=None, + sig2n_k=False, + spat_diag=True, + spat_impacts="simple", + vm=False, + name_y=None, + name_x=None, + name_yend=None, + name_q=None, + name_w=None, + name_gwk=None, + name_ds=None, + latex=False, + hard_bound=False, ): + n = USER.check_arrays(x, yend, q) y, name_y = USER.check_y(y, n, name_y) w = USER.check_weights(w, y, w_required=True, slx_lags=slx_lags) USER.check_robust(robust, gwk) yend, q, name_yend, name_q = USER.check_endog([yend, q], [name_yend, name_q]) - if robust == "hac" and spat_diag: - set_warn( - self, - "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", - ) - spat_diag = False + spat_diag, warn = USER.check_spat_diag(spat_diag=spat_diag, w=w, robust=robust, slx_lags=slx_lags) + set_warn(self, warn) x_constant, name_x, warn = USER.check_constant(x, name_x) - name_x = USER.set_name_x( - name_x, x_constant - ) # need to check for None and set defaults + set_warn(self, warn) + name_x = USER.set_name_x(name_x, x_constant) # need to check for None and set defaults # kx and wkx are used to replace complex calculation for output if slx_lags > 0: # adjust for flexwx - if isinstance(slx_vars, list): # slx_vars has True,False - if len(slx_vars) != x.shape[1]: + if (isinstance(slx_vars,list)): # slx_vars has True,False + if len(slx_vars) != x.shape[1] : raise Exception("slx_vars incompatible with x column dimensions") else: # use slx_vars to extract proper columns workname = name_x[1:] kx = len(workname) - vv = list(compress(workname, slx_vars)) + vv = list(compress(workname,slx_vars)) name_x += USER.set_name_spatial_lags(vv, slx_lags) wkx = slx_vars.count(True) else: kx = len(name_x) - 1 wkx = kx - name_x += USER.set_name_spatial_lags( - name_x[1:], slx_lags - ) # exclude constant + name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags) # exclude constant + - set_warn(self, warn) BaseGM_Lag.__init__( self, y=y, @@ -580,12 +577,7 @@ def __init__( self.rho = self.betas[-1] self.predy_e, self.e_pred, warn = sp_att( - w, - self.y, - self.predy, - self.yend[:, -1].reshape(self.n, 1), - self.rho, - hard_bound=hard_bound, + w, self.y, self.predy, self.yend[:, -1].reshape(self.n, 1), self.rho, hard_bound=hard_bound ) set_warn(self, warn) self.title = "SPATIAL TWO STAGE LEAST SQUARES" @@ -603,41 +595,25 @@ def __init__( if slx_lags > 0: # need to remove all but last SLX variables from name_x self.name_x0 = [] self.name_x0.append(self.name_x[0]) # constant - if isinstance(slx_vars, list): # boolean list passed + if (isinstance(slx_vars,list)): # boolean list passed # x variables that were not lagged - self.name_x0.extend( - list(compress(self.name_x[1:], [not i for i in slx_vars])) - ) + self.name_x0.extend(list(compress(self.name_x[1:],[not i for i in slx_vars]))) # last wkx variables self.name_x0.extend(self.name_x[-wkx:]) + else: - okx = int( - (self.k - self.kstar - 1) / (slx_lags + 1) - ) # number of original exogenous vars + okx = int((self.k - self.kstar - 1) / (slx_lags + 1)) # number of original exogenous vars self.name_x0.extend(self.name_x[-okx:]) - self.name_q.extend( - USER.set_name_q_sp(self.name_x0, w_lags, self.name_q, lag_q) - ) - - # var_types = ['x'] * (kx + 1) + ['wx'] * kx * slx_lags + ['yend'] * (len(self.name_yend) - 1) + ['rho'] - var_types = ( - ["x"] * (kx + 1) - + ["wx"] * wkx * slx_lags - + ["yend"] * (len(self.name_yend) - 1) - + ["rho"] - ) + self.name_q.extend(USER.set_name_q_sp(self.name_x0, w_lags, self.name_q, lag_q)) + + #var_types = ['x'] * (kx + 1) + ['wx'] * kx * slx_lags + ['yend'] * (len(self.name_yend) - 1) + ['rho'] + var_types = ['x'] * (kx + 1) + ['wx'] * wkx * slx_lags + ['yend'] * (len(self.name_yend) - 1) + ['rho'] else: - self.name_q.extend( - USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q) - ) - var_types = ( - ["x"] * len(self.name_x) - + ["yend"] * (len(self.name_yend) - 1) - + ["rho"] - ) + self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) + var_types = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend) - 1) + ['rho'] self.name_h = USER.set_name_h(self.name_x, self.name_q) self.robust = USER.set_robust(robust) @@ -646,18 +622,16 @@ def __init__( self.slx_lags = slx_lags self.slx_vars = slx_vars - self.output = pd.DataFrame(self.name_x + self.name_yend, columns=["var_names"]) - self.output["var_type"] = var_types - self.output["regime"], self.output["equation"] = (0, 0) + self.output = pd.DataFrame(self.name_x + self.name_yend, columns=['var_names']) + self.output['var_type'] = var_types + self.output['regime'], self.output['equation'] = (0, 0) self.other_top = _spat_pseudo_r2(self) diag_out = None if spat_diag: - diag_out = _spat_diag_out(self, w, "yend") + diag_out = _spat_diag_out(self, w, 'yend') if spat_impacts: - self.sp_multipliers, impacts_str = _summary_impacts( - self, w, spat_impacts, slx_lags, slx_vars - ) + self.sp_multipliers, impacts_str = _summary_impacts(self, w, spat_impacts, slx_lags,slx_vars) try: diag_out += impacts_str except TypeError: diff --git a/spreg/twosls_sp_regimes.py b/spreg/twosls_sp_regimes.py index ec7545b1..9488b563 100644 --- a/spreg/twosls_sp_regimes.py +++ b/spreg/twosls_sp_regimes.py @@ -11,22 +11,14 @@ from . import user_output as USER from .twosls_regimes import TSLS_Regimes, _optimal_weight from .twosls import BaseTSLS -from .utils import ( - set_endog, - set_endog_sparse, - sp_att, - set_warn, - sphstack, - spdot, - optim_k, -) +from .utils import set_endog, set_endog_sparse, sp_att, set_warn, sphstack, spdot, optim_k from .robust import hac_multi from .output import output, _spat_diag_out, _spat_pseudo_r2, _summary_impacts from .skater_reg import Skater_reg from .twosls_sp import BaseGM_Lag - class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): + """ Spatial two stage least squares (S2SLS) with regimes; :cite:`Anselin1988` @@ -99,7 +91,7 @@ class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): Include average direct impact (ADI), average indirect impact (AII), and average total impact (ATI) in summary results. Options are 'simple', 'full', 'power', 'all' or None. - See sputils._spmultiplier for more information. + See sputils.spmultiplier for more information. spat_diag : boolean If True, then compute Anselin-Kelejian test and Common Factor Hypothesis test (if applicable) vm : boolean @@ -397,8 +389,8 @@ class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): the coefficient estimates by calling: >>> model.std_err - array([0.44682888, 0.14358192, 0.05655124, 1.06044865, 0.20184548, - 0.06118262, 0.12387232]) + array([0.38492902, 0.19106926, 0.06063249, 1.25607153, 0.36117334, + 0.092293 , 0.15116983]) In the example above, all coefficients but the spatial lag vary according to the regime. It is also possible to have the spatial lag @@ -408,15 +400,15 @@ class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): >>> model=GM_Lag_Regimes(y, x, regimes, w=w, regime_lag_sep=True, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT', name_w='NAT.shp') >>> print(model.output) - var_names coefficients std_err zt_stat prob - 0 0_CONSTANT 1.365848 0.398547 3.427066 0.00061 - 1 0_PS90 0.808757 0.113249 7.141418 0.0 - 2 0_UE90 0.569468 0.046251 12.312591 0.0 - 3 0_W_HR90 -0.434244 0.133502 -3.252724 0.001143 - 4 1_CONSTANT 7.907311 1.636019 4.833264 0.000001 - 5 1_PS90 1.274657 0.247099 5.158493 0.0 - 6 1_UE90 0.601677 0.079933 7.527245 0.0 - 7 1_W_HR90 -0.296034 0.199345 -1.485036 0.137534 + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 1.365848 0.385177 3.546023 0.000391 + 1 0_PS90 0.808757 0.206672 3.91325 0.000091 + 2 0_UE90 0.569468 0.067703 8.411247 0.0 + 3 0_W_HR90 -0.434244 0.177208 -2.450478 0.014267 + 4 1_CONSTANT 7.907311 1.772336 4.461518 0.000008 + 5 1_PS90 1.274657 0.368306 3.460869 0.000538 + 6 1_UE90 0.601677 0.102102 5.892907 0.0 + 7 1_W_HR90 -0.296034 0.226243 -1.308474 0.190712 Alternatively, we can type: 'model.summary' to see the organized results output. The class is flexible enough to accomodate a spatial lag model that, @@ -449,8 +441,8 @@ class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): model.summary >>> model.std_err - array([0.49163311, 0.12237382, 0.05633464, 0.72555909, 0.17250521, - 0.06749131, 0.27370369, 0.25106224, 0.05804213]) + array([0.49529467, 0.18912143, 0.05157813, 0.92277557, 0.33711135, + 0.08993181, 0.33506177, 0.36381449, 0.07209498]) """ def __init__( @@ -464,7 +456,7 @@ def __init__( w_lags=1, slx_lags=0, lag_q=True, - robust="white", + robust='white', gwk=None, sig2n_k=False, spat_diag=True, @@ -486,19 +478,17 @@ def __init__( latex=False, hard_bound=False, ): + n = USER.check_arrays(y, x) y, name_y = USER.check_y(y, n, name_y) yend, q, name_yend, name_q = USER.check_endog([yend, q], [name_yend, name_q]) w = USER.check_weights(w, y, w_required=True, slx_lags=slx_lags) USER.check_robust(robust, gwk) if regime_lag_sep and not regime_err_sep: - set_warn(self, "regime_err_sep set to True when regime_lag_sep=True.") + set_warn(self, "regime_err_sep set to True when regime_lag_sep=True.") regime_err_sep = True if regime_err_sep and not regime_lag_sep: - set_warn( - self, - "Groupwise heteroskedasticity is not currently available for this method,\n so regime_err_sep has been set to False.", - ) + set_warn(self, "Groupwise heteroskedasticity is not currently available for this method,\n so regime_err_sep has been set to False.") regime_err_sep = False if robust == "hac": if regime_err_sep: @@ -507,12 +497,8 @@ def __init__( "Error by regimes is not available for HAC estimation. The error by regimes has been disabled for this model.", ) regime_err_sep = False - if spat_diag: - set_warn( - self, - "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", - ) - spat_diag = False + spat_diag, warn = USER.check_spat_diag(spat_diag=spat_diag, w=w, robust=robust, slx_lags=slx_lags) + set_warn(self, warn) x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) @@ -524,33 +510,22 @@ def __init__( self.name_regimes = USER.set_name_ds(name_regimes) self.constant_regi = constant_regi if slx_lags > 0: - yend2, q2, wx = set_endog( - y, x_constant, w, yend, q, w_lags, lag_q, slx_lags - ) + yend2, q2, wx = set_endog(y, x_constant, w, yend, q, w_lags, lag_q, slx_lags) x_constant = np.hstack((x_constant, wx)) name_slx = USER.set_name_spatial_lags(name_x, slx_lags) - name_q.extend( - USER.set_name_q_sp( - name_slx[-len(name_x) :], w_lags, name_q, lag_q, force_all=True - ) - ) + name_q.extend(USER.set_name_q_sp(name_slx[-len(name_x):], w_lags, name_q, lag_q, force_all=True)) name_x += name_slx - if cols2regi == "all": + if cols2regi == 'all': cols2regi = REGI.check_cols2regi( - constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False - )[0:-1] + constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False)[0:-1] else: cols2regi = REGI.check_cols2regi( - constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False - ) + constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False) else: - name_q.extend( - USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True) - ) + name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) yend2, q2 = yend, q cols2regi = REGI.check_cols2regi( - constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False - ) + constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False) self.n = x_constant.shape[0] self.cols2regi = cols2regi self.regimes_set = REGI._get_regimes_set(regimes) @@ -653,38 +628,25 @@ def __init__( self.sp_att_reg(w_i, regi_ids, yend2[:, -1].reshape(self.n, 1)) else: self.rho = self.betas[-1] - self.output.iat[-1, self.output.columns.get_loc("var_type")] = "rho" + self.output.iat[-1, self.output.columns.get_loc('var_type')] = 'rho' self.predy_e, self.e_pred, warn = sp_att( - w, - self.y, - self.predy, - yend2[:, -1].reshape(self.n, 1), - self.rho, - hard_bound=hard_bound, - ) + w, self.y, self.predy, yend2[:, -1].reshape(self.n, 1), self.rho, hard_bound=hard_bound) set_warn(self, warn) self.regime_lag_sep = regime_lag_sep self.title = "SPATIAL " + self.title if slx_lags > 0: for m in self.regimes_set: - r_output = self.output[ - (self.output["regime"] == str(m)) - & (self.output["var_type"] == "x") - ] - wx_index = r_output.index[ - -((len(r_output) - 1) // (slx_lags + 1)) * slx_lags : - ] - self.output.loc[wx_index, "var_type"] = "wx" + r_output = self.output[(self.output['regime'] == str(m)) & (self.output['var_type'] == 'x')] + wx_index = r_output.index[-((len(r_output)-1)//(slx_lags+1)) * slx_lags:] + self.output.loc[wx_index, 'var_type'] = 'wx' self.title = " SPATIAL 2SLS WITH SLX (SPATIAL DURBIN MODEL) - REGIMES" self.other_top = _spat_pseudo_r2(self) self.slx_lags = slx_lags diag_out = None if spat_diag: - diag_out = _spat_diag_out(self, w, "yend") + diag_out = _spat_diag_out(self, w, 'yend') if spat_impacts: - self.sp_multipliers, impacts_str = _summary_impacts( - self, w, spat_impacts, slx_lags, regimes=True - ) + self.sp_multipliers, impacts_str = _summary_impacts(self, w, spat_impacts, slx_lags, regimes=True) try: diag_out += impacts_str except TypeError: @@ -823,9 +785,7 @@ def GM_Lag_Regimes_Multi( self.name_h, ) = ([], [], [], [], [], []) counter = 0 - self.output = pd.DataFrame( - columns=["var_names", "var_type", "regime", "equation"] - ) + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -842,8 +802,7 @@ def GM_Lag_Regimes_Multi( results[r].y, results[r].predy, results[r].yend[:, -1].reshape(results[r].n, 1), - results[r].rho, - hard_bound=hard_bound, + results[r].rho, hard_bound=hard_bound ) set_warn(results[r], warn) results[r].w = w_i[r] @@ -851,13 +810,21 @@ def GM_Lag_Regimes_Multi( (counter * self.kr) : ((counter + 1) * self.kr), (counter * self.kr) : ((counter + 1) * self.kr), ] = results[r].vm - self.betas[(counter * self.kr) : ((counter + 1) * self.kr),] = results[ - r - ].betas - self.u[regi_ids[r],] = results[r].u - self.predy[regi_ids[r],] = results[r].predy - self.predy_e[regi_ids[r],] = results[r].predy_e - self.e_pred[regi_ids[r],] = results[r].e_pred + self.betas[ + (counter * self.kr) : ((counter + 1) * self.kr), + ] = results[r].betas + self.u[ + regi_ids[r], + ] = results[r].u + self.predy[ + regi_ids[r], + ] = results[r].predy + self.predy_e[ + regi_ids[r], + ] = results[r].predy_e + self.e_pred[ + regi_ids[r], + ] = results[r].e_pred self.name_y += results[r].name_y self.name_x += results[r].name_x self.name_yend += results[r].name_yend @@ -866,38 +833,24 @@ def GM_Lag_Regimes_Multi( self.name_h += results[r].name_h if r == self.regimes_set[0]: self.hac_var = np.zeros((self.n, results[r].h.shape[1]), float) - self.hac_var[regi_ids[r],] = results[r].h + self.hac_var[ + regi_ids[r], + ] = results[r].h results[r].other_top = _spat_pseudo_r2(results[r]) results[r].other_mid = "" if slx_lags > 0: kx = (results[r].k - results[r].kstar - 1) // (slx_lags + 1) - var_types = ( - ["x"] * (kx + 1) - + ["wx"] * kx * slx_lags - + ["yend"] * (len(results[r].name_yend) - 1) - + ["rho"] - ) + var_types = ['x'] * (kx + 1) + ['wx'] * kx * slx_lags + ['yend'] * (len(results[r].name_yend) - 1) + ['rho'] else: - var_types = ( - ["x"] * len(results[r].name_x) - + ["yend"] * (len(results[r].name_yend) - 1) - + ["rho"] - ) - results[r].output = pd.DataFrame( - { - "var_names": results[r].name_x + results[r].name_yend, - "var_type": var_types, - "regime": r, - "equation": r, - } - ) + var_types = ['x'] * len(results[r].name_x) + ['yend'] * (len(results[r].name_yend)-1) + ['rho'] + results[r].output = pd.DataFrame({'var_names': results[r].name_x + results[r].name_yend, + 'var_type': var_types, + 'regime': r, 'equation': r}) self.output = pd.concat([self.output, results[r].output], ignore_index=True) if spat_diag: - results[r].other_mid += _spat_diag_out(results[r], results[r].w, "yend") + results[r].other_mid += _spat_diag_out(results[r], results[r].w, 'yend') if spat_impacts: - results[r].sp_multipliers, impacts_str = _summary_impacts( - results[r], results[r].w, spat_impacts, slx_lags - ) + results[r].sp_multipliers, impacts_str = _summary_impacts(results[r], results[r].w, spat_impacts, slx_lags) results[r].other_mid += impacts_str counter += 1 self.multi = results @@ -909,7 +862,7 @@ def GM_Lag_Regimes_Multi( "Residuals treated as homoskedastic for the purpose of diagnostics.", ) self.chow = REGI.Chow(self) - # if spat_diag: + #if spat_diag: # self._get_spat_diag_props(y, x, w, yend, q, w_lags, lag_q) output(reg=self, vm=vm, robust=robust, other_end=False, latex=latex) @@ -1019,63 +972,369 @@ def _work( class GM_Lag_Endog_Regimes(GM_Lag_Regimes): - def __init__(self, y, x, w, n_clusters=None, quorum=-np.inf, trace=True, **kwargs): + + """ + Spatial two stage least squares (S2SLS) with endogenous regimes. + Based on the function skater_reg as shown in :cite:`Anselin2021`. + + Parameters + ---------- + y : numpy.ndarray or pandas.Series + nx1 array for dependent variable + x : numpy.ndarray or pandas object + Two dimensional array with n rows and one column for each + independent (exogenous) variable, excluding the constant + w : pysal W object + Spatial weights object (required if running spatial + diagnostics) + n_clusters : int + Number of clusters to be used in the endogenous regimes. + If None (default), the number of clusters will be chosen + according to the function utils.optim_k using a method + adapted from Mojena (1977)'s Rule Two + quorum : int + Minimum number of observations in a cluster to be considered + Must be at least larger than the number of variables in x + Default value is 30 or 10*k, whichever is larger. + trace : boolean + Sets whether to store intermediate results of the clustering + Hard-coded to True if n_clusters is None + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_w : string + Name of weights matrix for use in output + name_gwk : string + Name of kernel weights matrix for use in output + name_ds : string + Name of dataset for use in output + name_regimes : string + Name of regimes variable for use in output + latex : boolean + Specifies if summary is to be printed in latex format + **kwargs : additional keyword arguments depending on the specific model + + Attributes + ---------- + output : dataframe + regression results pandas dataframe + summary : string + Summary of regression results and diagnostics (note: use in + conjunction with the print command) + betas : array + kx1 array of estimated coefficients + u : array + nx1 array of residuals + e_pred : array + nx1 array of residuals (using reduced form) + predy : array + nx1 array of predicted y values + predy_e : array + nx1 array of predicted y values (using reduced form) + n : integer + Number of observations + k : integer + Number of variables for which coefficients are estimated + (including the constant) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + kstar : integer + Number of endogenous variables. + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + y : array + nx1 array for dependent variable + x : array + Two dimensional array with n rows and one column for each + independent (exogenous) variable, including the constant + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + yend : array + Two dimensional array with n rows and one column for each + endogenous variable + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + q : array + Two dimensional array with n rows and one column for each + external exogenous variable used as instruments + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + z : array + nxk array of variables (combination of x and yend) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + h : array + nxl array of instruments (combination of x and q) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + robust : string + Adjustment for robust standard errors + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + mean_y : float + Mean of dependent variable + std_y : float + Standard deviation of dependent variable + vm : array + Variance covariance matrix (kxk) + pr2 : float + Pseudo R squared (squared correlation between y and ypred) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + pr2_e : float + Pseudo R squared (squared correlation between y and ypred_e + (using reduced form)) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + utu : float + Sum of squared residuals + sig2 : float + Sigma squared used in computations + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + std_err : array + 1xk array of standard errors of the betas + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + z_stat : list of tuples + z statistic; each tuple contains the pair (statistic, + p-value), where each is a float + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + ak_test : tuple + Anselin-Kelejian test; tuple contains the pair (statistic, + p-value) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + cfh_test : tuple + Common Factor Hypothesis test; tuple contains the pair (statistic, + p-value). Only when it applies (see specific documentation). + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_yend : list of strings + Names of endogenous variables for use in output + name_z : list of strings + Names of exogenous and endogenous variables for use in + output + name_q : list of strings + Names of external instruments + name_h : list of strings + Names of all instruments used in ouput + name_w : string + Name of weights matrix for use in output + name_gwk : string + Name of kernel weights matrix for use in output + name_ds : string + Name of dataset for use in output + name_regimes : string + Name of regimes variable for use in output + title : string + Name of the regression method used + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + sig2n : float + Sigma squared (computed with n in the denominator) + sig2n_k : float + Sigma squared (computed with n-k in the denominator) + hth : float + :math:`H'H`. + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + hthi : float + :math:`(H'H)^{-1}`. + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + varb : array + :math:`(Z'H (H'H)^{-1} H'Z)^{-1}`. + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + zthhthi : array + :math:`Z'H(H'H)^{-1}`. + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + pfora1a2 : array + n(zthhthi)'varb + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + sp_multipliers: dict + Dictionary of spatial multipliers (if spat_impacts is not None) + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + regimes : list + List of n values with the mapping of each + observation to a regime. Assumed to be aligned with 'x'. + constant_regi: string + Ignored if regimes=False. Constant option for regimes. + Switcher controlling the constant term setup. It may take + the following values: + + * 'one': a vector of ones is appended to x and held constant across regimes. + + * 'many': a vector of ones is appended to x and considered different per regime. + cols2regi : list, 'all' + Ignored if regimes=False. Argument indicating whether each + column of x should be considered as different per regime + or held constant across regimes (False). + If a list, k booleans indicating for each variable the + option (True if one per regime, False to be held constant). + If 'all', all the variables vary by regime. + regime_lag_sep: boolean + If True, the spatial parameter for spatial lag is also + computed according to different regimes. If False (default), + the spatial parameter is fixed accross regimes. + regime_err_sep: boolean + If True, a separate regression is run for each regime. + kr : int + Number of variables/columns to be "regimized" or subject + to change by regime. These will result in one parameter + estimate by regime for each variable (i.e. nr parameters per + variable) + kf : int + Number of variables/columns to be considered fixed or + global across regimes and hence only obtain one parameter + estimate + nr : int + Number of different regimes in the 'regimes' list + multi : dictionary + Only available when multiple regressions are estimated, + i.e. when regime_err_sep=True and no variable is fixed + across regimes. + Contains all attributes of each individual regression + SSR : list + list with the total sum of squared residuals for the model + considering all regimes for each of steps of number of regimes + considered, starting with the solution with 2 regimes. + clusters : int + Number of clusters considered in the endogenous regimes + _trace : list + List of dictionaries with the clustering results for each + number of clusters tested. Only available if n_clusters is + None or trace=True. + Examples + -------- + >>> import libpysal + >>> import numpy as np + >>> np.set_printoptions(legacy='1.25') #to avoid printing issues with numpy floats + >>> import geopandas as gpd + >>> from spreg import OLS_Endog_Regimes + + Open data on Baltimore house sales price and characteristics in Baltimore + from libpysal examples using geopandas. + + >>> db = gpd.read_file(libpysal.examples.get_path('baltim.shp')) + + We will create a weights matrix based on contiguity. + + >>> w = libpysal.weights.Queen.from_dataframe(db, use_index=True) + >>> w.transform = "r" + + For this example, we will use the 'PRICE' column as the dependent variable and + the 'NROOM', 'AGE', and 'SQFT' columns as independent variables. + At this point, we will let the model choose the number of clusters. + + >>> reg = GM_Lag_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w, name_w="baltim_q.gal") + + The function `print(reg.summary)` can be used to visualize the results of the regression. + + Alternatively, we can check individual attributes: + >>> reg.betas + array([[ 6.20932938], + [ 4.25581944], + [-0.1468118 ], + [ 0.40893082], + [ 5.01866492], + [ 4.84994184], + [-0.55425337], + [ 1.04577632], + [ 0.05155043]]) + >>> reg.SSR + [59784.06769835169, 56858.621800274515] + >>> reg.clusters + array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], dtype=int32) + + We will now set the number of clusters to 2 and run the regression again. + + >>> reg = GM_Lag_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w, n_clusters=2, name_w="baltim_q.gal") + + The function `print(reg.summary)` can be used to visualize the results of the regression. + + Alternatively, we can check individual attributes as before: + >>> reg.betas + array([[ 6.20932938], + [ 4.25581944], + [-0.1468118 ], + [ 0.40893082], + [ 5.01866492], + [ 4.84994184], + [-0.55425337], + [ 1.04577632], + [ 0.05155043]]) + >>> reg.SSR + [59784.06769835169] + >>> reg.clusters + array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], dtype=int32) + + """ + + + def __init__( + self, y, x, w, n_clusters=None, quorum=-1, trace=True, name_y=None, name_x=None, **kwargs): + n = USER.check_arrays(y, x) y, name_y = USER.check_y(y, n, name_y) w = USER.check_weights(w, y, w_required=True) - + x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) + set_warn(self, warn) # Standardize the variables - x_std = (x - np.mean(x, axis=0)) / np.std(x, axis=0) + x_std = (x_constant - np.mean(x_constant, axis=0)) / np.std(x_constant, axis=0) + if quorum < 0: + quorum = np.max([(x.shape[1]+1)*10, 30]) + if not n_clusters: - if quorum < 0: - quorum = np.max([(x.shape[1] + 1) * 10, 30]) - n_clusters_opt = x.shape[0] * 0.70 // quorum + n_clusters_opt = x_constant.shape[0]*0.70//quorum if n_clusters_opt < 2: raise ValueError( - "The combination of the values of `N` and `quorum` is not compatible with regimes estimation." - ) - sk_reg_results = Skater_reg().fit( - n_clusters_opt, - w, - x_std, - {"reg": BaseGM_Lag, "y": y, "x": x, "w": w}, - quorum=quorum, - trace=True, - ) - n_clusters = optim_k( - [ - sk_reg_results._trace[i][1][2] - for i in range(1, len(sk_reg_results._trace)) - ] - ) - self.clusters = sk_reg_results._trace[n_clusters - 1][0] + "The combination of the values of `N` and `quorum` is not compatible with regimes estimation.") + sk_reg_results = Skater_reg().fit(n_clusters_opt, w, x_std, {'reg':BaseGM_Lag,'y':y,'x':x_constant,'w':w}, quorum=quorum, trace=True) + n_clusters = optim_k([sk_reg_results._trace[i][1][2] for i in range(1, len(sk_reg_results._trace))]) + self.clusters = sk_reg_results._trace[n_clusters-1][0] else: try: # Call the Skater_reg method based on GM_Lag - sk_reg_results = Skater_reg().fit( - n_clusters, - w, - x_std, - {"reg": BaseGM_Lag, "y": y, "x": x, "w": w}, - quorum=quorum, - trace=trace, - ) + sk_reg_results = Skater_reg().fit(n_clusters, w, x_std, {'reg':BaseGM_Lag,'y':y,'x':x_constant,'w':w}, quorum=quorum, trace=trace) self.clusters = sk_reg_results.current_labels_ except Exception as e: if str(e) == "one or more input arrays have more columns than rows": - raise ValueError( - "One or more input ended up with more variables than observations. Please check your setting for `quorum`." - ) + raise ValueError("One or more input ended up with more variables than observations. Please check your setting for `quorum`.") else: print("An error occurred:", e) self._trace = sk_reg_results._trace self.SSR = [self._trace[i][1][2] for i in range(1, len(self._trace))] - GM_Lag_Regimes.__init__( - self, y, x, regimes=self.clusters, w=w, name_regimes="Skater_reg", **kwargs - ) + GM_Lag_Regimes.__init__(self, y, x, regimes=self.clusters, w=w, name_y=name_y, name_x=name_x, name_regimes='Skater_reg', **kwargs) def _test(): @@ -1127,7 +1386,7 @@ def _test(): name_ds="columbus", name_w="columbus.gal", regime_err_sep=True, - regime_lag_sep=True, + regime_lag_sep = True, robust="white", ) print(model.output) diff --git a/spreg/user_output.py b/spreg/user_output.py index f5a236e2..18bab6e6 100755 --- a/spreg/user_output.py +++ b/spreg/user_output.py @@ -9,14 +9,14 @@ ) import numpy as np import pandas as pd +import geopandas as gpd # new for check_coords import copy as COPY -from . import diagnostics from . import sputils as spu from libpysal import weights from libpysal import graph from scipy.sparse.csr import csr_matrix -from .utils import get_lags # new for flex_wx -from itertools import compress # new for lfex_wx +from .utils import get_lags # new for flex_wx +from itertools import compress # new for lfex_wx def set_name_ds(name_ds): @@ -152,7 +152,6 @@ def set_name_yend_sp(name_y): """ return "W_" + name_y - def set_name_spatial_lags(names, w_lags): """Set the spatial lag names for multiple variables and lag orders" @@ -167,11 +166,10 @@ def set_name_spatial_lags(names, w_lags): """ lag_names = ["W_" + s for s in names] - for i in range(w_lags - 1): - lag_names += ["W" + str(i + 2) + "_" + s for s in names] + for i in range(w_lags-1): + lag_names += ["W" + str(i+2) + "_" + s for s in names] return lag_names - def set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=False): """Set the spatial instrument names in regression; return generic name if user provides no explicit name." @@ -196,7 +194,7 @@ def set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=False): names = names + name_q sp_inst_names = [] existing_names = set(names) - name_count = {} # Dictionary to store the count of each name + name_count = {} # Dictionary to store the count of each name for name in names: if not name.startswith("W_"): @@ -417,7 +415,7 @@ def check_y(y, n, name_y=None): n : int number of observations - + name_y : string Name of the y variable @@ -425,7 +423,7 @@ def check_y(y, n, name_y=None): ------- y : anything Object passed by the user to a regression class - + name_y : string Name of the y variable @@ -454,7 +452,7 @@ def check_y(y, n, name_y=None): name_y = y.columns.to_list() if len(name_y) == 1: name_y = name_y[0] - + y = y.to_numpy() if not isinstance(y, np.ndarray): print(y.__class__.__name__) @@ -475,7 +473,6 @@ def check_y(y, n, name_y=None): ) return y, name_y - def check_endog(arrays, names): """Check if each of the endogenous arrays passed by a user to a regression class are pandas objects. In this case, the function converts them to numpy arrays and collects their names. @@ -502,8 +499,7 @@ def check_endog(arrays, names): arrays[i].shape = (arrays[i].shape[0], 1) return (*arrays, *names) - -def check_weights(w, y, w_required=False, time=False, slx_lags=0): +def check_weights(w, y, w_required=False, time=False, slx_lags=0, allow_wk=False): """Check if the w parameter passed by the user is a libpysal.W object and check that its dimensionality matches the y parameter. Note that this check is not performed if w set to None. @@ -523,6 +519,8 @@ def check_weights(w, y, w_required=False, time=False, slx_lags=0): False (default) if not. slx_lags : int Number of lags of X in the spatial lag model. + allow_wk : boolean + True if Kernel weights are allowed as W, False (default) if not. Returns ------- @@ -551,18 +549,21 @@ def check_weights(w, y, w_required=False, time=False, slx_lags=0): if w_required == True or (w is not None) or slx_lags > 0: if isinstance(w, graph.Graph): w = w.to_W() - + if w == None: raise Exception("A weights matrix w must be provided to run this method.") - + if not isinstance(w, weights.W): from warnings import warn - warn("w must be API-compatible pysal weights object") # check for kernel weights, if so insert zeros on diagonal - if slx_lags == 1 and isinstance(w, weights.Kernel): - w = weights.fill_diagonal(w, val=0.0) + if slx_lags == 1 and isinstance(w, weights.Kernel) and allow_wk: + w = weights.fill_diagonal(w,val=0.0) + elif slx_lags > 1 and isinstance(w, weights.Kernel): + raise Exception("Higher orders not supported for kernel weights") + elif isinstance(w, weights.Kernel) and (slx_lags == 0 or not allow_wk): + raise Exception("Kernel weights not allowed as W for this model") if w.n != y.shape[0] and time == False: raise Exception("y must have n rows, and w must be an nxn PySAL W object") @@ -570,7 +571,7 @@ def check_weights(w, y, w_required=False, time=False, slx_lags=0): # check to make sure all entries equal 0 if diag.min() != 0 or diag.max() != 0: raise Exception("All entries on diagonal must equal 0.") - + return w @@ -641,15 +642,14 @@ def check_robust(robust, wk): # NOTE: we are not checking for the case of exactly 1.0 ### raise Exception("Off-diagonal entries must be less than 1.") elif robust.lower() == "white" or robust.lower() == "ogmm": - # if wk: - # raise Exception("White requires that wk be set to None") - pass # these options are not affected by wk + # if wk: + # raise Exception("White requires that wk be set to None") + pass # these options are not affected by wk else: raise Exception( "invalid value passed to robust, see docs for valid options" ) - ''' Deprecated in 1.6.1 def check_spat_diag(spat_diag, w): """Check if there is a w parameter passed by the user if the user also @@ -693,6 +693,37 @@ def check_spat_diag(spat_diag, w): ''' +def check_spat_diag(spat_diag, w, robust=None, slx_lags=None): + """Check if spatial diagnostics should be disabled given argument combinations. + + Parameters + ---------- + spat_diag : boolean + Value passed by a used to a regression class + w : weights object + robust : string + variance-covariance estimator + + + Returns + ------- + spat_diag : boolean + Updated spatial diagnostics flag + warn : string + Warning message if it is not possible to run spatial diagnostics + + """ + warn = None + if robust == "hac" and spat_diag: + warn = "Spatial diagnostics are not available for HAC estimation.\nHence, spatial diagnostics have been disabled for this model." + spat_diag = False + + if spat_diag and isinstance(w, weights.Kernel) and slx_lags > 0: + warn = "\nSpatial diagnostics are not currently available for SLX models with kernel weights.\nHence, spatial diagnostics have been disabled for this model." + spat_diag = False + + return spat_diag, warn + def check_reg_list(regimes, name_regimes, n): """Check if the regimes parameter passed by the user is a valid list of regimes. Note: this does not check if the regimes are valid for the @@ -703,7 +734,7 @@ def check_reg_list(regimes, name_regimes, n): regimes : list or np.array or pd.Series Object passed by the user to a regression class name_regimes : string - Name of the regimes variable + Name of the regimes variable n : int number of observations @@ -731,6 +762,11 @@ def check_reg_list(regimes, name_regimes, n): return regimes, name_regimes + + + + + def check_regimes(reg_set, N=None, K=None): """Check if there are at least two regimes @@ -826,7 +862,7 @@ def check_constant(x, name_x=None, just_rem=False): return x_constant, keep_x, warn -def flex_wx(w, x, name_x, constant=True, slx_lags=1, slx_vars="All"): +def flex_wx(w,x,name_x,constant=True,slx_lags=1,slx_vars="All"): """ Adds spatially lagged variables to an existing x matrix with or without a constant term Adds variable names prefaced by W_ for the lagged variables @@ -851,25 +887,71 @@ def flex_wx(w, x, name_x, constant=True, slx_lags=1, slx_vars="All"): """ if constant == True: - xwork = x[:, 1:] - xnamework = name_x[1:] + xwork = x[:,1:] + xnamework = name_x[1:] else: xwork = x xnamework = name_x - - if isinstance(slx_vars, list): + + if isinstance(slx_vars,list): if len(slx_vars) == len(xnamework): - xwork = xwork[:, slx_vars] - xnamework = list(compress(xnamework, slx_vars)) + xwork = xwork[:,slx_vars] + xnamework = list(compress(xnamework,slx_vars)) else: raise Exception("Mismatch number of columns and length slx_vars") - - lagx = get_lags(w, xwork, slx_lags) - xlagname = set_name_spatial_lags(xnamework, slx_lags) - bigx = np.hstack((x, lagx)) + + lagx = get_lags(w,xwork,slx_lags) + xlagname = set_name_spatial_lags(xnamework,slx_lags) + bigx = np.hstack((x,lagx)) bignamex = name_x + xlagname - return (bigx, bignamex) + return(bigx,bignamex) + +def check_coords(data,name_coords): + ''' + Checks to make sure the object passed is turned into a numpy array of coordinates. + + Parameters + ---------- + data : an n by 2 array or a selection of two columns from a data frame + + Returns + ------- + xy : n by 2 numpy array with coordinates + name_coords : names for coordinate variables + ''' + if (not(isinstance(data,np.ndarray))): + if (isinstance(data,pd.core.frame.DataFrame)): + xy = np.array(data) + + if not name_coords: + try: + name_coords = data.name + except AttributeError: + name_coords = data.columns.to_list() + + elif (isinstance(data,gpd.geoseries.GeoSeries)): # geometry column + if data[0].geom_type == 'Point': + xy = data.get_coordinates() + xy = np.array(xy) + + if not name_coords: + try: + name_coords = data.name + except AttributeError: + name_coords = data.columns.to_list() + + else: + raise Exception("Data type not supported") + else: + raise Exception("Data type not supported") + else: + xy = data + if not name_coords: + name_coords = "unknown" + if xy.shape[1] != 2: + raise Exception("Incompatible dimensions") + return xy,name_coords def _test(): import doctest diff --git a/spreg/utils.py b/spreg/utils.py index 54bdb5f2..37665184 100755 --- a/spreg/utils.py +++ b/spreg/utils.py @@ -2,7 +2,7 @@ Tools for different procedure estimations """ -__author__ = "Luc Anselin luc.anselin@asu.edu, \ +__author__ = "Luc Anselin lanselin@gmail.com, \ Pedro V. Amaral pedro.amaral@asu.edu, \ David C. Folch david.folch@asu.edu, \ Daniel Arribas-Bel darribas@asu.edu,\ @@ -13,11 +13,15 @@ import scipy.optimize as op import numpy.linalg as la from libpysal.weights.spatial_lag import lag_spatial +from libpysal.cg import KDTree # new for make_wnslx +from scipy.sparse import coo_array,csr_array # new for make_wnslx from .sputils import * import copy + class RegressionPropsY(object): + """ Helper class that adds common regression properties to any regression class that inherits it. It takes no parameters. See BaseOLS for example @@ -79,6 +83,7 @@ def std_y(self, val): class RegressionPropsVM(object): + """ Helper class that adds common regression properties to any regression class that inherits it. It takes no parameters. See BaseOLS for example @@ -106,9 +111,9 @@ def utu(self): return self._cache["utu"] except AttributeError: self._cache = {} - self._cache["utu"] = np.sum(self.u**2) + self._cache["utu"] = np.sum(self.u ** 2) except KeyError: - self._cache["utu"] = np.sum(self.u**2) + self._cache["utu"] = np.sum(self.u ** 2) return self._cache["utu"] @utu.setter @@ -216,7 +221,7 @@ def get_A1_het(S): def get_A1_hom(s, scalarKP=False): - """ + r""" Builds A1 for the spatial error GM estimation with homoscedasticity as in Drukker et al. [Drukker2011]_ (p. 9). @@ -255,7 +260,7 @@ def get_A1_hom(s, scalarKP=False): def get_A2_hom(s): - """ + r""" Builds A2 for the spatial error GM estimation with homoscedasticity as in Anselin (2011) :cite:`Anselin2011` @@ -320,9 +325,7 @@ def _moments2eqs(A1, s, u): return [G, g] -def optim_moments( - moments_in, vcX=np.array([0]), all_par=False, start=None, hard_bound=False -): +def optim_moments(moments_in, vcX=np.array([0]), all_par=False, start=None, hard_bound=False): """ Optimization of moments ... @@ -344,7 +347,7 @@ def optim_moments( hard_bound : boolean If true, raises an exception if the estimated spatial autoregressive parameter is outside the maximum/minimum bounds. - + Returns ------- x, f, d : tuple @@ -391,9 +394,7 @@ def optim_moments( if hard_bound: if abs(lambdaX[0][0]) >= 0.99: - raise Exception( - "Spatial parameter was outside the bounds of -0.99 and 0.99" - ) + raise Exception("Spatial parameter was outside the bounds of -0.99 and 0.99") if all_par: return lambdaX[0] @@ -422,7 +423,7 @@ def foptim_par(par, moments): """ vv = np.dot(moments[0], par) vv2 = moments[1] - vv - return sum(vv2**2) + return sum(vv2 ** 2) def get_spFilter(w, lamb, sf): @@ -499,7 +500,6 @@ def get_lags(w, x, w_lags): spat_lags = sphstack(spat_lags, lag) return spat_lags - def get_lags_split(w, x, max_lags, split_at): """ Calculates a given order of spatial lags and all the smaller orders, @@ -526,7 +526,7 @@ def get_lags_split(w, x, max_lags, split_at): rs_l = lag = lag_spatial(w, x) rs_h = None if 0 < split_at < max_lags: - for _ in range(split_at - 1): + for _ in range(split_at-1): lag = lag_spatial(w, lag) rs_l = sphstack(rs_l, lag) @@ -534,13 +534,10 @@ def get_lags_split(w, x, max_lags, split_at): lag = lag_spatial(w, lag) rs_h = sphstack(rs_h, lag) if i > 0 else lag else: - raise ValueError( - "max_lags must be greater than split_at and split_at must be greater than 0" - ) + raise ValueError("max_lags must be greater than split_at and split_at must be greater than 0") return rs_l, rs_h - def inverse_prod( w, data, @@ -622,13 +619,11 @@ def inverse_prod( except: matrix = la.inv(np.eye(w.shape[0]) - (scalar * w)) if post_multiply: - # inv_prod = spdot(data.T, matrix) - inv_prod = np.matmul( - data.T, matrix - ) # inverse matrix is dense, wrong type in spdot +# inv_prod = spdot(data.T, matrix) + inv_prod = np.matmul(data.T,matrix) # inverse matrix is dense, wrong type in spdot else: - # inv_prod = spdot(matrix, data) - inv_prod = np.matmul(matrix, data) +# inv_prod = spdot(matrix, data) + inv_prod = np.matmul(matrix,data) else: raise Exception("Invalid method selected for inversion.") return inv_prod @@ -637,7 +632,7 @@ def inverse_prod( def power_expansion( w, data, scalar, post_multiply=False, threshold=0.0000000001, max_iterations=None ): - """ + r""" Compute the inverse of a matrix using the power expansion (Leontief expansion). General form is: @@ -678,13 +673,13 @@ def power_expansion( return running_total -def set_endog(y, x, w, yend, q, w_lags, lag_q, slx_lags=0, slx_vars="All"): +def set_endog(y, x, w, yend, q, w_lags, lag_q, slx_lags=0,slx_vars="All"): # Create spatial lag of y yl = lag_spatial(w, y) # spatial and non-spatial instruments if issubclass(type(yend), np.ndarray): if slx_lags > 0: - lag_x, lag_xq = get_lags_split(w, x, slx_lags + 1, slx_lags) + lag_x, lag_xq = get_lags_split(w, x, slx_lags+1, slx_lags) else: lag_xq = x if lag_q: @@ -696,7 +691,7 @@ def set_endog(y, x, w, yend, q, w_lags, lag_q, slx_lags=0, slx_vars="All"): yend = sphstack(yend, yl) elif yend == None: # spatial instruments only if slx_lags > 0: - lag_x, lag_xq = get_lags_split(w, x, slx_lags + w_lags, slx_lags) + lag_x, lag_xq = get_lags_split(w, x, slx_lags+w_lags, slx_lags) else: lag_xq = get_lags(w, x, w_lags) q = lag_xq @@ -706,17 +701,18 @@ def set_endog(y, x, w, yend, q, w_lags, lag_q, slx_lags=0, slx_vars="All"): if slx_lags == 0: return yend, q else: # ajdust returned lag_x here using slx_vars - if isinstance(slx_vars, list): # slx_vars has True,False - if len(slx_vars) != x.shape[1]: + if (isinstance(slx_vars,list)): # slx_vars has True,False + if len(slx_vars) != x.shape[1] : raise Exception("slx_vars incompatible with x column dimensions") else: # use slx_vars to extract proper columns vv = slx_vars * slx_lags - lag_x = lag_x[:, vv] + lag_x = lag_x[:,vv] return yend, q, lag_x else: # slx_vars is "All" return yend, q, lag_x + def set_endog_sparse(y, x, w, yend, q, w_lags, lag_q): """ Same as set_endog, but with a sparse object passed as weights instead of W object. @@ -805,13 +801,12 @@ def RegressionProps_basic( if sig2 is not None: reg.sig2 = sig2 elif sig2n_k: - reg.sig2 = np.sum(reg.u**2) / (reg.n - reg.k) + reg.sig2 = np.sum(reg.u ** 2) / (reg.n - reg.k) else: - reg.sig2 = np.sum(reg.u**2) / reg.n + reg.sig2 = np.sum(reg.u ** 2) / reg.n if vm is not None: reg.vm = vm - def optim_k(trace, window_size=None): """ Finds optimal number of regimes for the endogenous spatial regimes model @@ -841,7 +836,7 @@ def optim_k(trace, window_size=None): >>> w = ps.weights.Queen.from_shapefile(ps.examples.get_path("NAT.shp")) >>> x_std = (x - np.mean(x,axis=0)) / np.std(x,axis=0) >>> reg = spreg.Skater_reg().fit(20, w, x_std, {'reg':spreg.OLS,'y':y,'x':x}, quorum=100, trace=True) - >>> spreg.utils.optim_k([reg._trace[i][1][2] for i in range(1, len(reg._trace))]) + >>> spreg.optim_k([reg._trace[i][1][2] for i in range(1, len(reg._trace))]) 9 @@ -849,20 +844,104 @@ def optim_k(trace, window_size=None): N = len(trace) if not window_size: - window_size = N // 4 # Mojena suggests from 70% to 90% - std_dev = [np.std(trace[i : i + window_size]) for i in range(N - window_size + 1)] - ma = np.convolve(trace, np.ones(window_size) / window_size, mode="valid") + window_size = N//4 # Mojena suggests from 70% to 90% + if window_size < 2: + window_size = N + std_dev = [np.std(trace[i:i+window_size]) for i in range(N - window_size + 1)] + ma = np.convolve(trace, np.ones(window_size)/window_size, mode='valid') treshold = [True] i = 0 while treshold[-1] and i < (N - window_size): - b = (6 / (window_size * (window_size * window_size - 1))) * ( - (2 * np.sum(np.arange(1, i + 2) * trace[window_size - 1 : i + window_size])) - - ((window_size + 1) * np.sum(trace[window_size - 1 : i + window_size])) - ) - l = (window_size - 1) * b / 2 - treshold.append(trace[i + window_size] < ma[i] - b - l - 2.75 * std_dev[i]) + b = (6/(window_size*(window_size*window_size-1)) + )*((2*np.sum(np.arange(1, i+2)*trace[window_size-1:i+window_size]) + )-((window_size+1)*np.sum(trace[window_size-1:i+window_size]))) + l = (window_size-1)*b/2 + treshold.append(trace[i+window_size] < ma[i] - b - l - 2.75*std_dev[i]) i += 1 - return i + window_size + return i+window_size + + +def make_wnslx(coords,params,leafsize=30,distance_metric='Euclidean'): + ''' + + Computes transformed distances as triangular kernel weights for transform = 'power', or fraction of maximum distance + (bandwidth) for transform = 'exponential'. Uses libpysal.cg.KDTree. The three main characteristics of the kernel weights + are passed as a tuple with k (number of nearest neighbors), upper_distance_bound (variable or fixed bandwidth), + and transform (power or exponential). + + With distance_upper_bound=np.inf, the computation is for a variable bandwidth, where the bandwidth + is determined by the number of nearest neighbors, k. When a distance_upper_bound is set that is larger than the + largest k-nearest neighbor distance, there is no effect. When the distance_upper_bound is less than the max + k-nearest neighbor distance for a given point, then it has the effect of imposing a fixed bandwidth, and + truncating the number of nearest neighbors to those within the bandwidth. As a result, the number of neighbors + will be less than k. + + Note that k is a binding constraint, so if imposing the bandwidth is important, k should be taken large enough. + + Parameters + ---------- + coords : n by 2 numpy array of x,y coordinates + params : tuple with + k : number of nearest neighbors (the diagonal is not included in k, so to obtain + k real nearest neighbors KDTree query must be called with k+1 + distance_upper_bound : bandwidth (see above for interpretation), np.inf is for no bandwidth, used by + KDTree query + transform : determines type of transformation, triangular kernel weights for 'power', + fractional distance for 'exponential' (exponential) + leafsize : argument to construct KDTree, default is 30 (from sklearn) + distance_metric : type of distance, default is "Euclidean", other option is "Arc" for arc-distance, to be used with long,lat + (note: long should be x and lat is y), both are supported by libpysal.cg.KDTree, but not + by its scipy and sklearn counterparts + + + Returns + ------- + spdis : transformed distance matrix as CSR sparse array + + ''' + k = params[0] + distance_upper_bound = params[1] + transform = params[2] + kdt = KDTree(coords,leafsize=leafsize,distance_metric=distance_metric) + dis,nbrs = kdt.query(coords,k=k+1,distance_upper_bound=distance_upper_bound) + # get rid of diagonals + dis = dis[:,1:] + nbrs = nbrs[:,1:] + n = dis.shape[0] + + # maximum distance in each row + if (np.isinf(distance_upper_bound)): # no fixed bandwidth + mxrow = dis[:,-1].reshape(-1,1) + else: + dis = np.nan_to_num(dis,copy=True,posinf=0) # turn inf to zero + mxrow = np.amax(dis,axis=1).reshape(-1,1) + + # rescaled distance + fdis = dis / mxrow + + if transform.lower() == 'power': # triangular kernel weights + fdis = -fdis + 1.0 + elif transform.lower() == 'exponential': # distance fraction + fdis = fdis + else: + raise Exception("Method not supported") + + # turn into COO sparse format and then into CSR + kk = fdis.shape[1] + rowids = np.repeat(np.arange(n),kk) + if (np.isinf(distance_upper_bound)): + colids = nbrs.flatten() + fdis = fdis.flatten() + else: # neighbors outside bandwidth have ID n + pickgd = (nbrs != n) + rowids = rowids[pickgd.flatten()] + colids = nbrs[pickgd].flatten() + fdis = fdis[pickgd].flatten() + + spdis = coo_array((fdis,(rowids,colids))) + spdis = spdis.tocsr(copy=True) + + return spdis def _test():