GeoDaCenter · lixun910 · Jul 3, 2023 · Jan 2, 2023 · Jan 2, 2023 · Jan 2, 2023
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -17,3 +17,4 @@
 ^_pkgdown\.yml$
 ^docs$
 ^pkgdown$
+^Dockerfile$
diff --git a/.github/workflows/build-check.yml b/.github/workflows/build-check.yml
@@ -28,9 +28,10 @@ jobs:
         config:
           - {os: macOS-latest,   r: 'release'}
           - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest, r: 'release'}
           #- {os: windows-latest, r: '3.6', rspm: "https://packagemanager.rstudio.com/cran/latest"}
           #- {os: ubuntu-18.04,   r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest", http-user-agent: "R/4.0.0 (ubuntu-18.04) R (4.0.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" }
-          - {os: ubuntu-18.04,   r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
+          #- {os: ubuntu-18.04,   r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
           #- {os: ubuntu-18.04,   r: 'oldrel',  rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
           #- {os: ubuntu-18.04,   r: '3.6',     rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
           #- {os: ubuntu-18.04,   r: '3.5',     rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}
@@ -42,16 +43,18 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: recursive
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
         with:
+          rtools-version: '42'
           r-version: ${{ matrix.config.r }}
           http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
 
-      - uses: r-lib/actions/setup-pandoc@v1
+      - uses: r-lib/actions/setup-pandoc@v2
 
       - name: Query dependencies
         run: |

diff --git a/.github/workflows/lint-project.yaml b/.github/workflows/lint-project.yaml
@@ -16,11 +16,11 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: recursive
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
 
       - name: Install lintr
         run: install.packages("lintr")

diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
@@ -14,13 +14,13 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: recursive
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
 
-      - uses: r-lib/actions/setup-pandoc@v1
+      - uses: r-lib/actions/setup-pandoc@v2
 
       - name: Query dependencies
         run: |

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -102,6 +102,8 @@
         "geodaweight.h": "c",
         "filesystem": "cpp",
         "numbers": "cpp",
-        "semaphore": "cpp"
+        "semaphore": "cpp",
+        "__bits": "cpp",
+        "__verbose_abort": "cpp"
     }
 }
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: rgeoda
 Type: Package
 Title: R Library for Spatial Data Analysis
-Version: 0.0.9
-Date: 2022-04-09
+Version: 0.0.10-4
+Date: 2023-07-01
 Authors@R: 
     c(person(given = "Xun", family = "Li", email="lixun910@gmail.com", role=c("aut","cre")),
       person(given = "Luc", family = "Anselin", email="anselin@uchicago.edu", role="aut"))
@@ -39,4 +39,4 @@ Encoding: UTF-8
 Suggests: 
     wkb,
     sp
-SystemRequirements: C++14
+SystemRequirements: C++17
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,7 @@
+FROM rocker/r-base
+
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get update && apt-get install -y git libssl-dev libgeos-dev libgeos++-dev gdal-bin libproj-dev libgdal-dev libudunits2-dev
+
+RUN install2.r --error proxy Rcpp wk sp digest sf BH wkb TinyTex
diff --git a/R/clustering.R b/R/clustering.R
@@ -4,15 +4,26 @@
 #' similar values for features of interest.
 #' @param k The number of clusters
 #' @param w An instance of Weight class
-#' @param df A data frame with selected variables only. E.g. guerry[c("Crm_prs", "Crm_prp", "Litercy")]
+#' @param df A data frame with selected variables only.
+#' E.g. guerry[c("Crm_prs", "Crm_prp", "Litercy")]
 #' @param bound_variable (optional) A data frame with selected bound variable
-#' @param min_bound (optional) A minimum bound value that applies to all clusters
-#' @param scale_method One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization).
-#' @param distance_method (optional) The distance method used to compute the distance betwen observation i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan"
-#' @param random_seed (int,optional) The seed for random number generator. Defaults to 123456789.
-#' @param cpu_threads (optional) The number of cpu threads used for parallel computation
-#' @param rdist (optional) The distance matrix (lower triangular matrix, column wise storage)
-#' @return A names list with names "Clusters", "Total sum of squares", "Within-cluster sum of squares", "Total within-cluster sum of squares", and "The ratio of between to total sum of squares".
+#' @param min_bound (optional) A minimum bound value that applies to all
+#' clusters
+#' @param scale_method One of the scaling methods {'raw', 'standardize',
+#' 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data.
+#' Default is 'standardize' (Z-score normalization).
+#' @param distance_method (optional) The distance method used to compute the
+#' distance betwen observation i and j. Defaults to "euclidean". Options are
+#' "euclidean" and "manhattan"
+#' @param random_seed (int,optional) The seed for random number generator.
+#' Defaults to 123456789.
+#' @param cpu_threads (optional) The number of cpu threads used for parallel
+#' computation
+#' @param rdist (optional) The distance matrix (lower triangular matrix,
+#' column wise storage)
+#' @return A names list with names "Clusters", "Total sum of squares",
+#' "Within-cluster sum of squares", "Total within-cluster sum of squares",
+#' and "The ratio of between to total sum of squares".
 #' @examples
 #' library(sf)
 #' guerry_path <- system.file("extdata", "Guerry.shp", package = "rgeoda")
@@ -22,12 +33,15 @@
 #' guerry_clusters <- skater(4, queen_w, data)
 #' guerry_clusters
 #' @export
-skater <- function(k, w, df, bound_variable=data.frame(), min_bound=0, scale_method="standardize", distance_method="euclidean", random_seed=123456789, cpu_threads=6, rdist=numeric()) {
+skater <- function(k, w, df, bound_variable=data.frame(), min_bound=0,
+                   scale_method="standardize", distance_method="euclidean",
+                   random_seed=123456789, cpu_threads=6, rdist=numeric()) {
   if (w$num_obs < 1) {
     stop("The weights is not valid.")
   }
   if (k <1 && k > w$num_obs) {
-    stop("The number of clusters should be a positive integer number, which is less than the number of observations.")
+    stop("The number of clusters should be a positive integer number, which is
+         less than the number of observations.")
   }
   if (inherits(df, "data.frame") == FALSE) {
     stop("The input data needs to be a data.frame.")
@@ -43,9 +57,11 @@ skater <- function(k, w, df, bound_variable=data.frame(), min_bound=0, scale_met
     stop("The data.frame is empty.")
   }
 
-  scale_methods <- c('raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust')
+  scale_methods <- c('raw', 'standardize', 'demean', 'mad', 'range_standardize',
+                     'range_adjust')
   if (!(scale_method %in% scale_methods)) {
-    stop("The scale_method has to be one of {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'}")
+    stop("The scale_method has to be one of {'raw', 'standardize', 'demean',
+         'mad', 'range_standardize', 'range_adjust'}")
   }
 
   if (distance_method != "euclidean" && distance_method != "manhattan") {
@@ -56,7 +72,8 @@ skater <- function(k, w, df, bound_variable=data.frame(), min_bound=0, scale_met
   if (length(bound_variable) > 0)  {
     bound_values <- bound_variable[[1]]
   }
-  return(p_skater(k, w$GetPointer(), df, n_vars, scale_method, distance_method, bound_values, min_bound, random_seed, cpu_threads, rdist))
+  return(p_skater(k, w$GetPointer(), df, n_vars, scale_method, distance_method,
+                  bound_values, min_bound, random_seed, cpu_threads, rdist))
 }
 
 

diff --git a/R/sf_geoda.R b/R/sf_geoda.R
@@ -1,8 +1,8 @@
 # Create a random string (internally used)
 # The input is a positive number, indicating the number of items to choose from.
 random_string <- function(n = 5000) {
-  a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
-  paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
+  a <- do.call(paste0, replicate(10, sample(LETTERS, n, TRUE), FALSE))
+  return(a)
 }
 
 

diff --git a/man/skater.Rd b/man/skater.Rd
diff --git a/src/Makevars b/src/Makevars
@@ -8,7 +8,7 @@ PKG_CPPFLAGS=\
 PKG_LIBS=\
 	-pthread
 
-CXX_STD=CXX14
+CXX_STD=CXX17
 
 CPP_SRC_FILES = \
 	$(RGEODALIB)/libgeoda.cpp \

diff --git a/src/Makevars.win b/src/Makevars.win
@@ -12,7 +12,7 @@ PKG_LIBS=\
 	-L$(RWINLIB)/lib$(R_ARCH) \
 	-pthread
 
-CXX_STD = CXX14
+CXX_STD = CXX17
 
 CPP_SRC_FILES = \
 	$(RGEODALIB)/libgeoda.cpp \

diff --git a/src/libgeoda b/src/libgeoda
diff --git a/src/rcpp_clustering.cpp b/src/rcpp_clustering.cpp
@@ -81,10 +81,8 @@ Rcpp::List p_skater(int k, SEXP xp_w, Rcpp::List& data, int n_vars, std::string
   int num_obs = w->GetNumObs();
   double** dist_matrix = rdist_matrix(num_obs, rdist);
 
-  Rcout << "aaa" << dist_matrix;
   std::vector<std::vector<int> > cluster_ids = gda_skater(k, w, raw_data, scale_method, distance_method, raw_bound, min_bound, seed, cpu_threads, dist_matrix);
 
-  Rcout << "after gda_skater";
   if (dist_matrix) {
     for (int i = 1; i < num_obs; i++) {
       free(dist_matrix[i]);

diff --git a/tests/testthat/test-clustering.R b/tests/testthat/test-clustering.R
@@ -63,10 +63,14 @@ testthat::test_that("schc", {
     testthat::expect_equal(clusters[[5]], 0.2147711255)
 })
 
-# NOTE!!!!!!!!!
-# The results are computed using Boost library 1.58.0.
-# To pass the following test cases
-# , please install BH package version==1.58.0
+# NOTE
+# The previous results are computed using Boost library 1.58.0.
+# The new results are computed using Boost library 1.81.0.1
+# The differences are caused by the different implementation of
+# boost::unordered_map: he keys in boost::unordered_map are not ordered and
+# have different orders in the two Boost versions. This involves a different
+# mechanism of randomness in max-p algorithm when picking which area or region
+# to process.
 
 testthat::test_that("azp_greedy", {
     library(sf)
@@ -78,7 +82,7 @@ testthat::test_that("azp_greedy", {
 
     azp_clusters <- azp_greedy(5, queen_w, data)
 
-    testthat::expect_equal(azp_clusters[[5]], 0.3598541)
+    testthat::expect_equal(azp_clusters[[5]], 0.36, tolerance = 1e-3)
 
     bound_variable <- guerry["Pop1831"]
     min_bound <- 3236.67 # 10% of Pop1831
@@ -87,7 +91,7 @@ testthat::test_that("azp_greedy", {
                                bound_variable = bound_variable,
                                min_bound = min_bound)
 
-    testthat::expect_equal(azp_clusters[[5]], 0.3980921835)
+    testthat::expect_equal(azp_clusters[[5]], 0.417, tolerance = 1e-3)
 
 })
 
@@ -101,7 +105,7 @@ testthat::test_that("azp_sa", {
 
     azp_clusters <- azp_sa(5, queen_w, data, cooling_rate = 0.85, sa_maxit = 1)
 
-    testthat::expect_equal(azp_clusters[[5]], 0.4211363)
+    testthat::expect_equal(azp_clusters[[5]], 0.359, tolerance = 1e-3)
 })
 
 testthat::test_that("azp_tabu", {
@@ -129,9 +133,9 @@ testthat::test_that("maxp_greedy", {
     bound_vals <- guerry["Pop1831"]
     min_bound <- 3236.67 # 10% of Pop1831
 
-    #clusters <- maxp_greedy(queen_w, data, bound_vals, min_bound)
+    clusters <- maxp_greedy(queen_w, data, bound_vals, min_bound)
 
-    #testthat::expect_equal(clusters[[5]], 0.4499671068)
+    testthat::expect_equal(clusters[[5]], 0.484, tolerance = 1e-3)
 })
 
 testthat::test_that("maxp_sa", {
@@ -145,10 +149,10 @@ testthat::test_that("maxp_sa", {
     bound_vals <- guerry["Pop1831"]
     min_bound <- 3236.67 # 10% of Pop1831
 
-    #clusters <- maxp_sa(queen_w, data, bound_vals, min_bound,
-    #                    cooling_rate = 0.85, sa_maxit = 1)
+    clusters <- maxp_sa(queen_w, data, bound_vals, min_bound,
+                        cooling_rate = 0.85, sa_maxit = 1)
 
-    #testthat::expect_equal(clusters[[5]], 0.4585352223)
+    testthat::expect_equal(clusters[[5]], 0.496, tolerance = 1e-3)
 })
 
 testthat::test_that("maxp_tabu", {
@@ -163,9 +167,9 @@ testthat::test_that("maxp_tabu", {
     min_bound <- 3236.67 # 10% of Pop1831
 
 
-    #clusters <- maxp_tabu(queen_w, data, bound_vals, min_bound,
-    #                      tabu_length = 10, conv_tabu = 10)
+    clusters <- maxp_tabu(queen_w, data, bound_vals, min_bound,
+                          tabu_length = 10, conv_tabu = 10)
 
-    #testthat::expect_equal(clusters[[5]], 0.4893668149)
+    testthat::expect_equal(clusters[[5]], 0.478, tolerance = 1e-3)
 
 })