From 19107e43a3091bdd6d003124355052ab5659b76b Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Fri, 6 Sep 2024 14:39:38 -0700 Subject: [PATCH 001/107] Prevent modification of const ref varargs, too Modifies the same place as the PRs for const and const in, but additionally needed a fix for `moveSetConstFlagsAndCheck` because the local tuple variable was using a `PRIM_GET_MEMBER_VALUE` instead of a `PRIM_GET_MEMBER`. We'll see if that has broader consequences, though. ---- Signed-off-by: Lydia Duncan --- compiler/resolution/expandVarArgs.cpp | 2 +- compiler/resolution/functionResolution.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler/resolution/expandVarArgs.cpp b/compiler/resolution/expandVarArgs.cpp index 5c2a0ef40d16..21963ce4ba6d 100644 --- a/compiler/resolution/expandVarArgs.cpp +++ b/compiler/resolution/expandVarArgs.cpp @@ -474,7 +474,7 @@ static void expandVarArgsBody(FnSymbol* fn, } if (formal->intent == INTENT_CONST || formal->intent == INTENT_CONST_IN || - formal->intent == INTENT_BLANK) { + formal->intent == INTENT_CONST_REF || formal->intent == INTENT_BLANK) { // TODO: Note that this will be overly strict for arrays, syncs, and // atomics, since their default is "pass-by-ref". 
However, we think this is // okay for now, in part due to thinking it's unlikely to be relied upon and diff --git a/compiler/resolution/functionResolution.cpp b/compiler/resolution/functionResolution.cpp index a6ce2f914d5a..6f83cd99ac40 100644 --- a/compiler/resolution/functionResolution.cpp +++ b/compiler/resolution/functionResolution.cpp @@ -9599,6 +9599,7 @@ static void resolveMoveForRhsCallExpr(CallExpr* call, Type* rhsType) { static void moveSetConstFlagsAndCheck(CallExpr* call, CallExpr* rhs) { if (rhs->isPrimitive(PRIM_GET_MEMBER) || + rhs->isPrimitive(PRIM_GET_MEMBER_VALUE) || rhs->isPrimitive(PRIM_ADDR_OF)) { if (SymExpr* rhsBase = toSymExpr(rhs->get(1))) { From f1993067cff24d38e86dae4f338b2f18583f0e99 Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Fri, 6 Sep 2024 14:43:13 -0700 Subject: [PATCH 002/107] Add tests locking in the fix for const ref Covers both queried and unqueried numbers of arguments ---- Signed-off-by: Lydia Duncan --- test/functions/varargs/constRefVarargs.chpl | 12 ++++++++++++ test/functions/varargs/constRefVarargs.good | 3 +++ test/functions/varargs/constRefVarargsQuery.chpl | 12 ++++++++++++ test/functions/varargs/constRefVarargsQuery.good | 3 +++ 4 files changed, 30 insertions(+) create mode 100644 test/functions/varargs/constRefVarargs.chpl create mode 100644 test/functions/varargs/constRefVarargs.good create mode 100644 test/functions/varargs/constRefVarargsQuery.chpl create mode 100644 test/functions/varargs/constRefVarargsQuery.good diff --git a/test/functions/varargs/constRefVarargs.chpl b/test/functions/varargs/constRefVarargs.chpl new file mode 100644 index 000000000000..63fe9cbb572c --- /dev/null +++ b/test/functions/varargs/constRefVarargs.chpl @@ -0,0 +1,12 @@ +// Taken from https://github.com/chapel-lang/chapel/issues/25858 +proc myPrintln(const ref args...) 
+{ + writeln("args.type = ", args.type:string); + writeln("args (before) = ", args); + + args[0] *= 10; // should not be allowed + + writeln("args (after) = ", args); +} + +myPrintln(1, 2.3, "four"); diff --git a/test/functions/varargs/constRefVarargs.good b/test/functions/varargs/constRefVarargs.good new file mode 100644 index 000000000000..bf0e7ebb8dc0 --- /dev/null +++ b/test/functions/varargs/constRefVarargs.good @@ -0,0 +1,3 @@ +constRefVarargs.chpl:2: In function 'myPrintln': +constRefVarargs.chpl:7: error: cannot assign to const variable + constRefVarargs.chpl:12: called as myPrintln(args(0): int(64), args(1): real(64), args(2): string) diff --git a/test/functions/varargs/constRefVarargsQuery.chpl b/test/functions/varargs/constRefVarargsQuery.chpl new file mode 100644 index 000000000000..900601268ed4 --- /dev/null +++ b/test/functions/varargs/constRefVarargsQuery.chpl @@ -0,0 +1,12 @@ +// Taken from https://github.com/chapel-lang/chapel/issues/25858, modified +proc myPrintln(const ref args...?k) +{ + writeln("args.type = ", args.type:string); + writeln("args (before) = ", args); + + args[0] *= 10; // should not be allowed + + writeln("args (after) = ", args); +} + +myPrintln(1, 2.3, "four"); diff --git a/test/functions/varargs/constRefVarargsQuery.good b/test/functions/varargs/constRefVarargsQuery.good new file mode 100644 index 000000000000..8ea6b38b7ef3 --- /dev/null +++ b/test/functions/varargs/constRefVarargsQuery.good @@ -0,0 +1,3 @@ +constRefVarargsQuery.chpl:2: In function 'myPrintln': +constRefVarargsQuery.chpl:7: error: cannot assign to const variable + constRefVarargsQuery.chpl:12: called as myPrintln(args(0): int(64), args(1): real(64), args(2): string) From 79afb8dc2977c521b48bfa7d681b263e9fd07e35 Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Thu, 3 Oct 2024 11:49:46 -0700 Subject: [PATCH 003/107] Futurize the DataFrames test that failed and add an alternative that works This test failed due to us now (accurately) detecting that 
`chpl__buildAssociativeArrayExpr` violates the const-ness of the varargs for owned arguments. In talking with Michael, we decided it was okay for this to get futurized as a result, and added a version of the test that does not fail, showing that you can still create associative arrays that store owned classes (just not using associative array literals) ---- Signed-off-by: Lydia Duncan --- .../HelloDataFrame-createDomFirst.chpl | 22 +++++++++++++++++++ .../HelloDataFrame-createDomFirst.compopts | 1 + .../HelloDataFrame-createDomFirst.good | 20 +++++++++++++++++ .../psahabu/HelloDataFrame-createDomFirst.py | 21 ++++++++++++++++++ .../DataFrames/psahabu/HelloDataFrame.bad | 3 +++ .../DataFrames/psahabu/HelloDataFrame.future | 2 ++ 6 files changed, 69 insertions(+) create mode 100644 test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.chpl create mode 100644 test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.compopts create mode 100644 test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.good create mode 100644 test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.py create mode 100644 test/library/draft/DataFrames/psahabu/HelloDataFrame.bad create mode 100644 test/library/draft/DataFrames/psahabu/HelloDataFrame.future diff --git a/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.chpl b/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.chpl new file mode 100644 index 000000000000..765206323fb6 --- /dev/null +++ b/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.chpl @@ -0,0 +1,22 @@ +use DataFrames; + +var validBits = [true, false, true, false, true]; + +var columnOne: owned Series? = new owned TypedSeries(["a", "b", "c", "d", "e"], validBits); +var columnTwo: owned Series? = new owned TypedSeries([1, 2, 3, 4, 5], validBits); +var columnThree: owned Series? 
= new owned TypedSeries([10.0, 20.0, 30.0, 40.0, 50.0]); + +var dom = {"columnOne", "columnTwo", "columnThree"}; +var columns: [dom] owned Series?; +columns["columnOne"] = columnOne; +columns["columnTwo"] = columnTwo; +columns["columnThree"] = columnThree; +var idx = new shared TypedIndex(["rowOne", "rowTwo", "rowThree", "rowFour", "rowFive"]); + +var dataFrame = new owned DataFrame(columns, idx); +var noIndex = new owned DataFrame(columns); +writeln(dataFrame); +writeln(); +writeln(dataFrame["columnThree"]); +writeln(); +writeln(noIndex); diff --git a/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.compopts b/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.compopts new file mode 100644 index 000000000000..ab90c63f3ded --- /dev/null +++ b/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.compopts @@ -0,0 +1 @@ +-snoParSafeWarning diff --git a/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.good b/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.good new file mode 100644 index 000000000000..0cfb197d2979 --- /dev/null +++ b/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.good @@ -0,0 +1,20 @@ + columnOne columnThree columnTwo +rowOne a 10.0 1 +rowTwo None 20.0 None +rowThree c 30.0 3 +rowFour None 40.0 None +rowFive e 50.0 5 + +rowOne 10.0 +rowTwo 20.0 +rowThree 30.0 +rowFour 40.0 +rowFive 50.0 +dtype: real(64) + + columnOne columnThree columnTwo +0 a 10.0 1 +1 None 20.0 None +2 c 30.0 3 +3 None 40.0 None +4 e 50.0 5 diff --git a/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.py b/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.py new file mode 100644 index 000000000000..b13292b678a9 --- /dev/null +++ b/test/library/draft/DataFrames/psahabu/HelloDataFrame-createDomFirst.py @@ -0,0 +1,21 @@ +import pandas as pd + +columnOne = ["a", None, "c", None, "e"] +columnTwo = [1, None, 3, None, 5] +columnThree = [10.0, 
20.0, 30.0, 40.0, 50.0] + +idx = pd.Index(["rowOne", "rowTwo", "rowThree", "rowFour", "rowFive"]) +dataFrame = pd.DataFrame({ "columnOne": columnOne, + "columnTwo": columnTwo, + "columnThree": columnThree }, + idx) +noIndex = pd.DataFrame({ "columnOne": columnOne, + "columnTwo": columnTwo, + "columnThree": columnThree }) + + +print dataFrame +print +print dataFrame["columnThree"] +print +print noIndex diff --git a/test/library/draft/DataFrames/psahabu/HelloDataFrame.bad b/test/library/draft/DataFrames/psahabu/HelloDataFrame.bad new file mode 100644 index 000000000000..3ab33ac3fa28 --- /dev/null +++ b/test/library/draft/DataFrames/psahabu/HelloDataFrame.bad @@ -0,0 +1,3 @@ +HelloDataFrame.chpl:9: error: const actual is passed to a 'ref' formal of init=() +HelloDataFrame.chpl:9: error: const actual is passed to a 'ref' formal of init=() +HelloDataFrame.chpl:9: error: const actual is passed to a 'ref' formal of init=() diff --git a/test/library/draft/DataFrames/psahabu/HelloDataFrame.future b/test/library/draft/DataFrames/psahabu/HelloDataFrame.future new file mode 100644 index 000000000000..71e0e8881ec3 --- /dev/null +++ b/test/library/draft/DataFrames/psahabu/HelloDataFrame.future @@ -0,0 +1,2 @@ +bug: owned objects are supported with associative array literals +#26035 From acae53e6d4cbe333622ad067954166f3353d2acf Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Thu, 3 Oct 2024 13:17:59 -0700 Subject: [PATCH 004/107] Update behavior of test that was added by a separate PR that has since merged This is basically the same thing as removing the deprecated behavior (for the purpose of this particular test, we need to perform the behavior update more generally still), so I don't think the change is concerning ---- Signed-off-by: Lydia Duncan --- test/functions/varargs/varargsModArray.good | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/functions/varargs/varargsModArray.good b/test/functions/varargs/varargsModArray.good index 
034e4107bec4..7f694aa09881 100644 --- a/test/functions/varargs/varargsModArray.good +++ b/test/functions/varargs/varargsModArray.good @@ -1,4 +1,3 @@ -varargsModArray.chpl:2: warning: inferring a default intent to be 'ref' is deprecated - please use an explicit 'ref' intent for the argument 'args' -args.type = ([domain(1,int(64),one)] int(64),real(64),string) -args (before) = (1 2 3, 2.3, four) -args (after) = (10 2 3, 2.3, four) +varargsModArray.chpl:2: In function 'myPrintln': +varargsModArray.chpl:7: error: cannot assign to const variable + varargsModArray.chpl:13: called as myPrintln(args(0): [domain(1,int(64),one)] int(64), args(1): real(64), args(2): string) From d0918dce3567bcfe6f007085a2d5b1781dd24b10 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 26 Sep 2024 14:23:02 -0700 Subject: [PATCH 005/107] Fix logical accessible CPU set debug message Don't crash if the logical accessible CPU set for a locale isn't set. Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index 192ecd32428c..92925bcde09b 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -704,7 +704,11 @@ static void partitionResources(void) { if (debug) { for (int i = 0; i < numLocalesOnNode; i++) { char buf[1024]; - hwloc_bitmap_list_snprintf(buf, sizeof(buf), logAccSets[i]); + if (logAccSets[i] != NULL) { + hwloc_bitmap_list_snprintf(buf, sizeof(buf), logAccSets[i]); + } else { + strncpy(buf, "unknown", sizeof(buf)); + } _DBG_P("logAccSets[%d]: %s", i, buf); } } From ef3f0a1bc5bd55207d3bcf60deeaf684caa81ead Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 26 Sep 2024 14:40:07 -0700 Subject: [PATCH 006/107] Dump distance matrix to debugging output Signed-off-by: John H. 
Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index 92925bcde09b..e14a11681a9a 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -1324,6 +1324,15 @@ static void fillDistanceMatrix(int numObjs, hwloc_obj_t *objs, } } } +#ifdef DEBUG + printf("distances:\n"); + for (int i = 0; i < numLocales; i++) { + for (int j = 0; j < numObjs; j++) { + printf("%02d ", distances[i][j]); + } + printf("\n"); + } +#endif } From 840e56dbcf26cf739e1990bbd6aa142404358faa Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 26 Sep 2024 15:06:58 -0700 Subject: [PATCH 007/107] Minimum distance might be the maximum Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index e14a11681a9a..d0c777e098d2 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -1439,7 +1439,7 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, for (int j = 0; j < numNics; j++) { _DBG_P("used[%d] = %d, distances[%d][%d] = %d", j, used[j], i, j, distances[i][j]); - if ((!used[j]) && (distances[i][j] < minimum)) { + if ((!used[j]) && (distances[i][j] <= minimum)) { minimum = distances[i][j]; minLoc = i; minNic = j; @@ -1512,6 +1512,7 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, int owners[numDevs]; // locale that owns each device hwloc_obj_t objs[numDevs]; // the device objects int devsPerLocale = numDevs / numLocales; + _DBG_P("devsPerLocale = %d", devsPerLocale); int owned[numLocales]; // number of devices each locale owns for (int i = 0; i < numDevs; i++) { @@ -1565,7 +1566,7 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, for (int i = 0; i < numLocales; i++) { if 
(owned[i] < devsPerLocale) { for (int j = 0; j < numDevs; j++) { - if ((owners[j] == -1) && (distances[i][j] < minimum)) { + if ((owners[j] == -1) && (distances[i][j] <= minimum)) { minimum = distances[i][j]; minLoc = i; minDev = j; From b49c7457aea49c583edb75f270a0aae04b193f50 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 26 Sep 2024 15:12:41 -0700 Subject: [PATCH 008/107] Partition devices based on number of co-locales, not locales If we are oversubscribed then the locales should share the devices, instead of treating them as co-locales and partitioning the devices Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index d0c777e098d2..4a8c6a772f43 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -1507,20 +1507,21 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, int numLocales = chpl_get_num_locales_on_node(); _DBG_P("count = %d", *count); _DBG_P("numLocales = %d", numLocales); - if (numLocales > 1) { + int numColocales = chpl_env_rt_get_int("LOCALES_PER_NODE", 0); + if (numColocales > 1) { int numDevs = *count; int owners[numDevs]; // locale that owns each device hwloc_obj_t objs[numDevs]; // the device objects - int devsPerLocale = numDevs / numLocales; + int devsPerLocale = numDevs / numColocales; _DBG_P("devsPerLocale = %d", devsPerLocale); - int owned[numLocales]; // number of devices each locale owns + int owned[numColocales]; // number of devices each co-locale owns for (int i = 0; i < numDevs; i++) { owners[i] = -1; objs[i] = NULL; } - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numColocales; i++) { owned[i] = 0; } @@ -1589,6 +1590,7 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, assert(j == devsPerLocale); *count = devsPerLocale; } else { + // No co-locales, use all the devices. 
for (int i = 0; i < *count; i++) { outAddrs[i] = inAddrs[i]; } From 7a25877568a33a182228e41698a36bdfbfaf0da3 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Tue, 8 Oct 2024 12:35:30 -0700 Subject: [PATCH 009/107] Allocate resources based on partitions instead of co-locales If the number of nodes does not evenly divide the number of locales there will be a "remainder node" that has fewer co-locales than the other nodes. Previously, there was some special-casing to deal with the remainder node which was clunky and error-prone. This commit introduces the "partition" abstraction in which the number of partitions on each node is the expected number of co-locales on the node. All nodes, including the remainder node, allocate resources based on partitions, then assign co-locales to partitions. On the remainder node this means that some partitions (and therefore resources) go unused, but this is what we want because all locales should have the same amount of resources. This greatly cleans up the code. In addition, oversubscription handling is cleaner. If there are locales on the node, but the expected number of co-locales is zero, the node is oversubscribed and all locales share all resources. Also added some remainder node and oversubsciption tests. Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 239 +++++++++++++----------- test/runtime/jhh/colocales/colocales.c | 38 +++- test/runtime/jhh/colocales/colocales.py | 49 ++++- 3 files changed, 207 insertions(+), 119 deletions(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index 4a8c6a772f43..cbce3cfa6bfb 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -101,6 +101,11 @@ static hwloc_nodeset_t numaSet = NULL; static hwloc_obj_t myRoot = NULL; +// This is the number of partitions that the resources should be divided into. 
+// Typically this is the number of co-locales per node, except for the last +// node which might have fewer than that. +static int numPartitions = 1; + // Logical CPU sets for all locales on this node. Entries are NULL if // we don't have that info. static hwloc_cpuset_t *logAccSets = NULL; @@ -272,13 +277,14 @@ void chpl_topo_exit(void) { } if (logAccSets != NULL) { - for (int i = 0; i < chpl_get_num_locales_on_node(); i++) { + for (int i = 0; i < numPartitions; i++) { if (logAccSets[i] != NULL) { hwloc_bitmap_free(logAccSets[i]); } } sys_free(logAccSets); logAccSets = NULL; + numPartitions = 1; } hwloc_topology_destroy(topology); } @@ -531,7 +537,20 @@ static const char *objTypeString(hwloc_obj_type_t t) { } // -// Partitions resources when running with co-locales. +// Partition resources when running with co-locales. This is complicated a bit +// by oversubscription and that the number of locales might not be evenly +// divisable by the number of nodes. If the number of colocales is zero, then +// we are oversubscribed and each locale uses all of the resources available +// to it. Otherwise, the number of locales on the node might be less than the +// expected number of co-locales because the "remainder" node might not have +// its full complement of co-locales. To deal with this, the resources are +// partitioned based on the expected number of co-locales, but then assigned +// to locales based on the number of co-locales that actually exist. This +// ensures that all co-locales on all nodes have the same amount of +// resources. If there are more locales than expected co-locales then the +// user has launched the program manually; just treat the system as +// oversubscribed as it isn't clear how to partition resources in this +// situation. 
// static void partitionResources(void) { @@ -576,8 +595,18 @@ static void partitionResources(void) { if (numLocalesOnNode > 1) { oversubscribed = true; } - logAccSets = sys_calloc(numLocalesOnNode, sizeof(hwloc_cpuset_t)); - if (numColocales > 0) { + if ((numColocales > 0) && (numLocalesOnNode <= numColocales)){ + numPartitions = numColocales; + } + logAccSets = sys_calloc(numPartitions, sizeof(hwloc_cpuset_t)); + if ((numColocales > 0) && (numLocalesOnNode > numColocales)) { + char msg[200]; + snprintf(msg, sizeof(msg), + "The node has more locales (%d) than co-locales (%d).\n" + "Considering the node oversubscribed.", + numLocalesOnNode, numColocales); + chpl_warning(msg, 0, 0); + } else if (numColocales > 0) { // We get our own socket/NUMA/cache/core object if we have exclusive // access to the node, we know our local rank, and the number of locales // on the node is less than or equal to the number of objects. It is an @@ -601,7 +630,7 @@ static void partitionResources(void) { HWLOC_OBJ_TYPE_MAX}; for (int i = 0; rootTypes[i] != HWLOC_OBJ_TYPE_MAX; i++) { int numObjs = hwloc_get_nbobjs_by_type(topology, rootTypes[i]); - if (numObjs == numColocales) { + if (numObjs == numPartitions) { myRootType = rootTypes[i]; break; } @@ -611,15 +640,15 @@ static void partitionResources(void) { _DBG_P("myRootType: %s", objTypeString(myRootType)); int numCores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); int numObjs = hwloc_get_nbobjs_by_type(topology, myRootType); - if (numObjs < numLocalesOnNode) { + if (numObjs < numPartitions) { char msg[200]; snprintf(msg, sizeof(msg), "Node only has %d %s(s)", numObjs, objTypeString(myRootType)); chpl_error(msg, 0, 0); } - if (numObjs > numLocalesOnNode) { - int coresPerLocale = numCores / numObjs; - unusedCores = (numObjs - numLocalesOnNode) * coresPerLocale; + if (numObjs > numPartitions) { + int coresPerPartition = numCores / numObjs; + unusedCores = (numObjs - numPartitions) * coresPerPartition; } // Use the object whose 
logical index corresponds to our local rank. @@ -628,11 +657,11 @@ static void partitionResources(void) { _DBG_P("confining ourself to %s %d", objTypeString(myRootType), rank); - // Compute the accessible PUs for all locales on this node based on - // the object each occupies. This is used to determine which NIC each - // locale should use. + // Compute the accessible PUs for all partitions based on the object + // each occupies. This is used to determine which NIC each locale + // should use. - for (int i = 0; i < numLocalesOnNode; i++) { + for (int i = 0; i < numPartitions; i++) { hwloc_obj_t obj; CHK_ERR(obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, myRootType, i)); @@ -642,21 +671,21 @@ static void partitionResources(void) { } } else { // Cores not tied to a root object - int coresPerLocale = numCPUsPhysAcc / numLocalesOnNode; - if (coresPerLocale < 1) { + int coresPerPartition = numCPUsPhysAcc / numPartitions; + if (coresPerPartition < 1) { char msg[200]; snprintf(msg, sizeof(msg), "Cannot run %d co-locales on %d cores.", - numLocalesOnNode, numCPUsPhysAcc); + numPartitions, numCPUsPhysAcc); chpl_error(msg, 0, 0); } - unusedCores = numCPUsPhysAcc % numLocalesOnNode; + unusedCores = numCPUsPhysAcc % numPartitions; int count = 0; int locale = -1; int id; hwloc_bitmap_foreach_begin(id, physAccSet) { if (count == 0) { locale++; - if (locale == numLocalesOnNode) { + if (locale == numPartitions) { break; } CHK_ERR_ERRNO(logAccSets[locale] = hwloc_bitmap_alloc()); @@ -668,7 +697,7 @@ static void partitionResources(void) { HWLOC_OBJ_CORE, pu)); hwloc_bitmap_or(logAccSets[locale], logAccSets[locale], core->cpuset); - count = (count + 1) % coresPerLocale; + count = (count + 1) % coresPerPartition; } hwloc_bitmap_foreach_end(); } if (unusedCores != 0) { @@ -697,12 +726,13 @@ static void partitionResources(void) { oversubscribed = false; } } else { - // We don't know which PUs other locales on the same node are using, - // so just set our own. 
+ // The node is oversubscribed. We will use all accessible PUs, and we + // don't know which PUs other locales on the same node are using, so just + // set our own. logAccSets[0] = hwloc_bitmap_dup(logAccSet); } if (debug) { - for (int i = 0; i < numLocalesOnNode; i++) { + for (int i = 0; i < numPartitions; i++) { char buf[1024]; if (logAccSets[i] != NULL) { hwloc_bitmap_list_snprintf(buf, sizeof(buf), logAccSets[i]); @@ -1288,12 +1318,11 @@ static void fillDistanceMatrix(int numObjs, hwloc_obj_t *objs, // Build a distance matrix between locales and objects. - int numLocales = chpl_get_num_locales_on_node(); - _DBG_P("numLocales = %d numObjs = %d", numLocales, numObjs); + _DBG_P("numPartitions = %d numObjs = %d", numPartitions, numObjs); - hwloc_obj_t locales[numLocales]; + hwloc_obj_t locales[numPartitions]; - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { if (logAccSets[i] != NULL) { CHK_ERR(locales[i] = hwloc_get_obj_covering_cpuset(topology, logAccSets[i])); @@ -1315,7 +1344,7 @@ static void fillDistanceMatrix(int numObjs, hwloc_obj_t *objs, // is NULL then we don't know which PUs that locale is using, so // we ignore it by setting its distances to infinite. 
- for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { for (int j = 0; j < numObjs; j++) { if (locales[i] != NULL) { distances[i][j] = distance(topology, objs[j], locales[i]); @@ -1326,7 +1355,7 @@ static void fillDistanceMatrix(int numObjs, hwloc_obj_t *objs, } #ifdef DEBUG printf("distances:\n"); - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { for (int j = 0; j < numObjs; j++) { printf("%02d ", distances[i][j]); } @@ -1355,13 +1384,12 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, hwloc_obj_t nic = NULL; chpl_topo_pci_addr_t *result = NULL; - int numLocales = chpl_get_num_locales_on_node(); struct hwloc_pcidev_attr_s *nicAttr; int localRank = chpl_get_local_rank(); _DBG_P("chpl_topo_selectNicByType: %04x:%02x:%02x.%x", inAddr->domain, inAddr->bus, inAddr->device, inAddr->function); - _DBG_P("numLocales %d rank %d", numLocales, localRank); + _DBG_P("numPartitions %d rank %d", numPartitions, localRank); // find the PCI object corresponding to the specified NIC nic = hwloc_get_pcidev_by_busid(topology, (unsigned) inAddr->domain, @@ -1369,7 +1397,7 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, (unsigned) inAddr->device, (unsigned) inAddr->function); if (nic != NULL) { - if ((numLocales > 1) && (localRank >= 0)) { + if ((numPartitions > 1) && (localRank >= 0)) { // Find all the NICS with the same vendor and device as the specified NIC. 
nicAttr = &(nic->attr->pcidev); int maxNics = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PCI_DEVICE); @@ -1395,7 +1423,7 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, // qsort(nics, numNics, sizeof(*nics), comparePCIObjs); - int distances[numLocales][numNics]; + int distances[numPartitions][numNics]; fillDistanceMatrix(numNics, nics, distances); @@ -1406,15 +1434,15 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, // share NICs, so mark all NICs as unassigned and repeat the // process. - hwloc_obj_t assigned[numLocales]; // NIC assigned to the locale + hwloc_obj_t assigned[numPartitions]; // NIC assigned to the locale int numAssigned = 0; - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { assigned[i] = NULL; } chpl_bool finished = false; - while (!finished && (numAssigned < numLocales)) { + while (!finished && (numAssigned < numPartitions)) { _DBG_P("outer loop: numAssigned %d", numAssigned); // The used array keeps track of NICs that have been assigned in @@ -1427,12 +1455,12 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, // assigned a NIC and a NIC that hasn't been assigned in this // iteration ("used") and assign that NIC to that locale. int numAvail = numNics; - while((numAvail > 0) && (numAssigned < numLocales)) { + while((numAvail > 0) && (numAssigned < numPartitions)) { _DBG_P("inner loop: numAssigned %d numAvail %d", numAssigned, numAvail); int minimum = INT32_MAX; int minNic = -1; int minLoc = -1; - for (int i = 0; i < numLocales; i++) { + for (int i = 0; i < numPartitions; i++) { _DBG_P("assigned[%d] = %p", i, assigned[i]); _DBG_P("minimum = %d", minimum); if (!assigned[i]) { @@ -1481,18 +1509,21 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, } // Given the PCI bus addresses of a set of devices, determine which of those -// devices the calling locale should use. 
Each co-locale is assigned the same -// number of devices and each device is assigned to at most one locale. This -// function uses a greedy algorithm to assign devices to locales. The -// distance matrix records the distance between each device and the locale's -// CPU set. The device/locale pair with the minimum distance are assigned to -// each other and the device is removed from consideration. The process then -// repeats until all co-locales have been assigned the proper number of -// devices. +// devices the calling locale should use. Devices are assigned to partitions, +// where the number of partitions is equal to the expected number of +// co-locales on the device (there may be fewer if the number of nodes +// doesn't evenly divide the number of locales). Each partition is assigned +// the same number of devices and each device is assigned to at most one +// partition. This function uses a greedy algorithm to assign devices to +// partition. The distance matrix records the distance between each device +// and the partition's CPU set. The device/partition pair with the minimum +// distance are assigned to each other and the device is removed from +// consideration. The process then repeats until all partitions have been +// assigned the proper number of devices. // -// Note that cores are assigned to co-locales during initialization of the +// Note that cores are assigned to partitions during initialization of the // topology layer before this function is called. As a result, the assignment -// of cores and devices to co-locales may not be optimal, especially if the +// of cores and devices to paratitions may not be optimal, especially if the // machine topology is asymmetric. For example, if there are two co-locales // on a machine with four NUMA domains, one co-locale will be assigned cores // in the first two NUMA domains and the other the second two domains. 
If @@ -1504,81 +1535,77 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, int *count) { int result = 0; - int numLocales = chpl_get_num_locales_on_node(); _DBG_P("count = %d", *count); - _DBG_P("numLocales = %d", numLocales); - int numColocales = chpl_env_rt_get_int("LOCALES_PER_NODE", 0); - if (numColocales > 1) { + _DBG_P("numPartitions = %d", numPartitions); + int rank = chpl_get_local_rank(); + if ((numPartitions > 1) && (rank >= 0)) { int numDevs = *count; - int owners[numDevs]; // locale that owns each device + int owners[numDevs]; // partition to which the device belongs hwloc_obj_t objs[numDevs]; // the device objects - int devsPerLocale = numDevs / numColocales; - _DBG_P("devsPerLocale = %d", devsPerLocale); - int owned[numColocales]; // number of devices each co-locale owns + int devsPerPartition = numDevs / numPartitions; + _DBG_P("devsPerPartition = %d", devsPerPartition); + int owned[numPartitions]; // number of devices in each partition for (int i = 0; i < numDevs; i++) { owners[i] = -1; objs[i] = NULL; } - for (int i = 0; i < numColocales; i++) { + for (int i = 0; i < numPartitions; i++) { owned[i] = 0; } - int rank = chpl_get_local_rank(); - if (rank >= 0) { - for (int i = 0; i < numDevs; i++) { - hwloc_obj_t obj; - // find the PCI object corresponding to the specified bus address - obj = hwloc_get_pcidev_by_busid(topology, - (unsigned) inAddrs[i].domain, - (unsigned) inAddrs[i].bus, - (unsigned) inAddrs[i].device, - (unsigned) inAddrs[i].function); - if (obj == NULL) { - _DBG_P("Could not find PCI %04x:%02x:%02x.%x", inAddrs[i].domain, - inAddrs[i].bus, inAddrs[i].device, inAddrs[i].function); - if (debug) { - _DBG_P("PCI devices:"); - for (hwloc_obj_t obj = hwloc_get_next_pcidev(topology, NULL); - obj != NULL; - obj = hwloc_get_next_pcidev(topology, obj)) { - _DBG_P("%04x:%02x:%02x.%x", obj->attr->pcidev.domain, - obj->attr->pcidev.bus, obj->attr->pcidev.dev, - obj->attr->pcidev.func); - } + for (int i = 0; i < numDevs; i++) { + 
hwloc_obj_t obj; + // find the PCI object corresponding to the specified bus address + obj = hwloc_get_pcidev_by_busid(topology, + (unsigned) inAddrs[i].domain, + (unsigned) inAddrs[i].bus, + (unsigned) inAddrs[i].device, + (unsigned) inAddrs[i].function); + if (obj == NULL) { + _DBG_P("Could not find PCI %04x:%02x:%02x.%x", inAddrs[i].domain, + inAddrs[i].bus, inAddrs[i].device, inAddrs[i].function); + if (debug) { + _DBG_P("PCI devices:"); + for (hwloc_obj_t obj = hwloc_get_next_pcidev(topology, NULL); + obj != NULL; + obj = hwloc_get_next_pcidev(topology, obj)) { + _DBG_P("%04x:%02x:%02x.%x", obj->attr->pcidev.domain, + obj->attr->pcidev.bus, obj->attr->pcidev.dev, + obj->attr->pcidev.func); } - result = 1; - goto done; } - objs[i] = obj; + result = 1; + goto done; } - int distances[numLocales][numDevs]; - fillDistanceMatrix(numDevs, objs, distances); - while (owned[rank] < devsPerLocale) { - - // Find the minimum distance between a locale that needs more devices - // and a device that doesn't have an owner and assign that device to - // that locale. - - int minimum = INT32_MAX; - int minDev = -1; - int minLoc = -1; - for (int i = 0; i < numLocales; i++) { - if (owned[i] < devsPerLocale) { - for (int j = 0; j < numDevs; j++) { - if ((owners[j] == -1) && (distances[i][j] <= minimum)) { - minimum = distances[i][j]; - minLoc = i; - minDev = j; - } + objs[i] = obj; + } + int distances[numPartitions][numDevs]; + fillDistanceMatrix(numDevs, objs, distances); + while (owned[rank] < devsPerPartition) { + + // Find the minimum distance between a partition that needs more devices + // and a device that doesn't have a partition and assign that device to + // that partition. 
+ + int minimum = INT32_MAX; + int minDev = -1; + int minPart = -1; + for (int i = 0; i < numPartitions; i++) { + if (owned[i] < devsPerPartition) { + for (int j = 0; j < numDevs; j++) { + if ((owners[j] == -1) && (distances[i][j] <= minimum)) { + minimum = distances[i][j]; + minPart = i; + minDev = j; } } } - assert((minDev >= 0) && (minLoc >= 0)); - owners[minDev] = minLoc; - owned[minLoc]++; } + assert((minDev >= 0) && (minPart >= 0)); + owners[minDev] = minPart; + owned[minPart]++; } // Return the addresses of our devices int j = 0; @@ -1587,10 +1614,10 @@ int chpl_topo_selectMyDevices(chpl_topo_pci_addr_t *inAddrs, outAddrs[j++] = inAddrs[i]; } } - assert(j == devsPerLocale); - *count = devsPerLocale; + assert(j == devsPerPartition); + *count = devsPerPartition; } else { - // No co-locales, use all the devices. + // Use all the devices. for (int i = 0; i < *count; i++) { outAddrs[i] = inAddrs[i]; } diff --git a/test/runtime/jhh/colocales/colocales.c b/test/runtime/jhh/colocales/colocales.c index 5e32e210d159..68a3fbf50978 100644 --- a/test/runtime/jhh/colocales/colocales.c +++ b/test/runtime/jhh/colocales/colocales.c @@ -19,7 +19,8 @@ void Usage(char *name) { fprintf(stderr, "Usage: %s [-m mask] [-N nic] [-n numColocales] [-r rank]\n", name); fprintf(stderr, "\t-m \tMask off accessible PUs\n"); fprintf(stderr, "\t-N \tNIC bus address\n"); - fprintf(stderr, "\t-n \tNumber of locales on node\n"); + fprintf(stderr, "\t-n \tExpected number of co-locales on node\n"); + fprintf(stderr, "\t-a \tActual number of locales on node\n"); fprintf(stderr, "\t-r \tLocal rank\n"); fprintf(stderr, "\t-h\t\tPrint this message\n"); } @@ -31,18 +32,22 @@ int main(int argc, char* argv[]) { int *cpus = NULL; char *mask = NULL; int rank = -1; - int numLocales = 1; - char *numLocalesStr = "1"; + int numCoLocales = -1; + char *numCoLocalesStr = NULL; + int numLocales = -1; char *nic = NULL; - while ((opt = getopt(argc, argv, "m:N:n:r:v")) != -1) { + while ((opt = getopt(argc, argv, 
"a:m:N:n:r:v")) != -1) { switch(opt) { + case 'a': + numLocales = atoi(optarg); + break; case 'm': mask = optarg; break; case 'n': - numLocales = atoi(optarg); - numLocalesStr = optarg; + numCoLocales = atoi(optarg); + numCoLocalesStr = optarg; break; case 'r': rank = atoi(optarg); @@ -68,11 +73,28 @@ int main(int argc, char* argv[]) { exit(1); } + if (numCoLocales == -1) { + numCoLocales = 1; + numCoLocalesStr = "1"; + } + + if (numLocales == -1) { + numLocales = numCoLocales; + } + + if (numLocales < 1) { + fprintf(stderr, "There must be > 0 locales on the node\n"); + exit(1); + } if (rank >= numLocales) { - fprintf(stderr, "Rank must be less than number of locales on node.\n"); + fprintf(stderr, + "Rank (%d) must be less than number of locales on node (%d).\n", + rank, numLocales); exit(1); } - setenv("CHPL_RT_LOCALES_PER_NODE", numLocalesStr, 1); + if (numCoLocales > 0) { + setenv("CHPL_RT_LOCALES_PER_NODE", numCoLocalesStr, 1); + } chpl__init_colocales(0, 0); // unsure why this is needed chpl_topo_pre_comm_init(mask); chpl_comm_init(NULL, NULL); diff --git a/test/runtime/jhh/colocales/colocales.py b/test/runtime/jhh/colocales/colocales.py index cd93c4855890..884e7afbf292 100755 --- a/test/runtime/jhh/colocales/colocales.py +++ b/test/runtime/jhh/colocales/colocales.py @@ -326,6 +326,32 @@ def test_12_leftover_cores(self): output = self.runCmd("./colocales -r 0 -n 17") self.assertIn("warning: 9 cores are unused\n", output); + def test_13_remainder_node(self): + """ + Remainder node has fewer co-locales. + However, they should not get more than their allotment of cores. + """ + for i in range(0, 3): + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -n %d -a 3 -N %s" % + (i, 16, self.nics[0])) + cpus = stringify([i*8+j for j in range(0, 8)]) + self.assertIn("Physical CPUs: %s\n" % cpus, output) + + def test_14_oversubscribed(self): + """ + All locales get all cores when oversubscribed. 
+ """ + for i in range(0, 4): + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -a 4 -N %s" % + (i, self.nics[0])) + cpus = stringify(self.getSocketCores("all")) + self.assertIn("Physical CPUs: %s\n" % cpus, output) + cpus = stringify(self.getSocketThreads("all")) + self.assertIn("Logical CPUs: %s\n" % cpus, output) + + class Ex3Tests(TestCases.TestCase): """ HPE Cray EX. One sockets, four NUMA domains per socket, 64 cores per @@ -366,17 +392,30 @@ def test_11_two_colocales(self): Each co-locale gets the two GPUs closest to it """ for i in range(0, 2): - output = self.runCmd("./colocales -r %d -n 2" % i); - x = i * 2 - self.assertIn("GPUS: " + ' '.join(self.gpus[x:x+1]), output) + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -n 2" % i); + x = i * 2 + self.assertIn("GPUS: " + ' '.join(self.gpus[x:x+1]), output) def test_12_four_colocales(self): """ Each co-locale gets the GPU closest to it """ for i in range(0, 4): - output = self.runCmd("./colocales -r %d -n 4" % i); - self.assertIn("GPUS: " + self.gpus[i], output) + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -n 4" % i); + self.assertIn("GPUS: " + self.gpus[i], output) + + def test_13_oversubscribed(self): + """ + All locales get all GPUs when oversubscribed. 
+ """ + for i in range(0, 4): + with self.subTest(i=i): + output = self.runCmd("./colocales -r %d -a 4" % i); + self.assertIn("GPUS: " + ' '.join(self.gpus), output) + + class Hpc6aTests(TestCases.TestCase): """ From 7fbac42b93f4f3deb49da1612908dbf0605a420e Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 09:48:30 -0700 Subject: [PATCH 010/107] Remove previous publish-docker-images.bash script Signed-off-by: Anna Rift --- util/cron/publish-docker-images.bash | 79 ---------------------------- 1 file changed, 79 deletions(-) delete mode 100755 util/cron/publish-docker-images.bash diff --git a/util/cron/publish-docker-images.bash b/util/cron/publish-docker-images.bash deleted file mode 100755 index e532c8d1217b..000000000000 --- a/util/cron/publish-docker-images.bash +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env bash - -# This script will build the docker images for 'amd' and 'arm' platforms using buildx and publish the images to the docker registry - -# Check if the script is run with correct arguement if not fail -required_vars=(docker_repository image_version username password) -for var in ${required_vars[@]}; do - if [ -z "${!var}" ] ; then - echo "${var} must be set." - exit 1 - fi -done - -echo "RepositoryName: $docker_repository" -echo "imageVersion: $image_version" - -CWD=$(cd $(dirname $0) ; pwd) -source $CWD/common.bash -export CHPL_HOME=$(cd $CWD/../.. ; pwd) -log_info "Setting CHPL_HOME to: ${CHPL_HOME}" - -# build_publish will build multi platform chapel docker images, tags them, and pushes the images to the docker repository . - -build_publish(){ - -local registry="$1" -local imageName="$2" -local version="$3" - -# the below buildx command will build images for amd and arm, tags with the tags specified, and pushes it to the docker repository($registry) -docker buildx build --platform=linux/amd64,linux/arm64 . --push -t $registry/$imageName:$version -t $registry/$imageName:latest - -if [ $? 
-ne 0 ] -then - echo "docker publish using buildx failed " - exit 1 -else - echo "docker publish using buildx succeeded " -fi - -} - -# Get the repository name and chapel version, Build chapel docker images and push to docker hub repository . -#build and publish chapel docker image -docker login -u $username -p $password -if [ $? -ne 0 ] -then - echo " Docker login failed " - exit 1 -else - echo "docker login succeeded " -fi - -cd $CHPL_HOME -build_publish $docker_repository chapel $image_version - -docker login -u $username -p $password -if [ $? -ne 0 ] -then - echo " Docker login failed " - exit 1 -else - echo "docker login succeeded " -fi -#build and publish chapel-gasnet docker image -cd $CHPL_HOME/util/packaging/docker/gasnet -build_publish $docker_repository chapel-gasnet $image_version - -docker login -u $username -p $password -if [ $? -ne 0 ] -then - echo " Docker login failed " - exit 1 -else - echo "docker login succeeded " -fi -#build and publish chapel-gasnet-smp docker image -cd $CHPL_HOME/util/packaging/docker/gasnet-smp -build_publish $docker_repository chapel-gasnet-smp $image_version From dd7a91d8259b80f82276612df6612382578f6a77 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 09:59:14 -0700 Subject: [PATCH 011/107] Update comments in test-docker.bash Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index d45f267f196f..de234bd1af2d 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -1,15 +1,18 @@ #!/usr/bin/env bash # -# This script will build, run, and verify chapel, gasnet, and gasnet-smp docker images. +# This script will build, (sanity) test, and push the chapel, chapel-gasnet, and +# chapel-gasnet-smp Docker images, using the `nightly` tag. 
+# +# Assumes Docker is already running on the system, logged into an account with +# appropriate permissions to push the images. CWD=$(cd $(dirname $0) ; pwd) source $CWD/common.bash export CHPL_HOME=$(cd $CWD/../.. ; pwd) log_info "Setting CHPL_HOME to: ${CHPL_HOME}" -# build_image function takes image name and docker script location as arguments. -# Builds the image with the name from arg$1, runs the container and execute the install and verify script located in the location $2. -build_image() { +# update_image takes image name and test script location as arguments. +update_image() { local imageName="$1" local script="$2" # Remove any existing image with the tag before building docker image @@ -53,16 +56,16 @@ $nightlypatch EOF } -# Build chapel Docker images +# Build and push Chapel Docker images cd $CHPL_HOME -build_image chapel/chapel:nightly ${CHPL_HOME}/util/cron/docker-chapel.bash +update_image chapel/chapel:nightly ${CHPL_HOME}/util/cron/docker-chapel.bash cd $CHPL_HOME/util/packaging/docker/gasnet dockerfile_nightly_patch -build_image chapel/chapel-gasnet:nightly ${CHPL_HOME}/util/cron/docker-gasnet.bash +update_image chapel/chapel-gasnet:nightly ${CHPL_HOME}/util/cron/docker-gasnet.bash cd $CHPL_HOME/util/packaging/docker/gasnet-smp dockerfile_nightly_patch -build_image chapel/chapel-gasnet-smp:nightly ${CHPL_HOME}/util/cron/docker-gasnet.bash +update_image chapel/chapel-gasnet-smp:nightly ${CHPL_HOME}/util/cron/docker-gasnet.bash export CHPL_NIGHTLY_TEST_CONFIG_NAME="docker" From 8f56102971a900ba37bacd4e931a6e078ec77ad9 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 10:06:13 -0700 Subject: [PATCH 012/107] Refactor to only push after successful test Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index de234bd1af2d..fa3bbad60cc9 100755 --- a/util/cron/test-docker.bash +++ 
b/util/cron/test-docker.bash @@ -15,10 +15,12 @@ log_info "Setting CHPL_HOME to: ${CHPL_HOME}" update_image() { local imageName="$1" local script="$2" + # Remove any existing image with the tag before building docker image docker image rm --force $imageName - docker buildx build --platform=linux/amd64,linux/arm64 . --push -t $imageName + # Build image + docker buildx build --platform=linux/amd64,linux/arm64 . -t $imageName BUILD_RESULT=$? if [ $BUILD_RESULT -ne 0 ] then @@ -26,13 +28,17 @@ update_image() { exit 1 fi - containerid= docker image ls | grep $imageName | awk '{print$3}' + # Set up to test container cd ${CHPL_HOME}/util/cron echo 'writeln("Hello, world!");' > hello.chpl + # Run test script inside container docker run --rm -i $imageName < $script - CONTAINER_RUN=$? + + # Clean up after our scratch test script, whether it succeeded or not + rm hello.chpl + if [ $CONTAINER_RUN -ne 0 ] then echo "docker commands failed inside chapel $imageName container" @@ -40,6 +46,9 @@ update_image() { else echo "docker commands succeeded inside chapel $imageName container" fi + + # Push image after testing has succeeded + docker buildx build --platform=linux/amd64,linux/arm64 . --push -t $imageName } # Patch the Dockerfile to build FROM the nightly image instead of latest. From c7966416a75741933c3b184c68a680c983e250aa Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 10:39:33 -0700 Subject: [PATCH 013/107] Implement release image mode via env var Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 99 +++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index fa3bbad60cc9..19bd5b3386e7 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -1,7 +1,8 @@ #!/usr/bin/env bash # # This script will build, (sanity) test, and push the chapel, chapel-gasnet, and -# chapel-gasnet-smp Docker images, using the `nightly` tag. 
+# chapel-gasnet-smp Docker images. Uses the 'nightly' tag by default, or +# 'latest' and a release version tag if specified. # # Assumes Docker is already running on the system, logged into an account with # appropriate permissions to push the images. @@ -10,14 +11,46 @@ CWD=$(cd $(dirname $0) ; pwd) source $CWD/common.bash export CHPL_HOME=$(cd $CWD/../.. ; pwd) log_info "Setting CHPL_HOME to: ${CHPL_HOME}" +export CHPL_NIGHTLY_TEST_CONFIG_NAME="docker" + +# BEGIN FUNCTIONS + +# Patch the Dockerfile to build FROM the nightly image instead of latest. +# Assumes the Dockerfile is available at ./Dockerfile. +# Arguments are forwarded to `patch` command. +dockerfile_nightly_patch() { + local patch_args="$@" -# update_image takes image name and test script location as arguments. + local nightlypatch=" +1c1 +< FROM chapel/chapel:latest +--- +> FROM chapel/chapel:nightly +" + + patch $patch_args ./Dockerfile << EOF +$nightlypatch +EOF +} + +# Build, test, and push a Docker image. +# Args: +# - image name without tag +# - test script location +# - release version tag to use (optional, just 'nightly' otherwise) update_image() { - local imageName="$1" + local baseImageName="$1" local script="$2" + local release_tag="$3" - # Remove any existing image with the tag before building docker image - docker image rm --force $imageName + # Use specified release version tag, or 'nightly' if not specified + local imageName="${baseImageName}:${release_tag:-nightly}" + + # Remove any existing image with the tag before building nightly docker image + if [ -n "$release_tag" ] + then + docker image rm --force $baseImageName + fi # Build image docker buildx build --platform=linux/amd64,linux/arm64 . -t $imageName @@ -49,32 +82,42 @@ update_image() { # Push image after testing has succeeded docker buildx build --platform=linux/amd64,linux/arm64 . 
--push -t $imageName + # Also push as 'latest' tag if this is a release build + if [ -n "$release_tag" ] + then + # Use base image name (without tag) to use Docker's default tag + docker buildx build --platform=linux/amd64,linux/arm64 . --push -t $baseImageName + fi } -# Patch the Dockerfile to build FROM the nightly image instead of latest. -# Assumes the Dockerfile is available at ./Dockerfile. -dockerfile_nightly_patch() { - local nightlypatch=" -1c1 -< FROM chapel/chapel:latest ---- -> FROM chapel/chapel:nightly -" - patch ./Dockerfile << EOF -$nightlypatch -EOF -} +# Build, test, and push all Chapel Docker images. +# Args: +# - release version tag to use (optional, just 'nightly' otherwise) +update_all_images() { + local release_tag="$1" -# Build and push Chapel Docker images -cd $CHPL_HOME -update_image chapel/chapel:nightly ${CHPL_HOME}/util/cron/docker-chapel.bash + cd $CHPL_HOME + update_image chapel/chapel ${CHPL_HOME}/util/cron/docker-chapel.bash release_tag -cd $CHPL_HOME/util/packaging/docker/gasnet -dockerfile_nightly_patch -update_image chapel/chapel-gasnet:nightly ${CHPL_HOME}/util/cron/docker-gasnet.bash + cd $CHPL_HOME/util/packaging/docker/gasnet + dockerfile_nightly_patch + update_image chapel/chapel-gasnet ${CHPL_HOME}/util/cron/docker-gasnet.bash release_tag + dockerfile_nightly_patch -R -cd $CHPL_HOME/util/packaging/docker/gasnet-smp -dockerfile_nightly_patch -update_image chapel/chapel-gasnet-smp:nightly ${CHPL_HOME}/util/cron/docker-gasnet.bash + cd $CHPL_HOME/util/packaging/docker/gasnet-smp + dockerfile_nightly_patch + update_image chapel/chapel-gasnet-smp ${CHPL_HOME}/util/cron/docker-gasnet.bash release_tag + dockerfile_nightly_patch -R +} +# END FUNCTIONS -export CHPL_NIGHTLY_TEST_CONFIG_NAME="docker" + +# Build and push nightly images +update_all_images + +# Build and push release images after ALL nightly builds have succeeded, if +# release tag was specified. 
+if [ -n "$RELEASE_VERSION" ] +then + update_all_images $RELEASE_VERSION +fi From 4f2dd82197e13cd2db67ebabc3a1b9baaa841299 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 10:49:25 -0700 Subject: [PATCH 014/107] Use release_tag variable rather than string literal Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index 19bd5b3386e7..af0b7fc8f936 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -97,16 +97,16 @@ update_all_images() { local release_tag="$1" cd $CHPL_HOME - update_image chapel/chapel ${CHPL_HOME}/util/cron/docker-chapel.bash release_tag + update_image chapel/chapel ${CHPL_HOME}/util/cron/docker-chapel.bash "$release_tag" cd $CHPL_HOME/util/packaging/docker/gasnet dockerfile_nightly_patch - update_image chapel/chapel-gasnet ${CHPL_HOME}/util/cron/docker-gasnet.bash release_tag + update_image chapel/chapel-gasnet ${CHPL_HOME}/util/cron/docker-gasnet.bash "$release_tag" dockerfile_nightly_patch -R cd $CHPL_HOME/util/packaging/docker/gasnet-smp dockerfile_nightly_patch - update_image chapel/chapel-gasnet-smp ${CHPL_HOME}/util/cron/docker-gasnet.bash release_tag + update_image chapel/chapel-gasnet-smp ${CHPL_HOME}/util/cron/docker-gasnet.bash "$release_tag" dockerfile_nightly_patch -R } # END FUNCTIONS From 30e9514e35eb4c72b09b09d5649eab67aee08f01 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 10:52:10 -0700 Subject: [PATCH 015/107] Quote variables to prevent bash weirdness Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index af0b7fc8f936..de79a5590693 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -19,7 +19,7 @@ export CHPL_NIGHTLY_TEST_CONFIG_NAME="docker" # Assumes the 
Dockerfile is available at ./Dockerfile. # Arguments are forwarded to `patch` command. dockerfile_nightly_patch() { - local patch_args="$@" + local patch_args="$*" local nightlypatch=" 1c1 @@ -28,7 +28,7 @@ dockerfile_nightly_patch() { > FROM chapel/chapel:nightly " - patch $patch_args ./Dockerfile << EOF + patch "$patch_args" ./Dockerfile << EOF $nightlypatch EOF } @@ -49,11 +49,11 @@ update_image() { # Remove any existing image with the tag before building nightly docker image if [ -n "$release_tag" ] then - docker image rm --force $baseImageName + docker image rm --force "$baseImageName" fi # Build image - docker buildx build --platform=linux/amd64,linux/arm64 . -t $imageName + docker buildx build --platform=linux/amd64,linux/arm64 . -t "$imageName" BUILD_RESULT=$? if [ $BUILD_RESULT -ne 0 ] then @@ -62,11 +62,11 @@ update_image() { fi # Set up to test container - cd ${CHPL_HOME}/util/cron + cd "${CHPL_HOME}/util/cron" echo 'writeln("Hello, world!");' > hello.chpl # Run test script inside container - docker run --rm -i $imageName < $script + docker run --rm -i "$imageName" < "$script" CONTAINER_RUN=$? # Clean up after our scratch test script, whether it succeeded or not @@ -81,12 +81,12 @@ update_image() { fi # Push image after testing has succeeded - docker buildx build --platform=linux/amd64,linux/arm64 . --push -t $imageName + docker buildx build --platform=linux/amd64,linux/arm64 . --push -t "$imageName" # Also push as 'latest' tag if this is a release build if [ -n "$release_tag" ] then - # Use base image name (without tag) to use Docker's default tag - docker buildx build --platform=linux/amd64,linux/arm64 . --push -t $baseImageName + # Use base image name (without tag) to use Docker's default tag 'latest' + docker buildx build --platform=linux/amd64,linux/arm64 . 
--push -t "$baseImageName" fi } @@ -96,17 +96,17 @@ update_image() { update_all_images() { local release_tag="$1" - cd $CHPL_HOME - update_image chapel/chapel ${CHPL_HOME}/util/cron/docker-chapel.bash "$release_tag" + cd "$CHPL_HOME" + update_image chapel/chapel "${CHPL_HOME}/util/cron/docker-chapel.bash" "$release_tag" - cd $CHPL_HOME/util/packaging/docker/gasnet + cd "$CHPL_HOME/util/packaging/docker/gasnet" dockerfile_nightly_patch - update_image chapel/chapel-gasnet ${CHPL_HOME}/util/cron/docker-gasnet.bash "$release_tag" + update_image chapel/chapel-gasnet "${CHPL_HOME}/util/cron/docker-gasnet.bash" "$release_tag" dockerfile_nightly_patch -R - cd $CHPL_HOME/util/packaging/docker/gasnet-smp + cd "$CHPL_HOME/util/packaging/docker/gasnet-smp" dockerfile_nightly_patch - update_image chapel/chapel-gasnet-smp ${CHPL_HOME}/util/cron/docker-gasnet.bash "$release_tag" + update_image chapel/chapel-gasnet-smp "${CHPL_HOME}/util/cron/docker-gasnet.bash" "$release_tag" dockerfile_nightly_patch -R } # END FUNCTIONS @@ -119,5 +119,5 @@ update_all_images # release tag was specified. 
if [ -n "$RELEASE_VERSION" ] then - update_all_images $RELEASE_VERSION + update_all_images "$RELEASE_VERSION" fi From 3c0adbc52daa41445aece20306aab32d06c9eadc Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 10:54:38 -0700 Subject: [PATCH 016/107] Add comment for patch reverses Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 2 ++ 1 file changed, 2 insertions(+) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index de79a5590693..16e3dc53bdf3 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -102,11 +102,13 @@ update_all_images() { cd "$CHPL_HOME/util/packaging/docker/gasnet" dockerfile_nightly_patch update_image chapel/chapel-gasnet "${CHPL_HOME}/util/cron/docker-gasnet.bash" "$release_tag" + # Clean up after patch changes dockerfile_nightly_patch -R cd "$CHPL_HOME/util/packaging/docker/gasnet-smp" dockerfile_nightly_patch update_image chapel/chapel-gasnet-smp "${CHPL_HOME}/util/cron/docker-gasnet.bash" "$release_tag" + # Clean up after patch changes dockerfile_nightly_patch -R } # END FUNCTIONS From a73535e0fecde3eb5addf6fcfafe48f710340e7d Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 10:55:43 -0700 Subject: [PATCH 017/107] Clarify some comments Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index 16e3dc53bdf3..b7c3eed4d745 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -6,6 +6,8 @@ # # Assumes Docker is already running on the system, logged into an account with # appropriate permissions to push the images. 
+# + CWD=$(cd $(dirname $0) ; pwd) source $CWD/common.bash @@ -37,7 +39,7 @@ EOF # Args: # - image name without tag # - test script location -# - release version tag to use (optional, just 'nightly' otherwise) +# - release version tag to use (optional, builds nightly otherwise) update_image() { local baseImageName="$1" local script="$2" @@ -92,7 +94,7 @@ update_image() { # Build, test, and push all Chapel Docker images. # Args: -# - release version tag to use (optional, just 'nightly' otherwise) +# - release version tag to use (optional, builds nightly otherwise) update_all_images() { local release_tag="$1" @@ -117,8 +119,8 @@ update_all_images() { # Build and push nightly images update_all_images -# Build and push release images after ALL nightly builds have succeeded, if -# release tag was specified. +# Build and push release-tagged images, if RELEASE_VERSION was specified. +# Runs after all nightly images, to abort if any fail. if [ -n "$RELEASE_VERSION" ] then update_all_images "$RELEASE_VERSION" From 15bf1d965d55bc66190ae86d96f64c867ccecefd Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 11:38:22 -0700 Subject: [PATCH 018/107] Add comment and logging for RELEASE_VERSION var Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index b7c3eed4d745..759e55b38ff0 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -7,6 +7,9 @@ # Assumes Docker is already running on the system, logged into an account with # appropriate permissions to push the images. # +# Expected environment variables: +# - RELEASE_VERSION (optional): If set, will also push the image tagged as +# 'latest' and this version. Should match version in release branch name. 
CWD=$(cd $(dirname $0) ; pwd) @@ -116,6 +119,13 @@ update_all_images() { # END FUNCTIONS +if [ -n "$RELEASE_VERSION" ] +then + log_info "Building and pushing nightly and release-tagged images for version: $RELEASE_VERSION" +else + log_info "Building and pushing nightly images" +fi + # Build and push nightly images update_all_images From f904a4b1a2936ab8af1591047f3bbe16df6f5c80 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 11:42:41 -0700 Subject: [PATCH 019/107] Abort if releasing not on release branch Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index 759e55b38ff0..fb27246e2acb 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -122,6 +122,12 @@ update_all_images() { if [ -n "$RELEASE_VERSION" ] then log_info "Building and pushing nightly and release-tagged images for version: $RELEASE_VERSION" + local release_branch="release/$RELEASE_VERSION" + if [ "$(git rev-parse --abbrev-ref HEAD)" != "$release_branch" ] + then + log_error "Not on expected release branch $release_branch for version $RELEASE_VERSION, aborting" + exit 1 + fi else log_info "Building and pushing nightly images" fi From da54de4a35b139342276afa346f12a0b796cf183 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 12:04:38 -0700 Subject: [PATCH 020/107] Remove incorrect use of local keyword Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index fb27246e2acb..e96d5a4a1f80 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -122,7 +122,7 @@ update_all_images() { if [ -n "$RELEASE_VERSION" ] then log_info "Building and pushing nightly and release-tagged images for version: $RELEASE_VERSION" - local release_branch="release/$RELEASE_VERSION" + 
release_branch="release/$RELEASE_VERSION" if [ "$(git rev-parse --abbrev-ref HEAD)" != "$release_branch" ] then log_error "Not on expected release branch $release_branch for version $RELEASE_VERSION, aborting" From b7681a320918b1e61d9967c25270fad6745d3d51 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 12:07:45 -0700 Subject: [PATCH 021/107] Add logging for start and end of each image Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index e96d5a4a1f80..3d351934ac1b 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -51,6 +51,8 @@ update_image() { # Use specified release version tag, or 'nightly' if not specified local imageName="${baseImageName}:${release_tag:-nightly}" + log_info "Starting $imageName..." + # Remove any existing image with the tag before building nightly docker image if [ -n "$release_tag" ] then @@ -93,6 +95,8 @@ update_image() { # Use base image name (without tag) to use Docker's default tag 'latest' docker buildx build --platform=linux/amd64,linux/arm64 . --push -t "$baseImageName" fi + + log_info "Completed $imageName" } # Build, test, and push all Chapel Docker images. From 825dd6d6543068cdde8257b0444505709cdedda4 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 12:10:50 -0700 Subject: [PATCH 022/107] Resume pushing (nightly) image before test Includes comment describing Docker limitation motivating this. Also correctly remove tagged image name pre-build. Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index 3d351934ac1b..e80b11e53813 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -53,14 +53,20 @@ update_image() { log_info "Starting $imageName..." 
- # Remove any existing image with the tag before building nightly docker image + # Remove any existing image with the tag before building if [ -n "$release_tag" ] then - docker image rm --force "$baseImageName" + docker image rm --force "$imageName" fi - # Build image - docker buildx build --platform=linux/amd64,linux/arm64 . -t "$imageName" + # Build and push image + # Note: We push before testing due to a limitation of Docker + # (https://github.com/docker/buildx/issues/59) which prevents loading a + # multi-arch image without pushing. This means we may push a broken nightly + # image before erroring out; it's important that release pushes come after + # all nightly pushes so we can't push a broken release image. + # Anna, 2024-10-07 + docker buildx build --platform=linux/amd64,linux/arm64 --push . -t "$imageName" BUILD_RESULT=$? if [ $BUILD_RESULT -ne 0 ] then @@ -87,8 +93,6 @@ update_image() { echo "docker commands succeeded inside chapel $imageName container" fi - # Push image after testing has succeeded - docker buildx build --platform=linux/amd64,linux/arm64 . --push -t "$imageName" # Also push as 'latest' tag if this is a release build if [ -n "$release_tag" ] then From ad510c559654bd122601fedf11aa4207784b042b Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Mon, 7 Oct 2024 12:28:36 -0700 Subject: [PATCH 023/107] Do 'latest' tag push immediately after release-tagged one To be more robust against unnecessary rebuilds Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index e80b11e53813..380d85903c69 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -68,6 +68,13 @@ update_image() { # Anna, 2024-10-07 docker buildx build --platform=linux/amd64,linux/arm64 --push . -t "$imageName" BUILD_RESULT=$? 
+ # Also push as 'latest' tag if this is a release build + if [ -n "$release_tag" ] + then + # Use base image name (without tag) to use Docker's default tag 'latest' + docker buildx build --platform=linux/amd64,linux/arm64 . --push -t "$baseImageName" + fi + if [ $BUILD_RESULT -ne 0 ] then echo "docker build failed for $imageName image" @@ -93,13 +100,6 @@ update_image() { echo "docker commands succeeded inside chapel $imageName container" fi - # Also push as 'latest' tag if this is a release build - if [ -n "$release_tag" ] - then - # Use base image name (without tag) to use Docker's default tag 'latest' - docker buildx build --platform=linux/amd64,linux/arm64 . --push -t "$baseImageName" - fi - log_info "Completed $imageName" } From 9bd6d2d03413d894ea98fac75d86c8da93542f5e Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Tue, 8 Oct 2024 10:20:42 -0700 Subject: [PATCH 024/107] Avoid cd in update_image Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index 380d85903c69..f27a84e17422 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -81,15 +81,11 @@ update_image() { exit 1 fi - # Set up to test container - cd "${CHPL_HOME}/util/cron" - echo 'writeln("Hello, world!");' > hello.chpl - # Run test script inside container + echo 'writeln("Hello, world!");' > hello.chpl docker run --rm -i "$imageName" < "$script" CONTAINER_RUN=$? 
- - # Clean up after our scratch test script, whether it succeeded or not + # Clean up scratch chpl file for testing rm hello.chpl if [ $CONTAINER_RUN -ne 0 ] From cc3465c948d8d737e5d1aa27260599da6d234ded Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Tue, 8 Oct 2024 10:20:53 -0700 Subject: [PATCH 025/107] Fix arg order of 'latest' tag push Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index f27a84e17422..67a1060ffef8 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -72,7 +72,7 @@ update_image() { if [ -n "$release_tag" ] then # Use base image name (without tag) to use Docker's default tag 'latest' - docker buildx build --platform=linux/amd64,linux/arm64 . --push -t "$baseImageName" + docker buildx build --platform=linux/amd64,linux/arm64 --push . -t "$baseImageName" fi if [ $BUILD_RESULT -ne 0 ] From 3a3f38d941a14d7dfe950990496bf8a7f5995564 Mon Sep 17 00:00:00 2001 From: Anna Rift Date: Tue, 8 Oct 2024 14:17:24 -0700 Subject: [PATCH 026/107] Remove quoted empty args breaking patch Signed-off-by: Anna Rift --- util/cron/test-docker.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/cron/test-docker.bash b/util/cron/test-docker.bash index 67a1060ffef8..2f2746e99bff 100755 --- a/util/cron/test-docker.bash +++ b/util/cron/test-docker.bash @@ -33,7 +33,7 @@ dockerfile_nightly_patch() { > FROM chapel/chapel:nightly " - patch "$patch_args" ./Dockerfile << EOF + patch $patch_args ./Dockerfile << EOF $nightlypatch EOF } From 54b244bec5fa42805dcaaabaa0ac992b04d22bbe Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Tue, 8 Oct 2024 15:49:07 -0700 Subject: [PATCH 027/107] Fix David's query system bug Signed-off-by: Danila Fedorin --- .../include/chpl/framework/Context-detail.h | 22 ++++++++++++++++--- frontend/include/chpl/framework/query-impl.h | 2 +- frontend/lib/framework/Context.cpp | 6 
+++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/frontend/include/chpl/framework/Context-detail.h b/frontend/include/chpl/framework/Context-detail.h index b88223dfc8ac..877230a56731 100644 --- a/frontend/include/chpl/framework/Context-detail.h +++ b/frontend/include/chpl/framework/Context-detail.h @@ -260,6 +260,20 @@ class QueryMapResultBase { // lastChanged indicates the last revision in which the query result // has changed mutable RevisionNumber lastChanged = -1; + // This field exists to support isQueryRunning. When traversingg the dependencies + // of a query, we may re-run dependency queries to see if their results change. + // Sometimes, these queries will check isQueryRunning. At this time, the + // actual query (this) is not running, but we want to return 'true' to hide + // the details of the query system from the query author. This field is set to + // 'true' when the query is being tested for re-computation, so that + // we can return 'true' when isQueryRunning is called in that case. + // + // Note (Daniel 10/08/2024): there may be a way to combine this field with + // lastChanged somehow, since that's what we use for isQueryRunning while + // the query really is running. However, the semantics of that are nontrivial, + // and that field is used for a lot of things, so for the time being, the + // extra boolean is fine. 
+ mutable bool beingTestedForReuse = false; mutable QueryDependencyVec dependencies; @@ -280,6 +294,7 @@ class QueryMapResultBase { QueryMapResultBase(RevisionNumber lastChecked, RevisionNumber lastChanged, + bool beingTestedForReuse, bool emittedErrors, bool errorsPresentInSelfOrDependencies, std::set recursionErrors, @@ -302,20 +317,21 @@ class QueryMapResult final : public QueryMapResultBase { // * a default-constructed result QueryMapResult(QueryMap* parentQueryMap, std::tuple tupleOfArgs) - : QueryMapResultBase(-1, -1, false, false, {}, parentQueryMap), + : QueryMapResultBase(-1, -1, false, false, false, {}, parentQueryMap), tupleOfArgs(std::move(tupleOfArgs)), result() { } QueryMapResult(RevisionNumber lastChecked, RevisionNumber lastChanged, + bool beingTestedForReuse, bool emittedErrors, bool errorsPresentInSelfOrDependencies, std::set recursionErrors, QueryMap* parentQueryMap, std::tuple tupleOfArgs, ResultType result) - : QueryMapResultBase(lastChecked, lastChanged, emittedErrors, - errorsPresentInSelfOrDependencies, + : QueryMapResultBase(lastChecked, lastChanged, beingTestedForReuse, + emittedErrors, errorsPresentInSelfOrDependencies, std::move(recursionErrors), parentQueryMap), tupleOfArgs(std::move(tupleOfArgs)), result(std::move(result)) { diff --git a/frontend/include/chpl/framework/query-impl.h b/frontend/include/chpl/framework/query-impl.h index 6b5e87449be5..bf6603f3b759 100644 --- a/frontend/include/chpl/framework/query-impl.h +++ b/frontend/include/chpl/framework/query-impl.h @@ -560,7 +560,7 @@ Context::isQueryRunning( return false; } - return search2->lastChecked == -1; + return search2->lastChecked == -1 || search2->beingTestedForReuse; } templatebeingTestedForReuse = true; for (auto& dependency : resultEntry->dependencies) { const QueryMapResultBase* dependencyQuery = dependency.query; if (dependencyQuery->lastChanged > resultEntry->lastChanged) { @@ -1042,6 +1043,7 @@ void Context::recomputeIfNeeded(const QueryMapResultBase* resultEntry) { 
} } } + resultEntry->beingTestedForReuse = false; if (useSaved == false) { auto marker = markRecomputing(true); @@ -1124,6 +1126,7 @@ bool Context::queryCanUseSavedResult( useSaved = false; } else { useSaved = true; + resultEntry->beingTestedForReuse = true; for (auto& dependency: resultEntry->dependencies) { const QueryMapResultBase* dependencyQuery = dependency.query; @@ -1142,6 +1145,7 @@ bool Context::queryCanUseSavedResult( break; } } + resultEntry->beingTestedForReuse = false; if (useSaved == true) { updateForReuse(resultEntry); } @@ -1339,12 +1343,14 @@ void queryArgsPrintSep(std::ostream& s) { QueryMapResultBase::QueryMapResultBase(RevisionNumber lastChecked, RevisionNumber lastChanged, + bool beingTestedForReuse, bool emittedErrors, bool errorsPresentInSelfOrDependencies, std::set recursionErrors, QueryMapBase* parentQueryMap) : lastChecked(lastChecked), lastChanged(lastChanged), + beingTestedForReuse(beingTestedForReuse), dependencies(), emittedErrors(emittedErrors), errorsPresentInSelfOrDependencies(errorsPresentInSelfOrDependencies), From f96d06cc1bfd9bd0279005e49b1b36ee5fe15206 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Tue, 8 Oct 2024 15:49:25 -0700 Subject: [PATCH 028/107] Add comment and assertion to code that tripped me up Signed-off-by: Danila Fedorin --- frontend/include/chpl/framework/Context.h | 4 ++++ frontend/include/chpl/resolution/ResolutionContext.h | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/frontend/include/chpl/framework/Context.h b/frontend/include/chpl/framework/Context.h index 3d502879d316..33ee4190e2ec 100644 --- a/frontend/include/chpl/framework/Context.h +++ b/frontend/include/chpl/framework/Context.h @@ -227,6 +227,10 @@ class Context { context_ = nullptr; } + bool isCleared() { + return context_ == nullptr; + } + ~RecomputeMarker() { restore(); } diff --git a/frontend/include/chpl/resolution/ResolutionContext.h b/frontend/include/chpl/resolution/ResolutionContext.h index c6ed9b3b74d8..ec832a0b0bd0 100644 
--- a/frontend/include/chpl/resolution/ResolutionContext.h +++ b/frontend/include/chpl/resolution/ResolutionContext.h @@ -454,8 +454,14 @@ class ResolutionContext::GlobalQuery { } // Otherwise we are computing, so set the recompute marker. + // We don't want it to go out of scope now (since that undoes the mark), + // but this is just the 'begin' function; we do want it to go out of scope + // when the query is done (when 'end' is called). So, marker it in a field. const bool isRecomputing = false; auto activeRecomputeMarker = context_->markRecomputing(isRecomputing); + + // We better not be saving another marker, since we only have room to save one. + CHPL_ASSERT(recomputeMarker_.isCleared()); std::swap(recomputeMarker_, activeRecomputeMarker); // Set the stopwatch if it is compile-time enabled. From aa6ac54c36f0d82e05a532aacf9ae85ab8cc402f Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Tue, 8 Oct 2024 16:04:35 -0700 Subject: [PATCH 029/107] Add David's reproducer to tests Signed-off-by: Danila Fedorin --- frontend/test/resolution/testResolve.cpp | 44 ++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/frontend/test/resolution/testResolve.cpp b/frontend/test/resolution/testResolve.cpp index 8bc3687b9b1b..83acc0e0c630 100644 --- a/frontend/test/resolution/testResolve.cpp +++ b/frontend/test/resolution/testResolve.cpp @@ -1761,6 +1761,48 @@ static void test29(Context* context) { } } +// This bug is hard to replicate with queries alone, but does seem to show +// up in some cases of the query system. 
+static void testInfiniteCycleBug() { + auto context = buildStdContext(); + auto ctx = context.get(); + + ctx->advanceToNextRevision(false); + setupModuleSearchPaths(ctx, false, false, {}, {}); + + CompilerFlags flags; + flags.set(CompilerFlags::WARN_UNSTABLE, true); + setCompilerFlags(ctx, std::move(flags)); + + std::string program0 = + R""""( + proc foo() { + var x = 0; + proc bar() { return x; } + return bar(); + } + var x = foo(); + )""""; + + std::ignore = resolveQualifiedTypeOfX(ctx, program0); + + ctx->advanceToNextRevision(false); + setupModuleSearchPaths(ctx, false, false, {}, {}); + + std::string program1 = + R""""( + proc baz() { + var x = 0; + proc ding() { return x; } + return bar(); + } + var x = baz(); + )""""; + + std::ignore = resolveQualifiedTypeOfX(ctx, program1); +} + + int main() { test1(); test2(); @@ -1794,5 +1836,7 @@ int main() { test28(ctx.get()); test29(ctx.get()); + testInfiniteCycleBug(); + return 0; } From f0089839b68a0794338505cf92273ce7217f0e2b Mon Sep 17 00:00:00 2001 From: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> Date: Tue, 8 Oct 2024 18:21:43 -0500 Subject: [PATCH 030/107] Add new configs for gpu on hpe cray ex This adds the following new configurations scripts for testing gpus on HPE Cray EX systems: - `test-gpu-ex-cpu` (analogous to `test-gpu-cpu` for Cray CS) - `test-gpu-ex-cuda-12.interop` (analogous to `test-gpu-cuda.interop` for Cray CS) - `test-gpu-ex-cuda-12.specialization` (analogous to `test-gpu-cuda.specialization` for Cray CS) - `test-perf.gpu-ex-cuda-12.um` (analogous to `test-perf.gpu-cuda.um` for Cray CS) Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- util/cron/test-gpu-ex-cpu.bash | 14 +++++++++ util/cron/test-gpu-ex-cuda-12.bash | 1 + util/cron/test-gpu-ex-cuda-12.interop.bash | 27 +++++++++++++++++ .../test-gpu-ex-cuda-12.specialization.bash | 21 +++++++++++++ util/cron/test-perf.gpu-ex-cuda-12.um.bash | 30 +++++++++++++++++++ 5 files 
changed, 93 insertions(+) create mode 100755 util/cron/test-gpu-ex-cpu.bash create mode 100755 util/cron/test-gpu-ex-cuda-12.interop.bash create mode 100755 util/cron/test-gpu-ex-cuda-12.specialization.bash create mode 100755 util/cron/test-perf.gpu-ex-cuda-12.um.bash diff --git a/util/cron/test-gpu-ex-cpu.bash b/util/cron/test-gpu-ex-cpu.bash new file mode 100755 index 000000000000..d5956f5d09af --- /dev/null +++ b/util/cron/test-gpu-ex-cpu.bash @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# +# GPU native testing on a Cray EX (using none for CHPL_COMM) + +CWD=$(cd $(dirname ${BASH_SOURCE[0]}) ; pwd) +source $CWD/common-native-gpu.bash +source $CWD/common-hpe-cray-ex.bash + +export CHPL_GPU=cpu +export CHPL_COMM=none +export CHPL_GPU_NO_CPU_MODE_WARNING=y + +export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cpu" +$CWD/nightly -cron ${nightly_args} diff --git a/util/cron/test-gpu-ex-cuda-12.bash b/util/cron/test-gpu-ex-cuda-12.bash index 69a18fe76b4a..ff13152932df 100755 --- a/util/cron/test-gpu-ex-cuda-12.bash +++ b/util/cron/test-gpu-ex-cuda-12.bash @@ -12,6 +12,7 @@ export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM export CHPL_COMM=none export CHPL_LOCALE_MODEL=gpu export CHPL_LAUNCHER_PARTITION=allgriz +export CHPL_TEST_GPU=true export CHPL_GPU=nvidia # amd is also detected automatically export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cuda-12" diff --git a/util/cron/test-gpu-ex-cuda-12.interop.bash b/util/cron/test-gpu-ex-cuda-12.interop.bash new file mode 100755 index 000000000000..974132a6c3bd --- /dev/null +++ b/util/cron/test-gpu-ex-cuda-12.interop.bash @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# +# GPU native testing on a Cray EX (using none for CHPL_COMM) + +CWD=$(cd $(dirname ${BASH_SOURCE[0]}) ; pwd) +source $CWD/common.bash +source $CWD/common-hpe-cray-ex.bash + + +# We need 12.4 for the stream test because the CUDA driver on pinoak +# only supports PTX for 12.4, untill the driver is updated, we need to +# stick with 12.4 instead of 12.5 +module 
load cuda/12.4 # default is CUDA 12 + +# We need cublas for the cublas interop test, but since we are using 12.4 above +# pinoak doesn't have the cublas library for 12.4, so we need to use the cublas +# from 12.5 (which is compatible across minor versions) +# This can be removed once we use CUDA 12.5 +export CHPL_LIB_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/24.7/math_libs/lib64" + +export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_TEST_GPU=true +export CHPL_LAUNCHER_PARTITION=allgriz +export CHPL_NIGHTLY_TEST_DIRS="gpu/interop/" + +export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cuda-12.interop" +$CWD/nightly -cron ${nightly_args} diff --git a/util/cron/test-gpu-ex-cuda-12.specialization.bash b/util/cron/test-gpu-ex-cuda-12.specialization.bash new file mode 100755 index 000000000000..5228ca3f7a5b --- /dev/null +++ b/util/cron/test-gpu-ex-cuda-12.specialization.bash @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# +# GPU native testing on a Cray EX (using none for CHPL_COMM) + +CWD=$(cd $(dirname ${BASH_SOURCE[0]}) ; pwd) +source $CWD/common-native-gpu.bash +source $CWD/common-hpe-cray-ex.bash + +module load cudatoolkit # default is CUDA 12 + +export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_COMM=none +export CHPL_LOCALE_MODEL=gpu +export CHPL_LAUNCHER_PARTITION=allgriz +export CHPL_TEST_GPU=true +export CHPL_GPU=nvidia # amd is also detected automatically + +export CHPL_GPU_SPECIALIZATION=y + +export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cuda-12.specialization" +$CWD/nightly -cron ${nightly_args} diff --git a/util/cron/test-perf.gpu-ex-cuda-12.um.bash b/util/cron/test-perf.gpu-ex-cuda-12.um.bash new file mode 100755 index 000000000000..75f5c38c52f8 --- /dev/null +++ b/util/cron/test-perf.gpu-ex-cuda-12.um.bash @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# +# Run GPU performance testing on a Cray EX + +CWD=$(cd $(dirname $0) ; pwd) +source $CWD/common-native-gpu.bash +source $CWD/common-hpe-cray-ex.bash + +module 
load cudatoolkit # default is CUDA 12 + +export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_COMM=none +export CHPL_LOCALE_MODEL=gpu +export CHPL_LAUNCHER_PARTITION=allgriz +export CHPL_GPU=nvidia # amd is detected automatically +export CHPL_GPU_MEM_STRATEGY=unified_memory + +export CHPL_NIGHTLY_TEST_CONFIG_NAME="perf.gpu-ex-cuda-12.um" + +export CHPL_TEST_PERF_CONFIG_NAME="1-node-a100" # pinoak has ampere GPUs +source $CWD/common-native-gpu-perf.bash +# make sure this comes after setting SUBDIR (set by native-gpu-perf) and +# CONFIG_NAME +source $CWD/common-perf.bash + +SHORT_NAME=um +nightly_args="${nightly_args} -performance-description $SHORT_NAME -performance-configs default:v,$SHORT_NAME:v -sync-dir-suffix $SHORT_NAME" +nightly_args="${nightly_args} -startdate 10/10/24" + +$CWD/nightly -cron ${nightly_args} From 611510ad0061e31b6e7fa4a8ca99122d5479840d Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Tue, 8 Oct 2024 16:37:35 -0700 Subject: [PATCH 031/107] Emit recursion errors when they are dected during recomputation Signed-off-by: Danila Fedorin --- frontend/include/chpl/framework/query-impl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/frontend/include/chpl/framework/query-impl.h b/frontend/include/chpl/framework/query-impl.h index bf6603f3b759..94d046a6992e 100644 --- a/frontend/include/chpl/framework/query-impl.h +++ b/frontend/include/chpl/framework/query-impl.h @@ -233,7 +233,9 @@ Context::getResult(QueryMap* queryMap, // printf("Found result %p %s\n", savedElement, queryMap->queryName); } - if (newElementWasAdded == false && savedElement->lastChecked == -1) { + if (newElementWasAdded == false && + (savedElement->lastChecked == -1 || + savedElement->beingTestedForReuse)) { // A recursion error was encountered. We will try to gracefully handle // this error by adding it to the set of recursion errors on this // result. 
From 4e09155052b6c22879178fa4a23e2130a6ffe42c Mon Sep 17 00:00:00 2001 From: Ahmad Rezaii Date: Wed, 9 Oct 2024 11:02:09 -0600 Subject: [PATCH 032/107] update homebrew formula for chapel to match release Signed-off-by: Ahmad Rezaii --- util/packaging/homebrew/chapel-release.rb | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/util/packaging/homebrew/chapel-release.rb b/util/packaging/homebrew/chapel-release.rb index f4f49e2e22be..77ca8ddf36b5 100644 --- a/util/packaging/homebrew/chapel-release.rb +++ b/util/packaging/homebrew/chapel-release.rb @@ -1,19 +1,20 @@ class Chapel < Formula + include Language::Python::Shebang desc "Programming language for productive parallel computing at scale" homepage "https://chapel-lang.org/" url "https://github.com/chapel-lang/chapel/releases/download/2.2.0/chapel-2.2.0.tar.gz" sha256 "bb16952a87127028031fd2b56781bea01ab4de7c3466f7b6a378c4d8895754b6" license "Apache-2.0" - revision 1 + revision 2 head "https://github.com/chapel-lang/chapel.git", branch: "main" bottle do - sha256 arm64_sequoia: "8d89a038eccaf6554f234a24b31d142b37043e3cb6bbffe5d11d60dac34eb163" - sha256 arm64_sonoma: "a8e2a5cc575a16cc513cbdc19edd212a115e689b5d7df2f62f80d7cc08140da4" - sha256 arm64_ventura: "68752adba8c728b86fea019bc7080ee255f3ba81705c54db60d80d34d33db19b" - sha256 sonoma: "97ab1744ea1f5e61a445a3c907d381f3b8e9a5d78f5503520a9ad89b22304dc3" - sha256 ventura: "afeb776fbe3475093841eb26731f54b8533724de9c96dce750b738a22b848289" - sha256 x86_64_linux: "6dcaabe4a79be7ed91b6f89c5725fa421f661b0f70baeec065872bb8fb83dfaa" + sha256 arm64_sequoia: "14a251ee7322a074dad39a8cc7dd0db9bf68458526ba5905c36508c4d9ee28f5" + sha256 arm64_sonoma: "929ce6c154e9d54d9c795b8f869f1247cbdc0d9b5a9a30e7614842dfd7a660f0" + sha256 arm64_ventura: "478587cf8190effca0543bacfac22d66a9672194e73fe184408795792a209a25" + sha256 sonoma: "ce4da24faa3e5723998c9dc33dfe23c32f3a31e7d47f75f30a189567b4532a90" + sha256 ventura: 
"30295a6d3dc7218295f247445a4a9cbc3f2d58b32d07e32faf5684108676bf35" + sha256 x86_64_linux: "2cfa7cbf0c3fbb43c3bc78f9b7b99e90a2a2b52aec2186256cfef0e70a804377" end depends_on "cmake" @@ -37,6 +38,12 @@ def install # It should be noted that this will expand to: 'for cmd in python3.12 python3 python python2; do' # in our find-python.sh script. inreplace "util/config/find-python.sh", /^(for cmd in )(python3 )/, "\\1#{python} \\2" + inreplace "third-party/chpl-venv/Makefile", "python3 -c ", "#{python} -c " + + # a lot of scripts have a python3 or python shebang, which does not point to python3.12 anymore + Pathname.glob("**/*.py") do |pyfile| + rewrite_shebang detected_python_shebang, pyfile + end libexec.install Dir["*"] # Chapel uses this ENV to work out where to install. @@ -44,6 +51,7 @@ def install ENV["CHPL_GMP"] = "system" # This ENV avoids a problem where cmake cache is invalidated by subsequent make calls ENV["CHPL_CMAKE_USE_CC_CXX"] = "1" + ENV["CHPL_CMAKE_PYTHON"] = python # don't try to set CHPL_LLVM_GCC_PREFIX since the llvm # package should be configured to use a reasonable GCC From ed09aeb2f83c783e50164b88497325ef16e11a0e Mon Sep 17 00:00:00 2001 From: Ahmad Rezaii Date: Wed, 9 Oct 2024 11:02:37 -0600 Subject: [PATCH 033/107] update homebrew formula for chapel on main with release updates Signed-off-by: Ahmad Rezaii --- util/packaging/homebrew/chapel-main.rb | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/util/packaging/homebrew/chapel-main.rb b/util/packaging/homebrew/chapel-main.rb index 217be36bbb43..fc33f7d833fd 100644 --- a/util/packaging/homebrew/chapel-main.rb +++ b/util/packaging/homebrew/chapel-main.rb @@ -1,4 +1,5 @@ class Chapel < Formula + include Language::Python::Shebang desc "Programming language for productive parallel computing at scale" homepage "https://chapel-lang.org/" url "" @@ -31,6 +32,12 @@ def install # It should be noted that this will expand to: 'for cmd in python3.12 python3 python python2; do' # in our find-python.sh 
script. inreplace "util/config/find-python.sh", /^(for cmd in )(python3 )/, "\\1#{python} \\2" + inreplace "third-party/chpl-venv/Makefile", "python3 -c ", "#{python} -c " + + # a lot of scripts have a python3 or python shebang, which does not point to python3.12 anymore + Pathname.glob("**/*.py") do |pyfile| + rewrite_shebang detected_python_shebang, pyfile + end libexec.install Dir["*"] # Chapel uses this ENV to work out where to install. @@ -38,6 +45,7 @@ def install ENV["CHPL_GMP"] = "system" # This ENV avoids a problem where cmake cache is invalidated by subsequent make calls ENV["CHPL_CMAKE_USE_CC_CXX"] = "1" + ENV["CHPL_CMAKE_PYTHON"] = python # don't try to set CHPL_LLVM_GCC_PREFIX since the llvm # package should be configured to use a reasonable GCC From ffbfdd355a7c5e6b0d57955735ad7ae2aea571a6 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Tue, 17 Sep 2024 14:45:15 -0700 Subject: [PATCH 034/107] Non-blocking PUT implementation Previously, non-blocking PUTs were implemented via blocking PUTs, which could severely limit performance. Prior to 2.0, small PUTs invoked fi_inject_write, which essentially turned them into non-blocking PUTs, but chpl_comm_put returned as if the PUT was completed. This could cause MCM violations as well as hangs caused by not progressing the network stack properly. These deficiences were fixed in 2.0, but led to a performance regression. This commit implements non-blocking PUTs correctly, so that the chpl_comm_*nb* functions work correctly. This should restore 1.32.0 performance while avoiding MCM violations and hangs. Signed-off-by: John H. 
Hartman --- runtime/include/chpl-comm.h | 2 + runtime/include/chpl-mem-desc.h | 1 + runtime/src/comm/ofi/comm-ofi.c | 412 ++++++++++++++++++++++---------- 3 files changed, 287 insertions(+), 128 deletions(-) diff --git a/runtime/include/chpl-comm.h b/runtime/include/chpl-comm.h index 9fd6d2432bee..2bb1e54ddc78 100644 --- a/runtime/include/chpl-comm.h +++ b/runtime/include/chpl-comm.h @@ -155,6 +155,8 @@ void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles); // detected. int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles); +void chpl_comm_free_nb(chpl_comm_nb_handle_t* h); + // Returns whether or not the passed wide address is known to be in // a communicable memory region and known to be readable. That is, // GET to that address should succeed without an access violation diff --git a/runtime/include/chpl-mem-desc.h b/runtime/include/chpl-mem-desc.h index 4855aef27d59..f9e65ea46892 100644 --- a/runtime/include/chpl-mem-desc.h +++ b/runtime/include/chpl-mem-desc.h @@ -67,6 +67,7 @@ extern "C" { m(COMM_PER_LOC_INFO, "comm layer per-locale information", false), \ m(COMM_PRV_OBJ_ARRAY, "comm layer private objects array", false), \ m(COMM_PRV_BCAST_DATA, "comm layer private broadcast data", false), \ + m(COMM_NB_HANDLE, "comm layer non-blocking handle", false), \ m(MEM_HEAP_SPACE, "mem layer heap expansion space", false), \ m(GLOM_STRINGS_DATA, "glom strings data", true ), \ m(STRING_LITERALS_BUF, "string literals buffer", true ), \ diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 87e6c6ad6438..56fd205c8900 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -4314,10 +4314,17 @@ void amRequestExecOn(c_nodeid_t node, c_sublocid_t subloc, } } - +/* + * amRequestRmaPut + * + * Performs a PUT by sending an active message to the remote node that + * causes it to perform GET. This is currently a blocking operation + * so the "blocking" argument is unused. 
When this operation returns + * the data have been successfully transmitted to the remote node. + */ static inline void amRequestRmaPut(c_nodeid_t node, void* addr, void* raddr, size_t size, - chpl_bool blocking) { + chpl_bool blocking /* unused */) { assert(!isAmHandler); retireDelayedAmDone(false /*taskIsEnding*/); @@ -4335,7 +4342,7 @@ void amRequestRmaPut(c_nodeid_t node, void* addr, void* raddr, size_t size, .addr = raddr, .raddr = myAddr, .size = size, }, }; - amRequestCommon(node, &req, sizeof(req.rma), true, NULL); + amRequestCommon(node, &req, sizeof(req.rma), true /*blocking*/, NULL); mrUnLocalizeSource(myAddr, addr); } @@ -4360,7 +4367,7 @@ void amRequestRmaGet(c_nodeid_t node, void* addr, void* raddr, size_t size) { .addr = raddr, .raddr = myAddr, .size = size, }, }; - amRequestCommon(node, &req, sizeof(req.rma), true, NULL); + amRequestCommon(node, &req, sizeof(req.rma), true /*blocking*/, NULL); mrUnLocalizeTarget(myAddr, addr, size); } @@ -5446,11 +5453,54 @@ void amCheckLiveness(void) { // Interface: RMA // +// OFI-specific non-blocking handle implementation + +typedef struct chpl_comm_ofi_nb_handle_t { + chpl_bool completed; // operation has completed + size_t count; // number of sub-operations + chpl_atomic_bool complete[1]; // flag for sub-operation completion +} chpl_comm_ofi_nb_handle_t; + chpl_comm_nb_handle_t chpl_comm_put_nb(void* addr, c_nodeid_t node, void* raddr, size_t size, int32_t commID, int ln, int32_t fn) { - chpl_comm_put(addr, node, raddr, size, commID, ln, fn); - return NULL; + DBG_PRINTF(DBG_IFACE, + "%s(%p, %d, %p, %zd, %d)", __func__, + addr, (int) node, raddr, size, (int) commID); + + retireDelayedAmDone(false /*taskIsEnding*/); + + chpl_comm_ofi_nb_handle_t *handle = NULL; + + // + // Sanity checks, self-communication. 
+ // + CHK_TRUE(addr != NULL); + CHK_TRUE(raddr != NULL); + + if (size == 0) { + goto done; + } + + if (node == chpl_nodeID) { + memmove(raddr, addr, size); + goto done; + } + + // Communications callback support + if (chpl_comm_have_callbacks(chpl_comm_cb_event_kind_put)) { + chpl_comm_cb_info_t cb_data = + {chpl_comm_cb_event_kind_put, chpl_nodeID, node, + .iu.comm={addr, raddr, size, commID, ln, fn}}; + chpl_comm_do_callbacks (&cb_data); + } + + chpl_comm_diags_verbose_rdma("put", node, size, ln, fn, commID); + chpl_comm_diags_incr(put); + + handle = ofi_put(addr, node, raddr, size, false /*blocking*/); +done: + return handle; } @@ -5463,35 +5513,104 @@ chpl_comm_nb_handle_t chpl_comm_get_nb(void* addr, c_nodeid_t node, int chpl_comm_test_nb_complete(chpl_comm_nb_handle_t h) { + chpl_comm_ofi_nb_handle_t *handle = (chpl_comm_ofi_nb_handle_t *) h; chpl_comm_diags_incr(test_nb); - - // fi_cq_readfrom? - return ((void*) h) == NULL; + DBG_PRINTF(DBG_RMA, "chpl_comm_test_nb_complete %p", handle); + int completed = 1; + if (handle != NULL) { + completed = handle->completed; + } + DBG_PRINTF(DBG_RMA, "chpl_comm_test_nb_complete %p %s", handle, + completed ? "true" : "false"); + return completed; } +/* + * check_complete + * + * Returns true if a new handle completion is detected, false otherwise + * Ignores handles that have previously completed. If blocking is true and + * there are uncompleted handles this will not return until a new handle + * completion is detected. + */ +static chpl_bool check_complete(chpl_comm_nb_handle_t* h, size_t nhandles, + chpl_bool blocking) { -void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { - chpl_comm_diags_incr(wait_nb); + DBG_PRINTF(DBG_RMA, "check_complete"); - size_t i; - // fi_cq_readfrom? 
- for( i = 0; i < nhandles; i++ ) { - CHK_TRUE(h[i] == NULL); + chpl_bool completed = false; // at least one new completion detected + chpl_bool pending = false; // there is a handle with uncompleted operations + struct perTxCtxInfo_t* tcip = NULL; + while (true) { + pending = false; + for(size_t i = 0; i < nhandles; i++) { + chpl_comm_ofi_nb_handle_t *handle = (chpl_comm_ofi_nb_handle_t *) h[i]; + DBG_PRINTF(DBG_RMA, "handle[%d] %p", i, handle); + + // ignore handles that have already completed + // NULL handles have by definition already completed + if ((handle == NULL) || handle->completed) { + continue; + } + pending = true; + // determine if this handle is now complete by checking the status + // of its individual operations + chpl_bool handleComplete = true; + for (size_t j = 0; j < handle->count; j++) { + if(!atomic_load_explicit_bool(&handle->complete[j], + chpl_memory_order_acquire)) { + handleComplete = false; + break; + } + } + if (handleComplete) { + completed = true; + handle->completed = true; + // break here when one handle completes instead of checking them all? + } + } + DBG_PRINTF(DBG_RMA, "check_complete blocking %s", blocking ? "true" : "false"); + DBG_PRINTF(DBG_RMA, "check_complete completed %s", completed ? "true" : "false"); + DBG_PRINTF(DBG_RMA, "check_complete pending %s", pending ? "true" : "false"); + if (!blocking || completed || !pending) { + break; + } + // progress the endpoint so handles can complete and then try again + if (tcip == NULL) { + CHK_TRUE((tcip = tciAlloc()) != NULL); + } + DBG_PRINTF(DBG_RMA, "check_complete yielding tcip %p", tcip); + sched_yield(); + (*tcip->ensureProgressFn)(tcip); + } + if (tcip) { + tciFree(tcip); } + DBG_PRINTF(DBG_RMA, "check_complete returning %s", completed ? 
+ "true" : "false"); + return completed; } +void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { + chpl_comm_diags_incr(wait_nb); + + DBG_PRINTF(DBG_RMA, "chpl_comm_wait_nb_some"); + + (void) check_complete(h, nhandles, true /*blocking*/); +} int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { chpl_comm_diags_incr(try_nb); - size_t i; - // fi_cq_readfrom? - for( i = 0; i < nhandles; i++ ) { - CHK_TRUE(h[i] == NULL); - } - return 0; + DBG_PRINTF(DBG_RMA, "chpl_comm_try_nb_some"); + return check_complete(h, nhandles, false /*blocking*/); } +void chpl_comm_free_nb(chpl_comm_nb_handle_t* h) { + if (h != NULL) { + chpl_mem_free(h, 0, 0); + } +} void chpl_comm_put(void* addr, c_nodeid_t node, void* raddr, size_t size, int32_t commID, int ln, int32_t fn) { @@ -5901,70 +6020,103 @@ void waitForCQSpace(struct perTxCtxInfo_t* tcip, size_t len) { } } -typedef chpl_comm_nb_handle_t (rmaPutFn_t)(void* myAddr, void* mrDesc, - c_nodeid_t node, - uint64_t mrRaddr, uint64_t mrKey, - size_t size, - chpl_bool blocking, - struct perTxCtxInfo_t* tcip); +typedef void (rmaPutFn_t)(void* myAddr, void* mrDesc, + c_nodeid_t node, + uint64_t mrRaddr, uint64_t mrKey, + size_t size, + chpl_bool blocking, + chpl_atomic_bool *done, + struct perTxCtxInfo_t* tcip); static rmaPutFn_t rmaPutFn_selector; static inline chpl_comm_nb_handle_t ofi_put(const void* addr, c_nodeid_t node, void* raddr, size_t size, chpl_bool blocking) { - // - // Don't ask the provider to transfer more than it wants to. 
- // - if (size > ofi_info->ep_attr->max_msg_size) { + + char *src = (char *) addr; + char *dest = (char *) raddr; + chpl_comm_ofi_nb_handle_t *handle = NULL; + + // Determine how many operations the PUT requires based on the provider's + // maximum message size + + int ops = (size + ofi_info->ep_attr->max_msg_size - 1) / + ofi_info->ep_attr->max_msg_size; + if (ops > 1) { DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, "splitting large PUT %d:%p <= %p, size %zd", (int) node, raddr, addr, size); + } - size_t chunkSize = ofi_info->ep_attr->max_msg_size; - for (size_t i = 0; i < size; i += chunkSize) { - if (chunkSize > size - i) { - chunkSize = size - i; - } - (void) ofi_put(&((const char*) addr)[i], node, &((char*) raddr)[i], - chunkSize, blocking); - } - - return NULL; + struct perTxCtxInfo_t* tcip = NULL; + CHK_TRUE((tcip = tciAlloc()) != NULL); + if (!blocking && !tcip->bound) { + // Non-blocking operations require bound endpoints + blocking = true; } - DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, - "PUT %d:%p <= %p, size %zd", - (int) node, raddr, addr, size); + if (!blocking) { + // Allocate a handle large enough to hold one "done" flags per op + int handleSize = sizeof(chpl_comm_ofi_nb_handle_t) + + ((ops - 1) * sizeof(chpl_atomic_bool)); + handle = chpl_mem_alloc(handleSize, CHPL_RT_MD_COMM_NB_HANDLE, 0, 0); - // - // If the remote address is directly accessible do an RMA from this - // side; otherwise do the opposite RMA from the other side. 
- // - chpl_comm_nb_handle_t ret; - uint64_t mrKey; - uint64_t mrRaddr; - if (mrGetKey(&mrKey, &mrRaddr, node, raddr, size)) { - struct perTxCtxInfo_t* tcip; - CHK_TRUE((tcip = tciAlloc()) != NULL); - if (tcip->txCntr == NULL) { - waitForCQSpace(tcip, 1); + handle->count = ops; + handle->completed = false; + for (size_t i = 0; i < ops; i++) { + atomic_init_bool(&handle->complete[i], false); } + } - void* mrDesc; - void* myAddr = mrLocalizeSource(&mrDesc, addr, size, "PUT src"); + size_t chunkSize = ofi_info->ep_attr->max_msg_size; + size_t offset = 0; + for (int i = 0; i < ops; i++) { + if (chunkSize > size - offset) { + chunkSize = size - offset; + } + DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, + "PUT %d:%p <= %p, size %zd, %s", + (int) node, dest, src, chunkSize, + blocking ? "blocking" : "non-blocking"); - ret = rmaPutFn_selector(myAddr, mrDesc, node, mrRaddr, mrKey, size, - blocking, tcip); + // + // If the remote address is directly accessible do an RMA from this + // side; otherwise do the opposite RMA from the other side. + // + uint64_t mrKey; + uint64_t mrRaddr; + if (mrGetKey(&mrKey, &mrRaddr, node, (void *) dest, chunkSize)) { + if (tcip->txCntr == NULL) { + // TODO: why is this necessary? + waitForCQSpace(tcip, 1); + } - mrUnLocalizeSource(myAddr, addr); - tciFree(tcip); - } else { - amRequestRmaPut(node, (void*) addr, raddr, size, blocking); - ret = NULL; - } + void* mrDesc; + void* myAddr = mrLocalizeSource(&mrDesc, (const void *) src, + chunkSize, "PUT src"); - return ret; + chpl_atomic_bool *done = blocking ? 
NULL : &handle->complete[i]; + rmaPutFn_selector(myAddr, mrDesc, node, mrRaddr, mrKey, chunkSize, + blocking, done, tcip); + + mrUnLocalizeSource(myAddr, src); + } else { + amRequestRmaPut(node, (void *) src, (void *) dest, size, blocking); + // amRequestRmaPut is currently a blocking operation, so mark + // the operation as complete + if (!blocking) { + atomic_init_bool(&(handle->complete[i]), true); + } + } + offset += chunkSize; + src += chunkSize; + dest += chunkSize; + } + tciFree(tcip); + DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, + "PUT %d:%p <= %p, handle %p", handle); + return handle; } @@ -5973,33 +6125,31 @@ static rmaPutFn_t rmaPutFn_msgOrd; static rmaPutFn_t rmaPutFn_dlvrCmplt; static inline -chpl_comm_nb_handle_t rmaPutFn_selector(void* myAddr, void* mrDesc, - c_nodeid_t node, - uint64_t mrRaddr, uint64_t mrKey, - size_t size, - chpl_bool blocking, - struct perTxCtxInfo_t* tcip) { - chpl_comm_nb_handle_t ret = NULL; +void rmaPutFn_selector(void* myAddr, void* mrDesc, + c_nodeid_t node, + uint64_t mrRaddr, uint64_t mrKey, + size_t size, + chpl_bool blocking, + chpl_atomic_bool *done, + struct perTxCtxInfo_t* tcip) { switch (mcmMode) { - case mcmm_msgOrdFence: - ret = rmaPutFn_msgOrdFence(myAddr, mrDesc, node, mrRaddr, mrKey, size, - blocking, tcip); - break; - case mcmm_msgOrd: - ret = rmaPutFn_msgOrd(myAddr, mrDesc, node, mrRaddr, mrKey, size, - blocking, tcip); - break; - case mcmm_dlvrCmplt: - ret = rmaPutFn_dlvrCmplt(myAddr, mrDesc, node, mrRaddr, mrKey, size, - blocking, tcip); - break; - default: - INTERNAL_ERROR_V("unexpected mcmMode %d", mcmMode); - break; - } - - return ret; + case mcmm_msgOrdFence: + rmaPutFn_msgOrdFence(myAddr, mrDesc, node, mrRaddr, mrKey, size, + blocking, done, tcip); + break; + case mcmm_msgOrd: + rmaPutFn_msgOrd(myAddr, mrDesc, node, mrRaddr, mrKey, size, + blocking, done, tcip); + break; + case mcmm_dlvrCmplt: + rmaPutFn_dlvrCmplt(myAddr, mrDesc, node, mrRaddr, mrKey, size, + blocking, done, tcip); + break; + default: + 
INTERNAL_ERROR_V("unexpected mcmMode %d", mcmMode); + break; + } } @@ -6020,12 +6170,13 @@ static ssize_t wrap_fi_writemsg(const void* addr, void* mrDesc, // Implements ofi_put() when MCM mode is message ordering with fences. // static -chpl_comm_nb_handle_t rmaPutFn_msgOrdFence(void* myAddr, void* mrDesc, - c_nodeid_t node, - uint64_t mrRaddr, uint64_t mrKey, - size_t size, - chpl_bool blocking, - struct perTxCtxInfo_t* tcip) { +void rmaPutFn_msgOrdFence(void* myAddr, void* mrDesc, + c_nodeid_t node, + uint64_t mrRaddr, uint64_t mrKey, + size_t size, + chpl_bool blocking, + chpl_atomic_bool *done, + struct perTxCtxInfo_t* tcip) { uint64_t flags = 0; chpl_atomic_bool txnDone; void *ctx; @@ -6040,17 +6191,18 @@ chpl_comm_nb_handle_t rmaPutFn_msgOrdFence(void* myAddr, void* mrDesc, // memory visibility until later. // flags = FI_INJECT; - } else { - blocking = true; } if (bitmapTest(tcip->amoVisBitmap, node)) { // - // Special case: If our last operation was an AMO then we need to do a - // fenced PUT to force the AMO to complete before this PUT. + // Special case: If our last operation was an AMO then we need to do a + // fenced PUT to force the AMO to be visible before this PUT. // flags |= FI_FENCE; } - ctx = TX_CTX_INIT(tcip, blocking, &txnDone); + if (done == NULL) { + done = &txnDone; + } + ctx = txCtxInit(tcip, __LINE__, done); (void) wrap_fi_writemsg(myAddr, mrDesc, node, mrRaddr, mrKey, size, ctx, flags, tcip); if (blocking) { @@ -6070,8 +6222,6 @@ chpl_comm_nb_handle_t rmaPutFn_msgOrdFence(void* myAddr, void* mrDesc, } else { mcmReleaseOneNode(node, tcip, "PUT"); } - - return NULL; } @@ -6079,12 +6229,13 @@ chpl_comm_nb_handle_t rmaPutFn_msgOrdFence(void* myAddr, void* mrDesc, // Implements ofi_put() when MCM mode is message ordering. // TODO: see comment for rmaPutFn_msgOrdFence. 
static -chpl_comm_nb_handle_t rmaPutFn_msgOrd(void* myAddr, void* mrDesc, - c_nodeid_t node, - uint64_t mrRaddr, uint64_t mrKey, - size_t size, - chpl_bool blocking, - struct perTxCtxInfo_t* tcip) { +void rmaPutFn_msgOrd(void* myAddr, void* mrDesc, + c_nodeid_t node, + uint64_t mrRaddr, uint64_t mrKey, + size_t size, + chpl_bool blocking, + chpl_atomic_bool *done, + struct perTxCtxInfo_t* tcip) { uint64_t flags = 0; chpl_atomic_bool txnDone; @@ -6106,10 +6257,11 @@ chpl_comm_nb_handle_t rmaPutFn_msgOrd(void* myAddr, void* mrDesc, // and we have a bound tx context so we can delay forcing the // memory visibility until later. flags = FI_INJECT; - } else { - blocking = true; } - ctx = TX_CTX_INIT(tcip, blocking, &txnDone); + if (done == NULL) { + done = &txnDone; + } + ctx = TX_CTX_INIT(tcip, blocking, done); (void) wrap_fi_writemsg(myAddr, mrDesc, node, mrRaddr, mrKey, size, ctx, flags, tcip); @@ -6123,8 +6275,6 @@ chpl_comm_nb_handle_t rmaPutFn_msgOrd(void* myAddr, void* mrDesc, } else { mcmReleaseOneNode(node, tcip, "PUT"); } - - return NULL; } @@ -6132,19 +6282,24 @@ chpl_comm_nb_handle_t rmaPutFn_msgOrd(void* myAddr, void* mrDesc, // Implements ofi_put() when MCM mode is delivery complete. 
// static -chpl_comm_nb_handle_t rmaPutFn_dlvrCmplt(void* myAddr, void* mrDesc, - c_nodeid_t node, - uint64_t mrRaddr, uint64_t mrKey, - size_t size, - chpl_bool blocking, - struct perTxCtxInfo_t* tcip) { +void rmaPutFn_dlvrCmplt(void* myAddr, void* mrDesc, + c_nodeid_t node, + uint64_t mrRaddr, uint64_t mrKey, + size_t size, + chpl_bool blocking, + chpl_atomic_bool *done, + struct perTxCtxInfo_t* tcip) { chpl_atomic_bool txnDone; - void *ctx = TX_CTX_INIT(tcip, true /*blocking*/, &txnDone); + if (done == NULL) { + done = &txnDone; + } + void *ctx = TX_CTX_INIT(tcip, blocking, done); (void) wrap_fi_write(myAddr, mrDesc, node, mrRaddr, mrKey, size, ctx, tcip); - waitForTxnComplete(tcip, ctx); - txCtxCleanup(ctx); - return NULL; + if (blocking) { + waitForTxnComplete(tcip, ctx); + txCtxCleanup(ctx); + } } @@ -6193,8 +6348,8 @@ ssize_t wrap_fi_writemsg(const void* addr, void* mrDesc, } DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, "tx write msg: %d:%#" PRIx64 " <= %p, size %zd, ctx %p, " - "flags %#" PRIx64, - (int) node, mrRaddr, addr, size, ctx, flags); + "flags %#" PRIx64 " tcip %p", + (int) node, mrRaddr, addr, size, ctx, flags, tcip); OFI_RIDE_OUT_EAGAIN(tcip, fi_writemsg(tcip->txCtx, &msg, flags)); tcip->numTxnsOut++; tcip->numTxnsSent++; @@ -6375,6 +6530,7 @@ chpl_comm_nb_handle_t ofi_get(void* addr, c_nodeid_t node, if (mrGetKey(&mrKey, &mrRaddr, node, raddr, size)) { struct perTxCtxInfo_t* tcip; CHK_TRUE((tcip = tciAlloc()) != NULL); + // TODO: Why is this necessary? waitForCQSpace(tcip, 1); void* mrDesc; From 4ba2770446315b28d21692cc912c5aa86c861a94 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 18 Sep 2024 06:39:27 -0700 Subject: [PATCH 035/107] Added comments Signed-off-by: John H. 
Hartman --- runtime/src/comm/ofi/comm-ofi.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 56fd205c8900..8eda55eb45ef 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -5453,7 +5453,15 @@ void amCheckLiveness(void) { // Interface: RMA // -// OFI-specific non-blocking handle implementation +// OFI-specific non-blocking handle implementation + +// Non-blocking operations require bound endpoints, to avoid having one thread +// with a pending operation while the endpoint is in use by a different +// thread. Since we assume bound endpoints are the norm, it's easiest to just +// disallow non-bound endpoints. This allows the "completed" flag to be a +// simple boolean. The "complete" flags for the sub-operations are booleans +// because the lower-level code that uses them does not assume bound +// endpoints. typedef struct chpl_comm_ofi_nb_handle_t { chpl_bool completed; // operation has completed @@ -5461,6 +5469,18 @@ typedef struct chpl_comm_ofi_nb_handle_t { chpl_atomic_bool complete[1]; // flag for sub-operation completion } chpl_comm_ofi_nb_handle_t; +/* + * chpl_comm_put_nb + * + * Non-blocking PUT. The PUT may complete after this function returns. Returns + * a handle that can be used to wait for and check the status of the PUT. The + * handle may be NULL, in which case the PUT has already completed. The + * memory buffer must not be modified before the PUT completes. Completion + * indicates that subsequent PUTs to the same memory will occur after the + * completed PUT; it does not mean that the results of the PUT are visible in + * memory (see the README.md for details). Concurrent non-blocking PUTs may + * occur in any order. 
+ */ chpl_comm_nb_handle_t chpl_comm_put_nb(void* addr, c_nodeid_t node, void* raddr, size_t size, int32_t commID, int ln, int32_t fn) { @@ -5566,7 +5586,6 @@ static chpl_bool check_complete(chpl_comm_nb_handle_t* h, size_t nhandles, if (handleComplete) { completed = true; handle->completed = true; - // break here when one handle completes instead of checking them all? } } DBG_PRINTF(DBG_RMA, "check_complete blocking %s", blocking ? "true" : "false"); From 132677df14f6e8d51d03f8b9d078be04c89761bd Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 18 Sep 2024 06:48:11 -0700 Subject: [PATCH 036/107] Free non-blocking handle after operation completes Signed-off-by: John H. Hartman --- runtime/src/chpl-cache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/src/chpl-cache.c b/runtime/src/chpl-cache.c index 11acf41d4717..2503eafb1eb6 100644 --- a/runtime/src/chpl-cache.c +++ b/runtime/src/chpl-cache.c @@ -1967,6 +1967,7 @@ chpl_bool do_wait_for(struct rdcache_s* cache, cache_seqn_t sn) // Whether we waited above or not, if the first entry's event // is already complete, then remove it from the queue. if (chpl_comm_test_nb_complete(cache->pending[index])) { + chpl_comm_free_nb(cache->pending[index]); fifo_circleb_pop(&cache->pending_first_entry, &cache->pending_last_entry, cache->pending_len); From 11eccc12ca2b48290b32c7a8d77d8e720c79014f Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 18 Sep 2024 10:26:15 -0700 Subject: [PATCH 037/107] Cleanup Signed-off-by: John H. Hartman --- runtime/include/chpl-comm.h | 1 + runtime/src/comm/ofi/comm-ofi.c | 42 +++++++++------------------------ 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/runtime/include/chpl-comm.h b/runtime/include/chpl-comm.h index 2bb1e54ddc78..e24ef5d8647a 100644 --- a/runtime/include/chpl-comm.h +++ b/runtime/include/chpl-comm.h @@ -155,6 +155,7 @@ void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles); // detected. 
int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles); +// Free a handle returned by chpl_comm_*_nb. void chpl_comm_free_nb(chpl_comm_nb_handle_t* h); // Returns whether or not the passed wide address is known to be in diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 8eda55eb45ef..22d947ea4790 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -5455,13 +5455,13 @@ void amCheckLiveness(void) { // OFI-specific non-blocking handle implementation -// Non-blocking operations require bound endpoints, to avoid having one thread -// with a pending operation while the endpoint is in use by a different -// thread. Since we assume bound endpoints are the norm, it's easiest to just -// disallow non-bound endpoints. This allows the "completed" flag to be a -// simple boolean. The "complete" flags for the sub-operations are booleans -// because the lower-level code that uses them does not assume bound -// endpoints. +// Non-blocking operations require bound endpoints, to avoid having a handle +// for a pending operation held by one thread, while the endpoint is in use +// by a different thread. Bound endpoints are the norm, so it's easiest to +// just disallow non-blocking operations on non-bound endpoints. This allows +// the "completed" flag to be a simple boolean. The "complete" flags for the +// sub-operations are booleans because the lower-level code that uses them +// does not assume bound endpoints. 
typedef struct chpl_comm_ofi_nb_handle_t { chpl_bool completed; // operation has completed @@ -5531,33 +5531,23 @@ chpl_comm_nb_handle_t chpl_comm_get_nb(void* addr, c_nodeid_t node, return NULL; } - int chpl_comm_test_nb_complete(chpl_comm_nb_handle_t h) { chpl_comm_ofi_nb_handle_t *handle = (chpl_comm_ofi_nb_handle_t *) h; chpl_comm_diags_incr(test_nb); - DBG_PRINTF(DBG_RMA, "chpl_comm_test_nb_complete %p", handle); - int completed = 1; - if (handle != NULL) { - completed = handle->completed; - } - DBG_PRINTF(DBG_RMA, "chpl_comm_test_nb_complete %p %s", handle, - completed ? "true" : "false"); - return completed; + return handle != NULL ? handle->completed : 1; } /* * check_complete * - * Returns true if a new handle completion is detected, false otherwise + * Returns true if a new handle completion is detected, false otherwise. * Ignores handles that have previously completed. If blocking is true and - * there are uncompleted handles this will not return until a new handle - * completion is detected. + * there are uncompleted handles this will not return until a new completion + * is detected. 
*/ static chpl_bool check_complete(chpl_comm_nb_handle_t* h, size_t nhandles, chpl_bool blocking) { - DBG_PRINTF(DBG_RMA, "check_complete"); - chpl_bool completed = false; // at least one new completion detected chpl_bool pending = false; // there is a handle with uncompleted operations struct perTxCtxInfo_t* tcip = NULL; @@ -5565,7 +5555,6 @@ static chpl_bool check_complete(chpl_comm_nb_handle_t* h, size_t nhandles, pending = false; for(size_t i = 0; i < nhandles; i++) { chpl_comm_ofi_nb_handle_t *handle = (chpl_comm_ofi_nb_handle_t *) h[i]; - DBG_PRINTF(DBG_RMA, "handle[%d] %p", i, handle); // ignore handles that have already completed // NULL handles have by definition already completed @@ -5588,9 +5577,6 @@ static chpl_bool check_complete(chpl_comm_nb_handle_t* h, size_t nhandles, handle->completed = true; } } - DBG_PRINTF(DBG_RMA, "check_complete blocking %s", blocking ? "true" : "false"); - DBG_PRINTF(DBG_RMA, "check_complete completed %s", completed ? "true" : "false"); - DBG_PRINTF(DBG_RMA, "check_complete pending %s", pending ? "true" : "false"); if (!blocking || completed || !pending) { break; } @@ -5598,30 +5584,24 @@ static chpl_bool check_complete(chpl_comm_nb_handle_t* h, size_t nhandles, if (tcip == NULL) { CHK_TRUE((tcip = tciAlloc()) != NULL); } - DBG_PRINTF(DBG_RMA, "check_complete yielding tcip %p", tcip); sched_yield(); (*tcip->ensureProgressFn)(tcip); } if (tcip) { tciFree(tcip); } - DBG_PRINTF(DBG_RMA, "check_complete returning %s", completed ? 
- "true" : "false"); return completed; } void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { chpl_comm_diags_incr(wait_nb); - DBG_PRINTF(DBG_RMA, "chpl_comm_wait_nb_some"); - (void) check_complete(h, nhandles, true /*blocking*/); } int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { chpl_comm_diags_incr(try_nb); - DBG_PRINTF(DBG_RMA, "chpl_comm_try_nb_some"); return check_complete(h, nhandles, false /*blocking*/); } From a77c004441a347093e98f1d427d04a73be5041dd Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Thu, 19 Sep 2024 14:18:34 -0700 Subject: [PATCH 038/107] Rewrote PUT logic Rewrote PUT logic so that low-level functions are non-blocking, and a blocking PUT is implemented by initiating a non-blocking PUT and waiting for it to complete. This simplifies the implementation and avoids code duplication. Signed-off-by: John H. Hartman --- runtime/include/chpl-comm.h | 2 +- runtime/src/comm/ofi/comm-ofi.c | 440 ++++++++++++++++---------------- 2 files changed, 225 insertions(+), 217 deletions(-) diff --git a/runtime/include/chpl-comm.h b/runtime/include/chpl-comm.h index e24ef5d8647a..07a4037e2c23 100644 --- a/runtime/include/chpl-comm.h +++ b/runtime/include/chpl-comm.h @@ -156,7 +156,7 @@ void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles); int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles); // Free a handle returned by chpl_comm_*_nb. -void chpl_comm_free_nb(chpl_comm_nb_handle_t* h); +void chpl_comm_free_nb(chpl_comm_nb_handle_t h); // Returns whether or not the passed wide address is known to be in // a communicable memory region and known to be readable. 
That is, diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 22d947ea4790..f9c03e98a26d 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -359,6 +359,19 @@ static const char* mcmModeNames[] = { "undefined", static bool cxiHybridMRMode = false; + +// +// Non-blocking handle +// +typedef struct nb_handle { + chpl_taskID_t id; // task that created the handle + chpl_bool reported; // operation has been reported as complete + chpl_atomic_bool complete; // operation has completed + struct nb_handle *next; +} nb_handle; + +typedef nb_handle* nb_handle_t; + //////////////////////////////////////// // // Forward decls @@ -369,8 +382,8 @@ static struct perTxCtxInfo_t* tciAllocForAmHandler(void); static chpl_bool tciAllocTabEntry(struct perTxCtxInfo_t*); static void tciFree(struct perTxCtxInfo_t*); static void waitForCQSpace(struct perTxCtxInfo_t*, size_t); -static chpl_comm_nb_handle_t ofi_put(const void*, c_nodeid_t, void*, size_t, - chpl_bool); +static void ofi_put(const void*, c_nodeid_t, void*, size_t); +static nb_handle_t ofi_put_nb(nb_handle_t, const void*, c_nodeid_t, void*, size_t); static void ofi_put_lowLevel(const void*, void*, c_nodeid_t, uint64_t, uint64_t, size_t, void*, uint64_t, struct perTxCtxInfo_t*); @@ -379,6 +392,8 @@ static chpl_comm_nb_handle_t ofi_get(void*, c_nodeid_t, void*, size_t); static void ofi_get_lowLevel(void*, void*, c_nodeid_t, uint64_t, uint64_t, size_t, void*, uint64_t, struct perTxCtxInfo_t*); +static chpl_bool check_complete(nb_handle_t*, size_t, chpl_bool); + static void do_remote_get_buff(void*, c_nodeid_t, void*, size_t); static void do_remote_amo_nf_buff(void*, c_nodeid_t, void*, size_t, enum fi_op, enum fi_datatype); @@ -3322,7 +3337,7 @@ void chpl_comm_broadcast_private(int id, size_t size) { for (int i = 0; i < chpl_numNodes; i++) { if (i != chpl_nodeID) { (void) ofi_put(chpl_rt_priv_bcast_tab[id], i, - chplPrivBcastTabMap[i][id], size, true /*blocking*/); + 
chplPrivBcastTabMap[i][id], size); } } } @@ -4170,7 +4185,7 @@ static void am_debugPrep(amRequest_t*); static void amRequestExecOn(c_nodeid_t, c_sublocid_t, chpl_fn_int_t, chpl_comm_on_bundle_t*, size_t, chpl_bool, chpl_bool); -static void amRequestRmaPut(c_nodeid_t, void*, void*, size_t, chpl_bool); +static void amRequestRmaPut(c_nodeid_t, void*, void*, size_t); static void amRequestRmaGet(c_nodeid_t, void*, void*, size_t); static void amRequestAMO(c_nodeid_t, void*, const void*, const void*, void*, int, enum fi_datatype, size_t); @@ -4317,14 +4332,11 @@ void amRequestExecOn(c_nodeid_t node, c_sublocid_t subloc, /* * amRequestRmaPut * - * Performs a PUT by sending an active message to the remote node that - * causes it to perform GET. This is currently a blocking operation - * so the "blocking" argument is unused. When this operation returns - * the data have been successfully transmitted to the remote node. + * Performs a PUT by sending an active message to the remote node that causes + * it to perform a GET. This operation returns when the GET has completed. 
*/ static inline -void amRequestRmaPut(c_nodeid_t node, void* addr, void* raddr, size_t size, - chpl_bool blocking /* unused */) { +void amRequestRmaPut(c_nodeid_t node, void* addr, void* raddr, size_t size) { assert(!isAmHandler); retireDelayedAmDone(false /*taskIsEnding*/); @@ -5288,8 +5300,7 @@ void amWrapPut(struct taskArg_RMA_t* tsk_rma) { DBG_PRINTF(DBG_AM | DBG_AM_RECV, "%s", am_reqStartStr((amRequest_t*) rma)); CHK_TRUE(mrGetKey(NULL, NULL, rma->b.node, rma->raddr, rma->size)); - (void) ofi_put(rma->addr, rma->b.node, rma->raddr, rma->size, - true /*blocking*/); + (void) ofi_put(rma->addr, rma->b.node, rma->raddr, rma->size); // // Note: the RMA bytes must be visible in target memory before the @@ -5455,6 +5466,8 @@ void amCheckLiveness(void) { // OFI-specific non-blocking handle implementation +// XXX update + // Non-blocking operations require bound endpoints, to avoid having a handle // for a pending operation held by one thread, while the endpoint is in use // by a different thread. Bound endpoints are the norm, so it's easiest to @@ -5463,35 +5476,32 @@ void amCheckLiveness(void) { // sub-operations are booleans because the lower-level code that uses them // does not assume bound endpoints. -typedef struct chpl_comm_ofi_nb_handle_t { - chpl_bool completed; // operation has completed - size_t count; // number of sub-operations - chpl_atomic_bool complete[1]; // flag for sub-operation completion -} chpl_comm_ofi_nb_handle_t; +static inline +void nb_handle_init(nb_handle_t h) { + h->id = chpl_task_getId(); + h->reported = false; + atomic_init_bool(&h->complete, false); + h->next = NULL; +} + +static inline +void nb_handle_destroy(nb_handle_t h) { + atomic_destroy_bool(&h->complete); +} /* - * chpl_comm_put_nb + * put_prologue * - * Non-blocking PUT. The PUT may complete after this function returns. Returns - * a handle that can be used to wait for and check the status of the PUT. The - * handle may be NULL, in which case the PUT has already completed. 
The - * memory buffer must not be modified before the PUT completes. Completion - * indicates that subsequent PUTs to the same memory will occur after the - * completed PUT; it does not mean that the results of the PUT are visible in - * memory (see the README.md for details). Concurrent non-blocking PUTs may - * occur in any order. + * Common prologue operations for chpl_comm_put and chpl_comm_put_nb. Returns + * true if the PUT should proceed, false if it was handled in this function. */ -chpl_comm_nb_handle_t chpl_comm_put_nb(void* addr, c_nodeid_t node, - void* raddr, size_t size, - int32_t commID, int ln, int32_t fn) { - DBG_PRINTF(DBG_IFACE, - "%s(%p, %d, %p, %zd, %d)", __func__, - addr, (int) node, raddr, size, (int) commID); +static inline +chpl_bool put_prologue(void* addr, c_nodeid_t node, void* raddr, size_t size, + int32_t commID, int ln, int32_t fn) { + chpl_bool proceed = false; retireDelayedAmDone(false /*taskIsEnding*/); - chpl_comm_ofi_nb_handle_t *handle = NULL; - // // Sanity checks, self-communication. // @@ -5517,12 +5527,36 @@ chpl_comm_nb_handle_t chpl_comm_put_nb(void* addr, c_nodeid_t node, chpl_comm_diags_verbose_rdma("put", node, size, ln, fn, commID); chpl_comm_diags_incr(put); - - handle = ofi_put(addr, node, raddr, size, false /*blocking*/); + proceed = true; done: - return handle; + return proceed; } +/* + * chpl_comm_put_nb + * + * Non-blocking PUT. The PUT may complete after this function returns. Returns + * a handle that can be used to wait for and check the status of the PUT. The + * handle may be NULL, in which case the PUT has already completed. The + * memory buffer must not be modified before the PUT completes. Completion + * indicates that subsequent PUTs to the same memory will occur after the + * completed PUT; it does not mean that the results of the PUT are visible in + * memory (see the README.md for details). Concurrent non-blocking PUTs may + * occur in any order. 
+ */ +chpl_comm_nb_handle_t chpl_comm_put_nb(void* addr, c_nodeid_t node, + void* raddr, size_t size, + int32_t commID, int ln, int32_t fn) { + DBG_PRINTF(DBG_IFACE, + "%s(%p, %d, %p, %zd, %d)", __func__, + addr, (int) node, raddr, size, (int) commID); + + nb_handle_t handle = NULL; + if (put_prologue(addr, node, raddr, size, commID, ln, fn)) { + handle = ofi_put_nb(handle, addr, node, raddr, size); + } + return (chpl_comm_nb_handle_t) handle; +} chpl_comm_nb_handle_t chpl_comm_get_nb(void* addr, c_nodeid_t node, void* raddr, size_t size, @@ -5531,50 +5565,77 @@ chpl_comm_nb_handle_t chpl_comm_get_nb(void* addr, c_nodeid_t node, return NULL; } + +static inline +int test_nb_complete(nb_handle_t handle) { + return handle != NULL ? handle->reported : 1; +} + +static inline +void wait_nb_some(nb_handle_t *handles, size_t nhandles) { + (void) check_complete(handles, nhandles, true /*blocking*/); +} + +static inline +int try_nb_some(nb_handle_t *handles, size_t nhandles) { + return check_complete(handles, nhandles, false /*blocking*/); +} + int chpl_comm_test_nb_complete(chpl_comm_nb_handle_t h) { - chpl_comm_ofi_nb_handle_t *handle = (chpl_comm_ofi_nb_handle_t *) h; chpl_comm_diags_incr(test_nb); - return handle != NULL ? handle->completed : 1; + return test_nb_complete((nb_handle_t) h); } /* * check_complete * * Returns true if a new handle completion is detected, false otherwise. - * Ignores handles that have previously completed. If blocking is true and - * there are uncompleted handles this will not return until a new completion - * is detected. + * Ignores handles that have previously completed (h->reported == true). If + * blocking is true and there are uncompleted handles this will not return + * until a new completion is detected. 
*/ -static chpl_bool check_complete(chpl_comm_nb_handle_t* h, size_t nhandles, +static +chpl_bool check_complete(nb_handle_t *handles, size_t nhandles, chpl_bool blocking) { chpl_bool completed = false; // at least one new completion detected - chpl_bool pending = false; // there is a handle with uncompleted operations + chpl_bool pending = false; // there is an uncompleted handle + if ((handles == NULL) || (nhandles == 0)) { + goto done; + } struct perTxCtxInfo_t* tcip = NULL; while (true) { pending = false; for(size_t i = 0; i < nhandles; i++) { - chpl_comm_ofi_nb_handle_t *handle = (chpl_comm_ofi_nb_handle_t *) h[i]; - + nb_handle_t handle = handles[i]; // ignore handles that have already completed // NULL handles have by definition already completed - if ((handle == NULL) || handle->completed) { + if ((handle == NULL) || handle->reported) { continue; } + if (handle->id != chpl_task_getId()) { + char msg[128]; + char task1[32]; + char task2[32]; + snprintf(msg, sizeof(msg), + "Task %s did not create non-blocking handle (created by %s)", + chpl_task_idToString(task1, sizeof(task1), chpl_task_getId()), + chpl_task_idToString(task2, sizeof(task2), handle->id)); + } pending = true; - // determine if this handle is now complete by checking the status - // of its individual operations - chpl_bool handleComplete = true; - for (size_t j = 0; j < handle->count; j++) { - if(!atomic_load_explicit_bool(&handle->complete[j], + // determine if this handle is now complete by checking the completion + // status of its operations + chpl_bool allComplete = true; + for (nb_handle_t p = handle; p != NULL; p = p->next) { + if(!atomic_load_explicit_bool(&p->complete, chpl_memory_order_acquire)) { - handleComplete = false; + allComplete = false; break; } } - if (handleComplete) { + if (allComplete) { completed = true; - handle->completed = true; + handle->reported = true; } } if (!blocking || completed || !pending) { @@ -5590,24 +5651,27 @@ static chpl_bool 
check_complete(chpl_comm_nb_handle_t* h, size_t nhandles, if (tcip) { tciFree(tcip); } +done: return completed; } void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { chpl_comm_diags_incr(wait_nb); - - (void) check_complete(h, nhandles, true /*blocking*/); + wait_nb_some((nb_handle_t *) h, nhandles); } int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { chpl_comm_diags_incr(try_nb); - - return check_complete(h, nhandles, false /*blocking*/); + return try_nb_some((nb_handle_t *) h, nhandles); } -void chpl_comm_free_nb(chpl_comm_nb_handle_t* h) { - if (h != NULL) { - chpl_mem_free(h, 0, 0); +void chpl_comm_free_nb(chpl_comm_nb_handle_t h) { + nb_handle_t handle = (nb_handle_t) h; + nb_handle_t next; + for (; handle != NULL; handle = next) { + next = handle->next; + nb_handle_destroy(handle); + chpl_mem_free(handle, 0, 0); } } @@ -5617,38 +5681,11 @@ void chpl_comm_put(void* addr, c_nodeid_t node, void* raddr, "%s(%p, %d, %p, %zd, %d)", __func__, addr, (int) node, raddr, size, (int) commID); - retireDelayedAmDone(false /*taskIsEnding*/); - - // - // Sanity checks, self-communication. 
- // - CHK_TRUE(addr != NULL); - CHK_TRUE(raddr != NULL); - - if (size == 0) { - return; - } - - if (node == chpl_nodeID) { - memmove(raddr, addr, size); - return; - } - - // Communications callback support - if (chpl_comm_have_callbacks(chpl_comm_cb_event_kind_put)) { - chpl_comm_cb_info_t cb_data = - {chpl_comm_cb_event_kind_put, chpl_nodeID, node, - .iu.comm={addr, raddr, size, commID, ln, fn}}; - chpl_comm_do_callbacks (&cb_data); + if (put_prologue(addr, node, raddr, size, commID, ln, fn)) { + ofi_put(addr, node, raddr, size); } - - chpl_comm_diags_verbose_rdma("put", node, size, ln, fn, commID); - chpl_comm_diags_incr(put); - - (void) ofi_put(addr, node, raddr, size, true /*blocking*/); } - void chpl_comm_get(void* addr, int32_t node, void* raddr, size_t size, int32_t commID, int ln, int32_t fn) { DBG_PRINTF(DBG_IFACE, @@ -6000,6 +6037,7 @@ void tciFree(struct perTxCtxInfo_t* tcip) { // if (!tcip->bound) { DBG_PRINTF(DBG_TCIPS, "free tciTab[%td]", tcip - tciTab); + forceMemFxVisAllNodes(true, true, -1, tcip); atomic_store_bool(&tcip->allocated, false); } } @@ -6019,30 +6057,54 @@ void waitForCQSpace(struct perTxCtxInfo_t* tcip, size_t len) { } } -typedef void (rmaPutFn_t)(void* myAddr, void* mrDesc, +typedef void (rmaPutFn_t)(nb_handle_t handle, void* myAddr, void* mrDesc, c_nodeid_t node, uint64_t mrRaddr, uint64_t mrKey, size_t size, - chpl_bool blocking, - chpl_atomic_bool *done, struct perTxCtxInfo_t* tcip); static rmaPutFn_t rmaPutFn_selector; +/* + * ofi_put + * + * Blocking PUT. Implemented by initiating a non-blocking PUT and waiting for + * it to complete. 
+ */ + static inline -chpl_comm_nb_handle_t ofi_put(const void* addr, c_nodeid_t node, - void* raddr, size_t size, chpl_bool blocking) { +void ofi_put(const void* addr, c_nodeid_t node, void* raddr, size_t size) { + + // Allocate the handle on the stack to avoid malloc overhead + nb_handle handle_struct; + nb_handle_t handle = &handle_struct; + nb_handle_init(handle); + + handle = ofi_put_nb(handle, addr, node, raddr, size); + do { + wait_nb_some(&handle, 1); + } while(!test_nb_complete(handle)); + nb_handle_destroy(handle); +} + +/* + * ofi_put_nb + * + * Non-blocking PUT. Returns a handle that can be used to test the completion + * status of the PUT and wait for it to complete. If the PUT is too large + * for the fabric it is broken into multiple PUTs. + * + */ +static +nb_handle_t ofi_put_nb(nb_handle_t handle, const void* addr, c_nodeid_t node, + void* raddr, size_t size) { char *src = (char *) addr; char *dest = (char *) raddr; - chpl_comm_ofi_nb_handle_t *handle = NULL; + nb_handle_t prev = NULL; + nb_handle_t first = NULL; - // Determine how many operations the PUT requires based on the provider's - // maximum message size - - int ops = (size + ofi_info->ep_attr->max_msg_size - 1) / - ofi_info->ep_attr->max_msg_size; - if (ops > 1) { + if (size > ofi_info->ep_attr->max_msg_size) { DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, "splitting large PUT %d:%p <= %p, size %zd", (int) node, raddr, addr, size); @@ -6050,38 +6112,34 @@ chpl_comm_nb_handle_t ofi_put(const void* addr, c_nodeid_t node, struct perTxCtxInfo_t* tcip = NULL; CHK_TRUE((tcip = tciAlloc()) != NULL); - if (!blocking && !tcip->bound) { - // Non-blocking operations require bound endpoints - blocking = true; - } - - if (!blocking) { - // Allocate a handle large enough to hold one "done" flags per op - int handleSize = sizeof(chpl_comm_ofi_nb_handle_t) + - ((ops - 1) * sizeof(chpl_atomic_bool)); - handle = chpl_mem_alloc(handleSize, CHPL_RT_MD_COMM_NB_HANDLE, 0, 0); - - handle->count = ops; - handle->completed 
= false; - for (size_t i = 0; i < ops; i++) { - atomic_init_bool(&handle->complete[i], false); - } - } size_t chunkSize = ofi_info->ep_attr->max_msg_size; size_t offset = 0; - for (int i = 0; i < ops; i++) { + while (offset < size) { if (chunkSize > size - offset) { chunkSize = size - offset; } DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, - "PUT %d:%p <= %p, size %zd, %s", - (int) node, dest, src, chunkSize, - blocking ? "blocking" : "non-blocking"); + "PUT %d:%p <= %p, size %zd", + (int) node, dest, src, chunkSize); + + if (handle == NULL) { + handle = chpl_mem_alloc(sizeof(*handle), + CHPL_RT_MD_COMM_NB_HANDLE, 0, 0); + nb_handle_init(handle); + } + // Make a linked-list of handles + if (prev != NULL) { + prev->next = handle; + } + // Keep track of the first handle so we can return it. + if (first == NULL) { + first = handle; + } // - // If the remote address is directly accessible do an RMA from this - // side; otherwise do the opposite RMA from the other side. + // If the remote address is directly accessible do a PUT RMA from this + // side; otherwise do a GET from the other side. // uint64_t mrKey; uint64_t mrRaddr; @@ -6095,27 +6153,24 @@ chpl_comm_nb_handle_t ofi_put(const void* addr, c_nodeid_t node, void* myAddr = mrLocalizeSource(&mrDesc, (const void *) src, chunkSize, "PUT src"); - chpl_atomic_bool *done = blocking ? 
NULL : &handle->complete[i]; - rmaPutFn_selector(myAddr, mrDesc, node, mrRaddr, mrKey, chunkSize, - blocking, done, tcip); + rmaPutFn_selector(handle, myAddr, mrDesc, node, mrRaddr, + mrKey, chunkSize, tcip); mrUnLocalizeSource(myAddr, src); } else { - amRequestRmaPut(node, (void *) src, (void *) dest, size, blocking); - // amRequestRmaPut is currently a blocking operation, so mark - // the operation as complete - if (!blocking) { - atomic_init_bool(&(handle->complete[i]), true); - } + amRequestRmaPut(node, (void *) src, (void *) dest, size); + atomic_store_bool(&handle->complete, true); } offset += chunkSize; src += chunkSize; dest += chunkSize; + prev = handle; + handle = NULL; } tciFree(tcip); DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, - "PUT %d:%p <= %p, handle %p", handle); - return handle; + "PUT %d:%p <= %p, handle %p", first); + return first; } @@ -6124,26 +6179,24 @@ static rmaPutFn_t rmaPutFn_msgOrd; static rmaPutFn_t rmaPutFn_dlvrCmplt; static inline -void rmaPutFn_selector(void* myAddr, void* mrDesc, +void rmaPutFn_selector(nb_handle_t handle, void* myAddr, void* mrDesc, c_nodeid_t node, uint64_t mrRaddr, uint64_t mrKey, size_t size, - chpl_bool blocking, - chpl_atomic_bool *done, struct perTxCtxInfo_t* tcip) { switch (mcmMode) { case mcmm_msgOrdFence: - rmaPutFn_msgOrdFence(myAddr, mrDesc, node, mrRaddr, mrKey, size, - blocking, done, tcip); + rmaPutFn_msgOrdFence(handle, myAddr, mrDesc, node, mrRaddr, mrKey, size, + tcip); break; case mcmm_msgOrd: - rmaPutFn_msgOrd(myAddr, mrDesc, node, mrRaddr, mrKey, size, - blocking, done, tcip); + rmaPutFn_msgOrd(handle, myAddr, mrDesc, node, mrRaddr, mrKey, size, + tcip); break; case mcmm_dlvrCmplt: - rmaPutFn_dlvrCmplt(myAddr, mrDesc, node, mrRaddr, mrKey, size, - blocking, done, tcip); + rmaPutFn_dlvrCmplt(handle, myAddr, mrDesc, node, mrRaddr, mrKey, size, + tcip); break; default: INTERNAL_ERROR_V("unexpected mcmMode %d", mcmMode); @@ -6166,26 +6219,22 @@ static ssize_t wrap_fi_writemsg(const void* addr, void* 
mrDesc, // -// Implements ofi_put() when MCM mode is message ordering with fences. +// Implements ofi_put_nb() when MCM mode is message ordering with fences. // static -void rmaPutFn_msgOrdFence(void* myAddr, void* mrDesc, +void rmaPutFn_msgOrdFence(nb_handle_t handle, void* myAddr, void* mrDesc, c_nodeid_t node, uint64_t mrRaddr, uint64_t mrKey, size_t size, - chpl_bool blocking, - chpl_atomic_bool *done, struct perTxCtxInfo_t* tcip) { - uint64_t flags = 0; - chpl_atomic_bool txnDone; - void *ctx; + uint64_t flags = 0; if (tcip->bound && size <= ofi_info->tx_attr->inject_size - && !blocking && envInjectRMA) { + && envInjectRMA) { // // Special case: write injection has the least latency. We can use it if - // this PUT is non-blocking, its size doesn't exceed the injection size + // this PUT doesn't exceed the injection size // limit, and we have a bound tx context so we can delay forcing the // memory visibility until later. // @@ -6195,110 +6244,69 @@ void rmaPutFn_msgOrdFence(void* myAddr, void* mrDesc, // // Special case: If our last operation was an AMO then we need to do a // fenced PUT to force the AMO to be visible before this PUT. + // TODO: this logic is a bit screwed-up. FI_FENCE by itself doesn't + // force the AMO to be visible, it just ensures that the PUT cannot pass + // the AMO. We need to do something to make it visible, and we need + // to clear the bitmap so that we don't keep fencing PUTs until something + // else makes it visible. 
// flags |= FI_FENCE; } - if (done == NULL) { - done = &txnDone; - } - ctx = txCtxInit(tcip, __LINE__, done); + void *ctx = txCtxInit(tcip, __LINE__, &handle->complete); (void) wrap_fi_writemsg(myAddr, mrDesc, node, mrRaddr, mrKey, size, ctx, flags, tcip); - if (blocking) { - waitForTxnComplete(tcip, ctx); - txCtxCleanup(ctx); - } - // // When using message ordering we have to do something after the PUT // to force it into visibility, and on the same tx context as the PUT // itself because libfabric message ordering is specific to endpoint - // pairs. With a bound tx context we can do it later, when needed. - // Otherwise we have to do it here, before we release the tx context. + // pairs. Indicate that there is dangling PUT to the remote node. // - if (tcip->bound) { - bitmapSet(tcip->putVisBitmap, node); - } else { - mcmReleaseOneNode(node, tcip, "PUT"); - } + bitmapSet(tcip->putVisBitmap, node); } // -// Implements ofi_put() when MCM mode is message ordering. +// Implements ofi_put_nb() when MCM mode is message ordering. // TODO: see comment for rmaPutFn_msgOrdFence. static -void rmaPutFn_msgOrd(void* myAddr, void* mrDesc, +void rmaPutFn_msgOrd(nb_handle_t handle, void* myAddr, void* mrDesc, c_nodeid_t node, uint64_t mrRaddr, uint64_t mrKey, size_t size, - chpl_bool blocking, - chpl_atomic_bool *done, struct perTxCtxInfo_t* tcip) { uint64_t flags = 0; - chpl_atomic_bool txnDone; - void *ctx; - // - // When using message ordering we have to do something after the PUT - // to force it into visibility, and on the same tx context as the PUT - // itself because libfabric message ordering is specific to endpoint - // pairs. With a bound tx context we can do it later, when needed. - // Otherwise we have to do it here, before we release the tx context. - // if (tcip->bound && size <= ofi_info->tx_attr->inject_size - && !blocking && envInjectRMA) { + && envInjectRMA) { // + // XXX update this // Special case: write injection has the least latency. 
We can use // that if this PUT's size doesn't exceed the injection size limit // and we have a bound tx context so we can delay forcing the // memory visibility until later. flags = FI_INJECT; } - if (done == NULL) { - done = &txnDone; - } - ctx = TX_CTX_INIT(tcip, blocking, done); + void *ctx = txCtxInit(tcip, __LINE__, &handle->complete); (void) wrap_fi_writemsg(myAddr, mrDesc, node, mrRaddr, mrKey, size, ctx, flags, tcip); - - if (blocking) { - waitForTxnComplete(tcip, ctx); - txCtxCleanup(ctx); - } - - if (tcip->bound) { - bitmapSet(tcip->putVisBitmap, node); - } else { - mcmReleaseOneNode(node, tcip, "PUT"); - } + bitmapSet(tcip->putVisBitmap, node); } // -// Implements ofi_put() when MCM mode is delivery complete. +// Implements ofi_put_nb() when MCM mode is delivery complete. // static -void rmaPutFn_dlvrCmplt(void* myAddr, void* mrDesc, +void rmaPutFn_dlvrCmplt(nb_handle_t handle, void* myAddr, void* mrDesc, c_nodeid_t node, uint64_t mrRaddr, uint64_t mrKey, size_t size, - chpl_bool blocking, - chpl_atomic_bool *done, struct perTxCtxInfo_t* tcip) { - chpl_atomic_bool txnDone; - if (done == NULL) { - done = &txnDone; - } - void *ctx = TX_CTX_INIT(tcip, blocking, done); + void *ctx = txCtxInit(tcip, __LINE__, &handle->complete); (void) wrap_fi_write(myAddr, mrDesc, node, mrRaddr, mrKey, size, ctx, tcip); - if (blocking) { - waitForTxnComplete(tcip, ctx); - txCtxCleanup(ctx); - } } @@ -6454,7 +6462,7 @@ void do_remote_put_buff(void* addr, c_nodeid_t node, void* raddr, if (size > MAX_UNORDERED_TRANS_SZ || !mrGetKey(&mrKey, &mrRaddr, node, raddr, size) || (info = task_local_buff_acquire(put_buff)) == NULL) { - (void) ofi_put(addr, node, raddr, size, true /*blocking*/); + (void) ofi_put(addr, node, raddr, size); return; } @@ -8300,7 +8308,7 @@ void chpl_comm_impl_barrier(const char *msg) { DBG_PRINTF(DBG_BARRIER, "BAR notify parent %d", (int) bar_parent); ofi_put(&one, bar_parent, (void*) &bar_infoMap[bar_parent]->child_notify[parChild], - sizeof(one), true 
/*blocking*/); + sizeof(one)); // // Wait for our parent locale to release us from the barrier. @@ -8327,7 +8335,7 @@ void chpl_comm_impl_barrier(const char *msg) { DBG_PRINTF(DBG_BARRIER, "BAR release child %d", (int) child); ofi_put(&one, child, (void*) &bar_infoMap[child]->parent_release, - sizeof(one), true /*blocking*/); + sizeof(one)); } } From 08861b52c30418b731e5562d05e8cd73a6c945bb Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Fri, 20 Sep 2024 11:01:36 -0700 Subject: [PATCH 039/107] Add environment variables for testing Allow specifying the maximum message size and maximum number of endpoings. These are intended primarily for testing. Signed-off-by: John H. Hartman --- runtime/src/comm/ofi/comm-ofi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index f9c03e98a26d..b1fadcd88736 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -1588,6 +1588,13 @@ struct fi_info* findProvInList(struct fi_info* info, if (best && (isInProvider("efa", best))) { best->tx_attr->inject_size = 0; } + + // Set the maximum message size if specified + + best->ep_attr->max_msg_size = + chpl_env_rt_get_int("COMM_OFI_MAX_MSG_SIZE", + best->ep_attr->max_msg_size); + return (best == NULL) ? NULL : fi_dupinfo(best); } @@ -1707,6 +1714,11 @@ chpl_bool canBindTxCtxs(struct fi_info* info) { // endpoints. Until that is fixed, assume it can create as many endpoints // as we need. size_t epCount = isInProvider("cxi", info) ? SIZE_MAX : dom_attr->ep_cnt; + + // Set the maximum number of endpoints if specified + + epCount = chpl_env_rt_get_int("COMM_OFI_MAX_ENDPOINTS", epCount); + size_t numWorkerTxCtxs = ((envPreferScalableTxEp && dom_attr->max_ep_tx_ctx > 1) ? dom_attr->max_ep_tx_ctx From 6baa0e0a1a1ced0eef7f26b9ef213ede0110a493 Mon Sep 17 00:00:00 2001 From: "John H. 
Hartman" Date: Fri, 20 Sep 2024 11:09:52 -0700 Subject: [PATCH 040/107] Free dynamically-allocated handles in ofi_put Also some code cleanup. Signed-off-by: John H. Hartman --- runtime/include/chpl-comm.h | 2 +- runtime/src/chpl-cache.c | 2 +- runtime/src/comm/ofi/comm-ofi.c | 28 ++++++++++++++++------------ 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/runtime/include/chpl-comm.h b/runtime/include/chpl-comm.h index 07a4037e2c23..332f361b26e6 100644 --- a/runtime/include/chpl-comm.h +++ b/runtime/include/chpl-comm.h @@ -156,7 +156,7 @@ void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles); int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles); // Free a handle returned by chpl_comm_*_nb. -void chpl_comm_free_nb(chpl_comm_nb_handle_t h); +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t h); // Returns whether or not the passed wide address is known to be in // a communicable memory region and known to be readable. That is, diff --git a/runtime/src/chpl-cache.c b/runtime/src/chpl-cache.c index 2503eafb1eb6..4ff6008409a3 100644 --- a/runtime/src/chpl-cache.c +++ b/runtime/src/chpl-cache.c @@ -1967,7 +1967,7 @@ chpl_bool do_wait_for(struct rdcache_s* cache, cache_seqn_t sn) // Whether we waited above or not, if the first entry's event // is already complete, then remove it from the queue. 
if (chpl_comm_test_nb_complete(cache->pending[index])) { - chpl_comm_free_nb(cache->pending[index]); + chpl_comm_free_nb_handle(cache->pending[index]); fifo_circleb_pop(&cache->pending_first_entry, &cache->pending_last_entry, cache->pending_len); diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index b1fadcd88736..1c0a2ec7a420 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -1717,7 +1717,7 @@ chpl_bool canBindTxCtxs(struct fi_info* info) { // Set the maximum number of endpoints if specified - epCount = chpl_env_rt_get_int("COMM_OFI_MAX_ENDPOINTS", epCount); + epCount = chpl_env_rt_get_int("COMM_OFI_EP_CNT", epCount); size_t numWorkerTxCtxs = ((envPreferScalableTxEp && dom_attr->max_ep_tx_ctx > 1) @@ -5583,16 +5583,6 @@ int test_nb_complete(nb_handle_t handle) { return handle != NULL ? handle->reported : 1; } -static inline -void wait_nb_some(nb_handle_t *handles, size_t nhandles) { - (void) check_complete(handles, nhandles, true /*blocking*/); -} - -static inline -int try_nb_some(nb_handle_t *handles, size_t nhandles) { - return check_complete(handles, nhandles, false /*blocking*/); -} - int chpl_comm_test_nb_complete(chpl_comm_nb_handle_t h) { chpl_comm_diags_incr(test_nb); return test_nb_complete((nb_handle_t) h); @@ -5667,17 +5657,27 @@ chpl_bool check_complete(nb_handle_t *handles, size_t nhandles, return completed; } +static inline +void wait_nb_some(nb_handle_t *handles, size_t nhandles) { + (void) check_complete(handles, nhandles, true /*blocking*/); +} + void chpl_comm_wait_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { chpl_comm_diags_incr(wait_nb); wait_nb_some((nb_handle_t *) h, nhandles); } +static inline +int try_nb_some(nb_handle_t *handles, size_t nhandles) { + return check_complete(handles, nhandles, false /*blocking*/); +} + int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) { chpl_comm_diags_incr(try_nb); return try_nb_some((nb_handle_t *) h, nhandles); } 
-void chpl_comm_free_nb(chpl_comm_nb_handle_t h) { +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t h) { nb_handle_t handle = (nb_handle_t) h; nb_handle_t next; for (; handle != NULL; handle = next) { @@ -6096,6 +6096,10 @@ void ofi_put(const void* addr, c_nodeid_t node, void* raddr, size_t size) { do { wait_nb_some(&handle, 1); } while(!test_nb_complete(handle)); + if (handle->next != NULL) { + // free any handles for sub-operations + chpl_comm_free_nb_handle(handle->next); + } nb_handle_destroy(handle); } From 2e0c3891b0e43b45a8b3774e0396303925f97585 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Fri, 20 Sep 2024 11:19:29 -0700 Subject: [PATCH 041/107] Change forceMemFxVisAllNodes to work on unbound endpoints We are now using this function to force visibility when an unbound endpoint is released, so it needs to work on unbound endpoints. Signed-off-by: John H. Hartman --- runtime/src/comm/ofi/comm-ofi.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 1c0a2ec7a420..ba0d9d78e1d4 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -7511,15 +7511,11 @@ void forceMemFxVisAllNodes(chpl_bool checkPuts, chpl_bool checkAmos, struct perTxCtxInfo_t* tcip) { // // Enforce MCM: make sure the memory effects of all the operations - // we've done so far, to any node, are actually visible. This is only - // needed if we have a bound tx context. Otherwise, we would have - // forced visibility at the time of the operation. + // we've done so far, to any node, are actually visible. // - if (tcip->bound) { - mcmReleaseAllNodes(checkPuts ? tcip->putVisBitmap : NULL, - checkAmos ? tcip->amoVisBitmap : NULL, - skipNode, tcip, "PUT and/or AMO"); - } + mcmReleaseAllNodes(checkPuts ? tcip->putVisBitmap : NULL, + checkAmos ? 
tcip->amoVisBitmap : NULL, + skipNode, tcip, "PUT and/or AMO"); } From 47e75cbdc0f6dd77cf6c55bc1b8f13f7b7029935 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Fri, 20 Sep 2024 11:33:18 -0700 Subject: [PATCH 042/107] Improved tci debugging output Signed-off-by: John H. Hartman --- runtime/src/comm/ofi/comm-ofi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index ba0d9d78e1d4..8d95f5c0c72f 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -5971,8 +5971,8 @@ struct perTxCtxInfo_t* tciAllocCommon(chpl_bool bindToAmHandler) { _ttcip->amoVisBitmap = bitmapAlloc(chpl_numNodes); } } - DBG_PRINTF(DBG_TCIPS, "alloc%s tciTab[%td]", - _ttcip->bound ? " bound" : "", _ttcip - tciTab); + DBG_PRINTF(DBG_TCIPS, "alloc%s tciTab[%td] %p", + _ttcip->bound ? " bound" : "", _ttcip - tciTab, _ttcip); return _ttcip; } @@ -6048,7 +6048,7 @@ void tciFree(struct perTxCtxInfo_t* tcip) { // Bound contexts stay bound. We only release non-bound ones. // if (!tcip->bound) { - DBG_PRINTF(DBG_TCIPS, "free tciTab[%td]", tcip - tciTab); + DBG_PRINTF(DBG_TCIPS, "free tciTab[%td] %p", tcip - tciTab, tcip); forceMemFxVisAllNodes(true, true, -1, tcip); atomic_store_bool(&tcip->allocated, false); } From 9f4079e3567a28e45d2c68a905b82328a28fb49c Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Fri, 20 Sep 2024 13:01:57 -0700 Subject: [PATCH 043/107] Allocate visibility bitmaps for unbound endpoints Operations to force visibility are deferred until the endpoint is released, which requires the visibility bitmaps. Signed-off-by: John H. 
Hartman --- runtime/src/comm/ofi/comm-ofi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 8d95f5c0c72f..cbefe62c573f 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -5966,6 +5966,8 @@ struct perTxCtxInfo_t* tciAllocCommon(chpl_bool bindToAmHandler) { if (bindToAmHandler || (tciTabBindTxCtxs && chpl_task_isFixedThread())) { _ttcip->bound = true; + } + if (mcmMode != mcmm_dlvrCmplt) { _ttcip->putVisBitmap = bitmapAlloc(chpl_numNodes); if ((ofi_info->caps & FI_ATOMIC) != 0) { _ttcip->amoVisBitmap = bitmapAlloc(chpl_numNodes); From 06e11af40a855f181a44fd8bc8b4a4eb20257735 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Fri, 20 Sep 2024 14:17:56 -0700 Subject: [PATCH 044/107] Fixed number of transmit contexts computation Fixed how the number of transmit contexts needed is computed, and added some comments. Signed-off-by: John H. Hartman --- runtime/src/comm/ofi/comm-ofi.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index cbefe62c573f..b0839499d6cd 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -1722,13 +1722,14 @@ chpl_bool canBindTxCtxs(struct fi_info* info) { size_t numWorkerTxCtxs = ((envPreferScalableTxEp && dom_attr->max_ep_tx_ctx > 1) ? dom_attr->max_ep_tx_ctx - : epCount) - - 1 - - numAmHandlers; + : epCount) - 1 - numAmHandlers; + if (envCommConcurrency > 0 && envCommConcurrency < numWorkerTxCtxs) { numWorkerTxCtxs = envCommConcurrency; } + numTxCtxs = numWorkerTxCtxs + 1 + numAmHandlers; + return fixedNumThreads <= numWorkerTxCtxs; } @@ -2526,12 +2527,30 @@ void init_ofiEp(void) { // // Compute numbers of transmit and receive contexts, and then create // the transmit context table. - // + // + // The logic here is a bit convoluted and can probably be cleaned up. 
See + // the tciTab comment above for more details. For non-scalable endpoints, + // we would like to have one transmit context (and therefore one endpoint) + // per worker thread, one per AM handler, and one for the process in + // general. That will allow us to bind worker threads and AM handlers to + // transmit contexts. If we can't get that many endpoints then transmit + // contexts will not be bound, which signficantly reduces performance. + // + // For scalable endpoints we only need one transmit endpoint with enough + // transmit contexts to bind them as described above. If max_ep_tx_ctx for + // the provider is less than that, then we won't use a scalable endpoint. + // If we are using a scalable endpoint we have to set tx_ctx_cnt to tell + // the provider how many transmit contexts we want per endpoint. + // + int desiredTxCtxs; tciTabBindTxCtxs = canBindTxCtxs(ofi_info); if (tciTabBindTxCtxs) { - numTxCtxs = chpl_task_getFixedNumThreads() + numAmHandlers + 1; + desiredTxCtxs = chpl_task_getFixedNumThreads() + numAmHandlers + 1; } else { - numTxCtxs = chpl_task_getMaxPar() + numAmHandlers + 1; + desiredTxCtxs = chpl_task_getMaxPar() + numAmHandlers + 1; + } + if (desiredTxCtxs < numTxCtxs) { + numTxCtxs = desiredTxCtxs; } DBG_PRINTF(DBG_CFG,"tciTabBindTxCtxs %s numTxCtxs %d numAmHandlers %d", tciTabBindTxCtxs ? "true" : "false", numTxCtxs, numAmHandlers); From 1872b30a47b4274def696303ec1dfdb51498c908 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Fri, 20 Sep 2024 14:40:20 -0700 Subject: [PATCH 045/107] Added tciAlloc call site debug info Signed-off-by: John H. 
Hartman --- runtime/src/comm/ofi/comm-ofi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index b0839499d6cd..81107a4ce937 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -74,6 +74,7 @@ #include #include + #include #ifndef MAP_HUGETLB // MAP_HUGETLB is not defined on all systems (e.g. MacOS) @@ -377,7 +378,8 @@ typedef nb_handle* nb_handle_t; // Forward decls // -static struct perTxCtxInfo_t* tciAlloc(void); +static struct perTxCtxInfo_t* tciAllocFunc(const char *, int); +#define tciAlloc() tciAllocFunc(__FILE__, __LINE__) static struct perTxCtxInfo_t* tciAllocForAmHandler(void); static chpl_bool tciAllocTabEntry(struct perTxCtxInfo_t*); static void tciFree(struct perTxCtxInfo_t*); @@ -5947,11 +5949,11 @@ static __thread struct perTxCtxInfo_t* _ttcip; static inline -struct perTxCtxInfo_t* tciAlloc(void) { +struct perTxCtxInfo_t* tciAllocFunc(const char *file, int line) { + DBG_PRINTF(DBG_TCIPS, "tciAlloc %s:%d]", file, line); return tciAllocCommon(false /*bindToAmHandler*/); } - static inline struct perTxCtxInfo_t* tciAllocForAmHandler(void) { return tciAllocCommon(true /*bindToAmHandler*/); From b07b36d74ef8f0130f66b4fbf4fe4d9d686ab602 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Mon, 23 Sep 2024 15:20:38 -0700 Subject: [PATCH 046/107] Change type of numTxCtxs and numRxCtxs to size_t Change type of numTxCtxs and numRxCtxs to size_t to match type of info->domain_attr->ep_cnt. Signed-off-by: John H. 
Hartman --- runtime/src/comm/ofi/comm-ofi.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 81107a4ce937..207b9871b02c 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -187,8 +187,8 @@ static chpl_bool envInjectAM; // env: inject AM messages static chpl_bool envUseDedicatedAmhCores; // env: use dedicated AM cores static const char* envExpectedProvider; // env: provider we should select -static int numTxCtxs; -static int numRxCtxs; +static size_t numTxCtxs; +static size_t numRxCtxs; struct perTxCtxInfo_t { chpl_atomic_bool allocated; // true: in use; false: available @@ -1231,15 +1231,15 @@ void init_ofi(void) { (tciTab[tciTabLen - 1].txCntr == NULL) ? "CQ" : "counter"); if (ofi_txEpScal != NULL) { DBG_PRINTF(DBG_CFG, - "per node config: 1 scalable tx ep + %d tx ctx%s (%d bound), " - "%d rx ctx%s", + "per node config: 1 scalable tx ep + %zu tx ctx%s (%d bound), " + "%zu rx ctx%s", numTxCtxs, (numTxCtxs == 1) ? "" : "s", tciTabBindTxCtxs ? chpl_task_getFixedNumThreads() : 0, numRxCtxs, (numRxCtxs == 1) ? "" : "s"); } else { DBG_PRINTF(DBG_CFG, - "per node config: %d regular tx ep+ctx%s (%d bound), " - "%d rx ctx%s", + "per node config: %zu regular tx ep+ctx%s (%d bound), " + "%zu rx ctx%s", numTxCtxs, (numTxCtxs == 1) ? "" : "s", tciTabBindTxCtxs ? chpl_task_getFixedNumThreads() : 0, numRxCtxs, (numRxCtxs == 1) ? "" : "s"); From 27f9312f139955daf153d7c2df7ff3a4ee2b2b20 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Mon, 23 Sep 2024 16:05:08 -0700 Subject: [PATCH 047/107] numTxCtxs is now of type size_t Signed-off-by: John H. 
Hartman --- runtime/src/comm/ofi/comm-ofi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 207b9871b02c..94ca09dfa5e1 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -8430,7 +8430,7 @@ void ofiErrReport(const char* exprStr, int retVal, const char* errStr) { "OFI error: %s: %s:\n" " The program has reached the limit on the number of files it can\n" " have open at once. This may be because the product of the number\n" - " of locales (%d) and the communication concurrency (roughly %d) is\n" + " of locales (%d) and the communication concurrency (roughly %zu) is\n" " a significant fraction of the open-file limit (%ld). If so,\n" " either setting CHPL_RT_COMM_CONCURRENCY to decrease communication\n" " concurrency or running on fewer locales may allow the program to\n" From e3bc955f523eaafc1762eda9eec2dd06c00e97d8 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Mon, 23 Sep 2024 16:31:03 -0700 Subject: [PATCH 048/107] Better comments Signed-off-by: John H. Hartman --- runtime/src/comm/ofi/comm-ofi.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 94ca09dfa5e1..f23f56290d5b 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -361,9 +361,22 @@ static const char* mcmModeNames[] = { "undefined", static bool cxiHybridMRMode = false; -// -// Non-blocking handle -// +// OFI-specific non-blocking handle implementation + +// This is defined here because it is used in the forward declarations below. +// The rountines to initialize and destroy handles, nb_handle_init and +// nb_handle_destroy appear in the RMA section later. 
The "id" is used to +// verify that the only the task that created the handle uses it -- this +// prevents multiple threads from simultaneously accessing the same transmit +// context if they are not bound to threads. The semantics of +// chpl_comm_test_nb_complete, chpl_comm_wait_nb_some, and chpl_comm_try_nb +// some require distinguishing newly-completed handles from those that that +// have previously commited. The "reported" field is used to distinguish +// between the two. The "complete" field is set when the operation completes. +// It is an atomic because the lower-level functions that set it require it. +// Operations that are too large for the underlying fabric are represented by +// a linked-list of handles. + typedef struct nb_handle { chpl_taskID_t id; // task that created the handle chpl_bool reported; // operation has been reported as complete @@ -5497,18 +5510,6 @@ void amCheckLiveness(void) { // Interface: RMA // -// OFI-specific non-blocking handle implementation - -// XXX update - -// Non-blocking operations require bound endpoints, to avoid having a handle -// for a pending operation held by one thread, while the endpoint is in use -// by a different thread. Bound endpoints are the norm, so it's easiest to -// just disallow non-blocking operations on non-bound endpoints. This allows -// the "completed" flag to be a simple boolean. The "complete" flags for the -// sub-operations are booleans because the lower-level code that uses them -// does not assume bound endpoints. - static inline void nb_handle_init(nb_handle_t h) { h->id = chpl_task_getId(); @@ -6320,7 +6321,6 @@ void rmaPutFn_msgOrd(nb_handle_t handle, void* myAddr, void* mrDesc, && size <= ofi_info->tx_attr->inject_size && envInjectRMA) { // - // XXX update this // Special case: write injection has the least latency. 
We can use // that if this PUT's size doesn't exceed the injection size limit // and we have a bound tx context so we can delay forcing the From 80d1bc1296760ce91c2b962258b855d6a09c10dc Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Tue, 24 Sep 2024 09:46:19 -0700 Subject: [PATCH 049/107] Remove trailing whitespace Signed-off-by: John H. Hartman --- runtime/src/comm/ofi/comm-ofi.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index f23f56290d5b..594e3e7b36c2 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -361,7 +361,7 @@ static const char* mcmModeNames[] = { "undefined", static bool cxiHybridMRMode = false; -// OFI-specific non-blocking handle implementation +// OFI-specific non-blocking handle implementation // This is defined here because it is used in the forward declarations below. // The rountines to initialize and destroy handles, nb_handle_init and @@ -2542,14 +2542,14 @@ void init_ofiEp(void) { // // Compute numbers of transmit and receive contexts, and then create // the transmit context table. - // + // // The logic here is a bit convoluted and can probably be cleaned up. See // the tciTab comment above for more details. For non-scalable endpoints, // we would like to have one transmit context (and therefore one endpoint) // per worker thread, one per AM handler, and one for the process in // general. That will allow us to bind worker threads and AM handlers to // transmit contexts. If we can't get that many endpoints then transmit - // contexts will not be bound, which signficantly reduces performance. + // contexts will not be bound, which signficantly reduces performance. // // For scalable endpoints we only need one transmit endpoint with enough // transmit contexts to bind them as described above. 
If max_ep_tx_ctx for @@ -4377,7 +4377,7 @@ void amRequestExecOn(c_nodeid_t node, c_sublocid_t subloc, /* * amRequestRmaPut - * + * * Performs a PUT by sending an active message to the remote node that causes * it to perform a GET. This operation returns when the GET has completed. */ @@ -5510,7 +5510,7 @@ void amCheckLiveness(void) { // Interface: RMA // -static inline +static inline void nb_handle_init(nb_handle_t h) { h->id = chpl_task_getId(); h->reported = false; @@ -5518,7 +5518,7 @@ void nb_handle_init(nb_handle_t h) { h->next = NULL; } -static inline +static inline void nb_handle_destroy(nb_handle_t h) { atomic_destroy_bool(&h->complete); } @@ -5585,7 +5585,7 @@ chpl_comm_nb_handle_t chpl_comm_put_nb(void* addr, c_nodeid_t node, "%s(%p, %d, %p, %zd, %d)", __func__, addr, (int) node, raddr, size, (int) commID); - nb_handle_t handle = NULL; + nb_handle_t handle = NULL; if (put_prologue(addr, node, raddr, size, commID, ln, fn)) { handle = ofi_put_nb(handle, addr, node, raddr, size); } @@ -5612,7 +5612,7 @@ int chpl_comm_test_nb_complete(chpl_comm_nb_handle_t h) { /* * check_complete - * + * * Returns true if a new handle completion is detected, false otherwise. * Ignores handles that have previously completed (h->reported == true). 
If * blocking is true and there are uncompleted handles this will not return @@ -5662,7 +5662,7 @@ chpl_bool check_complete(nb_handle_t *handles, size_t nhandles, handle->reported = true; } } - if (!blocking || completed || !pending) { + if (!blocking || completed || !pending) { break; } // progress the endpoint so handles can complete and then try again @@ -6110,7 +6110,7 @@ static rmaPutFn_t rmaPutFn_selector; static inline void ofi_put(const void* addr, c_nodeid_t node, void* raddr, size_t size) { - + // Allocate the handle on the stack to avoid malloc overhead nb_handle handle_struct; nb_handle_t handle = &handle_struct; @@ -6138,12 +6138,12 @@ void ofi_put(const void* addr, c_nodeid_t node, void* raddr, size_t size) { static nb_handle_t ofi_put_nb(nb_handle_t handle, const void* addr, c_nodeid_t node, void* raddr, size_t size) { - + char *src = (char *) addr; char *dest = (char *) raddr; nb_handle_t prev = NULL; nb_handle_t first = NULL; - + if (size > ofi_info->ep_attr->max_msg_size) { DBG_PRINTF(DBG_RMA | DBG_RMA_WRITE, "splitting large PUT %d:%p <= %p, size %zd", @@ -6284,7 +6284,7 @@ void rmaPutFn_msgOrdFence(nb_handle_t handle, void* myAddr, void* mrDesc, // // Special case: If our last operation was an AMO then we need to do a // fenced PUT to force the AMO to be visible before this PUT. - // TODO: this logic is a bit screwed-up. FI_FENCE by itself doesn't + // TODO: this logic is a bit screwed-up. FI_FENCE by itself doesn't // force the AMO to be visible, it just ensures that the PUT cannot pass // the AMO. We need to do something to make it visible, and we need // to clear the bitmap so that we don't keep fencing PUTs until something @@ -7534,7 +7534,7 @@ void forceMemFxVisAllNodes(chpl_bool checkPuts, chpl_bool checkAmos, struct perTxCtxInfo_t* tcip) { // // Enforce MCM: make sure the memory effects of all the operations - // we've done so far, to any node, are actually visible. + // we've done so far, to any node, are actually visible. 
// mcmReleaseAllNodes(checkPuts ? tcip->putVisBitmap : NULL, checkAmos ? tcip->amoVisBitmap : NULL, From b58f4e03a8961902152ed00b8e5fa1741a878b86 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Tue, 24 Sep 2024 10:09:07 -0700 Subject: [PATCH 050/107] Run bigTransfer test with unbound endpoints Signed-off-by: John H. Hartman --- test/runtime/configMatters/comm/unbound/EXECENV | 1 + test/runtime/configMatters/comm/unbound/README | 2 ++ test/runtime/configMatters/comm/unbound/SKIPIF | 1 + test/runtime/configMatters/comm/unbound/bigTransfer.chpl | 1 + test/runtime/configMatters/comm/unbound/bigTransfer.compopts | 1 + test/runtime/configMatters/comm/unbound/bigTransfer.execopts | 1 + test/runtime/configMatters/comm/unbound/bigTransfer.good | 1 + test/runtime/configMatters/comm/unbound/bigTransfer.numlocales | 1 + 8 files changed, 9 insertions(+) create mode 100644 test/runtime/configMatters/comm/unbound/EXECENV create mode 100644 test/runtime/configMatters/comm/unbound/README create mode 100644 test/runtime/configMatters/comm/unbound/SKIPIF create mode 120000 test/runtime/configMatters/comm/unbound/bigTransfer.chpl create mode 120000 test/runtime/configMatters/comm/unbound/bigTransfer.compopts create mode 120000 test/runtime/configMatters/comm/unbound/bigTransfer.execopts create mode 120000 test/runtime/configMatters/comm/unbound/bigTransfer.good create mode 120000 test/runtime/configMatters/comm/unbound/bigTransfer.numlocales diff --git a/test/runtime/configMatters/comm/unbound/EXECENV b/test/runtime/configMatters/comm/unbound/EXECENV new file mode 100644 index 000000000000..a2aab52168d8 --- /dev/null +++ b/test/runtime/configMatters/comm/unbound/EXECENV @@ -0,0 +1 @@ +CHPL_RT_COMM_OFI_EP_CNT=10 \ No newline at end of file diff --git a/test/runtime/configMatters/comm/unbound/README b/test/runtime/configMatters/comm/unbound/README new file mode 100644 index 000000000000..6c22d25f53b5 --- /dev/null +++ b/test/runtime/configMatters/comm/unbound/README @@ -0,0 +1,2 
@@ +Tests for CHPL_COMM=ofi with unbound endpoints. This is accomplished by +setting CHPL_RT_COMM_OFI_EP_CNT to a small value. \ No newline at end of file diff --git a/test/runtime/configMatters/comm/unbound/SKIPIF b/test/runtime/configMatters/comm/unbound/SKIPIF new file mode 100644 index 000000000000..1a0e68776535 --- /dev/null +++ b/test/runtime/configMatters/comm/unbound/SKIPIF @@ -0,0 +1 @@ +CHPL_COMM != ofi \ No newline at end of file diff --git a/test/runtime/configMatters/comm/unbound/bigTransfer.chpl b/test/runtime/configMatters/comm/unbound/bigTransfer.chpl new file mode 120000 index 000000000000..3d38c2034ae9 --- /dev/null +++ b/test/runtime/configMatters/comm/unbound/bigTransfer.chpl @@ -0,0 +1 @@ +../bigTransfer.chpl \ No newline at end of file diff --git a/test/runtime/configMatters/comm/unbound/bigTransfer.compopts b/test/runtime/configMatters/comm/unbound/bigTransfer.compopts new file mode 120000 index 000000000000..3d12c48c7991 --- /dev/null +++ b/test/runtime/configMatters/comm/unbound/bigTransfer.compopts @@ -0,0 +1 @@ +../bigTransfer.compopts \ No newline at end of file diff --git a/test/runtime/configMatters/comm/unbound/bigTransfer.execopts b/test/runtime/configMatters/comm/unbound/bigTransfer.execopts new file mode 120000 index 000000000000..88345245bd8d --- /dev/null +++ b/test/runtime/configMatters/comm/unbound/bigTransfer.execopts @@ -0,0 +1 @@ +../bigTransfer.execopts \ No newline at end of file diff --git a/test/runtime/configMatters/comm/unbound/bigTransfer.good b/test/runtime/configMatters/comm/unbound/bigTransfer.good new file mode 120000 index 000000000000..523fb7880072 --- /dev/null +++ b/test/runtime/configMatters/comm/unbound/bigTransfer.good @@ -0,0 +1 @@ +../bigTransfer.good \ No newline at end of file diff --git a/test/runtime/configMatters/comm/unbound/bigTransfer.numlocales b/test/runtime/configMatters/comm/unbound/bigTransfer.numlocales new file mode 120000 index 000000000000..6d7c873c2c01 --- /dev/null +++ 
b/test/runtime/configMatters/comm/unbound/bigTransfer.numlocales @@ -0,0 +1 @@ +../bigTransfer.numlocales \ No newline at end of file From a174b158a17ccdbc3a08c261d4b23e2308db0d4a Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Tue, 24 Sep 2024 17:22:55 +0000 Subject: [PATCH 051/107] Only run PUT Signed-off-by: John H. Hartman --- test/runtime/configMatters/comm/unbound/bigTransfer.execopts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 120000 => 100644 test/runtime/configMatters/comm/unbound/bigTransfer.execopts diff --git a/test/runtime/configMatters/comm/unbound/bigTransfer.execopts b/test/runtime/configMatters/comm/unbound/bigTransfer.execopts deleted file mode 120000 index 88345245bd8d..000000000000 --- a/test/runtime/configMatters/comm/unbound/bigTransfer.execopts +++ /dev/null @@ -1 +0,0 @@ -../bigTransfer.execopts \ No newline at end of file diff --git a/test/runtime/configMatters/comm/unbound/bigTransfer.execopts b/test/runtime/configMatters/comm/unbound/bigTransfer.execopts new file mode 100644 index 000000000000..e7574f0058dd --- /dev/null +++ b/test/runtime/configMatters/comm/unbound/bigTransfer.execopts @@ -0,0 +1 @@ +--doGET=false --xferMB=2048 From c06b0b4cd8e7ac08b558cfda1369744304f70cca Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Tue, 24 Sep 2024 10:29:27 -0700 Subject: [PATCH 052/107] Run bigTransfer tests with small fabric message size Signed-off-by: John H. 
Hartman --- test/runtime/configMatters/comm/large-rma/EXECENV | 1 + test/runtime/configMatters/comm/large-rma/README | 3 +++ test/runtime/configMatters/comm/large-rma/bigTransfer.chpl | 1 + test/runtime/configMatters/comm/large-rma/bigTransfer.compopts | 1 + test/runtime/configMatters/comm/large-rma/bigTransfer.execopts | 1 + test/runtime/configMatters/comm/large-rma/bigTransfer.good | 1 + .../configMatters/comm/large-rma/bigTransfer.numlocales | 1 + 7 files changed, 9 insertions(+) create mode 100644 test/runtime/configMatters/comm/large-rma/EXECENV create mode 100644 test/runtime/configMatters/comm/large-rma/README create mode 120000 test/runtime/configMatters/comm/large-rma/bigTransfer.chpl create mode 120000 test/runtime/configMatters/comm/large-rma/bigTransfer.compopts create mode 100644 test/runtime/configMatters/comm/large-rma/bigTransfer.execopts create mode 120000 test/runtime/configMatters/comm/large-rma/bigTransfer.good create mode 120000 test/runtime/configMatters/comm/large-rma/bigTransfer.numlocales diff --git a/test/runtime/configMatters/comm/large-rma/EXECENV b/test/runtime/configMatters/comm/large-rma/EXECENV new file mode 100644 index 000000000000..2e7098a16072 --- /dev/null +++ b/test/runtime/configMatters/comm/large-rma/EXECENV @@ -0,0 +1 @@ +CHPL_RT_COMM_OFI_MAX_MSG_SIZE=100 \ No newline at end of file diff --git a/test/runtime/configMatters/comm/large-rma/README b/test/runtime/configMatters/comm/large-rma/README new file mode 100644 index 000000000000..2f57cdde6057 --- /dev/null +++ b/test/runtime/configMatters/comm/large-rma/README @@ -0,0 +1,3 @@ +Test RMA operations that are larger than the maximum message size of the fabric +and therefore require multiple transfers. This is accomplished by setting +the CHPL_RT_COMM_OFI_MAX_MSG_SIZE to a small value. 
\ No newline at end of file diff --git a/test/runtime/configMatters/comm/large-rma/bigTransfer.chpl b/test/runtime/configMatters/comm/large-rma/bigTransfer.chpl new file mode 120000 index 000000000000..3d38c2034ae9 --- /dev/null +++ b/test/runtime/configMatters/comm/large-rma/bigTransfer.chpl @@ -0,0 +1 @@ +../bigTransfer.chpl \ No newline at end of file diff --git a/test/runtime/configMatters/comm/large-rma/bigTransfer.compopts b/test/runtime/configMatters/comm/large-rma/bigTransfer.compopts new file mode 120000 index 000000000000..3d12c48c7991 --- /dev/null +++ b/test/runtime/configMatters/comm/large-rma/bigTransfer.compopts @@ -0,0 +1 @@ +../bigTransfer.compopts \ No newline at end of file diff --git a/test/runtime/configMatters/comm/large-rma/bigTransfer.execopts b/test/runtime/configMatters/comm/large-rma/bigTransfer.execopts new file mode 100644 index 000000000000..e7574f0058dd --- /dev/null +++ b/test/runtime/configMatters/comm/large-rma/bigTransfer.execopts @@ -0,0 +1 @@ +--doGET=false --xferMB=2048 diff --git a/test/runtime/configMatters/comm/large-rma/bigTransfer.good b/test/runtime/configMatters/comm/large-rma/bigTransfer.good new file mode 120000 index 000000000000..523fb7880072 --- /dev/null +++ b/test/runtime/configMatters/comm/large-rma/bigTransfer.good @@ -0,0 +1 @@ +../bigTransfer.good \ No newline at end of file diff --git a/test/runtime/configMatters/comm/large-rma/bigTransfer.numlocales b/test/runtime/configMatters/comm/large-rma/bigTransfer.numlocales new file mode 120000 index 000000000000..6d7c873c2c01 --- /dev/null +++ b/test/runtime/configMatters/comm/large-rma/bigTransfer.numlocales @@ -0,0 +1 @@ +../bigTransfer.numlocales \ No newline at end of file From c4ff0a13d9e37951c87aefae25c18cd6c8b0397e Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Tue, 1 Oct 2024 16:21:46 -0700 Subject: [PATCH 053/107] Add chpl_comm_free_nb_handle to CHPL_COMM=none Signed-off-by: John H. 
Hartman --- runtime/src/comm/none/comm-none.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/runtime/src/comm/none/comm-none.c b/runtime/src/comm/none/comm-none.c index 8e6df722ec78..fff2c16c7f03 100644 --- a/runtime/src/comm/none/comm-none.c +++ b/runtime/src/comm/none/comm-none.c @@ -98,6 +98,10 @@ int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) return 0; } +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t h) { + assert(h == NULL); +} + int chpl_comm_addr_gettable(c_nodeid_t node, void* start, size_t len) { return 0; From 95a418e1435a89936963edbec3266e28553ca692 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 2 Oct 2024 06:52:14 -0700 Subject: [PATCH 054/107] Add chpl_comm_free_nb_handle to CHPL_COMM=gasnet Signed-off-by: John H. Hartman --- runtime/src/comm/gasnet/comm-gasnet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime/src/comm/gasnet/comm-gasnet.c b/runtime/src/comm/gasnet/comm-gasnet.c index 961c04a2f665..b184abbc25f0 100644 --- a/runtime/src/comm/gasnet/comm-gasnet.c +++ b/runtime/src/comm/gasnet/comm-gasnet.c @@ -686,6 +686,8 @@ int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) return gasnet_try_syncnb_some((gasnet_handle_t*) h, nhandles) == GASNET_OK; } +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t* h) { } + int chpl_comm_addr_gettable(c_nodeid_t node, void* start, size_t len) { #ifdef GASNET_SEGMENT_EVERYTHING From 1efd4f47d86d59478690ee1d11892851d1d9c9c6 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 2 Oct 2024 06:53:45 -0700 Subject: [PATCH 055/107] Added chpl_comm_free_nb_handle to CHPL_COMM=ugni Signed-off-by: John H. 
Hartman --- runtime/src/comm/ugni/comm-ugni.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runtime/src/comm/ugni/comm-ugni.c b/runtime/src/comm/ugni/comm-ugni.c index 8ebda8765909..dc9df8e74b8f 100644 --- a/runtime/src/comm/ugni/comm-ugni.c +++ b/runtime/src/comm/ugni/comm-ugni.c @@ -6189,6 +6189,9 @@ int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) } +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t* h) { } + + int chpl_comm_addr_gettable(c_nodeid_t node, void* start, size_t len) { // This call asks if a future GET is safe, but we can't know that in the case From 9bbb972c852cca914656f50fd8a4030f0963991b Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 2 Oct 2024 07:00:43 -0700 Subject: [PATCH 056/107] Added chpl_comm_free_nb_handle to gasnet-ex Signed-off-by: John H. Hartman --- runtime/src/comm/gasnet/comm-gasnet-ex.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime/src/comm/gasnet/comm-gasnet-ex.c b/runtime/src/comm/gasnet/comm-gasnet-ex.c index 1dc6ed1b0d60..283a64e71611 100644 --- a/runtime/src/comm/gasnet/comm-gasnet-ex.c +++ b/runtime/src/comm/gasnet/comm-gasnet-ex.c @@ -685,6 +685,8 @@ int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) return gex_Event_TestSome((gex_Event_t*) h, nhandles, GEX_NO_FLAGS) == GASNET_OK; } +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t* h) { } + // TODO GEX could be scalable query to gasnet itself int chpl_comm_addr_gettable(c_nodeid_t node, void* start, size_t len) { From 2dfb923d5c29711e6e3f4b3899cbbf9ce2299d51 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 2 Oct 2024 07:02:43 -0700 Subject: [PATCH 057/107] Fixed typos Signed-off-by: John H. 
Hartman --- runtime/src/comm/gasnet/comm-gasnet-ex.c | 2 +- runtime/src/comm/gasnet/comm-gasnet.c | 2 +- runtime/src/comm/ugni/comm-ugni.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/runtime/src/comm/gasnet/comm-gasnet-ex.c b/runtime/src/comm/gasnet/comm-gasnet-ex.c index 283a64e71611..88e9ff99b10b 100644 --- a/runtime/src/comm/gasnet/comm-gasnet-ex.c +++ b/runtime/src/comm/gasnet/comm-gasnet-ex.c @@ -685,7 +685,7 @@ int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) return gex_Event_TestSome((gex_Event_t*) h, nhandles, GEX_NO_FLAGS) == GASNET_OK; } -void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t* h) { } +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t h) { } // TODO GEX could be scalable query to gasnet itself int chpl_comm_addr_gettable(c_nodeid_t node, void* start, size_t len) diff --git a/runtime/src/comm/gasnet/comm-gasnet.c b/runtime/src/comm/gasnet/comm-gasnet.c index b184abbc25f0..53e168e6b4e0 100644 --- a/runtime/src/comm/gasnet/comm-gasnet.c +++ b/runtime/src/comm/gasnet/comm-gasnet.c @@ -686,7 +686,7 @@ int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) return gasnet_try_syncnb_some((gasnet_handle_t*) h, nhandles) == GASNET_OK; } -void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t* h) { } +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t h) { } int chpl_comm_addr_gettable(c_nodeid_t node, void* start, size_t len) { diff --git a/runtime/src/comm/ugni/comm-ugni.c b/runtime/src/comm/ugni/comm-ugni.c index dc9df8e74b8f..098a331c8089 100644 --- a/runtime/src/comm/ugni/comm-ugni.c +++ b/runtime/src/comm/ugni/comm-ugni.c @@ -6189,7 +6189,7 @@ int chpl_comm_try_nb_some(chpl_comm_nb_handle_t* h, size_t nhandles) } -void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t* h) { } +void chpl_comm_free_nb_handle(chpl_comm_nb_handle_t h) { } int chpl_comm_addr_gettable(c_nodeid_t node, void* start, size_t len) From b260c4feca30d019f2f5e73a2e3e3dc25875d0f9 Mon Sep 17 00:00:00 2001 From: "John H. 
Hartman" Date: Wed, 9 Oct 2024 06:51:09 -0700 Subject: [PATCH 058/107] Addressed reviewer's comments Signed-off-by: John H. Hartman --- runtime/src/comm/ofi/comm-ofi.c | 12 ++++-------- test/runtime/configMatters/comm/large-rma/EXECENV | 2 +- test/runtime/configMatters/comm/large-rma/README | 6 +++--- test/runtime/configMatters/comm/unbound/EXECENV | 2 +- test/runtime/configMatters/comm/unbound/README | 2 +- test/runtime/configMatters/comm/unbound/SKIPIF | 2 +- 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/runtime/src/comm/ofi/comm-ofi.c b/runtime/src/comm/ofi/comm-ofi.c index 594e3e7b36c2..8f7d79a1a88b 100644 --- a/runtime/src/comm/ofi/comm-ofi.c +++ b/runtime/src/comm/ofi/comm-ofi.c @@ -5533,7 +5533,6 @@ static inline chpl_bool put_prologue(void* addr, c_nodeid_t node, void* raddr, size_t size, int32_t commID, int ln, int32_t fn) { - chpl_bool proceed = false; retireDelayedAmDone(false /*taskIsEnding*/); // @@ -5543,12 +5542,12 @@ chpl_bool put_prologue(void* addr, c_nodeid_t node, void* raddr, size_t size, CHK_TRUE(raddr != NULL); if (size == 0) { - goto done; + return false; } if (node == chpl_nodeID) { memmove(raddr, addr, size); - goto done; + return false; } // Communications callback support @@ -5561,9 +5560,7 @@ chpl_bool put_prologue(void* addr, c_nodeid_t node, void* raddr, size_t size, chpl_comm_diags_verbose_rdma("put", node, size, ln, fn, commID); chpl_comm_diags_incr(put); - proceed = true; -done: - return proceed; + return true; } /* @@ -5625,7 +5622,7 @@ chpl_bool check_complete(nb_handle_t *handles, size_t nhandles, chpl_bool completed = false; // at least one new completion detected chpl_bool pending = false; // there is an uncompleted handle if ((handles == NULL) || (nhandles == 0)) { - goto done; + return false; } struct perTxCtxInfo_t* tcip = NULL; while (true) { @@ -5675,7 +5672,6 @@ chpl_bool check_complete(nb_handle_t *handles, size_t nhandles, if (tcip) { tciFree(tcip); } -done: return completed; } diff --git 
a/test/runtime/configMatters/comm/large-rma/EXECENV b/test/runtime/configMatters/comm/large-rma/EXECENV index 2e7098a16072..0ad0a9d5532a 100644 --- a/test/runtime/configMatters/comm/large-rma/EXECENV +++ b/test/runtime/configMatters/comm/large-rma/EXECENV @@ -1 +1 @@ -CHPL_RT_COMM_OFI_MAX_MSG_SIZE=100 \ No newline at end of file +CHPL_RT_COMM_OFI_MAX_MSG_SIZE=100 diff --git a/test/runtime/configMatters/comm/large-rma/README b/test/runtime/configMatters/comm/large-rma/README index 2f57cdde6057..2c52e1865396 100644 --- a/test/runtime/configMatters/comm/large-rma/README +++ b/test/runtime/configMatters/comm/large-rma/README @@ -1,3 +1,3 @@ -Test RMA operations that are larger than the maximum message size of the fabric -and therefore require multiple transfers. This is accomplished by setting -the CHPL_RT_COMM_OFI_MAX_MSG_SIZE to a small value. \ No newline at end of file +Test RMA operations that are larger than the maximum message size of the +fabric and therefore require multiple transfers. This is accomplished by +setting the CHPL_RT_COMM_OFI_MAX_MSG_SIZE to a small value. diff --git a/test/runtime/configMatters/comm/unbound/EXECENV b/test/runtime/configMatters/comm/unbound/EXECENV index a2aab52168d8..f96d1e8c5898 100644 --- a/test/runtime/configMatters/comm/unbound/EXECENV +++ b/test/runtime/configMatters/comm/unbound/EXECENV @@ -1 +1 @@ -CHPL_RT_COMM_OFI_EP_CNT=10 \ No newline at end of file +CHPL_RT_COMM_OFI_EP_CNT=10 diff --git a/test/runtime/configMatters/comm/unbound/README b/test/runtime/configMatters/comm/unbound/README index 6c22d25f53b5..47b5253591dc 100644 --- a/test/runtime/configMatters/comm/unbound/README +++ b/test/runtime/configMatters/comm/unbound/README @@ -1,2 +1,2 @@ Tests for CHPL_COMM=ofi with unbound endpoints. This is accomplished by -setting CHPL_RT_COMM_OFI_EP_CNT to a small value. \ No newline at end of file +setting CHPL_RT_COMM_OFI_EP_CNT to a small value. 
diff --git a/test/runtime/configMatters/comm/unbound/SKIPIF b/test/runtime/configMatters/comm/unbound/SKIPIF index 1a0e68776535..3a1ff69f948a 100644 --- a/test/runtime/configMatters/comm/unbound/SKIPIF +++ b/test/runtime/configMatters/comm/unbound/SKIPIF @@ -1 +1 @@ -CHPL_COMM != ofi \ No newline at end of file +CHPL_COMM != ofi From 4114c6a35b6c71997f14dbecc5ee17a18e35649b Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 9 Oct 2024 11:33:32 -0700 Subject: [PATCH 059/107] Add an example GPU snippet as a test to lock behavior Signed-off-by: Engin Kayraklioglu --- test/gpu/native/examples/michael-jhu/stream.chpl | 15 +++++++++++++++ test/gpu/native/examples/michael-jhu/stream.good | 3 +++ 2 files changed, 18 insertions(+) create mode 100644 test/gpu/native/examples/michael-jhu/stream.chpl create mode 100644 test/gpu/native/examples/michael-jhu/stream.good diff --git a/test/gpu/native/examples/michael-jhu/stream.chpl b/test/gpu/native/examples/michael-jhu/stream.chpl new file mode 100644 index 000000000000..385afd792541 --- /dev/null +++ b/test/gpu/native/examples/michael-jhu/stream.chpl @@ -0,0 +1,15 @@ +use GpuDiagnostics; + +config const m = 1<<26, + alpha = 3.0; + +startVerboseGpu(); +on here.gpus[0] { + var A, B, C: [1..m] real; + + B = 1; + C = 1; + + A = B + alpha*C; +} +stopVerboseGpu(); diff --git a/test/gpu/native/examples/michael-jhu/stream.good b/test/gpu/native/examples/michael-jhu/stream.good new file mode 100644 index 000000000000..8d29c093b503 --- /dev/null +++ b/test/gpu/native/examples/michael-jhu/stream.good @@ -0,0 +1,3 @@ +0 (gpu 0): stream.chpl:10: kernel launch (block size: 512x1x1) +0 (gpu 0): stream.chpl:11: kernel launch (block size: 512x1x1) +0 (gpu 0): stream.chpl:13: kernel launch (block size: 512x1x1) From 50997585a1d99642eea081a32337c1e0e6d5ce64 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 9 Oct 2024 11:35:41 -0700 Subject: [PATCH 060/107] Add a comment and update the good file Signed-off-by: Engin 
Kayraklioglu --- test/gpu/native/examples/michael-jhu/stream.chpl | 7 ++++++- test/gpu/native/examples/michael-jhu/stream.good | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/test/gpu/native/examples/michael-jhu/stream.chpl b/test/gpu/native/examples/michael-jhu/stream.chpl index 385afd792541..4951ef19a272 100644 --- a/test/gpu/native/examples/michael-jhu/stream.chpl +++ b/test/gpu/native/examples/michael-jhu/stream.chpl @@ -1,9 +1,11 @@ use GpuDiagnostics; +startVerboseGpu(); + +// code in the slide starts here config const m = 1<<26, alpha = 3.0; -startVerboseGpu(); on here.gpus[0] { var A, B, C: [1..m] real; @@ -12,4 +14,7 @@ on here.gpus[0] { A = B + alpha*C; } + +// code in the slide ends here + stopVerboseGpu(); diff --git a/test/gpu/native/examples/michael-jhu/stream.good b/test/gpu/native/examples/michael-jhu/stream.good index 8d29c093b503..3d8df7b01b63 100644 --- a/test/gpu/native/examples/michael-jhu/stream.good +++ b/test/gpu/native/examples/michael-jhu/stream.good @@ -1,3 +1,3 @@ -0 (gpu 0): stream.chpl:10: kernel launch (block size: 512x1x1) -0 (gpu 0): stream.chpl:11: kernel launch (block size: 512x1x1) +0 (gpu 0): stream.chpl:12: kernel launch (block size: 512x1x1) 0 (gpu 0): stream.chpl:13: kernel launch (block size: 512x1x1) +0 (gpu 0): stream.chpl:15: kernel launch (block size: 512x1x1) From 6ae059bcbbc0c700be9ddaeae9a291f2c1e8db0b Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 9 Oct 2024 11:43:59 -0700 Subject: [PATCH 061/107] Add a config error for invalid locale model setting Signed-off-by: Engin Kayraklioglu --- util/chplenv/chpl_locale_model.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/util/chplenv/chpl_locale_model.py b/util/chplenv/chpl_locale_model.py index bdbf23ba18cc..bf43b8832de7 100755 --- a/util/chplenv/chpl_locale_model.py +++ b/util/chplenv/chpl_locale_model.py @@ -2,12 +2,17 @@ import sys import overrides -from utils import memoize +from utils import memoize, error 
@memoize def get(): locale_model_val = overrides.get('CHPL_LOCALE_MODEL', 'flat') + + if locale_model_val != 'flat' and locale_model_val != 'gpu': + error('{} is not a valid value for CHPL_LOCALE_MODEL. ' + 'It can only be "flat" or "gpu".'.format(locale_model_val)) + return locale_model_val From 43fb0b9738ca011ec8c83d3a0f7682e2d6e72b5d Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 9 Oct 2024 12:04:37 -0700 Subject: [PATCH 062/107] Adjust the implementation based on Andy's suggestion Signed-off-by: Engin Kayraklioglu --- util/chplenv/chpl_locale_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/chplenv/chpl_locale_model.py b/util/chplenv/chpl_locale_model.py index bf43b8832de7..7cd09f891ff4 100755 --- a/util/chplenv/chpl_locale_model.py +++ b/util/chplenv/chpl_locale_model.py @@ -9,7 +9,7 @@ def get(): locale_model_val = overrides.get('CHPL_LOCALE_MODEL', 'flat') - if locale_model_val != 'flat' and locale_model_val != 'gpu': + if locale_model_val not in ['flat', 'gpu']: error('{} is not a valid value for CHPL_LOCALE_MODEL. ' 'It can only be "flat" or "gpu".'.format(locale_model_val)) From 6aeca91a990e3f41aa07d6f84162508d42b9b56d Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Wed, 9 Oct 2024 12:56:50 -0700 Subject: [PATCH 063/107] At Michael's suggestion, limit the application to `PRIM_GET_MEMBER_VALUE` Michael pointed out that `PRIM_GET_MEMBER_VALUE` can situationally return a ref or a value and we don't want to apply the const-ness in the value case (as it will be a copy and thus independent of the behavior of the `const` source). So ensure that the result of the call will be a ref before propagating the `const`. 
This passed the normally impacted tests locally, will run a full paratest as well ---- Signed-off-by: Lydia Duncan --- compiler/resolution/functionResolution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/resolution/functionResolution.cpp b/compiler/resolution/functionResolution.cpp index 6f83cd99ac40..5a6b3bb68d9a 100644 --- a/compiler/resolution/functionResolution.cpp +++ b/compiler/resolution/functionResolution.cpp @@ -9599,7 +9599,7 @@ static void resolveMoveForRhsCallExpr(CallExpr* call, Type* rhsType) { static void moveSetConstFlagsAndCheck(CallExpr* call, CallExpr* rhs) { if (rhs->isPrimitive(PRIM_GET_MEMBER) || - rhs->isPrimitive(PRIM_GET_MEMBER_VALUE) || + (rhs->isPrimitive(PRIM_GET_MEMBER_VALUE) && rhs->qualType().isRef()) || rhs->isPrimitive(PRIM_ADDR_OF)) { if (SymExpr* rhsBase = toSymExpr(rhs->get(1))) { From a20458117a2db6fbf87548f9cdb9bdcff3caae90 Mon Sep 17 00:00:00 2001 From: "John H. Hartman" Date: Wed, 9 Oct 2024 13:44:35 -0700 Subject: [PATCH 064/107] Fixed typo. Signed-off-by: John H. Hartman --- runtime/src/topo/hwloc/topo-hwloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/src/topo/hwloc/topo-hwloc.c b/runtime/src/topo/hwloc/topo-hwloc.c index cbce3cfa6bfb..7195deaa1ccc 100644 --- a/runtime/src/topo/hwloc/topo-hwloc.c +++ b/runtime/src/topo/hwloc/topo-hwloc.c @@ -1523,7 +1523,7 @@ chpl_topo_pci_addr_t *chpl_topo_selectNicByType(chpl_topo_pci_addr_t *inAddr, // // Note that cores are assigned to partitions during initialization of the // topology layer before this function is called. As a result, the assignment -// of cores and devices to paratitions may not be optimal, especially if the +// of cores and devices to partitions may not be optimal, especially if the // machine topology is asymmetric. For example, if there are two co-locales // on a machine with four NUMA domains, one co-locale will be assigned cores // in the first two NUMA domains and the other the second two domains. 
If From c49231dad3fe9464c1084da14ee485cd5adb525f Mon Sep 17 00:00:00 2001 From: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:36:52 -0500 Subject: [PATCH 065/107] Changes based on review Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- util/cron/test-gpu-ex-cuda-12.bash | 1 - util/cron/test-gpu-ex-cuda-12.interop.bash | 8 ++++---- util/cron/test-gpu-ex-cuda-12.specialization.bash | 2 +- util/cron/test-perf.gpu-ex-cuda-12.bash | 2 +- util/cron/test-perf.gpu-ex-cuda-12.um.bash | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/util/cron/test-gpu-ex-cuda-12.bash b/util/cron/test-gpu-ex-cuda-12.bash index ff13152932df..69a18fe76b4a 100755 --- a/util/cron/test-gpu-ex-cuda-12.bash +++ b/util/cron/test-gpu-ex-cuda-12.bash @@ -12,7 +12,6 @@ export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM export CHPL_COMM=none export CHPL_LOCALE_MODEL=gpu export CHPL_LAUNCHER_PARTITION=allgriz -export CHPL_TEST_GPU=true export CHPL_GPU=nvidia # amd is also detected automatically export CHPL_NIGHTLY_TEST_CONFIG_NAME="gpu-ex-cuda-12" diff --git a/util/cron/test-gpu-ex-cuda-12.interop.bash b/util/cron/test-gpu-ex-cuda-12.interop.bash index 974132a6c3bd..e50294790c2e 100755 --- a/util/cron/test-gpu-ex-cuda-12.interop.bash +++ b/util/cron/test-gpu-ex-cuda-12.interop.bash @@ -8,17 +8,17 @@ source $CWD/common-hpe-cray-ex.bash # We need 12.4 for the stream test because the CUDA driver on pinoak -# only supports PTX for 12.4, untill the driver is updated, we need to +# only supports PTX for 12.4, until the driver is updated, we need to # stick with 12.4 instead of 12.5 -module load cuda/12.4 # default is CUDA 12 +module load cuda/12.4 # default is CUDA 12.5 # We need cublas for the cublas interop test, but since we are using 12.4 above # pinoak doesn't have the cublas library for 12.4, so we need to use the cublas # from 12.5 (which is compatible across minor versions) 
# This can be removed once we use CUDA 12.5 -export CHPL_LIB_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/24.7/math_libs/lib64" +export CHPL_LIB_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/24.7/math_libs/lib64:$CHPL_LIB_PATH" -export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_LLVM=bundled # Using bundled LLVM since that's safer export CHPL_TEST_GPU=true export CHPL_LAUNCHER_PARTITION=allgriz export CHPL_NIGHTLY_TEST_DIRS="gpu/interop/" diff --git a/util/cron/test-gpu-ex-cuda-12.specialization.bash b/util/cron/test-gpu-ex-cuda-12.specialization.bash index 5228ca3f7a5b..9fda01da052e 100755 --- a/util/cron/test-gpu-ex-cuda-12.specialization.bash +++ b/util/cron/test-gpu-ex-cuda-12.specialization.bash @@ -8,7 +8,7 @@ source $CWD/common-hpe-cray-ex.bash module load cudatoolkit # default is CUDA 12 -export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_LLVM=bundled # Using bundled LLVM since that's safer export CHPL_COMM=none export CHPL_LOCALE_MODEL=gpu export CHPL_LAUNCHER_PARTITION=allgriz diff --git a/util/cron/test-perf.gpu-ex-cuda-12.bash b/util/cron/test-perf.gpu-ex-cuda-12.bash index e50b0d49a4b8..acbf70ca0751 100755 --- a/util/cron/test-perf.gpu-ex-cuda-12.bash +++ b/util/cron/test-perf.gpu-ex-cuda-12.bash @@ -8,7 +8,7 @@ source $CWD/common-hpe-cray-ex.bash module load cudatoolkit # default is CUDA 12 -export CHPL_LLVM=bundled # CUDA 12 is only supported with bundled LLVM +export CHPL_LLVM=bundled # Using bundled LLVM since that's safer export CHPL_COMM=none export CHPL_LOCALE_MODEL=gpu export CHPL_LAUNCHER_PARTITION=allgriz diff --git a/util/cron/test-perf.gpu-ex-cuda-12.um.bash b/util/cron/test-perf.gpu-ex-cuda-12.um.bash index 75f5c38c52f8..ee99ad6e923c 100755 --- a/util/cron/test-perf.gpu-ex-cuda-12.um.bash +++ b/util/cron/test-perf.gpu-ex-cuda-12.um.bash @@ -8,7 +8,7 @@ source $CWD/common-hpe-cray-ex.bash module load cudatoolkit # default is CUDA 12 -export CHPL_LLVM=bundled # CUDA 12 is only supported 
with bundled LLVM +export CHPL_LLVM=bundled # Using bundled LLVM since that's safer export CHPL_COMM=none export CHPL_LOCALE_MODEL=gpu export CHPL_LAUNCHER_PARTITION=allgriz From fcadf6321246f1267157698082e81eb10aedc67a Mon Sep 17 00:00:00 2001 From: Brandon Neth Date: Thu, 10 Oct 2024 13:26:28 -0700 Subject: [PATCH 066/107] first pass on partial read implementation. need to reduce code duplication and add more tests, including multilocale tests. --- Signed-off-by: Brandon Neth --- modules/packages/Zarr.chpl | 50 +++++++++++++++++++++ test/library/packages/Zarr/ZarrPartial.chpl | 32 +++++++++++++ test/library/packages/Zarr/ZarrPartial.good | 1 + 3 files changed, 83 insertions(+) create mode 100644 test/library/packages/Zarr/ZarrPartial.chpl create mode 100644 test/library/packages/Zarr/ZarrPartial.good diff --git a/modules/packages/Zarr.chpl b/modules/packages/Zarr.chpl index 180644db45bc..0c5222ffc4b9 100644 --- a/modules/packages/Zarr.chpl +++ b/modules/packages/Zarr.chpl @@ -494,6 +494,56 @@ module Zarr { } + /* + + + */ + proc readZarrArrayPartial(directoryPath: string, type dtype, param dimCount: int, partialDomain: domain(dimCount), + bloscThreads: int(32) = 1, targetLocales: [] locale = Locales) throws { + var md = getMetadata(directoryPath); + validateMetadata(md, dtype, dimCount); + // Size and shape tuples + var totalShape, chunkShape : dimCount*int; + var chunkCounts: dimCount*int; + var totalRanges,chunkRanges: dimCount*range(int); + for i in 0.. 
Date: Thu, 10 Oct 2024 15:12:41 -0700 Subject: [PATCH 067/107] annotations for 10/10/24 perf triage - Mandelbrot complex regression - update blc to use foreach (impacted perf) - non-blocking put affecting ra-rmo perf on ofi - mark that we've updated Chapel version used for Arkouda nightly testing - Large arkouda refactor affecting bigint perf --- Signed-off-by: Andy Stone --- test/ANNOTATIONS.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/ANNOTATIONS.yaml b/test/ANNOTATIONS.yaml index 6e3079c91433..a12f9dcc8846 100644 --- a/test/ANNOTATIONS.yaml +++ b/test/ANNOTATIONS.yaml @@ -1230,6 +1230,8 @@ mandelbrot: - Mandelbrot tweaks (#5188) 05/08/24: - Remove complex math function wrappers for LLVM 16+ (#25019) + 10/10/24: + - Fix complex support with CHPL_LOCALE_MODEL=gpu (#26048) mandelbrot-extras: @@ -1669,6 +1671,8 @@ nbody: - Codegen sqrt and fabs directly to llvm intrinsics when possible (#24455) 04/03/24: - Change param-based nbody to foreach-based (#24745) + 10/06/24: + - Update my study version of nbody to be like nbody#2, but using foreach loops (#26044) no-op: 02/24/17: @@ -1790,6 +1794,8 @@ ra.ml-perf: - machine changes, cannot reproduce old numbers 02/28/20: - Use localAccess for ra-on (#15001) + 10/9/24: + - Non-blocking PUT in CHPL_COMM=ofi (#25977) ra-atomics.ml-perf: 10/03/17: @@ -2493,11 +2499,14 @@ arkouda: &arkouda-base - Array transfer perf fix (Bears-R-Us/arkouda#3671) 8/27/24: - Fix performance regression in to_ndarray (Bears-R-Us/arkouda#3697) + 09/27/24: + - Use Chapel 2.2 for Arkouda "release" nightly testing (#26014) arkouda-string: <<: *arkouda-base arkouda-bigint: <<: *arkouda-base + - Fixes 3783, 3784, 3788 multilocale io test failures (Bears-R-Us/arkouda#3790) arkouda-comp: <<: *arkouda-base From baf5d2e8aca7c4c54ad3b83d9612f00be34ed585 Mon Sep 17 00:00:00 2001 From: Andy Stone Date: Thu, 10 Oct 2024 15:20:52 -0700 Subject: [PATCH 068/107] fix syntax error --- Signed-off-by: Andy Stone --- test/ANNOTATIONS.yaml | 2 ++ 1 
file changed, 2 insertions(+) diff --git a/test/ANNOTATIONS.yaml b/test/ANNOTATIONS.yaml index a12f9dcc8846..c2801994239b 100644 --- a/test/ANNOTATIONS.yaml +++ b/test/ANNOTATIONS.yaml @@ -2504,8 +2504,10 @@ arkouda: &arkouda-base arkouda-string: <<: *arkouda-base + arkouda-bigint: <<: *arkouda-base + 09/27/24: - Fixes 3783, 3784, 3788 multilocale io test failures (Bears-R-Us/arkouda#3790) arkouda-comp: From bb7e4f892f05ce41463ab1b85964da15377a3a21 Mon Sep 17 00:00:00 2001 From: Brandon Neth Date: Thu, 10 Oct 2024 15:20:55 -0700 Subject: [PATCH 069/107] refactored the full read to use the partial read function --- Signed-off-by: Brandon Neth --- modules/packages/Zarr.chpl | 42 ++++---------------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) diff --git a/modules/packages/Zarr.chpl b/modules/packages/Zarr.chpl index 0c5222ffc4b9..5523aefb5d0a 100644 --- a/modules/packages/Zarr.chpl +++ b/modules/packages/Zarr.chpl @@ -371,46 +371,13 @@ module Zarr { proc readZarrArray(directoryPath: string, type dtype, param dimCount: int, bloscThreads: int(32) = 1, targetLocales: [] locale = Locales) throws { var md = getMetadata(directoryPath); validateMetadata(md, dtype, dimCount); - // Size and shape tuples - var totalShape, chunkShape : dimCount*int; - var chunkCounts: dimCount*int; - var totalRanges,chunkRanges: dimCount*range(int); + var totalRanges: dimCount*range(int); for i in 0.. Date: Thu, 10 Oct 2024 18:11:22 -0500 Subject: [PATCH 070/107] Make CHPL_LIB_PATH ignore empty strings If CHPL_LIB_PATH has colons in it, we split the path based on the colons. However, if we have a trailing or leading colon, or two colons back to back (`::`) in CHPL_LIB_PATH, it gets added as an empty string. This causes errors since there isn't an argument to go with `-L` for the target compiler. This makes the change such that the `addPath` function ignores the empty strings in CHPL_LIB_PATH.
Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- compiler/util/files.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/compiler/util/files.cpp b/compiler/util/files.cpp index d215ca9a3215..37780f0020dc 100644 --- a/compiler/util/files.cpp +++ b/compiler/util/files.cpp @@ -91,7 +91,23 @@ static void addPath(const char* pathVar, std::vector* pathvec) { colon++; // and advance to the next } - pathvec->push_back(astr(dirString)); + // FIXME (Maybe?) + // Following the precedent of $PATH on Unix, we should + // treat empty strings between colons like :: or trailing/leading + // colons as meaning to add the current directory to the path. + // If we don't include the current directory in the CHPL_LIB_PATH by + // default, this behavior below is incorrect, and instead of ignoring + // empty strings, it should figure out the current directory and add + // that to the path. + // Alternatively, we can alter the compiler to throw -L . when + // CHPL_LIB_PATH has empty strings in between colons. + // However, if we do include the current directory in CHPL_LIB_PATH + // by default, then this doesn't need fixing, delete this FIXME. 
+ + // ignore empty strings + if (dirString && *dirString) { + pathvec->push_back(astr(dirString)); + } dirString = colon; // advance dirString } while (colon != NULL); From 41c3f13841158e8084226a7a24864a73556eacdb Mon Sep 17 00:00:00 2001 From: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> Date: Thu, 10 Oct 2024 19:17:41 -0500 Subject: [PATCH 071/107] Changes based on review Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- compiler/util/files.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/util/files.cpp b/compiler/util/files.cpp index 37780f0020dc..4183614988d1 100644 --- a/compiler/util/files.cpp +++ b/compiler/util/files.cpp @@ -105,7 +105,7 @@ static void addPath(const char* pathVar, std::vector* pathvec) { // by default, then this doesn't need fixing, delete this FIXME. // ignore empty strings - if (dirString && *dirString) { + if (dirString && strlen(dirString) > 0) { pathvec->push_back(astr(dirString)); } From 9bc8e839c7f3ba6e34c3e5f45e0bfae3a2c753b2 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 11 Oct 2024 09:27:42 -0700 Subject: [PATCH 072/107] Reorder fields in query map result to help reduce memory footprint Signed-off-by: Danila Fedorin --- frontend/include/chpl/framework/Context-detail.h | 4 ++-- frontend/lib/framework/Context.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/frontend/include/chpl/framework/Context-detail.h b/frontend/include/chpl/framework/Context-detail.h index 877230a56731..d6816482b4a2 100644 --- a/frontend/include/chpl/framework/Context-detail.h +++ b/frontend/include/chpl/framework/Context-detail.h @@ -275,8 +275,6 @@ class QueryMapResultBase { // extra boolean is fine. mutable bool beingTestedForReuse = false; - mutable QueryDependencyVec dependencies; - // Whether or not errors from this query result have been shown to the // user (they may not have been if some query checked for errors). 
mutable bool emittedErrors = false; @@ -287,6 +285,8 @@ class QueryMapResultBase { // This is not too strongly connected to emittedErrors (which tracks whether // errors --- if any --- were shown to the user for this query result only) mutable bool errorsPresentInSelfOrDependencies = false; + + mutable QueryDependencyVec dependencies; mutable std::set recursionErrors; mutable QueryErrorVec errors; diff --git a/frontend/lib/framework/Context.cpp b/frontend/lib/framework/Context.cpp index a79141b8b7fb..50a15dd61412 100644 --- a/frontend/lib/framework/Context.cpp +++ b/frontend/lib/framework/Context.cpp @@ -1351,9 +1351,9 @@ QueryMapResultBase::QueryMapResultBase(RevisionNumber lastChecked, : lastChecked(lastChecked), lastChanged(lastChanged), beingTestedForReuse(beingTestedForReuse), - dependencies(), emittedErrors(emittedErrors), errorsPresentInSelfOrDependencies(errorsPresentInSelfOrDependencies), + dependencies(), recursionErrors(std::move(recursionErrors)), errors(), parentQueryMap(parentQueryMap) { From c803a6e0828dee94d8850b5576d4dc5d28273b0e Mon Sep 17 00:00:00 2001 From: Brandon Neth Date: Fri, 11 Oct 2024 09:40:33 -0700 Subject: [PATCH 073/107] error handling test and implementation, and clean files / ignore files --- Signed-off-by: Brandon Neth --- modules/packages/Zarr.chpl | 2 ++ test/library/packages/Zarr/.gitignore | 2 ++ test/library/packages/Zarr/CLEANFILES | 4 ++- test/library/packages/Zarr/ZarrPartial.chpl | 35 ++++++++++++++++++--- 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/modules/packages/Zarr.chpl b/modules/packages/Zarr.chpl index 5523aefb5d0a..9b11b77c598c 100644 --- a/modules/packages/Zarr.chpl +++ b/modules/packages/Zarr.chpl @@ -484,6 +484,8 @@ module Zarr { // Initialize the distributed domain and array const undistD : domain(dimCount) = totalRanges; + if ! 
undistD.contains(partialDomain) then + throw new IllegalArgumentError("Partial domain is out of bounds of the array domain."); const Dist = new blockDist(boundingBox=undistD, targetLocales=targetLocales); const D = Dist.createDomain(partialDomain); var A: [D] dtype; diff --git a/test/library/packages/Zarr/.gitignore b/test/library/packages/Zarr/.gitignore index 92c301538277..522cbed03eaa 100644 --- a/test/library/packages/Zarr/.gitignore +++ b/test/library/packages/Zarr/.gitignore @@ -10,4 +10,6 @@ LocalIOStore_* TestLocal TestTargetLocale TestNoCompressorSpecified +ones +twos diff --git a/test/library/packages/Zarr/CLEANFILES b/test/library/packages/Zarr/CLEANFILES index c6f3893d3eb9..b00459200470 100644 --- a/test/library/packages/Zarr/CLEANFILES +++ b/test/library/packages/Zarr/CLEANFILES @@ -7,4 +7,6 @@ Test3D LocalIOStore_* TestLocal TestTargetLocale -TestNoCompressorSpecified \ No newline at end of file +TestNoCompressorSpecified +ones +twos \ No newline at end of file diff --git a/test/library/packages/Zarr/ZarrPartial.chpl b/test/library/packages/Zarr/ZarrPartial.chpl index fb383d459e48..2e06a69ba79b 100644 --- a/test/library/packages/Zarr/ZarrPartial.chpl +++ b/test/library/packages/Zarr/ZarrPartial.chpl @@ -3,10 +3,9 @@ use IO; use FileSystem; use Random; - -proc main() { +proc test3D() { const fullD: domain(3) = {0..10, 0..10, 0..10}; - const partialD1: domain(3) = {0..5, 0..5, 0..5}; + const partialD: domain(3) = {0..5, 0..5, 0..5}; var ones: [fullD] real(32) = 1; var twos: [fullD] real(32) = 2; @@ -17,16 +16,42 @@ proc main() { writeZarrArray("twos", twos, (3,3,3)); var inputData = readZarrArray("ones", real(32), 3); - inputData[partialD1] = readZarrArrayPartial("twos", real(32), 3, partialD1); + inputData[partialD] = readZarrArrayPartial("twos", real(32), 3, partialD); var expectedData: [fullD] real(32) = 1.0; - expectedData[partialD1] = 2.0; + expectedData[partialD] = 2.0; assert(inputData.domain == expectedData.domain, "Domain mismatch: %? 
%?".format(inputData.domain, expectedData.domain)); forall i in inputData.domain do assert(inputData[i] == expectedData[i], "Mismatch for 3D real data on indices: %?.\nWritten: %?\nRead: %?".format(i, inputData[i], expectedData[i])); rmTree("ones"); rmTree("twos"); +} +proc testOutOfBounds() { + const fullD: domain(2) = {0..10, 0..10}; + const partialD: domain(2) = {10..10, 10..12}; + + var ones: [fullD] real(32) = 1; + var twos: [fullD] real(32) = 2; + + if (isDir("ones")) then rmTree("ones"); + if (isDir("twos")) then rmTree("twos"); + writeZarrArray("ones", ones, (3,3)); + writeZarrArray("twos", twos, (3,3)); + + var inputData = readZarrArray("ones", real(32), 2); + try { + var outOfBoundsEdge = readZarrArrayPartial("twos", real(32), 2, partialD); + } catch e { + assert(e.message() == "Partial domain is out of bounds of the array domain.", "Unexpected error message: %s".format(e.message())); + return; + } + writeln("Expected error for out of bounds partial domain."); +} +proc main() { + + test3D(); + testOutOfBounds(); writeln("Pass"); } From e5099a47672cd12034f8154c28dc3cf85dff9752 Mon Sep 17 00:00:00 2001 From: Brandon Neth Date: Fri, 11 Oct 2024 09:50:16 -0700 Subject: [PATCH 074/107] added multi-locale test --- Signed-off-by: Brandon Neth --- modules/packages/Zarr.chpl | 2 +- test/library/packages/Zarr/ZarrPartial.chpl | 6 ++++-- test/library/packages/Zarr/ZarrPartial.numlocales | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 test/library/packages/Zarr/ZarrPartial.numlocales diff --git a/modules/packages/Zarr.chpl b/modules/packages/Zarr.chpl index 9b11b77c598c..087b279668e7 100644 --- a/modules/packages/Zarr.chpl +++ b/modules/packages/Zarr.chpl @@ -465,7 +465,7 @@ module Zarr { */ - proc readZarrArrayPartial(directoryPath: string, type dtype, param dimCount: int, partialDomain: domain(dimCount), + proc readZarrArrayPartial(directoryPath: string, type dtype, param dimCount: int, partialDomain, bloscThreads: int(32) = 1, 
targetLocales: [] locale = Locales) throws { var md = getMetadata(directoryPath); validateMetadata(md, dtype, dimCount); diff --git a/test/library/packages/Zarr/ZarrPartial.chpl b/test/library/packages/Zarr/ZarrPartial.chpl index 2e06a69ba79b..a3220f155693 100644 --- a/test/library/packages/Zarr/ZarrPartial.chpl +++ b/test/library/packages/Zarr/ZarrPartial.chpl @@ -2,10 +2,12 @@ use Zarr; use IO; use FileSystem; use Random; +use BlockDist; proc test3D() { - const fullD: domain(3) = {0..10, 0..10, 0..10}; - const partialD: domain(3) = {0..5, 0..5, 0..5}; + const dist = new blockDist(boundingBox={0..10, 0..10, 0..10}); + const fullD = dist.createDomain({0..10, 0..10, 0..10}); + const partialD = dist.createDomain({0..0, 0..0, 0..10}); var ones: [fullD] real(32) = 1; var twos: [fullD] real(32) = 2; diff --git a/test/library/packages/Zarr/ZarrPartial.numlocales b/test/library/packages/Zarr/ZarrPartial.numlocales new file mode 100644 index 000000000000..7a754f414cd8 --- /dev/null +++ b/test/library/packages/Zarr/ZarrPartial.numlocales @@ -0,0 +1,2 @@ +1 +2 \ No newline at end of file From a280111927858b4d015fe62b656b6c9ac794ae20 Mon Sep 17 00:00:00 2001 From: Brandon Neth Date: Fri, 11 Oct 2024 10:05:19 -0700 Subject: [PATCH 075/107] Docs and updating old comments --- Signed-off-by: Brandon Neth --- modules/packages/Zarr.chpl | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/modules/packages/Zarr.chpl b/modules/packages/Zarr.chpl index 087b279668e7..f04e8c1b1698 100644 --- a/modules/packages/Zarr.chpl +++ b/modules/packages/Zarr.chpl @@ -367,6 +367,10 @@ module Zarr { :arg dimCount: Dimensionality of the zarr array + :arg bloscThreads: The number of threads to use during compression (default=1) + + :arg targetLocales: The locales to use for reading the array in the shape the + array will be distributed */ proc readZarrArray(directoryPath: string, type dtype, param dimCount: int, bloscThreads: int(32) = 1, targetLocales: [] locale = 
Locales) throws { var md = getMetadata(directoryPath); @@ -460,10 +464,25 @@ module Zarr { } } - /* + Reads part of a v2.0 zarr store from storage using all locales, returning a + block distributed array. Each locale reads and decompresses the chunks + with elements in its subdomain. This method assumes a shared filesystem + where all nodes can access the store directory. + :arg directoryPath: Relative or absolute path to the root of the zarr + store. The store is expected to contain a '.zarray' metadata file + + :arg dtype: Chapel type of the store's data + + :arg dimCount: Dimensionality of the zarr array + + :arg partialDomain: The domain of the elements of the array that should be read + + :arg bloscThreads: The number of threads to use during compression (default=1) + :arg targetLocales: The locales to use for reading the array in the shape the + array will be distributed */ proc readZarrArrayPartial(directoryPath: string, type dtype, param dimCount: int, partialDomain, bloscThreads: int(32) = 1, targetLocales: [] locale = Locales) throws { From 886315053a4942a121e4b3c1048e8de853a532f1 Mon Sep 17 00:00:00 2001 From: Brandon Neth Date: Fri, 11 Oct 2024 10:38:59 -0700 Subject: [PATCH 076/107] linting --- Signed-off-by: Brandon Neth --- modules/packages/Zarr.chpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/packages/Zarr.chpl b/modules/packages/Zarr.chpl index f04e8c1b1698..2dd026fe7e0e 100644 --- a/modules/packages/Zarr.chpl +++ b/modules/packages/Zarr.chpl @@ -483,8 +483,8 @@ module Zarr { :arg targetLocales: The locales to use for reading the array in the shape the array will be distributed - */ - proc readZarrArrayPartial(directoryPath: string, type dtype, param dimCount: int, partialDomain, + */ + proc readZarrArrayPartial(directoryPath: string, type dtype, param dimCount: int, partialDomain, bloscThreads: int(32) = 1, targetLocales: [] locale = Locales) throws { var md = getMetadata(directoryPath); validateMetadata(md, 
dtype, dimCount); From e58e81ea89daa9efa9293c08a524b607ada8b7ff Mon Sep 17 00:00:00 2001 From: Ben Harshbarger Date: Wed, 9 Oct 2024 11:55:32 -0700 Subject: [PATCH 077/107] Avoid segfault when there is no parent class Signed-off-by: Ben Harshbarger --- frontend/lib/resolution/Resolver.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/frontend/lib/resolution/Resolver.cpp b/frontend/lib/resolution/Resolver.cpp index c07611f8e583..c80663c1bd43 100644 --- a/frontend/lib/resolution/Resolver.cpp +++ b/frontend/lib/resolution/Resolver.cpp @@ -3971,9 +3971,11 @@ static const Type* getGenericType(Context* context, const Type* recv) { gen = cur->instantiatedFromCompositeType(); if (gen == nullptr) gen = cur; } else if (auto bct = recv->toBasicClassType()) { - if (bct->parentClassType()->instantiatedFromCompositeType()) { - auto pt = getGenericType(context, bct->parentClassType())->toBasicClassType(); - bct = BasicClassType::get(context, bct->id(), bct->name(), pt, bct->instantiatedFrom(), bct->substitutions()); + if (auto pct = bct->parentClassType()) { + if (pct->instantiatedFromCompositeType()) { + auto pt = getGenericType(context, pct)->toBasicClassType(); + bct = BasicClassType::get(context, bct->id(), bct->name(), pt, bct->instantiatedFrom(), bct->substitutions()); + } } gen = bct->instantiatedFromCompositeType(); From 42f929d8c237a5b7876a94d6ac2ecba19b08b7f4 Mon Sep 17 00:00:00 2001 From: Ben Harshbarger Date: Wed, 9 Oct 2024 11:55:44 -0700 Subject: [PATCH 078/107] Improve generation of super.init() call in default initializers This commit fixes an issue where super.init() calls were being generated despite the lack of a parent class. This lead to resolution failures when trying to resolve init on the root class. This kind of error manifested itself when there was an interface in the inherit-exprs list of the class. The 'initHelper' function was naively ignoring the possibility of such expressions. 
The bug is fixed by first relying on ``initialTypeForTypeDecl`` to compute the parent class rather than doing it manually inside 'initHelper'. ----- While here, it was convenient to fix another super.init() related bug when generating default initializers when a parent (or other ancestor class) had a user-defined initializer. In such cases the compiler should not be generating formals for classes that define their own initializer, and should instead be invoking an empty 'super.init()' when such a class is the direct parent. A straightforward way to fix this issue was to abandon the recursive nature of the 'initHelper' function and instead build the default initializer for the parent class. With a correct parent initializer we can simply copy the formals to the child class initializer and avoid having to think recursively. Lastly, 'initHelper' is modified to return a boolean indicating whether a 'super.init()' call is needed, as opposed to examining the inherit-exprs of the class. Signed-off-by: Ben Harshbarger --- frontend/lib/resolution/default-functions.cpp | 115 ++++++++++-------- .../test/resolution/testInitSemantics.cpp | 81 ++++++++++++ 2 files changed, 146 insertions(+), 50 deletions(-) diff --git a/frontend/lib/resolution/default-functions.cpp b/frontend/lib/resolution/default-functions.cpp index 96f188ede659..39fa62d5a82b 100644 --- a/frontend/lib/resolution/default-functions.cpp +++ b/frontend/lib/resolution/default-functions.cpp @@ -297,40 +297,55 @@ static void collectFields(const AstNode* ast, } } -static void initHelper(Context* context, +static const BuilderResult& buildInitializer(Context* context, ID typeID); + +static bool initHelper(Context* context, Builder* builder, const AggregateDecl* typeDecl, const Location& dummyLoc, - AstList& formals, AstList& superArgs, AstList& stmts, - bool isChild = true) { + AstList& formals, AstList& superArgs, AstList& stmts) { + // Return 'true' if a super.init call is necessary + bool addSuperInit = false; + + 
// Check if we need a super.init() call. If the parent has a default + // initializer, add arguments to the super.init() and formals to the + // current initializer. if (auto cls = typeDecl->toClass()) { - if (cls->numInheritExprs() == 1) { - ResolutionResultByPostorderID r; - auto visitor = Resolver::createForParentClass(context, typeDecl, - {}, nullptr, r); - cls->inheritExpr(0)->traverse(visitor); - auto res = r.byAst(cls->inheritExpr(0)); - if (auto parentType = res.type().type()) { - if (auto pct = parentType->getCompositeType()) { - const Type* manager = nullptr; - auto borrowedNonnilDecor = - ClassTypeDecorator(ClassTypeDecorator::BORROWED_NONNIL); - auto parentReceiver = - ClassType::get(context, pct->toBasicClassType(), manager, borrowedNonnilDecor); - - // Do not add formals if the parent has a user-defined initializer - // TODO: It would be nice to be able to generate a nice error message - // for the user if they try and pass arguments for the parent in - // this case. - if (!areOverloadsPresentInDefiningScope(context, parentReceiver, QualifiedType::INIT_RECEIVER, USTR("init"))) { - auto parentAst = parsing::idToAst(context, pct->id()); - if (auto parentDecl = parentAst->toAggregateDecl()) { - initHelper(context, builder, parentDecl, dummyLoc, - formals, superArgs, stmts, /*isChild=*/false); - } + auto t = initialTypeForTypeDecl(context, cls->id()); + auto bct = t->getCompositeType()->toBasicClassType(); + auto pct = bct->parentClassType(); + + if (pct && !pct->isObjectType()) { + addSuperInit = true; + + const Type* manager = nullptr; + auto borrowedNonnilDecor = + ClassTypeDecorator(ClassTypeDecorator::BORROWED_NONNIL); + auto parentReceiver = + ClassType::get(context, pct->toBasicClassType(), manager, borrowedNonnilDecor); + auto userDefinedExists = areOverloadsPresentInDefiningScope(context, + parentReceiver, + QualifiedType::INIT_RECEIVER, + USTR("init")); + + if (!userDefinedExists) { + auto& br = buildInitializer(context, pct->id()); + auto mod = 
br.topLevelExpression(0)->toModule(); + auto fn = mod->child(0)->toFunction(); + + // Add formals and super.init() arguments + for (auto formal : fn->formals()) { + if (auto named = formal->toNamedDecl(); + named && named->name() != USTR("this")) { + formals.push_back(formal->copy()); + + owned arg = Identifier::build(builder, dummyLoc, named->name()); + superArgs.push_back(std::move(arg)); } } } + } else { + addSuperInit = false; } } @@ -358,26 +373,22 @@ static void initHelper(Context* context, typeExpr ? typeExpr->copy() : nullptr, initExpr ? initExpr->copy() : nullptr); - if (isChild) { - // Create 'this.field = arg;' statement - owned lhs = Dot::build(builder, dummyLoc, - Identifier::build(builder, dummyLoc, USTR("this")), - field->name()); - owned rhs = Identifier::build(builder, dummyLoc, field->name()); - owned assign = OpCall::build(builder, dummyLoc, USTR("="), - std::move(lhs), std::move(rhs)); - stmts.push_back(std::move(assign)); - } else { - // collect arguments for super.init(...) 
- owned arg = Identifier::build(builder, dummyLoc, field->name()); - superArgs.push_back(std::move(arg)); - } + // Create 'this.field = arg;' statement + owned lhs = Dot::build(builder, dummyLoc, + Identifier::build(builder, dummyLoc, USTR("this")), + field->name()); + owned rhs = Identifier::build(builder, dummyLoc, field->name()); + owned assign = OpCall::build(builder, dummyLoc, USTR("="), + std::move(lhs), std::move(rhs)); + stmts.push_back(std::move(assign)); formals.push_back(std::move(formal)); } + + return addSuperInit; } -static const BuilderResult& buildInitializer(Context* context, ID typeID) { +const BuilderResult& buildInitializer(Context* context, ID typeID) { auto typeDecl = parsing::idToAst(context, typeID)->toAggregateDecl(); auto parentMod = parsing::idToParentModule(context, typeID); auto modName = "chpl__generated_" + parentMod.symbolName(context).str() + "_" + typeDecl->name().str() + "_init"; @@ -393,14 +404,18 @@ static const BuilderResult& buildInitializer(Context* context, ID typeID) { AstList formals; AstList stmts; AstList superArgs; - initHelper(context, builder, typeDecl, dummyLoc, formals, superArgs, stmts); - - if (auto cls = typeDecl->toClass()) { - if (cls->numInheritExprs() > 0) { - owned dot = Dot::build(builder, dummyLoc, Identifier::build(builder, dummyLoc, USTR("super")), USTR("init")); - owned call = FnCall::build(builder, dummyLoc, std::move(dot), std::move(superArgs), false); - stmts.insert(stmts.begin(), std::move(call)); - } + bool addSuperInit = initHelper(context, builder, typeDecl, dummyLoc, + formals, superArgs, stmts); + + if (addSuperInit) { + owned dot = Dot::build(builder, dummyLoc, + Identifier::build(builder, dummyLoc, + USTR("super")), + USTR("init")); + owned call = FnCall::build(builder, dummyLoc, + std::move(dot), std::move(superArgs), + /*callUsedSquareBrackets*/false); + stmts.insert(stmts.begin(), std::move(call)); } auto body = Block::build(builder, dummyLoc, std::move(stmts)); diff --git 
a/frontend/test/resolution/testInitSemantics.cpp b/frontend/test/resolution/testInitSemantics.cpp index 724a9b4b776b..4218100306e4 100644 --- a/frontend/test/resolution/testInitSemantics.cpp +++ b/frontend/test/resolution/testInitSemantics.cpp @@ -1515,6 +1515,87 @@ static void testInheritance() { xt->stringify(ss, chpl::StringifyKind::CHPL_SYNTAX); assert(ss.str() == "owned B(int(64), real(64))"); } + + // Default initializer when parent has user-defined initializer + { + Context ctx; + Context* context = &ctx; + ErrorGuard guard(context); + + std::string program = R"""( + class A { + var x : int; + + proc init(x: int = 0) { + this.x = x; + } + } + + class B : A { + var y : string; + } + + var b1 = new B(); + var b2 = new B("test"); + )"""; + + auto vars = resolveTypesOfVariables(context, program, {"b1", "b2"}); + auto b1 = vars["b1"].type(); + auto b2 = vars["b2"].type(); + + auto check = [] (const Type* type) { + std::stringstream ss; + type->stringify(ss, chpl::StringifyKind::CHPL_SYNTAX); + assert(ss.str() == "owned B"); + }; + + check(b1); + check(b2); + } + + // Default initializer when grandparent has user-defined initializer + { + Context ctx; + Context* context = &ctx; + ErrorGuard guard(context); + + std::string program = R"""( + class X { + var one : int; + + proc init(one: int = 0) { + this.one = one; + } + } + + class Y : X { + var two : real; + } + + class Z : Y { + var three : string; + } + + var z1 = new Z(); + var z2 = new Z(42.0); + var z3 = new Z(42.0, "test"); + )"""; + + auto vars = resolveTypesOfVariables(context, program, {"z1", "z2", "z3"}); + auto z1 = vars["z1"].type(); + auto z2 = vars["z2"].type(); + auto z3 = vars["z3"].type(); + + auto check = [] (const Type* type) { + std::stringstream ss; + type->stringify(ss, chpl::StringifyKind::CHPL_SYNTAX); + assert(ss.str() == "owned Z"); + }; + + check(z1); + check(z2); + check(z3); + } } static void testImplicitSuperInit() { From d9299f49973d93cf9c2753125e7521abfb605b4b Mon Sep 17 00:00:00 
2001 From: Ben Harshbarger Date: Wed, 9 Oct 2024 12:07:47 -0700 Subject: [PATCH 079/107] Avoid segfault when super.init() does not resolve Signed-off-by: Ben Harshbarger --- frontend/lib/resolution/InitResolver.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/frontend/lib/resolution/InitResolver.cpp b/frontend/lib/resolution/InitResolver.cpp index 8261f3f78827..e3d94b92165e 100644 --- a/frontend/lib/resolution/InitResolver.cpp +++ b/frontend/lib/resolution/InitResolver.cpp @@ -683,14 +683,15 @@ bool InitResolver::handleCallToSuperInit(const FnCall* node, } void InitResolver::updateSuperType(const CallResolutionResult* c) { - auto& msc = c->mostSpecific().only(); - auto superThis = msc.formalActualMap().byFormalIdx(0).formalType().type(); + if (auto& msc = c->mostSpecific().only()) { + auto superThis = msc.formalActualMap().byFormalIdx(0).formalType().type(); - this->superType_ = superThis->getCompositeType()->toBasicClassType(); + this->superType_ = superThis->getCompositeType()->toBasicClassType(); - // Only update the current receiver if the parent was generic. - if (superType_->instantiatedFromCompositeType() != nullptr) { - updateResolverVisibleReceiverType(); + // Only update the current receiver if the parent was generic. 
+ if (superType_->instantiatedFromCompositeType() != nullptr) { + updateResolverVisibleReceiverType(); + } } phase_ = PHASE_NEED_COMPLETE; From 1d97c119dcb34b5ebf31fd1816295572c7c7928a Mon Sep 17 00:00:00 2001 From: Ben Harshbarger Date: Fri, 11 Oct 2024 13:13:07 -0700 Subject: [PATCH 080/107] Remove unnecessary else branch Signed-off-by: Ben Harshbarger --- frontend/lib/resolution/default-functions.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/frontend/lib/resolution/default-functions.cpp b/frontend/lib/resolution/default-functions.cpp index 39fa62d5a82b..bedc41f79340 100644 --- a/frontend/lib/resolution/default-functions.cpp +++ b/frontend/lib/resolution/default-functions.cpp @@ -344,8 +344,6 @@ static bool initHelper(Context* context, } } } - } else { - addSuperInit = false; } } From 9833eae591e336ac40cacf9d3c7d6eb8de0a6d2a Mon Sep 17 00:00:00 2001 From: Ben Harshbarger Date: Fri, 11 Oct 2024 13:32:59 -0700 Subject: [PATCH 081/107] Add test for interface-related bug with super.init, and skip Interfaces in resolveModule Signed-off-by: Ben Harshbarger --- .../lib/resolution/resolution-queries.cpp | 1 + .../test/resolution/testInitSemantics.cpp | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/frontend/lib/resolution/resolution-queries.cpp b/frontend/lib/resolution/resolution-queries.cpp index 107fe688e5eb..158828a5ab00 100644 --- a/frontend/lib/resolution/resolution-queries.cpp +++ b/frontend/lib/resolution/resolution-queries.cpp @@ -193,6 +193,7 @@ const ResolutionResultByPostorderID& resolveModule(Context* context, ID id) { child->isTypeDecl() || child->isFunction() || child->isModule() || + child->isInterface() || child->isExternBlock()) { // Resolve use/import to find deprecation/unstable warnings. 
// child->isUse() || diff --git a/frontend/test/resolution/testInitSemantics.cpp b/frontend/test/resolution/testInitSemantics.cpp index 4218100306e4..e6b7664f3fa9 100644 --- a/frontend/test/resolution/testInitSemantics.cpp +++ b/frontend/test/resolution/testInitSemantics.cpp @@ -1596,6 +1596,27 @@ static void testInheritance() { check(z2); check(z3); } + + // Make sure that existence of an interface in the inherit-exprs list + // does not cause a super.init call to be generated. + { + Context ctx; + Context* context = &ctx; + ErrorGuard guard(context); + + std::string program = R"""( + interface myInterface {} + + class C : myInterface { + var x : string; + } + + var c = new C(); + )"""; + + auto m = parseModule(context, std::move(program)); + std::ignore = resolveModule(context, m->id()); + } } static void testImplicitSuperInit() { From ba5e608f69120e25eed37b3c1e1f34eec7e5a402 Mon Sep 17 00:00:00 2001 From: Brandon Neth Date: Fri, 11 Oct 2024 15:08:21 -0700 Subject: [PATCH 082/107] use temporary directories instead of the named ones --- Signed-off-by: Brandon Neth --- test/library/packages/Zarr/ZarrPartial.chpl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/library/packages/Zarr/ZarrPartial.chpl b/test/library/packages/Zarr/ZarrPartial.chpl index a3220f155693..fdceeee1ef56 100644 --- a/test/library/packages/Zarr/ZarrPartial.chpl +++ b/test/library/packages/Zarr/ZarrPartial.chpl @@ -3,7 +3,7 @@ use IO; use FileSystem; use Random; use BlockDist; - +use Subprocess; proc test3D() { const dist = new blockDist(boundingBox={0..10, 0..10, 0..10}); const fullD = dist.createDomain({0..10, 0..10, 0..10}); @@ -12,13 +12,15 @@ proc test3D() { var ones: [fullD] real(32) = 1; var twos: [fullD] real(32) = 2; - if (isDir("ones")) then rmTree("ones"); - if (isDir("twos")) then rmTree("twos"); - writeZarrArray("ones", ones, (3,3,3)); - writeZarrArray("twos", twos, (3,3,3)); + var dir1, dir2: string; + spawn(["mktemp", "-d"], 
stdout=pipeStyle.pipe).stdout.readln(dir1); + spawn(["mktemp", "-d"], stdout=pipeStyle.pipe).stdout.readln(dir2); + + writeZarrArray(dir1, ones, (3,3,3)); + writeZarrArray(dir2, twos, (3,3,3)); - var inputData = readZarrArray("ones", real(32), 3); - inputData[partialD] = readZarrArrayPartial("twos", real(32), 3, partialD); + var inputData = readZarrArray(dir1, real(32), 3); + inputData[partialD] = readZarrArrayPartial(dir2, real(32), 3, partialD); var expectedData: [fullD] real(32) = 1.0; expectedData[partialD] = 2.0; @@ -26,8 +28,6 @@ proc test3D() { assert(inputData.domain == expectedData.domain, "Domain mismatch: %? %?".format(inputData.domain, expectedData.domain)); forall i in inputData.domain do assert(inputData[i] == expectedData[i], "Mismatch for 3D real data on indices: %?.\nWritten: %?\nRead: %?".format(i, inputData[i], expectedData[i])); - rmTree("ones"); - rmTree("twos"); } proc testOutOfBounds() { From 93826ac5cb0f6469fe92b2de5e2277e74ebc5ead Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 3 Oct 2024 16:27:26 -0700 Subject: [PATCH 083/107] Add temporary workaround for the chpl_t resolution Signed-off-by: Danila Fedorin --- frontend/lib/resolution/Resolver.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/frontend/lib/resolution/Resolver.cpp b/frontend/lib/resolution/Resolver.cpp index c07611f8e583..9ba151153495 100644 --- a/frontend/lib/resolution/Resolver.cpp +++ b/frontend/lib/resolution/Resolver.cpp @@ -2577,10 +2577,11 @@ QualifiedType Resolver::typeForId(const ID& id, bool localGenericToUnknown) { ct = mr; auto fieldName = parsing::fieldIdToName(context, id); // TODO: shared has additional fields that are not generic - CHPL_ASSERT(fieldName == "chpl_t" || fieldName == "chpl_p"); - auto intent = fieldName == "chpl_t" ? 
QualifiedType::TYPE : QualifiedType::VAR; - auto borrowed = nct->withDecorator(context, nct->decorator().toBorrowed()); - return QualifiedType(intent, borrowed); + if (fieldName == "chpl_t" || fieldName == "chpl_p") { + auto intent = fieldName == "chpl_t" ? QualifiedType::TYPE : QualifiedType::VAR; + auto borrowed = nct->withDecorator(context, nct->decorator().toBorrowed()); + return QualifiedType(intent, borrowed); + } } else if (auto comprt = rt->getCompositeType()) { if (comprt->id() == parentId) { ct = comprt; // handle record, class with field From 2176ef5b97027a7e3db331c2a56a1b3ca271b45a Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 3 Oct 2024 16:47:30 -0700 Subject: [PATCH 084/107] Replace bugfix to Ahmad's code with a more reasonable version Signed-off-by: Danila Fedorin --- frontend/lib/resolution/Resolver.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/frontend/lib/resolution/Resolver.cpp b/frontend/lib/resolution/Resolver.cpp index 9ba151153495..f5131eab21ab 100644 --- a/frontend/lib/resolution/Resolver.cpp +++ b/frontend/lib/resolution/Resolver.cpp @@ -2571,18 +2571,13 @@ QualifiedType Resolver::typeForId(const ID& id, bool localGenericToUnknown) { // that we are working with a nested method if (auto rt = methodReceiverType().type()) { // get the new class type if the receiver is a class - auto nct = rt->toClassType(); - // get the manager record using ClassType method managerRecordType() - if (auto mr = checkIfReceiverIsManagerRecord(context, nct, parentId)) { - ct = mr; - auto fieldName = parsing::fieldIdToName(context, id); - // TODO: shared has additional fields that are not generic - if (fieldName == "chpl_t" || fieldName == "chpl_p") { - auto intent = fieldName == "chpl_t" ? 
QualifiedType::TYPE : QualifiedType::VAR; - auto borrowed = nct->withDecorator(context, nct->decorator().toBorrowed()); - return QualifiedType(intent, borrowed); + if (auto ct = rt->toClassType()) { + if (auto mr = ct->managerRecordType(context)) { + rt = mr; } - } else if (auto comprt = rt->getCompositeType()) { + } + + if (auto comprt = rt->getCompositeType()) { if (comprt->id() == parentId) { ct = comprt; // handle record, class with field } else if (auto bct = comprt->toBasicClassType()) { From b8cd0d81a84a19e5b6f357e016a09583cb30235b Mon Sep 17 00:00:00 2001 From: Rui Chen Date: Fri, 11 Oct 2024 21:38:32 -0400 Subject: [PATCH 085/107] chore: bump pyyaml to 6.0.2 to support py3.13 build Signed-off-by: Rui Chen --- third-party/chpl-venv/test-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/chpl-venv/test-requirements.txt b/third-party/chpl-venv/test-requirements.txt index a8f97300c880..2da4f7deaa24 100644 --- a/third-party/chpl-venv/test-requirements.txt +++ b/third-party/chpl-venv/test-requirements.txt @@ -1,4 +1,4 @@ -PyYAML==6.0.1 +PyYAML==6.0.2 filelock==3.12.2 argcomplete==3.1.2 setuptools==68.0.0 From 6f8de374773cd5801febe93c776e2129138c1d5f Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Mon, 14 Oct 2024 08:12:00 -0600 Subject: [PATCH 086/107] force inlining of complex math functions Signed-off-by: Jade Abraham --- runtime/include/chplmath.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/runtime/include/chplmath.h b/runtime/include/chplmath.h index b2d7309bb919..76dce20ec2de 100644 --- a/runtime/include/chplmath.h +++ b/runtime/include/chplmath.h @@ -45,33 +45,33 @@ MAYBE_GPU static inline int chpl_macro_float_signbit(float x) { return signbit(x #ifdef __cplusplus // workaround for C++ lacking C99 complex support #define chpl_complex_wrapper(basename) \ - MAYBE_GPU static inline double \ + MAYBE_GPU static ___always_inline double \ chpl_##basename(_complex128 x) { return 
__builtin_##basename(x); } \ - MAYBE_GPU static inline float \ + MAYBE_GPU static ___always_inline float \ chpl_##basename##f(_complex64 x) { return __builtin_##basename##f(x); } chpl_COMPLEX_RETURN_REAL(chpl_complex_wrapper) #undef chpl_complex_wrapper #define chpl_complex_wrapper(basename) \ - MAYBE_GPU static inline _complex128 \ + MAYBE_GPU static ___always_inline _complex128 \ chpl_##basename(_complex128 x) { return __builtin_##basename(x); } \ - MAYBE_GPU static inline _complex64 \ + MAYBE_GPU static ___always_inline _complex64 \ chpl_##basename##f(_complex64 x) { return __builtin_##basename##f(x); } chpl_COMPLEX_RETURN_COMPLEX(chpl_complex_wrapper) #undef chpl_complex_wrapper #else #define chpl_complex_wrapper(basename) \ - MAYBE_GPU static inline double \ + MAYBE_GPU static ___always_inline double \ chpl_##basename(_complex128 x) { return basename(x); } \ - MAYBE_GPU static inline float \ + MAYBE_GPU static ___always_inline float \ chpl_##basename##f(_complex64 x) { return basename##f(x); } chpl_COMPLEX_RETURN_REAL(chpl_complex_wrapper) #undef chpl_complex_wrapper #define chpl_complex_wrapper(basename) \ - MAYBE_GPU static inline _complex128 \ + MAYBE_GPU static ___always_inline _complex128 \ chpl_##basename(_complex128 x) { return basename(x); } \ - MAYBE_GPU static inline _complex64 \ + MAYBE_GPU static ___always_inline _complex64 \ chpl_##basename##f(_complex64 x) { return basename##f(x); } chpl_COMPLEX_RETURN_COMPLEX(chpl_complex_wrapper) #undef chpl_complex_wrapper From 002f1fd4738c5d9e5a2f822da362be9611c6e6a0 Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Wed, 9 Oct 2024 14:14:56 -0700 Subject: [PATCH 087/107] Update the version of pytest used by the language server Updates it from 8.2.0 to 8.3.3 Incidentally, this matches the version used by the Chapel Sphinx domain, though the two packages do not interact on that front so it does not matter. 
---- Signed-off-by: Lydia Duncan --- third-party/chpl-venv/cls-test-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/chpl-venv/cls-test-requirements.txt b/third-party/chpl-venv/cls-test-requirements.txt index 3c52cae07868..a3c41f771422 100644 --- a/third-party/chpl-venv/cls-test-requirements.txt +++ b/third-party/chpl-venv/cls-test-requirements.txt @@ -1,2 +1,2 @@ -pytest==8.2.0 +pytest==8.3.3 pytest-lsp==0.4.1 From 65023e36acd2f51646009aaa05e60f7be319f0c1 Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Fri, 11 Oct 2024 13:24:35 -0700 Subject: [PATCH 088/107] Also update pytest-lsp for the language server Updates it from 0.4.1. to 0.4.3 There didn't seem to be a difference in behavior for it ---- Signed-off-by: Lydia Duncan --- third-party/chpl-venv/cls-test-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/chpl-venv/cls-test-requirements.txt b/third-party/chpl-venv/cls-test-requirements.txt index a3c41f771422..5225c096fcba 100644 --- a/third-party/chpl-venv/cls-test-requirements.txt +++ b/third-party/chpl-venv/cls-test-requirements.txt @@ -1,2 +1,2 @@ pytest==8.3.3 -pytest-lsp==0.4.1 +pytest-lsp==0.4.3 From 31f12b031a8350ef864099e7d9c0aa48419075c0 Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Fri, 11 Oct 2024 14:11:19 -0700 Subject: [PATCH 089/107] Update the Python bindings dependencies Updates: - attrs from 23.1.0 to 24.2.0 - cattrs from 23.1.2 to 24.1.2 - pygls from 1.3.0 to 1.3.1 - typeguard from 3.0.2 to 4.3.0 ---- Signed-off-by: Lydia Duncan --- third-party/chpl-venv/chapel-py-requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/third-party/chpl-venv/chapel-py-requirements.txt b/third-party/chpl-venv/chapel-py-requirements.txt index 3da048e94f5c..8f01c9a12656 100644 --- a/third-party/chpl-venv/chapel-py-requirements.txt +++ b/third-party/chpl-venv/chapel-py-requirements.txt @@ -1,6 +1,6 @@ -attrs==23.1.0 -cattrs==23.1.2 
+attrs==24.2.0 +cattrs==24.1.2 lsprotocol==2023.0.1 -pygls==1.3.0 -typeguard==3.0.2 +pygls==1.3.1 +typeguard==4.3.0 ConfigArgParse==1.7 From 83464dc8c5a194dae2a5c81a066d72f579c9c374 Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Fri, 11 Oct 2024 14:53:46 -0700 Subject: [PATCH 090/107] Update c2chapel's dependency Updates pycparser from 2.20 to 2.22 ---- Signed-off-by: Lydia Duncan --- third-party/chpl-venv/c2chapel-requirements.txt | 2 +- tools/c2chapel/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third-party/chpl-venv/c2chapel-requirements.txt b/third-party/chpl-venv/c2chapel-requirements.txt index 14c456cfb70e..8930fa110534 100644 --- a/third-party/chpl-venv/c2chapel-requirements.txt +++ b/third-party/chpl-venv/c2chapel-requirements.txt @@ -1,3 +1,3 @@ # tools/c2chapel/Makefile should match so fakeHeaders download matches -pycparser==2.20 +pycparser==2.22 pycparserext diff --git a/tools/c2chapel/Makefile b/tools/c2chapel/Makefile index 52baecbd84e4..a72cab4fbac9 100644 --- a/tools/c2chapel/Makefile +++ b/tools/c2chapel/Makefile @@ -35,7 +35,7 @@ link=$(bdir)/c2chapel # Note, this version is used only for the fake headers, # but it should probably match third-party/chpl-venv/c2chapel-requirements.txt -VERSION=2.20 +VERSION=2.22 TAR=release_v$(VERSION).tar.gz RELEASE=https://github.com/eliben/pycparser/archive/$(TAR) From 330214332d9543c6814a9018422e294bb8c6e01b Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Fri, 11 Oct 2024 14:54:56 -0700 Subject: [PATCH 091/107] Update the dependencies for start_test Updates: - PyYAML from 6.0.1 to 6.0.2 (source of merge conflict) - filelock from 3.12.2 to 3.16.1 - argcomplete from 3.1.2 to 3.5.1 - setuptools from 68.0.0 to 75.1.0 ---- Signed-off-by: Lydia Duncan --- third-party/chpl-venv/test-requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third-party/chpl-venv/test-requirements.txt b/third-party/chpl-venv/test-requirements.txt index 
2da4f7deaa24..f50835f61ccc 100644 --- a/third-party/chpl-venv/test-requirements.txt +++ b/third-party/chpl-venv/test-requirements.txt @@ -1,4 +1,4 @@ PyYAML==6.0.2 -filelock==3.12.2 -argcomplete==3.1.2 -setuptools==68.0.0 +filelock==3.16.1 +argcomplete==3.5.1 +setuptools==75.1.0 From f275a64532ed99310c1aeed8803496ce93b56128 Mon Sep 17 00:00:00 2001 From: Lydia Duncan Date: Fri, 11 Oct 2024 15:45:38 -0700 Subject: [PATCH 092/107] Update the chpldoc dependencies that aren't tied to the new Sphinx version Updates: - Jinja2 from 3.1.3 to 3.1.4 - Pygments from 2.17.2 to 2.18.0 - urllib3 from 2.2.1 to 2.2.3 - babel from 2.14.0 to 2.16.0 ---- Signed-off-by: Lydia Duncan --- third-party/chpl-venv/chpldoc-requirements1.txt | 2 +- third-party/chpl-venv/chpldoc-requirements2.txt | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/third-party/chpl-venv/chpldoc-requirements1.txt b/third-party/chpl-venv/chpldoc-requirements1.txt index 265423f39f9c..9c720f125ee4 100644 --- a/third-party/chpl-venv/chpldoc-requirements1.txt +++ b/third-party/chpl-venv/chpldoc-requirements1.txt @@ -1,2 +1,2 @@ # Split into 3 files to work around problems with CHPL_PIP_FROM_SOURCE -MarkupSafe==2.1.5 +MarkupSafe==3.0.1 diff --git a/third-party/chpl-venv/chpldoc-requirements2.txt b/third-party/chpl-venv/chpldoc-requirements2.txt index 7583018d1fc6..73f2f314a67b 100644 --- a/third-party/chpl-venv/chpldoc-requirements2.txt +++ b/third-party/chpl-venv/chpldoc-requirements2.txt @@ -1,7 +1,7 @@ # Split into 3 files to work around problems with CHPL_PIP_FROM_SOURCE -Jinja2==3.1.3 -Pygments==2.17.2 +Jinja2==3.1.4 +Pygments==2.18.0 Sphinx==7.2.6 -urllib3==2.2.1 +urllib3==2.2.3 docutils==0.20.1 -babel==2.14.0 +babel==2.16.0 From adb741c567b4f9f1460f7f23a02a45b76e04225c Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 3 Oct 2024 15:32:17 -0700 Subject: [PATCH 093/107] Improve passing signatures between LSP and CLS Signed-off-by: Danila Fedorin --- .../src/chpl-language-server.py | 
33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tools/chpl-language-server/src/chpl-language-server.py b/tools/chpl-language-server/src/chpl-language-server.py index f96260da9a82..f9b2fc94eb6a 100755 --- a/tools/chpl-language-server/src/chpl-language-server.py +++ b/tools/chpl-language-server/src/chpl-language-server.py @@ -499,6 +499,8 @@ def __init__(self, file: str, config: Optional["WorkspaceConfig"]): self.context: chapel.Context = chapel.Context() self.file_infos: List["FileInfo"] = [] self.global_uses: Dict[str, List[References]] = defaultdict(list) + self.instantiation_ids: Dict[chapel.TypedSignature, str] = {} + self.instantiation_id_counter = 0 if config: file_config = config.for_file(file) @@ -508,6 +510,27 @@ def __init__(self, file: str, config: Optional["WorkspaceConfig"]): self.context.set_module_paths(self.module_paths, self.file_paths) + def register_signature(self, sig: chapel.TypedSignature) -> str: + """ + The language server can't send over typed signatures directly for + situations such as call hierarchy items (but we need to reason about + instantiations). Instead, keep a global unique ID for each signature, + and use that to identify them. 
+ """ + if sig in self.instantiation_ids: + return self.instantiation_ids[sig] + + self.instantiation_id_counter += 1 + uid = str(self.instantiation_id_counter) + self.instantiation_ids[sig] = uid + return uid + + def retrieve_signature(self, uid: str) -> Optional[chapel.TypedSignature]: + for sig, sig_uid in self.instantiation_ids.items(): + if sig_uid == uid: + return sig + return None + def new_file_info( self, uri: str, use_resolver: bool ) -> Tuple["FileInfo", List[Any]]: @@ -1420,7 +1443,7 @@ def fn_to_call_hierarchy_item( fn: chapel.Function = sig.ast() item = self.sym_to_call_hierarchy_item(fn) fi, _ = self.get_file_info(item.uri) - item.data[1] = fi.index_of_instantiation(fn, sig) + item.data[1] = fi.context.register_signature(sig) return item @@ -1433,7 +1456,7 @@ def unpack_call_hierarchy_item( item.data is None or not isinstance(item.data, list) or not isinstance(item.data[0], str) - or not isinstance(item.data[1], int) + or not isinstance(item.data[1], str) ): self.show_message( "Call hierarchy item contains missing or invalid additional data", @@ -1456,11 +1479,7 @@ def unpack_call_hierarchy_item( # We don't handle that here. 
return None - instantiation = None - if idx != -1: - instantiation = fi.instantiation_at_index(fn, idx) - else: - instantiation = fi.concrete_instantiation_for(fn) + instantiation = fi.context.retrieve_signature(idx) return (fi, fn, instantiation) From 3c8ac594d1b68f198b10b7669635e49bbad22a05 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 3 Oct 2024 16:24:04 -0700 Subject: [PATCH 094/107] Use the current file's context when resolving referenced file for call hierarchy Signed-off-by: Danila Fedorin --- .../src/chpl-language-server.py | 69 ++++++++++++++----- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/tools/chpl-language-server/src/chpl-language-server.py b/tools/chpl-language-server/src/chpl-language-server.py index f9b2fc94eb6a..197c9a38eb89 100755 --- a/tools/chpl-language-server/src/chpl-language-server.py +++ b/tools/chpl-language-server/src/chpl-language-server.py @@ -1030,7 +1030,9 @@ def __init__(self, config: CLSConfig): super().__init__("chpl-language-server", "v0.1") self.contexts: Dict[str, ContextContainer] = {} - self.file_infos: Dict[str, FileInfo] = {} + self.context_ids: Dict[ContextContainer, str] = {} + self.context_id_counter = 0 + self.file_infos: Dict[Tuple[str, Optional[str]], FileInfo] = {} self.configurations: Dict[str, WorkspaceConfig] = {} self.use_resolver: bool = config.get("resolver") @@ -1130,9 +1132,17 @@ def get_context(self, uri: str) -> ContextContainer: for file in context.file_paths: self.contexts[file] = context self.contexts[path] = context + self.context_id_counter += 1 + self.context_ids[context] = str(self.context_id_counter) return context + def retrieve_context(self, context_id: str) -> Optional[ContextContainer]: + for ctx, cid in self.context_ids.items(): + if cid == context_id: + return ctx + return None + def eagerly_process_all_files(self, context: ContextContainer): cfg = context.config if cfg: @@ -1140,7 +1150,7 @@ def eagerly_process_all_files(self, context: ContextContainer): 
self.get_file_info("file://" + file, do_update=False) def get_file_info( - self, uri: str, do_update: bool = False + self, uri: str, do_update: bool = False, context_id: Optional[str] = None ) -> Tuple[FileInfo, List[Any]]: """ The language server maintains a FileInfo object per file. The FileInfo @@ -1151,19 +1161,36 @@ def get_file_info( creating one if it doesn't exist. If do_update is set to True, then the FileInfo's index is rebuilt even if it has already been computed. This is useful if the underlying file has changed. + + Most of the tiem, we will create a new context for a given URI. When + requested, however, context_id will be used to create a FileInfo + for a specific context. This is useful if e.g., file A wants to display + an instantiation in file B. """ errors = [] - if uri in self.file_infos: - file_info = self.file_infos[uri] + fi_key = (uri, context_id) + if fi_key in self.file_infos: + file_info = self.file_infos[fi_key] if do_update: errors = file_info.context.advance() else: - file_info, errors = self.get_context(uri).new_file_info( + if context_id: + context = self.retrieve_context(context_id) + assert(context) + else: + context = self.get_context(uri) + + file_info, errors = context.new_file_info( uri, self.use_resolver ) - self.file_infos[uri] = file_info + self.file_infos[fi_key] = file_info + + # Also make this the "default" context for this file in case we + # open it. 
+ if (uri, None) not in self.file_infos: + self.file_infos[(uri, None)] = file_info # filter out errors that are not related to the file cur_path = uri[len("file://") :] @@ -1419,7 +1446,8 @@ def sym_to_call_hierarchy_item( """ loc = location_to_location(sym.location()) - inst_idx = -1 + inst_id = "" + context_id = None return CallHierarchyItem( name=sym.name(), @@ -1428,11 +1456,11 @@ def sym_to_call_hierarchy_item( uri=loc.uri, range=loc.range, selection_range=location_to_range(sym.name_location()), - data=[sym.unique_id(), inst_idx], + data=[sym.unique_id(), inst_id, context_id], ) def fn_to_call_hierarchy_item( - self, sig: chapel.TypedSignature + self, sig: chapel.TypedSignature, caller_context: ContextContainer ) -> CallHierarchyItem: """ Like sym_to_call_hierarchy_item, but for function instantiations. @@ -1442,8 +1470,8 @@ def fn_to_call_hierarchy_item( """ fn: chapel.Function = sig.ast() item = self.sym_to_call_hierarchy_item(fn) - fi, _ = self.get_file_info(item.uri) - item.data[1] = fi.context.register_signature(sig) + item.data[1] = caller_context.register_signature(sig) + item.data[2] = self.context_ids[caller_context] return item @@ -1457,15 +1485,22 @@ def unpack_call_hierarchy_item( or not isinstance(item.data, list) or not isinstance(item.data[0], str) or not isinstance(item.data[1], str) + or not isinstance(item.data[2], str) ): self.show_message( "Call hierarchy item contains missing or invalid additional data", MessageType.Error, ) return None - uid, idx = item.data + uid, idx, ctx = item.data + + context_id = None + if ctx != "": + context_id = ctx - fi, _ = self.get_file_info(item.uri) + print(f"Context: {context_id}", file=sys.stderr) + + fi, _ = self.get_file_info(item.uri, context_id=context_id) # TODO: Performance: # Once the Python bindings supports it, we can use the @@ -2019,7 +2054,7 @@ async def prepare_call_hierarchy( # Oddly, returning multiple here makes for no child nodes in the VSCode # UI. Just take one signature for now. 
- return next(([ls.fn_to_call_hierarchy_item(sig)] for sig in sigs), []) + return next(([ls.fn_to_call_hierarchy_item(sig, fi.context)] for sig in sigs), []) @server.feature(CALL_HIERARCHY_INCOMING_CALLS) async def call_hierarchy_incoming( @@ -2065,7 +2100,7 @@ async def call_hierarchy_incoming( if isinstance(called_fn, str): item = ls.sym_to_call_hierarchy_item(hack_id_to_node[called_fn]) else: - item = ls.fn_to_call_hierarchy_item(called_fn) + item = ls.fn_to_call_hierarchy_item(called_fn, fi.context) to_return.append( CallHierarchyIncomingCall( @@ -2089,7 +2124,7 @@ async def call_hierarchy_outgoing( if unpacked is None: return None - _, fn, instantiation = unpacked + fi, fn, instantiation = unpacked outgoing_calls: Dict[chapel.TypedSignature, List[chapel.FnCall]] = ( defaultdict(list) @@ -2112,7 +2147,7 @@ async def call_hierarchy_outgoing( to_return = [] for called_fn, calls in outgoing_calls.items(): - item = ls.fn_to_call_hierarchy_item(called_fn) + item = ls.fn_to_call_hierarchy_item(called_fn, fi.context) to_return.append( CallHierarchyOutgoingCall( item, From 2cd00bc163886809688971d1dbc9d79f80e23956 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 3 Oct 2024 16:25:16 -0700 Subject: [PATCH 095/107] Try to be more consistent about LSP bridge types Signed-off-by: Danila Fedorin --- .../src/chpl-language-server.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tools/chpl-language-server/src/chpl-language-server.py b/tools/chpl-language-server/src/chpl-language-server.py index 197c9a38eb89..aa6d3487b514 100755 --- a/tools/chpl-language-server/src/chpl-language-server.py +++ b/tools/chpl-language-server/src/chpl-language-server.py @@ -1446,7 +1446,7 @@ def sym_to_call_hierarchy_item( """ loc = location_to_location(sym.location()) - inst_id = "" + inst_id = None context_id = None return CallHierarchyItem( @@ -1484,8 +1484,8 @@ def unpack_call_hierarchy_item( item.data is None or not isinstance(item.data, list) or not 
isinstance(item.data[0], str) - or not isinstance(item.data[1], str) - or not isinstance(item.data[2], str) + or not isinstance(item.data[1], Optional[str]) + or not isinstance(item.data[2], Optional[str]) ): self.show_message( "Call hierarchy item contains missing or invalid additional data", @@ -1494,13 +1494,7 @@ def unpack_call_hierarchy_item( return None uid, idx, ctx = item.data - context_id = None - if ctx != "": - context_id = ctx - - print(f"Context: {context_id}", file=sys.stderr) - - fi, _ = self.get_file_info(item.uri, context_id=context_id) + fi, _ = self.get_file_info(item.uri, context_id=ctx) # TODO: Performance: # Once the Python bindings supports it, we can use the From 4d17a0e6ef90a0ee6340a1ee1e7ba1312c72a0fb Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 3 Oct 2024 18:20:15 -0700 Subject: [PATCH 096/107] Apply black Signed-off-by: Danila Fedorin --- .../src/chpl-language-server.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/chpl-language-server/src/chpl-language-server.py b/tools/chpl-language-server/src/chpl-language-server.py index aa6d3487b514..b57b9496ee5a 100755 --- a/tools/chpl-language-server/src/chpl-language-server.py +++ b/tools/chpl-language-server/src/chpl-language-server.py @@ -1150,7 +1150,10 @@ def eagerly_process_all_files(self, context: ContextContainer): self.get_file_info("file://" + file, do_update=False) def get_file_info( - self, uri: str, do_update: bool = False, context_id: Optional[str] = None + self, + uri: str, + do_update: bool = False, + context_id: Optional[str] = None, ) -> Tuple[FileInfo, List[Any]]: """ The language server maintains a FileInfo object per file. 
The FileInfo @@ -1178,13 +1181,11 @@ def get_file_info( else: if context_id: context = self.retrieve_context(context_id) - assert(context) + assert context else: context = self.get_context(uri) - file_info, errors = context.new_file_info( - uri, self.use_resolver - ) + file_info, errors = context.new_file_info(uri, self.use_resolver) self.file_infos[fi_key] = file_info # Also make this the "default" context for this file in case we @@ -2048,7 +2049,10 @@ async def prepare_call_hierarchy( # Oddly, returning multiple here makes for no child nodes in the VSCode # UI. Just take one signature for now. - return next(([ls.fn_to_call_hierarchy_item(sig, fi.context)] for sig in sigs), []) + return next( + ([ls.fn_to_call_hierarchy_item(sig, fi.context)] for sig in sigs), + [], + ) @server.feature(CALL_HIERARCHY_INCOMING_CALLS) async def call_hierarchy_incoming( From 18631c73d983d0074255481569956b3b4db39999 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 11 Oct 2024 17:10:55 -0700 Subject: [PATCH 097/107] Add tests for call hierarchy Signed-off-by: Danila Fedorin --- .../test/call_hierarchy.py | 157 ++++++++++++++++++ tools/chpl-language-server/test/util/utils.py | 20 ++- 2 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 tools/chpl-language-server/test/call_hierarchy.py diff --git a/tools/chpl-language-server/test/call_hierarchy.py b/tools/chpl-language-server/test/call_hierarchy.py new file mode 100644 index 000000000000..dc8c2e53fbfd --- /dev/null +++ b/tools/chpl-language-server/test/call_hierarchy.py @@ -0,0 +1,157 @@ +""" +Tests basic functionality, including autocompletion, go-to-definition, hover, +and references +""" + +import sys + +from lsprotocol.types import ClientCapabilities +from lsprotocol.types import CallHierarchyPrepareParams, CallHierarchyOutgoingCallsParams, CallHierarchyItem +from lsprotocol.types import InitializeParams +import pytest +import pytest_lsp +import typing +from pytest_lsp import ClientServerConfig, 
LanguageClient + +from util.utils import * +from util.config import CLS_PATH + + +@pytest_lsp.fixture( + config=ClientServerConfig( + server_command=[ + sys.executable, + CLS_PATH(), + "--resolver", + ], + client_factory=get_base_client, + ) +) +async def client(lsp_client: LanguageClient): + # Setup + params = InitializeParams(capabilities=ClientCapabilities()) + await lsp_client.initialize_session(params) + + yield + + # Teardown + await lsp_client.shutdown_session() + +class CallTree: + def __init__(self, item_id: str, children: typing.List["CallTree"]): + self.item_id = item_id + self.children = children + +async def collect_call_tree(client: LanguageClient, item: CallHierarchyItem, depth: int) -> typing.Optional[CallTree]: + if depth <= 0: + return None + + assert(isinstance(item.data, list)) + assert(len(item.data) == 3) + item_id = item.data[0] + + children = [] + outgoing = await client.call_hierarchy_outgoing_calls_async(CallHierarchyOutgoingCallsParams(item)) + if outgoing is not None: + for outgoing_call in outgoing: + new_tree = await collect_call_tree(client, outgoing_call.to, depth - 1) + if new_tree is not None: + children.append(new_tree) + + return CallTree(item_id, children) + + +async def compute_call_hierarchy(client: LanguageClient, doc: TextDocumentIdentifier, position: Position, depth: int) -> typing.Optional[CallTree]: + items = await client.text_document_prepare_call_hierarchy_async(CallHierarchyPrepareParams(text_document=doc, position=position)) + if items is None: + return None + + assert(len(items) == 1) + return await collect_call_tree(client, items[0], depth) + +def verify_call_hierarchy(tree: CallTree, expected: CallTree): + assert(tree.item_id == expected.item_id) + assert(len(tree.children) == len(expected.children)) + for i in range(len(tree.children)): + verify_call_hierarchy(tree.children[i], expected.children[i]) + +async def check_call_hierarchy(client: LanguageClient, doc: TextDocumentIdentifier, position: Position, expected: 
CallTree, depth: int = 10) -> typing.Optional[CallTree]: + items = await client.text_document_prepare_call_hierarchy_async(CallHierarchyPrepareParams(text_document=doc, position=position)) + assert(items is not None) + assert(len(items) == 1) + tree = await collect_call_tree(client, items[0], depth) + assert(tree is not None) + verify_call_hierarchy(tree, expected) + return tree + +@pytest.mark.asyncio +async def test_call_hierarchy_basic(client: LanguageClient): + file = """ + proc foo() {} + proc bar() do foo(); + bar(); + """ + + async with source_file(client, file) as doc: + expect = CallTree("main.bar", [ CallTree("main.foo", []) ]) + await check_call_hierarchy(client, doc, pos((2, 0)), expect) + +@pytest.mark.asyncio +async def test_call_hierarchy_overloads(client: LanguageClient): + file = """ + proc foo(arg: int) {} + proc foo(arg: bool) {} + foo(1); + foo(true); + """ + + async with source_file(client, file) as doc: + expect_int = CallTree("main.foo", []) + await check_call_hierarchy(client, doc, pos((2, 0)), expect_int) + expect_bool = CallTree("main.foo#1", []) + await check_call_hierarchy(client, doc, pos((3, 0)), expect_bool) + +@pytest.mark.asyncio +async def test_call_hierarchy_recursive(client: LanguageClient): + file = """ + proc foo() do foo(); + foo(); + """ + + async with source_file(client, file) as doc: + expect = CallTree("main.foo", [ CallTree("main.foo", []) ]) + await check_call_hierarchy(client, doc, pos((1, 0)), expect, depth=2) + +@pytest.mark.asyncio +async def test_call_hierarchy_across_files(client: LanguageClient): + fileA = """ + module A { + proc someImplementationDetail(arg: string) {} + } + """ + fileB = """ + module B { + use A; + + proc toString(x: int): string do return ""; + proc toString(x: real): string do return ""; + + proc doSomething(arg) { + someImplementationDetail(toString(arg)); + } + } + """ + fileC = """ + module C { + use B; + + doSomething(12); + doSomething(12.0); + } + """ + + async with 
unrelated_source_files(client, A=fileA, B=fileB, C=fileC) as docs: + expected_int = CallTree("B.doSomething", [ CallTree("A.someImplementationDetail", []), CallTree("B.toString", []) ]) + await check_call_hierarchy(client, docs("C"), pos((3, 2)), expected_int) + expected_real = CallTree("B.doSomething", [ CallTree("A.someImplementationDetail", []), CallTree("B.toString#1", []) ]) + await check_call_hierarchy(client, docs("C"), pos((4, 2)), expected_real) diff --git a/tools/chpl-language-server/test/util/utils.py b/tools/chpl-language-server/test/util/utils.py index dcfd48cf1129..4717f283ebde 100644 --- a/tools/chpl-language-server/test/util/utils.py +++ b/tools/chpl-language-server/test/util/utils.py @@ -73,7 +73,12 @@ def on_semantic_token_refresh(params): class SourceFilesContext: - def __init__(self, client: LanguageClient, files: typing.Dict[str, str]): + def __init__( + self, + client: LanguageClient, + files: typing.Dict[str, str], + build_cls_commands: bool = True, + ): self.tempdir = tempfile.TemporaryDirectory() self.client = client @@ -96,8 +101,9 @@ def __init__(self, client: LanguageClient, files: typing.Dict[str, str]): commands[filepath] = [{"module_dirs": [], "files": allfiles}] commandspath = os.path.join(self.tempdir.name, ".cls-commands.json") - with open(commandspath, "w") as f: - json.dump(commands, f) + if build_cls_commands: + with open(commandspath, "w") as f: + json.dump(commands, f) def _get_doc(self, name: str) -> TextDocumentIdentifier: return TextDocumentIdentifier( @@ -159,6 +165,14 @@ def source_files(client: LanguageClient, **files: str): return SourceFilesContext(client, files) +def unrelated_source_files(client: LanguageClient, **files: str): + """ + Same as 'source_files', but doesn't create a .cls-commands.json file that + would cause the files to be treated as "connected" and resolved together. 
+ """ + return SourceFilesContext(client, files, build_cls_commands=False) + + def source_file( client: LanguageClient, contents: str, From 641fc8b9d850f65ecf1cbe3587e72e535f60b8cd Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 11 Oct 2024 17:11:41 -0700 Subject: [PATCH 098/107] Apply black No Jade, did not hit CI warning this time :) Signed-off-by: Danila Fedorin --- .../test/call_hierarchy.py | 93 ++++++++++++++----- 1 file changed, 71 insertions(+), 22 deletions(-) diff --git a/tools/chpl-language-server/test/call_hierarchy.py b/tools/chpl-language-server/test/call_hierarchy.py index dc8c2e53fbfd..706fa13e6732 100644 --- a/tools/chpl-language-server/test/call_hierarchy.py +++ b/tools/chpl-language-server/test/call_hierarchy.py @@ -6,7 +6,11 @@ import sys from lsprotocol.types import ClientCapabilities -from lsprotocol.types import CallHierarchyPrepareParams, CallHierarchyOutgoingCallsParams, CallHierarchyItem +from lsprotocol.types import ( + CallHierarchyPrepareParams, + CallHierarchyOutgoingCallsParams, + CallHierarchyItem, +) from lsprotocol.types import InitializeParams import pytest import pytest_lsp @@ -37,53 +41,79 @@ async def client(lsp_client: LanguageClient): # Teardown await lsp_client.shutdown_session() + class CallTree: def __init__(self, item_id: str, children: typing.List["CallTree"]): self.item_id = item_id self.children = children -async def collect_call_tree(client: LanguageClient, item: CallHierarchyItem, depth: int) -> typing.Optional[CallTree]: + +async def collect_call_tree( + client: LanguageClient, item: CallHierarchyItem, depth: int +) -> typing.Optional[CallTree]: if depth <= 0: return None - assert(isinstance(item.data, list)) - assert(len(item.data) == 3) + assert isinstance(item.data, list) + assert len(item.data) == 3 item_id = item.data[0] children = [] - outgoing = await client.call_hierarchy_outgoing_calls_async(CallHierarchyOutgoingCallsParams(item)) + outgoing = await client.call_hierarchy_outgoing_calls_async( + 
CallHierarchyOutgoingCallsParams(item) + ) if outgoing is not None: for outgoing_call in outgoing: - new_tree = await collect_call_tree(client, outgoing_call.to, depth - 1) + new_tree = await collect_call_tree( + client, outgoing_call.to, depth - 1 + ) if new_tree is not None: children.append(new_tree) return CallTree(item_id, children) -async def compute_call_hierarchy(client: LanguageClient, doc: TextDocumentIdentifier, position: Position, depth: int) -> typing.Optional[CallTree]: - items = await client.text_document_prepare_call_hierarchy_async(CallHierarchyPrepareParams(text_document=doc, position=position)) +async def compute_call_hierarchy( + client: LanguageClient, + doc: TextDocumentIdentifier, + position: Position, + depth: int, +) -> typing.Optional[CallTree]: + items = await client.text_document_prepare_call_hierarchy_async( + CallHierarchyPrepareParams(text_document=doc, position=position) + ) if items is None: return None - assert(len(items) == 1) + assert len(items) == 1 return await collect_call_tree(client, items[0], depth) + def verify_call_hierarchy(tree: CallTree, expected: CallTree): - assert(tree.item_id == expected.item_id) - assert(len(tree.children) == len(expected.children)) + assert tree.item_id == expected.item_id + assert len(tree.children) == len(expected.children) for i in range(len(tree.children)): verify_call_hierarchy(tree.children[i], expected.children[i]) -async def check_call_hierarchy(client: LanguageClient, doc: TextDocumentIdentifier, position: Position, expected: CallTree, depth: int = 10) -> typing.Optional[CallTree]: - items = await client.text_document_prepare_call_hierarchy_async(CallHierarchyPrepareParams(text_document=doc, position=position)) - assert(items is not None) - assert(len(items) == 1) + +async def check_call_hierarchy( + client: LanguageClient, + doc: TextDocumentIdentifier, + position: Position, + expected: CallTree, + depth: int = 10, +) -> typing.Optional[CallTree]: + items = await 
client.text_document_prepare_call_hierarchy_async( + CallHierarchyPrepareParams(text_document=doc, position=position) + ) + assert items is not None + assert len(items) == 1 tree = await collect_call_tree(client, items[0], depth) - assert(tree is not None) + assert tree is not None verify_call_hierarchy(tree, expected) return tree + @pytest.mark.asyncio async def test_call_hierarchy_basic(client: LanguageClient): file = """ @@ -93,9 +123,10 @@ async def test_call_hierarchy_basic(client: LanguageClient): """ async with source_file(client, file) as doc: - expect = CallTree("main.bar", [ CallTree("main.foo", []) ]) + expect = CallTree("main.bar", [CallTree("main.foo", [])]) await check_call_hierarchy(client, doc, pos((2, 0)), expect) + @pytest.mark.asyncio async def test_call_hierarchy_overloads(client: LanguageClient): file = """ @@ -111,6 +142,7 @@ async def test_call_hierarchy_overloads(client: LanguageClient): expect_bool = CallTree("main.foo#1", []) await check_call_hierarchy(client, doc, pos((3, 0)), expect_bool) + @pytest.mark.asyncio async def test_call_hierarchy_recursive(client: LanguageClient): file = """ @@ -119,9 +151,10 @@ async def test_call_hierarchy_recursive(client: LanguageClient): """ async with source_file(client, file) as doc: - expect = CallTree("main.foo", [ CallTree("main.foo", []) ]) + expect = CallTree("main.foo", [CallTree("main.foo", [])]) await check_call_hierarchy(client, doc, pos((1, 0)), expect, depth=2) + @pytest.mark.asyncio async def test_call_hierarchy_across_files(client: LanguageClient): fileA = """ @@ -150,8 +183,24 @@ async def test_call_hierarchy_across_files(client: LanguageClient): } """ - async with unrelated_source_files(client, A=fileA, B=fileB, C=fileC) as docs: - expected_int = CallTree("B.doSomething", [ CallTree("A.someImplementationDetail", []), CallTree("B.toString", []) ]) + async with unrelated_source_files( + client, A=fileA, B=fileB, C=fileC + ) as docs: + expected_int = CallTree( + "B.doSomething", + [ + 
CallTree("A.someImplementationDetail", []), + CallTree("B.toString", []), + ], + ) await check_call_hierarchy(client, docs("C"), pos((3, 2)), expected_int) - expected_real = CallTree("B.doSomething", [ CallTree("A.someImplementationDetail", []), CallTree("B.toString#1", []) ]) - await check_call_hierarchy(client, docs("C"), pos((4, 2)), expected_real) + expected_real = CallTree( + "B.doSomething", + [ + CallTree("A.someImplementationDetail", []), + CallTree("B.toString#1", []), + ], + ) + await check_call_hierarchy( + client, docs("C"), pos((4, 2)), expected_real + ) From 5942d7e6cfc6fb45ca025830ae51637a564901ca Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 11 Oct 2024 19:01:48 -0700 Subject: [PATCH 099/107] Fix vector deserialization to avoid odd entries Signed-off-by: Danila Fedorin --- tools/chapel-py/src/python-types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/chapel-py/src/python-types.h b/tools/chapel-py/src/python-types.h index 78b3d9081e2c..ef1cb2dbbcec 100644 --- a/tools/chapel-py/src/python-types.h +++ b/tools/chapel-py/src/python-types.h @@ -92,7 +92,7 @@ template std::vector unwrapVector(ContextObject* CONTEXT, PyObject* vec) { std::vector toReturn(PyList_Size(vec)); for (ssize_t i = 0; i < PyList_Size(vec); i++) { - toReturn.push_back(PythonReturnTypeInfo::unwrap(CONTEXT, PyList_GetItem(vec, i))); + toReturn[i] = PythonReturnTypeInfo::unwrap(CONTEXT, PyList_GetItem(vec, i)); } return toReturn; } From 0ebec9b168b7ad2b69fce65a4725795c5d0e9729 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 11 Oct 2024 19:02:28 -0700 Subject: [PATCH 100/107] Expand testing to both .cls-commands.json and not Signed-off-by: Danila Fedorin --- .../test/call_hierarchy.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/tools/chpl-language-server/test/call_hierarchy.py b/tools/chpl-language-server/test/call_hierarchy.py index 706fa13e6732..ad27658f7812 100644 --- 
a/tools/chpl-language-server/test/call_hierarchy.py +++ b/tools/chpl-language-server/test/call_hierarchy.py @@ -183,24 +183,31 @@ async def test_call_hierarchy_across_files(client: LanguageClient): } """ + expected_int = CallTree( + "B.doSomething", + [ + CallTree("A.someImplementationDetail", []), + CallTree("B.toString", []), + ], + ) + expected_real = CallTree( + "B.doSomething", + [ + CallTree("A.someImplementationDetail", []), + CallTree("B.toString#1", []), + ], + ) + + async def check(docs): + await check_call_hierarchy(client, docs("C"), pos((3, 2)), expected_int) + await check_call_hierarchy(client, docs("C"), pos((4, 2)), expected_real) + async with unrelated_source_files( client, A=fileA, B=fileB, C=fileC ) as docs: - expected_int = CallTree( - "B.doSomething", - [ - CallTree("A.someImplementationDetail", []), - CallTree("B.toString", []), - ], - ) - await check_call_hierarchy(client, docs("C"), pos((3, 2)), expected_int) - expected_real = CallTree( - "B.doSomething", - [ - CallTree("A.someImplementationDetail", []), - CallTree("B.toString#1", []), - ], - ) - await check_call_hierarchy( - client, docs("C"), pos((4, 2)), expected_real - ) + await check(docs) + + async with source_files( + client, A=fileA, B=fileB, C=fileC + ) as docs: + await check(docs) From fc64b9c33348564e0608d89105bb43f1e3ceedd1 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 11 Oct 2024 19:03:02 -0700 Subject: [PATCH 101/107] Add a comment Signed-off-by: Danila Fedorin --- tools/chpl-language-server/test/call_hierarchy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/chpl-language-server/test/call_hierarchy.py b/tools/chpl-language-server/test/call_hierarchy.py index ad27658f7812..412276568676 100644 --- a/tools/chpl-language-server/test/call_hierarchy.py +++ b/tools/chpl-language-server/test/call_hierarchy.py @@ -202,11 +202,13 @@ async def check(docs): await check_call_hierarchy(client, docs("C"), pos((3, 2)), expected_int) await check_call_hierarchy(client, 
docs("C"), pos((4, 2)), expected_real) + # Ensure that call hierarchy works without .cls-commands.json... async with unrelated_source_files( client, A=fileA, B=fileB, C=fileC ) as docs: await check(docs) + # ...and with .cls-commands.json async with source_files( client, A=fileA, B=fileB, C=fileC ) as docs: From a60bbe90470d433f33bec617055691ffc3634f9c Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 11 Oct 2024 19:04:43 -0700 Subject: [PATCH 102/107] Fix format with 'black' Signed-off-by: Danila Fedorin --- tools/chpl-language-server/test/call_hierarchy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/chpl-language-server/test/call_hierarchy.py b/tools/chpl-language-server/test/call_hierarchy.py index 412276568676..68df81194ecb 100644 --- a/tools/chpl-language-server/test/call_hierarchy.py +++ b/tools/chpl-language-server/test/call_hierarchy.py @@ -200,7 +200,9 @@ async def test_call_hierarchy_across_files(client: LanguageClient): async def check(docs): await check_call_hierarchy(client, docs("C"), pos((3, 2)), expected_int) - await check_call_hierarchy(client, docs("C"), pos((4, 2)), expected_real) + await check_call_hierarchy( + client, docs("C"), pos((4, 2)), expected_real + ) # Ensure that call hierarchy works without .cls-commands.json... 
async with unrelated_source_files( @@ -209,7 +211,5 @@ async def check(docs): await check(docs) # ...and with .cls-commands.json - async with source_files( - client, A=fileA, B=fileB, C=fileC - ) as docs: + async with source_files(client, A=fileA, B=fileB, C=fileC) as docs: await check(docs) From d3b417cbd0d6baa244357f77839edb206e423c3d Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Mon, 14 Oct 2024 10:53:13 -0700 Subject: [PATCH 103/107] Weaken assertion to CHPL_UNIMPL to avoid crashing on String.chpl Signed-off-by: Danila Fedorin --- frontend/lib/resolution/resolution-queries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/lib/resolution/resolution-queries.cpp b/frontend/lib/resolution/resolution-queries.cpp index 158828a5ab00..0970e2371d0a 100644 --- a/frontend/lib/resolution/resolution-queries.cpp +++ b/frontend/lib/resolution/resolution-queries.cpp @@ -2150,8 +2150,8 @@ ApplicabilityResult instantiateSignature(ResolutionContext* rc, const TypedFnSignature* parentSignature = sig->parentFn(); if (parentSignature) { for (auto up = parentSignature; up; up = up->parentFn()) { - CHPL_ASSERT(!up->needsInstantiation()); if (up->needsInstantiation()) { + CHPL_UNIMPL("parent function needs instantiation"); return ApplicabilityResult::failure(sig->id(), FAIL_CANDIDATE_OTHER); } } From 2eceecbd1217c3b8092e2554b6ed425ad83cabed Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Mon, 14 Oct 2024 11:22:32 -0700 Subject: [PATCH 104/107] Add new test to lock down go-to-def without .cls-commands Signed-off-by: Danila Fedorin --- tools/chpl-language-server/test/basic.py | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/chpl-language-server/test/basic.py b/tools/chpl-language-server/test/basic.py index 926168994618..9d17c5508e76 100644 --- a/tools/chpl-language-server/test/basic.py +++ b/tools/chpl-language-server/test/basic.py @@ -139,6 +139,37 @@ async def test_go_to_definition_use_standard(client: 
LanguageClient): await check_goto_decl_def_module(client, doc, pos((1, 10)), mod_Map) await check_goto_decl_def_module(client, doc, pos((2, 8)), mod_Time) +@pytest.mark.asyncio +async def test_go_to_definition_use_across_modules(client: LanguageClient): + """ + Ensure that go-to-definition works on symbols that reference other modules + """ + + fileA = """ + module A { + var x = 42; + } + """ + fileB = """ + module B { + use A; + var y = x; + } + """ + + async def check(docs): + docA = docs("A") + docB = docs("B") + + await check_goto_decl_def_module(client, docB, pos((1, 6)), docA) + await check_goto_decl_def(client, docB, pos((2, 10)), (docA, pos((1, 6)))) + + async with source_files(client, A=fileA, B=fileB) as docs: + await check(docs) + + async with unrelated_source_files(client, A=fileA, B=fileB) as docs: + await check(docs) + @pytest.mark.asyncio async def test_go_to_definition_standard_rename(client: LanguageClient): From 00aee9eb78ffb725ad2e5e8fa427a2b6cc9a3e1e Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Mon, 14 Oct 2024 11:22:54 -0700 Subject: [PATCH 105/107] Apply black again Signed-off-by: Danila Fedorin --- tools/chpl-language-server/test/basic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/chpl-language-server/test/basic.py b/tools/chpl-language-server/test/basic.py index 9d17c5508e76..5999bf35911f 100644 --- a/tools/chpl-language-server/test/basic.py +++ b/tools/chpl-language-server/test/basic.py @@ -139,6 +139,7 @@ async def test_go_to_definition_use_standard(client: LanguageClient): await check_goto_decl_def_module(client, doc, pos((1, 10)), mod_Map) await check_goto_decl_def_module(client, doc, pos((2, 8)), mod_Time) + @pytest.mark.asyncio async def test_go_to_definition_use_across_modules(client: LanguageClient): """ @@ -162,7 +163,9 @@ async def check(docs): docB = docs("B") await check_goto_decl_def_module(client, docB, pos((1, 6)), docA) - await check_goto_decl_def(client, docB, pos((2, 10)), (docA, 
pos((1, 6)))) + await check_goto_decl_def( + client, docB, pos((2, 10)), (docA, pos((1, 6))) + ) async with source_files(client, A=fileA, B=fileB) as docs: await check(docs) From 32de3b8b15cebbb881cd16093b0ce7524e33e317 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Mon, 14 Oct 2024 11:31:10 -0700 Subject: [PATCH 106/107] Fix typos Signed-off-by: Danila Fedorin --- tools/chpl-language-server/src/chpl-language-server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/chpl-language-server/src/chpl-language-server.py b/tools/chpl-language-server/src/chpl-language-server.py index b57b9496ee5a..b526143459da 100755 --- a/tools/chpl-language-server/src/chpl-language-server.py +++ b/tools/chpl-language-server/src/chpl-language-server.py @@ -1165,7 +1165,7 @@ def get_file_info( then the FileInfo's index is rebuilt even if it has already been computed. This is useful if the underlying file has changed. - Most of the tiem, we will create a new context for a given URI. When + Most of the time, we will create a new context for a given URI. When requested, however, context_id will be used to create a FileInfo for a specific context. This is useful if e.g., file A wants to display an instantiation in file B. @@ -1493,7 +1493,7 @@ def unpack_call_hierarchy_item( MessageType.Error, ) return None - uid, idx, ctx = item.data + uid, inst_id, ctx = item.data fi, _ = self.get_file_info(item.uri, context_id=ctx) @@ -1509,7 +1509,7 @@ def unpack_call_hierarchy_item( # We don't handle that here. 
return None - instantiation = fi.context.retrieve_signature(idx) + instantiation = fi.context.retrieve_signature(inst_id) return (fi, fn, instantiation) From 70ad2801b4913d341127b6e31ac5c6278afc0bb1 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Mon, 14 Oct 2024 11:46:24 -0700 Subject: [PATCH 107/107] Fix file comment on call_hierarchy Signed-off-by: Danila Fedorin --- tools/chpl-language-server/test/call_hierarchy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/chpl-language-server/test/call_hierarchy.py b/tools/chpl-language-server/test/call_hierarchy.py index 68df81194ecb..0260e705e354 100644 --- a/tools/chpl-language-server/test/call_hierarchy.py +++ b/tools/chpl-language-server/test/call_hierarchy.py @@ -1,6 +1,5 @@ """ -Tests basic functionality, including autocompletion, go-to-definition, hover, -and references +Test the call hierarchy feature, which computes calls between functions. """ import sys