From 0d9a0dcc6f02abf577f1c6ab402c74eb8e98947e Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Mon, 9 Sep 2024 17:47:12 -0500 Subject: [PATCH 01/39] Name correction for Ranran Haoran Zhang, closes #3292. --- data/xml/2020.findings.xml | 2 +- data/xml/2021.naacl.xml | 2 +- data/xml/2023.eacl.xml | 2 +- data/xml/2023.emnlp.xml | 2 +- data/xml/2023.findings.xml | 2 +- data/yaml/name_variants.yaml | 2 ++ 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml index bc2a296f9a..4b4fb7e83a 100644 --- a/data/xml/2020.findings.xml +++ b/data/xml/2020.findings.xml @@ -330,7 +330,7 @@ Minimize Exposure Bias of <fixed-case>S</fixed-case>eq2<fixed-case>S</fixed-case>eq Models in Joint Entity and Relation Extraction - Ranran HaoranZhang + Ranran HaoranZhang QianyingLiu Aysa XuemoFan HengJi diff --git a/data/xml/2021.naacl.xml b/data/xml/2021.naacl.xml index afb238d89b..9eb131274b 100644 --- a/data/xml/2021.naacl.xml +++ b/data/xml/2021.naacl.xml @@ -7496,7 +7496,7 @@ JiaweiMa JingxuanTu YingLin - Ranran HaoranZhang + Ranran HaoranZhang WeiliLiu AabhasChauhan YingjunGuan diff --git a/data/xml/2023.eacl.xml b/data/xml/2023.eacl.xml index 1f8185590d..ef68a7341f 100644 --- a/data/xml/2023.eacl.xml +++ b/data/xml/2023.eacl.xml @@ -1920,7 +1920,7 @@ <fixed-case>C</fixed-case>on<fixed-case>E</fixed-case>ntail: An Entailment-based Framework for Universal Zero and Few Shot Classification with Supervised Contrastive Pretraining - Ranran HaoranZhangThe Pennsylvania State University + Ranran HaoranZhangThe Pennsylvania State University Aysa XuemoFanUniversity of Illinois at Urbana-Champaign RuiZhangPenn State University 1941-1953 diff --git a/data/xml/2023.emnlp.xml b/data/xml/2023.emnlp.xml index 81d58b48c8..3e425af594 100644 --- a/data/xml/2023.emnlp.xml +++ b/data/xml/2023.emnlp.xml @@ -6067,7 +6067,7 @@ Unified Low-Resource Sequence Labeling by Sample-Aware Dynamic Sparse Finetuning Sarkar Snigdha SarathiDas - 
HaoranZhang + Ranran HaoranZhang PengShi WenpengYin RuiZhang diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml index 53f5e39640..ef19af444f 100644 --- a/data/xml/2023.findings.xml +++ b/data/xml/2023.findings.xml @@ -21079,7 +21079,7 @@ Exploring the Potential of Large Language Models in Generating Code-Tracing Questions for Introductory Programming Courses AysaFan - HaoranZhang + Ranran HaoranZhang LucPaquette RuiZhang 7406-7421 diff --git a/data/yaml/name_variants.yaml b/data/yaml/name_variants.yaml index ff118b613c..f5ad0712e9 100644 --- a/data/yaml/name_variants.yaml +++ b/data/yaml/name_variants.yaml @@ -10637,3 +10637,5 @@ - canonical: {first: Genta Indra, last: Winata} variants: - {first: Genta, last: Winata} +- canonical: {first: Ranran Haoran, last: Zhang} + id: ranran-haoran-zhang From 3663c56ed82afed742d38fda04591aa7e37a7699 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Mon, 9 Sep 2024 20:14:59 -0500 Subject: [PATCH 02/39] Paper Revision{2024.acl-long.387}, closes #3839. --- data/xml/2024.acl.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index e0a20a98fa..8e2ef425dc 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -4973,10 +4973,11 @@ ZiyuYaoGeorge Mason University 7174-7193 Large language models (LLMs) have shown strong arithmetic reasoning capabilities when prompted with Chain-of-Thought (CoT) prompts. However, we have only a limited understanding of how they are processed by LLMs. To demystify it, prior work has primarily focused on ablating different components in the CoT prompt and empirically observing their resulting LLM performance change. Yet, the reason why these components are important to LLM reasoning is not explored. To fill this gap, in this work, we investigate “neuron activation” as a lens to provide a unified explanation to observations made by prior work. 
Specifically, we look into neurons within the feed-forward layers of LLMs that may have activated their arithmetic reasoning capabilities, using Llama2 as an example. To facilitate this investigation, we also propose an approach based on GPT-4 to automatically identify neurons that imply arithmetic reasoning. Our analyses revealed that the activation of reasoning neurons in the feed-forward layers of an LLM can explain the importance of various components in a CoT prompt, and future research can extend it for a more complete understanding. - 2024.acl-long.387 + 2024.acl-long.387 rai-yao-2024-investigation Minor updates. + Minor updates. Leveraging Large Language Models for Learning Complex Legal Concepts through Storytelling From e1c30cf24c9ae333b6cd9f94d6603f96b45ef363 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Mon, 9 Sep 2024 20:16:39 -0500 Subject: [PATCH 03/39] Paper Revision{2024.acl-long.3}, closes #3843. --- data/xml/2024.acl.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index 8e2ef425dc..1e19a5e108 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -57,8 +57,10 @@ YueZhangWestlake University 36-53 Large language models (LLMs) have achieved human-level text generation, emphasizing the need for effective deepfake text detection to mitigate risks like the spread of fake news and plagiarism. Existing research has been constrained by evaluating detection methods on specific domains or particular language models. In practical scenarios, however, the detector faces texts from various domains or LLMs without knowing their sources. To this end, we build a comprehensive testbed by gathering texts from diverse human writings and deepfake texts generated by different LLMs. Empirical results on mainstream detection methods demonstrate the difficulties associated with detecting deepfake text in a wide-ranging testbed, particularly in out-of-distribution scenarios.
Such difficulties align with the diminishing linguistic differences between the two text sources. Despite challenges, the top-performing detector can identify 84.12% out-of-domain texts generated by a new LLM, indicating the feasibility for application scenarios. - 2024.acl-long.3 + 2024.acl-long.3 li-etal-2024-mage + + Minor updates. <fixed-case>P</fixed-case>riv<fixed-case>LM</fixed-case>-Bench: A Multi-level Privacy Evaluation Benchmark for Language Models From 6bb4e667edb2c327d434e27367c238d3c14cdcb1 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Mon, 9 Sep 2024 20:17:56 -0500 Subject: [PATCH 04/39] Paper Revision{2024.nlp4convai-1.5}, closes #3852. --- data/xml/2024.nlp4convai.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.nlp4convai.xml b/data/xml/2024.nlp4convai.xml index 29b9ef42e0..aa9cbefa8b 100644 --- a/data/xml/2024.nlp4convai.xml +++ b/data/xml/2024.nlp4convai.xml @@ -77,8 +77,10 @@ FlorianMatthesTechnische Universität München 73-88 Conversational search systems enable information retrieval via natural language interactions, with the goal of maximizing users’ information gain over multiple dialogue turns. The increasing prevalence of conversational interfaces adopting this search paradigm challenges traditional information retrieval approaches, stressing the importance of better understanding the engineering process of developing these systems. We undertook a systematic literature review to investigate the links between theoretical studies and technical implementations of conversational search systems. Our review identifies real-world application scenarios, system architectures, and functional components. We consolidate our results by presenting a layered architecture framework and explaining the core functions of conversational search systems. 
Furthermore, we reflect on our findings in light of the rapid progress in large language models, discussing their capabilities, limitations, and directions for future research. - 2024.nlp4convai-1.5 + 2024.nlp4convai-1.5 schneider-etal-2024-engineering + + This revision corrects the page numbering. Efficient Dynamic Hard Negative Sampling for Dialogue Selection From 69bda12f3c14f8f599cf2032ed045e7d582e4ec4 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Mon, 9 Sep 2024 20:20:26 -0500 Subject: [PATCH 05/39] Paper Revision: {2024.acl-long.233}, closes #3856. --- data/xml/2024.acl.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index 1e19a5e108..15363b8ffa 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -3028,8 +3028,10 @@ JianguoLiAnt Group 4247-4262 Self-attention and position embedding are two crucial modules in transformer-based Large Language Models (LLMs). However, the potential relationship between them is far from well studied, especially for long context window extending. In fact, anomalous behaviors that hinder long context extrapolation exist between Rotary Position Embedding (RoPE) and vanilla self-attention.Incorrect initial angles between Q and K can cause misestimation in modeling rotary position embedding of the closest tokens.To address this issue, we propose \textbf{Co}llinear \textbf{C}onstrained \textbf{A}ttention mechanism, namely CoCA. Specifically, we enforce a collinear constraint between Q and K to seamlessly integrate RoPE and self-attention.While only adding minimal computational and spatial complexity, this integration significantly enhances long context window extrapolation ability. We provide an optimized implementation, making it a drop-in replacement for any existing transformer-based models.Extensive experiments demonstrate that CoCA excels in extending context windows. 
A CoCA-based GPT model, trained with a context length of 512, can extend the context window up to 32K (60\times) without any fine-tuning.Additionally, incorporating CoCA into LLaMA-7B achieves extrapolation up to 32K within a training length of only 2K.Our code is publicly available at: https://github.com/codefuse-ai/Collinear-Constrained-Attention - 2024.acl-long.233 + 2024.acl-long.233 zhu-etal-2024-coca + + The author's affiliation changed. <fixed-case>I</fixed-case>nfo<fixed-case>L</fixed-case>oss<fixed-case>QA</fixed-case>: Characterizing and Recovering Information Loss in Text Simplification From 22cd9cb4bb5d86b48fbc971d2b693a9435dc71df Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 10 Sep 2024 06:22:41 -0500 Subject: [PATCH 06/39] Paper Metadata: 2024.arabicnlp-1.47, closes #3857. --- data/xml/2024.arabicnlp.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.arabicnlp.xml b/data/xml/2024.arabicnlp.xml index 6460f59db7..ca9f0e6ffd 100644 --- a/data/xml/2024.arabicnlp.xml +++ b/data/xml/2024.arabicnlp.xml @@ -557,7 +557,7 @@ Mela at <fixed-case>A</fixed-case>r<fixed-case>AIE</fixed-case>val Shared Task: Propagandistic Techniques Detection in <fixed-case>A</fixed-case>rabic with a Multilingual Approach - MdRiyadh + Md Abdur RazzaqRiyadh SaraNabhani 478-482 This paper presents our system submitted for Task 1 of the ArAIEval Shared Task on Unimodal (Text) Propagandistic Technique Detection in Arabic. Task 1 involves identifying all employed propaganda techniques in a given text from a set of possible techniques or detecting that no propaganda technique is present. Additionally, the task requires identifying the specific spans of text where these techniques occur. We explored the capabilities of a multilingual BERT model for this task, focusing on the effectiveness of using outputs from different hidden layers within the model. 
By fine-tuning the multilingual BERT, we aimed to improve the model’s ability to recognize and locate various propaganda techniques. Our experiments showed that leveraging the hidden layers of the BERT model enhanced detection performance. Our system achieved competitive results, ranking second in the shared task, demonstrating that multilingual BERT models, combined with outputs from hidden layers, can effectively detect and identify spans of propaganda techniques in Arabic text. From 275c77d81c37675f9c38cac23253c2a9baf8e819 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Tue, 10 Sep 2024 09:40:48 -0400 Subject: [PATCH 07/39] Remove author name merge (#3292) --- data/xml/2020.findings.xml | 2 +- data/xml/2021.naacl.xml | 2 +- data/xml/2023.eacl.xml | 2 +- data/xml/2023.emnlp.xml | 2 +- data/xml/2023.findings.xml | 2 +- data/yaml/name_variants.yaml | 5 ----- 6 files changed, 5 insertions(+), 10 deletions(-) diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml index 4b4fb7e83a..bc2a296f9a 100644 --- a/data/xml/2020.findings.xml +++ b/data/xml/2020.findings.xml @@ -330,7 +330,7 @@ Minimize Exposure Bias of <fixed-case>S</fixed-case>eq2<fixed-case>S</fixed-case>eq Models in Joint Entity and Relation Extraction - Ranran HaoranZhang + Ranran HaoranZhang QianyingLiu Aysa XuemoFan HengJi diff --git a/data/xml/2021.naacl.xml b/data/xml/2021.naacl.xml index 9eb131274b..afb238d89b 100644 --- a/data/xml/2021.naacl.xml +++ b/data/xml/2021.naacl.xml @@ -7496,7 +7496,7 @@ JiaweiMa JingxuanTu YingLin - Ranran HaoranZhang + Ranran HaoranZhang WeiliLiu AabhasChauhan YingjunGuan diff --git a/data/xml/2023.eacl.xml b/data/xml/2023.eacl.xml index ef68a7341f..1f8185590d 100644 --- a/data/xml/2023.eacl.xml +++ b/data/xml/2023.eacl.xml @@ -1920,7 +1920,7 @@ <fixed-case>C</fixed-case>on<fixed-case>E</fixed-case>ntail: An Entailment-based Framework for Universal Zero and Few Shot Classification with Supervised Contrastive Pretraining - Ranran HaoranZhangThe Pennsylvania 
State University + Ranran HaoranZhangThe Pennsylvania State University Aysa XuemoFanUniversity of Illinois at Urbana-Champaign RuiZhangPenn State University 1941-1953 diff --git a/data/xml/2023.emnlp.xml b/data/xml/2023.emnlp.xml index 3e425af594..4743c3f697 100644 --- a/data/xml/2023.emnlp.xml +++ b/data/xml/2023.emnlp.xml @@ -6067,7 +6067,7 @@ Unified Low-Resource Sequence Labeling by Sample-Aware Dynamic Sparse Finetuning Sarkar Snigdha SarathiDas - Ranran HaoranZhang + Ranran HaoranZhang PengShi WenpengYin RuiZhang diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml index ef19af444f..4296a51471 100644 --- a/data/xml/2023.findings.xml +++ b/data/xml/2023.findings.xml @@ -21079,7 +21079,7 @@ Exploring the Potential of Large Language Models in Generating Code-Tracing Questions for Introductory Programming Courses AysaFan - Ranran HaoranZhang + Ranran HaoranZhang LucPaquette RuiZhang 7406-7421 diff --git a/data/yaml/name_variants.yaml b/data/yaml/name_variants.yaml index f5ad0712e9..0d6f09d7ef 100644 --- a/data/yaml/name_variants.yaml +++ b/data/yaml/name_variants.yaml @@ -10557,9 +10557,6 @@ - canonical: {first: Zhicheng, last: Guo} comment: xidian id: zhicheng-guo-xidian -- canonical: {first: Ranran Haoran, last: Zhang} - variants: - - {first: Haoran, last: Zhang} - canonical: {first: Michael, last: Schlichtkrull} variants: - {first: Michael Sejr, last: Schlichtkrull} @@ -10637,5 +10634,3 @@ - canonical: {first: Genta Indra, last: Winata} variants: - {first: Genta, last: Winata} -- canonical: {first: Ranran Haoran, last: Zhang} - id: ranran-haoran-zhang From b1ba90f3cfe4b46bb74bdb58eaf4550fe4e99c1d Mon Sep 17 00:00:00 2001 From: Matt Post Date: Tue, 10 Sep 2024 09:48:05 -0400 Subject: [PATCH 08/39] Revert "Remove author name merge (#3292)" This reverts commit 275c77d81c37675f9c38cac23253c2a9baf8e819. 
--- data/xml/2020.findings.xml | 2 +- data/xml/2021.naacl.xml | 2 +- data/xml/2023.eacl.xml | 2 +- data/xml/2023.emnlp.xml | 2 +- data/xml/2023.findings.xml | 2 +- data/yaml/name_variants.yaml | 5 +++++ 6 files changed, 10 insertions(+), 5 deletions(-) diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml index bc2a296f9a..4b4fb7e83a 100644 --- a/data/xml/2020.findings.xml +++ b/data/xml/2020.findings.xml @@ -330,7 +330,7 @@ Minimize Exposure Bias of <fixed-case>S</fixed-case>eq2<fixed-case>S</fixed-case>eq Models in Joint Entity and Relation Extraction - Ranran HaoranZhang + Ranran HaoranZhang QianyingLiu Aysa XuemoFan HengJi diff --git a/data/xml/2021.naacl.xml b/data/xml/2021.naacl.xml index afb238d89b..9eb131274b 100644 --- a/data/xml/2021.naacl.xml +++ b/data/xml/2021.naacl.xml @@ -7496,7 +7496,7 @@ JiaweiMa JingxuanTu YingLin - Ranran HaoranZhang + Ranran HaoranZhang WeiliLiu AabhasChauhan YingjunGuan diff --git a/data/xml/2023.eacl.xml b/data/xml/2023.eacl.xml index 1f8185590d..ef68a7341f 100644 --- a/data/xml/2023.eacl.xml +++ b/data/xml/2023.eacl.xml @@ -1920,7 +1920,7 @@ <fixed-case>C</fixed-case>on<fixed-case>E</fixed-case>ntail: An Entailment-based Framework for Universal Zero and Few Shot Classification with Supervised Contrastive Pretraining - Ranran HaoranZhangThe Pennsylvania State University + Ranran HaoranZhangThe Pennsylvania State University Aysa XuemoFanUniversity of Illinois at Urbana-Champaign RuiZhangPenn State University 1941-1953 diff --git a/data/xml/2023.emnlp.xml b/data/xml/2023.emnlp.xml index 4743c3f697..3e425af594 100644 --- a/data/xml/2023.emnlp.xml +++ b/data/xml/2023.emnlp.xml @@ -6067,7 +6067,7 @@ Unified Low-Resource Sequence Labeling by Sample-Aware Dynamic Sparse Finetuning Sarkar Snigdha SarathiDas - Ranran HaoranZhang + Ranran HaoranZhang PengShi WenpengYin RuiZhang diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml index 4296a51471..ef19af444f 100644 --- a/data/xml/2023.findings.xml +++ 
b/data/xml/2023.findings.xml @@ -21079,7 +21079,7 @@ Exploring the Potential of Large Language Models in Generating Code-Tracing Questions for Introductory Programming Courses AysaFan - Ranran HaoranZhang + Ranran HaoranZhang LucPaquette RuiZhang 7406-7421 diff --git a/data/yaml/name_variants.yaml b/data/yaml/name_variants.yaml index 0d6f09d7ef..f5ad0712e9 100644 --- a/data/yaml/name_variants.yaml +++ b/data/yaml/name_variants.yaml @@ -10557,6 +10557,9 @@ - canonical: {first: Zhicheng, last: Guo} comment: xidian id: zhicheng-guo-xidian +- canonical: {first: Ranran Haoran, last: Zhang} + variants: + - {first: Haoran, last: Zhang} - canonical: {first: Michael, last: Schlichtkrull} variants: - {first: Michael Sejr, last: Schlichtkrull} @@ -10634,3 +10637,5 @@ - canonical: {first: Genta Indra, last: Winata} variants: - {first: Genta, last: Winata} +- canonical: {first: Ranran Haoran, last: Zhang} + id: ranran-haoran-zhang From 8ee40075d79f18efd8f0ad18649a81e6d1a26a16 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Tue, 10 Sep 2024 09:53:14 -0400 Subject: [PATCH 09/39] Fixed duplicate key --- data/yaml/name_variants.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/data/yaml/name_variants.yaml b/data/yaml/name_variants.yaml index f5ad0712e9..93342e530e 100644 --- a/data/yaml/name_variants.yaml +++ b/data/yaml/name_variants.yaml @@ -10558,8 +10558,8 @@ comment: xidian id: zhicheng-guo-xidian - canonical: {first: Ranran Haoran, last: Zhang} - variants: - - {first: Haoran, last: Zhang} + comment: Penn State University + id: ranran-haoran-zhang - canonical: {first: Michael, last: Schlichtkrull} variants: - {first: Michael Sejr, last: Schlichtkrull} @@ -10637,5 +10637,3 @@ - canonical: {first: Genta Indra, last: Winata} variants: - {first: Genta, last: Winata} -- canonical: {first: Ranran Haoran, last: Zhang} - id: ranran-haoran-zhang From 7d794607df51f66b5e304a8f80dfec6a3e40f80d Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Wed, 11 Sep 2024 
17:00:27 -0500 Subject: [PATCH 10/39] Paper Metadata: 2024.propor-1.31, closes #3861. --- data/xml/2024.propor.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.propor.xml b/data/xml/2024.propor.xml index 1e3759f240..90fd47bc39 100644 --- a/data/xml/2024.propor.xml +++ b/data/xml/2024.propor.xml @@ -322,7 +322,7 @@ Exploring <fixed-case>P</fixed-case>ortuguese Hate Speech Detection in Low-Resource Settings: Lightly Tuning Encoder Models or In-Context Learning of Large Models? GabrielAssis AnnieAmorim - JonnatahnCarvalho + JonnathanCarvalho Danielde Oliveira DanielaVianna AlinePaes From 57f82fb5386f234c8bbe04a0e4864632c158a091 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Wed, 11 Sep 2024 17:19:00 -0500 Subject: [PATCH 11/39] Paper Metadata: {2024.starsem-1.30}, closes #3864. --- data/xml/2024.starsem.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/xml/2024.starsem.xml b/data/xml/2024.starsem.xml index 9872757673..2d25741ccc 100644 --- a/data/xml/2024.starsem.xml +++ b/data/xml/2024.starsem.xml @@ -355,10 +355,10 @@ A Trip Towards Fairness: Bias and De-Biasing in Large Language Models LeonardoRanaldiUniversity of Rome Tor Vergata and Idiap Research Institute - ElenaRuzzettiUniversity of Rome Tor Vergata + Elena SofiaRuzzettiUniversity of Rome Tor Vergata DavideVendittiUniversity of Rome Tor Vergata DarioOnoratiSapienza University of Rome - FabioZanzottoUniversity of Rome Tor Vergata + Fabio MassimoZanzottoUniversity of Rome Tor Vergata 372-384 Cheap-to-Build Very Large-Language Models (CtB-LLMs) with affordable training are emerging as the next big revolution in natural language processing and understanding. These CtB-LLMs are democratizing access to trainable Very Large-Language Models (VLLMs) and, thus, may represent the building blocks of many NLP systems solving downstream tasks. Hence, a little or a large bias in CtB-LLMs may cause huge harm. 
In this paper, we performed a large investigation of the bias of three families of CtB-LLMs, and we showed that debiasing techniques are effective and usable. Indeed, according to current tests, the LLaMA and the OPT families have an important bias in gender, race, religion, and profession. In contrast to the analysis for other LMMs, we discovered that bias depends not on the number of parameters but on the perplexity. Finally, the debiasing of OPT using LORA reduces bias up to 4.12 points in the normalized stereotype score. 2024.starsem-1.30 From b607f5dfe44ee939f6fb12e42da49afc1c208ac9 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Wed, 11 Sep 2024 17:30:30 -0500 Subject: [PATCH 12/39] Paper Metadata: 2024.findings-acl.847, closes #3869. --- data/xml/2024.findings.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.findings.xml b/data/xml/2024.findings.xml index 52a810e049..901e4ddb44 100644 --- a/data/xml/2024.findings.xml +++ b/data/xml/2024.findings.xml @@ -16598,7 +16598,7 @@ Pushing the Limits of Zero-shot End-to-End Speech Translation IoannisTsiamasApple and Universidad Politécnica de Cataluna - GerardGállegoUniversidad Politécnica de Cataluna + Gerard I.GállegoUniversidad Politécnica de Cataluna JoséFonollosaUniversitat Politècnica de Catalunya MartaCosta-jussàMeta 14245-14267 From 8d9cfd7f95687e727f5040d1b547f4b778842db2 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:41:31 -0500 Subject: [PATCH 13/39] Paper Revision: {2024.acl-long.693}, closes #3875. 
--- data/xml/2024.acl.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index ef3eb8c793..f20229ebe1 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -8993,8 +8993,10 @@ ArmanCohanYale University and Allen Institute for Artificial Intelligence 12841-12858 We introduce KnowledgeFMath, a novel benchmark designed to evaluate LLMs’ capabilities in solving knowledge-intensive math reasoning problems. Compared to prior works, this study features three core advancements. First, KnowledgeFMath includes 1,259 problems with a hybrid of textual and tabular content. These problems require college-level knowledge in the finance domain for effective resolution. Second, we provide expert-annotated, detailed solution references in Python program format, ensuring a high-quality benchmark for LLM assessment. We also construct a finance-domain knowledge bank and investigate various knowledge integration strategies. Finally, we evaluate a wide spectrum of 26 LLMs with different prompting strategies like Chain-of-Thought and Program-of-Thought. Our experimental results reveal that the current best-performing system (i.e., GPT-4 with CoT prompting) achieves only 56.6% accuracy, leaving substantial room for improvement. Moreover, while augmenting LLMs with external knowledge can improve their performance (e.g., from 33.5% to 47.1% for GPT-3.5), their accuracy remains significantly lower than the estimated human expert performance of 92%. We believe that KnowledgeFMath can advance future research in the area of domain-specific knowledge retrieval and integration, particularly within the context of solving math reasoning problems. - 2024.acl-long.693 + 2024.acl-long.693 zhao-etal-2024-knowledgefmath + + Revised the dataset name. 
<fixed-case>API</fixed-case>-<fixed-case>BLEND</fixed-case>: A Comprehensive Corpora for Training and Benchmarking <fixed-case>API</fixed-case> <fixed-case>LLM</fixed-case>s From bb69dcf083ec99fbd1d2ca3d17d2db0a684bcd74 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:43:32 -0500 Subject: [PATCH 14/39] Paper Revision: {2024.acl-long.852}, closes #3879. --- data/xml/2024.acl.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index f20229ebe1..8e53e8ad6d 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -11145,8 +11145,10 @@ ArmanCohanYale University 16103-16120 Recent LLMs have demonstrated remarkable performance in solving exam-like math word problems. However, the degree to which these numerical reasoning skills are effective in real-world scenarios, particularly in expert domains, is still largely unexplored. This paper introduces DocMath-Eval, a comprehensive benchmark specifically designed to evaluate the numerical reasoning capabilities of LLMs in the context of understanding and analyzing financial documents containing both text and tables. We evaluate a wide spectrum of 27 LLMs, including those specialized in math, coding and finance, with Chain-of-Thought and Program-of-Thought prompting methods. We found that even the current best-performing system (i.e., GPT-4) still significantly lags behind human experts in solving complex numerical reasoning problems grounded in long contexts. We believe DocMath-Eval can be used as a valuable benchmark to evaluate LLMs’ capabilities to solve challenging numerical reasoning problems in expert domains. - 2024.acl-long.852 + 2024.acl-long.852 zhao-etal-2024-docmath + + Included experimental results. 
Unintended Impacts of <fixed-case>LLM</fixed-case> Alignment on Global Representation From c45051cdcb441a3d52e7bb29b20e6425484e1b5a Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:44:58 -0500 Subject: [PATCH 15/39] Paper Revision{2024.findings-acl.354}, closes #3881. --- data/xml/2024.findings.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.findings.xml b/data/xml/2024.findings.xml index 901e4ddb44..2c69d3a85b 100644 --- a/data/xml/2024.findings.xml +++ b/data/xml/2024.findings.xml @@ -10422,8 +10422,10 @@ BenoitCrabbéUniversité de Paris 5935-5947 We introduce a novel dataset tailored for code generation, aimed at aiding developers in common tasks. Our dataset provides examples that include a clarified intent, code snippets associated, and an average of three related unit tests. It encompasses a range of libraries such as Pandas, Numpy, and Regex, along with more than 70 standard libraries in Python code derived from Stack Overflow. Comprising 3,402 crafted examples by Python experts, our dataset is designed for both model finetuning and standalone evaluation. To complete unit tests evaluation, we categorize examples in order to get more fine grained analysis, enhancing the understanding of models’ strengths and weaknesses in specific coding tasks. The examples have been refined to reduce data contamination, a process confirmed by the performance of three leading models: Mistral 7B, CodeLLAMA 13B, and Starcoder 15B. We further investigate data-contamination testing GPT-4 performance on a part of our dataset. The benchmark can be accessed at anonymized address. - 2024.findings-acl.354 + 2024.findings-acl.354 beau-crabbe-2024-codeinsight + + Minor updates. 
<fixed-case>V</fixed-case>i<fixed-case>H</fixed-case>ate<fixed-case>T</fixed-case>5: Enhancing Hate Speech Detection in <fixed-case>V</fixed-case>ietnamese With a Unified Text-to-Text Transformer Model From 536512fdc51ee41a2fd1387f840db92a01c4e16d Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:47:36 -0500 Subject: [PATCH 16/39] Paper Revision{2023.findings-acl.38}, closes #3885. --- data/xml/2023.findings.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml index ef19af444f..c4ea33d98c 100644 --- a/data/xml/2023.findings.xml +++ b/data/xml/2023.findings.xml @@ -3166,10 +3166,12 @@ RyanCotterellETH Zürich 598-614 Byte-Pair Encoding (BPE) is a popular algorithm used for tokenizing data in NLP, despite being devised initially as a compression method.BPE appears to be a greedy algorithm at face value, but the underlying optimization problem that BPE seeks to solve has not yet been laid down. We formalize BPE as a combinatorial optimization problem. Via submodular functions, we prove that the iterative greedy version is a 1/sigma*(1-e(-sigma))-approximation of an optimal merge sequence, where sigma is the total backward curvature with respect to the optimal merge sequence. Empirically the lower bound of the approximation is approx0.37.We provide a faster implementation of BPE which improves the runtime complexity from O(NM) to O(N log M), where N is the sequence length and M is the merge count. Finally, we optimize the brute-force algorithm for optimal BPE using memoization. - 2023.findings-acl.38 + 2023.findings-acl.38 zouhar-etal-2023-formal 10.18653/v1/2023.findings-acl.38 Automatic Named Entity Obfuscation in Speech From 520683f08cae3abbd2156c80fe6980429567a033 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:49:20 -0500 Subject: [PATCH 17/39] Paper Revision{2021.findings-emnlp.96}, closes #3888. 
--- data/xml/2021.findings.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2021.findings.xml b/data/xml/2021.findings.xml index d4652661ea..9890bc3539 100644 --- a/data/xml/2021.findings.xml +++ b/data/xml/2021.findings.xml @@ -7958,13 +7958,15 @@ Albert Y.S.Lam 1114–1120 This paper investigates the effectiveness of pre-training for few-shot intent classification. While existing paradigms commonly further pre-train language models such as BERT on a vast amount of unlabeled corpus, we find it highly effective and efficient to simply fine-tune BERT with a small set of labeled utterances from public datasets. Specifically, fine-tuning BERT with roughly 1,000 labeled data yields a pre-trained model – IntentBERT, which can easily surpass the performance of existing pre-trained models for few-shot intent classification on novel domains with very different semantics. The high effectiveness of IntentBERT confirms the feasibility and practicality of few-shot intent detection, and its high generalization ability across different domains suggests that intent classification tasks may share a similar underlying structure, which can be efficiently learned from a small set of labeled data. The source code can be found at https://github.com/hdzhang-code/IntentBERT. - 2021.findings-emnlp.96 + 2021.findings-emnlp.96 zhang-etal-2021-effectiveness-pre 10.18653/v1/2021.findings-emnlp.96 Improving Abstractive Dialogue Summarization with Hierarchical Pretraining and Topic Segment From 0015196169bc4381913bd96a1621b03284414cda Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:52:18 -0500 Subject: [PATCH 18/39] Paper Revision{2022.naacl-main.39}, closes #3890. 
--- data/xml/2022.naacl.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2022.naacl.xml b/data/xml/2022.naacl.xml index c729852f14..ae7b509cf4 100644 --- a/data/xml/2022.naacl.xml +++ b/data/xml/2022.naacl.xml @@ -621,7 +621,7 @@ AlbertLam 532-542 It is challenging to train a good intent classifier for a task-oriented dialogue system with only a few annotations. Recent studies have shown that fine-tuning pre-trained language models with a small set of labeled utterances from public benchmarks in a supervised manner is extremely helpful. However, we find that supervised pre-training yields an anisotropic feature space, which may suppress the expressive power of the semantic representations. Inspired by recent research in isotropization, we propose to improve supervised pre-training by regularizing the feature space towards isotropy. We propose two regularizers based on contrastive learning and correlation matrix respectively, and demonstrate their effectiveness through extensive experiments. Our main finding is that it is promising to regularize supervised pre-training with isotropization to further improve the performance of few-shot intent detection. The source code can be found at https://github.com/fanolabs/isoIntentBert-main. - 2022.naacl-main.39 + 2022.naacl-main.39 2022.naacl-main.39.software.zip zhang-etal-2022-fine 10.18653/v1/2022.naacl-main.39 @@ -630,6 +630,8 @@ BANKING77 HINT3 HWU64 + + Changes the order of the authors. Cross-document Misinformation Detection based on Event Graph Reasoning From d7d9525ac0fb58522ddfd7c8753392a656b0cc9e Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:53:43 -0500 Subject: [PATCH 19/39] Paper Revision{2023.findings-acl.706}, closes #3892. 
--- data/xml/2023.findings.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml index c4ea33d98c..0ca9816ec8 100644 --- a/data/xml/2023.findings.xml +++ b/data/xml/2023.findings.xml @@ -11998,9 +11998,11 @@ Albert Y.S.LamFano Labs 11105-11121 We consider the task of few-shot intent detection, which involves training a deep learning model to classify utterances based on their underlying intents using only a small amount of labeled data. The current approach to address this problem is through continual pre-training, i.e., fine-tuning pre-trained language models (PLMs) on external resources (e.g., conversational corpora, public intent detection datasets, or natural language understanding datasets) before using them as utterance encoders for training an intent classifier. In this paper, we show that continual pre-training may not be essential, since the overfitting problem of PLMs on this task may not be as serious as expected. Specifically, we find that directly fine-tuning PLMs on only a handful of labeled examples already yields decent results compared to methods that employ continual pre-training, and the performance gap diminishes rapidly as the number of labeled data increases. To maximize the utilization of the limited available data, we propose a context augmentation method and leverage sequential self-distillation to boost performance. Comprehensive experiments on real-world benchmarks show that given only two or more labeled samples per class, direct fine-tuning outperforms many strong baselines that utilize external data sources for continual pre-training. The code can be found at https://github.com/hdzhang-code/DFTPlus. - 2023.findings-acl.706 + 2023.findings-acl.706 zhang-etal-2023-revisit 10.18653/v1/2023.findings-acl.706 + + Changes the order of the authors. 
Improving Contrastive Learning of Sentence Embeddings from <fixed-case>AI</fixed-case> Feedback From 23221f5a5fd719c7e8e2ff9d1663005b27e35335 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:55:49 -0500 Subject: [PATCH 20/39] Paper correction for 2024.bea-1.32, closes #3844. --- data/xml/2024.bea.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.bea.xml b/data/xml/2024.bea.xml index 1b0ce81bb3..40af6cf92e 100644 --- a/data/xml/2024.bea.xml +++ b/data/xml/2024.bea.xml @@ -366,8 +366,10 @@ GioraAlexandronWeizmann Institute of Science 391-402 Unsupervised clustering of student responses to open-ended questions into behavioral and cognitive profiles using pre-trained LLM embeddings is an emerging technique, but little is known about how well this captures pedagogically meaningful information. We investigate this in the context of student responses to open-ended questions in biology, which were previously analyzed and clustered by experts into theory-driven Knowledge Profiles (KPs).Comparing these KPs to ones discovered by purely data-driven clustering techniques, we report poor discoverability of most KPs, except for the ones including the correct answers. We trace this ‘discoverability bias’ to the representations of KPs in the pre-trained LLM embeddings space. - 2024.bea-1.32 + 2024.bea-1.32 gurin-schleifer-etal-2024-anna + + Corrected a typo. Assessing Student Explanations with Large Language Models Using Fine-Tuning and Few-Shot Learning From 2fbf0cd9c7e4bcc6a5c917d5a57918b21052681e Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 13:59:17 -0500 Subject: [PATCH 21/39] Paper Metadata: {2023.findings-acl.706}, closes #3891. 
--- data/xml/2023.findings.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml index 0ca9816ec8..fd1cbbd68f 100644 --- a/data/xml/2023.findings.xml +++ b/data/xml/2023.findings.xml @@ -11993,9 +11993,9 @@ Revisit Few-shot Intent Classification with <fixed-case>PLM</fixed-case>s: Direct Fine-tuning vs. Continual Pre-training HaodeZhangThe Hong Kong Polytechnic University HaowenLiangThe Hong Kong Polytechnic University - Li-MingZhanThe Hong Kong Polytechnic University - Xiao-MingWuHong Kong Polytechnic University + LimingZhanThe Hong Kong Polytechnic University Albert Y.S.LamFano Labs + Xiao-MingWuHong Kong Polytechnic University 11105-11121 We consider the task of few-shot intent detection, which involves training a deep learning model to classify utterances based on their underlying intents using only a small amount of labeled data. The current approach to address this problem is through continual pre-training, i.e., fine-tuning pre-trained language models (PLMs) on external resources (e.g., conversational corpora, public intent detection datasets, or natural language understanding datasets) before using them as utterance encoders for training an intent classifier. In this paper, we show that continual pre-training may not be essential, since the overfitting problem of PLMs on this task may not be as serious as expected. Specifically, we find that directly fine-tuning PLMs on only a handful of labeled examples already yields decent results compared to methods that employ continual pre-training, and the performance gap diminishes rapidly as the number of labeled data increases. To maximize the utilization of the limited available data, we propose a context augmentation method and leverage sequential self-distillation to boost performance. 
Comprehensive experiments on real-world benchmarks show that given only two or more labeled samples per class, direct fine-tuning outperforms many strong baselines that utilize external data sources for continual pre-training. The code can be found at https://github.com/hdzhang-code/DFTPlus. 2023.findings-acl.706 From 19db664723b7743bfef976e07dbf3cd041b4b98d Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 14:01:14 -0500 Subject: [PATCH 22/39] Paper Metadata: {2022.naacl-main.39}, closes #3889. --- data/xml/2022.naacl.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/xml/2022.naacl.xml b/data/xml/2022.naacl.xml index ae7b509cf4..6acf3a1f8d 100644 --- a/data/xml/2022.naacl.xml +++ b/data/xml/2022.naacl.xml @@ -615,10 +615,10 @@ HaodeZhang HaowenLiang YuweiZhang - Li-MingZhan - Xiao-MingWu + LimingZhan XiaoleiLu AlbertLam + Xiao-MingWu 532-542 It is challenging to train a good intent classifier for a task-oriented dialogue system with only a few annotations. Recent studies have shown that fine-tuning pre-trained language models with a small set of labeled utterances from public benchmarks in a supervised manner is extremely helpful. However, we find that supervised pre-training yields an anisotropic feature space, which may suppress the expressive power of the semantic representations. Inspired by recent research in isotropization, we propose to improve supervised pre-training by regularizing the feature space towards isotropy. We propose two regularizers based on contrastive learning and correlation matrix respectively, and demonstrate their effectiveness through extensive experiments. Our main finding is that it is promising to regularize supervised pre-training with isotropization to further improve the performance of few-shot intent detection. The source code can be found at https://github.com/fanolabs/isoIntentBert-main. 
2022.naacl-main.39 From 672f252a6ae3e6741138df51122a1d23625b5331 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 14:04:54 -0500 Subject: [PATCH 23/39] Paper Metadata{2021.findings-emnlp.96}, closes #3887. --- data/xml/2021.findings.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2021.findings.xml b/data/xml/2021.findings.xml index 9890bc3539..121747d13b 100644 --- a/data/xml/2021.findings.xml +++ b/data/xml/2021.findings.xml @@ -7954,8 +7954,8 @@ Li-MingZhan JiaxinChen GuangyuanShi - Xiao-MingWu Albert Y.S.Lam + Xiao-MingWu 1114–1120 This paper investigates the effectiveness of pre-training for few-shot intent classification. While existing paradigms commonly further pre-train language models such as BERT on a vast amount of unlabeled corpus, we find it highly effective and efficient to simply fine-tune BERT with a small set of labeled utterances from public datasets. Specifically, fine-tuning BERT with roughly 1,000 labeled data yields a pre-trained model – IntentBERT, which can easily surpass the performance of existing pre-trained models for few-shot intent classification on novel domains with very different semantics. The high effectiveness of IntentBERT confirms the feasibility and practicality of few-shot intent detection, and its high generalization ability across different domains suggests that intent classification tasks may share a similar underlying structure, which can be efficiently learned from a small set of labeled data. The source code can be found at https://github.com/hdzhang-code/IntentBERT. 2021.findings-emnlp.96 From 0aa1ce0a4293e312dfbc7981ee762413b3732773 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 14:08:14 -0500 Subject: [PATCH 24/39] 2024.wassa-1.8 : Swap author 3 and 4 according to the order in the paper, closes #3886. 
--- data/xml/2024.wassa.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.wassa.xml b/data/xml/2024.wassa.xml index ceb7e36daf..1ba88b73ca 100644 --- a/data/xml/2024.wassa.xml +++ b/data/xml/2024.wassa.xml @@ -93,8 +93,8 @@ Entity-Level Sentiment: More than the Sum of Its Parts EgilRønningstad RomanKlingerOtto-Friedrich Universität Bamberg - ErikVelldalUniversity of Oslo LiljaØvrelidDept. of Informatics, University of Oslo + ErikVelldalUniversity of Oslo 84-96 In sentiment analysis of longer texts, there may be a variety of topics discussed, of entities mentioned, and of sentiments expressed regarding each entity. We find a lack of studies exploring how such texts express their sentiment towards each entity of interest, and how these sentiments can be modelled. In order to better understand how sentiment regarding persons and organizations (each entity in our scope) is expressed in longer texts, we have collected a dataset of expert annotations where the overall sentiment regarding each entity is identified, together with the sentence-level sentiment for these entities separately. We show that the reader’s perceived sentiment regarding an entity often differs from an arithmetic aggregation of sentiments at the sentence level. Only 70% of the positive and 55% of the negative entities receive a correct overall sentiment label when we aggregate the (human-annotated) sentiment labels for the sentences where the entity is mentioned. Our dataset reveals the complexity of entity-specific sentiment in longer texts, and allows for more precise modelling and evaluation of such sentiment expressions. 2024.wassa-1.8 From e5776ec9af2d0edd417da7671f8266afdd888929 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 14:11:05 -0500 Subject: [PATCH 25/39] Paper Metadata: {2024.acl-long.852}, closes #3880.
--- data/xml/2024.acl.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index 8e53e8ad6d..cae735d0c4 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -11132,7 +11132,7 @@ jin-etal-2024-mmtom - <fixed-case>D</fixed-case>oc<fixed-case>M</fixed-case>ath-Eval: Evaluating Math Reasoning Capabilities of <fixed-case>LLM</fixed-case>s in Understanding Financial Documents + <fixed-case>D</fixed-case>oc<fixed-case>M</fixed-case>ath-Eval: Evaluating Math Reasoning Capabilities of <fixed-case>LLM</fixed-case>s in Understanding Long and Specialized Documents YilunZhaoYale University YitaoLongNew York University HongjunLiuCollege of Computer Science and Technology, Zhejiang University @@ -11144,7 +11144,7 @@ RuiZhangPennsylvania State University ArmanCohanYale University 16103-16120 - Recent LLMs have demonstrated remarkable performance in solving exam-like math word problems. However, the degree to which these numerical reasoning skills are effective in real-world scenarios, particularly in expert domains, is still largely unexplored. This paper introduces DocMath-Eval, a comprehensive benchmark specifically designed to evaluate the numerical reasoning capabilities of LLMs in the context of understanding and analyzing financial documents containing both text and tables. We evaluate a wide spectrum of 27 LLMs, including those specialized in math, coding and finance, with Chain-of-Thought and Program-of-Thought prompting methods. We found that even the current best-performing system (i.e., GPT-4) still significantly lags behind human experts in solving complex numerical reasoning problems grounded in long contexts. We believe DocMath-Eval can be used as a valuable benchmark to evaluate LLMs’ capabilities to solve challenging numerical reasoning problems in expert domains. + Recent LLMs have demonstrated remarkable performance in solving exam-like math word problems. 
However, the degree to which these numerical reasoning skills are effective in real-world scenarios, particularly in expert domains, is still largely unexplored. This paper introduces DocMath-Eval, a comprehensive benchmark specifically designed to evaluate the numerical reasoning capabilities of LLMs in the context of understanding and analyzing specialized documents containing both text and tables. We conduct an extensive evaluation of 48 LLMs with Chain-of-Thought and Program-of-Thought prompting methods, aiming to comprehensively assess the capabilities and limitations of existing LLMs in DocMath-Eval. We found that even the current best-performing system (i.e., GPT-4o) still significantly lags behind human experts in solving complex numerical reasoning problems grounded in long contexts. We believe that DocMath-Eval can serve as a valuable benchmark for evaluating LLMs' capabilities in solving challenging numerical reasoning problems within expert domains. 2024.acl-long.852 zhao-etal-2024-docmath From 9cc02a1aa0c847821be302b79ba3096f288ec0ad Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Tue, 17 Sep 2024 14:12:47 -0500 Subject: [PATCH 26/39] Paper Metadata: {2024.acl-long.693}, closes #3876. 
--- data/xml/2024.acl.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index cae735d0c4..b754558313 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -8984,7 +8984,7 @@ zhao-etal-2024-tapera - <fixed-case>K</fixed-case>nowledge<fixed-case>FM</fixed-case>ath: A Knowledge-Intensive Math Reasoning Dataset in Finance Domains + FinanceMATH: Knowledge-Intensive Math Reasoning in Finance Domains YilunZhaoYale University HongjunLiu YitaoLongNew York University @@ -8992,7 +8992,7 @@ ChenZhaoNew York University Shanghai ArmanCohanYale University and Allen Institute for Artificial Intelligence 12841-12858 - We introduce KnowledgeFMath, a novel benchmark designed to evaluate LLMs’ capabilities in solving knowledge-intensive math reasoning problems. Compared to prior works, this study features three core advancements. First, KnowledgeFMath includes 1,259 problems with a hybrid of textual and tabular content. These problems require college-level knowledge in the finance domain for effective resolution. Second, we provide expert-annotated, detailed solution references in Python program format, ensuring a high-quality benchmark for LLM assessment. We also construct a finance-domain knowledge bank and investigate various knowledge integration strategies. Finally, we evaluate a wide spectrum of 26 LLMs with different prompting strategies like Chain-of-Thought and Program-of-Thought. Our experimental results reveal that the current best-performing system (i.e., GPT-4 with CoT prompting) achieves only 56.6% accuracy, leaving substantial room for improvement. Moreover, while augmenting LLMs with external knowledge can improve their performance (e.g., from 33.5% to 47.1% for GPT-3.5), their accuracy remains significantly lower than the estimated human expert performance of 92%. 
We believe that KnowledgeFMath can advance future research in the area of domain-specific knowledge retrieval and integration, particularly within the context of solving math reasoning problems. + We introduce FinanceMath, a novel benchmark designed to evaluate LLMs' capabilities in solving knowledge-intensive math reasoning problems. Compared to prior works, this study features three core advancements. First, FinanceMath includes 1,200 problems with a hybrid of textual and tabular content. These problems require college-level knowledge in the finance domain for effective resolution. Second, we provide expert-annotated, detailed solution references in Python program format, ensuring a high-quality benchmark for LLM assessment. We also construct a finance-domain knowledge bank and investigate various knowledge integration strategies. Finally, we evaluate a wide spectrum of 44 LLMs with both Chain-of-Thought and Program-of-Thought prompting methods. Our experimental results reveal that the current best-performing system (i.e., GPT-4o) achieves only 60.9% accuracy using CoT prompting, leaving substantial room for improvement. Moreover, while augmenting LLMs with external knowledge can improve model performance (e.g., from 47.5% to 54.5% for Gemini-1.5-Pro), their accuracy remains significantly lower than the estimated human expert performance of 92%. We believe that FinanceMath can advance future research in the area of domain-specific knowledge retrieval and integration, particularly within the context of solving reasoning-intensive tasks. 2024.acl-long.693 zhao-etal-2024-knowledgefmath From 4f52bdcd1b33a738a555e837043a17da0b3e3c97 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Wed, 18 Sep 2024 14:17:39 -0500 Subject: [PATCH 27/39] Paper Revision{2024.acl-long.329}, closes #3896. 
--- data/xml/2024.acl.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index b754558313..5237618275 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -4266,8 +4266,10 @@ AnetteFrankRuprecht-Karls-Universität Heidelberg 6048-6089 Large language models (LLMs) can explain their predictions through post-hoc or Chain-of-Thought (CoT) explanations. But an LLM could make up reasonably sounding explanations that are unfaithful to its underlying reasoning. Recent work has designed tests that aim to judge the faithfulness of post-hoc or CoT explanations. In this work we argue that these faithfulness tests do not measure faithfulness to the models’ inner workings – but rather their self-consistency at output level.Our contributions are three-fold: i) We clarify the status of faithfulness tests in view of model explainability, characterising them as self-consistency tests instead. This assessment we underline by ii) constructing a Comparative Consistency Bank for self-consistency tests that for the first time compares existing tests on a common suite of 11 open LLMs and 5 tasks – including iii) our new self-consistency measure CC-SHAP. CC-SHAP is a fine-grained measure (not a test) of LLM self-consistency. It compares how a model’s input contributes to the predicted answer and to generating the explanation. Our fine-grained CC-SHAP metric allows us iii) to compare LLM behaviour when making predictions and to analyse the effect of other consistency tests at a deeper level, which takes us one step further towards measuring faithfulness by bringing us closer to the internals of the model than strictly surface output-oriented tests. - 2024.acl-long.329 + 2024.acl-long.329 parcalabescu-frank-2024-measuring + + This revision mentions a sponsor in the acknowledgements and fixes the typo in Eq. 4. Learning or Self-aligning? 
Rethinking Instruction Fine-tuning From d9fc9ef906603abb6dc7644018e523f3a89f990a Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Wed, 18 Sep 2024 14:19:15 -0500 Subject: [PATCH 28/39] Paper Revision{2023.acl-long.223}, closes #3895. --- data/xml/2023.acl.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/xml/2023.acl.xml b/data/xml/2023.acl.xml index 39a5c32c41..799f88d5fb 100644 --- a/data/xml/2023.acl.xml +++ b/data/xml/2023.acl.xml @@ -3128,10 +3128,12 @@ AnetteFrankHeidelberg University 4032-4059 Vision and language models (VL) are known to exploit unrobust indicators in individual modalities (e.g., introduced by distributional biases) instead of focusing on relevant information in each modality. That a unimodal model achieves similar accuracy on a VL task to a multimodal one, indicates that so-called unimodal collapse occurred. However, accuracy-based tests fail to detect e.g., when the model prediction is wrong, while the model used relevant information from a modality. Instead, we propose MM-SHAP, a performance-agnostic multimodality score based on Shapley values that reliably quantifies in which proportions a multimodal model uses individual modalities. We apply MM-SHAP in two ways: (1) to compare models for their average degree of multimodality, and (2) to measure for individual models the contribution of individual modalities for different tasks and datasets. Experiments with six VL models – LXMERT, CLIP and four ALBEF variants – on four VL tasks highlight that unimodal collapse can occur to different degrees and in different directions, contradicting the wide-spread assumption that unimodal collapse is one-sided. Based on our results, we recommend MM-SHAP for analysing multimodal tasks, to diagnose and guide progress towards multimodal integration. Code available at https://github.com/Heidelberg-NLP/MM-SHAP. 
- 2023.acl-long.223 + 2023.acl-long.223 parcalabescu-frank-2023-mm 10.18653/v1/2023.acl-long.223 Towards Boosting the Open-Domain Chatbot with Human Feedback From a38ec953c56e1836e7906b714749eb7c66f64f3d Mon Sep 17 00:00:00 2001 From: Matt Post Date: Mon, 23 Sep 2024 08:28:20 -0400 Subject: [PATCH 29/39] Name correction: Cesar Yoshikawa --- data/xml/2022.deeplo.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2022.deeplo.xml b/data/xml/2022.deeplo.xml index f0ed86c1ea..b847f095d4 100644 --- a/data/xml/2022.deeplo.xml +++ b/data/xml/2022.deeplo.xml @@ -30,7 +30,7 @@ WilliamChen RichardCastro NúriaBel - CesarToshio + CesarYoshikawa RenzoVenturas HilarioAradiel NelsiMelgarejo From c7955ccd57a78535ad7689bb722b3d2b0ca48ea3 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Mon, 23 Sep 2024 09:10:06 -0400 Subject: [PATCH 30/39] =?UTF-8?q?Name=20correction:=20Patr=C3=ADcia=20Ferr?= =?UTF-8?q?eira?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data/xml/2024.sigdial.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.sigdial.xml b/data/xml/2024.sigdial.xml index 8aecc67d7a..1f2eecb171 100644 --- a/data/xml/2024.sigdial.xml +++ b/data/xml/2024.sigdial.xml @@ -277,7 +277,7 @@ Sentiment-Aware Dialogue Flow Discovery for Interpreting Communication Trends - Patrícia Sofia PereiraFerreira + PatríciaFerreira IsabelCarvalho AnaAlves CatarinaSilva From 2836e050dcc04f4faf6638c6eafdd2b43f6900dc Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Mon, 23 Sep 2024 17:12:52 -0500 Subject: [PATCH 31/39] Paper Metadata: 2024.gebnlp-1.5, closes #3898.
--- data/xml/2024.gebnlp.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.gebnlp.xml b/data/xml/2024.gebnlp.xml index 40fa3caf73..66835b8816 100644 --- a/data/xml/2024.gebnlp.xml +++ b/data/xml/2024.gebnlp.xml @@ -64,7 +64,7 @@ A Fairness Analysis of Human and <fixed-case>AI</fixed-case>-Generated Student Reflection Summaries BhimanBaghelUniversity of Pittsburgh Arun BalajieeLekshmi NarayananUniversity of Pittsburgh - MichaelYoderSchool of Computer Science, Carnegie Mellon University + Michael MillerYoderSchool of Computer Science, Carnegie Mellon University 60-77 This study examines the fairness of human- and AI-generated summaries of student reflections in university STEM classes, focusing on potential gender biases. Using topic modeling, we first identify topics that are more prevalent in reflections from female students and others that are more common among male students. We then analyze whether human and AI-generated summaries reflect the concerns of students of any particular gender over others. Our analysis reveals that though human-generated and extractive AI summarization techniques do not show a clear bias, abstractive AI-generated summaries exhibit a bias towards male students. Pedagogical themes are over-represented from male reflections in these summaries, while concept-specific topics are under-represented from female reflections. This research contributes to a deeper understanding of AI-generated bias in educational contexts, highlighting the need for future work on mitigating these biases. 2024.gebnlp-1.5 From 99bd72070b5c2ea19b7c24de5868f54cfa654024 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Mon, 23 Sep 2024 17:14:33 -0500 Subject: [PATCH 32/39] Paper Metadata: {2024.findings-acl.872}, closes #3901. 
--- data/xml/2024.findings.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.findings.xml b/data/xml/2024.findings.xml index 2c69d3a85b..7e6b8e0413 100644 --- a/data/xml/2024.findings.xml +++ b/data/xml/2024.findings.xml @@ -16903,7 +16903,7 @@ MatteoGabburoUniversity of Trento NicolaasJedemaAmazon SiddhantGargMeta - LeonardoRibeiroAmazon + Leonardo F. R.RibeiroAmazon AlessandroMoschittiAmazon AGI 14636-14650 In this paper, we investigate which questions are challenging for retrieval-based Question Answering (QA). We (i) propose retrieval complexity (RC), a novel metric conditioned on the completeness of retrieved documents, which measures the difficulty of answering questions, and (ii) propose an unsupervised pipeline to measure RC given an arbitrary retrieval system.Our proposed pipeline measures RC more accurately than alternative estimators, including LLMs, on six challenging QA benchmarks. Further investigation reveals that RC scores strongly correlate with both QA performance and expert judgment across five of the six studied benchmarks, indicating that RC is an effective measure of question difficulty.Subsequent categorization of high-RC questions shows that they span a broad set of question shapes, including multi-hop, compositional, and temporal QA, indicating that RC scores can categorize a new subset of complex questions. Our system can also have a major impact on retrieval-based systems by helping to identify more challenging questions on existing datasets. From 9a3f5095e25065f92d139a8003d1dd07a47abf07 Mon Sep 17 00:00:00 2001 From: anthology-assist Date: Mon, 23 Sep 2024 17:16:34 -0500 Subject: [PATCH 33/39] Author correction for William Soto Martinez, closes #3899. 
--- data/xml/2024.inlg.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.inlg.xml b/data/xml/2024.inlg.xml index 54c83e5151..ca123a4fad 100644 --- a/data/xml/2024.inlg.xml +++ b/data/xml/2024.inlg.xml @@ -77,7 +77,7 @@ Generating from <fixed-case>AMR</fixed-case>s into High and Low-Resource Languages using Phylogenetic Knowledge and Hierarchical <fixed-case>QL</fixed-case>o<fixed-case>RA</fixed-case> Training (<fixed-case>HQL</fixed-case>) - William EduardoSoto Martinez + WilliamSoto Martinez YannickParmentier ClaireGardent 70–81 From bdb00ef998bc613dcb4df8dac637a19e7682a88a Mon Sep 17 00:00:00 2001 From: Matt Post Date: Mon, 23 Sep 2024 18:32:19 -0400 Subject: [PATCH 34/39] Update 2024.lrec-main.464 (closes #3874) --- data/xml/2024.lrec.xml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/xml/2024.lrec.xml b/data/xml/2024.lrec.xml index f6a8a37449..2d281a3e78 100644 --- a/data/xml/2024.lrec.xml +++ b/data/xml/2024.lrec.xml @@ -5509,10 +5509,11 @@ Does the Language Matter? Curriculum Learning over Neo-<fixed-case>L</fixed-case>atin Languages - GiuliaPucci LeonardoRanaldi + GiuliaPucci + AndréFreitas 5212–5220 - Curriculum Learning (CL) is emerging as a relevant technique to reduce the cost of pre-training Large Language Models. The idea, tested for the English language, is to train LLMs by organizing training examples from the simplest to the most complex. Complexity measures may depend on the specific language. Hence, this paper aims to investigate whether CL and the complexity measure can be easily exported to other languages. For this reason, we present a set of linguistically motivated measures to determine the complexity of examples, which has been used in English: these measures are based on text length, rarity, and comprehensibility. We then test the approach to two Romance languages: Italian and French. 
Our results show that the technique can be easily exported to languages other than English without adaptation. + Curriculum Learning (CL) has been emerged as an effective technique for improving the performances and reducing the cost of pre-training Large Language Models (LLMs). The efficacy of CL demonstrated in different scenarios is in the training LLMs by organizing examples from the simplest to the most complex. Although improvements have been shown extensively, this approach was used for pre-training, leaving novel fine-tuning approaches such as instruction-tuning unexplored. In this paper, we propose a novel complexity measure to empower the instruction-tuning method using the CL paradigm. To complement previous works, we propose cognitively motivated measures to determine the complexity of training demonstrations used in the instruction-tuning paradigm. Hence, we experiment with the proposed heuristics first in English and then in other languages. The downstream results show that delivering training examples by complexity ranking is also effective for instruction tuning, as it improves downstream performance while reducing costs. Furthermore, the technique can be easily transferred to languages other than English, e.g., Italian and French, without any adaptation, maintaining functionality and effectiveness. 2024.lrec-main.464 pucci-ranaldi-2024-language From 6cf9e2c71c75dc2dbbcb7a9e74c6a9aedf399d2e Mon Sep 17 00:00:00 2001 From: Matt Post Date: Tue, 24 Sep 2024 08:03:41 -0400 Subject: [PATCH 35/39] Fix broken merge --- data/xml/2024.acl.xml | 15 --------------- data/xml/2024.lrec.xml | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index dd1b7e42e5..1292c8a73c 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -3262,12 +3262,9 @@ Self-attention and position embedding are two crucial modules in transformer-based Large Language Models (LLMs). 
However, the potential relationship between them is far from well studied, especially for long context window extending. In fact, anomalous behaviors that hinder long context extrapolation exist between Rotary Position Embedding (RoPE) and vanilla self-attention.Incorrect initial angles between Q and K can cause misestimation in modeling rotary position embedding of the closest tokens.To address this issue, we propose \textbf{Co}llinear \textbf{C}onstrained \textbf{A}ttention mechanism, namely CoCA. Specifically, we enforce a collinear constraint between Q and K to seamlessly integrate RoPE and self-attention.While only adding minimal computational and spatial complexity, this integration significantly enhances long context window extrapolation ability. We provide an optimized implementation, making it a drop-in replacement for any existing transformer-based models.Extensive experiments demonstrate that CoCA excels in extending context windows. A CoCA-based GPT model, trained with a context length of 512, can extend the context window up to 32K (60\times) without any fine-tuning.Additionally, incorporating CoCA into LLaMA-7B achieves extrapolation up to 32K within a training length of only 2K.Our code is publicly available at: https://github.com/codefuse-ai/Collinear-Constrained-Attention 2024.acl-long.233 zhu-etal-2024-coca -<<<<<<< HEAD The author's affiliation changed. -======= 10.18653/v1/2024.acl-long.233 ->>>>>>> origin/master <fixed-case>I</fixed-case>nfo<fixed-case>L</fixed-case>oss<fixed-case>QA</fixed-case>: Characterizing and Recovering Information Loss in Text Simplification @@ -4599,12 +4596,9 @@ Large language models (LLMs) can explain their predictions through post-hoc or Chain-of-Thought (CoT) explanations. But an LLM could make up reasonably sounding explanations that are unfaithful to its underlying reasoning. Recent work has designed tests that aim to judge the faithfulness of post-hoc or CoT explanations. 
In this work we argue that these faithfulness tests do not measure faithfulness to the models’ inner workings – but rather their self-consistency at output level. Our contributions are three-fold: i) We clarify the status of faithfulness tests in view of model explainability, characterising them as self-consistency tests instead. This assessment we underline by ii) constructing a Comparative Consistency Bank for self-consistency tests that for the first time compares existing tests on a common suite of 11 open LLMs and 5 tasks – including iii) our new self-consistency measure CC-SHAP. CC-SHAP is a fine-grained measure (not a test) of LLM self-consistency. It compares how a model’s input contributes to the predicted answer and to generating the explanation. Our fine-grained CC-SHAP metric allows us iii) to compare LLM behaviour when making predictions and to analyse the effect of other consistency tests at a deeper level, which takes us one step further towards measuring faithfulness by bringing us closer to the internals of the model than strictly surface output-oriented tests. 2024.acl-long.329 parcalabescu-frank-2024-measuring -<<<<<<< HEAD This revision mentions a sponsor in the acknowledgements and fixes the typo in Eq. 4. -======= 10.18653/v1/2024.acl-long.329 ->>>>>>> origin/master Learning or Self-aligning? Rethinking Instruction Fine-tuning @@ -5375,11 +5369,8 @@ rai-yao-2024-investigation Minor updates. -<<<<<<< HEAD Minor updates. -======= 10.18653/v1/2024.acl-long.387 ->>>>>>> origin/master Leveraging Large Language Models for Learning Complex Legal Concepts through Storytelling @@ -9698,12 +9689,9 @@ We introduce FinanceMath, a novel benchmark designed to evaluate LLMs' capabilities in solving knowledge-intensive math reasoning problems. Compared to prior works, this study features three core advancements. First, FinanceMath includes 1,200 problems with a hybrid of textual and tabular content.
These problems require college-level knowledge in the finance domain for effective resolution. Second, we provide expert-annotated, detailed solution references in Python program format, ensuring a high-quality benchmark for LLM assessment. We also construct a finance-domain knowledge bank and investigate various knowledge integration strategies. Finally, we evaluate a wide spectrum of 44 LLMs with both Chain-of-Thought and Program-of-Thought prompting methods. Our experimental results reveal that the current best-performing system (i.e., GPT-4o) achieves only 60.9% accuracy using CoT prompting, leaving substantial room for improvement. Moreover, while augmenting LLMs with external knowledge can improve model performance (e.g., from 47.5% to 54.5% for Gemini-1.5-Pro), their accuracy remains significantly lower than the estimated human expert performance of 92%. We believe that FinanceMath can advance future research in the area of domain-specific knowledge retrieval and integration, particularly within the context of solving reasoning-intensive tasks. 2024.acl-long.693 zhao-etal-2024-knowledgefmath -<<<<<<< HEAD Revised the dataset name. -======= 10.18653/v1/2024.acl-long.693 ->>>>>>> origin/master <fixed-case>API</fixed-case>-<fixed-case>BLEND</fixed-case>: A Comprehensive Corpora for Training and Benchmarking <fixed-case>API</fixed-case> <fixed-case>LLM</fixed-case>s @@ -12012,12 +12000,9 @@ Recent LLMs have demonstrated remarkable performance in solving exam-like math word problems. However, the degree to which these numerical reasoning skills are effective in real-world scenarios, particularly in expert domains, is still largely unexplored. This paper introduces DocMath-Eval, a comprehensive benchmark specifically designed to evaluate the numerical reasoning capabilities of LLMs in the context of understanding and analyzing specialized documents containing both text and tables. 
We conduct an extensive evaluation of 48 LLMs with Chain-of-Thought and Program-of-Thought prompting methods, aiming to comprehensively assess the capabilities and limitations of existing LLMs in DocMath-Eval. We found that even the current best-performing system (i.e., GPT-4o) still significantly lags behind human experts in solving complex numerical reasoning problems grounded in long contexts. We believe that DocMath-Eval can serve as a valuable benchmark for evaluating LLMs' capabilities in solving challenging numerical reasoning problems within expert domains. 2024.acl-long.852 zhao-etal-2024-docmath -<<<<<<< HEAD Included experimental results. -======= 10.18653/v1/2024.acl-long.852 ->>>>>>> origin/master Unintended Impacts of <fixed-case>LLM</fixed-case> Alignment on Global Representation diff --git a/data/xml/2024.lrec.xml b/data/xml/2024.lrec.xml index 2d281a3e78..383adc9a3f 100644 --- a/data/xml/2024.lrec.xml +++ b/data/xml/2024.lrec.xml @@ -5515,7 +5515,7 @@ 5212–5220 Curriculum Learning (CL) has emerged as an effective technique for improving the performance and reducing the cost of pre-training Large Language Models (LLMs). The efficacy of CL demonstrated in different scenarios lies in training LLMs by organizing examples from the simplest to the most complex. Although improvements have been shown extensively, this approach was used for pre-training, leaving novel fine-tuning approaches such as instruction-tuning unexplored. In this paper, we propose a novel complexity measure to empower the instruction-tuning method using the CL paradigm. To complement previous works, we propose cognitively motivated measures to determine the complexity of training demonstrations used in the instruction-tuning paradigm. Hence, we experiment with the proposed heuristics first in English and then in other languages.
The downstream results show that delivering training examples by complexity ranking is also effective for instruction tuning, as it improves downstream performance while reducing costs. Furthermore, the technique can be easily transferred to languages other than English, e.g., Italian and French, without any adaptation, maintaining functionality and effectiveness. 2024.lrec-main.464 - pucci-ranaldi-2024-language + ranaldi-etal-2024-language Do Language Models Care about Text Quality? Evaluating Web-Crawled Corpora across 11 Languages From 041b2ff5f5f864731cb8b1dffd5b109df6f33be3 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Tue, 24 Sep 2024 10:33:21 -0400 Subject: [PATCH 36/39] Name corrections to 2024.arabicnlp-1.65 --- data/xml/2024.arabicnlp.xml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data/xml/2024.arabicnlp.xml b/data/xml/2024.arabicnlp.xml index a0792f7313..6cfc944a86 100644 --- a/data/xml/2024.arabicnlp.xml +++ b/data/xml/2024.arabicnlp.xml @@ -824,15 +824,15 @@ <fixed-case>B</fixed-case>ias<fixed-case>G</fixed-case>anda at <fixed-case>FIGNEWS</fixed-case> 2024 Shared Task: A Quest to Uncover Biased Views in News Coverage - BlqeesBlqees - AlWardi + Al ManarAl Wardi + BlqeesAl Busaidi MalathAl-Sibani - HibaAl-Siyabi - NajmaZidjalySultan Qaboos University + Hiba Salim MuhammadAl-Siyabi + NajmaAl ZidjalySultan Qaboos University 609-613 In this study, we aimed to identify biased language in a dataset provided by the FIGNEWS 2024 committee on the Gaza-Israel war. We classified entries into seven categories: Unbiased, Biased against Palestine, Biased against Israel, Biased against Others, Biased against both Palestine and Israel, Unclear, and Not Applicable. Our team reviewed the literature to develop a codebook of terminologies and definitions. By coding each example, we sought to detect language tendencies used by media outlets when reporting on the same event. 
The primary finding was that most examples were classified as “Biased against Palestine,” as all examined language data used one-sided terms to describe the October 7 event. The least used category was “Not Applicable,” reserved for irrelevant examples or those lacking context. It is recommended to use neutral and balanced language when reporting volatile political news. 2024.arabicnlp-1.65 - blqees-etal-2024-biasganda + al-wardi-etal-2024-biasganda 10.18653/v1/2024.arabicnlp-1.65 From b182dc94bafd79ce38d2a5c0f4671820e7e6f797 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Wed, 25 Sep 2024 17:13:03 -0400 Subject: [PATCH 37/39] Name correction; 2024.eamt-1.14 --- data/xml/2024.eamt.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.eamt.xml b/data/xml/2024.eamt.xml index 7e0b7edcf9..a0ce21b508 100644 --- a/data/xml/2024.eamt.xml +++ b/data/xml/2024.eamt.xml @@ -150,7 +150,7 @@ Quality Estimation with <tex-math>k</tex-math>-nearest Neighbors and Automatic Evaluation for Model-specific Quality Estimation - TuDinhKarlsruher Institut für Technologie + TuAnh DinhKarlsruhe Institut für Technologie TobiasPalzerTechnische Universität München JanNiehues 133-146 From abc4d953a476ff138cca8f0fad17ad7ea5320c94 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Mon, 30 Sep 2024 06:29:42 -0500 Subject: [PATCH 38/39] Remove middle name --- data/xml/2024.lrec.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/xml/2024.lrec.xml b/data/xml/2024.lrec.xml index 383adc9a3f..d014c43cbd 100644 --- a/data/xml/2024.lrec.xml +++ b/data/xml/2024.lrec.xml @@ -3041,7 +3041,7 @@ <fixed-case>CASIMIR</fixed-case>: A Corpus of Scientific Articles Enhanced with Multiple Author-Integrated Revisions - Léane IsabelleJourdan + LéaneJourdan FlorianBoudin NicolasHernandez RichardDufour From e19d9b89ee977f758b87b16fe21a2a51b8e123f0 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Tue, 24 Sep 2024 10:12:11 -0400 Subject: [PATCH 39/39] Name correction (closes 
#3872) --- data/xml/2024.acl.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index f1d3c6de9b..3308126c2b 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -5988,7 +5988,7 @@ <fixed-case>LLME</fixed-case>mbed: Rethinking Lightweight <fixed-case>LLM</fixed-case>’s Genuine Function in Text Classification - ChunLiuChunLiuAMS + ChunLiuAMS HongguangZhangSystems Engineering Institute, AMS KainanZhaoAMS XinghaiJuInformation Engineering University 7994-8004 With the booming of Large Language Models (LLMs), prompt-learning has become a promising method mainly researched in various research areas. Recently, many attempts based on prompt-learning have been made to improve the performance of text classification. However, most of these methods are based on heuristic Chain-of-Thought (CoT), and tend to be more complex but less efficient. In this paper, we rethink the LLM-based text classification methodology, propose a simple and effective transfer learning strategy, namely LLMEmbed, to address this classical but challenging task. To illustrate, we first study how to properly extract and fuse the text embeddings via various lightweight LLMs at different network depths to improve their robustness and discrimination, then adapt such embeddings to train the classifier. We perform extensive experiments on publicly available datasets, and the results show that LLMEmbed achieves strong performance while enjoying low training overhead using lightweight LLM backbones compared to recent methods based on larger LLMs, *i.e.* GPT-3, and sophisticated prompt-based strategies. Our LLMEmbed achieves adequate accuracy on publicly available benchmarks without any fine-tuning while merely using 4% model parameters, 1.8% electricity consumption and 1.5% runtime compared to its counterparts. Code is available at: https://github.com/ChunLiu-cs/LLMEmbed-ACL2024.
2024.acl-long.433 - chunliu-etal-2024-llmembed + liu-etal-2024-llmembed 10.18653/v1/2024.acl-long.433