diff --git a/bin/ingest_aclpub2.py b/bin/ingest_aclpub2.py
index c31776668b..d192c34808 100755
--- a/bin/ingest_aclpub2.py
+++ b/bin/ingest_aclpub2.py
@@ -46,6 +46,10 @@
 #
 # Check things over, then commit and push the changes and synchronize the files.

+# TODO:
+# - check for venue YAML, create/complain if non-existent
+# - add verification model to ensure format is correct
+
 import click
 import yaml
 import re
@@ -142,12 +146,19 @@ def parse_conf_yaml(ingestion_dir: str) -> Dict[str, Any]:
         cover_subtitle == shortbooktitle
     '''
     ingestion_dir = Path(ingestion_dir)
-    if (ingestion_dir / 'conference_details.yml').exists():
-        meta = yaml.safe_load((ingestion_dir / 'conference_details.yml').read_text())
+
+    paths_to_check = [
+        ingestion_dir / 'conference_details.yml',
+        ingestion_dir / 'inputs' / 'conference_details.yml',
+    ]
+    meta = None
+    for path in paths_to_check:
+        if path.exists():
+            meta = yaml.safe_load(path.read_text())
+            break
     else:
-        meta = yaml.safe_load(
-            (ingestion_dir / 'inputs/conference_details.yml').read_text()
-        )
+        raise Exception(f"Can't find conference_details.yml (looked in {paths_to_check})")
+
     meta['month'] = meta['start_date'].strftime('%B')
     meta['year'] = str(meta['start_date'].year)

@@ -175,12 +186,26 @@ def parse_conf_yaml(ingestion_dir: str) -> Dict[str, Any]:


 def parse_paper_yaml(ingestion_dir: str) -> List[Dict[str, str]]:
+    """
+    Reads papers.yml to get metadata. Skips non-archival papers.
+    """
     ingestion_dir = Path(ingestion_dir)
-    if (ingestion_dir / 'conference_details.yml').exists():
-        papers = yaml.safe_load((ingestion_dir / 'papers.yml').read_text())
+
+    paths_to_check = [
+        ingestion_dir / 'papers.yml',
+        ingestion_dir / 'inputs' / 'papers.yml',
+    ]
+    papers = None
+    for path in paths_to_check:
+        if path.exists():
+            papers = yaml.safe_load(path.read_text())
+            break
     else:
-        papers = yaml.safe_load((ingestion_dir / 'input/papers.yml').read_text())
+        raise Exception("Can't find papers.yml (looked in root dir and under inputs/)")
+
+    # remove non-archival papers
+    papers = [p for p in papers if p.get('archival', True)]
+
     return papers

@@ -194,42 +219,42 @@ def add_paper_nums_in_paper_yaml(
     start, end = 1, 0
     for paper in papers:
-        if 'archival' not in paper.keys():
-            paper.update({'archival': '1'})
-        assert 'archival' in paper.keys(), f'{paper["id"]} is missing key archival'
         assert 'file' in paper.keys(), f'{paper["id"]} is missing key file'
-        if (
-            paper['archival'] == 1
-            or paper['archival'] is True
-            or paper['archival'] == '1'
-        ):
-            paper_id = str(paper['id'])
-            # if 'file' not in paper.keys():
-            #     print(f'{paper_id} does not have file key but archive is {paper["archival"]}')
-            #     paper_name = paper['title']
-            # else:
-            paper_path = paper['file']
-            paper_need_read_path = None
-            # TODO: we should just be able to read paper_path directly, and throw an
-            # error if it doesn't exist
-            if (path := ingestion_dir / "watermarked_pdfs" / paper_path).exists():
-                paper_need_read_path = str(path)
-            elif (
-                path := ingestion_dir / "watermarked_pdfs" / f"{paper_id}.pdf"
-            ).exists():
+        paper_id = str(paper['id'])
+        # if 'file' not in paper.keys():
+        #     print(f'{paper_id} does not have file key but archive is {paper["archival"]}')
+        #     paper_name = paper['title']
+        # else:
+
+        paper_path = paper['file']
+
+        # TODO: we should just be able to read paper_path directly, and throw an
+        # error if it doesn't exist
+        paper_need_read_path = None
+        paths_to_check = [
+            ingestion_dir / "watermarked_pdfs" / paper_path,
+            ingestion_dir / "watermarked_pdfs" / f"{paper_id}.pdf",
+            ingestion_dir / "build" / "watermarked_pdfs" / paper_path,
+            ingestion_dir / "build" / "watermarked_pdfs" / f"{paper_id}.pdf",
+        ]
+        paper_need_read_path = None
+        for path in paths_to_check:
+            if path.exists():
                 paper_need_read_path = str(path)
+                break
+        else:
+            raise Exception(
+                f"* Fatal: could not find paper ID {paper_id} ({paths_to_check})"
+            )

-            assert (
-                paper_need_read_path is not None
-            ), f"* Fatal: could not find {paper_id} (path was {paper_path}, {path})"
+        pdf = open(paper_need_read_path, 'rb')
+        pdf_reader = PyPDF2.PdfReader(pdf)
+        num_of_pages = len(pdf_reader.pages)
+        start = end + 1
+        end = start + num_of_pages - 1
+        paper['pages'] = f'{start}-{end}'

-            pdf = open(paper_need_read_path, 'rb')
-            pdf_reader = PyPDF2.PdfReader(pdf)
-            num_of_pages = len(pdf_reader.pages)
-            start = end + 1
-            end = start + num_of_pages - 1
-            paper['pages'] = f'{start}-{end}'

     return papers

@@ -342,6 +367,7 @@ def paper2xml(
             'semantic_scholar_id',
             'username']
     '''
+
     fields = [
         'title',
         'author',
@@ -351,7 +377,7 @@ def paper2xml(
         'doi',
         'language',
     ]
-    paper = make_simple_element('paper', attrib={'id': str(paper_num)})
+    paper = make_simple_element('paper', attrib={"id": str(paper_num)})
     for field in fields:
         if field == 'author':
             authors = paper_item['authors']
@@ -372,15 +398,19 @@ def paper2xml(
         if field == 'url':
             value = f'{anthology_id}'
         elif field == 'abstract':
-            value = paper_item['abstract'].replace('\n', '')
+            value = None
+            if "abstract" in paper_item:
+                value = paper_item["abstract"].replace('\n', '')
         elif field == 'title':
             value = paper_item[field]
         elif field == 'pages':
             value = paper_item[field]
         else:
             continue
+
         try:
-            make_simple_element(field, text=value, parent=paper)
+            if value is not None:
+                make_simple_element(field, text=value, parent=paper)
         except Exception:
             print(
                 f"Couldn't process {paper} for {anthology_id}, please check the abstract in the papers.yaml file for this paper",
@@ -450,16 +480,39 @@ def copy_pdf_and_attachment(
     venue_name = meta['anthology_venue_id'].lower()
     volume_name = meta['volume_name'].lower()

-    pdfs_dest_dir = create_dest_path(pdfs_dir, venue_name)
-    pdfs_src_dir = os.path.join(meta['path'], 'watermarked_pdfs')
+    pdfs_src_dir = None
+    paths_to_check = [
+        Path(meta['path']) / 'watermarked_pdfs',
+        Path(meta['path']) / 'build' / 'watermarked_pdfs',
+    ]
+    for path in paths_to_check:
+        if path.exists() and path.is_dir():
+            pdfs_src_dir = path
+            break
+    else:
+        raise FileNotFoundError(f"Could not find watermarked PDFs in {paths_to_check}")
+
+    pdfs_dest_dir = Path(create_dest_path(pdfs_dir, venue_name))
     # copy proceedings.pdf
-    proceedings_pdf_src_path = os.path.join(meta['path'], 'proceedings.pdf')
-    proceedings_pdf_dest_path = None
-    if os.path.exists(proceedings_pdf_src_path):
-        proceedings_pdf_dest_path = (
-            os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
+    proceedings_pdf_src_path = None
+    paths_to_check = [
+        Path('proceedings.pdf'),
+        Path("build") / 'proceedings.pdf',
+    ]
+    for path in paths_to_check:
+        if path.exists():
+            proceedings_pdf_src_path = str(path)
+            break
+    else:
+        print(
+            f"Warning: could not find proceedings.pdf in {paths_to_check}",
+            file=sys.stderr,
         )
+
+    proceedings_pdf_dest_path = None
+    if proceedings_pdf_src_path is not None:
+        proceedings_pdf_dest_path = pdfs_dest_dir / f"{collection_id}-{volume_name}.pdf"
         if dry_run:
             print(
                 f'would\'ve moved {proceedings_pdf_src_path} to {proceedings_pdf_dest_path}'
             )
@@ -476,11 +529,24 @@ def copy_pdf_and_attachment(
         "attachments": [],
     }

-    frontmatter_src_path = 'front_matter.pdf'
-    if os.path.exists(frontmatter_src_path):
-        frontmatter_dest_path = (
-            os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + '.0.pdf'
+    frontmatter_src_path = None
+    paths_to_check = [
+        Path('front_matter.pdf'),
+        Path('0.pdf'),
+        Path("build") / 'front_matter.pdf',
+        Path("build") / '0.pdf',
+    ]
+    for path in paths_to_check:
+        if path.exists():
+            frontmatter_src_path = str(path)
+            break
+    else:
+        print(
+            f"Warning: could not find front matter in {paths_to_check}", file=sys.stderr
         )
+
+    if frontmatter_src_path is not None:
+        frontmatter_dest_path = pdfs_dest_dir / f"{collection_id}-{volume_name}.0.pdf"
         if dry_run:
             print(f'would\'ve moved {frontmatter_src_path} to {frontmatter_dest_path}')
         if not dry_run:
@@ -489,6 +555,7 @@
         # create the PDF entry so that we'll get
         volume[0]['pdf'] = frontmatter_dest_path

+    paper_num = 0
     for i, paper in enumerate(papers):
         # archival papers only
         if 'archival' not in paper.keys():
@@ -509,23 +576,21 @@
             # paper_name = paper['file']
         if paper_name != '' or paper_name is not None:
             paper_id = str(paper['id'])
-            paper_num = i + 1
+            paper_num += 1
             paper_id_full = f'{collection_id}-{volume_name}.{paper_num}'

             pdf_src_path = None
-            if os.path.exists(os.path.join(pdfs_src_dir, paper_name)):
-                pdf_src_path = os.path.join(pdfs_src_dir, paper_name)
-            elif os.path.exists(os.path.join(pdfs_src_dir, f'{paper_id}.pdf')):
-                pdf_src_path = os.path.join(pdfs_src_dir, f'{paper_id}.pdf')
+            if (pdfs_src_dir / paper_name).exists():
+                pdf_src_path = pdfs_src_dir / paper_name
+            elif (pdfs_src_dir / f'{paper_id}.pdf').exists():
+                pdf_src_path = pdfs_src_dir / f'{paper_id}.pdf'
             assert (
                 pdf_src_path
-            ), f"Couldn't find {paper_name}/{paper_id} in {pdfs_src_dir}"
-            pdf_dest_path = os.path.join(
-                pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
-            )
+            ), f"Couldn't find {paper_name} or {paper_id} in {pdfs_src_dir}"
+            pdf_dest_path = pdfs_dest_dir / f"{paper_id_full}.pdf"
             if dry_run:
-                print(f'would\'ve moved {pdf_src_path} to {pdf_dest_path}')
+                print(f"would've moved {pdf_src_path} to {pdf_dest_path}")
             if not dry_run:
                 maybe_copy(pdf_src_path, pdf_dest_path)

@@ -536,22 +601,35 @@
             }

             # copy attachments
-            # TODO: skipping attachments because full of non-publishable stuff
-            if False and 'attachments' in paper:
+            if 'attachments' in paper:
                 attachs_dest_dir = create_dest_path(attachments_dir, venue_name)
-                attachs_src_dir = os.path.join(meta['path'], 'attachments')
-                assert os.path.exists(
-                    attachs_src_dir
-                ), f'paper {i, paper_name} contains attachments but attachments folder was not found'
+                attachs_src_dir = Path(meta['path']) / 'attachments'
+                # assert (
+                #     attachs_src_dir.exists()
+                # ), f'paper {i, paper_name} contains attachments but attachments folder was not found'

                 for attachment in paper['attachments']:
-                    print("ATTACH", paper_id_full, attachment)
-                    file_path = attachment.get('file', None)
+                    file_path = Path(attachment.get('file', None))
                     if file_path is None:
                         continue
-                    attach_src_path = attachs_src_dir + '/' + file_path
-                    attach_src_extension = attach_src_path.split(".")[-1]
+                    attach_src_path = None
+                    paths_to_check = [
+                        attachs_src_dir / file_path,
+                        attachs_src_dir / file_path.name,
+                    ]
+                    for path in paths_to_check:
+                        if path.exists():
+                            attach_src_path = str(path)
+                            break
+                    else:
+                        print(
+                            f"Warning: paper {paper_id} attachment {file_path} not found, skipping",
+                            file=sys.stderr,
+                        )
+                        continue
+
+                    attach_src_extension = attach_src_path.split(".")[-1]
type_ = attachment['type'].replace(" ", "") file_name = f'{collection_id}-{volume_name}.{paper_num}.{type_}.{attach_src_extension}' @@ -567,6 +645,7 @@ def copy_pdf_and_attachment( ) else: maybe_copy(attach_src_path, attach_dest_path) + print(f"Attaching {attach_dest_path}/{type_} to {paper_num}") volume[paper_num]['attachments'].append( (attach_dest_path, type_) ) @@ -767,10 +846,13 @@ def main(ingestion_dir, pdfs_dir, attachments_dir, dry_run, anthology_dir, inges volume_full_id, meta = process_proceeding( ingestion_dir, anthology_datadir, venue_index, venue_keys ) + + # Load the papers.yaml file, skipping non-archival papers papers = parse_paper_yaml(ingestion_dir) # print(f'original paper {papers[0]}') + + # add page numbering by parsing the PDFs papers = add_paper_nums_in_paper_yaml(papers, ingestion_dir) - # print(f'updated paper {papers[0]}') ( volume, diff --git a/bin/ingest_mitpress.py b/bin/ingest_mitpress.py index 70449675e2..d0bed2254f 100755 --- a/bin/ingest_mitpress.py +++ b/bin/ingest_mitpress.py @@ -424,7 +424,6 @@ def sort_papers_by_page(paper_tuple): paper_id = max(paper_id, int(paper.attrib["id"])) paper_id += 1 - print(f"Setting paper_id to {paper_id}") anth_id = f"{collection_id}-{issue}.{paper_id}" diff --git a/data/xml/2020.aacl.xml b/data/xml/2020.aacl.xml index 395dc1fc60..e1d3800ffe 100644 --- a/data/xml/2020.aacl.xml +++ b/data/xml/2020.aacl.xml @@ -1549,7 +1549,7 @@ We introduce fairseq S2T, a fairseq extension for speech-to-text (S2T) modeling tasks such as end-to-end speech recognition and speech-to-text translation. It follows fairseq’s careful design for scalability and extensibility. We provide end-to-end workflows from data pre-processing, model training to offline (online) inference. We implement state-of-the-art RNN-based as well as Transformer-based models and open-source detailed training recipes. Fairseq’s machine translation models and language models can be seamlessly integrated into S2T workflows for multi-task learning or transfer learning. Fairseq S2T is available at https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text. 2020.aacl-demo.6 wang-etal-2020-fairseq - pytorch/fairseq + pytorch/fairseq LibriSpeech MuST-C diff --git a/data/xml/2020.emnlp.xml b/data/xml/2020.emnlp.xml index cdffea717b..8f1296d816 100644 --- a/data/xml/2020.emnlp.xml +++ b/data/xml/2020.emnlp.xml @@ -5911,6 +5911,7 @@ 10.18653/v1/2020.emnlp-main.392 <fixed-case>FINDINGS</fixed-case> <fixed-case>OF</fixed-case> <fixed-case>THE</fixed-case> <fixed-case>IWSLT</fixed-case> 2023 <fixed-case>EVALUATION</fixed-case> <fixed-case>CAMPAIGN</fixed-case> - SwetaAgrawalUmd - AntoniosAnastasopoulosGmu - LuisaBentivogliFbk + SwetaAgrawalUMD + AntoniosAnastasopoulosGMU + LuisaBentivogliFBK OndřejBojarCharles U. ClaudiaBorgU. Malta - MarineCarpuatUmd - RoldanoCattoniFbk - MauroCettoloFbk + MarineCarpuatUMD + RoldanoCattoniFBK + MauroCettoloFBK MingdaChenMeta - WilliamChenCmu - KhalidChoukriElda - AlexandraChronopoulouLmu - AnnaCurreyAws - ThierryDeclerckDfki + WilliamChenCMU + KhalidChoukriELDA + AlexandraChronopoulouLMU + AnnaCurreyAWS + ThierryDeclerckDFKI QianqianDongBytedance - KevinDuhJhu + KevinDuhJHU YannickEstèveAvignon U. - MarcelloFedericoAws + MarcelloFedericoAWS SouhirGahbicheAirbus BarryHaddowU. Edinburgh - BenjaminHsuAws - PhuMon HtutAws + BenjaminHsuAWS + PhuMon HtutAWS HirofumiInagumaMeta DávidJavorskýCharles U. - JohnJudgeDcu - YasumasaKanoNaist + JohnJudgeDCU + YasumasaKanoNAIST TomKoBytedance RishuKumarCharles U. 
PengweiLiMeta XutaiMaMeta - PrashantMathurAws + PrashantMathurAWS EvgenyMatusovAppTek - PaulMcNameeJhu + PaulMcNameeJHU JohnP. McCraeU. Galway - KentonMurrayJhu - MariaNadejdeAws - SatoshiNakamuraNaist - MatteoNegriFbk + KentonMurrayJHU + MariaNadejdeAWS + SatoshiNakamuraNAIST + MatteoNegriFBK HaNguyenAvignon U. - JanNiehuesKit - XingNiuAws + JanNiehuesKIT + XingNiuAWS AtulKr. OjhaU. Galway JohnE. OrtegaNortheastern U. ProyagPalU. Edinburgh JuanPinoMeta - Lonnekevan der PlasIdiap + Lonnekevan der PlasIDIAP PeterPolákCharles U. - ElijahRippethUmd - ElizabethSaleskyJhu - JiatongShiCmu + ElijahRippethUMD + ElizabethSaleskyJHU + JiatongShiCMU MatthiasSperberApple SebastianStükerZoom - KatsuhitoSudohNaist + KatsuhitoSudohNAIST YunTangMeta - BrianThompsonAws + BrianThompsonAWS KevinTranMeta MarcoTurchiZoom - AlexWaibelCmu + AlexWaibelCMU MingxuanWangBytedance - ShinjiWatanabeCmu + ShinjiWatanabeCMU RodolfoZevallosU. Pompeu Fabra 1-61 This paper reports on the shared tasks organized by the 20th IWSLT Conference. The shared tasks address 9 scientific challenges in spoken language translation: simultaneous and offline translation, automatic subtitling and dubbing, speech-to-speech translation, multilingual, dialect and low-resource speech translation, and formality control. The shared tasks attracted a total of 38 submissions by 31 teams. The growing interest towards spoken language translation is also witnessed by the constantly increasing number of shared task organizers and contributors to the overview paper, almost evenly distributed across industry and academia. @@ -95,12 +95,13 @@ 62-78 We present the ACL 60/60 evaluation sets for multilingual translation of ACL 2022 technical presentations into 10 target languages. This dataset enables further research into multilingual speech translation under realistic recording conditions with unsegmented audio and domain-specific terminology, applying NLP tools to text and speech in the technical domain, and evaluating and improving model robustness to diverse speaker demographics. 
2023.iwslt-1.2 + 2023.iwslt-1.2.dataset.zip salesky-etal-2023-evaluating The <fixed-case>M</fixed-case>ine<fixed-case>T</fixed-case>rans Systems for <fixed-case>IWSLT</fixed-case> 2023 Offline Speech Translation and Speech-to-Speech Translation Tasks YichaoDuUniversity of Science and Technology of China - GuoZhengshengTencent + GuoZhengshengtencent JinchuanTianPeking University ZhiruiZhangTencent AI Lab XingWangTencent @@ -127,7 +128,7 @@ The <fixed-case>USTC</fixed-case>’s Dialect Speech Translation System for <fixed-case>IWSLT</fixed-case> 2023 PanDengUniversity of Science and Technology of China ShihaoChenUniversity of Science and Technology of China - WeitaiZhangUstc + WeitaiZhangUSTC JieZhangUniversity of Science &Technology of China LirongDaiUniversity of Science &Technology of China 102-112 @@ -164,7 +165,7 @@ Enhancing Video Translation Context with Object Labels JeremyGwinnupAir Force Research Laboratory TimAndersonAir Force Research Laboratory - BrianOreAfrl + BrianOreAFRL EricHansenAir Force Research Laboratory KevinDuhJohns Hopkins University 130-137 @@ -218,7 +219,7 @@ <fixed-case>MT</fixed-case> Metrics Correlate with Human Ratings of Simultaneous Speech Translation DominikMacháčekCharles University, MFF UFAL OndřejBojarCharles University, MFF UFAL - RajDabreNict + RajDabreNICT 169-179 There have been several meta-evaluation studies on the correlation between human ratings and offline machine translation (MT) evaluation metrics such as BLEU, chrF2, BertScore and COMET. These metrics have been used to evaluate simultaneous speech translation (SST) but their correlations with human ratings of SST, which has been recently collected as Continuous Ratings (CR), are unclear. In this paper, we leverage the evaluations of candidate systems submitted to the English-German SST task at IWSLT 2022 and conduct an extensive correlation analysis of CR and the aforementioned metrics. Our study reveals that the offline metrics are well correlated with CR and can be reliably used for evaluating machine translation in simultaneous mode, with some limitations on the test set size. We conclude that given the current quality levels of SST, these metrics can be used as proxies for CR, alleviating the need for large scale human evaluation. Additionally, we observe that correlations of the metrics with translation as a reference is significantly higher than with simultaneous interpreting, and thus we recommend the former for reliable evaluation. 2023.iwslt-1.12 @@ -270,13 +271,13 @@ Submission of <fixed-case>USTC</fixed-case>’s System for the <fixed-case>IWSLT</fixed-case> 2023 - Offline Speech Translation Track - XinyuanZhouIflytek + XinyuanZhouiflytek JianweiCuiUniversity of Science and Technology of China - ZhongyiYeIflytek + ZhongyiYeiflytek YichiWangUniversity of Science and Technology of China LuzhenXuUniversity of Science and Technology of China - HanyiZhangIflytek - WeitaiZhangUstc + HanyiZhangiflytek + WeitaiZhangUSTC LirongDaiUniversity of Science and Technology of China 194-201 This paper describes the submissions of the research group USTC-NELSLIP to the 2023 IWSLT Offline Speech Translation competition, which involves translating spoken English into written Chinese. We utilize both cascaded models and end-to-end models for this task. To improve the performance of the cascaded models, we introduce Whisper to reduce errors in the intermediate source language text, achieving a significant improvement in ASR recognition performance. 
For end-to-end models, we propose Stacked Acoustic-and-Textual En- coding extension (SATE-ex), which feeds the output of the acoustic decoder into the textual decoder for information fusion and to prevent error propagation. Additionally, we improve the performance of the end-to-end system in translating speech by combining the SATE-ex model with the encoder-decoder model through ensembling. @@ -287,7 +288,7 @@ <fixed-case>I</fixed-case>2<fixed-case>R</fixed-case>’s End-to-End Speech Translation System for <fixed-case>IWSLT</fixed-case> 2023 Offline Shared Task MuhammadHuzaifahAgency for Science, Technology and Research KyeMin TanInstitute for Infocomm Research, A*STAR - RichengDuanAstar + RichengDuanASTAR 202-210 This paper describes I2R’s submission to the offline speech translation track for IWSLT 2023. We focus on an end-to-end approach for translation from English audio to German text, one of the three available language directions in this year’s edition. The I2R system leverages on pretrained models that have been exposed to large-scale audio and text data for our base model. We introduce several stages of additional pretraining followed by fine-tuning to adapt the system for the downstream speech translation task. The strategy is supplemented by other techniques such as data augmentation, domain tagging, knowledge distillation, and model ensemble, among others. We evaluate the system on several publicly available test sets for comparison. 2023.iwslt-1.16 @@ -319,7 +320,7 @@ SalimaMdhaffarLIA - University of Avignon GaëlleLaperrièreAvignon University LIA LucasMaisonLIA - Avignon University - SameerKhuranaMit + SameerKhuranaMIT YannickEstèveLIA - Avignon University 219-226 This paper describes the ON-TRAC consortium speech translation systems developed for IWSLT 2023 evaluation campaign. Overall, we participated in three speech translation tracks featured in the low-resource and dialect speech translation shared tasks, namely; i) spoken Tamasheq to written French, ii) spoken Pashto to written French, and iii) spoken Tunisian to written English. All our primary submissions are based on the end-to-end speech-to-text neural architecture using a pretrained SAMU-XLSR model as a speech encoder and a mbart model as a decoder. The SAMU-XLSR model is built from the XLS-R 128 in order to generate language agnostic sentence-level embeddings. This building is driven by the LaBSE model trained on multilingual text dataset. This architecture allows us to improve the input speech representations and achieve significant improvements compared to conventional end-to-end speech translation systems. @@ -405,7 +406,7 @@ HengchaoShangHuawei Technologies Co., Ltd. DaimengWeiHuawei Technologies Co., Ltd. MinZhangHuawei - ShiminTaoHuawei + ShiminTaohuawei HaoYangHuawei Co. Ltd 277-282 This paper describes our work on the IWSLT2023 Speech-to-Speech task. Our proposed cascaded system consists of an ensemble of Conformer and S2T-Transformer-based ASR models, a Transformer-based MT model, and a Diffusion-based TTS model. Our primary focus in this competition was to investigate the modeling ability of the Diffusion model for TTS tasks in high-resource scenarios and the role of TTS in the overall S2S task. To this end, we proposed DTS, an end-to-end diffusion-based TTS model that takes raw text as input and generates waveform by iteratively denoising on pure Gaussian noise. Compared to previous TTS models, the speech generated by DTS is more natural and performs better in code-switching scenarios. 
As the training process is end-to-end, it is relatively straightforward. Our experiments demonstrate that DTS outperforms other TTS models on the GigaS2S benchmark, and also brings positive gains for the entire S2S system. @@ -476,15 +477,15 @@ <fixed-case>NAIST</fixed-case> Simultaneous Speech-to-speech Translation System for <fixed-case>IWSLT</fixed-case> 2023 - RyoFukudaNaist - YutaNishikawaNaist + RyoFukudaNAIST + YutaNishikawaNAIST YasumasaKanoNara Institute of Science and Technology - YukaKoNaist + YukaKoNAIST TomoyaYanagitaNara Institute of Science and Technology KosukeDoiNara Institute of Science and Technology - ManaMakinaeNaist - SakrianiSaktiJaist/naist - KatsuhitoSudohNaist + ManaMakinaeNAIST + SakrianiSaktiJAIST/NAIST + KatsuhitoSudohNAIST SatoshiNakamuraNara Institute of Science and Technology 330-340 This paper describes NAIST’s submission to the IWSLT 2023 Simultaneous Speech Translation task: English-to-German, Japanese, Chinese speech-to-text translation and English-to-Japanese speech-to-speech translation. Our speech-to-text system uses an end-to-end multilingual speech translation model based on large-scale pre-trained speech and text models. We add Inter-connections into the model to incorporate the outputs from intermediate layers of the pre-trained speech model and augment prefix-to-prefix text data using Bilingual Prefix Alignment to enhance the simultaneity of the offline speech translation model. Our speech-to-speech system employs an incremental text-to-speech module that consists of a Japanese pronunciation estimation model, an acoustic model, and a neural vocoder. @@ -515,11 +516,11 @@ Tagged End-to-End Simultaneous Speech Translation Training Using Simultaneous Interpretation Data - YukaKoNaist - RyoFukudaNaist - YutaNishikawaNaist + YukaKoNAIST + RyoFukudaNAIST + YutaNishikawaNAIST YasumasaKanoNara Institute of Science and Technology - KatsuhitoSudohNaist + KatsuhitoSudohNAIST SatoshiNakamuraNara Institute of Science and Technology 363-375 Simultaneous speech translation (SimulST) translates partial speech inputs incrementally. Although the monotonic correspondence between input and output is preferable for smaller latency, it is not the case for distant language pairs such as English and Japanese. A prospective approach to this problem is to mimic simultaneous interpretation (SI) using SI data to train a SimulST model. However, the size of such SI data is limited, so the SI data should be used together with ordinary bilingual data whose translations are given in offline. In this paper, we propose an effective way to train a SimulST model using mixed data of SI and offline. The proposed method trains a single model using the mixed data with style tags that tell the model to generate SI- or offline-style outputs. Experiment results show improvements of BLEURT in different latency ranges, and our analyses revealed the proposed model generates SI-style outputs more than the baseline. @@ -580,7 +581,7 @@ Speech Translation with Foundation Models and Optimal Transport: <fixed-case>UPC</fixed-case> at <fixed-case>IWSLT</fixed-case>23 - IoannisTsiamasUpc + IoannisTsiamasUPC GerardI. GállegoUniversitat Politcnica de Catalunya JoseFonollosaUniversitat Politecnica de Catalunya MartaR. 
Costa-jussáMeta AI @@ -627,7 +628,7 @@ KurtMicallefUniversity of Malta AhnafMozib SaminUniversity of Malta AndreaDeMarcoUniversity of Malta - Lonnekevan der PlasIdiap + Lonnekevan der PlasIDIAP ClaudiaBorgUniversity of Malta 433-441 For the 2023 IWSLT Maltese Speech Translation Task, UM-DFKI jointly presents a cascade solution which achieves 0.6 BLEU. While this is the first time that a Maltese speech translation task has been released by IWSLT, this paper explores previous solutions for other speech translation tasks, focusing primarily on low-resource scenarios. Moreover, we present our method of fine-tuning XLS-R models for Maltese ASR using a collection of multi-lingual speech corpora as well as the fine-tuning of the mBART model for Maltese to English machine translation. @@ -636,10 +637,10 @@ <fixed-case>NVIDIA</fixed-case> <fixed-case>N</fixed-case>e<fixed-case>M</fixed-case>o Offline Speech Translation Systems for <fixed-case>IWSLT</fixed-case> 2023 - OleksiiHrinchukNvidia + OleksiiHrinchukNVIDIA VladimirBataevSTC-innovations Ltd EvelinaBakhturinaNvidia - BorisGinsburgNvidia + BorisGinsburgNVIDIA 442-448 This paper provides an overview of NVIDIA NeMo’s speech translation systems for the IWSLT 2023 Offline Speech Translation Task. This year, we focused on end-to-end system which capitalizes on pre-trained models and synthetic data to mitigate the problem of direct speech translation data scarcity. When trained on IWSLT 2022 constrained data, our best En->De end-to-end model achieves the average score of 31 BLEU on 7 test sets from IWSLT 2010-2020 which improves over our last year cascade (28.4) and end-to-end (25.7) submissions. When trained on IWSLT 2023 constrained data, the average score drops to 29.5 BLEU. 2023.iwslt-1.42 diff --git a/data/xml/2023.nlrse.xml b/data/xml/2023.nlrse.xml new file mode 100644 index 0000000000..9fcd1cc7c4 --- /dev/null +++ b/data/xml/2023.nlrse.xml @@ -0,0 +1,149 @@ + + + + + Proceedings of the 1st Workshop on Natural Language Reasoning and Structured Explanations (NLRSE) + BhavanaDalvi Mishra + GregDurrett + PeterJansen + DaniloNeves Ribeiro + JasonWei + Association for Computational Linguistics +
Toronto, Canada
+ June + 2023 + 2023.nlrse-1 + nlrse + + + 2023.nlrse-1.0 + nlrse-2023-natural + + + Knowledge Graph-augmented Language Models for Complex Question Answering + PriyankaSenAmazon + SandeepMavadiaAmazon Alexa + AmirSaffariAmazon + 1-8 + Large language models have shown impressive abilities to reason over input text, however, they are prone to hallucinations. On the other hand, end-to-end knowledge graph question answering (KGQA) models output responses grounded in facts, but they still struggle with complex reasoning, such as comparison or ordinal questions. In this paper, we propose a new method for complex question answering where we combine a knowledge graph retriever based on an end-to-end KGQA model with a language model that reasons over the retrieved facts to return an answer. We observe that augmenting language model prompts with retrieved KG facts improves performance over using a language model alone by an average of 83%. In particular, we see improvements on complex questions requiring count, intersection, or multi-hop reasoning operations. + 2023.nlrse-1.1 + sen-etal-2023-knowledge + + + Exploring the Curious Case of Code Prompts + LiZhangUniversity of Pennsylvania + LiamDuganUniversity of Pennsylvania + HainiuXuUniversity of Pennsylvania + ChrisCallison-burchUniversity of Pennsylvania + 9-17 + Recent work has shown that prompting language models with code-like representations of natural language leads to performance improvements on structured reasoning tasks. However, such tasks comprise only a small subset of all natural language tasks. In our work, we seek to answer whether or not code-prompting is the preferred way of interacting with language models in general. We compare code and text prompts across three popular GPT models (davinci, code-davinci-002, and text-davinci-002) on a broader selection of tasks (e.g., QA, sentiment, summarization) and find that with few exceptions, code prompts do not consistently outperform text prompts. Furthermore, we show that the style of code prompt has a large effect on performance for some (but not all) tasks and that fine-tuning on text instructions leads to better relative performance of code prompts. + 2023.nlrse-1.2 + zhang-etal-2023-exploring + + + A smashed glass cannot be full: Generation of Commonsense Explanations through Prompt-based Few-shot Learning + AndreaZaninelloFondazione Bruno Kessler + BernardoMagniniFBK + 18-29 + We assume that providing explanations is a process to elicit implicit knowledge in human communication, and propose a general methodology to generate commonsense explanations from pairs of semantically related sentences. We take advantage of both prompting applied to large, encoder-decoder pre-trained language models, and few-shot learning techniques, such as pattern-exploiting training. Experiments run on the e-SNLI dataset show that the proposed method achieves state-of-the-art results on the explanation generation task, with a substantial reduction of labelled data. The obtained results open new perspective on a number of tasks involving the elicitation of implicit knowledge. 
+ 2023.nlrse-1.3 + zaninello-magnini-2023-smashed + + + Saliency Map Verbalization: Comparing Feature Importance Representations from Model-free and Instruction-based Methods + NilsFeldhusGerman Research Center for Artificial Intelligence (DFKI) + LeonhardHennigGerman Research Center for Artificial Intelligence (DFKI) + MaximilianNasertGerman Research Center for Artificial Intelligence (DFKI) + ChristopherEbertGerman Research Center for Artificial Intelligence (DFKI) + RobertSchwarzenbergGerman Research Center For Artificial Intelligence (DFKI) + SebastianMllerQuality and Usability Lab, TU Berlin + 30-46 + Saliency maps can explain a neural model’s predictions by identifying important input features. They are difficult to interpret for laypeople, especially for instances with many features. In order to make them more accessible, we formalize the underexplored task of translating saliency maps into natural language and compare methods that address two key challenges of this approach – what and how to verbalize. In both automatic and human evaluation setups, using token-level attributions from text classification tasks, we compare two novel methods (search-based and instruction-based verbalizations) against conventional feature importance representations (heatmap visualizations and extractive rationales), measuring simulatability, faithfulness, helpfulness and ease of understanding. Instructing GPT-3.5 to generate saliency map verbalizations yields plausible explanations which include associations, abstractive summarization and commonsense reasoning, achieving by far the highest human ratings, but they are not faithfully capturing numeric information and are inconsistent in their interpretation of the task. In comparison, our search-based, model-free verbalization approach efficiently completes templated verbalizations, is faithful by design, but falls short in helpfulness and simulatability. Our results suggest that saliency map verbalization makes feature attribution explanations more comprehensible and less cognitively challenging to humans than conventional representations. + 2023.nlrse-1.4 + feldhus-etal-2023-saliency + + + Using Planning to Improve Semantic Parsing of Instructional Texts + VanyaCohenThe University of Texas at Austin + RaymondMooneyUniversity of Texas at Austin + 47-58 + We develop a symbolic planning-based decoder to improve the few-shot semantic parsing of instructional texts. The system takes long-form instructional texts as input and produces sequences of actions in a formal language that enable execution of the instructions. This task poses unique challenges since input texts may contain long context dependencies and ambiguous and domain-specific language. Valid semantic parses also require sequences of steps that constitute an executable plan. We build on recent progress in semantic parsing by leveraging large language models to learn parsers from small amounts of training data. During decoding, our method employs planning methods and domain information to rank and correct candidate parses. To validate our method, we evaluate on four domains: two household instruction-following domains and two cooking recipe interpretation domains. We present results for few-shot semantic parsing using leave-one-out cross-validation. We show that utilizing planning domain information improves the quality of generated plans. Through ablations we also explore the effects of our decoder design choices. 
+ 2023.nlrse-1.5 + cohen-mooney-2023-using + + + Reasoning Circuits: Few-shot Multi-hop Question Generation with Structured Rationales + SaurabhKulshreshthaUniversity of Massachusetts Lowell + AnnaRumshiskyUniversity of Massachusetts Lowell + 59-77 + Multi-hop Question Generation is the task of generating questions which require the reader to reason over and combine information spread across multiple passages employing several reasoning steps. Chain-of-thought rationale generation has been shown to improve performance on multi-step reasoning tasks and make model predictions more interpretable. However, few-shot performance gains from including rationales have been largely observed only in +100B language models, and otherwise require large-scale manual rationale annotation. In this paper, we introduce a new framework for applying chain-of-thought inspired structured rationale generation to multi-hop question generation under a very low supervision regime (8- to 128-shot). We propose to annotate a small number of examples following our proposed multi-step rationale schema, treating each reasoning step as a separate task to be performed by a generative language model. We show that our framework leads to improved control over the difficulty of the generated questions and better performance compared to baselines trained without rationales, both on automatic evaluation metrics and in human evaluation. Importantly, we show that this is achievable with a modest model size. + 2023.nlrse-1.6 + kulshreshtha-rumshisky-2023-reasoning + + + Knowledge-Augmented Language Model Prompting for Zero-Shot Knowledge Graph Question Answering + JinheonBaekKorea Advanced Institute of Science and Technology + Alham FikriAjiMBZUAI + AmirSaffariAmazon + 78-106 + Large Language Models (LLMs) are capable of performing zero-shot closed-book question answering tasks, based on their internal knowledge stored in parameters during pre-training. However, such internalized knowledge might be insufficient and incorrect, which could lead LLMs to generate factually wrong answers. Furthermore, fine-tuning LLMs to update their knowledge is expensive. To this end, we propose to augment the knowledge directly in the input of LLMs. Specifically, we first retrieve the relevant facts to the input question from the knowledge graph based on semantic similarities between the question and its associated facts. After that, we prepend the retrieved facts to the input question in the form of the prompt, which is then forwarded to LLMs to generate the answer. Our framework, Knowledge-Augmented language model PromptING (KAPING), requires no model training, thus completely zero-shot. We validate the performance of our KAPING framework on the knowledge graph question answering task, that aims to answer the user’s question based on facts over a knowledge graph, on which ours outperforms relevant zero-shot baselines by up to 48% in average, across multiple LLMs of various sizes. + 2023.nlrse-1.7 + baek-etal-2023-knowledge + + + Can In-context Learners Learn a Reasoning Concept from Demonstrations? 
+ MichalTefnikMasaryk University + MarekKadlcikFaculty of Informatics, Masaryk University + 107-115 + Large language models show an emergent ability to learn a new task from a small number of input-output demonstrations.However, recent work shows that in-context learners largely rely on their pre-trained knowledge, such as the sentiment of the labels, instead of finding new associations in the input.However, the commonly-used few-shot evaluation settings using a random selection of in-context demonstrations can not disentangle models’ ability to learn a new skill from demonstrations, as most of the randomly-selected demonstrations do not present relations informative for prediction beyond exposing the new task distribution.To disentangle models’ in-context learning ability independent of models’ memory, we introduce a Conceptual few-shot learning method selecting the demonstrations sharing a possibly-informative concept with the predicted sample. We extract a set of such concepts from annotated explanations and measure how much can models benefit from presenting these concepts in few-shot demonstrations.We find that smaller models are more sensitive to the presented concepts. While some of the models are able to benefit from concept-presenting demonstrations for each assessed concept, we find that none of the assessed in-context learners can benefit from all presented reasoning concepts consistently, leaving the in-context concept learning an open challenge. + 2023.nlrse-1.8 + tefnik-kadlcik-2023-context + + + Effect Graph: Effect Relation Extraction for Explanation Generation + JonathanKobbeUniversity of Mannheim + IoanaHulpuData and Web Science Group, University of Mannheim + HeinerStuckenschmidtUniversity of Mannheim + 116-127 + Argumentation is an important means of communication. For describing especially arguments about consequences, the notion of effect relations has been introduced recently. We propose a method to extract effect relations from large text resources and apply it on encyclopedic and argumentative texts. By connecting the extracted relations, we generate a knowledge graph which we call effect graph. For evaluating the effect graph, we perform crowd and expert annotations and create a novel dataset. We demonstrate a possible use case of the effect graph by proposing a method for explaining arguments from consequences. + 2023.nlrse-1.9 + kobbe-etal-2023-effect + + + <fixed-case>OPT</fixed-case>-<fixed-case>R</fixed-case>: Exploring the Role of Explanations in Finetuning and Prompting for Reasoning Skills of Large Language Models + BadrAlkhamissiMeta AI + SiddharthVermaSquare + PingYuUniversity at Buffalo + ZhijingJinMax Planck Institute & ETH Zurich + AsliCelikyilmazFAIR @ Meta + MonaDiabMeta Responsible AI + 128-138 + We conduct a thorough investigation into the reasoning capabilities of Large Language Models (LLMs), focusing specifically on the Open Pretrained Transformers (OPT) models as a representative of such models. Our study entails finetuning three different sizes of OPT on a carefully curated reasoning corpus, resulting in two sets of finetuned models: OPT-R, finetuned without explanations, and OPT-RE, finetuned with explanations. We then evaluate all models on 57 out-of-domain tasks drawn from the Super-NaturalInstructions benchmark, covering 26 distinct reasoning skills, utilizing three prompting techniques. 
Through a comprehensive grid of 27 configurations and 6,156 test evaluations, we investigate the dimensions of finetuning, prompting, and scale to understand the role of explanations on different reasoning skills. Our findings reveal that having explanations in the fewshot exemplar has no significant impact on the model’s performance when the model is finetuned, while positively affecting the non-finetuned counterpart. Moreover, we observe a slight yet consistent increase in classification accuracy as we incorporate explanations during prompting and finetuning, respectively. Finally, we offer insights on which reasoning skills benefit the most from incorporating explanations during finetuning and prompting, such as Numerical (+20.4%) and Analogical (+13.9%) reasoning, as well as skills that exhibit negligible or negative effects. + 2023.nlrse-1.10 + alkhamissi-etal-2023-opt + + + Deductive Additivity for Planning of Natural Language Proofs + ZayneSpragueUniversity of Texas at Austin + KajBostromUniversity of Texas at Austin + SwaratChaudhuriUT Austin + GregDurrettUT Austin + 139-156 + Current natural language systems designed for multi-step claim validation typically operate in two phases: retrieve a set of relevant premise statements using heuristics (planning), then generate novel conclusions from those statements using a large language model (deduction). The planning step often requires expensive Transformer operations and does not scale to arbitrary numbers of premise statements. In this paper, we investigate whether efficient planning heuristic is possible via embedding spaces compatible with deductive reasoning. Specifically, we evaluate whether embedding spaces exhibit a property we call deductive additivity: the sum of premise statement embeddings should be close to embeddings of conclusions based on those premises. We explore multiple sources of off-the-shelf dense embeddings in addition to fine-tuned embeddings from GPT3 and sparse embeddings from BM25. We study embedding models both intrinsically, evaluating whether the property of deductive additivity holds, and extrinsically, using them to assist planning in natural language proof generation. Lastly, we create a dataset, Single-Step Reasoning Contrast (SSRC), to further probe performance on various reasoning types. Our findings suggest that while standard embedding methods frequently embed conclusions near the sums of their premises, they fall short of being effective heuristics and lack the ability to model certain categories of reasoning. + 2023.nlrse-1.11 + sprague-etal-2023-deductive + + + Synthetic Dataset for Evaluating Complex Compositional Knowledge for Natural Language Inference + Sushma AnandAkojuUniversity of Arizona + RobertVacareanuUniversity of Arizona + EduardoBlancoUniversity of Arizona + HarisRiazUniversity of Arizona + MihaiSurdeanuUniversity of Arizona + 157-168 + We introduce a synthetic dataset called Sentences Involving Complex Compositional Knowledge (SICCK) and a novel analysis that investigates the performance of Natural Language Inference (NLI) models to understand compositionality in logic. We produce 1,304 sentence pairs by modifying 15 examples from the SICK dataset (Marelli et al., 2014). To this end, we modify the original texts using a set of phrases modifiers that correspond to universal quantifiers, existential quantifiers, negation, and other concept modifiers in Natural Logic (NL) (MacCartney, 2009). 
We use these phrases to modify the subject, verb, and object parts of the premise and hypothesis. Lastly, we annotate these modified texts with the corresponding entailment labels following NL rules. We conduct a preliminary verification of how well the change in the structural and semantic composition is captured by neural NLI models, in both zero-shot and fine-tuned scenarios. We found that the performance of NLI models under the zero-shot setting is poor, especially for modified sentences with negation and existential quantifiers. After fine-tuning this dataset, we observe that models continue to perform poorly over negation, existential and universal modifiers. + 2023.nlrse-1.12 + akoju-etal-2023-synthetic + +
+
diff --git a/data/xml/2023.repl4nlp.xml b/data/xml/2023.repl4nlp.xml new file mode 100644 index 0000000000..05fb0738ff --- /dev/null +++ b/data/xml/2023.repl4nlp.xml @@ -0,0 +1,307 @@ + + + + + Proceedings of the 8th Workshop on Representation Learning for NLP (RepL4NLP 2023) + BurcuCanUniversity of Stirling + MaximilianMozesUniversity College London + SamuelCahyawijayaHong Kong University of Science and Technology + NaomiSaphraNew York University + NoraKassnerMeta + ShauliRavfogelBar-Ilan University + AbhilashaRavichanderAllen Institute for Artificial Intelligence + ChenZhaoNew York University + IsabelleAugensteinUniversity of Copenhagen + AnnaRogersUniversity of Copenhagen + KyunghyunChoNew York University + EdwardGrefenstetteDeepMind + LenaVoitaMeta AI + Association for Computational Linguistics +
Toronto, Canada
+ July + 2023 + repl4nlp + + + 2023.repl4nlp-1.0 + repl4nlp-2023-representation + + + Adversarial Clean Label Backdoor Attacks and Defenses on Text Classification Systems + AshimGupta + AmrithKrishnaUniversity of Cambridge + 1-12 + Clean-label (CL) attack is a form of data poisoning attack where an adversary modifies only the textual input of the training data, without requiring access to the labeling function. CL attacks are relatively unexplored in NLP, as compared to label flipping (LF) attacks, where the latter additionally requires access to the labeling function as well. While CL attacks are more resilient to data sanitization and manual relabeling methods than LF attacks, they often demand as high as ten times the poisoning budget than LF attacks. In this work, we first introduce an Adversarial Clean Label attack which can adversarially perturb in-class training examples for poisoning the training set. We then show that an adversary can significantly bring down the data requirements for a CL attack, using the aforementioned approach, to as low as 20 % of the data otherwise required. We then systematically benchmark and analyze a number of defense methods, for both LF and CL attacks, some previously employed solely for LF attacks in the textual domain and others adapted from computer vision. We find that text-specific defenses greatly vary in their effectiveness depending on their properties. + 2023.repl4nlp-1.1 + gupta-krishna-2023-adversarial + + + Do not Mask Randomly: Effective Domain-adaptive Pre-training by Masking In-domain Keywords + ShahriarGolchinUniversity of Arizona + MihaiSurdeanuUniversity of Arizona + NazgolTavabiHarvard University + AtaKiapourHarvard University + 13-21 + We propose a novel task-agnostic in-domain pre-training method that sits between generic pre-training and fine-tuning. Our approach selectively masks in-domain keywords, i.e., words that provide a compact representation of the target domain. We identify such keywords using KeyBERT (Grootendorst, 2020). We evaluate our approach using six different settings: three datasets combined with two distinct pre-trained language models (PLMs). Our results reveal that the fine-tuned PLMs adapted using our in-domain pre-training strategy outperform PLMs that used in-domain pre-training with random masking as well as those that followed the common pre-train-then-fine-tune paradigm. Further, the overhead of identifying in-domain keywords is reasonable, e.g., 7-15% of the pre-training time (for two epochs) for BERT Large (Devlin et al., 2019). + 2023.repl4nlp-1.2 + golchin-etal-2023-mask + + + Grammatical information in <fixed-case>BERT</fixed-case> sentence embeddings as two-dimensional arrays + ViviNastaseUniversity of Geneva + PaolaMerloUppsala University and University of Geneva, Switzerland + 22-39 + Sentence embeddings induced with various transformer architectures encode much semantic and syntactic information in a distributed manner in a one-dimensional array. We investigate whether specific grammatical information can be accessed in these distributed representations. Using data from a task developed to test rule-like generalizations, our experiments on detecting subject-verb agreement yield several promising results. First, we show that while the usual sentence representations encoded as one-dimensional arrays do not easily support extraction of rule-like regularities, a two-dimensional reshaping of these vectors allows various learning architectures to access such information. 
Next, we show that various architectures can detect patterns in these two-dimensional reshaped sentence embeddings and successfully learn a model based on smaller amounts of simpler training data, which performs well on more complex test data. This indicates that current sentence embeddings contain information that is regularly distributed, and which can be captured when the embeddings are reshaped into higher dimensional arrays. Our results cast light on representations produced by language models and help move towards developing few-shot learning approaches. + 2023.repl4nlp-1.3 + nastase-merlo-2023-grammatical + + + A Multilingual Evaluation of <fixed-case>NER</fixed-case> Robustness to Adversarial Inputs + AkshaySrinivasan + SowmyaVajjalaNational Research Council Canada + 40-53 + Adversarial evaluations of language models typically focus on English alone. In this paper, we performed a multilingual evaluation of Named Entity Recognition (NER) in terms of its robustness to small perturbations in the input. Our results showed the NER models we explored across three languages (English, German and Hindi) are not very robust to such changes, as indicated by the fluctuations in the overall F1 score as well as in a more fine-grained evaluation. With that knowledge, we further explored whether it is possible to improve the existing NER models using a part of the generated adversarial data sets as augmented training data to train a new NER model or as fine-tuning data to adapt an existing NER model. Our results showed that both these approaches improve performance on the original as well as adversarial test sets. While there is no significant difference between the two approaches for English, re-training is significantly better than fine-tuning for German and Hindi. + 2023.repl4nlp-1.4 + srinivasan-vajjala-2023-multilingual + + + Retrieval-Augmented Domain Adaptation of Language Models + BenfengXu + ChunxuZhaoBeijing Language and Culture University + WenbinJiang + PengFeiZhuBaidu + SongtaiDaiBaidu + ChaoPangBaidu + ZhuoSunBaidu + ShuohuanWang + YuSun + 54-64 + Language models pretrained on general domain corpora usually exhibit considerable degradation when generalizing to downstream tasks of specialized domains. Existing approaches try to construct PLMs for each specific domains either from scratch or through further pretraining, which not only costs substantial resources, but also fails to cover all target domains at various granularity. In this work, we propose RADA, a novel Retrieval-Augmented framework for Domain Adaptation. We first construct a textual corpora that covers the downstream task at flexible domain granularity and resource availability. We employ it as a pluggable datastore to retrieve informative background knowledge, and integrate them into the standard language model framework to augment representations. We then propose a two-level selection scheme to integrate the most relevant information while alleviating irrelevant noises. Specifically, we introduce a differentiable sampling module as well as an attention mechanism to achieve both passage-level and word-level selection. Such a retrieval-augmented framework enables domain adaptation of language models with flexible domain coverage and fine-grained domain knowledge integration. We conduct comprehensive experiments across biomedical, science and legal domains to demonstrate the effectiveness of the overall framework, and its advantage over existing solutions. 
+ 2023.repl4nlp-1.5 + xu-etal-2023-retrieval + + + Fine-grained Text Style Transfer with Diffusion-Based Language Models + YiweiLyu + TiangeLuoUniversity of Michigan - Ann Arbor + JiachengShi + ToddHollonUniversity of Michigan + HonglakLeeLG AI Research and University of Michigan + 65-74 + Diffusion probabilistic models have shown great success in generating high-quality images controllably, and researchers have tried to utilize this controllability into text generation domain. Previous works on diffusion-based language models have shown that they can be trained without external knowledge (such as pre-trained weights) and still achieve stable performance and controllability. In this paper, we trained a diffusion-based model on StylePTB dataset, the standard benchmark for fine-grained text style transfers. The tasks in StylePTB requires much more refined control over the output text compared to tasks evaluated in previous works, and our model was able to achieve state-of-the-art performance on StylePTB on both individual and compositional transfers. Moreover, our model, trained on limited data from StylePTB without external knowledge, outperforms previous works that utilized pretrained weights, embeddings, and external grammar parsers, and this may indicate that diffusion-based language models have great potential under low-resource settings. + 2023.repl4nlp-1.6 + lyu-etal-2023-fine + + + Enhancing text comprehension for Question Answering with Contrastive Learning + SeungyeonLeeKyungpook National University + MinhoLeeKyungpook National University + 75-86 + Although Question Answering (QA) have advanced to the human-level language skills in NLP tasks, there is still a problem: the QA model gets confused when there are similar sentences or paragraphs. Existing studies focus on enhancing the text understanding of the candidate answers to improve the overall performance of the QA models. However, since these methods focus on re-ranking queries or candidate answers, they fail to resolve the confusion when many generated answers are similar to the expected answer. To address these issues, we propose a novel contrastive learning framework called ContrastiveQA that alleviates the confusion problem in answer extraction. We propose a supervised method where we generate positive and negative samples from the candidate answers and the given answer, respectively. We thus introduce ContrastiveQA, which uses contrastive learning with sampling data to reduce incorrect answers. Experimental results on four QA benchmarks show the effectiveness of the proposed method. + 2023.repl4nlp-1.7 + lee-lee-2023-enhancing + + + Towards Flow Graph Prediction of Open-Domain Procedural Texts + KeisukeShirai + HirotakaKamekoBaidu + ShinsukeMoriKyoto University + 87-96 + Machine comprehension of procedural texts is essential for reasoning about the steps and automating the procedures. However, this requires identifying entities within a text and resolving the relationships between the entities. Previous work focused on the cooking domain and proposed a framework to convert a recipe text into a flow graph (FG) representation. In this work, we propose a framework based on the recipe FG for flow graph prediction of open-domain procedural texts. To investigate flow graph prediction performance in non-cooking domains, we introduce the wikiHow-FG corpus from articles on wikiHow, a website of how-to instruction articles. 
In experiments, we consider using the existing recipe corpus and performing domain adaptation from the cooking to the target domain. Experimental results show that the domain adaptation models achieve higher performance than those trained only on the cooking or target domain data. + 2023.repl4nlp-1.8 + shirai-etal-2023-towards + + + One does not fit all! On the Complementarity of Vision Encoders for Vision and Language Tasks + GregorGeigleBayerische Julius-Maximilians-Universität Würzburg + ChenLiu + JonasPfeifferGoogle + IrynaGurevychTU Darmstadt + 97-117 + Current multimodal models, aimed at solving Vision and Language (V+L) tasks, predominantly repurpose Vision Encoders (VE) as feature extractors. While many VEs—of different architectures, trained on different data and objectives—are publicly available, they are not designed for the downstream V+L tasks. Nonetheless, most current work assumes that a single pre-trained VE can serve as a general-purpose encoder. In this work, we focus on analysis and aim to understand whether the information stored within different VEs is complementary, i.e. if providing the model with features from multiple VEs can improve the performance on a target task, and how they are combined. We exhaustively experiment with three popular VEs on six downstream V+L tasks and analyze the attention and VE-dropout patterns. Our analyses suggest that diverse VEs complement each other, resulting in improved downstream V+L task performance, where the improvements are not due to simple ensemble effects (i.e. the performance does not always improve when increasing the number of encoders). We demonstrate that future VEs, which are not repurposed, but explicitly designed for V+L tasks, have the potential of improving performance on the target V+L tasks. + 2023.repl4nlp-1.9 + geigle-etal-2023-one + + + <fixed-case>SPC</fixed-case>: Soft Prompt Construction for Cross Domain Generalization + WenboZhaoAmazon + ArpitGuptaAmazon + TagyoungChungAmazon + JingHuangAmazon Alexa AI + 118-130 + Recent advances in prompt tuning have proven effective as a new language modeling paradigm for various natural language understanding tasks. However, it is challenging to adapt the soft prompt embeddings to different domains or generalize to low-data settings when learning soft prompts itself is unstable, task-specific, and bias-prone. This paper proposes a principled learning framework—soft prompt construction (SPC)—to facilitate learning domain-adaptable soft prompts. Derived from the SPC framework is a simple loss that can plug into various models and tuning approaches to improve their cross-domain performance. We show SPC can improve upon SOTA for contextual query rewriting, summarization, and paraphrase detection by up to 5%, 19%, and 16%, respectively. + 2023.repl4nlp-1.10 + zhao-etal-2023-spc + + + Friendly Neighbors: Contextualized Sequence-to-Sequence Link Prediction + AdrianKochsiekUniversität Mannheim + ApoorvSaxenaIndian Institute of Science, Bangalore + InderjeetNairAdobe Systems + RainerGemullaUniversität Mannheim, Germany + 131-138 + We propose KGT5-context, a simple sequence-to-sequence model for link prediction (LP) in knowledge graphs (KG). Our work expands on KGT5, a recent LP model that exploits textual features of the KG, has small model size, and is scalable. To reach good predictive performance, however, KGT5 relies on an ensemble with a knowledge graph embedding model, which itself is excessively large and costly to use. 
In this short paper, we show empirically that adding contextual information — i.e., information about the direct neighborhood of the query entity — alleviates the need for a separate KGE model to obtain good performance. The resulting KGT5-context model is simple, reduces model size significantly, and obtains state-of-the-art performance in our experimental study. + 2023.repl4nlp-1.11 + kochsiek-etal-2023-friendly + + + Extracting Multi-valued Relations from Language Models + SnehaSinghaniaSaarland Informatics Campus, Max-Planck Institute for Informatics + SimonRazniewskiSaarland Informatics Campus, Max-Planck Institute + GerhardWeikumMax Planck Institute and Max-Planck Institute for Informatics + 139-154 + The widespread usage of latent language representations via pre-trained language models (LMs) suggests that they are a promising source of structured knowledge. However, existing methods focus only on a single object per subject-relation pair, even though often multiple objects are correct. To overcome this limitation, we analyze these representations for their potential to yield materialized multi-object relational knowledge. We formulate the problem as a rank-then-select task. For ranking candidate objects, we evaluate existing prompting techniques and propose new ones incorporating domain knowledge. Among the selection methods, we find that choosing objects with a likelihood above a learned relation-specific threshold gives a 49.5% F1 score. Our results highlight the difficulty of employing LMs for the multi-valued slot-filling task, and pave the way for further research on extracting relational knowledge from latent language representations. + 2023.repl4nlp-1.12 + singhania-etal-2023-extracting + + + Hierarchical Multi-Instance Multi-Label Learning for Detecting Propaganda Techniques + AnniChen + BhuwanDhingra + 155-163 + Since the introduction of the SemEval 2020 Task 11 (CITATION), several approaches have been proposed in the literature for classifying propaganda based on the rhetorical techniques used to influence readers. These methods, however, classify one span at a time, ignoring dependencies from the labels of other spans within the same context. In this paper, we approach propaganda technique classification as a Multi-Instance Multi-Label (MIML) learning problem (CITATION) and propose a simple RoBERTa-based model (CITATION) for classifying all spans in an article simultaneously. Further, we note that, due to the annotation process where annotators classified the spans by following a decision tree, there is an inherent hierarchical relationship among the different techniques, which existing approaches ignore. We incorporate these hierarchical label dependencies by adding an auxiliary classifier for each node in the decision tree to the training objective and ensembling the predictions from the original and auxiliary classifiers at test time. Overall, our model leads to an absolute improvement of 2.47% micro-F1 over the model from the shared task winning team in a cross-validation setup and is the best performing non-ensemble model on the shared task leaderboard.
+ 2023.repl4nlp-1.13 + chen-dhingra-2023-hierarchical + + + Contrastive Loss is All You Need to Recover Analogies as Parallel Lines + NarutatsuRi + Fei-TzinLeeColumbia University + NakulVermaColumbia University + 164-173 + While static word embedding models are known to represent linguistic analogies as parallel lines in high-dimensional space, the underlying mechanism as to why they result in such geometric structures remains obscure. We find that an elementary contrastive-style method employed over distributional information performs competitively with popular word embedding models on analogy recovery tasks, while achieving dramatic speedups in training time. Further, we demonstrate that a contrastive loss is sufficient to create these parallel structures in word embeddings, and establish a precise relationship between the co-occurrence statistics and the geometric structure of the resulting word embeddings. + 2023.repl4nlp-1.14 + ri-etal-2023-contrastive + + + Syntax-Aware Graph-to-Graph Transformer for Semantic Role Labelling + AlirezaMohammadshahi + JamesHendersonIdiap Research Institute + 174-186 + Recent models have shown that incorporating syntactic knowledge into the semantic role labelling (SRL) task leads to a significant improvement. In this paper, we propose Syntax-aware Graph-to-Graph Transformer (SynG2G-Tr) model, which encodes the syntactic structure using a novel way to input graph relations as embeddings, directly into the self-attention mechanism of Transformer. This approach adds a soft bias towards attention patterns that follow the syntactic structure but also allows the model to use this information to learn alternative patterns. We evaluate our model on both span-based and dependency-based SRL datasets, and outperform previous alternative methods in both in-domain and out-of-domain settings, on CoNLL 2005 and CoNLL 2009 datasets. + 2023.repl4nlp-1.15 + mohammadshahi-henderson-2023-syntax + + + Improving Zero-shot Relation Classification via Automatically-acquired Entailment Templates + MahdiRahimiComputer Science Department, University of Arizona + MihaiSurdeanuUniversity of Arizona + 187-195 + While fully supervised relation classification (RC) models perform well on large-scale datasets, their performance drops drastically in low-resource settings. As generating annotated examples are expensive, recent zero-shot methods have been proposed that reformulate RC into other NLP tasks for which supervision exists such as textual entailment. However, these methods rely on templates that are manually created which is costly and requires domain expertise. In this paper, we present a novel strategy for template generation for relation classification, which is based on adapting Harris’ distributional similarity principle to templates encoded using contextualized representations. Further, we perform empirical evaluation of different strategies for combining the automatically acquired templates with manual templates. The experimental results on TACRED show that our approach not only performs better than the zero-shot RC methods that only use manual templates, but also that it achieves state-of-the-art performance for zero-shot TACRED at 64.3 F1 score. 
+ 2023.repl4nlp-1.16 + rahimi-surdeanu-2023-improving + + + <fixed-case>MUX</fixed-case>-<fixed-case>PLM</fixed-case>s: Pre-training Language Models with Data Multiplexing + VishvakMurahariPrinceton University + AmeetDeshpande + CarlosJimenez + IzhakShafranGoogle + MingqiuWang + YuanCaoGoogle Brain + KarthikNarasimhanPrinceton University + 196-211 + The widespread adoption of large language models such as ChatGPT and Bard has led to unprecedented demand for these technologies. The burgeoning cost of inference for ever-increasing model sizes coupled with hardware shortages has limited affordable access and poses a pressing need for efficiency approaches geared towards high throughput and performance. Multi-input multi-output (MIMO) algorithms such as data multiplexing, offer a promising solution with a many-fold increase in throughput by performing inference for multiple inputs at the cost of a single input. Yet these approaches are not currently performant enough to be deployed in modern systems. We change that by developing MUX-PLMs, a class of high throughput pre-trained language models (PLMs) trained with data multiplexing, that can be fine-tuned for any downstream task to yield high-throughput high-performance. Our novel multiplexing and demultiplexing modules proficiently entangle and disentangle inputs, and enable high-performance high throughput that are competitive with vanilla PLMs while achieving 2x/5x inference speedup with only a 1−4% drop on a broad suite of tasks. + 2023.repl4nlp-1.17 + murahari-etal-2023-mux + + + Mixed Orthographic/Phonemic Language Modeling: Beyond Orthographically Restricted Transformers (<fixed-case>BORT</fixed-case>) + RobertGaleOregon Health Sciences University + AlexandraSalemOregon Health Sciences University + GerasimosFergadiotisPortland State University + StevenBedrickOregon Health & Science University + 212-225 + Speech language pathologists rely on information spanning the layers of language, often drawing from multiple layers (e.g. phonology & semantics) at once. Recent innovations in large language models (LLMs) have been shown to build powerful representations for many complex language structures, especially syntax and semantics, unlocking the potential of large datasets through self-supervised learning techniques. However, these datasets are overwhelmingly orthographic, favoring writing systems like the English alphabet, a natural but phonetically imprecise choice. Meanwhile, LLM support for the international phonetic alphabet (IPA) ranges from poor to absent. Further, LLMs encode text at a word- or near-word level, and pre-training tasks have little to gain from phonetic/phonemic representations. In this paper, we introduce BORT, an LLM for mixed orthography/IPA meant to overcome these limitations. To this end, we extend the pre-training of an existing LLM with our own self-supervised pronunciation tasks. We then fine-tune for a clinical task that requires simultaneous phonological and semantic analysis. For an “easy” and “hard” version of these tasks, we show that fine-tuning from our models is more accurate by a relative 24% and 29%, and improved on character error rates by a relative 75% and 31%, respectively, than those starting from the original model. 
+ 2023.repl4nlp-1.18 + gale-etal-2023-mixed + + + Effectiveness of Data Augmentation for Parameter Efficient Tuning with Limited Data + StephenObadinmaQueen’s University + HongyuGuo + XiaodanZhuQueen’s University + 226-237 + Recent work has demonstrated that using parameter efficient tuning techniques such as prefix tuning (or P-tuning) on pretrained language models can yield performance that is comparable or superior to fine-tuning while dramatically reducing trainable parameters. Nevertheless, the effectiveness of such methods under the context of data augmentation, a common strategy to improve learning under low data regimes, has not been fully explored. In this paper, we examine the effectiveness of several popular task-agnostic data augmentation techniques, i.e., EDA, Back Translation, and Mixup, when using two general parameter efficient tuning methods, P-tuning v2 and LoRA, under data scarcity. We show that data augmentation can be used to boost the performance of P-tuning and LoRA models, but the effectiveness of each technique varies and certain methods can lead to a notable degradation in performance, particularly when using larger models and on harder tasks. We further analyze the sentence representations of P-tuning compared to fine-tuning to help understand the above behaviour, and reveal how P-tuning generally presents a more limited ability to separate the sentence embeddings from different classes of augmented data. In addition, it displays poorer performance on heavily altered data. However, we demonstrate that by adding a simple contrastive loss function it can help mitigate such issues for prefix tuning, resulting in sizable improvements to augmented data performance. + 2023.repl4nlp-1.19 + obadinma-etal-2023-effectiveness + + + Relational Sentence Embedding for Flexible Semantic Matching + BinWangNational University of Singapore, Singapore + HaizhouLiNational University of Singapore, Singapore and School of Data Science, The Chinese University of Hong Kong, Shenzhen, China and Shenzhen Research Institute of Big Data + 238-252 + 2023.repl4nlp-1.20 + wang-li-2023-relational + + + Tucker Decomposition with Frequency Attention for Temporal Knowledge Graph Completion + LikangXiaoSKLSDE, School of Computer Science and Engineering, Beihang University, Beijing, China and Shen Yuan Honors College, Beihang University, Beijing, China + RichongZhangSKLSDE, School of Computer Science and Engineering, Beihang University, Beijing, China + ZijieChenSchool of Electrical and Computer Engineering, University of Toronto, Toronto, Canada + JunfanChenSKLSDE, School of Computer Science and Engineering, Beihang University, Beijing, China + 253-265 + 2023.repl4nlp-1.21 + xiao-etal-2023-tucker-decomposition + + + <fixed-case>CLIP</fixed-case>-based image captioning via unsupervised cycle-consistency in the latent space + RomainBielawskiANITI, Université de Toulouse, France + RufinVanRullenCerCo, CNRS UMR5549, Toulouse + 266-275 + 2023.repl4nlp-1.22 + bielawski-vanrullen-2023-clip + + + Token-level Fitting Issues of Seq2seq Models + GuangshengBaoZhejiang University and School of Engineering, Westlake University + ZhiyangTengNanyang Technological University + YueZhangSchool of Engineering, Westlake University and Institute of Advanced Technology, Westlake Institute for Advanced Study + 276-288 + 2023.repl4nlp-1.23 + bao-etal-2023-token + + + Revealing the Blind Spot of Sentence Encoder Evaluation by <fixed-case>HEROS</fixed-case> + Cheng-HanChiangNational Taiwan University† + Hung-yiLeeNational 
Taiwan University† + Yung-SungChuangMassachusetts Institute of Technology + JamesGlassMassachusetts Institute of Technology + 289-302 + 2023.repl4nlp-1.24 + chiang-etal-2023-revealing + + + One-Shot Exemplification Modeling via Latent Sense Representations + JohnHarvillUniversity of Illinois Urbana-Champaign + MarkHasegawa-JohnsonUniversity of Illinois Urbana-Champaign + Hee SukYoonKorea Advanced Institute of Science and Technology + Chang D.YooKorea Advanced Institute of Science and Technology + EunseopYoonKorea Advanced Institute of Science and Technology + 303-314 + 2023.repl4nlp-1.25 + harvill-etal-2023-one + + + <fixed-case>S</fixed-case>en2<fixed-case>P</fixed-case>ro: A Probabilistic Perspective to Sentence Embedding from Pre-trained Language Model + LingfengShenTencent AI Lab + HaiyunJiangTencent AI Lab + LemaoLiuTencent AI Lab + ShumingShiTencent AI Lab + 315-333 + 2023.repl4nlp-1.26 + shen-etal-2023-sen2pro + + + Visual Coherence Loss for Coherent and Visually Grounded Story Generation + XudongHongMPI Informatics and Saarland University and Saarland Informatics Campus + VeraDembergSaarland University and Saarland Informatics Campus + AsadSayeedUniversity of Gothenburg + QiankunZhengSaarland University and Saarland Informatics Campus + BerntSchieleMPI Informatics and Saarland Informatics Campus + 334-346 + 2023.repl4nlp-1.27 + hong-etal-2023-visual-coherence + +
+
diff --git a/data/xml/2023.semeval.xml b/data/xml/2023.semeval.xml index ddb13649de..2993e04b5a 100644 --- a/data/xml/2023.semeval.xml +++ b/data/xml/2023.semeval.xml @@ -15,6 +15,10 @@ 2023 semeval + + 2023.semeval-1.0 + semeval-2023-international + <fixed-case>K</fixed-case>now<fixed-case>C</fixed-case>omp at <fixed-case>S</fixed-case>em<fixed-case>E</fixed-case>val-2023 Task 7: Fine-tuning Pre-trained Language Models for Clinical Trial Entailment Identification WeiqiWangHong Kong University of Science and Technology diff --git a/data/xml/2023.sicon.xml b/data/xml/2023.sicon.xml new file mode 100644 index 0000000000..f3dd03b204 --- /dev/null +++ b/data/xml/2023.sicon.xml @@ -0,0 +1,101 @@ + + + + + Proceedings of the First Workshop on Social Influence in Conversations (SICon 2023) + KushalChawla + WeiyanShi + Association for Computational Linguistics +
Toronto, Canada
+ July + 2023 + 2023.sicon-1 + sicon + + + 2023.sicon-1.0 + sicon-2023-social + + + Eliciting Rich Positive Emotions in Dialogue Generation + ZiweiGongColumbia University + QingkaiMin + YueZhangWestlake University + 1-8 + Positive emotion elicitation aims at evoking positive emotion states in human users in open-domain dialogue generation. However, most work focuses on inducing a single-dimension of positive sentiment using human annotated datasets, which limits the scale of the training dataset. In this paper, we propose to model various emotions in large unannotated conversations, such as joy, trust and anticipation, by leveraging a latent variable to control the emotional intention of the response. Our proposed emotion-eliciting-Conditional-Variational-AutoEncoder (EE-CVAE) model generates more diverse and emotionally-intelligent responses compared to single-dimension baseline models in human evaluation. + 2023.sicon-1.1 + gong-etal-2023-eliciting + + + Detoxifying Online Discourse: A Guided Response Generation Approach for Reducing Toxicity in User-Generated Text + RitwikBoseKnox College + IanPereraThe Institute for Human & Machine Cognition + BonnieDorrUniversity of Florida + 9-14 + The expression of opinions, stances, and moral foundations on social media often coincide with toxic, divisive, or inflammatory language that can make constructive discourse across communities difficult. Natural language generation methods could provide a means to reframe or reword such expressions in a way that fosters more civil discourse, yet current Large Language Model (LLM) methods tend towards language that is too generic or formal to seem authentic for social media discussions. We present preliminary work on training LLMs to maintain authenticity while presenting a community’s ideas and values in a constructive, non-toxic manner. + 2023.sicon-1.2 + bose-etal-2023-detoxifying + + + Large Language Models respond to Influence like Humans + LewisGriffinUniversity College London, University of London + BennettKleinbergTilburg University + MaximilianMozes + KimberlyMaiUniversity College London, University of London + Maria Do MarVau + MatthewCaldwellNA + AugustineMavor-Parker + 15-24 + Two studies tested the hypothesis that a Large Language Model (LLM) can be used to model psychological change following exposure to influential input. The first study tested a generic mode of influence - the Illusory Truth Effect (ITE) - where earlier exposure to a statement boosts a later truthfulness test rating. Analysis of newly collected data from human and LLM-simulated subjects (1000 of each) showed the same pattern of effects in both populations; although with greater per statement variability for the LLM. The second study concerns a specific mode of influence – populist framing of news to increase its persuasion and political mobilization. Newly collected data from simulated subjects was compared to previously published data from a 15 country experiment on 7286 human participants. Several effects from the human study were replicated by the simulated study, including ones that surprised the authors of the human study by contradicting their theoretical expectations; but some significant relationships found in human data were not present in the LLM data. Together the two studies support the view that LLMs have potential to act as models of the effect of influence. + 2023.sicon-1.3 + griffin-etal-2023-large + + + What Makes a Good Counter-Stereotype? 
Evaluating Strategies for Automated Responses to Stereotypical Text + KathleenFraserNational Research Council Canada + SvetlanaKiritchenkoNational Research Council Canada + IsarNejadgholi + AnnaKerkhof + 25-38 + When harmful social stereotypes are expressed on a public platform, they must be addressed in a way that educates and informs both the original poster and other readers, without causing offence or perpetuating new stereotypes. In this paper, we synthesize findings from psychology and computer science to propose a set of potential counter-stereotype strategies. We then automatically generate such counter-stereotypes using ChatGPT, and analyze their correctness and expected effectiveness at reducing stereotypical associations. We identify the strategies of denouncing stereotypes, warning of consequences, and using an empathetic tone as three promising strategies to be further tested. + 2023.sicon-1.4 + fraser-etal-2023-makes + + + <fixed-case>BC</fixed-case>ause: Reducing group bias and promoting cohesive discussion in online deliberation processes through a simple and engaging online deliberation tool + LucasAnastasiou + AnnaDe LibboNA + 39-49 + Facilitating healthy online deliberation in terms of sensemaking and collaboration of discussion participants proves extremely challenging due to a number of known negative effects of online communication on social media platforms. We start from concerns and aspirations about the use of existing online discussion systems as distilled in previous literature, we then combine them with lessons learned on design and engineering practices from our research team, to inform the design of an easy-to-use tool (BCause.app) that enables higher quality discussions than traditional social media. We describe the design of this tool, highlighting the main interaction features that distinguish it from common social media, namely: i. the low-cost argumentation structuring of the conversations with direct replies; ii. and the distinctive use of reflective feedback rather than appreciative-only feedback. We then present the results of a controlled A/B experiment in which we show that the presence of argumentative and cognitive reflective discussion elements produces better social interaction with less polarization and promotes a more cohesive discussion than common social media-like interactions. + 2023.sicon-1.5 + anastasiou-de-libbo-2023-bcause + + + Measuring Lexico-Semantic Alignment in Debates with Contextualized Word Representations + AinaGarí SolerTélécom-Paris + MatthieuLabeauTélécom ParisTech + ChloéClavelTélécom ParisTech and Télécom Paris + 50-63 + Dialog participants sometimes align their linguistic styles, e.g., they use the same words and syntactic constructions as their interlocutors. We propose to investigate the notion of lexico-semantic alignment: to what extent do speakers convey the same meaning when they use the same words? We design measures of lexico-semantic alignment relying on contextualized word representations. We show that they reflect interesting semantic differences between the two sides of a debate and that they can assist in the task of debate’s winner prediction. 
+ 2023.sicon-1.6 + gari-soler-etal-2023-measuring + + + Exploring Linguistic Style Matching in Online Communities: The Role of Social Context and Conversation Dynamics + AparnaAnanthasubramaniam + HongChen + JasonYanUniversity of Michigan - Ann Arbor + KenanAlkiek + JiaxinPeiUniversity of Michigan + AgrimaSethUniversity of Michigan + LaviniaDunagan + MinjeChoiUniversity of Michigan + BenjaminLittererNA + DavidJurgensUniversity of Michigan + 64-74 + Linguistic style matching (LSM) in conversations can be reflective of several aspects of social influence such as power or persuasion. However, how LSM relates to the outcomes of online communication on platforms such as Reddit is an unknown question. In this study, we analyze a large corpus of two-party conversation threads in Reddit where we identify all occurrences of LSM using two types of style: the use of function words and formality. Using this framework, we examine how levels of LSM differ in conversations depending on several social factors within Reddit: post and subreddit features, conversation depth, user tenure, and the controversiality of a comment. Finally, we measure the change of LSM following loss of status after community banning. Our findings reveal the interplay of LSM in Reddit conversations with several community metrics, suggesting the importance of understanding conversation engagement when understanding community dynamics. + 2023.sicon-1.7 + ananthasubramaniam-etal-2023-exploring + +
+
diff --git a/data/xml/2023.sigmorphon.xml b/data/xml/2023.sigmorphon.xml new file mode 100644 index 0000000000..c3ceb28079 --- /dev/null +++ b/data/xml/2023.sigmorphon.xml @@ -0,0 +1,306 @@ + + + + + Proceedings of the 20th SIGMORPHON workshop on Computational Research in Phonetics, Phonology, and Morphology + GarrettNicolai + EleanorChodroff + FredericMailhot + ÇağrıÇöltekin + Association for Computational Linguistics +
Toronto, Canada
+ July + 2023 + 2023.sigmorphon-1 + sigmorphon + + + 2023.sigmorphon-1.0 + sigmorphon-2023-sigmorphon + + + Translating a low-resource language using <fixed-case>GPT</fixed-case>-3 and a human-readable dictionary + MichaElsnerThe Ohio State University + JordanNeedleThe Ohio State University + 1-13 + We investigate how well words in the polysynthetic language Inuktitut can be translated by combining dictionary definitions, without use of a neural machine translation model trained on parallel text. Such a translation system would allow natural language technology to benefit from resources designed for community use in a language revitalization or education program, rather than requiring a separate parallel corpus. We show that the text-to-text generation capabilities of GPT-3 allow it to perform this task with BLEU scores of up to 18.5. We investigate prompting GPT-3 to provide multiple translations, which can help slightly, and providing it with grammar information, which is mostly ineffective. Finally, we test GPT-3’s ability to derive morpheme definitions from whole-word translations, but find this process is prone to errors including hallucinations. + 2023.sigmorphon-1.2 + elsner-needle-2023-translating + + + Evaluating Cross Lingual Transfer for Morphological Analysis: a Case Study of <fixed-case>I</fixed-case>ndian Languages + SiddheshPawarGoogle + PushpakBhattacharyyaIndian Institute of Technology Bombay and Patna + ParthaTalukdarGoogle Research and IISc + 14-26 + Recent advances in pretrained multilingual models such as Multilingual T5 (mT5) have facilitated cross-lingual transfer by learning shared representations across languages. Leveraging pretrained multilingual models for scaling morphology analyzers to low-resource languages is a unique opportunity that has been under-explored so far. We investigate this line of research in the context of Indian languages, focusing on two important morphological sub-tasks: root word extraction and tagging morphosyntactic descriptions (MSD), viz., gender, number, and person (GNP). We experiment with six Indian languages from two language families (Dravidian and Indo-Aryan) to train a multilingual morphology analyzers for the first time for Indian languages. We demonstrate the usability of multilingual models for few-shot cross-lingual transfer through an average 7% increase in GNP tagging in a cross-lingual setting as compared to a monolingual setting through controlled experiments. We provide an overview of the state of the datasets available related to our tasks and point-out a few modeling limitations due to datasets. Lastly, we analyze the cross-lingual transfer of morphological tags for verbs and nouns, which provides a proxy for the quality of representations of word markings learned by the model. + 2023.sigmorphon-1.3 + pawar-etal-2023-evaluating + + + Joint Learning Model for Low-Resource Agglutinative Language Morphological Tagging + GulinigeerAbudouwailiSchool of Information Science and Engineering Xinjiang University + KahaerjiangAbiderexitiSchool of Information Science and Engineering, Xinjiang University + NianYiSchool of Information Science and Engineering Xinjiang University + AishanWumaierSchool of Science and Engineering, Xinjiang University; Xinjiang Provincial Key Laboratory of Multi-lingual Information Technology + 27-37 + Due to the lack of data resources, rule-based or transfer learning is mainly used in the morphological tagging of low-resource languages. 
However, these methods require expert knowledge, ignore contextual features, and have error propagation. Therefore, we propose a joint morphological tagger for low-resource agglutinative languages to alleviate the above challenges. First, we represent the contextual input with multi-dimensional features of agglutinative words. Second, joint training reduces the direct impact of part-of-speech errors on morphological features and increases the indirect influence between the two types of labels through a fusion mechanism. Finally, our model separately predicts part-of-speech and morphological features. Part-of-speech tagging is regarded as sequence tagging. When predicting morphological features, two-label adjacency graphs are dynamically reconstructed by integrating multilingual global features and monolingual local features. Then, a graph convolution network is used to learn the higher-order intersection of labels. A series of experiments show that the proposed model in this paper is superior to other comparative models. + 2023.sigmorphon-1.4 + abudouwaili-etal-2023-joint + + + Revisiting and Amending <fixed-case>C</fixed-case>entral <fixed-case>K</fixed-case>urdish Data on <fixed-case>U</fixed-case>ni<fixed-case>M</fixed-case>orph 4.0 + SinaAhmadiGeorge Mason University + AsoMahmudiIndependent + 38-48 + UniMorph–the Universal Morphology project is a collaborative initiative to create and maintain morphological data and organize numerous related tasks for various language processing communities. The morphological data is provided by linguists for over 160 languages in the latest version of UniMorph 4.0. This paper sheds light on the Central Kurdish data on UniMorph 4.0 by analyzing the existing data, its fallacies, and systematic morphological errors. It also presents an approach to creating more reliable morphological data by considering various specific phenomena in Central Kurdish that have not been addressed previously, such as Izafe and several enclitics. + 2023.sigmorphon-1.5 + ahmadi-mahmudi-2023-revisiting + + + Investigating Phoneme Similarity with Artificially Accented Speech + MargotMassonUniversity College Dublin + JulieCarson-berndsenUniversity College Dublin + 49-57 + While the deep learning revolution has led to significant performance improvements in speech recognition, accented speech remains a challenge. Current approaches to this challenge typically do not seek to understand and provide explanations for the variations of accented speech, whether they stem from native regional variation or non-native error patterns. This paper seeks to address non-native speaker variations from both a knowledge-based and a data-driven perspective. We propose to approximate non-native accented-speech pronunciation patterns by the means of two approaches: based on phonetic and phonological knowledge on the one hand and inferred from a text-to-speech system on the other. Artificial speech is then generated with a range of variants which have been captured in confusion matrices representing phoneme similarities. We then show that non-native accent confusions actually propagate to the transcription from the ASR, thus suggesting that the inference of accent specific phoneme confusions is achievable from artificial speech. 
+ 2023.sigmorphon-1.6 + masson-carson-berndsen-2023-investigating + + + Generalized Glossing Guidelines: An Explicit, Human- and Machine-Readable, Item-and-Process Convention for Morphological Annotation + David R.MortensenLanguage Technologies Institute, Carnegie Mellon University + ElaGulsenCarnegie Mellon University + TaiqiHeCarnegie Mellon University + NathanielRobinsonCarnegie Mellon University + JonathanAmithGettysburg College + LindiaTjuatjaCarnegie Mellon University + LoriLevinCarnegie Mellon University + 58-67 + Interlinear glossing provides a vital type of morphosyntactic annotation, both for linguists and language revitalists, and numerous conventions exist for representing it formally and computationally. Some of these formats are human readable; others are machine readable. Some are easy to edit with general-purpose tools. Few represent non-concatenative processes like infixation, reduplication, mutation, truncation, and tonal overwriting in a consistent and formally rigorous way (on par with affixation). We propose an annotation convention—Generalized Glossing Guidelines (GGG) that combines all of these positive properties using an Item-and-Process (IP) framework. We describe the format, demonstrate its linguistic adequacy, and compare it with two other interlinear glossed text annotation schemes. + 2023.sigmorphon-1.7 + mortensen-etal-2023-generalized + + + Jambu: A historical linguistic database for <fixed-case>S</fixed-case>outh <fixed-case>A</fixed-case>sian languages + AryamanAroraGeorgetown University + AdamFarrisStanford University + SamopriyaBasuSimon Fraser University + SureshKolichalaMicrosoft + 68-77 + We introduce JAMBU, a cognate database of South Asian languages which unifies dozens of previous sources in a structured and accessible format. The database includes nearly 287k lemmata from 602 lects, grouped together in 23k sets of cognates. We outline the data wrangling necessary to compile the dataset and train neural models for reflex prediction on the Indo-Aryan subset of the data. We hope that JAMBU is an invaluable resource for all historical linguists and Indologists, and look towards further improvement and expansion of the database. + 2023.sigmorphon-1.8 + arora-etal-2023-jambu + + + Lightweight morpheme labeling in context: Using structured linguistic representations to support linguistic analysis for the language documentation context + BhargavShandilyaUniversity of Colorado Boulder + AlexisPalmerUniversity of Colorado Boulder + 78-92 + Linguistic analysis is a core task in the process of documenting, analyzing, and describing endangered and less-studied languages. In addition to providing insight into the properties of the language being studied, having tools to automatically label words in a language for grammatical category and morphological features can support a range of applications useful for language pedagogy and revitalization. At the same time, most modern NLP methods for these tasks require both large amounts of data in the language and compute costs well beyond the capacity of most research groups and language communities. In this paper, we present a gloss-to-gloss (g2g) model for linguistic analysis (specifically, morphological analysis and part-of-speech tagging) that is lightweight in terms of both data requirements and computational expense.
The model is designed for the interlinear glossed text (IGT) format, in which we expect the source text of a sentence in a low-resource language, a translation of that sentence into a language of wider communication, and a detailed glossing of the morphological properties of each word in the sentence. We first produce silver standard parallel glossed data by automatically labeling the high-resource translation. The model then learns to transform source language morphological labels into output labels for the target language, mediated by a structured linguistic representation layer. We test the model on both low-resource and high-resource languages, and find that our simple CNN-based model achieves comparable performance to a state-of-the-art transformer-based model, at a fraction of the computational cost. + 2023.sigmorphon-1.9 + shandilya-palmer-2023-lightweight + + + Improving Automated Prediction of <fixed-case>E</fixed-case>nglish Lexical Blends Through the Use of Observable Linguistic Features + JaremSaundersUniversity of North Carolina at Chapel Hill + 93-97 + The process of lexical blending is difficult to reliably predict. This difficulty has been shown by machine learning approaches in blend modeling, including attempts using then state-of-the-art LSTM deep neural networks trained on character embeddings, which were able to predict lexical blends given the ordered constituent words in less than half of cases, at maximum. This project introduces a novel model architecture which dramatically increases the correct prediction rates for lexical blends, using only Polynomial regression and Random Forest models. This is achieved by generating multiple possible blend candidates for each input word pairing and evaluating them based on observable linguistic features. The success of this model architecture illustrates the potential usefulness of observable linguistic features for problems that elude more advanced models which utilize only features discovered in the latent space. + 2023.sigmorphon-1.10 + saunders-2023-improving + + + Colexifications for Bootstrapping Cross-lingual Datasets: The Case of Phonology, Concreteness, and Affectiveness + YiyiChenAalborg University + JohannesBjervaDepartment of Computer Science, Aalborg University + 98-109 + Colexification refers to the linguistic phenomenon where a single lexical form is used to convey multiple meanings. By studying cross-lingual colexifications, researchers have gained valuable insights into fields such as psycholinguistics and cognitive sciences (Jackson et al., 2019; Xu et al., 2020; Karjus et al., 2021; Schapper and Koptjevskaja-Tamm, 2022; François, 2022). While several multilingual colexification datasets exist, there is untapped potential in using this information to bootstrap datasets across such semantic features. In this paper, we aim to demonstrate how colexifications can be leveraged to create such cross-lingual datasets. We showcase curation procedures which result in a dataset covering 142 languages across 21 language families across the world. The dataset includes ratings of concreteness and affectiveness, mapped with phonemes and phonological features. We further analyze the dataset along different dimensions to demonstrate potential of the proposed procedures in facilitating further interdisciplinary research in psychology, cognitive science, and multilingual natural language processing (NLP).
Based on initial investigations, we observe that i) colexifications that are closer in concreteness/affectiveness are more likely to colexify ; ii) certain initial/last phonemes are significantly correlated with concreteness/affectiveness intra language families, such as /k/ as the initial phoneme in both Turkic and Tai-Kadai correlated with concreteness, and /p/ in Dravidian and Sino-Tibetan correlated with Valence; iii) the type-to-token ratio (TTR) of phonemes are positively correlated with concreteness across several language families, while the length of phoneme segments are negatively correlated with concreteness; iv) certain phonological features are negatively correlated with concreteness across languages. The dataset is made public online for further research. + 2023.sigmorphon-1.11 + chen-bjerva-2023-colexifications + + + Character alignment methods for dialect-to-standard normalization + YvesScherrerUniversity of Helsinki + 110-116 + This paper evaluates various character alignment methods on the task of sentence-level standardization of dialect transcriptions. We compare alignment methods from different scientific traditions (dialectometry, speech processing, machine translation) and apply them to Finnish, Norwegian and Swiss German dialect datasets. In the absence of gold alignments, we evaluate the methods on a set of characteristics that are deemed undesirable for the task. We find that trained alignment methods only show marginal benefits to simple Levenshtein distance. On this particular task, eflomal outperforms related methods such as GIZA++ or fast_align by a large margin. + 2023.sigmorphon-1.12 + scherrer-2023-character + + + <fixed-case>SIGMORPHON</fixed-case>–<fixed-case>U</fixed-case>ni<fixed-case>M</fixed-case>orph 2023 Shared Task 0: Typologically Diverse Morphological Inflection + OmerGoldmanBar-Ilan University + KhuyagbaatarBatsurenNational University of Mongolia + SalamKhalifaStony Brook University + AryamanAroraGeorgetown University + GarrettNicolaiUniversity of British Columbia + ReutTsarfatyBar-Ilan University + EkaterinaVylomovaUniversity of Melbourne + 117-125 + The 2023 SIGMORPHON–UniMorph shared task on typologically diverse morphological inflection included a wide range of languages: 26 languages from 9 primary language families. The data this year was all lemma-split, to allow testing models’ generalization ability, and structured along the new hierarchical schema presented in (Batsuren et al., 2022). The systems submitted this year, 9 in number, showed ingenuity and innovativeness, including hard attention for explainability and bidirectional decoding. Special treatment was also given by many participants to the newly-introduced data in Japanese, due to the high abundance of unseen Kanji characters in its test set. + 2023.sigmorphon-1.13 + goldman-etal-2023-sigmorphon + + + <fixed-case>SIGMORPHON</fixed-case>–<fixed-case>U</fixed-case>ni<fixed-case>M</fixed-case>orph 2023 Shared Task 0, Part 2: Cognitively Plausible Morphophonological Generalization in <fixed-case>K</fixed-case>orean + CanaanBreissMassachusetts Institute of Technology + JinyoungJoUniversity of California, Los Angeles + 126-131 + This paper summarises data collection and curation for Part 2 of the 2023 SIGMORPHON-UniMorph Shared Task 0, which focused on modeling speaker knowledge and generalization of a pair of interacting phonological processes in Korean. 
We briefly describe how modeling the generalization task could be of interest to researchers in both Natural Language Processing and linguistics, and then summarise the traditional description of the phonological processes that are at the center of the modeling challenge. We then describe the criteria we used to select and code cases of process application in two Korean speech corpora, which served as the primary learning data. We also report the technical details of the experiment we carried out that served as the primary test data. + 2023.sigmorphon-1.14 + breiss-jo-2023-sigmorphon + + + Morphological reinflection with weighted finite-state transducers + AliceKwakUniversity of Arizona + MichaelHammondUniversity of Arizona + CheyenneWingUniversity of Arizona + 132-137 + This paper describes the submission by the University of Arizona to the SIGMORPHON 2023 Shared Task on typologically diverse morphological (re-)inflection. In our submission, we investigate the role of frequency, length, and weighted transducers in addressing the challenge of morphological reinflection. We start with the non-neural baseline provided for the task and show how some improvement can be gained by integrating length and frequency in prefix selection. We also investigate using weighted finite-state transducers, jump-started from edit distance and directly augmented with frequency. Our specific technique is promising and quite simple, but we see only modest improvements for some languages here. + 2023.sigmorphon-1.15 + kwak-etal-2023-morphological + + + Linear Discriminative Learning: a competitive non-neural baseline for morphological inflection + CheonkamJeongUniversity of Arizona + DominicSchmitzHeinrich Heine University Düsseldorf, Germany + AkhileshKakolu RamaraoHeinrich-Heine-Universität Düsseldorf + AnnaSteinHeinrich Heine Universität + KevinTangHeinrich-Heine-Universität Düsseldorf + 138-150 + This paper presents our submission to the SIGMORPHON 2023 task 2 of Cognitively Plausible Morphophonological Generalization in Korean. We implemented both Linear Discriminative Learning and Transformer models and found that the Linear Discriminative Learning model trained on a combination of corpus and experimental data showed the best performance with the overall accuracy of around 83%. We found that the best model must be trained on both corpus data and the experimental data of one particular participant. Our examination of speaker-variability and speaker-specific information did not explain why a particular participant combined well with the corpus data. We recommend Linear Discriminative Learning models as a future non-neural baseline system, owing to its training speed, accuracy, model interpretability and cognitive plausibility. In order to improve the model performance, we suggest using bigger data and/or performing data augmentation and incorporating speaker- and item-specifics considerably. + 2023.sigmorphon-1.16 + jeong-etal-2023-linear + + + Tü-<fixed-case>CL</fixed-case> at <fixed-case>SIGMORPHON</fixed-case> 2023: Straight-Through Gradient Estimation for Hard Attention + LeanderGirrbachUniversity of Tübingen + 151-165 + This paper describes our systems participating in the 2023 SIGMORPHON Shared Task on Morphological Inflection and in the 2023 SIGMORPHON Shared Task on Interlinear Glossing. We propose methods to enrich predictions from neural models with discrete, i.e. interpretable, information.
For morphological inflection, our models learn deterministic mappings from subsets of source lemma characters and morphological tags to individual target characters, which introduces interpretability. For interlinear glossing, our models learn a shallow morpheme segmentation in an unsupervised way jointly with predicting glossing lines. Estimated segmentation may be useful when no ground-truth segmentation is available. As both methods introduce discreteness into neural models, our technical contribution is to show that straight-through gradient estimators are effective to train hard attention models. + 2023.sigmorphon-1.17 + girrbach-2023-tu + + + The <fixed-case>BGU</fixed-case>-<fixed-case>M</fixed-case>e<fixed-case>L</fixed-case>e<fixed-case>L</fixed-case> System for the <fixed-case>SIGMORPHON</fixed-case> 2023 Shared Task on Morphological Inflection + GalAstrachBen Gurion University + YuvalPinterBen Gurion University + 166-170 + This paper presents the submission by the MeLeL team to the SIGMORPHON–UniMorph Shared Task on Typologically Diverse and Acquisition-Inspired Morphological Inflection Generation Part 3: Models of Acquisition of Inflectional Noun Morphology in Polish, Estonian, and Finnish. This task requires us to produce the word form given a lemma and a grammatical case, while trying to produce the same error-rate as in children. We approach this task with a reduced-size character-based transformer model, multilingual training and an upsampling method to introduce bias. + 2023.sigmorphon-1.18 + astrach-pinter-2023-bgu + + + Tü-<fixed-case>CL</fixed-case> at <fixed-case>SIGMORPHON</fixed-case> 2023: Straight-Through Gradient Estimation for Hard Attention + LeanderGirrbachUniversity of Tübingen + 171-185 + This paper describes our systems participating in the 2023 SIGMORPHON Shared Task on Morphological Inflection and in the 2023 SIGMORPHON Shared Task on Interlinear Glossing. We propose methods to enrich predictions from neural models with discrete, i.e. interpretable, information. For morphological inflection, our models learn deterministic mappings from subsets of source lemma characters and morphological tags to individual target characters, which introduces interpretability. For interlinear glossing, our models learn a shallow morpheme segmentation in an unsupervised way jointly with predicting glossing lines. Estimated segmentation may be useful when no ground-truth segmentation is available. As both methods introduce discreteness into neural models, our technical contribution is to show that straight-through gradient estimators are effective to train hard attention models. + 2023.sigmorphon-1.19 + girrbach-2023-tu-cl + + + Findings of the <fixed-case>SIGMORPHON</fixed-case> 2023 Shared Task on Interlinear Glossing + MichaelGinnUniversity of Colorado Boulder + SarahMoellerUniversity of Florida + AlexisPalmerUniversity of Colorado Boulder + AnnaStaceyUniversity of British Columbia + GarrettNicolaiUniversity of British Columbia + MansHuldenUniversity of Colorado Boulder + MiikkaSilfverbergUniversity of British Columbia + 186-201 + This paper presents the findings of the SIGMORPHON 2023 Shared Task on Interlinear Glossing. This first iteration of the shared task explores glossing of a set of six typologically diverse languages: Arapaho, Gitksan, Lezgi, Natügu, Tsez and Uspanteko. The shared task encompasses two tracks: a resource-scarce closed track and an open track, where participants are allowed to utilize external data resources. 
Five teams participated in the shared task. The winning team Tü-CL achieved a 23.99%-point improvement over a baseline RoBERTa system in the closed track and a 17.42%-point improvement in the open track. + 2023.sigmorphon-1.20 + ginn-etal-2023-findings + + + <fixed-case>LISN</fixed-case> @ <fixed-case>SIGMORPHON</fixed-case> 2023 Shared Task on Interlinear Glossing + ShuOkabeLISN/CNRS, Université Paris-Saclay + FrançoisYvonISIR CNRS & Sorbonne Université + 202-208 + This paper describes LISN’s submission to the second track (open track) of the shared task on Interlinear Glossing for SIGMORPHON 2023. Our systems are based on Lost, a variation of linear Conditional Random Fields initially developed as a probabilistic translation model and then adapted to the glossing task. This model allows us to handle one of the main challenges posed by glossing, i.e. the fact that the list of potential labels for lexical morphemes is not fixed in advance and needs to be extended dynamically when labelling units are not seen in training. In such situations, we show how to make use of candidate lexical glosses found in the translation and discuss how such extension affects the training and inference procedures. The resulting automatic glossing systems prove to yield very competitive results, especially in low-resource settings. + 2023.sigmorphon-1.21 + okabe-yvon-2023-lisn + + + <fixed-case>S</fixed-case>ig<fixed-case>M</fixed-case>ore<fixed-case>F</fixed-case>un Submission to the <fixed-case>SIGMORPHON</fixed-case> Shared Task on Interlinear Glossing + TaiqiHeCarnegie Mellon University + LindiaTjuatjaCarnegie Mellon University + NathanielRobinsonCarnegie Mellon University + ShinjiWatanabeCarnegie Mellon University + David R.MortensenLanguage Technologies Institute, Carnegie Mellon University + GrahamNeubigCarnegie Mellon University + LoriLevinCarnegie Mellon University + 209-216 + In our submission to the SIGMORPHON 2023 Shared Task on interlinear glossing (IGT), we explore approaches to data augmentation and modeling across seven low-resource languages. For data augmentation, we explore two approaches: creating artificial data from the provided training data and utilizing existing IGT resources in other languages. On the modeling side, we test an enhanced version of the provided token classification baseline as well as a pretrained multilingual seq2seq model. Additionally, we apply post-correction using a dictionary for Gitksan, the language with the smallest amount of data. We find that our token classification models are the best performing, with the highest word-level accuracy for Arapaho and highest morpheme-level accuracy for Gitksan out of all submissions. We also show that data augmentation is an effective strategy, though applying artificial data pretraining has very different effects across both models tested. + 2023.sigmorphon-1.22 + he-etal-2023-sigmorefun + + + An Ensembled Encoder-Decoder System for Interlinear Glossed Text + EdithCoatesUBC Mathematics + 217-221 + This paper presents my submission to Track 1 of the 2023 SIGMORPHON shared task on interlinear glossed text (IGT). There are a wide amount of techniques for building and training IGT models (see Moeller and Hulden, 2018; McMillan-Major, 2020; Zhao et al., 2020). I describe my ensembled sequence-to-sequence approach, perform experiments, and share my submission’s test-set accuracy. I also discuss future areas of research in low-resource token classification methods for IGT.
+ 2023.sigmorphon-1.23 + coates-2023-ensembled + + + Glossy Bytes: Neural Glossing using Subword Encoding + ZiggyCrossUniversity of British Columbia + MichelleYunUniversity of British Columbia + AnanyaApparajuUniversity of British Columbia + JataMacCabeUniversity of British Columbia + GarrettNicolaiUniversity of British Columbia + MiikkaSilfverbergUniversity of British Columbia + 222-229 + This paper presents several different neural subword modelling based approaches to interlinear glossing for seven under-resourced languages as a part of the 2023 SIGMORPHON shared task on interlinear glossing. We experiment with various augmentation and tokenization strategies for both the open and closed tracks of data. We found that while byte-level models may perform well for greater amounts of data, character based approaches remain competitive in their performance in lower resource settings. + 2023.sigmorphon-1.24 + cross-etal-2023-glossy + + + The <fixed-case>SIGMORPHON</fixed-case> 2022 Shared Task on Cross-lingual and Low-Resource Grapheme-to-Phoneme Conversion + Arya D.McCarthyJohns Hopkins University + Jackson L.Lee + AlexandraDeLuciaJohns Hopkins University + TravisBartleyCity University of New York + MilindAgarwalGeorge Mason University + Lucas F.E.AshbyCity University of New York + LucaDel SignoreCity University of New York + CameronGibsonCity University of New York + ReubenRaffCity University of New York + WinstonWuUniversity of Michigan + 230-238 + Grapheme-to-phoneme conversion is an important component in many speech technologies, but until recently there were no multilingual benchmarks for this task. The third iteration of the SIGMORPHON shared task on multilingual grapheme-to-phoneme conversion features many improvements from the previous year’s task (Ashby et al., 2021), including additional languages, three subtasks varying the amount of available resources, extensive quality assurance procedures, and automated error analyses. Three teams submitted a total of fifteen systems, at best achieving relative reductions of word error rate of 14% in the crosslingual subtask and 14% in the very-low resource subtask. The generally consistent result is that cross-lingual transfer substantially helps grapheme-to-phoneme modeling, but not to the same degree as in-language examples. + 2023.sigmorphon-1.27 + mccarthy-etal-2023-sigmorphon + + + <fixed-case>SIGMORPHON</fixed-case> 2022 Shared Task on Grapheme-to-Phoneme Conversion Submission Description: Sequence Labelling for <fixed-case>G</fixed-case>2<fixed-case>P</fixed-case> + LeanderGirrbachThe University of Tübingen + 239-244 + This paper describes our participation in the Third SIGMORPHON Shared Task on Grapheme-to-Phoneme Conversion (Low-Resource and Cross-Lingual) (McCarthy et al.,2022). Our models rely on different sequence labelling methods. The main model predicts multiple phonemes from each grapheme and is trained using CTC loss (Graves et al., 2006). We find that sequence labelling methods yield worse performance than the baseline when enough data is available, but can still be used when very little data is available. Furthermore, we demonstrate that alignments learned by the sequence labelling models can be easily inspected. 
+ 2023.sigmorphon-1.28 + girrbach-2023-sigmorphon + + + Low-resource grapheme-to-phoneme mapping with phonetically-conditioned transfer + MichaelHammondThe University of Arizona + 245-248 + In this paper we explore a very simple non-neural approach to mapping orthography to phonetic transcription in a low-resource context with transfer data from a related language. We start from a baseline system and focus our efforts on data augmentation. We make three principal moves. First, we start with an HMM-based system (Novak et al., 2012). Second, we augment our basic system by recombining legal substrings in restricted fashion (Ryan and Hulden, 2020). Finally, we limit our transfer data by only using training pairs where the phonetic form shares all bigrams with the target language. + 2023.sigmorphon-1.29 + hammond-2023-low + + + A future for universal grapheme-phoneme transduction modeling with neuralized finite-state transducers + Chu-ChengLinJohns Hopkins University + 249-249 + We propose a universal grapheme-phoneme transduction model using neuralized finite-state transducers. Many computational models of grapheme-phoneme transduction nowadays are based on the (autoregressive) sequence-to-sequence string transduction paradigm. While such models have achieved state-of-the-art performance, they suffer from theoretical limitations of autoregressive models. On the other hand, neuralized finite-state transducers (NFSTs) have shown promising results on various string transduction tasks. NFSTs can be seen as a generalization of weighted finite-state transducers (WFSTs), and can be seen as pairs of a featurized finite-state machine (‘marked finite-state transducer’ or MFST in NFST terminology), and a string scoring function. Instead of taking a product of local contextual feature weights on FST arcs, NFSTs can employ arbitrary scoring functions to weight global contextual features of a string transduction, and therefore break the Markov property. Furthermore, NFSTs can be formally shown to be more expressive than (autoregressive) seq2seq models. Empirically, joint grapheme-phoneme transduction NFSTs have consistently outperformed vanilla seq2seq models on grapheme-to-phoneme and phoneme-to-grapheme transduction tasks for English. Furthermore, they provide interpretable aligned string transductions, thanks to their finite-state machine component. In this talk, we propose a multilingual extension of the joint grapheme-phoneme NFST. We achieve this goal by modeling typological and phylogenetic features of languages and scripts as optional latent variables using a finite-state machine. The result is a versatile grapheme-phoneme transduction model: in addition to standard monolingual and multilingual transduction, the proposed multilingual NFST can also be used in various controlled generation scenarios, such as phoneme-to-grapheme transduction of an unseen language-script pair. We also plan to release an NFST software package. + 2023.sigmorphon-1.30 + lin-2023-future + + + Fine-tuning m<fixed-case>SLAM</fixed-case> for the <fixed-case>SIGMORPHON</fixed-case> 2022 Shared Task on Grapheme-to-Phoneme Conversion + DanGarretteGoogle Research + 250-250 + Grapheme-to-phoneme (G2P) conversion is a task that is inherently related to both written and spoken language. Therefore, our submission to the G2P shared task builds off of mSLAM (Bapna et al., 2022), a 600M parameter encoder model pretrained simultaneously on text from 101 languages and speech from 51 languages.
For fine-tuning a G2P model, we combined mSLAM’s text encoder, which uses characters as its input tokens, with an uninitialized single-layer RNN-T decoder (Graves, 2012) whose vocabulary is the set of all 381 phonemes appearing in the shared task data. We took an explicitly multilingual approach to modeling the G2P tasks, fine-tuning and evaluating a single model that covered all the languages in each task, and adding language codes as prefixes to the input strings as a means of specifying the language of each example. Our models perform well in the shared task’s “high” setting (in which they were trained on 1,000 words from each language), though they do poorly in the “low” task setting (training on only 100 words from each language). Our models also perform reasonably in the “mixed” setting (training on 100 words in the target language and 1000 words in a related language), hinting that mSLAM’s multilingual pretraining may be enabling useful cross-lingual sharing. + 2023.sigmorphon-1.31 + garrette-2023-fine + +
+
diff --git a/data/xml/2023.sustainlp.xml b/data/xml/2023.sustainlp.xml new file mode 100644 index 0000000000..77925ede03 --- /dev/null +++ b/data/xml/2023.sustainlp.xml @@ -0,0 +1,248 @@ + + + + + Proceedings of The Fourth Workshop on Simple and Efficient Natural Language Processing (SustaiNLP) + NafiseSadat Moosavi + IrynaGurevych + YufangHou + GyuwanKim + Young JinKim + TalSchuster + AmeetaAgrawal + Association for Computational Linguistics +
Toronto, Canada (Hybrid)
+ July + 2023 + 2023.sustainlp-1 + sustainlp + + + 2023.sustainlp-1.0 + sustainlp-2023-simple + + + <fixed-case>K</fixed-case>wik<fixed-case>B</fixed-case>ucks: Correlation Clustering with Cheap-Weak and Expensive-Strong Signals + SandeepSilwalMIT + SaraAhmadianGoogle Research + AndrewNystromGoogle AI + AndrewMccallumUMass Amherst + DeepakRamachandranGoogle Research + MehranKazemiGoogle Research + 1-31 + 2023.sustainlp-1.1 + silwal-etal-2023-kwikbucks + + + Semantic-Oriented Unlabeled Priming for Large-Scale Language Models + YanchenLiuHarvard University + TimoSchickMeta AI + HinrichSchützeCenter for Information and Language Processing, University of Munich + 32-38 + 2023.sustainlp-1.2 + liu-etal-2023-semantic + + + o<fixed-case>BERT</fixed-case>a: Improving Sparse Transfer Learning via improved initialization, distillation, and pruning regimes + DanielCamposUniversity of Illinois Urbana Champaign + AlexandreMarquesNeural Magic + MarkKurtzNeural Magic + ChengXiang ZhaiUniversity of Illinois Urbana Champaign + 39-58 + 2023.sustainlp-1.3 + campos-etal-2023-oberta + + + Quick Dense Retrievers Consume <fixed-case>KALE</fixed-case>: Post Training <fixed-case>K</fixed-case>ullback<fixed-case>L</fixed-case>eibler Alignment of Embeddings for Asymmetrical dual encoders + DanielCamposUniversity of Illinois Urbana Champaign + AlessandroMagnaniWalmart Labs + ChengxiangZhaiUniversity of Illinois Urbana Champaign + 59-77 + 2023.sustainlp-1.4 + campos-etal-2023-quick + + + Lessons on Parameter Sharing across Layers in Transformers + ShoTakaseLINE Corporation + ShunKiyonoLINE Corporation + 78-90 + 2023.sustainlp-1.5 + takase-kiyono-2023-lessons + + + To Asymmetry and Beyond: Structured Pruning of Sequence to Sequence Models for Improved Inference Efficiency + DanielCamposUniversity of Illinois Urbana Champaign + ChengxiangZhaiUniversity of Illinois Urbana Champaign + 91-109 + 2023.sustainlp-1.6 + campos-zhai-2023-asymmetry + + + Small is the New Big: Pre-finetuned compact models are better for Asynchronous Active Learning + DantongLiuAmazon + KaushikPavaniAmazon + SunnyDasguptaAmazon + 110-120 + 2023.sustainlp-1.7 + liu-etal-2023-small + + + <fixed-case>ADEPT</fixed-case>: Adapter-based Efficient Prompt Tuning Approach for Language Models + AdityaShahVirginia Tech + SurendrabikramThapaVirginia Tech + AneeshJainVirginia Tech + LifuHuangVirginia Tech + 121-128 + 2023.sustainlp-1.8 + shah-etal-2023-adept + + + <fixed-case>NLU</fixed-case> on Data Diets: Dynamic Data Subset Selection for <fixed-case>NLP</fixed-case> Classification Tasks + Jean-michelAttenduNuance Communications + Jean-philippeCorbeilNuance Communications + 129-146 + 2023.sustainlp-1.9 + attendu-corbeil-2023-nlu + + + On the Interactions of Structural Constraints and Data Resources for Structured Prediction + ZhisongZhangCarnegie Mellon University + EmmaStrubellCarnegie Mellon University + EduardHovyUniversity of Melbourne + 147-157 + 2023.sustainlp-1.10 + zhang-etal-2023-interactions + + + Can we Pretrain a <fixed-case>S</fixed-case>ot<fixed-case>A</fixed-case> Legal Language Model on a Budget From Scratch? + JoelNiklausUniversity of Bern + DanieleGiofreThomson Reuters + 158-182 + 2023.sustainlp-1.11 + niklaus-giofre-2023-pretrain + + + Is a Video worth n × n Images? 
A Highly Efficient Approach to Transformer-based Video Question Answering + ChenyangLyuDublin City University + TianboJiNantong University + YvetteGrahamADAPT, Trinity College Dublin + JenniferFosterDublin City University + 183-189 + 2023.sustainlp-1.12 + lyu-etal-2023-video + + + How to Unleash the Power of Large Language Models for Few-shot Relation Extraction? + XinXuZhejiang University + YuqiZhuZhejiang University + XiaohanWangZhejiang University + NingyuZhangZhejiang University + 190-200 + 2023.sustainlp-1.13 + xu-etal-2023-unleash + + + Prompting language models improves performance in imbalanced setting + JayMohtaAmazon + 201-211 + 2023.sustainlp-1.14 + mohta-2023-prompting + + + <fixed-case>KGQA</fixed-case> Without Retraining + NickMckennaUniversity of Edinburgh, School of Informatics + PriyankaSenAmazon + 212-218 + 2023.sustainlp-1.15 + mckenna-sen-2023-kgqa + + + <fixed-case>MANER</fixed-case>: Mask Augmented Named Entity Recognition for Extreme Low-Resource Languages + ShashankSonkarRice University + ZichaoWangRice University + RichardBaraniukRice University + 219-226 + 2023.sustainlp-1.16 + sonkar-etal-2023-maner + + + Efficient and Interpretable Compressive Text Summarisation with Unsupervised Dual-Agent Reinforcement Learning + PeggyTangThe University of Sydney + JunbinGaoThe University of Sydney + LeiZhangInternational Digital Economy Academy (IDEA) + ZhiyongWangThe University of Sydney + 227-238 + 2023.sustainlp-1.17 + tang-etal-2023-efficient + + + Exploring the Effect of Frequency Resolution in <fixed-case>FN</fixed-case>et + GregorySzumelDuke University + GhazalKhalighinejadDuke University + RickardStureborgDuke University + SamWisemanDuke University + 239-244 + 2023.sustainlp-1.18 + szumel-etal-2023-exploring + + + Towards Adaptable and Interactive Image Captioning with Data Augmentation and Episodic Memory + AlikiAnagnostopoulouCarl von Ossietzky University of Oldenburg / German Research Center for Artificial Intelligence + MareikeHartmannSaarland University / German Research Center for Artificial Intelligence + DanielSonntagCarl von Ossietzky University of Oldenburg / German Research Center for Artificial Intelligence + 245-256 + 2023.sustainlp-1.19 + anagnostopoulou-etal-2023-towards + + + Corpus Complexity Matters in Pretraining Language Models + AmeetaAgrawalPortland State University + SureshSinghPortland State University + 257-263 + 2023.sustainlp-1.20 + agrawal-singh-2023-corpus + + + <fixed-case>P</fixed-case>ersona<fixed-case>PKT</fixed-case>: Building Personalized Dialogue Agents via Parameter-efficient Knowledge Transfer + XuHanUniversity of Colorado Boulder + BinGuoAmazon.com + YoonJungAmazon + BenjaminYaoAmazon + YuZhangAmazon.com + XiaohuLiuAmazon + ChenleiGuoAmazon + 264-273 + 2023.sustainlp-1.21 + han-etal-2023-personapkt + + + Small Character Models Match Large Word Models for Autocomplete Under Memory Constraints + GaneshJawaharThe University of British Columbia + SubhabrataMukherjeeMicrosoft Research + DebadeeptaDeyMicrosoft Research + MuhammadAbdul-mageedThe University of British Columbia + LaksLakshmanan, V.s.UBC + CaioMendesMicrosoft + GustavoDe RosaMicrosoft Research + ShitalShahMicrosoft Research + 274-289 + 2023.sustainlp-1.22 + jawahar-etal-2023-small + + + Query Encoder Distillation via Embedding Alignment is a Strong Baseline Method to Boost Dense Retriever Online Efficiency + YuxuanWangUniversity of Pennsylvania + LyuHongUniversity of Pennsylvania + 290-298 + 2023.sustainlp-1.23 + wang-hong-2023-query + + + Minimalist Entity Disambiguation 
for Mid-Resource Languages + BennoKruitVU Amsterdam + 299-306 + 2023.sustainlp-1.24 + kruit-2023-minimalist + +
+
diff --git a/data/xml/2023.ws.xml b/data/xml/2023.ws.xml index 8dda476745..74c40991ee 100644 --- a/data/xml/2023.ws.xml +++ b/data/xml/2023.ws.xml @@ -31,6 +31,14 @@ 2023.wnu-1 2023.semeval-1 2023.woah-1 + 2023.cawl-1 + 2023.clinicalnlp-1 + 2023.repl4nlp-1 + 2023.nlrse-1 + 2023.sustainlp-1 + 2023.dialdoc-1 + 2023.sicon-1 + 2023.americasnlp-1 diff --git a/data/xml/D18.xml b/data/xml/D18.xml index 221602f8f1..435d2fad35 100644 --- a/data/xml/D18.xml +++ b/data/xml/D18.xml @@ -2179,6 +2179,7 @@ 10.18653/v1/D18-1160 he-etal-2018-unsupervised jxhe/struct-learning-with-flow + PTB Diagnostic ECG Database Penn Treebank
diff --git a/data/xml/D19.xml b/data/xml/D19.xml index ebbe5639b5..da5316f79a 100644 --- a/data/xml/D19.xml +++ b/data/xml/D19.xml @@ -5092,6 +5092,7 @@ jiang-etal-2019-improved jiangyingjunn/i-darts CoNLL-2003 + PTB Diagnostic ECG Database Penn Treebank diff --git a/data/xml/N19.xml b/data/xml/N19.xml index 50bf847b51..6c7e9d4d3e 100644 --- a/data/xml/N19.xml +++ b/data/xml/N19.xml @@ -1580,6 +1580,7 @@ kim-etal-2019-unsupervised harvardnlp/urnng Billion Word Benchmark + PTB Diagnostic ECG Database Penn Treebank @@ -1614,6 +1615,7 @@ drozdov-etal-2019-unsupervised-latent diff --git a/data/xml/P19.xml b/data/xml/P19.xml index 130a39712e..f5f3970dbe 100644 --- a/data/xml/P19.xml +++ b/data/xml/P19.xml @@ -3276,6 +3276,7 @@ 10.18653/v1/P19-1228 kim-etal-2019-compound harvardnlp/compound-pcfg + PTB Diagnostic ECG Database Penn Treebank diff --git a/data/xml/W19.xml b/data/xml/W19.xml index c5e337bfd7..4f68816532 100644 --- a/data/xml/W19.xml +++ b/data/xml/W19.xml @@ -2061,6 +2061,8 @@ 10.18653/v1/W19-1803 pavlopoulos-etal-2019-survey nlpaueb/bio_image_caption + IU X-Ray + Peir Gross Revisiting Visual Grounding @@ -14121,7 +14123,6 @@ One of the references was wrong therefore it is corrected to cite the appropriat W19-5945 10.18653/v1/W19-5945 keizer-etal-2019-user - skeizer/madrigal Dialogue Act Classification in Team Communication for Robot Assisted Disaster Response diff --git a/data/yaml/sigs/sigmorphon.yaml b/data/yaml/sigs/sigmorphon.yaml index ed92704442..6792211be0 100644 --- a/data/yaml/sigs/sigmorphon.yaml +++ b/data/yaml/sigs/sigmorphon.yaml @@ -2,6 +2,8 @@ Name: Special Interest Group on Computational Morphology and Phonology (SIGMORPH ShortName: SIGMORPHON URL: https://sigmorphon.github.io/ Meetings: + - 2023: + - 2023.sigmorphon-1 - 2022: - 2022.sigmorphon-1 - 2021: diff --git a/data/yaml/venues/cawl.yaml b/data/yaml/venues/cawl.yaml new file mode 100644 index 0000000000..4adcd21731 --- /dev/null +++ b/data/yaml/venues/cawl.yaml @@ -0,0 +1,2 @@ +acronym: CAWL +name: Workshop on Computation and Written Language (CAWL) diff --git a/data/yaml/venues/nlrse.yaml b/data/yaml/venues/nlrse.yaml new file mode 100644 index 0000000000..d5b63f0d8f --- /dev/null +++ b/data/yaml/venues/nlrse.yaml @@ -0,0 +1,3 @@ +acronym: NLRSE +is_acl: true +name: Workshop on Natural Language Reasoning and Structured Explanations diff --git a/data/yaml/venues/sicon.yaml b/data/yaml/venues/sicon.yaml new file mode 100644 index 0000000000..0df0de3891 --- /dev/null +++ b/data/yaml/venues/sicon.yaml @@ -0,0 +1,2 @@ +acronym: SICon +name: Workshop on Social Influence in Conversations