Skip to content

Commit

Permalink
return column names in the order requested (#27)
Browse files Browse the repository at this point in the history
* return column names in the order requested

* retain correct ordering of columns in object rows as well
  • Loading branch information
ctranstrum authored Aug 14, 2024
1 parent d13d52b commit 8ace1a4
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 7 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,15 @@ To parse parquet files from a user drag-and-drop action, see example in [index.h

To read large parquet files, it is recommended that you filter by row and column.
Hyparquet is designed to load only the minimal amount of data needed to fulfill a query.
You can filter rows by number, or columns by name:
You can filter rows by number, or columns by name,
and columns will be returned in the same order they were requested:

```js
import { parquetRead } from 'hyparquet'

await parquetRead({
file,
columns: ['colA', 'colB'], // include columns colA and colB
columns: ['colB', 'colA'], // include columns colB and colA
rowStart: 100,
rowEnd: 200,
onComplete: data => console.log(data),
Expand Down
11 changes: 6 additions & 5 deletions src/read.js
Original file line number Diff line number Diff line change
Expand Up @@ -190,21 +190,22 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
const includedColumnNames = children
.map(child => child.element.name)
.filter(name => !columns || columns.includes(name))
const includedColumns = includedColumnNames
.map(name => subcolumnData.get(name))
const columnOrder = columns || includedColumnNames
const includedColumns = columnOrder
.map(name => includedColumnNames.includes(name) ? subcolumnData.get(name) : undefined)

for (let row = 0; row < rowLimit; row++) {
if (options.rowFormat === 'object') {
// return each row as an object
/** @type {Record<string, any>} */
const rowData = {}
includedColumnNames.forEach((name, index) => {
rowData[name] = includedColumns[index][row]
columnOrder.forEach((name, index) => {
rowData[name] = includedColumns[index]?.[row]
})
groupData[row] = rowData
} else {
// return each row as an array
groupData[row] = includedColumns.map(column => column[row])
groupData[row] = includedColumns.map(column => column?.[row])
}
}
return groupData
Expand Down
34 changes: 34 additions & 0 deletions test/read.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,38 @@ describe('parquetRead', () => {
},
})
})

// Regression test for #27: columns must be returned in the order they were
// requested, including duplicates ('c' twice) and unknown names ('missing').
it('read columns out of order', async () => {
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
await parquetRead({
file,
// request order deliberately differs from file order; 'missing' does not
// exist in the file and must yield empty cells, 'c' is requested twice
columns: ['c', 'missing', 'b', 'c'],
// onChunk fires once per physical column actually read — only 'b' and 'c'
// exist, so any chunk that is not 'b' must be 'c' (no chunk for 'missing')
onChunk: chunk => {
if (chunk.columnName === 'b') {
expect(toJson(chunk)).toEqual({
columnName: 'b',
columnData: [1, 2, 3, 4, 5],
rowStart: 0,
rowEnd: 5,
})
} else {
expect(toJson(chunk)).toEqual({
columnName: 'c',
columnData: [2, 3, 4, 5, 2],
rowStart: 0,
rowEnd: 5,
})
}
},
// rows follow the requested column order [c, missing, b, c]:
// the 'missing' column serializes as null, and 'c' values repeat in
// both the first and last positions of each row
onComplete: (rows) => {
expect(toJson(rows)).toEqual([
[2, null, 1, 2],
[3, null, 2, 3],
[4, null, 3, 4],
[5, null, 4, 5],
[2, null, 5, 2],
])
},
})
})
})

0 comments on commit 8ace1a4

Please sign in to comment.