Skip to content

Commit

Permalink
return column names in the order requested (#27)
Browse files Browse the repository at this point in the history
* return column names in the order requested

* retain correct ordering of columns in object rows as well
  • Loading branch information
ctranstrum authored Aug 14, 2024
1 parent d13d52b commit 8ace1a4
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 7 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,15 @@ To parse parquet files from a user drag-and-drop action, see example in [index.h

To read large parquet files, it is recommended that you filter by row and column.
Hyparquet is designed to load only the minimal amount of data needed to fulfill a query.
You can filter rows by number, or columns by name:
You can filter rows by number, or columns by name,
and columns will be returned in the same order they were requested:

```js
import { parquetRead } from 'hyparquet'

await parquetRead({
file,
columns: ['colA', 'colB'], // include columns colA and colB
columns: ['colB', 'colA'], // include columns colB and colA
rowStart: 100,
rowEnd: 200,
onComplete: data => console.log(data),
Expand Down
11 changes: 6 additions & 5 deletions src/read.js
Original file line number Diff line number Diff line change
Expand Up @@ -190,21 +190,22 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
const includedColumnNames = children
.map(child => child.element.name)
.filter(name => !columns || columns.includes(name))
const includedColumns = includedColumnNames
.map(name => subcolumnData.get(name))
const columnOrder = columns || includedColumnNames
const includedColumns = columnOrder
.map(name => includedColumnNames.includes(name) ? subcolumnData.get(name) : undefined)

for (let row = 0; row < rowLimit; row++) {
if (options.rowFormat === 'object') {
// return each row as an object
/** @type {Record<string, any>} */
const rowData = {}
includedColumnNames.forEach((name, index) => {
rowData[name] = includedColumns[index][row]
columnOrder.forEach((name, index) => {
rowData[name] = includedColumns[index]?.[row]
})
groupData[row] = rowData
} else {
// return each row as an array
groupData[row] = includedColumns.map(column => column[row])
groupData[row] = includedColumns.map(column => column?.[row])
}
}
return groupData
Expand Down
34 changes: 34 additions & 0 deletions test/read.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,38 @@ describe('parquetRead', () => {
},
})
})

// Regression test for #27: columns must be returned in the order they were
// requested, including duplicates ('c' twice) and unknown names ('missing').
it('read columns out of order', async () => {
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
await parquetRead({
file,
// request order deliberately differs from file order; 'missing' does not
// exist in the file and must yield empty cells, 'c' is requested twice
columns: ['c', 'missing', 'b', 'c'],
// onChunk fires once per physical column actually read — only 'b' and 'c'
// exist, so any chunk that is not 'b' must be 'c' (no chunk for 'missing')
onChunk: chunk => {
if (chunk.columnName === 'b') {
expect(toJson(chunk)).toEqual({
columnName: 'b',
columnData: [1, 2, 3, 4, 5],
rowStart: 0,
rowEnd: 5,
})
} else {
expect(toJson(chunk)).toEqual({
columnName: 'c',
columnData: [2, 3, 4, 5, 2],
rowStart: 0,
rowEnd: 5,
})
}
},
// rows follow the requested column order [c, missing, b, c]:
// the 'missing' column serializes as null, and 'c' values repeat in
// both the first and last positions of each row
onComplete: (rows) => {
expect(toJson(rows)).toEqual([
[2, null, 1, 2],
[3, null, 2, 3],
[4, null, 3, 4],
[5, null, 4, 5],
[2, null, 5, 2],
])
},
})
})
})

0 comments on commit 8ace1a4

Please sign in to comment.