-
Notifications
You must be signed in to change notification settings - Fork 50
FIX: Request SQL_CHAR as SQL_C_WCHAR in arrow fetch path #575
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
29a9cec
4ce73ca
c4ce528
c574aca
9a3992e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4780,9 +4780,8 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, | |
| ColumnBuffers buffers(numCols, fetchSize); | ||
|
|
||
| if (!hasLobColumns && fetchSize > 0) { | ||
| // Bind columns — Arrow always uses SQL_C_CHAR for VARCHAR because | ||
| // it processes raw byte buffers directly, not via Python codecs. | ||
| ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_CHAR); | ||
| // Always request WCHARs so we don't have to deal with CHAR encodings | ||
| ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_WCHAR); | ||
| if (!SQL_SUCCEEDED(ret)) { | ||
| LOG("Error when binding columns"); | ||
| return ret; | ||
|
|
@@ -4841,16 +4840,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, | |
| } | ||
| case SQL_CHAR: | ||
| case SQL_VARCHAR: | ||
| case SQL_LONGVARCHAR: { | ||
| ret = GetDataVar(hStmt, idxCol + 1, SQL_C_CHAR, | ||
| buffers.charBuffers[idxCol], | ||
| buffers.indicators[idxCol].data()); | ||
| if (!SQL_SUCCEEDED(ret)) { | ||
| LOG("Error fetching CHAR LOB for column %d", idxCol + 1); | ||
| return ret; | ||
| } | ||
| break; | ||
| } | ||
| case SQL_LONGVARCHAR: | ||
| case SQL_SS_XML: | ||
| case SQL_WCHAR: | ||
| case SQL_WVARCHAR: | ||
|
|
@@ -5093,24 +5083,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, | |
| } | ||
| case SQL_CHAR: | ||
| case SQL_VARCHAR: | ||
| case SQL_LONGVARCHAR: { | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/; | ||
| #else | ||
| uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; | ||
| #endif | ||
| auto target_vec = &arrowColumnProducer->varData; | ||
| auto start = arrowColumnProducer->varVal[idxRowArrow]; | ||
| while (target_vec->size() < start + dataLen) { | ||
| target_vec->resize(target_vec->size() * 2); | ||
| } | ||
|
|
||
| std::memcpy(&(*target_vec)[start], | ||
| &buffers.charBuffers[idxCol][idxRowSql * fetchBufferSize], | ||
| dataLen); | ||
| arrowColumnProducer->varVal[idxRowArrow + 1] = start + dataLen; | ||
| break; | ||
| } | ||
| case SQL_LONGVARCHAR: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. similarly here. |
||
| case SQL_SS_XML: | ||
| case SQL_WCHAR: | ||
| case SQL_WVARCHAR: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -313,6 +313,63 @@ def test_arrow_long_string(cursor: mssql_python.Cursor): | |
| assert batch.column(0).to_pylist() == [long_string] | ||
|
|
||
|
|
||
| def test_arrow_varchar_utf8_collation_unicode(cursor: mssql_python.Cursor): | ||
| table = "#t_arrow_utf8_varchar" | ||
| collation = "Latin1_General_100_CI_AS_SC_UTF8" | ||
| expected = [ | ||
| "Grüße", | ||
| "你好😀", | ||
| "こんにちは", | ||
| "Привет", | ||
| "Hello 世界", | ||
| "😀😃😄😁", | ||
| "", | ||
| None, | ||
| ] | ||
|
|
||
| try: | ||
| cursor.execute( | ||
| f"create table {table} (id int primary key, v varchar(32) collate {collation})" | ||
| ) | ||
| except Exception as exc: | ||
| pytest.skip(f"UTF-8 collation '{collation}' not supported: {exc}") | ||
|
|
||
| try: | ||
| for index, value in enumerate(expected, start=1): | ||
| cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value) | ||
| tbl = cursor.execute(f"select v from {table} order by id").arrow() | ||
| assert tbl.column(0).type.equals(pa.large_string()) | ||
| assert tbl.column(0).to_pylist() == expected | ||
| finally: | ||
| cursor.execute(f"drop table if exists {table}") | ||
|
|
||
|
|
||
| def test_arrow_varchar_utf8_collation_cp1252(cursor: mssql_python.Cursor): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The test uses SQL_Latin1_General_CP1_CI_AS (CP1252), NOT a UTF-8 collation. The current name is slightly misleading, would it be better to name it as test_arrow_varchar_cp1252_collation_unicode? |
||
| table = "#t_arrow_cp1252_varchar" | ||
| collation = "SQL_Latin1_General_CP1_CI_AS" | ||
| expected = [ | ||
| "Grüße", | ||
| "café René!", | ||
| "naïve café", | ||
| "Español", | ||
| "Müller-Öztürk", | ||
| "Françoise", | ||
| "", | ||
| None, | ||
| ] | ||
|
|
||
| cursor.execute(f"create table {table} (id int primary key, v varchar(32) collate {collation})") | ||
|
|
||
| try: | ||
| for index, value in enumerate(expected, start=1): | ||
| cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value) | ||
| tbl = cursor.execute(f"select v from {table} order by id").arrow() | ||
| assert tbl.column(0).type.equals(pa.large_string()) | ||
| assert tbl.column(0).to_pylist() == expected | ||
| finally: | ||
| cursor.execute(f"drop table if exists {table}") | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The fix applies to SQL_CHAR, SQL_VARCHAR, and SQL_LONGVARCHAR, but only VARCHAR is tested. Can we add a test for CHAR (fixed-length) type. |
||
|
|
||
| def test_rownumber_arrow_batch_interleaved_fetchmany(cursor: mssql_python.Cursor): | ||
| """Verify that arrow_batch and fetchmany can be interleaved | ||
| on the same result set with correct rownumber tracking and values.""" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we add a comment explaining these now fall through to WCHAR handling because SQL_CHAR is bound as SQL_C_WCHAR at line 4784.