Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 4 additions & 31 deletions mssql_python/pybind/ddbc_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4780,9 +4780,8 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules,
ColumnBuffers buffers(numCols, fetchSize);

if (!hasLobColumns && fetchSize > 0) {
// Bind columns — Arrow always uses SQL_C_CHAR for VARCHAR because
// it processes raw byte buffers directly, not via Python codecs.
ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_CHAR);
// Always request WCHARs so we don't have to deal with CHAR encodings
ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_WCHAR);
if (!SQL_SUCCEEDED(ret)) {
LOG("Error when binding columns");
return ret;
Expand Down Expand Up @@ -4841,16 +4840,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules,
}
case SQL_CHAR:
case SQL_VARCHAR:
case SQL_LONGVARCHAR: {
ret = GetDataVar(hStmt, idxCol + 1, SQL_C_CHAR,
buffers.charBuffers[idxCol],
buffers.indicators[idxCol].data());
if (!SQL_SUCCEEDED(ret)) {
LOG("Error fetching CHAR LOB for column %d", idxCol + 1);
return ret;
}
break;
}
case SQL_LONGVARCHAR:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a comment explaining these now fall through to WCHAR handling because SQL_CHAR is bound as SQL_C_WCHAR at line 4784.

case SQL_SS_XML:
case SQL_WCHAR:
case SQL_WVARCHAR:
Expand Down Expand Up @@ -5093,24 +5083,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules,
}
case SQL_CHAR:
case SQL_VARCHAR:
case SQL_LONGVARCHAR: {
#if defined(__APPLE__) || defined(__linux__)
uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/;
#else
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
#endif
auto target_vec = &arrowColumnProducer->varData;
auto start = arrowColumnProducer->varVal[idxRowArrow];
while (target_vec->size() < start + dataLen) {
target_vec->resize(target_vec->size() * 2);
}

std::memcpy(&(*target_vec)[start],
&buffers.charBuffers[idxCol][idxRowSql * fetchBufferSize],
dataLen);
arrowColumnProducer->varVal[idxRowArrow + 1] = start + dataLen;
break;
}
case SQL_LONGVARCHAR:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similarly here.

case SQL_SS_XML:
case SQL_WCHAR:
case SQL_WVARCHAR:
Expand Down
57 changes: 57 additions & 0 deletions tests/test_004_cursor_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,63 @@ def test_arrow_long_string(cursor: mssql_python.Cursor):
assert batch.column(0).to_pylist() == [long_string]


def test_arrow_varchar_utf8_collation_unicode(cursor: mssql_python.Cursor):
table = "#t_arrow_utf8_varchar"
collation = "Latin1_General_100_CI_AS_SC_UTF8"
expected = [
"Grüße",
"你好😀",
"こんにちは",
"Привет",
"Hello 世界",
"😀😃😄😁",
"",
None,
]

try:
cursor.execute(
f"create table {table} (id int primary key, v varchar(32) collate {collation})"
)
except Exception as exc:
pytest.skip(f"UTF-8 collation '{collation}' not supported: {exc}")

try:
for index, value in enumerate(expected, start=1):
cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value)
tbl = cursor.execute(f"select v from {table} order by id").arrow()
assert tbl.column(0).type.equals(pa.large_string())
assert tbl.column(0).to_pylist() == expected
finally:
cursor.execute(f"drop table if exists {table}")


def test_arrow_varchar_utf8_collation_cp1252(cursor: mssql_python.Cursor):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test uses SQL_Latin1_General_CP1_CI_AS (CP1252), NOT a UTF-8 collation. The current name is slightly misleading, would it be better to name it as test_arrow_varchar_cp1252_collation_unicode?

table = "#t_arrow_cp1252_varchar"
collation = "SQL_Latin1_General_CP1_CI_AS"
expected = [
"Grüße",
"café René!",
"naïve café",
"Español",
"Müller-Öztürk",
"Françoise",
"",
None,
]

cursor.execute(f"create table {table} (id int primary key, v varchar(32) collate {collation})")

try:
for index, value in enumerate(expected, start=1):
cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value)
tbl = cursor.execute(f"select v from {table} order by id").arrow()
assert tbl.column(0).type.equals(pa.large_string())
assert tbl.column(0).to_pylist() == expected
finally:
cursor.execute(f"drop table if exists {table}")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fix applies to SQL_CHAR, SQL_VARCHAR, and SQL_LONGVARCHAR, but only VARCHAR is tested. Can we add a test for CHAR (fixed-length) type.


def test_rownumber_arrow_batch_interleaved_fetchmany(cursor: mssql_python.Cursor):
"""Verify that arrow_batch and fetchmany can be interleaved
on the same result set with correct rownumber tracking and values."""
Expand Down
Loading