From 2715f47b037cfd515a012d0ca82caa85944589cd Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Fri, 8 May 2026 23:19:03 +0200 Subject: [PATCH 1/2] fix: support URI jcpan tests Fix several Perl compatibility gaps exercised by URI: - consume Encode FB_QUIET sources after successful encode/decode - allow CORE::GLOBAL::gethostbyname overrides - preserve byte-string escape literals under use utf8 - warn and pass through \Q/\E inside regex character classes - canonicalize chdir cwd paths - load Storable hook classes before thaw and preserve scalar hook ref levels - keep cached method-call mortals mark-scoped for chained AUTOLOAD dispatch Generated with [Codex](https://openai.com/codex) Co-Authored-By: Codex --- .../backend/bytecode/BytecodeCompiler.java | 2 + .../perlonjava/backend/jvm/EmitLiteral.java | 3 +- .../frontend/astnode/StringNode.java | 17 +++++- .../frontend/parser/ParserTables.java | 2 +- .../frontend/parser/StringDoubleQuoted.java | 8 +++ .../frontend/parser/StringSegmentParser.java | 60 +++++++++++++++++-- .../runtime/operators/Directory.java | 10 +++- .../runtime/operators/StringOperators.java | 15 +++++ .../perlonjava/runtime/perlmodule/Encode.java | 8 ++- .../runtime/perlmodule/storable/Hooks.java | 17 ++++++ .../runtime/perlmodule/storable/Refs.java | 17 ++---- .../runtime/regex/RegexQuoteMeta.java | 56 +++++++++++++++++ .../runtime/runtimetypes/RuntimeCode.java | 19 +++--- .../runtime/runtimetypes/RuntimeScalar.java | 7 ++- src/test/resources/unit/autoload.t | 28 ++++++++- src/test/resources/unit/encode_fb_quiet.t | 20 +++++++ src/test/resources/unit/operator_overrides.t | 16 +++++ .../resources/unit/regex/regex_charclass.t | 15 +++++ src/test/resources/unit/storable.t | 34 ++++++++++- src/test/resources/unit/utf8_pragma.t | 10 +++- 20 files changed, 328 insertions(+), 36 deletions(-) create mode 100644 src/test/resources/unit/encode_fb_quiet.t diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java index 1341e6bfc..600e5e030 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java @@ -1384,6 +1384,8 @@ public void visit(StringNode node) { short opcode; if (node.isVString) { opcode = Opcodes.LOAD_VSTRING; + } else if (node.forceByteString) { + opcode = Opcodes.LOAD_BYTE_STRING; } else if (emitterContext != null && emitterContext.symbolTable != null && !emitterContext.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) && !emitterContext.compilerOptions.isUnicodeSource) { diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java b/src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java index fc341ab18..8fc5c06ed 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java @@ -255,7 +255,8 @@ public static void emitString(EmitterContext ctx, StringNode node) { return; } - if (!ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8) && !ctx.compilerOptions.isUnicodeSource) { + if (node.forceByteString + || (!ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8) && !ctx.compilerOptions.isUnicodeSource)) { // Under `no utf8` - create an octet string, unless it contains wide characters (> 255) // Wide characters (like \x{100}) force the string to be UTF-8 even without `use utf8` boolean hasWideChars = false; diff --git a/src/main/java/org/perlonjava/frontend/astnode/StringNode.java b/src/main/java/org/perlonjava/frontend/astnode/StringNode.java index 16d035eb2..cdb210b27 100644 --- a/src/main/java/org/perlonjava/frontend/astnode/StringNode.java +++ b/src/main/java/org/perlonjava/frontend/astnode/StringNode.java @@ -20,6 +20,13 @@ public class StringNode extends AbstractNode { */ public final boolean isVString; + /** + * Force this literal to be emitted as a byte string even in a C + * scope. Perl keeps ASCII and fixed-byte escapes such as "\xFC" unupgraded; + * actual non-ASCII source characters still use normal UTF-8 string emission. + */ + public final boolean forceByteString; + /** * Constructs a new StringNode with the specified string value. * @@ -29,6 +36,7 @@ public StringNode(String value, int tokenIndex) { this.value = value; this.tokenIndex = tokenIndex; this.isVString = false; + this.forceByteString = false; } /** @@ -42,6 +50,14 @@ public StringNode(String value, boolean isVString, int tokenIndex) { this.value = value; this.tokenIndex = tokenIndex; this.isVString = isVString; + this.forceByteString = false; + } + + public StringNode(String value, boolean isVString, boolean forceByteString, int tokenIndex) { + this.value = value; + this.tokenIndex = tokenIndex; + this.isVString = isVString; + this.forceByteString = forceByteString; } /** @@ -67,4 +83,3 @@ public void accept(Visitor visitor) { visitor.visit(this); } } - diff --git a/src/main/java/org/perlonjava/frontend/parser/ParserTables.java b/src/main/java/org/perlonjava/frontend/parser/ParserTables.java index 82a4dc8a8..28bb66c57 100644 --- a/src/main/java/org/perlonjava/frontend/parser/ParserTables.java +++ b/src/main/java/org/perlonjava/frontend/parser/ParserTables.java @@ -30,7 +30,7 @@ public class ParserTables { "die", "do", "dump", "exec", "exit", "fork", - "getpwuid", "glob", + "gethostbyname", "getpwuid", "glob", "hex", "kill", "oct", "open", diff --git a/src/main/java/org/perlonjava/frontend/parser/StringDoubleQuoted.java b/src/main/java/org/perlonjava/frontend/parser/StringDoubleQuoted.java index 8634ed1e5..b73ed5e43 100644 --- a/src/main/java/org/perlonjava/frontend/parser/StringDoubleQuoted.java +++ b/src/main/java/org/perlonjava/frontend/parser/StringDoubleQuoted.java @@ -419,6 +419,10 @@ private void parseDoubleQuotedEscapesRegex() { switch (escape) { // Case modification end marker case "E" -> { + if (isInsideRegexCharClass()) { + appendToCurrentSegment("\\E"); + return; + } // Flush any pending literal text flushCurrentSegment(); // Pop and apply the most recent case modifier @@ -436,6 +440,10 @@ private void parseDoubleQuotedEscapesRegex() { // Quotemeta modifier case "Q" -> { + if (isInsideRegexCharClass()) { + appendToCurrentSegment("\\Q"); + return; + } flushCurrentSegment(); caseModifiers.push(new CaseModifier("Q", false)); } diff --git a/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java b/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java index 54d9760dc..7506b1361 100644 --- a/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java +++ b/src/main/java/org/perlonjava/frontend/parser/StringSegmentParser.java @@ -20,6 +20,7 @@ import static org.perlonjava.frontend.parser.ParseBlock.parseBlock; import static org.perlonjava.frontend.parser.Variable.parseArrayHashAccess; +import static org.perlonjava.runtime.perlmodule.Strict.HINT_UTF8; /** * Base class for parsing strings with segments and variable interpolation. @@ -74,6 +75,9 @@ public abstract class StringSegmentParser { * Buffer for accumulating literal text segments */ protected final StringBuilder currentSegment; + private boolean currentSegmentHasSourceNonAscii = false; + private boolean inRegexCharClass = false; + private boolean regexCharClassFirst = false; /** * List of AST nodes representing string segments (literals and interpolated expressions) */ @@ -128,6 +132,35 @@ protected void appendToCurrentSegment(String text) { currentSegment.append(text); } + protected void appendLiteralToCurrentSegment(String text) { + appendToCurrentSegment(text); + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + updateRegexCharClassState(c); + if (c > 127) { + currentSegmentHasSourceNonAscii = true; + } + } + } + + protected boolean isInsideRegexCharClass() { + return isRegex && inRegexCharClass; + } + + private void updateRegexCharClassState(char c) { + if (!isRegex) { + return; + } + if (c == '[' && !inRegexCharClass) { + inRegexCharClass = true; + regexCharClassFirst = true; + } else if (c == ']' && inRegexCharClass && !regexCharClassFirst) { + inRegexCharClass = false; + } else if (inRegexCharClass && regexCharClassFirst && c != '^') { + regexCharClassFirst = false; + } + } + /** * Adds a string segment node to the segments list. * @@ -150,9 +183,28 @@ protected void addStringSegment(Node node) { */ protected void flushCurrentSegment() { if (!currentSegment.isEmpty()) { - addStringSegment(new StringNode(currentSegment.toString(), tokenIndex)); + String value = currentSegment.toString(); + boolean forceByteString = shouldForceByteStringLiteral(value); + addStringSegment(new StringNode(value, false, forceByteString, tokenIndex)); currentSegment.setLength(0); + currentSegmentHasSourceNonAscii = false; + } + } + + private boolean shouldForceByteStringLiteral(String value) { + if (!ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8) + && !ctx.compilerOptions.isUnicodeSource) { + return false; + } + if (currentSegmentHasSourceNonAscii) { + return false; + } + for (int i = 0; i < value.length(); i++) { + if (value.charAt(i) > 255) { + return false; + } } + return true; } /** @@ -639,7 +691,7 @@ public Node parse() { continue; } else { // No heredocs pending, append the newline normally - appendToCurrentSegment(token.text); + appendLiteralToCurrentSegment(token.text); } continue; } @@ -650,7 +702,7 @@ public Node parse() { } // Default: append literal text to current segment - appendToCurrentSegment(text); + appendLiteralToCurrentSegment(text); } if (CompilerOptions.DEBUG_ENABLED) ctx.logDebug("StringSegmentParser.parse: Finished parsing, segments count: " + segments.size()); @@ -1337,4 +1389,4 @@ void handleUnicodeNameEscape() { appendToCurrentSegment("N{" + nameBuilder); } } -} \ No newline at end of file +} diff --git a/src/main/java/org/perlonjava/runtime/operators/Directory.java b/src/main/java/org/perlonjava/runtime/operators/Directory.java index abe071f92..92d828966 100644 --- a/src/main/java/org/perlonjava/runtime/operators/Directory.java +++ b/src/main/java/org/perlonjava/runtime/operators/Directory.java @@ -92,8 +92,14 @@ public static RuntimeScalar chdir(RuntimeScalar runtimeScalar) { File absoluteDir = RuntimeIO.resolveFile(dirName); if (absoluteDir.exists() && absoluteDir.isDirectory()) { - // Normalize the path to remove redundant . and .. components - System.setProperty("user.dir", absoluteDir.toPath().normalize().toString()); + try { + // Match getcwd(3): collapse . and .., and resolve symlinks like + // macOS /var -> /private/var after chdir(). + System.setProperty("user.dir", absoluteDir.getCanonicalPath()); + } catch (IOException e) { + handleIOException(e, "chdir failed"); + return scalarFalse; + } return scalarTrue; } else { // Set errno to ENOENT (No such file or directory) diff --git a/src/main/java/org/perlonjava/runtime/operators/StringOperators.java b/src/main/java/org/perlonjava/runtime/operators/StringOperators.java index 2ed00e770..2cde2639f 100644 --- a/src/main/java/org/perlonjava/runtime/operators/StringOperators.java +++ b/src/main/java/org/perlonjava/runtime/operators/StringOperators.java @@ -142,6 +142,9 @@ public static RuntimeScalar quotemeta(RuntimeScalar runtimeScalar) { * @return a {@link RuntimeScalar} with the case-folded string */ public static RuntimeScalar fc(RuntimeScalar runtimeScalar) { + if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { + return caseFoldBytesAsciiOnly(runtimeScalar); + } String str = runtimeScalar.toString(); // Perform full Unicode case folding using ICU4J CaseMap // Note: We do NOT use NFKC normalization because Perl's fc() preserves @@ -174,6 +177,9 @@ public static RuntimeScalar fcBytes(RuntimeScalar runtimeScalar) { * @return a {@link RuntimeScalar} with the lowercase string */ public static RuntimeScalar lc(RuntimeScalar runtimeScalar) { + if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { + return caseFoldBytesAsciiOnly(runtimeScalar); + } // Convert the string to lowercase using ICU4J for proper Unicode handling String str = UCharacter.toLowerCase(runtimeScalar.toString()); return makeStringResult(str, runtimeScalar); @@ -196,6 +202,9 @@ public static RuntimeScalar lcBytes(RuntimeScalar runtimeScalar) { * @return a {@link RuntimeScalar} with the first character in lowercase */ public static RuntimeScalar lcfirst(RuntimeScalar runtimeScalar) { + if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { + return lcfirstBytes(runtimeScalar); + } String str = runtimeScalar.toString(); // Check if the string is empty if (str.isEmpty()) { @@ -218,6 +227,9 @@ public static RuntimeScalar lcfirst(RuntimeScalar runtimeScalar) { * @return a {@link RuntimeScalar} with the uppercase string */ public static RuntimeScalar uc(RuntimeScalar runtimeScalar) { + if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { + return uppercaseBytesAsciiOnly(runtimeScalar); + } // Convert the string to uppercase using ICU4J for proper Unicode handling String str = UCharacter.toUpperCase(runtimeScalar.toString()); return makeStringResult(str, runtimeScalar); @@ -232,6 +244,9 @@ public static RuntimeScalar uc(RuntimeScalar runtimeScalar) { * @return a {@link RuntimeScalar} with the first character in titlecase */ public static RuntimeScalar ucfirst(RuntimeScalar runtimeScalar) { + if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { + return ucfirstBytes(runtimeScalar); + } String str = runtimeScalar.toString(); // Check if the string is empty if (str.isEmpty()) { diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java b/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java index 3485cdc8e..351fdfd98 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java @@ -583,6 +583,7 @@ private static RuntimeScalar encodeWithCharset(String string, Charset charset, S StringBuilder result = new StringBuilder(); CharBuffer input = CharBuffer.wrap(string); ByteBuffer output = ByteBuffer.allocate((int) (string.length() * encoder.maxBytesPerChar()) + 4); + boolean stoppedOnError = false; while (input.hasRemaining()) { encoder.reset(); @@ -600,6 +601,7 @@ private static RuntimeScalar encodeWithCharset(String string, Charset charset, S int badChar = input.get(); // consume the bad character String replacement = handleEncodingError(check, codeRef, badChar, encodingName, true); if (replacement == null) { + stoppedOnError = true; // FB_QUIET: stop processing, put back unprocessed chars if ((check & LEAVE_SRC) == 0 && srcArgs != null && srcArgs.size() > srcArgIndex) { StringBuilder remaining = new StringBuilder(); @@ -635,7 +637,7 @@ private static RuntimeScalar encodeWithCharset(String string, Charset charset, S resultScalar.value = result.toString(); // Update source if LEAVE_SRC is not set (remove processed chars) - if ((check & LEAVE_SRC) == 0 && (check & RETURN_ON_ERR) == 0 + if ((check & LEAVE_SRC) == 0 && !stoppedOnError && srcArgs != null && srcArgs.size() > srcArgIndex) { srcArgs.get(srcArgIndex).set(""); } @@ -707,6 +709,7 @@ private static RuntimeScalar decodeWithCharset(byte[] bytes, Charset charset, St ByteBuffer input = ByteBuffer.wrap(bytes); CharBuffer output = CharBuffer.allocate(bytes.length * 2 + 4); StringBuilder result = new StringBuilder(); + boolean stoppedOnError = false; while (input.hasRemaining()) { decoder.reset(); @@ -724,6 +727,7 @@ private static RuntimeScalar decodeWithCharset(byte[] bytes, Charset charset, St } String replacement = handleEncodingError(check, codeRef, badBytes, encodingName, false); if (replacement == null) { + stoppedOnError = true; // FB_QUIET: stop processing if ((check & LEAVE_SRC) == 0 && srcArgs != null && srcArgs.size() > srcArgIndex) { byte[] remaining = new byte[input.remaining() + malformedLen]; @@ -751,7 +755,7 @@ private static RuntimeScalar decodeWithCharset(byte[] bytes, Charset charset, St result.append(output); // Update source if LEAVE_SRC is not set - if ((check & LEAVE_SRC) == 0 && (check & RETURN_ON_ERR) == 0 + if ((check & LEAVE_SRC) == 0 && !stoppedOnError && srcArgs != null && srcArgs.size() > srcArgIndex) { srcArgs.get(srcArgIndex).set(""); } diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/storable/Hooks.java b/src/main/java/org/perlonjava/runtime/perlmodule/storable/Hooks.java index 12f4d7949..b3f149129 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/storable/Hooks.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/storable/Hooks.java @@ -4,6 +4,8 @@ import java.util.ArrayList; import java.util.List; +import org.perlonjava.runtime.operators.ModuleOperators; +import org.perlonjava.runtime.runtimetypes.GlobalVariable; import org.perlonjava.runtime.mro.InheritanceResolver; import org.perlonjava.runtime.operators.ReferenceOperators; import org.perlonjava.runtime.runtimetypes.RuntimeArray; @@ -130,6 +132,8 @@ public static RuntimeScalar readHook(StorableReader r, StorableContext c) { } } + requireClassForHook(classname); + // Step 6a: if the class defines STORABLE_attach, prefer that over // STORABLE_thaw. The attach hook is a CLASS method that returns a // fully-formed object; we replace the placeholder with the @@ -253,6 +257,19 @@ private static String readClassname(StorableContext c, int flags) { return name; } + private static void requireClassForHook(String classname) { + if (classname == null || classname.isEmpty()) return; + if (classname.equals("main") || classname.equals("UNIVERSAL")) return; + String filename = classname.replace("::", "/").replace("'", "/") + ".pm"; + RuntimeHash inc = GlobalVariable.getGlobalHash("main::INC"); + if (inc.exists(new RuntimeScalar(filename)).getBoolean()) return; + try { + ModuleOperators.require(new RuntimeScalar(filename)); + } catch (Exception ignored) { + // Some blessed data-only packages have no loadable module. + } + } + private static void invokeThaw(String classname, RuntimeScalar self, String frozen, List extraRefs) { RuntimeScalar thawMethod = InheritanceResolver.findMethodInHierarchy( diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/storable/Refs.java b/src/main/java/org/perlonjava/runtime/perlmodule/storable/Refs.java index 1339dc4f7..a4606877c 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/storable/Refs.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/storable/Refs.java @@ -1,7 +1,5 @@ package org.perlonjava.runtime.perlmodule.storable; -import org.perlonjava.runtime.runtimetypes.RuntimeArray; -import org.perlonjava.runtime.runtimetypes.RuntimeHash; import org.perlonjava.runtime.runtimetypes.RuntimeScalar; import org.perlonjava.runtime.runtimetypes.WeakRefRegistry; import org.perlonjava.runtime.runtimetypes.RuntimeScalarType; @@ -154,15 +152,12 @@ public static RuntimeScalar readWeakOverload(StorableReader r, StorableContext c */ private static void installReferent(RuntimeScalar refScalar, RuntimeScalar referent, boolean bodyWasBare) { if (bodyWasBare) { - // Bare-container body: collapse the redundant SX_REF wrap. - // The fresh reference we attach must point at the SAME - // underlying RuntimeArray/RuntimeHash as `referent` so - // mutations through either alias (or backref tags pointing - // at the seen-table entry of the container) stay coherent. - if (referent.value instanceof RuntimeArray arr) { - refScalar.set(arr.createReference()); - } else if (referent.value instanceof RuntimeHash hash) { - refScalar.set(hash.createReference()); + // Bare body: collapse the redundant SX_REF wrap. The fresh + // reference we attach must preserve the SAME reference shape + // and underlying referent, so mutations and blessing remain + // coherent for arrays, hashes, scalar hooks, and backrefs. + if (RuntimeScalarType.isReference(referent)) { + refScalar.set(referent); } else { // Bare flag set but not a recognised container — fall // back to a fresh scalar reference. Defensive; should diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexQuoteMeta.java b/src/main/java/org/perlonjava/runtime/regex/RegexQuoteMeta.java index 12264e054..d63e060bf 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexQuoteMeta.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexQuoteMeta.java @@ -1,17 +1,50 @@ package org.perlonjava.runtime.regex; +import org.perlonjava.runtime.operators.WarnDie; +import org.perlonjava.runtime.runtimetypes.RuntimeScalar; + public class RegexQuoteMeta { public static String escapeQ(String s) { StringBuilder sb = new StringBuilder(); int len = s.length(); int offset = 0; + boolean inCharClass = false; + boolean charClassFirst = false; + boolean escaped = false; // Predefined set of regex metacharacters final String regexMetacharacters = "-.+*?[](){}^$|\\"; while (offset < len) { char c = s.charAt(offset); + if (escaped) { + if (inCharClass && (c == 'Q' || c == 'E')) { + warnUnrecognizedCharClassEscape(c); + sb.append(c); + if (charClassFirst && c != '^') { + charClassFirst = false; + } + escaped = false; + offset++; + continue; + } + sb.append('\\'); + sb.append(c); + escaped = false; + offset++; + continue; + } + if (c == '\\' && offset + 1 < len && s.charAt(offset + 1) == 'Q') { + if (inCharClass) { + warnUnrecognizedCharClassEscape('Q'); + sb.append('Q'); + if (charClassFirst) { + charClassFirst = false; + } + offset += 2; + continue; + } // Skip past \Q offset += 2; @@ -32,11 +65,34 @@ public static String escapeQ(String s) { offset++; } } else { + if (c == '\\') { + escaped = true; + offset++; + continue; + } + if (c == '[' && !inCharClass) { + inCharClass = true; + charClassFirst = true; + } else if (c == ']' && inCharClass && !charClassFirst) { + inCharClass = false; + } else if (inCharClass && charClassFirst && c != '^') { + charClassFirst = false; + } sb.append(c); offset++; } } + if (escaped) { + sb.append('\\'); + } return sb.toString(); } + + private static void warnUnrecognizedCharClassEscape(char c) { + WarnDie.warn( + new RuntimeScalar("Unrecognized escape \\" + c + + " in character class passed through in regex\n"), + new RuntimeScalar("")); + } } diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java index 20e82bcb6..f8b3f6f50 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java @@ -2011,13 +2011,18 @@ private static RuntimeList callCachedInner(int callsiteId, } } - // Prefer PerlSubroutine interface over MethodHandle - if (cachedCode.subroutine != null) { - return cachedCode.subroutine.apply(a, callContext); - } else if (cachedCode.isStatic) { - return (RuntimeList) cachedCode.methodHandle.invoke(a, callContext); - } else { - return (RuntimeList) cachedCode.methodHandle.invoke(cachedCode.codeObject, a, callContext); + MortalList.pushMark(); + try { + // Prefer PerlSubroutine interface over MethodHandle + if (cachedCode.subroutine != null) { + return cachedCode.subroutine.apply(a, callContext); + } else if (cachedCode.isStatic) { + return (RuntimeList) cachedCode.methodHandle.invoke(a, callContext); + } else { + return (RuntimeList) cachedCode.methodHandle.invoke(cachedCode.codeObject, a, callContext); + } + } finally { + MortalList.popMark(); } } catch (Throwable e) { if (e instanceof RuntimeException) throw (RuntimeException) e; diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java index 6e662aa20..339009887 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java @@ -1261,10 +1261,11 @@ private RuntimeScalar setLargeRefCounted(RuntimeScalar value) { // Update ownership: this scalar now owns a refCount iff we incremented. this.refCountOwned = newOwned; - // Flush deferred mortal decrements. This is the primary flush point for - // the mortal mechanism — called after every assignment involving references. + // Flush deferred mortal decrements from the current function scope. + // Entries below the current MortalList mark belong to the caller, and + // must survive through nested calls such as chained AUTOLOAD dispatch. // Cost when MortalList.active is false: one boolean check (trivially predicted). - MortalList.flush(); + MortalList.flushAboveMark(); return this; } diff --git a/src/test/resources/unit/autoload.t b/src/test/resources/unit/autoload.t index 7e0c16fa8..428663ec9 100644 --- a/src/test/resources/unit/autoload.t +++ b/src/test/resources/unit/autoload.t @@ -3,7 +3,7 @@ use warnings; package X; -use Test::More tests => 5; +use Test::More tests => 6; sub x { callme(@_); @@ -53,4 +53,30 @@ b(789); "inherited AUTOLOAD works for multiple method calls"); } +# Regression: cached AUTOLOAD method calls must keep method-chain +# temporaries alive until the AUTOLOAD body has read $AUTOLOAD. Without +# the cached-call mortal boundary, the temporary invocant's DESTROY +# AUTOLOAD overwrote $AUTOLOAD between dispatch and entry. +{ + package ChainAutoloadBase; + sub missing; + sub AUTOLOAD { + our $AUTOLOAD; + my $method = $AUTOLOAD; + $method =~ s/.*:://; + return if $method eq 'DESTROY'; + return "$method:" . ref($_[0]); + } + + package ChainAutoloadChild; + our @ISA = ("ChainAutoloadBase"); + sub new { bless {}, shift } + + package main; + my $first = ChainAutoloadChild->new->missing; + my $second = ChainAutoloadChild->new->missing; + X::is($second, "missing:ChainAutoloadChild", + "cached inherited AUTOLOAD resets \$AUTOLOAD before chained call body"); +} + 1; diff --git a/src/test/resources/unit/encode_fb_quiet.t b/src/test/resources/unit/encode_fb_quiet.t new file mode 100644 index 000000000..5e371b112 --- /dev/null +++ b/src/test/resources/unit/encode_fb_quiet.t @@ -0,0 +1,20 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Test::More tests => 6; +use Encode qw(encode decode find_encoding FB_QUIET LEAVE_SRC); + +my $enc_src = "abc"; +my $encoded = encode("UTF-8", $enc_src, FB_QUIET); +is($encoded, "abc", "encode FB_QUIET returns encoded bytes"); +is($enc_src, "", "encode FB_QUIET consumes source on success"); + +my $dec_src = "abc"; +my $decoded = decode("UTF-8", $dec_src, FB_QUIET); +is($decoded, "abc", "decode FB_QUIET returns decoded string"); +is($dec_src, "", "decode FB_QUIET consumes source on success"); + +my $leave_src = "abc"; +is(decode("UTF-8", $leave_src, FB_QUIET | LEAVE_SRC), "abc", + "decode FB_QUIET with LEAVE_SRC still decodes"); +is($leave_src, "abc", "LEAVE_SRC preserves source"); diff --git a/src/test/resources/unit/operator_overrides.t b/src/test/resources/unit/operator_overrides.t index c9c670df5..524a58f13 100644 --- a/src/test/resources/unit/operator_overrides.t +++ b/src/test/resources/unit/operator_overrides.t @@ -189,4 +189,20 @@ subtest 'sleep operator override' => sub { is_deeply(\@sleep_args, [5], 'sleep override saw the right argument'); }; +subtest 'gethostbyname operator override' => sub { + plan tests => 2; + + BEGIN { + *CORE::GLOBAL::gethostbyname = sub { + die "unexpected list context" if wantarray; + return "mocked:$_[0]"; + }; + } + + is(gethostbyname("www.perl.org."), "mocked:www.perl.org.", + 'gethostbyname overridden globally'); + ok(defined CORE::gethostbyname("localhost"), + 'CORE::gethostbyname still bypasses override'); +}; + done_testing(); diff --git a/src/test/resources/unit/regex/regex_charclass.t b/src/test/resources/unit/regex/regex_charclass.t index 4e4dd588f..344be1e8d 100644 --- a/src/test/resources/unit/regex/regex_charclass.t +++ b/src/test/resources/unit/regex/regex_charclass.t @@ -60,4 +60,19 @@ subtest 'bracketed \c? matches DEL only' => sub { ok("color" =~ /colou?r/, "? quantifier still works (absent)"); }; +subtest 'bracketed \Q and \E are literal with warnings' => sub { + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, @_ }; + + my $re = qr/[\Qabc\E]/; + ok("Q" =~ $re, '\Q is passed through as literal Q in a character class'); + ok("E" =~ $re, '\E is passed through as literal E in a character class'); + ok("a" =~ $re, 'character class contents still match'); + ok("d" !~ $re, 'characters outside the class do not match'); + like(join("", @warnings), qr/Unrecognized escape \\Q in character class passed through in regex/, + '\Q warning is emitted'); + like(join("", @warnings), qr/Unrecognized escape \\E in character class passed through in regex/, + '\E warning is emitted'); +}; + done_testing(); diff --git a/src/test/resources/unit/storable.t b/src/test/resources/unit/storable.t index 7381df039..e6d6f5f42 100644 --- a/src/test/resources/unit/storable.t +++ b/src/test/resources/unit/storable.t @@ -7,7 +7,7 @@ use File::Temp qw(tempfile); use Storable qw(store retrieve nstore freeze thaw nfreeze dclone); # Test plan -plan tests => 10; +plan tests => 11; subtest 'Basic scalar serialization' => sub { plan tests => 6; @@ -319,4 +319,36 @@ subtest 'STORABLE_freeze nested hook cookie round-trip (binary-safe)' => sub { isa_ok($thawed->{inner}, '_StTestInner', 'inner hooked object survives'); }; +subtest 'STORABLE_freeze scalar hook keeps one ref level' => sub { + plan tests => 3; + + package _StScalarHook; + use overload '""' => sub { ${ $_[0] } }, fallback => 1; + + sub new { + my ($class, $value) = @_; + return bless \$value, $class; + } + + sub STORABLE_freeze { + my ($self, $cloning) = @_; + return $$self; + } + + sub STORABLE_thaw { + my ($self, $cloning, $ice) = @_; + $$self = $ice; + } + + package main; + + my $frozen = freeze([ _StScalarHook->new("http://search.cpan.org") ]); + my $thawed = thaw($frozen); + my $obj = $thawed->[0]; + + is(ref($obj), '_StScalarHook', 'scalar hook object class survives'); + is($$obj, 'http://search.cpan.org', 'scalar hook referent survives'); + is("$obj", 'http://search.cpan.org', 'scalar hook overload sees one ref level'); +}; + done_testing(); diff --git a/src/test/resources/unit/utf8_pragma.t b/src/test/resources/unit/utf8_pragma.t index 5e3d733ba..513b2a290 100644 --- a/src/test/resources/unit/utf8_pragma.t +++ b/src/test/resources/unit/utf8_pragma.t @@ -179,11 +179,18 @@ subtest 'Escape sequences vs source encoding' => sub { my $hex = "\x{100}"; # Creates Unicode string is(length($hex), 1, 'Unicode escape creates single character with utf8'); is(ord($hex), 256, 'Unicode escape value with utf8'); - + + my $byte_escape = "B\xFCcher"; + ok(!utf8::is_utf8($byte_escape), 'fixed byte escape is not UTF-8 flagged under utf8'); + is(lc("\xC3\xBCri"), "\xC3\xBCri", 'lc preserves non-ASCII bytes from fixed byte escapes'); + # Literal source is treated as characters my $literal = "Ā"; is(length($literal), 1, 'Literal Ā is 1 character with utf8'); is(ord($literal), 256, 'Literal Ā has correct value with utf8'); + + my $latin1_literal = "ü"; + ok(utf8::is_utf8($latin1_literal), 'literal non-ASCII source is UTF-8 flagged under utf8'); } }; @@ -257,4 +264,3 @@ subtest 'Octet vs character semantics' => sub { }; done_testing(); - From ece9ee13c1b3d8f480d573076eb65f49194f2f8f Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Sat, 9 May 2026 08:47:53 +0200 Subject: [PATCH 2/2] fix: restore regex and fold counts for URI PR Respect unicode_strings when applying case operations to byte strings, while keeping byte semantics for ordinary byte strings used by URI. Handle source \Q...\E in regex character classes as quotemeta, while preserving Perl warnings for interpolated \Q/\E inside character classes. Generated with [Codex](https://openai.com/codex) Co-Authored-By: Codex --- .../backend/bytecode/BytecodeInterpreter.java | 5 +- .../backend/bytecode/CompileOperator.java | 25 +++++++-- .../backend/bytecode/Disassemble.java | 5 ++ .../perlonjava/backend/bytecode/Opcodes.java | 25 +++++++++ .../bytecode/ScalarUnaryOpcodeHandler.java | 15 ++++++ .../perlonjava/backend/jvm/EmitOperator.java | 15 ++++-- .../frontend/parser/StringDoubleQuoted.java | 8 --- .../runtime/operators/StringOperators.java | 53 ++++++++++++++++++- .../resources/unit/regex/regex_charclass.t | 31 ++++++++--- 9 files changed, 155 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java index 226b7099b..96a2af895 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeInterpreter.java @@ -2056,7 +2056,10 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c Opcodes.ABS, Opcodes.BINARY_NOT, Opcodes.BITWISE_NOT, Opcodes.INTEGER_BITWISE_NOT, Opcodes.ORD, Opcodes.ORD_BYTES, Opcodes.OCT, Opcodes.HEX, Opcodes.SRAND, Opcodes.CHR, Opcodes.CHR_BYTES, Opcodes.LENGTH_BYTES, Opcodes.QUOTEMETA, Opcodes.FC, Opcodes.LC, - Opcodes.LCFIRST, Opcodes.UC, Opcodes.UCFIRST, Opcodes.SLEEP, Opcodes.TELL, + Opcodes.LCFIRST, Opcodes.UC, Opcodes.UCFIRST, Opcodes.FC_BYTES, Opcodes.LC_BYTES, + Opcodes.LCFIRST_BYTES, Opcodes.UC_BYTES, Opcodes.UCFIRST_BYTES, Opcodes.FC_UNICODE, + Opcodes.LC_UNICODE, Opcodes.LCFIRST_UNICODE, Opcodes.UC_UNICODE, Opcodes.UCFIRST_UNICODE, + Opcodes.TO_BYTES_STRING, Opcodes.SLEEP, Opcodes.TELL, Opcodes.RMDIR, Opcodes.CLOSEDIR, Opcodes.REWINDDIR, Opcodes.TELLDIR, Opcodes.CHDIR, Opcodes.EXIT -> { pc = ScalarUnaryOpcodeHandler.execute(opcode, bytecode, pc, registers); diff --git a/src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java b/src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java index d3b015015..160ee033b 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java +++ b/src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java @@ -20,6 +20,16 @@ private static void compileScalarOperand(BytecodeCompiler bc, OperatorNode node, } } + private static short selectCaseOpcode(BytecodeCompiler bc, short normalOpcode, short bytesOpcode, short unicodeOpcode) { + if (bc.isBytesEnabled()) { + return bytesOpcode; + } + if (bc.symbolTable != null && bc.symbolTable.isFeatureCategoryEnabled("unicode_strings")) { + return unicodeOpcode; + } + return normalOpcode; + } + private static int compileArrayForExistsDelete(BytecodeCompiler bc, BinaryOperatorNode arrayAccess, int tokenIndex) { if (!(arrayAccess.left instanceof OperatorNode leftOp) || !leftOp.operator.equals("$") || !(leftOp.operand instanceof IdentifierNode)) { @@ -682,11 +692,16 @@ public static void visitOperator(BytecodeCompiler bytecodeCompiler, OperatorNode case "chrBytes" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.CHR_BYTES); case "lengthBytes" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.LENGTH_BYTES); case "quotemeta" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.QUOTEMETA); - case "fc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.FC_BYTES : Opcodes.FC); - case "lc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.LC_BYTES : Opcodes.LC); - case "lcfirst" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.LCFIRST_BYTES : Opcodes.LCFIRST); - case "uc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.UC_BYTES : Opcodes.UC); - case "ucfirst" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.UCFIRST_BYTES : Opcodes.UCFIRST); + case "fc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, + selectCaseOpcode(bytecodeCompiler, Opcodes.FC, Opcodes.FC_BYTES, Opcodes.FC_UNICODE)); + case "lc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, + selectCaseOpcode(bytecodeCompiler, Opcodes.LC, Opcodes.LC_BYTES, Opcodes.LC_UNICODE)); + case "lcfirst" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, + selectCaseOpcode(bytecodeCompiler, Opcodes.LCFIRST, Opcodes.LCFIRST_BYTES, Opcodes.LCFIRST_UNICODE)); + case "uc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, + selectCaseOpcode(bytecodeCompiler, Opcodes.UC, Opcodes.UC_BYTES, Opcodes.UC_UNICODE)); + case "ucfirst" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, + selectCaseOpcode(bytecodeCompiler, Opcodes.UCFIRST, Opcodes.UCFIRST_BYTES, Opcodes.UCFIRST_UNICODE)); case "tell" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.TELL); case "rmdir" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.RMDIR); case "closedir" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.CLOSEDIR); diff --git a/src/main/java/org/perlonjava/backend/bytecode/Disassemble.java b/src/main/java/org/perlonjava/backend/bytecode/Disassemble.java index d440a10ed..1faceb126 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/Disassemble.java +++ b/src/main/java/org/perlonjava/backend/bytecode/Disassemble.java @@ -1574,14 +1574,19 @@ public static String disassemble(InterpretedCode interpretedCode) { case Opcodes.QUOTEMETA: case Opcodes.FC: case Opcodes.FC_BYTES: + case Opcodes.FC_UNICODE: case Opcodes.LC: case Opcodes.LC_BYTES: + case Opcodes.LC_UNICODE: case Opcodes.LCFIRST: case Opcodes.LCFIRST_BYTES: + case Opcodes.LCFIRST_UNICODE: case Opcodes.UC: case Opcodes.UC_BYTES: + case Opcodes.UC_UNICODE: case Opcodes.UCFIRST: case Opcodes.UCFIRST_BYTES: + case Opcodes.UCFIRST_UNICODE: case Opcodes.TO_BYTES_STRING: case Opcodes.SLEEP: case Opcodes.TELL: diff --git a/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java b/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java index 5cf146072..5bb1c6feb 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java +++ b/src/main/java/org/perlonjava/backend/bytecode/Opcodes.java @@ -2321,6 +2321,31 @@ public class Opcodes { */ public static final short HASH_DEREF_FETCH_NONSTRICT_FOR_LOCAL = 484; + /** + * Fold case under unicode_strings: rd = StringOperators.fcUnicode(rs) + */ + public static final short FC_UNICODE = 485; + + /** + * Lowercase under unicode_strings: rd = StringOperators.lcUnicode(rs) + */ + public static final short LC_UNICODE = 486; + + /** + * Lowercase first under unicode_strings: rd = StringOperators.lcfirstUnicode(rs) + */ + public static final short LCFIRST_UNICODE = 487; + + /** + * Uppercase under unicode_strings: rd = StringOperators.ucUnicode(rs) + */ + public static final short UC_UNICODE = 488; + + /** + * Uppercase first under unicode_strings: rd = StringOperators.ucfirstUnicode(rs) + */ + public static final short UCFIRST_UNICODE = 489; + private Opcodes() { } // Utility class - no instantiation } diff --git a/src/main/java/org/perlonjava/backend/bytecode/ScalarUnaryOpcodeHandler.java b/src/main/java/org/perlonjava/backend/bytecode/ScalarUnaryOpcodeHandler.java index 8aefeaac1..4aedd386b 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/ScalarUnaryOpcodeHandler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/ScalarUnaryOpcodeHandler.java @@ -43,14 +43,19 @@ public static int execute(int opcode, int[] bytecode, int pc, case Opcodes.QUOTEMETA -> StringOperators.quotemeta((RuntimeScalar) registers[rs]); case Opcodes.FC -> StringOperators.fc((RuntimeScalar) registers[rs]); case Opcodes.FC_BYTES -> StringOperators.fcBytes((RuntimeScalar) registers[rs]); + case Opcodes.FC_UNICODE -> StringOperators.fcUnicode((RuntimeScalar) registers[rs]); case Opcodes.LC -> StringOperators.lc((RuntimeScalar) registers[rs]); case Opcodes.LC_BYTES -> StringOperators.lcBytes((RuntimeScalar) registers[rs]); + case Opcodes.LC_UNICODE -> StringOperators.lcUnicode((RuntimeScalar) registers[rs]); case Opcodes.LCFIRST -> StringOperators.lcfirst((RuntimeScalar) registers[rs]); case Opcodes.LCFIRST_BYTES -> StringOperators.lcfirstBytes((RuntimeScalar) registers[rs]); + case Opcodes.LCFIRST_UNICODE -> StringOperators.lcfirstUnicode((RuntimeScalar) registers[rs]); case Opcodes.UC -> StringOperators.uc((RuntimeScalar) registers[rs]); case Opcodes.UC_BYTES -> StringOperators.ucBytes((RuntimeScalar) registers[rs]); + case Opcodes.UC_UNICODE -> StringOperators.ucUnicode((RuntimeScalar) registers[rs]); case Opcodes.UCFIRST -> StringOperators.ucfirst((RuntimeScalar) registers[rs]); case Opcodes.UCFIRST_BYTES -> StringOperators.ucfirstBytes((RuntimeScalar) registers[rs]); + case Opcodes.UCFIRST_UNICODE -> StringOperators.ucfirstUnicode((RuntimeScalar) registers[rs]); case Opcodes.TO_BYTES_STRING -> StringOperators.toBytesString((RuntimeScalar) registers[rs]); case Opcodes.SLEEP -> Time.sleep((RuntimeScalar) registers[rs]); case Opcodes.TELL -> IOOperator.tell((RuntimeScalar) registers[rs]); @@ -104,18 +109,28 @@ public static int disassemble(int opcode, int[] bytecode, int pc, case Opcodes.FC -> sb.append("FC r").append(rd).append(" = fc(r").append(rs).append(")\n"); case Opcodes.FC_BYTES -> sb.append("FC_BYTES r").append(rd).append(" = fcBytes(r").append(rs).append(")\n"); + case Opcodes.FC_UNICODE -> + sb.append("FC_UNICODE r").append(rd).append(" = fcUnicode(r").append(rs).append(")\n"); case Opcodes.LC -> sb.append("LC r").append(rd).append(" = lc(r").append(rs).append(")\n"); case Opcodes.LC_BYTES -> sb.append("LC_BYTES r").append(rd).append(" = lcBytes(r").append(rs).append(")\n"); + case Opcodes.LC_UNICODE -> + sb.append("LC_UNICODE r").append(rd).append(" = lcUnicode(r").append(rs).append(")\n"); case Opcodes.LCFIRST -> sb.append("LCFIRST r").append(rd).append(" = lcfirst(r").append(rs).append(")\n"); case Opcodes.LCFIRST_BYTES -> sb.append("LCFIRST_BYTES r").append(rd).append(" = lcfirstBytes(r").append(rs).append(")\n"); + case Opcodes.LCFIRST_UNICODE -> + sb.append("LCFIRST_UNICODE r").append(rd).append(" = lcfirstUnicode(r").append(rs).append(")\n"); case Opcodes.UC -> sb.append("UC r").append(rd).append(" = uc(r").append(rs).append(")\n"); case Opcodes.UC_BYTES -> sb.append("UC_BYTES r").append(rd).append(" = ucBytes(r").append(rs).append(")\n"); + case Opcodes.UC_UNICODE -> + sb.append("UC_UNICODE r").append(rd).append(" = ucUnicode(r").append(rs).append(")\n"); case Opcodes.UCFIRST -> sb.append("UCFIRST r").append(rd).append(" = ucfirst(r").append(rs).append(")\n"); case Opcodes.UCFIRST_BYTES -> sb.append("UCFIRST_BYTES r").append(rd).append(" = ucfirstBytes(r").append(rs).append(")\n"); + case Opcodes.UCFIRST_UNICODE -> + sb.append("UCFIRST_UNICODE r").append(rd).append(" = ucfirstUnicode(r").append(rs).append(")\n"); case Opcodes.TO_BYTES_STRING -> sb.append("TO_BYTES_STRING r").append(rd).append(" = toBytesString(r").append(rs).append(")\n"); case Opcodes.SLEEP -> sb.append("SLEEP r").append(rd).append(" = sleep(r").append(rs).append(")\n"); diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitOperator.java b/src/main/java/org/perlonjava/backend/jvm/EmitOperator.java index 681d25675..73f168c6d 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitOperator.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitOperator.java @@ -1507,7 +1507,7 @@ static void handleFcOperator(OperatorNode node, EmitterVisitor emitterVisitor) { } else { mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/operators/StringOperators", - "fc", + isUnicodeStringsEnabled(emitterVisitor) ? "fcUnicode" : "fc", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", false); } @@ -1531,7 +1531,7 @@ static void handleLcOperator(OperatorNode node, EmitterVisitor emitterVisitor) { } else { mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/operators/StringOperators", - "lc", + isUnicodeStringsEnabled(emitterVisitor) ? "lcUnicode" : "lc", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", false); } @@ -1555,7 +1555,7 @@ static void handleUcOperator(OperatorNode node, EmitterVisitor emitterVisitor) { } else { mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/operators/StringOperators", - "uc", + isUnicodeStringsEnabled(emitterVisitor) ? "ucUnicode" : "uc", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", false); } @@ -1579,7 +1579,7 @@ static void handleLcfirstOperator(OperatorNode node, EmitterVisitor emitterVisit } else { mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/operators/StringOperators", - "lcfirst", + isUnicodeStringsEnabled(emitterVisitor) ? "lcfirstUnicode" : "lcfirst", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", false); } @@ -1603,13 +1603,18 @@ static void handleUcfirstOperator(OperatorNode node, EmitterVisitor emitterVisit } else { mv.visitMethodInsn(Opcodes.INVOKESTATIC, "org/perlonjava/runtime/operators/StringOperators", - "ucfirst", + isUnicodeStringsEnabled(emitterVisitor) ? "ucfirstUnicode" : "ucfirst", "(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", false); } handleVoidContext(emitterVisitor); } + private static boolean isUnicodeStringsEnabled(EmitterVisitor emitterVisitor) { + return emitterVisitor.ctx.symbolTable != null + && emitterVisitor.ctx.symbolTable.isFeatureCategoryEnabled("unicode_strings"); + } + /** * Handles array-specific unary builtin operators. * diff --git a/src/main/java/org/perlonjava/frontend/parser/StringDoubleQuoted.java b/src/main/java/org/perlonjava/frontend/parser/StringDoubleQuoted.java index b73ed5e43..8634ed1e5 100644 --- a/src/main/java/org/perlonjava/frontend/parser/StringDoubleQuoted.java +++ b/src/main/java/org/perlonjava/frontend/parser/StringDoubleQuoted.java @@ -419,10 +419,6 @@ private void parseDoubleQuotedEscapesRegex() { switch (escape) { // Case modification end marker case "E" -> { - if (isInsideRegexCharClass()) { - appendToCurrentSegment("\\E"); - return; - } // Flush any pending literal text flushCurrentSegment(); // Pop and apply the most recent case modifier @@ -440,10 +436,6 @@ private void parseDoubleQuotedEscapesRegex() { // Quotemeta modifier case "Q" -> { - if (isInsideRegexCharClass()) { - appendToCurrentSegment("\\Q"); - return; - } flushCurrentSegment(); caseModifiers.push(new CaseModifier("Q", false)); } diff --git a/src/main/java/org/perlonjava/runtime/operators/StringOperators.java b/src/main/java/org/perlonjava/runtime/operators/StringOperators.java index 2cde2639f..9cf9e321a 100644 --- a/src/main/java/org/perlonjava/runtime/operators/StringOperators.java +++ b/src/main/java/org/perlonjava/runtime/operators/StringOperators.java @@ -101,12 +101,21 @@ public static RuntimeScalar toBytesString(RuntimeScalar runtimeScalar) { */ private static RuntimeScalar makeStringResult(String value, RuntimeScalar source) { RuntimeScalar result = new RuntimeScalar(value); - if (source.type == RuntimeScalarType.BYTE_STRING) { + if (source.type == RuntimeScalarType.BYTE_STRING && canRepresentAsByteString(value)) { result.type = RuntimeScalarType.BYTE_STRING; } return result; } + private static boolean canRepresentAsByteString(String value) { + for (int i = 0; i < value.length(); i++) { + if (value.charAt(i) > 0xFF) { + return false; + } + } + return true; + } + /** * Escapes all non-alphanumeric characters in the string representation of the given {@link RuntimeScalar}. * @@ -145,6 +154,14 @@ public static RuntimeScalar fc(RuntimeScalar runtimeScalar) { if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { return caseFoldBytesAsciiOnly(runtimeScalar); } + return fcUnicode(runtimeScalar); + } + + /** + * Performs full Unicode case folding, including Latin-1 byte strings. + * This is used under the unicode_strings feature. + */ + public static RuntimeScalar fcUnicode(RuntimeScalar runtimeScalar) { String str = runtimeScalar.toString(); // Perform full Unicode case folding using ICU4J CaseMap // Note: We do NOT use NFKC normalization because Perl's fc() preserves @@ -180,6 +197,14 @@ public static RuntimeScalar lc(RuntimeScalar runtimeScalar) { if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { return caseFoldBytesAsciiOnly(runtimeScalar); } + return lcUnicode(runtimeScalar); + } + + /** + * Converts to lowercase using Unicode semantics, including Latin-1 byte strings. + * This is used under the unicode_strings feature. + */ + public static RuntimeScalar lcUnicode(RuntimeScalar runtimeScalar) { // Convert the string to lowercase using ICU4J for proper Unicode handling String str = UCharacter.toLowerCase(runtimeScalar.toString()); return makeStringResult(str, runtimeScalar); @@ -205,6 +230,14 @@ public static RuntimeScalar lcfirst(RuntimeScalar runtimeScalar) { if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { return lcfirstBytes(runtimeScalar); } + return lcfirstUnicode(runtimeScalar); + } + + /** + * Converts the first character to lowercase using Unicode semantics. + * This is used under the unicode_strings feature. + */ + public static RuntimeScalar lcfirstUnicode(RuntimeScalar runtimeScalar) { String str = runtimeScalar.toString(); // Check if the string is empty if (str.isEmpty()) { @@ -230,6 +263,14 @@ public static RuntimeScalar uc(RuntimeScalar runtimeScalar) { if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { return uppercaseBytesAsciiOnly(runtimeScalar); } + return ucUnicode(runtimeScalar); + } + + /** + * Converts to uppercase using Unicode semantics, including Latin-1 byte strings. + * This is used under the unicode_strings feature. + */ + public static RuntimeScalar ucUnicode(RuntimeScalar runtimeScalar) { // Convert the string to uppercase using ICU4J for proper Unicode handling String str = UCharacter.toUpperCase(runtimeScalar.toString()); return makeStringResult(str, runtimeScalar); @@ -247,10 +288,18 @@ public static RuntimeScalar ucfirst(RuntimeScalar runtimeScalar) { if (runtimeScalar.type == RuntimeScalarType.BYTE_STRING) { return ucfirstBytes(runtimeScalar); } + return ucfirstUnicode(runtimeScalar); + } + + /** + * Converts the first character to titlecase using Unicode semantics. + * This is used under the unicode_strings feature. + */ + public static RuntimeScalar ucfirstUnicode(RuntimeScalar runtimeScalar) { String str = runtimeScalar.toString(); // Check if the string is empty if (str.isEmpty()) { - return new RuntimeScalar(str); + return makeStringResult(str, runtimeScalar); } int firstCodePoint = str.codePointAt(0); int charCount = Character.charCount(firstCodePoint); diff --git a/src/test/resources/unit/regex/regex_charclass.t b/src/test/resources/unit/regex/regex_charclass.t index 344be1e8d..731f90069 100644 --- a/src/test/resources/unit/regex/regex_charclass.t +++ b/src/test/resources/unit/regex/regex_charclass.t @@ -60,19 +60,38 @@ subtest 'bracketed \c? matches DEL only' => sub { ok("color" =~ /colou?r/, "? quantifier still works (absent)"); }; -subtest 'bracketed \Q and \E are literal with warnings' => sub { +subtest 'bracketed \Q...\E applies quotemeta' => sub { my @warnings; local $SIG{__WARN__} = sub { push @warnings, @_ }; my $re = qr/[\Qabc\E]/; - ok("Q" =~ $re, '\Q is passed through as literal Q in a character class'); - ok("E" =~ $re, '\E is passed through as literal E in a character class'); - ok("a" =~ $re, 'character class contents still match'); + ok("a" =~ $re, '\Q...\E contents match in a character class'); + ok("b" =~ $re, 'middle contents match'); + ok("c" =~ $re, 'last contents match'); + ok("Q" !~ $re, '\Q marker is not a literal Q'); + ok("E" !~ $re, '\E marker is not a literal E'); ok("d" !~ $re, 'characters outside the class do not match'); + + my $bracket = qr/[a\Q]\E]c/; + ok("ac" =~ $bracket, 'plain character still matches'); + ok("]c" =~ $bracket, 'quoted closing bracket remains inside the class'); + is(join("", @warnings), "", 'no warnings are emitted'); +}; + +subtest 'interpolated bracketed \Q and \E are literal with warnings' => sub { + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, @_ }; + + my $chars = '\Qabc\E'; + my $re = qr/[$chars]/; + ok("Q" =~ $re, 'interpolated \Q is passed through as literal Q'); + ok("E" =~ $re, 'interpolated \E is passed through as literal E'); + ok("a" =~ $re, 'interpolated character class contents still match'); + ok("d" !~ $re, 'characters outside the interpolated class do not match'); like(join("", @warnings), qr/Unrecognized escape \\Q in character class passed through in regex/, - '\Q warning is emitted'); + 'interpolated \Q warning is emitted'); like(join("", @warnings), qr/Unrecognized escape \\E in character class passed through in regex/, - '\E warning is emitted'); + 'interpolated \E warning is emitted'); }; done_testing();