Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1384,6 +1384,8 @@ public void visit(StringNode node) {
short opcode;
if (node.isVString) {
opcode = Opcodes.LOAD_VSTRING;
} else if (node.forceByteString) {
opcode = Opcodes.LOAD_BYTE_STRING;
} else if (emitterContext != null && emitterContext.symbolTable != null
&& !emitterContext.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
&& !emitterContext.compilerOptions.isUnicodeSource) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2056,7 +2056,10 @@ public static RuntimeList execute(InterpretedCode code, RuntimeArray args, int c
Opcodes.ABS, Opcodes.BINARY_NOT, Opcodes.BITWISE_NOT, Opcodes.INTEGER_BITWISE_NOT, Opcodes.ORD,
Opcodes.ORD_BYTES, Opcodes.OCT, Opcodes.HEX, Opcodes.SRAND, Opcodes.CHR,
Opcodes.CHR_BYTES, Opcodes.LENGTH_BYTES, Opcodes.QUOTEMETA, Opcodes.FC, Opcodes.LC,
Opcodes.LCFIRST, Opcodes.UC, Opcodes.UCFIRST, Opcodes.SLEEP, Opcodes.TELL,
Opcodes.LCFIRST, Opcodes.UC, Opcodes.UCFIRST, Opcodes.FC_BYTES, Opcodes.LC_BYTES,
Opcodes.LCFIRST_BYTES, Opcodes.UC_BYTES, Opcodes.UCFIRST_BYTES, Opcodes.FC_UNICODE,
Opcodes.LC_UNICODE, Opcodes.LCFIRST_UNICODE, Opcodes.UC_UNICODE, Opcodes.UCFIRST_UNICODE,
Opcodes.TO_BYTES_STRING, Opcodes.SLEEP, Opcodes.TELL,
Opcodes.RMDIR, Opcodes.CLOSEDIR, Opcodes.REWINDDIR, Opcodes.TELLDIR, Opcodes.CHDIR,
Opcodes.EXIT -> {
pc = ScalarUnaryOpcodeHandler.execute(opcode, bytecode, pc, registers);
Expand Down
25 changes: 20 additions & 5 deletions src/main/java/org/perlonjava/backend/bytecode/CompileOperator.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@ private static void compileScalarOperand(BytecodeCompiler bc, OperatorNode node,
}
}

/**
 * Chooses which of three opcode variants to emit for a case-mapping operator.
 *
 * @param bc            compiler whose state decides the variant
 * @param normalOpcode  returned when neither of the other conditions holds
 * @param bytesOpcode   returned when {@code bc.isBytesEnabled()} is true (checked first)
 * @param unicodeOpcode returned when the symbol table is present and reports the
 *                      {@code "unicode_strings"} feature category as enabled
 * @return the selected opcode
 */
private static short selectCaseOpcode(BytecodeCompiler bc, short normalOpcode, short bytesOpcode, short unicodeOpcode) {
    // Bytes mode takes precedence over the unicode_strings feature.
    if (bc.isBytesEnabled()) {
        return bytesOpcode;
    }
    boolean unicodeStrings = bc.symbolTable != null
            && bc.symbolTable.isFeatureCategoryEnabled("unicode_strings");
    return unicodeStrings ? unicodeOpcode : normalOpcode;
}

private static int compileArrayForExistsDelete(BytecodeCompiler bc, BinaryOperatorNode arrayAccess, int tokenIndex) {
if (!(arrayAccess.left instanceof OperatorNode leftOp) || !leftOp.operator.equals("$")
|| !(leftOp.operand instanceof IdentifierNode)) {
Expand Down Expand Up @@ -682,11 +692,16 @@ public static void visitOperator(BytecodeCompiler bytecodeCompiler, OperatorNode
case "chrBytes" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.CHR_BYTES);
case "lengthBytes" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.LENGTH_BYTES);
case "quotemeta" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.QUOTEMETA);
case "fc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.FC_BYTES : Opcodes.FC);
case "lc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.LC_BYTES : Opcodes.LC);
case "lcfirst" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.LCFIRST_BYTES : Opcodes.LCFIRST);
case "uc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.UC_BYTES : Opcodes.UC);
case "ucfirst" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, bytecodeCompiler.isBytesEnabled() ? Opcodes.UCFIRST_BYTES : Opcodes.UCFIRST);
case "fc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node,
selectCaseOpcode(bytecodeCompiler, Opcodes.FC, Opcodes.FC_BYTES, Opcodes.FC_UNICODE));
case "lc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node,
selectCaseOpcode(bytecodeCompiler, Opcodes.LC, Opcodes.LC_BYTES, Opcodes.LC_UNICODE));
case "lcfirst" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node,
selectCaseOpcode(bytecodeCompiler, Opcodes.LCFIRST, Opcodes.LCFIRST_BYTES, Opcodes.LCFIRST_UNICODE));
case "uc" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node,
selectCaseOpcode(bytecodeCompiler, Opcodes.UC, Opcodes.UC_BYTES, Opcodes.UC_UNICODE));
case "ucfirst" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node,
selectCaseOpcode(bytecodeCompiler, Opcodes.UCFIRST, Opcodes.UCFIRST_BYTES, Opcodes.UCFIRST_UNICODE));
case "tell" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.TELL);
case "rmdir" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.RMDIR);
case "closedir" -> visitSimpleUnaryWithDefault(bytecodeCompiler, node, Opcodes.CLOSEDIR);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1574,14 +1574,19 @@ public static String disassemble(InterpretedCode interpretedCode) {
case Opcodes.QUOTEMETA:
case Opcodes.FC:
case Opcodes.FC_BYTES:
case Opcodes.FC_UNICODE:
case Opcodes.LC:
case Opcodes.LC_BYTES:
case Opcodes.LC_UNICODE:
case Opcodes.LCFIRST:
case Opcodes.LCFIRST_BYTES:
case Opcodes.LCFIRST_UNICODE:
case Opcodes.UC:
case Opcodes.UC_BYTES:
case Opcodes.UC_UNICODE:
case Opcodes.UCFIRST:
case Opcodes.UCFIRST_BYTES:
case Opcodes.UCFIRST_UNICODE:
case Opcodes.TO_BYTES_STRING:
case Opcodes.SLEEP:
case Opcodes.TELL:
Expand Down
25 changes: 25 additions & 0 deletions src/main/java/org/perlonjava/backend/bytecode/Opcodes.java
Original file line number Diff line number Diff line change
Expand Up @@ -2321,6 +2321,31 @@ public class Opcodes {
*/
public static final short HASH_DEREF_FETCH_NONSTRICT_FOR_LOCAL = 484;

/**
* Fold case under unicode_strings: rd = StringOperators.fcUnicode(rs)
*/
public static final short FC_UNICODE = 485;

/**
* Lowercase under unicode_strings: rd = StringOperators.lcUnicode(rs)
*/
public static final short LC_UNICODE = 486;

/**
* Lowercase first under unicode_strings: rd = StringOperators.lcfirstUnicode(rs)
*/
public static final short LCFIRST_UNICODE = 487;

/**
* Uppercase under unicode_strings: rd = StringOperators.ucUnicode(rs)
*/
public static final short UC_UNICODE = 488;

/**
* Uppercase first under unicode_strings: rd = StringOperators.ucfirstUnicode(rs)
*/
public static final short UCFIRST_UNICODE = 489;

private Opcodes() {
} // Utility class - no instantiation
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,19 @@ public static int execute(int opcode, int[] bytecode, int pc,
case Opcodes.QUOTEMETA -> StringOperators.quotemeta((RuntimeScalar) registers[rs]);
case Opcodes.FC -> StringOperators.fc((RuntimeScalar) registers[rs]);
case Opcodes.FC_BYTES -> StringOperators.fcBytes((RuntimeScalar) registers[rs]);
case Opcodes.FC_UNICODE -> StringOperators.fcUnicode((RuntimeScalar) registers[rs]);
case Opcodes.LC -> StringOperators.lc((RuntimeScalar) registers[rs]);
case Opcodes.LC_BYTES -> StringOperators.lcBytes((RuntimeScalar) registers[rs]);
case Opcodes.LC_UNICODE -> StringOperators.lcUnicode((RuntimeScalar) registers[rs]);
case Opcodes.LCFIRST -> StringOperators.lcfirst((RuntimeScalar) registers[rs]);
case Opcodes.LCFIRST_BYTES -> StringOperators.lcfirstBytes((RuntimeScalar) registers[rs]);
case Opcodes.LCFIRST_UNICODE -> StringOperators.lcfirstUnicode((RuntimeScalar) registers[rs]);
case Opcodes.UC -> StringOperators.uc((RuntimeScalar) registers[rs]);
case Opcodes.UC_BYTES -> StringOperators.ucBytes((RuntimeScalar) registers[rs]);
case Opcodes.UC_UNICODE -> StringOperators.ucUnicode((RuntimeScalar) registers[rs]);
case Opcodes.UCFIRST -> StringOperators.ucfirst((RuntimeScalar) registers[rs]);
case Opcodes.UCFIRST_BYTES -> StringOperators.ucfirstBytes((RuntimeScalar) registers[rs]);
case Opcodes.UCFIRST_UNICODE -> StringOperators.ucfirstUnicode((RuntimeScalar) registers[rs]);
case Opcodes.TO_BYTES_STRING -> StringOperators.toBytesString((RuntimeScalar) registers[rs]);
case Opcodes.SLEEP -> Time.sleep((RuntimeScalar) registers[rs]);
case Opcodes.TELL -> IOOperator.tell((RuntimeScalar) registers[rs]);
Expand Down Expand Up @@ -104,18 +109,28 @@ public static int disassemble(int opcode, int[] bytecode, int pc,
case Opcodes.FC -> sb.append("FC r").append(rd).append(" = fc(r").append(rs).append(")\n");
case Opcodes.FC_BYTES ->
sb.append("FC_BYTES r").append(rd).append(" = fcBytes(r").append(rs).append(")\n");
case Opcodes.FC_UNICODE ->
sb.append("FC_UNICODE r").append(rd).append(" = fcUnicode(r").append(rs).append(")\n");
case Opcodes.LC -> sb.append("LC r").append(rd).append(" = lc(r").append(rs).append(")\n");
case Opcodes.LC_BYTES ->
sb.append("LC_BYTES r").append(rd).append(" = lcBytes(r").append(rs).append(")\n");
case Opcodes.LC_UNICODE ->
sb.append("LC_UNICODE r").append(rd).append(" = lcUnicode(r").append(rs).append(")\n");
case Opcodes.LCFIRST -> sb.append("LCFIRST r").append(rd).append(" = lcfirst(r").append(rs).append(")\n");
case Opcodes.LCFIRST_BYTES ->
sb.append("LCFIRST_BYTES r").append(rd).append(" = lcfirstBytes(r").append(rs).append(")\n");
case Opcodes.LCFIRST_UNICODE ->
sb.append("LCFIRST_UNICODE r").append(rd).append(" = lcfirstUnicode(r").append(rs).append(")\n");
case Opcodes.UC -> sb.append("UC r").append(rd).append(" = uc(r").append(rs).append(")\n");
case Opcodes.UC_BYTES ->
sb.append("UC_BYTES r").append(rd).append(" = ucBytes(r").append(rs).append(")\n");
case Opcodes.UC_UNICODE ->
sb.append("UC_UNICODE r").append(rd).append(" = ucUnicode(r").append(rs).append(")\n");
case Opcodes.UCFIRST -> sb.append("UCFIRST r").append(rd).append(" = ucfirst(r").append(rs).append(")\n");
case Opcodes.UCFIRST_BYTES ->
sb.append("UCFIRST_BYTES r").append(rd).append(" = ucfirstBytes(r").append(rs).append(")\n");
case Opcodes.UCFIRST_UNICODE ->
sb.append("UCFIRST_UNICODE r").append(rd).append(" = ucfirstUnicode(r").append(rs).append(")\n");
case Opcodes.TO_BYTES_STRING ->
sb.append("TO_BYTES_STRING r").append(rd).append(" = toBytesString(r").append(rs).append(")\n");
case Opcodes.SLEEP -> sb.append("SLEEP r").append(rd).append(" = sleep(r").append(rs).append(")\n");
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,8 @@ public static void emitString(EmitterContext ctx, StringNode node) {
return;
}

if (!ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8) && !ctx.compilerOptions.isUnicodeSource) {
if (node.forceByteString
|| (!ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8) && !ctx.compilerOptions.isUnicodeSource)) {
// Under `no utf8` - create an octet string, unless it contains wide characters (> 255)
// Wide characters (like \x{100}) force the string to be UTF-8 even without `use utf8`
boolean hasWideChars = false;
Expand Down
15 changes: 10 additions & 5 deletions src/main/java/org/perlonjava/backend/jvm/EmitOperator.java
Original file line number Diff line number Diff line change
Expand Up @@ -1507,7 +1507,7 @@ static void handleFcOperator(OperatorNode node, EmitterVisitor emitterVisitor) {
} else {
mv.visitMethodInsn(Opcodes.INVOKESTATIC,
"org/perlonjava/runtime/operators/StringOperators",
"fc",
isUnicodeStringsEnabled(emitterVisitor) ? "fcUnicode" : "fc",
"(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;",
false);
}
Expand All @@ -1531,7 +1531,7 @@ static void handleLcOperator(OperatorNode node, EmitterVisitor emitterVisitor) {
} else {
mv.visitMethodInsn(Opcodes.INVOKESTATIC,
"org/perlonjava/runtime/operators/StringOperators",
"lc",
isUnicodeStringsEnabled(emitterVisitor) ? "lcUnicode" : "lc",
"(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;",
false);
}
Expand All @@ -1555,7 +1555,7 @@ static void handleUcOperator(OperatorNode node, EmitterVisitor emitterVisitor) {
} else {
mv.visitMethodInsn(Opcodes.INVOKESTATIC,
"org/perlonjava/runtime/operators/StringOperators",
"uc",
isUnicodeStringsEnabled(emitterVisitor) ? "ucUnicode" : "uc",
"(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;",
false);
}
Expand All @@ -1579,7 +1579,7 @@ static void handleLcfirstOperator(OperatorNode node, EmitterVisitor emitterVisit
} else {
mv.visitMethodInsn(Opcodes.INVOKESTATIC,
"org/perlonjava/runtime/operators/StringOperators",
"lcfirst",
isUnicodeStringsEnabled(emitterVisitor) ? "lcfirstUnicode" : "lcfirst",
"(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;",
false);
}
Expand All @@ -1603,13 +1603,18 @@ static void handleUcfirstOperator(OperatorNode node, EmitterVisitor emitterVisit
} else {
mv.visitMethodInsn(Opcodes.INVOKESTATIC,
"org/perlonjava/runtime/operators/StringOperators",
"ucfirst",
isUnicodeStringsEnabled(emitterVisitor) ? "ucfirstUnicode" : "ucfirst",
"(Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;",
false);
}
handleVoidContext(emitterVisitor);
}

/**
 * Reports whether the {@code "unicode_strings"} feature category is enabled in the
 * symbol table of the visitor's context.
 *
 * @param emitterVisitor visitor whose {@code ctx.symbolTable} is consulted
 * @return {@code false} when there is no symbol table; otherwise the symbol table's answer
 */
private static boolean isUnicodeStringsEnabled(EmitterVisitor emitterVisitor) {
    if (emitterVisitor.ctx.symbolTable == null) {
        return false;
    }
    return emitterVisitor.ctx.symbolTable.isFeatureCategoryEnabled("unicode_strings");
}

/**
* Handles array-specific unary builtin operators.
*
Expand Down
17 changes: 16 additions & 1 deletion src/main/java/org/perlonjava/frontend/astnode/StringNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ public class StringNode extends AbstractNode {
*/
public final boolean isVString;

/**
* Force this literal to be emitted as a byte string even in a {@code use utf8}
* scope. Perl keeps ASCII and fixed-byte escapes such as "\xFC" unupgraded;
* actual non-ASCII source characters still use normal UTF-8 string emission.
*/
public final boolean forceByteString;

/**
* Constructs a new StringNode with the specified string value.
*
Expand All @@ -29,6 +36,7 @@ public StringNode(String value, int tokenIndex) {
this.value = value;
this.tokenIndex = tokenIndex;
this.isVString = false;
this.forceByteString = false;
}

/**
Expand All @@ -42,6 +50,14 @@ public StringNode(String value, boolean isVString, int tokenIndex) {
this.value = value;
this.tokenIndex = tokenIndex;
this.isVString = isVString;
this.forceByteString = false;
}

/**
 * Constructs a new StringNode, setting every field explicitly.
 *
 * @param value           the string value
 * @param isVString       whether the literal is flagged as a v-string
 * @param forceByteString whether the literal must be emitted as a byte string
 * @param tokenIndex      the token index stored on this node
 */
public StringNode(String value, boolean isVString, boolean forceByteString, int tokenIndex) {
    this.tokenIndex = tokenIndex;
    this.value = value;
    this.forceByteString = forceByteString;
    this.isVString = isVString;
}

/**
Expand All @@ -67,4 +83,3 @@ public void accept(Visitor visitor) {
visitor.visit(this);
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public class ParserTables {
"die", "do", "dump",
"exec", "exit",
"fork",
"getpwuid", "glob",
"gethostbyname", "getpwuid", "glob",
"hex",
"kill",
"oct", "open",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import static org.perlonjava.frontend.parser.ParseBlock.parseBlock;
import static org.perlonjava.frontend.parser.Variable.parseArrayHashAccess;
import static org.perlonjava.runtime.perlmodule.Strict.HINT_UTF8;

/**
* Base class for parsing strings with segments and variable interpolation.
Expand Down Expand Up @@ -74,6 +75,9 @@ public abstract class StringSegmentParser {
* Buffer for accumulating literal text segments
*/
protected final StringBuilder currentSegment;
private boolean currentSegmentHasSourceNonAscii = false;
private boolean inRegexCharClass = false;
private boolean regexCharClassFirst = false;
/**
* List of AST nodes representing string segments (literals and interpolated expressions)
*/
Expand Down Expand Up @@ -128,6 +132,35 @@ protected void appendToCurrentSegment(String text) {
currentSegment.append(text);
}

/**
 * Appends literal text to the current segment and then scans it character by
 * character, feeding each one to the regex character-class tracker and recording
 * whether any character above 127 was seen.
 *
 * @param text the literal text to append
 */
protected void appendLiteralToCurrentSegment(String text) {
    appendToCurrentSegment(text);
    for (char ch : text.toCharArray()) {
        updateRegexCharClassState(ch);
        // Sticky flag: once a non-ASCII character is seen it stays set.
        currentSegmentHasSourceNonAscii |= ch >= 128;
    }
}

/**
 * Tells whether the tracker currently considers the parse position to be inside
 * a regex character class.
 *
 * @return {@code true} only when {@code isRegex} is set and a character class is open
 */
protected boolean isInsideRegexCharClass() {
    if (!isRegex) {
        return false;
    }
    return inRegexCharClass;
}

/**
 * Advances the character-class tracking state by one literal character.
 * Does nothing unless {@code isRegex} is set.
 *
 * @param c the literal character just appended
 */
private void updateRegexCharClassState(char c) {
    if (!isRegex) {
        return;
    }

    if (!inRegexCharClass) {
        // Outside a class only '[' matters: it opens one.
        if (c == '[') {
            inRegexCharClass = true;
            regexCharClassFirst = true;
        }
        return;
    }

    if (regexCharClassFirst) {
        // A '^' keeps the "first position" flag set; any other character
        // (including ']') clears it without closing the class.
        if (c != '^') {
            regexCharClassFirst = false;
        }
        return;
    }

    // Past the first position, ']' closes the class.
    if (c == ']') {
        inRegexCharClass = false;
    }
}

/**
* Adds a string segment node to the segments list.
*
Expand All @@ -150,9 +183,28 @@ protected void addStringSegment(Node node) {
*/
protected void flushCurrentSegment() {
if (!currentSegment.isEmpty()) {
addStringSegment(new StringNode(currentSegment.toString(), tokenIndex));
String value = currentSegment.toString();
boolean forceByteString = shouldForceByteStringLiteral(value);
addStringSegment(new StringNode(value, false, forceByteString, tokenIndex));
currentSegment.setLength(0);
currentSegmentHasSourceNonAscii = false;
}
}

/**
 * Decides whether a flushed literal segment should be marked as a forced byte string.
 *
 * @param value the accumulated segment text
 * @return {@code true} only when the HINT_UTF8 option or the unicode-source compiler
 *         option is on, no character above 127 was recorded for the segment, and
 *         every character of {@code value} is at most 255
 */
private boolean shouldForceByteStringLiteral(String value) {
    boolean utf8Scope = ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8)
            || ctx.compilerOptions.isUnicodeSource;
    if (!utf8Scope || currentSegmentHasSourceNonAscii) {
        return false;
    }
    // Any character above 255 rules out a byte string.
    return value.chars().allMatch(ch -> ch <= 255);
}

/**
Expand Down Expand Up @@ -639,7 +691,7 @@ public Node parse() {
continue;
} else {
// No heredocs pending, append the newline normally
appendToCurrentSegment(token.text);
appendLiteralToCurrentSegment(token.text);
}
continue;
}
Expand All @@ -650,7 +702,7 @@ public Node parse() {
}

// Default: append literal text to current segment
appendToCurrentSegment(text);
appendLiteralToCurrentSegment(text);
}

if (CompilerOptions.DEBUG_ENABLED) ctx.logDebug("StringSegmentParser.parse: Finished parsing, segments count: " + segments.size());
Expand Down Expand Up @@ -1337,4 +1389,4 @@ void handleUnicodeNameEscape() {
appendToCurrentSegment("N{" + nameBuilder);
}
}
}
}
Loading
Loading