Compares ALP vs PLAIN encoding for float and double columns across write and read paths. + * Both encodings use no compression codec (UNCOMPRESSED), so the comparison isolates the + * effect of the encoding itself — not any downstream codec like ZSTD or SNAPPY. + * Uses realistic floating-point data with limited decimal precision — the type of data ALP + * is designed to compress (e.g. sensor readings, prices, timestamps as doubles). + * + *
Run with: + *
+ * mvn package -pl parquet-benchmarks -am -DskipTests + * java -jar parquet-benchmarks/target/parquet-benchmarks.jar AlpEncodingBenchmarks + *+ */ +@State(Scope.Thread) +@BenchmarkMode(Mode.SingleShotTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +public class AlpEncodingBenchmarks { + + private static final int N_ROWS = 1_000_000; + + private static final MessageType SCHEMA = MessageTypeParser.parseMessageType( + "message alp_bench { required double double_col; required float float_col; }"); + + // Pre-generated data arrays — allocated once, reused across iterations. + private static final double[] DOUBLES = new double[N_ROWS]; + private static final float[] FLOATS = new float[N_ROWS]; + + static { + // Simulate sensor/metric data: values with 2-3 decimal digits of precision. + // ALP excels at this pattern; plain encoding stores all 8 bytes per double regardless. + for (int i = 0; i < N_ROWS; i++) { + DOUBLES[i] = Math.round((i * 0.01 + 100.0) * 100.0) / 100.0; + FLOATS[i] = (float) (Math.round((i * 0.01f + 10.0f) * 100.0f) / 100.0f); + } + } + + // Files written once per trial (for read benchmarks). + private Path alpReadFile; + private Path plainReadFile; + + // Files written fresh each iteration (for write benchmarks). + private Path writeTarget; + + @Setup(Level.Trial) + public void generateReadFiles() throws IOException { + alpReadFile = Files.createTempFile("alp_bench_alp_", ".parquet"); + Files.delete(alpReadFile); // LocalOutputFile must not pre-exist + writeParquetFile(alpReadFile, true); + + plainReadFile = Files.createTempFile("alp_bench_plain_", ".parquet"); + Files.delete(plainReadFile); + writeParquetFile(plainReadFile, false); + } + + @TearDown(Level.Trial) + public void deleteReadFiles() throws IOException { + if (alpReadFile != null) Files.deleteIfExists(alpReadFile); + if (plainReadFile != null) Files.deleteIfExists(plainReadFile); + } + + @Setup(Level.Iteration) + public void prepareWriteTarget() throws IOException { + writeTarget = Files.createTempFile("alp_bench_write_", ".parquet"); + Files.delete(writeTarget); // LocalOutputFile must not pre-exist + } + + @TearDown(Level.Iteration) + public void deleteWriteTarget() throws IOException { + if (writeTarget != null) Files.deleteIfExists(writeTarget); + } + + // --------------------------------------------------------------------------- + // Write benchmarks + // --------------------------------------------------------------------------- + + @Benchmark + public void writeDoubleAndFloatALP() throws IOException { + writeParquetFile(writeTarget, true); + } + + @Benchmark + public void writeDoubleAndFloatPlain() throws IOException { + writeParquetFile(writeTarget, false); + } + + // --------------------------------------------------------------------------- + // Read benchmarks + // --------------------------------------------------------------------------- + + @Benchmark + public void readDoubleAndFloatALP(Blackhole bh) throws IOException { + readParquetFile(alpReadFile, bh); + } + + @Benchmark + public void readDoubleAndFloatPlain(Blackhole bh) throws IOException { + readParquetFile(plainReadFile, bh); + } + + // --------------------------------------------------------------------------- + // Helpers + // --------------------------------------------------------------------------- + + private static void writeParquetFile(Path path, boolean alp) throws IOException { + try (org.apache.parquet.hadoop.ParquetWriter
ALP encoding converts floating-point values to integers using decimal scaling, + * then applies Frame of Reference encoding and bit-packing. + * Values that cannot be losslessly converted are stored as exceptions. + * + *
Based on the paper: "ALP: Adaptive Lossless floating-Point Compression" (SIGMOD 2024) + * + * @see ALP Paper + */ +public final class AlpConstants { + + private AlpConstants() { + // Utility class + } + + // Page header fields + public static final int ALP_COMPRESSION_MODE = 0; + public static final int ALP_INTEGER_ENCODING_FOR = 0; + public static final int ALP_HEADER_SIZE = 7; + + public static final int DEFAULT_VECTOR_SIZE = 1024; + public static final int DEFAULT_VECTOR_SIZE_LOG = 10; + + // Capped at 15 (vectorSize=32768) because num_exceptions is uint16, + // so vectorSize must not exceed 65535 to avoid overflow when all values are exceptions. + static final int MAX_LOG_VECTOR_SIZE = 15; + static final int MIN_LOG_VECTOR_SIZE = 3; + + static final int FLOAT_MAX_EXPONENT = 10; + static final int DOUBLE_MAX_EXPONENT = 18; + + // Sampler constants matching C++ AlpConstants. + // Sample SAMPLER_SAMPLE_VECTORS_PER_ROWGROUP vectors evenly distributed across a rowgroup + // of SAMPLER_ROWGROUP_SIZE values, then lock in top MAX_PRESET_COMBINATIONS combos. + static final int SAMPLER_ROWGROUP_SIZE = 122_880; + static final int SAMPLER_SAMPLE_VECTORS_PER_ROWGROUP = 8; + static final int MAX_PRESET_COMBINATIONS = 5; + + // Magic numbers for the fast-rounding trick (see ALP paper, Section 3.2) + static final float MAGIC_FLOAT = 12_582_912.0f; // 2^22 + 2^23 + static final double MAGIC_DOUBLE = 6_755_399_441_055_744.0; // 2^51 + 2^52 + + // Per-vector metadata sizes in bytes + public static final int ALP_INFO_SIZE = 4; // exponent(1) + factor(1) + num_exceptions(2) + public static final int FLOAT_FOR_INFO_SIZE = 5; // frame_of_reference(4) + bit_width(1) + public static final int DOUBLE_FOR_INFO_SIZE = 9; // frame_of_reference(8) + bit_width(1) + + // POWERS_OF_TEN: positive powers used for scaling up during encode/decode. + // Encode: fastRound(value * POW10[e] * POW10_NEGATIVE[f]) + // Decode: encoded * POW10[f] * POW10_NEGATIVE[e] + static final float[] FLOAT_POW10 = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; + + static final double[] DOUBLE_POW10 = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18 + }; + + // NEGATIVE_POWERS_OF_TEN: reciprocals used for scaling down (multiply-by-reciprocal). + // Using separate negative-power arrays instead of division ensures C++ wire compatibility. + static final float[] FLOAT_POW10_NEGATIVE = { + 1e0f, 1e-1f, 1e-2f, 1e-3f, 1e-4f, 1e-5f, 1e-6f, 1e-7f, 1e-8f, 1e-9f, 1e-10f + }; + + static final double[] DOUBLE_POW10_NEGATIVE = { + 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15, 1e-16, + 1e-17, 1e-18 + }; + + static final int FLOAT_NEGATIVE_ZERO_BITS = 0x80000000; + static final long DOUBLE_NEGATIVE_ZERO_BITS = 0x8000000000000000L; + + /** Validates vector size: must be a power of 2 in [2^MIN_LOG .. 2^MAX_LOG]. */ + public static int validateVectorSize(int vectorSize) { + Preconditions.checkArgument( + vectorSize > 0 && (vectorSize & (vectorSize - 1)) == 0, + "Vector size must be a power of 2, got: %s", + vectorSize); + int logSize = Integer.numberOfTrailingZeros(vectorSize); + Preconditions.checkArgument( + logSize >= MIN_LOG_VECTOR_SIZE && logSize <= MAX_LOG_VECTOR_SIZE, + "Vector size log2 must be between %s and %s, got: %s (vectorSize=%s)", + MIN_LOG_VECTOR_SIZE, + MAX_LOG_VECTOR_SIZE, + logSize, + vectorSize); + return vectorSize; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java new file mode 100644 index 0000000000..5c6229bb9c --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +/** + * Core ALP (Adaptive Lossless floating-Point) encoding and decoding logic. + * + *
ALP works by converting floating-point values to integers using decimal scaling, + * then applying Frame of Reference encoding and bit-packing. + * Values that cannot be losslessly converted are stored as exceptions. + * + *
Encoding formula: encoded = fastRound(value * POW10[e] * POW10_NEGATIVE[f]) + *
Decoding formula: value = encoded * POW10[f] * POW10_NEGATIVE[e] + * + *
The order of operations is critical for IEEE 754 correctness. Both formulas must + * be evaluated as single expressions — storing the intermediate multiplication result + * in a variable before the second multiply changes IEEE 754 rounding and produces extra + * exceptions. Uses multiply-by-reciprocal (via POW10_NEGATIVE) for C++ wire compatibility. + * + *
Exception conditions: + *
Estimated size (in bits) = {@code length * bitWidth + exceptions * (Float.SIZE + Short.SIZE)}, + * where bitWidth is the number of bits needed to represent the unsigned range of non-exception + * encoded values after frame-of-reference subtraction. This matches the C++ ALP cost model and + * produces better compression ratios than minimizing exception count alone. + */ + static EncodingParams findBestFloatParams(float[] values, int offset, int length) { + int bestExponent = 0; + int bestFactor = 0; + int bestExceptions = length; + long bestEstimatedSize = Long.MAX_VALUE; + + for (int e = 0; e <= FLOAT_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + int exceptions = 0; + int minEncoded = Integer.MAX_VALUE; + int maxEncoded = Integer.MIN_VALUE; + for (int i = 0; i < length; i++) { + float value = values[offset + i]; + if (isFloatException(value, e, f)) { + exceptions++; + } else { + int encoded = encodeFloat(value, e, f); + if (encoded < minEncoded) minEncoded = encoded; + if (encoded > maxEncoded) maxEncoded = encoded; + } + } + int nonExceptions = length - exceptions; + if (nonExceptions == 0) continue; + long delta = (nonExceptions < 2) ? 0 : + Integer.toUnsignedLong(maxEncoded) - Integer.toUnsignedLong(minEncoded); + int bitsPerValue = (delta == 0) ? 0 : (64 - Long.numberOfLeadingZeros(delta)); + long estimatedSize = (long) length * bitsPerValue + + (long) exceptions * (Float.SIZE + Short.SIZE); + if (estimatedSize < bestEstimatedSize + || (estimatedSize == bestEstimatedSize + && (e > bestExponent || (e == bestExponent && f > bestFactor)))) { + bestEstimatedSize = estimatedSize; + bestExponent = e; + bestFactor = f; + bestExceptions = exceptions; + if (bestExceptions == 0 && bitsPerValue == 0) { + return new EncodingParams(bestExponent, bestFactor, 0); + } + } + } + } + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + + /** Same as findBestFloatParams but only tries the cached preset combos. */ + static EncodingParams findBestFloatParamsWithPresets(float[] values, int offset, int length, int[][] presets) { + int bestExponent = presets[0][0]; + int bestFactor = presets[0][1]; + int bestExceptions = length; + long bestEstimatedSize = Long.MAX_VALUE; + + for (int[] preset : presets) { + int e = preset[0]; + int f = preset[1]; + int exceptions = 0; + int minEncoded = Integer.MAX_VALUE; + int maxEncoded = Integer.MIN_VALUE; + for (int i = 0; i < length; i++) { + float value = values[offset + i]; + if (isFloatException(value, e, f)) { + exceptions++; + } else { + int encoded = encodeFloat(value, e, f); + if (encoded < minEncoded) minEncoded = encoded; + if (encoded > maxEncoded) maxEncoded = encoded; + } + } + int nonExceptions = length - exceptions; + if (nonExceptions == 0) continue; + long delta = (nonExceptions < 2) ? 0 : + Integer.toUnsignedLong(maxEncoded) - Integer.toUnsignedLong(minEncoded); + int bitsPerValue = (delta == 0) ? 0 : (64 - Long.numberOfLeadingZeros(delta)); + long estimatedSize = (long) length * bitsPerValue + + (long) exceptions * (Float.SIZE + Short.SIZE); + if (estimatedSize < bestEstimatedSize + || (estimatedSize == bestEstimatedSize + && (e > bestExponent || (e == bestExponent && f > bestFactor)))) { + bestEstimatedSize = estimatedSize; + bestExponent = e; + bestFactor = f; + bestExceptions = exceptions; + if (bestExceptions == 0 && bitsPerValue == 0) { + return new EncodingParams(bestExponent, bestFactor, 0); + } + } + } + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + + /** Try all (exponent, factor) combos and pick the one with the smallest estimated compressed size. */ + static EncodingParams findBestDoubleParams(double[] values, int offset, int length) { + int bestExponent = 0; + int bestFactor = 0; + int bestExceptions = length; + long bestEstimatedSize = Long.MAX_VALUE; + + for (int e = 0; e <= DOUBLE_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + int exceptions = 0; + long minEncoded = Long.MAX_VALUE; + long maxEncoded = Long.MIN_VALUE; + for (int i = 0; i < length; i++) { + double value = values[offset + i]; + if (isDoubleException(value, e, f)) { + exceptions++; + } else { + long encoded = encodeDouble(value, e, f); + if (encoded < minEncoded) minEncoded = encoded; + if (encoded > maxEncoded) maxEncoded = encoded; + } + } + int nonExceptions = length - exceptions; + if (nonExceptions == 0) continue; + // delta as signed subtraction; Long.numberOfLeadingZeros handles the unsigned bit width + // correctly even when the subtraction overflows (large range → penalized with 64 bits). + long delta = (nonExceptions < 2) ? 0 : (maxEncoded - minEncoded); + int bitsPerValue = (delta == 0) ? 0 : (64 - Long.numberOfLeadingZeros(delta)); + long estimatedSize = (long) length * bitsPerValue + + (long) exceptions * (Double.SIZE + Short.SIZE); + if (estimatedSize < bestEstimatedSize + || (estimatedSize == bestEstimatedSize + && (e > bestExponent || (e == bestExponent && f > bestFactor)))) { + bestEstimatedSize = estimatedSize; + bestExponent = e; + bestFactor = f; + bestExceptions = exceptions; + if (bestExceptions == 0 && bitsPerValue == 0) { + return new EncodingParams(bestExponent, bestFactor, 0); + } + } + } + } + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + + /** Same as findBestDoubleParams but only tries the cached preset combos. */ + static EncodingParams findBestDoubleParamsWithPresets(double[] values, int offset, int length, int[][] presets) { + int bestExponent = presets[0][0]; + int bestFactor = presets[0][1]; + int bestExceptions = length; + long bestEstimatedSize = Long.MAX_VALUE; + + for (int[] preset : presets) { + int e = preset[0]; + int f = preset[1]; + int exceptions = 0; + long minEncoded = Long.MAX_VALUE; + long maxEncoded = Long.MIN_VALUE; + for (int i = 0; i < length; i++) { + double value = values[offset + i]; + if (isDoubleException(value, e, f)) { + exceptions++; + } else { + long encoded = encodeDouble(value, e, f); + if (encoded < minEncoded) minEncoded = encoded; + if (encoded > maxEncoded) maxEncoded = encoded; + } + } + int nonExceptions = length - exceptions; + if (nonExceptions == 0) continue; + long delta = (nonExceptions < 2) ? 0 : (maxEncoded - minEncoded); + int bitsPerValue = (delta == 0) ? 0 : (64 - Long.numberOfLeadingZeros(delta)); + long estimatedSize = (long) length * bitsPerValue + + (long) exceptions * (Double.SIZE + Short.SIZE); + if (estimatedSize < bestEstimatedSize + || (estimatedSize == bestEstimatedSize + && (e > bestExponent || (e == bestExponent && f > bestFactor)))) { + bestEstimatedSize = estimatedSize; + bestExponent = e; + bestFactor = f; + bestExceptions = exceptions; + if (bestExceptions == 0 && bitsPerValue == 0) { + return new EncodingParams(bestExponent, bestFactor, 0); + } + } + } + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReader.java new file mode 100644 index 0000000000..cebacdd7bf --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReader.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.ParquetDecodingException; + +/** + * Abstract base class for ALP values readers with lazy per-vector decoding. + * + *
Reads ALP-encoded values from the interleaved page layout: + *
+ * ┌─────────┬──────────────────────┬──────────────┬──────────────┬─────┐ + * │ Header │ Offset Array │ Vector 0 │ Vector 1 │ ... │ + * │ 7 bytes │ 4B × numVectors │ (interleaved)│ (interleaved)│ │ + * └─────────┴──────────────────────┴──────────────┴──────────────┴─────┘ + *+ * + *
Each vector is decoded lazily on first access. Skipping values does not + * trigger decoding of intermediate vectors. + */ +abstract class AlpValuesReader extends ValuesReader { + + protected int vectorSize; + protected int totalCount; + protected int numVectors; + protected int currentIndex; + protected int currentVectorIndex; + + protected int[] vectorOffsets; + protected ByteBuffer vectorsData; + protected int offsetArraySize; + + AlpValuesReader() { + this.currentIndex = 0; + this.totalCount = 0; + this.currentVectorIndex = -1; + } + + @Override + public void initFromPage(int valuesCount, ByteBufferInputStream stream) + throws ParquetDecodingException, IOException { + ByteBuffer headerBuf = stream.slice(ALP_HEADER_SIZE).order(ByteOrder.LITTLE_ENDIAN); + int compressionMode = headerBuf.get() & 0xFF; + int integerEncoding = headerBuf.get() & 0xFF; + int logVectorSize = headerBuf.get() & 0xFF; + int numElements = headerBuf.getInt(); + + if (compressionMode != ALP_COMPRESSION_MODE) { + throw new ParquetDecodingException("Unsupported ALP compression mode: " + compressionMode); + } + if (integerEncoding != ALP_INTEGER_ENCODING_FOR) { + throw new ParquetDecodingException("Unsupported ALP integer encoding: " + integerEncoding); + } + if (logVectorSize < MIN_LOG_VECTOR_SIZE || logVectorSize > MAX_LOG_VECTOR_SIZE) { + throw new ParquetDecodingException("Invalid ALP log vector size: " + logVectorSize + ", must be between " + + MIN_LOG_VECTOR_SIZE + " and " + MAX_LOG_VECTOR_SIZE); + } + if (numElements < 0) { + throw new ParquetDecodingException("Invalid ALP element count: " + numElements); + } + // ALP's num_elements is the count of non-null values that went through encoding; + // valuesCount is the page row count, which is larger when the column has nulls. + // The two are equal only for required (non-null) columns. + if (numElements > valuesCount) { + throw new ParquetDecodingException( + "ALP header element count " + numElements + " exceeds page valuesCount " + valuesCount); + } + + this.vectorSize = 1 << logVectorSize; + this.totalCount = numElements; + this.numVectors = (numElements + vectorSize - 1) / vectorSize; + this.currentIndex = 0; + this.currentVectorIndex = -1; + + this.offsetArraySize = numVectors * Integer.BYTES; + ByteBuffer offsetBuf = stream.slice(offsetArraySize).order(ByteOrder.LITTLE_ENDIAN); + this.vectorOffsets = new int[numVectors]; + for (int v = 0; v < numVectors; v++) { + vectorOffsets[v] = offsetBuf.getInt(); + } + + // Slice remaining bytes into a 0-based view so decodeVector can use + // absolute get methods (vectorsData.get(pos)) directly. + int remainingBytes = (int) stream.available(); + ByteBuffer rawSlice = stream.slice(remainingBytes); + this.vectorsData = rawSlice.slice().order(ByteOrder.LITTLE_ENDIAN); + + allocateDecodedBuffer(vectorSize); + } + + protected int getVectorLength(int vectorIdx) { + if (vectorIdx < numVectors - 1) { + return vectorSize; + } + // Last vector may be partial + int lastVectorLen = totalCount % vectorSize; + return lastVectorLen == 0 ? vectorSize : lastVectorLen; + } + + // Offsets in the page are relative to the compression body (after header), + // but vectorsData starts after the offset array, so adjust. + protected int getVectorDataPosition(int vectorIdx) { + return vectorOffsets[vectorIdx] - offsetArraySize; + } + + @Override + public void skip() { + skip(1); + } + + @Override + public void skip(int n) { + if (n < 0 || currentIndex + n > totalCount) { + throw new ParquetDecodingException(String.format( + "Cannot skip this many elements. Current index: %d. Skip %d. Total count: %d", + currentIndex, n, totalCount)); + } + currentIndex += n; + } + + protected void ensureVectorDecoded() { + int vectorIdx = currentIndex / vectorSize; + if (vectorIdx != currentVectorIndex) { + decodeVector(vectorIdx); + currentVectorIndex = vectorIdx; + } + } + + protected abstract void allocateDecodedBuffer(int capacity); + + protected abstract void decodeVector(int vectorIdx); + + // Explicit little-endian reads using absolute get(), since absolute get() ignores ByteBuffer order. + protected static int getShortLE(ByteBuffer buf, int pos) { + return (buf.get(pos) & 0xFF) | ((buf.get(pos + 1) & 0xFF) << 8); + } + + protected static int getIntLE(ByteBuffer buf, int pos) { + return (buf.get(pos) & 0xFF) + | ((buf.get(pos + 1) & 0xFF) << 8) + | ((buf.get(pos + 2) & 0xFF) << 16) + | ((buf.get(pos + 3) & 0xFF) << 24); + } + + protected static long getLongLE(ByteBuffer buf, int pos) { + return (buf.get(pos) & 0xFFL) + | ((buf.get(pos + 1) & 0xFFL) << 8) + | ((buf.get(pos + 2) & 0xFFL) << 16) + | ((buf.get(pos + 3) & 0xFFL) << 24) + | ((buf.get(pos + 4) & 0xFFL) << 32) + | ((buf.get(pos + 5) & 0xFFL) << 40) + | ((buf.get(pos + 6) & 0xFFL) << 48) + | ((buf.get(pos + 7) & 0xFFL) << 56); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForDouble.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForDouble.java new file mode 100644 index 0000000000..278b72c1cf --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForDouble.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import org.apache.parquet.column.values.bitpacking.BytePackerForLong; +import org.apache.parquet.column.values.bitpacking.Packer; +import org.apache.parquet.io.ParquetDecodingException; + +/** + * ALP values reader for DOUBLE type with lazy per-vector decoding. + * + *
Reads ALP-encoded double values from the interleaved page layout. + * Each vector is decoded on first access using BytePackerForLong-based unpacking. + */ +public class AlpValuesReaderForDouble extends AlpValuesReader { + + private double[] decodedValues; + private long[] deltasBuffer; + private int[] excPositionsBuffer; + private final long[] unpackPadBuf = new long[8]; + private byte[] unpackByteBuf; + + public AlpValuesReaderForDouble() { + super(); + } + + @Override + protected void allocateDecodedBuffer(int capacity) { + this.decodedValues = new double[capacity]; + this.deltasBuffer = new long[capacity]; + this.excPositionsBuffer = new int[capacity]; + this.unpackByteBuf = new byte[Long.SIZE]; // max bit width for long = 64 bytes + } + + @Override + public double readDouble() { + if (currentIndex >= totalCount) { + throw new ParquetDecodingException("ALP double data was already exhausted."); + } + ensureVectorDecoded(); + int indexInVector = currentIndex % vectorSize; + currentIndex++; + return decodedValues[indexInVector]; + } + + @Override + protected void decodeVector(int vectorIdx) { + int vectorLen = getVectorLength(vectorIdx); + int pos = getVectorDataPosition(vectorIdx); + + int exponent = vectorsData.get(pos) & 0xFF; + int factor = vectorsData.get(pos + 1) & 0xFF; + int numExceptions = getShortLE(vectorsData, pos + 2) & 0xFFFF; + pos += ALP_INFO_SIZE; + + if (exponent > DOUBLE_MAX_EXPONENT) { + throw new ParquetDecodingException( + "Invalid ALP double exponent " + exponent + " in vector " + vectorIdx + ", max is " + DOUBLE_MAX_EXPONENT); + } + if (factor > exponent) { + throw new ParquetDecodingException( + "Invalid ALP double factor " + factor + " > exponent " + exponent + " in vector " + vectorIdx); + } + if (numExceptions > vectorLen) { + throw new ParquetDecodingException( + "Invalid ALP numExceptions " + numExceptions + " > vectorLen " + vectorLen + " in vector " + vectorIdx); + } + + long frameOfReference = getLongLE(vectorsData, pos); + int bitWidth = vectorsData.get(pos + 8) & 0xFF; + pos += DOUBLE_FOR_INFO_SIZE; + + if (bitWidth > 0) { + pos = unpackLongsWithBytePacker(vectorsData, pos, deltasBuffer, vectorLen, bitWidth); + } else { + Arrays.fill(deltasBuffer, 0, vectorLen, 0L); + } + + for (int i = 0; i < vectorLen; i++) { + long encoded = deltasBuffer[i] + frameOfReference; + decodedValues[i] = AlpEncoderDecoder.decodeDouble(encoded, exponent, factor); + } + + if (numExceptions > 0) { + for (int e = 0; e < numExceptions; e++) { + excPositionsBuffer[e] = getShortLE(vectorsData, pos) & 0xFFFF; + if (excPositionsBuffer[e] >= vectorLen) { + throw new ParquetDecodingException( + "ALP exception position " + excPositionsBuffer[e] + " out of bounds for vectorLen " + vectorLen); + } + pos += Short.BYTES; + } + for (int e = 0; e < numExceptions; e++) { + decodedValues[excPositionsBuffer[e]] = getDoubleLE(vectorsData, pos); + pos += Double.BYTES; + } + } + } + + private int unpackLongsWithBytePacker(ByteBuffer buf, int pos, long[] output, int count, int bitWidth) { + BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidth); + int numFullGroups = count / 8; + int remaining = count % 8; + + for (int g = 0; g < numFullGroups; g++) { + packer.unpack8Values(buf, pos, output, g * 8); + pos += bitWidth; + } + + // Last group might have fewer than 8 values; zero-pad and unpack, + // but only advance pos by the actual bytes in the page. + if (remaining > 0) { + int totalPackedBytes = (count * bitWidth + 7) / 8; + int alreadyRead = numFullGroups * bitWidth; + int partialBytes = totalPackedBytes - alreadyRead; + + for (int i = 0; i < partialBytes; i++) { + unpackByteBuf[i] = buf.get(pos + i); + } + for (int i = partialBytes; i < bitWidth; i++) { + unpackByteBuf[i] = 0; + } + + packer.unpack8Values(unpackByteBuf, 0, unpackPadBuf, 0); + System.arraycopy(unpackPadBuf, 0, output, numFullGroups * 8, remaining); + pos += partialBytes; + } + + return pos; + } + + private static double getDoubleLE(ByteBuffer buf, int pos) { + return Double.longBitsToDouble(getLongLE(buf, pos)); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForFloat.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForFloat.java new file mode 100644 index 0000000000..45b7366f04 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForFloat.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import org.apache.parquet.column.values.bitpacking.BytePacker; +import org.apache.parquet.column.values.bitpacking.Packer; +import org.apache.parquet.io.ParquetDecodingException; + +/** + * ALP values reader for FLOAT type with lazy per-vector decoding. + * + *
Reads ALP-encoded float values from the interleaved page layout. + * Each vector is decoded on first access using BytePacker-based unpacking. + */ +public class AlpValuesReaderForFloat extends AlpValuesReader { + + private float[] decodedValues; + private int[] deltasBuffer; + private int[] excPositionsBuffer; + private final int[] unpackPadBuf = new int[8]; + private byte[] unpackByteBuf; + + public AlpValuesReaderForFloat() { + super(); + } + + @Override + protected void allocateDecodedBuffer(int capacity) { + this.decodedValues = new float[capacity]; + this.deltasBuffer = new int[capacity]; + this.excPositionsBuffer = new int[capacity]; + this.unpackByteBuf = new byte[Integer.SIZE]; // max bit width for int = 32 bytes + } + + @Override + public float readFloat() { + if (currentIndex >= totalCount) { + throw new ParquetDecodingException("ALP float data was already exhausted."); + } + ensureVectorDecoded(); + int indexInVector = currentIndex % vectorSize; + currentIndex++; + return decodedValues[indexInVector]; + } + + @Override + protected void decodeVector(int vectorIdx) { + int vectorLen = getVectorLength(vectorIdx); + int pos = getVectorDataPosition(vectorIdx); + + int exponent = vectorsData.get(pos) & 0xFF; + int factor = vectorsData.get(pos + 1) & 0xFF; + int numExceptions = getShortLE(vectorsData, pos + 2) & 0xFFFF; + pos += ALP_INFO_SIZE; + + if (exponent > FLOAT_MAX_EXPONENT) { + throw new ParquetDecodingException( + "Invalid ALP float exponent " + exponent + " in vector " + vectorIdx + ", max is " + FLOAT_MAX_EXPONENT); + } + if (factor > exponent) { + throw new ParquetDecodingException( + "Invalid ALP float factor " + factor + " > exponent " + exponent + " in vector " + vectorIdx); + } + if (numExceptions > vectorLen) { + throw new ParquetDecodingException( + "Invalid ALP numExceptions " + numExceptions + " > vectorLen " + vectorLen + " in vector " + vectorIdx); + } + + int frameOfReference = getIntLE(vectorsData, pos); + int bitWidth = vectorsData.get(pos + 4) & 0xFF; + pos += FLOAT_FOR_INFO_SIZE; + + if (bitWidth > 0) { + pos = unpackIntsWithBytePacker(vectorsData, pos, deltasBuffer, vectorLen, bitWidth); + } else { + Arrays.fill(deltasBuffer, 0, vectorLen, 0); + } + + for (int i = 0; i < vectorLen; i++) { + int encoded = deltasBuffer[i] + frameOfReference; + decodedValues[i] = AlpEncoderDecoder.decodeFloat(encoded, exponent, factor); + } + + // Overwrite exception slots with their original float values + if (numExceptions > 0) { + for (int e = 0; e < numExceptions; e++) { + excPositionsBuffer[e] = getShortLE(vectorsData, pos) & 0xFFFF; + if (excPositionsBuffer[e] >= vectorLen) { + throw new ParquetDecodingException( + "ALP exception position " + excPositionsBuffer[e] + " out of bounds for vectorLen " + vectorLen); + } + pos += Short.BYTES; + } + for (int e = 0; e < numExceptions; e++) { + decodedValues[excPositionsBuffer[e]] = getFloatLE(vectorsData, pos); + pos += Float.BYTES; + } + } + } + + /** Unpack bit-packed ints in groups of 8, returns position after packed data. */ + private int unpackIntsWithBytePacker(ByteBuffer buf, int pos, int[] output, int count, int bitWidth) { + BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth); + int numFullGroups = count / 8; + int remaining = count % 8; + + for (int g = 0; g < numFullGroups; g++) { + packer.unpack8Values(buf, pos, output, g * 8); + pos += bitWidth; + } + + if (remaining > 0) { + int totalPackedBytes = (count * bitWidth + 7) / 8; + int alreadyRead = numFullGroups * bitWidth; + int partialBytes = totalPackedBytes - alreadyRead; + + for (int i = 0; i < partialBytes; i++) { + unpackByteBuf[i] = buf.get(pos + i); + } + for (int i = partialBytes; i < bitWidth; i++) { + unpackByteBuf[i] = 0; + } + + packer.unpack8Values(unpackByteBuf, 0, unpackPadBuf, 0); + System.arraycopy(unpackPadBuf, 0, output, numFullGroups * 8, remaining); + pos += partialBytes; + } + + return pos; + } + + private static float getFloatLE(ByteBuffer buf, int pos) { + return Float.intBitsToFloat(getIntLE(buf, pos)); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesWriter.java new file mode 100644 index 0000000000..59e8974b80 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesWriter.java @@ -0,0 +1,635 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.parquet.bytes.ByteBufferAllocator; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.CapacityByteArrayOutputStream; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bitpacking.BytePacker; +import org.apache.parquet.column.values.bitpacking.BytePackerForLong; +import org.apache.parquet.column.values.bitpacking.Packer; + +/** + * ALP (Adaptive Lossless floating-Point) values writer. + * + *
ALP encoding converts floating-point values to integers using decimal scaling, + * then applies Frame of Reference encoding and bit-packing. + * Values that cannot be losslessly converted are stored as exceptions. + * + *
Writing is incremental: values are buffered in a fixed-size vector buffer, + * and each full vector is encoded and flushed to the output stream immediately. + * On {@link #getBytes()}, any remaining partial vector is flushed, and the + * final page bytes are assembled. + * + *
Interleaved Page Layout: + *
+ * ┌─────────┬──────────────────────┬──────────────┬──────────────┬─────┐ + * │ Header │ Offset Array │ Vector 0 │ Vector 1 │ ... │ + * │ 7 bytes │ 4B × numVectors │ (interleaved)│ (interleaved)│ │ + * └─────────┴──────────────────────┴──────────────┴──────────────┴─────┘ + *+ * + *
Each vector contains interleaved:
+ * AlpInfo(4B) + ForInfo(5B/9B) + PackedValues + ExceptionPositions + ExceptionValues
+ */
+public abstract class AlpValuesWriter extends ValuesWriter {
+
+ protected final int initialCapacity;
+ protected final int pageSize;
+ protected final ByteBufferAllocator allocator;
+ protected final int vectorSize;
+ protected final int logVectorSize;
+
+ AlpValuesWriter(int initialCapacity, int pageSize, ByteBufferAllocator allocator, int vectorSize) {
+ AlpConstants.validateVectorSize(vectorSize);
+ this.initialCapacity = initialCapacity;
+ this.pageSize = pageSize;
+ this.allocator = allocator;
+ this.vectorSize = vectorSize;
+ this.logVectorSize = Integer.numberOfTrailingZeros(vectorSize);
+ }
+
+ @Override
+ public Encoding getEncoding() {
+ return Encoding.ALP;
+ }
+
+ /** Float writer. Buffers one vector at a time, encodes and flushes when full. */
+ public static class FloatAlpValuesWriter extends AlpValuesWriter {
+ private final float[] vectorBuffer;
+ private int bufferCount;
+ private int totalCount;
+ private CapacityByteArrayOutputStream encodedVectors;
+ private final List "Fails cleanly" means raising a meaningful exception — preferably
+ * {@link ParquetDecodingException}, but at minimum a typed exception (not a JVM-level
+ * crash, infinite loop, or wrong answer). The tests cover both:
+ * These are the same datasets used in the meeting discussion to compare Java vs C++ ALP
+ * exception counts. Run from the project root with the alp-test-data/ directory present:
+ *
+ * To use a different data directory:
+ *
+ * Test vector: 3 float values {100.0f, 200.0f, 300.0f} with vectorSize=8.
+ * With exponent=0, factor=0: encoded = {100, 200, 300}.
+ * FOR min = 100, deltas = {0, 100, 200}, maxDelta = 200, bitWidth = 8.
+ * Packed: 3 values in 1 partial group of 8, padded with zeros.
+ * Packed size = ceil(3 * 8 / 8) = 3 bytes.
+ */
+ @Test
+ public void testReaderWithHandCraftedBytes() throws Exception {
+ // Manually build ALP page for {100.0f, 200.0f, 300.0f}, vectorSize=8
+ int vectorSize = 8;
+ int logVectorSize = 3;
+ int numElements = 3;
+ int numVectors = 1;
+
+ // Encoding: e=0, f=0 → multiplier=1.0
+ // encoded = {100, 200, 300}
+ // FOR min = 100
+ // deltas = {0, 100, 200}
+ // maxDelta = 200, bitWidth = 8
+ int exponent = 0;
+ int factor = 0;
+ int numExceptions = 0;
+ int frameOfReference = 100;
+ int bitWidth = 8;
+
+ // Bit-pack deltas using the same BytePacker the reader will use
+ BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(8);
+ int[] deltas = {0, 100, 200, 0, 0, 0, 0, 0}; // padded to 8
+ byte[] packed = new byte[8]; // bitWidth bytes for one group of 8
+ packer.pack8Values(deltas, 0, packed, 0);
+ int packedSize = (3 * 8 + 7) / 8; // = 3 bytes for 3 values at 8 bits
+
+ // Build the page
+ int vectorDataSize = 4 + 5 + packedSize; // AlpInfo + ForInfo + packed
+ int offsetArraySize = numVectors * 4;
+
+ ByteBuffer page =
+ ByteBuffer.allocate(7 + offsetArraySize + vectorDataSize).order(ByteOrder.LITTLE_ENDIAN);
+
+ // Header (7 bytes)
+ page.put((byte) 0); // compression_mode
+ page.put((byte) 0); // integer_encoding
+ page.put((byte) logVectorSize);
+ page.putInt(numElements);
+
+ // Offset array (1 vector)
+ page.putInt(offsetArraySize); // offset₀ = past offset array
+
+ // Vector 0: AlpInfo (4 bytes)
+ page.put((byte) exponent);
+ page.put((byte) factor);
+ page.putShort((short) numExceptions);
+
+ // Vector 0: ForInfo (5 bytes)
+ page.putInt(frameOfReference);
+ page.put((byte) bitWidth);
+
+ // Vector 0: Packed data (3 bytes)
+ page.put(packed, 0, packedSize);
+
+ page.flip();
+
+ // Now read it back
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(numElements, ByteBufferInputStream.wrap(page));
+
+ assertEquals(Float.floatToRawIntBits(100.0f), Float.floatToRawIntBits(reader.readFloat()));
+ assertEquals(Float.floatToRawIntBits(200.0f), Float.floatToRawIntBits(reader.readFloat()));
+ assertEquals(Float.floatToRawIntBits(300.0f), Float.floatToRawIntBits(reader.readFloat()));
+ }
+
+ /**
+ * Hand-craft bytes with exceptions to verify reader handles them independently.
+ * Values: {1.0f, NaN, 3.0f} with vectorSize=8.
+ * e=0, f=0: encoded={1, placeholder=1, 3}, 1 exception at position 1.
+ * FOR min=1, deltas={0, 0, 2}, maxDelta=2, bitWidth=2.
+ */
+ @Test
+ public void testReaderWithHandCraftedExceptions() throws Exception {
+ int vectorSize = 8;
+ int logVectorSize = 3;
+ int numElements = 3;
+
+ int exponent = 0;
+ int factor = 0;
+ int numExceptions = 1;
+ int placeholder = 1; // first non-exception encoded value
+ int frameOfReference = 1; // min of {1, 1, 3}
+ int bitWidth = 2; // ceil(log2(2+1)) = 2
+
+ // deltas = {1-1, 1-1, 3-1} = {0, 0, 2}, padded to 8: {0,0,2,0,0,0,0,0}
+ BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(2);
+ int[] deltas = {0, 0, 2, 0, 0, 0, 0, 0};
+ byte[] packed = new byte[2]; // bitWidth=2 bytes for a group of 8
+ packer.pack8Values(deltas, 0, packed, 0);
+ int packedSize = (3 * 2 + 7) / 8; // = 1 byte
+
+ // Vector data: AlpInfo(4) + ForInfo(5) + packed(1) + excPos(2) + excVal(4) = 16
+ int vectorDataSize = 4 + 5 + packedSize + 2 + 4;
+ int offsetArraySize = 1 * 4;
+
+ ByteBuffer page =
+ ByteBuffer.allocate(7 + offsetArraySize + vectorDataSize).order(ByteOrder.LITTLE_ENDIAN);
+
+ // Header (7 bytes)
+ page.put((byte) 0); // compression_mode
+ page.put((byte) 0); // integer_encoding
+ page.put((byte) logVectorSize);
+ page.putInt(numElements);
+
+ // Offset array
+ page.putInt(offsetArraySize);
+
+ // AlpInfo
+ page.put((byte) exponent);
+ page.put((byte) factor);
+ page.putShort((short) numExceptions);
+
+ // ForInfo
+ page.putInt(frameOfReference);
+ page.put((byte) bitWidth);
+
+ // Packed data
+ page.put(packed, 0, packedSize);
+
+ // Exception positions (uint16 LE)
+ page.putShort((short) 1); // position 1
+
+ // Exception values (float32 LE)
+ page.putFloat(Float.NaN);
+
+ page.flip();
+
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(numElements, ByteBufferInputStream.wrap(page));
+
+ assertEquals(Float.floatToRawIntBits(1.0f), Float.floatToRawIntBits(reader.readFloat()));
+ float val1 = reader.readFloat();
+ assertTrue("Expected NaN at position 1", Float.isNaN(val1));
+ assertEquals(Float.floatToRawIntBits(3.0f), Float.floatToRawIntBits(reader.readFloat()));
+ }
+
+ /**
+ * Hand-craft double ALP bytes and verify reader independence.
+ * Values: {10.0, 20.0, 30.0}, vectorSize=8, e=0, f=0.
+ * encoded={10,20,30}, FOR min=10, deltas={0,10,20}, maxDelta=20, bitWidth=5.
+ */
+ @Test
+ public void testDoubleReaderWithHandCraftedBytes() throws Exception {
+ int numElements = 3;
+ int exponent = 0;
+ int factor = 0;
+ long frameOfReference = 10;
+ int bitWidth = 5; // ceil(log2(20+1)) = 5
+
+ org.apache.parquet.column.values.bitpacking.BytePackerForLong packer =
+ Packer.LITTLE_ENDIAN.newBytePackerForLong(5);
+ long[] deltas = {0, 10, 20, 0, 0, 0, 0, 0};
+ byte[] packed = new byte[5]; // bitWidth bytes for group of 8
+ packer.pack8Values(deltas, 0, packed, 0);
+ int packedSize = (3 * 5 + 7) / 8; // = 2 bytes
+
+ int vectorDataSize = 4 + 9 + packedSize; // AlpInfo + DoubleForInfo + packed
+ int offsetArraySize = 4;
+
+ ByteBuffer page =
+ ByteBuffer.allocate(7 + offsetArraySize + vectorDataSize).order(ByteOrder.LITTLE_ENDIAN);
+
+ // Header (7 bytes)
+ page.put((byte) 0); // compression_mode
+ page.put((byte) 0); // integer_encoding
+ page.put((byte) 3); // log2(8)
+ page.putInt(numElements);
+
+ // Offset array
+ page.putInt(offsetArraySize);
+
+ // AlpInfo
+ page.put((byte) exponent);
+ page.put((byte) factor);
+ page.putShort((short) 0); // no exceptions
+
+ // ForInfo (9 bytes for double)
+ page.putLong(frameOfReference);
+ page.put((byte) bitWidth);
+
+ // Packed data
+ page.put(packed, 0, packedSize);
+
+ page.flip();
+
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(numElements, ByteBufferInputStream.wrap(page));
+
+ assertEquals(Double.doubleToRawLongBits(10.0), Double.doubleToRawLongBits(reader.readDouble()));
+ assertEquals(Double.doubleToRawLongBits(20.0), Double.doubleToRawLongBits(reader.readDouble()));
+ assertEquals(Double.doubleToRawLongBits(30.0), Double.doubleToRawLongBits(reader.readDouble()));
+ }
+
+ // ========== Writer Output → Hand-Verified Bytes ==========
+
+ /**
+ * Write known values, then verify the exact binary output against hand computation.
+ * This verifies the writer independently of the reader.
+ *
+ * Input: {1.0f, 2.0f, 3.0f} with vectorSize=8.
+ * Expected: e=0, f=0, encoded={1,2,3}, FOR min=1, deltas={0,1,2},
+ * maxDelta=2, bitWidth=2. Packed size = ceil(3*2/8) = 1 byte.
+ */
+ @Test
+ public void testWriterOutputExactBytes() throws Exception {
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator(), 8);
+ writer.writeFloat(1.0f);
+ writer.writeFloat(2.0f);
+ writer.writeFloat(3.0f);
+
+ byte[] bytes = writer.getBytes().toByteArray();
+ ByteBuffer buf = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN);
+
+ // Header (7 bytes)
+ assertEquals(0, buf.get() & 0xFF); // compression_mode
+ assertEquals(0, buf.get() & 0xFF); // integer_encoding
+ assertEquals(3, buf.get() & 0xFF); // log2(8) = 3
+ assertEquals(3, buf.getInt()); // num_elements
+
+ // Offset array (1 vector)
+ int offset0 = buf.getInt();
+ assertEquals(4, offset0); // 1 * 4 = past offset array
+
+ // AlpInfo
+ int exp = buf.get() & 0xFF;
+ int fac = buf.get() & 0xFF;
+ int numExc = buf.getShort() & 0xFFFF;
+ assertEquals(0, numExc);
+
+ // ForInfo
+ int forRef = buf.getInt();
+ int bw = buf.get() & 0xFF;
+
+ // Verify: for e=0, f=0: multiplier=1.0
+ // encoded = {1, 2, 3}, min = 1, deltas = {0, 1, 2}, bitWidth = 2
+ assertEquals(1, forRef); // frame of reference = 1
+ assertEquals(2, bw); // bitWidth for max delta 2
+
+ // Packed data: ceil(3 * 2 / 8) = 1 byte
+ // deltas {0, 1, 2} at bitWidth=2:
+ // Use BytePacker to compute expected packed byte
+ BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(2);
+ int[] expectedDeltas = {0, 1, 2, 0, 0, 0, 0, 0};
+ byte[] expectedPacked = new byte[2]; // bitWidth=2
+ packer.pack8Values(expectedDeltas, 0, expectedPacked, 0);
+
+ byte actualPackedByte = buf.get();
+ assertEquals("packed byte", expectedPacked[0], actualPackedByte);
+
+ // Should have consumed all bytes
+ assertEquals("all bytes consumed", bytes.length, buf.position());
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ // ========== NaN Bit Pattern Preservation ==========
+
+ /**
+ * Verify that different NaN bit patterns survive the round-trip exactly.
+ * Java allows multiple NaN representations. Our exception handling must
+ * preserve the exact bit pattern, not normalize to Float.NaN.
+ */
+ @Test
+ public void testNaNBitPatternPreservation() throws Exception {
+ // Standard NaN
+ float standardNaN = Float.NaN; // 0x7FC00000
+ // Signaling NaN (different bit pattern)
+ float signalingNaN = Float.intBitsToFloat(0x7F800001);
+ // Negative NaN
+ float negativeNaN = Float.intBitsToFloat(0xFFC00000);
+ // Another NaN variant
+ float customNaN = Float.intBitsToFloat(0x7FFFFFFF);
+
+ assertTrue(Float.isNaN(standardNaN));
+ assertTrue(Float.isNaN(signalingNaN));
+ assertTrue(Float.isNaN(negativeNaN));
+ assertTrue(Float.isNaN(customNaN));
+
+ float[] values = {1.0f, standardNaN, 2.0f, signalingNaN, 3.0f, negativeNaN, 4.0f, customNaN};
+
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator());
+ for (float v : values) {
+ writer.writeFloat(v);
+ }
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(values.length, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ for (int i = 0; i < values.length; i++) {
+ float actual = reader.readFloat();
+ assertEquals(
+ "Bit pattern mismatch at index " + i + " (0x"
+ + Integer.toHexString(Float.floatToRawIntBits(values[i])) + " vs 0x"
+ + Integer.toHexString(Float.floatToRawIntBits(actual)) + ")",
+ Float.floatToRawIntBits(values[i]),
+ Float.floatToRawIntBits(actual));
+ }
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ /**
+ * Same test for double NaN bit patterns.
+ */
+ @Test
+ public void testDoubleNaNBitPatternPreservation() throws Exception {
+ double standardNaN = Double.NaN;
+ double signalingNaN = Double.longBitsToDouble(0x7FF0000000000001L);
+ double negativeNaN = Double.longBitsToDouble(0xFFF8000000000000L);
+ double customNaN = Double.longBitsToDouble(0x7FFFFFFFFFFFFFFFL);
+
+ double[] values = {1.0, standardNaN, 2.0, signalingNaN, 3.0, negativeNaN, 4.0, customNaN};
+
+ AlpValuesWriter.DoubleAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.DoubleAlpValuesWriter(512, 512, new DirectByteBufferAllocator());
+ for (double v : values) {
+ writer.writeDouble(v);
+ }
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(values.length, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ for (int i = 0; i < values.length; i++) {
+ double actual = reader.readDouble();
+ assertEquals(
+ "Bit pattern mismatch at index " + i + " (0x"
+ + Long.toHexString(Double.doubleToRawLongBits(values[i])) + " vs 0x"
+ + Long.toHexString(Double.doubleToRawLongBits(actual)) + ")",
+ Double.doubleToRawLongBits(values[i]),
+ Double.doubleToRawLongBits(actual));
+ }
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ // ========== Negative Zero Bit-Exact Roundtrip ==========
+
+ /**
+ * Verify that -0.0f roundtrips as -0.0f (bit pattern 0x80000000),
+ * not as +0.0f (bit pattern 0x00000000).
+ */
+ @Test
+ public void testNegativeZeroBitExact() throws Exception {
+ float negZero = -0.0f;
+ float posZero = 0.0f;
+
+ // Sanity: they are == but have different bit patterns
+ assertTrue(negZero == posZero);
+ assertNotEquals(Float.floatToRawIntBits(negZero), Float.floatToRawIntBits(posZero));
+
+ float[] values = {posZero, negZero, 1.0f, negZero};
+
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator());
+ for (float v : values) {
+ writer.writeFloat(v);
+ }
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(values.length, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ for (int i = 0; i < values.length; i++) {
+ float actual = reader.readFloat();
+ assertEquals(
+ "Bit pattern at index " + i,
+ Float.floatToRawIntBits(values[i]),
+ Float.floatToRawIntBits(actual));
+ }
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ // ========== Extreme Float Values ==========
+
+ /**
+ * Test subnormal floats, Float.MIN_VALUE, Float.MAX_VALUE, Float.MIN_NORMAL.
+ * These should all either encode losslessly or be stored as exceptions.
+ */
+ @Test
+ public void testExtremeFloatValues() throws Exception {
+ float[] values = {
+ Float.MIN_VALUE, // smallest positive subnormal: 1.4e-45
+ Float.MIN_NORMAL, // smallest positive normal: 1.17549435e-38
+ Float.MAX_VALUE, // 3.4028235e38
+ -Float.MAX_VALUE, // most negative
+ 1.17549435e-38f, // near MIN_NORMAL
+ 3.4028234e38f, // near MAX_VALUE
+ 1.0e-10f, // very small positive
+ 1.0e10f, // large positive
+ };
+ roundTripFloat(values);
+ }
+
+ /**
+ * Test extreme double values.
+ */
+ @Test
+ public void testExtremeDoubleValues() throws Exception {
+ double[] values = {
+ Double.MIN_VALUE, // smallest positive subnormal
+ Double.MIN_NORMAL, // smallest positive normal
+ Double.MAX_VALUE, // largest positive
+ -Double.MAX_VALUE, // most negative
+ 1.0e-100, // very small
+ 1.0e100, // very large
+ };
+ roundTripDouble(values);
+ }
+
+ // ========== Preset Caching Under Distribution Change ==========
+
+ /**
+ * Write >8 vectors where the optimal (e,f) changes after the sampling phase.
+ * First 8 vectors: monetary data (e=2, f=0 optimal).
+ * Remaining vectors: large integers (e=0, f=0 optimal).
+ * Verify all values survive round-trip despite the distribution change.
+ */
+ @Test
+ public void testPresetCachingWithDistributionChange() throws Exception {
+ int vectorSize = 8; // small vectors to get to 8 quickly
+ int samplingVectors = 8;
+ int postSamplingVectors = 4;
+ int totalValues = vectorSize * (samplingVectors + postSamplingVectors);
+
+ float[] values = new float[totalValues];
+
+ // First 8 vectors: decimal monetary values (best with e=2, f=0)
+ for (int i = 0; i < vectorSize * samplingVectors; i++) {
+ values[i] = 10.00f + (i % 100) * 0.01f; // 10.00, 10.01, 10.02, ...
+ }
+
+ // Last 4 vectors: whole numbers (best with e=0, f=0)
+ for (int i = vectorSize * samplingVectors; i < totalValues; i++) {
+ values[i] = (float) (i * 1000);
+ }
+
+ roundTripFloat(values, vectorSize);
+ }
+
+ /**
+ * Same test for doubles.
+ */
+ @Test
+ public void testDoublePresetCachingWithDistributionChange() throws Exception {
+ int vectorSize = 8;
+ int totalValues = vectorSize * 12; // 8 sampling + 4 post
+
+ double[] values = new double[totalValues];
+
+ for (int i = 0; i < vectorSize * 8; i++) {
+ values[i] = 10.00 + (i % 100) * 0.01;
+ }
+ for (int i = vectorSize * 8; i < totalValues; i++) {
+ values[i] = (double) (i * 1000);
+ }
+
+ roundTripDouble(values, vectorSize);
+ }
+
+ // ========== Multi-Vector Hand-Crafted Binary ==========
+
+ /**
+ * Hand-craft a 2-vector page to verify offset array navigation works.
+ * 10 values with vectorSize=8: vector 0 has values 1-8, vector 1 has 9-10.
+ */
+ @Test
+ public void testMultiVectorHandCrafted() throws Exception {
+ int vectorSize = 8;
+ int numElements = 10;
+ int numVectors = 2;
+
+ // Vector 0: {1,2,3,4,5,6,7,8} → e=0,f=0, encoded={1..8}, FOR min=1, deltas={0..7}
+ // maxDelta=7, bitWidth=3. Packed: 8 values at 3 bits = 3 bytes exactly.
+ int v0BitWidth = 3;
+ BytePacker packer3 = Packer.LITTLE_ENDIAN.newBytePacker(3);
+ int[] v0Deltas = {0, 1, 2, 3, 4, 5, 6, 7};
+ byte[] v0Packed = new byte[3];
+ packer3.pack8Values(v0Deltas, 0, v0Packed, 0);
+ int v0PackedSize = 3; // (8*3+7)/8 = 3
+
+ // Vector 1: {9, 10} → e=0,f=0, encoded={9,10}, FOR min=9, deltas={0,1}
+ // maxDelta=1, bitWidth=1. Packed: ceil(2*1/8) = 1 byte.
+ int v1BitWidth = 1;
+ BytePacker packer1 = Packer.LITTLE_ENDIAN.newBytePacker(1);
+ int[] v1Deltas = {0, 1, 0, 0, 0, 0, 0, 0};
+ byte[] v1Packed = new byte[1];
+ packer1.pack8Values(v1Deltas, 0, v1Packed, 0);
+ int v1PackedSize = 1; // (2*1+7)/8 = 1
+
+ int v0DataSize = 4 + 5 + v0PackedSize; // 12
+ int v1DataSize = 4 + 5 + v1PackedSize; // 10
+ int offsetArraySize = numVectors * 4; // 8
+
+ ByteBuffer page =
+ ByteBuffer.allocate(7 + offsetArraySize + v0DataSize + v1DataSize)
+ .order(ByteOrder.LITTLE_ENDIAN);
+
+ // Header (7 bytes)
+ page.put((byte) 0); // compression_mode
+ page.put((byte) 0); // integer_encoding
+ page.put((byte) 3); // log2(8)
+ page.putInt(numElements);
+
+ // Offset array
+ page.putInt(offsetArraySize); // vector 0 starts at 8
+ page.putInt(offsetArraySize + v0DataSize); // vector 1 starts at 8+12=20
+
+ // Vector 0
+ page.put((byte) 0); // exponent
+ page.put((byte) 0); // factor
+ page.putShort((short) 0); // exceptions
+ page.putInt(1); // FOR = 1
+ page.put((byte) v0BitWidth);
+ page.put(v0Packed, 0, v0PackedSize);
+
+ // Vector 1
+ page.put((byte) 0); // exponent
+ page.put((byte) 0); // factor
+ page.putShort((short) 0); // exceptions
+ page.putInt(9); // FOR = 9
+ page.put((byte) v1BitWidth);
+ page.put(v1Packed, 0, v1PackedSize);
+
+ page.flip();
+
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(numElements, ByteBufferInputStream.wrap(page));
+
+ for (int i = 0; i < numElements; i++) {
+ float expected = (float) (i + 1);
+ float actual = reader.readFloat();
+ assertEquals(
+ "Value mismatch at index " + i,
+ Float.floatToRawIntBits(expected),
+ Float.floatToRawIntBits(actual));
+ }
+ }
+
+ // ========== Skip Over Entire Vector Without Decoding ==========
+
+ /**
+ * Verify that skipping an entire vector and reading the next one works.
+ * Uses hand-crafted bytes to ensure independence from writer.
+ */
+ @Test
+ public void testSkipEntireVectorHandCrafted() throws Exception {
+ // Reuse the multi-vector setup: {1..8} in vector 0, {9, 10} in vector 1
+ int vectorSize = 8;
+ int numElements = 10;
+ int numVectors = 2;
+
+ BytePacker packer3 = Packer.LITTLE_ENDIAN.newBytePacker(3);
+ int[] v0Deltas = {0, 1, 2, 3, 4, 5, 6, 7};
+ byte[] v0Packed = new byte[3];
+ packer3.pack8Values(v0Deltas, 0, v0Packed, 0);
+
+ BytePacker packer1 = Packer.LITTLE_ENDIAN.newBytePacker(1);
+ int[] v1Deltas = {0, 1, 0, 0, 0, 0, 0, 0};
+ byte[] v1Packed = new byte[1];
+ packer1.pack8Values(v1Deltas, 0, v1Packed, 0);
+
+ int v0DataSize = 4 + 5 + 3;
+ int v1DataSize = 4 + 5 + 1;
+ int offsetArraySize = numVectors * 4;
+
+ ByteBuffer page =
+ ByteBuffer.allocate(7 + offsetArraySize + v0DataSize + v1DataSize)
+ .order(ByteOrder.LITTLE_ENDIAN);
+
+ page.put((byte) 0).put((byte) 0).put((byte) 3);
+ page.putInt(numElements);
+ page.putInt(offsetArraySize);
+ page.putInt(offsetArraySize + v0DataSize);
+
+ // Vector 0
+ page.put((byte) 0).put((byte) 0).putShort((short) 0);
+ page.putInt(1).put((byte) 3);
+ page.put(v0Packed, 0, 3);
+
+ // Vector 1
+ page.put((byte) 0).put((byte) 0).putShort((short) 0);
+ page.putInt(9).put((byte) 1);
+ page.put(v1Packed, 0, 1);
+
+ page.flip();
+
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(numElements, ByteBufferInputStream.wrap(page));
+
+ // Skip entire vector 0 (8 values)
+ reader.skip(8);
+
+ // Read from vector 1
+ assertEquals(Float.floatToRawIntBits(9.0f), Float.floatToRawIntBits(reader.readFloat()));
+ assertEquals(Float.floatToRawIntBits(10.0f), Float.floatToRawIntBits(reader.readFloat()));
+ }
+
+ // ========== Vector Size Validation ==========
+
+ /**
+ * Verify that vectorSize=65536 is rejected because num_exceptions (uint16)
+ * cannot represent 65536, which would cause silent data corruption if all
+ * values in a vector are exceptions.
+ */
+ @Test(expected = IllegalArgumentException.class)
+ public void testVectorSize65536Rejected() throws Exception {
+ new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator(), 65536);
+ }
+
+ /**
+ * Verify that vectorSize=32768 (max allowed) works correctly.
+ */
+ @Test
+ public void testVectorSize32768Works() throws Exception {
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator(), 32768);
+ // Write a small number of values (partial vector)
+ for (int i = 0; i < 10; i++) {
+ writer.writeFloat(i * 1.0f);
+ }
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(10, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ for (int i = 0; i < 10; i++) {
+ assertEquals(
+ Float.floatToRawIntBits(i * 1.0f),
+ Float.floatToRawIntBits(reader.readFloat()));
+ }
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ // ========== Factor > 0 Tests ==========
+
+ /**
+ * Data where the best (e,f) combo has factor > 0.
+ * Values like 1234.56 have digits on both sides of the decimal point.
+ * With e=4,f=2: 1234.56 * 10000 / 100 = 123456 — exact integer.
+ * This exercises the division path in the encode/decode formula.
+ */
+ @Test
+ public void testDoubleFactorGreaterThanZero() throws Exception {
+ // These values are designed so the best combo has f > 0
+ double[] values = new double[1024];
+ for (int i = 0; i < values.length; i++) {
+ values[i] = 1000.0 + i * 0.01; // 1000.00, 1000.01, ...
+ }
+
+ // Verify the best params actually have f > 0 or f == 0 with low exceptions
+ AlpEncoderDecoder.EncodingParams params =
+ AlpEncoderDecoder.findBestDoubleParams(values, 0, values.length);
+ // Regardless of the chosen f, the roundtrip must be lossless
+ roundTripDouble(values);
+ }
+
+ @Test
+ public void testFloatFactorGreaterThanZero() throws Exception {
+ float[] values = new float[1024];
+ for (int i = 0; i < values.length; i++) {
+ values[i] = 1000.0f + i * 0.01f;
+ }
+ roundTripFloat(values);
+ }
+
+ // ========== Overflow Boundary Tests ==========
+
+ /**
+ * Values near the encoding limits that test the overflow check.
+ * The old code used Long.MAX_VALUE (which rounds to 2^63 as double),
+ * the fix uses ENCODING_UPPER_LIMIT = 9223372036854774784.0.
+ */
+ @Test
+ public void testDoubleNearOverflowBoundary() throws Exception {
+ double[] values = {
+ 9.2e18, // near Long.MAX_VALUE
+ -9.2e18, // near Long.MIN_VALUE
+ 1.7e308, // near Double.MAX_VALUE
+ -1.7e308, // near -Double.MAX_VALUE
+ 4.9e-324, // Double.MIN_VALUE (subnormal)
+ 1.0e18, // large but in range
+ -1.0e18, // large negative but in range
+ 0.0, // normal
+ 1.23, // normal
+ };
+ roundTripDouble(values);
+ }
+
+ @Test
+ public void testFloatNearOverflowBoundary() throws Exception {
+ float[] values = {
+ 2.1e9f, // near Integer.MAX_VALUE
+ -2.1e9f, // near Integer.MIN_VALUE
+ 3.4e38f, // near Float.MAX_VALUE
+ -3.4e38f, // near -Float.MAX_VALUE
+ 1.4e-45f, // Float.MIN_VALUE (subnormal)
+ 1.0e9f, // large but in range
+ 0.0f, // normal
+ 1.23f, // normal
+ };
+ roundTripFloat(values);
+ }
+
+ // ========== Large Scale Stress Test ==========
+
+ @Test
+ public void testDoubleLargeScale100K() throws Exception {
+ Random rand = new Random(12345);
+ double[] values = new double[100_000];
+ for (int i = 0; i < values.length; i++) {
+ values[i] = rand.nextDouble() * 10000.0 - 5000.0;
+ }
+ roundTripDouble(values);
+ }
+
+ @Test
+ public void testFloatLargeScale100K() throws Exception {
+ Random rand = new Random(12345);
+ float[] values = new float[100_000];
+ for (int i = 0; i < values.length; i++) {
+ values[i] = rand.nextFloat() * 10000.0f - 5000.0f;
+ }
+ roundTripFloat(values);
+ }
+
+ // ========== All Exceptions in Large Vector ==========
+
+ /**
+ * A full vector where every value is an exception (NaN/Inf/-0.0).
+ * This exercises the case where placeholder search finds no valid value.
+ */
+ @Test
+ public void testFloatEntireVectorAllExceptions() throws Exception {
+ float[] values = new float[DEFAULT_VECTOR_SIZE + 5]; // partial second vector
+ for (int i = 0; i < values.length; i++) {
+ switch (i % 4) {
+ case 0: values[i] = Float.NaN; break;
+ case 1: values[i] = Float.POSITIVE_INFINITY; break;
+ case 2: values[i] = Float.NEGATIVE_INFINITY; break;
+ case 3: values[i] = -0.0f; break;
+ }
+ }
+ roundTripFloat(values);
+ }
+
+ @Test
+ public void testDoubleEntireVectorAllExceptions() throws Exception {
+ double[] values = new double[DEFAULT_VECTOR_SIZE + 5];
+ for (int i = 0; i < values.length; i++) {
+ switch (i % 4) {
+ case 0: values[i] = Double.NaN; break;
+ case 1: values[i] = Double.POSITIVE_INFINITY; break;
+ case 2: values[i] = Double.NEGATIVE_INFINITY; break;
+ case 3: values[i] = -0.0; break;
+ }
+ }
+ roundTripDouble(values);
+ }
+
+ // ========== Verify Encoder Produces Expected Values ==========
+
+ /**
+ * Manually verify that the encoder produces the exact integer values
+ * we expect for known inputs. This is independent of the writer/reader.
+ */
+ @Test
+ public void testEncoderProducesExpectedValues() {
+ // 1.23f * 100 = 123
+ assertEquals(123, AlpEncoderDecoder.encodeFloat(1.23f, 2, 0));
+ // 19.99f * 100 = 1999
+ assertEquals(1999, AlpEncoderDecoder.encodeFloat(19.99f, 2, 0));
+ // -5.0f * 10 = -50
+ assertEquals(-50, AlpEncoderDecoder.encodeFloat(-5.0f, 1, 0));
+ // 0.0f * anything = 0
+ assertEquals(0, AlpEncoderDecoder.encodeFloat(0.0f, 5, 0));
+ // 42.0f * 1 = 42
+ assertEquals(42, AlpEncoderDecoder.encodeFloat(42.0f, 0, 0));
+ // 1.5f * 10 = 15
+ assertEquals(15, AlpEncoderDecoder.encodeFloat(1.5f, 1, 0));
+
+ // Double path
+ assertEquals(123L, AlpEncoderDecoder.encodeDouble(1.23, 2, 0));
+ assertEquals(1999L, AlpEncoderDecoder.encodeDouble(19.99, 2, 0));
+ assertEquals(-50L, AlpEncoderDecoder.encodeDouble(-5.0, 1, 0));
+ assertEquals(0L, AlpEncoderDecoder.encodeDouble(0.0, 5, 0));
+ assertEquals(42L, AlpEncoderDecoder.encodeDouble(42.0, 0, 0));
+ }
+}
diff --git a/parquet-format-structures/pom.xml b/parquet-format-structures/pom.xml
index 7ff9c6a113..9f7ced9186 100644
--- a/parquet-format-structures/pom.xml
+++ b/parquet-format-structures/pom.xml
@@ -66,6 +66,35 @@
+ *
+ */
+public class AlpAdversarialTest {
+
+ // ---------------------------------------------------------------------------
+ // Helpers: build a known-good encoded page, then mutate copies of it
+ // ---------------------------------------------------------------------------
+
+ /** Build a valid ALP-encoded double page with N clean values. */
+ private static byte[] validDoublePage(int valueCount, int vectorSize) throws Exception {
+ AlpValuesWriter.DoubleAlpValuesWriter writer = null;
+ try {
+ int cap = Math.max(512, valueCount * 16);
+ writer = new AlpValuesWriter.DoubleAlpValuesWriter(
+ cap, cap, new DirectByteBufferAllocator(), vectorSize);
+ // 2-decimal values — the ALP sweet spot, ensures no exceptions
+ for (int i = 0; i < valueCount; i++) {
+ writer.writeDouble((i % 1000) / 100.0);
+ }
+ BytesInput bi = writer.getBytes();
+ ByteBuffer bb = bi.toByteBuffer();
+ byte[] out = new byte[bb.remaining()];
+ bb.duplicate().get(out);
+ return out;
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ /** Build a valid ALP-encoded float page with N clean values. */
+ private static byte[] validFloatPage(int valueCount, int vectorSize) throws Exception {
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ int cap = Math.max(256, valueCount * 8);
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(
+ cap, cap, new DirectByteBufferAllocator(), vectorSize);
+ for (int i = 0; i < valueCount; i++) {
+ writer.writeFloat((i % 1000) / 100.0f);
+ }
+ BytesInput bi = writer.getBytes();
+ ByteBuffer bb = bi.toByteBuffer();
+ byte[] out = new byte[bb.remaining()];
+ bb.duplicate().get(out);
+ return out;
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ /** Sanity baseline: the known-good page actually decodes cleanly. */
+ @Test
+ public void sanityBaselineDecodesClean() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ for (int i = 0; i < 32; i++) reader.readDouble();
+ }
+
+ // ---------------------------------------------------------------------------
+ // Header-level validation (already-validated paths)
+ // ---------------------------------------------------------------------------
+
+ @Test
+ public void rejectsBadCompressionMode() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ page[0] = (byte) 0x99; // mode is at byte 0
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, () -> {
+ new AlpValuesReaderForDouble().initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ });
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("compression"));
+ }
+
+ @Test
+ public void rejectsBadIntegerEncoding() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ page[1] = (byte) 0x99; // integer_encoding is at byte 1
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, () -> {
+ new AlpValuesReaderForDouble().initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ });
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("integer encoding"));
+ }
+
+ @Test
+ public void rejectsLogVectorSizeTooLarge() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ page[2] = (byte) 99; // log_vector_size at byte 2
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, () -> {
+ new AlpValuesReaderForDouble().initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ });
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("vector size"));
+ }
+
+ @Test
+ public void rejectsLogVectorSizeTooSmall() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ page[2] = (byte) 2; // below MIN_LOG_VECTOR_SIZE=3
+ assertThrows(ParquetDecodingException.class, () -> {
+ new AlpValuesReaderForDouble().initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ });
+ }
+
+ @Test
+ public void rejectsNegativeNumElements() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ // num_elements is int32 LE at bytes 3..6 — write -1
+ page[3] = (byte) 0xFF;
+ page[4] = (byte) 0xFF;
+ page[5] = (byte) 0xFF;
+ page[6] = (byte) 0xFF;
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, () -> {
+ new AlpValuesReaderForDouble().initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ });
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("element count"));
+ }
+
+ @Test
+ public void rejectsNumElementsGreaterThanValuesCount() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ // num_elements stays 32; pass valuesCount=10 (smaller than encoded count)
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, () -> {
+ new AlpValuesReaderForDouble().initFromPage(10, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ });
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("exceeds"));
+ }
+
+ // ---------------------------------------------------------------------------
+ // Vector-level validation (already-validated paths, surface lazily on decode)
+ // ---------------------------------------------------------------------------
+
+ /** Helper: find the byte position where the first vector's metadata starts. */
+ private static int firstVectorOffset(byte[] page) {
+ // header (7) + first 4 bytes of offset array = the offset value itself
+ int firstVectorOff = ByteBuffer.wrap(page, 7, 4).order(ByteOrder.LITTLE_ENDIAN).getInt();
+ // offsets are measured from the start of the compression body (after the 7B header)
+ return 7 + firstVectorOff;
+ }
+
+ @Test
+ public void rejectsExponentTooHighDouble() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ int v0 = firstVectorOffset(page);
+ page[v0] = (byte) 99; // exponent byte
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, reader::readDouble);
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("exponent"));
+ }
+
+ @Test
+ public void rejectsExponentTooHighFloat() throws Exception {
+ byte[] page = validFloatPage(32, 16);
+ int v0 = firstVectorOffset(page);
+ page[v0] = (byte) 99;
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, reader::readFloat);
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("exponent"));
+ }
+
+ @Test
+ public void rejectsFactorGreaterThanExponent() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ int v0 = firstVectorOffset(page);
+ page[v0] = (byte) 2; // exponent
+ page[v0 + 1] = (byte) 5; // factor > exponent
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, reader::readDouble);
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("factor"));
+ }
+
+ @Test
+ public void rejectsTooManyExceptions() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ int v0 = firstVectorOffset(page);
+ // num_exceptions at v0+2, uint16 LE — set to 9999, way more than vectorLen=16
+ page[v0 + 2] = (byte) (9999 & 0xFF);
+ page[v0 + 3] = (byte) ((9999 >>> 8) & 0xFF);
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, reader::readDouble);
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("numexceptions"));
+ }
+
+ // ---------------------------------------------------------------------------
+ // Currently-unvalidated paths: truncation and corrupted offsets
+ // These currently fail with low-level Throwables (BufferUnderflowException,
+ // IndexOutOfBoundsException). The tests assert any Throwable is raised so we
+ // notice if a regression silently swallows the corruption.
+ // ---------------------------------------------------------------------------
+
+ /** Page with only the 7-byte header — nothing else. */
+ @Test
+ public void rejectsHeaderOnlyPage() {
+ byte[] tiny = new byte[]{0x00, 0x00, 0x0A, 0x20, 0x00, 0x00, 0x00}; // 32 elements, log_vec=10
+ Throwable t = catchAny(() -> {
+ new AlpValuesReaderForDouble().initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(tiny)));
+ });
+ assertNotNull("header-only page must raise", t);
+ }
+
+ @Test
+ public void rejectsPageTruncatedMidOffsetArray() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ // num_vectors = ceil(32/16) = 2, so offset array is 8 bytes. Truncate to chop the 2nd offset.
+ byte[] truncated = new byte[7 + 4]; // header + first offset only
+ System.arraycopy(page, 0, truncated, 0, truncated.length);
+ Throwable t = catchAny(() -> {
+ new AlpValuesReaderForDouble().initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(truncated)));
+ });
+ assertNotNull("truncated offset array must raise", t);
+ }
+
+ @Test
+ public void rejectsPageTruncatedMidVectorData() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ // chop the last 20 bytes — guaranteed to land in the middle of the second vector
+ byte[] truncated = new byte[page.length - 20];
+ System.arraycopy(page, 0, truncated, 0, truncated.length);
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ // initFromPage should still succeed (truncation is inside the vectors section,
+ // which initFromPage just slices without parsing). The failure surfaces on decode.
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(truncated)));
+ Throwable t = catchAny(() -> {
+ for (int i = 0; i < 32; i++) reader.readDouble();
+ });
+ assertNotNull("truncated vector data must raise on read", t);
+ }
+
+ @Test
+ public void rejectsCorruptedOffsetPointingPastEnd() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ // Offset array starts at byte 7. Overwrite the first offset (uint32 LE) with a huge value.
+ page[7] = (byte) 0xFF;
+ page[8] = (byte) 0xFF;
+ page[9] = (byte) 0xFF;
+ page[10] = (byte) 0x7F;
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ Throwable t = catchAny(() -> reader.readDouble());
+ assertNotNull("corrupted offset must raise on decode", t);
+ }
+
+ // ---------------------------------------------------------------------------
+ // skip() / read() bounds
+ // ---------------------------------------------------------------------------
+
+ @Test
+ public void rejectsSkipPastEnd() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ assertThrows(ParquetDecodingException.class, () -> reader.skip(33));
+ }
+
+ @Test
+ public void rejectsNegativeSkip() throws Exception {
+ byte[] page = validDoublePage(32, 16);
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(32, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ assertThrows(ParquetDecodingException.class, () -> reader.skip(-1));
+ }
+
+ @Test
+ public void rejectsReadPastEnd() throws Exception {
+ byte[] page = validDoublePage(8, 8);
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(8, ByteBufferInputStream.wrap(ByteBuffer.wrap(page)));
+ for (int i = 0; i < 8; i++) reader.readDouble();
+ ParquetDecodingException ex = assertThrows(ParquetDecodingException.class, reader::readDouble);
+ assertTrue(ex.getMessage(), ex.getMessage().toLowerCase().contains("exhausted"));
+ }
+
+ // ---------------------------------------------------------------------------
+ // Utility
+ // ---------------------------------------------------------------------------
+
+ @FunctionalInterface
+ private interface ThrowingRunnable {
+ void run() throws Throwable;
+ }
+
+ /** Catch any Throwable (including low-level RuntimeExceptions / Errors). */
+ private static Throwable catchAny(ThrowingRunnable r) {
+ try {
+ r.run();
+ } catch (Throwable t) {
+ return t;
+ }
+ fail("Expected a Throwable but none was raised");
+ return null; // unreachable
+ }
+}
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpBitPackingTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpBitPackingTest.java
new file mode 100644
index 0000000000..ed6a240f3e
--- /dev/null
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpBitPackingTest.java
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.values.alp;
+
+import static org.junit.Assert.*;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.bytes.BytesInput;
+import org.apache.parquet.bytes.DirectByteBufferAllocator;
+import org.apache.parquet.column.values.bitpacking.BytePacker;
+import org.apache.parquet.column.values.bitpacking.BytePackerForLong;
+import org.apache.parquet.column.values.bitpacking.Packer;
+import org.junit.Test;
+
+/**
+ * Tests for bit-packing behavior in the ALP encoding pipeline.
+ */
+public class AlpBitPackingTest {
+
+ @Test
+ public void testBytePackerIntRoundTrip() {
+ // Verify BytePacker pack/unpack round-trip for various bit widths
+ for (int bitWidth = 1; bitWidth <= 31; bitWidth++) {
+ BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth);
+ int maxVal = (int) Math.min((1L << bitWidth) - 1, Integer.MAX_VALUE);
+
+ int[] input = new int[8];
+ for (int i = 0; i < 8; i++) {
+ input[i] = (maxVal / 8) * i;
+ }
+
+ byte[] packed = new byte[bitWidth];
+ packer.pack8Values(input, 0, packed, 0);
+
+ int[] unpacked = new int[8];
+ ByteBuffer buf = ByteBuffer.wrap(packed);
+ packer.unpack8Values(buf, 0, unpacked, 0);
+
+ for (int i = 0; i < 8; i++) {
+ assertEquals("BitWidth=" + bitWidth + " index=" + i, input[i], unpacked[i]);
+ }
+ }
+ }
+
+ @Test
+ public void testBytePackerForLongRoundTrip() {
+ // Verify BytePackerForLong pack/unpack round-trip for various bit widths
+ for (int bitWidth = 1; bitWidth <= 63; bitWidth++) {
+ BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidth);
+ long maxVal = (bitWidth == 63) ? Long.MAX_VALUE : (1L << bitWidth) - 1;
+
+ long[] input = new long[8];
+ for (int i = 0; i < 8; i++) {
+ input[i] = (maxVal / 8) * i;
+ }
+
+ byte[] packed = new byte[bitWidth];
+ packer.pack8Values(input, 0, packed, 0);
+
+ long[] unpacked = new long[8];
+ ByteBuffer buf = ByteBuffer.wrap(packed);
+ packer.unpack8Values(buf, 0, unpacked, 0);
+
+ for (int i = 0; i < 8; i++) {
+ assertEquals("BitWidth=" + bitWidth + " index=" + i, input[i], unpacked[i]);
+ }
+ }
+ }
+
+ @Test
+ public void testSimpleTwoFloats() throws Exception {
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator());
+ writer.writeFloat(1.0f);
+ writer.writeFloat(2.0f);
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(2, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ assertEquals(Float.floatToRawIntBits(1.0f), Float.floatToRawIntBits(reader.readFloat()));
+ assertEquals(Float.floatToRawIntBits(2.0f), Float.floatToRawIntBits(reader.readFloat()));
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ @Test
+ public void testThreeFloatsWithNegative() throws Exception {
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator());
+ writer.writeFloat(1.0f);
+ writer.writeFloat(-1.0f);
+ writer.writeFloat(2.0f);
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(3, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ assertEquals(Float.floatToRawIntBits(1.0f), Float.floatToRawIntBits(reader.readFloat()));
+ assertEquals(Float.floatToRawIntBits(-1.0f), Float.floatToRawIntBits(reader.readFloat()));
+ assertEquals(Float.floatToRawIntBits(2.0f), Float.floatToRawIntBits(reader.readFloat()));
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ @Test
+ public void testEncoderDecoderDirectly() {
+ float value = 1.0f;
+ int exponent = 2;
+ int factor = 0;
+
+ assertFalse(AlpEncoderDecoder.isFloatException(value, exponent, factor));
+ int encoded = AlpEncoderDecoder.encodeFloat(value, exponent, factor);
+ float decoded = AlpEncoderDecoder.decodeFloat(encoded, exponent, factor);
+ assertEquals(Float.floatToRawIntBits(value), Float.floatToRawIntBits(decoded));
+ }
+
+ @Test
+ public void testHeaderFormatValidation() throws Exception {
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator());
+ writer.writeFloat(1.0f);
+ writer.writeFloat(2.0f);
+
+ BytesInput input = writer.getBytes();
+ byte[] bytes = input.toByteArray();
+
+ // Parse and validate header (7 bytes: mode + encoding + logVectorSize + count)
+ ByteBuffer buf = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN);
+ int compressionMode = buf.get() & 0xFF;
+ int integerEncoding = buf.get() & 0xFF;
+ int logVectorSize = buf.get() & 0xFF;
+ int numElements = buf.getInt();
+
+ assertEquals(AlpConstants.ALP_COMPRESSION_MODE, compressionMode);
+ assertEquals(AlpConstants.ALP_INTEGER_ENCODING_FOR, integerEncoding);
+ assertEquals(AlpConstants.DEFAULT_VECTOR_SIZE_LOG, logVectorSize);
+ assertEquals(2, numElements);
+
+ // Verify offset array follows header (1 vector = 1 offset entry)
+ int offset0 = buf.getInt();
+ // First vector starts right after the offset array (1 vector * 4 bytes = 4)
+ assertEquals(Integer.BYTES, offset0);
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ @Test
+ public void testPartialGroupPacking() throws Exception {
+ // Test with fewer than 8 values to exercise partial group packing/unpacking
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator());
+ float[] values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
+ for (float v : values) {
+ writer.writeFloat(v);
+ }
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(values.length, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ for (float expected : values) {
+ assertEquals(Float.floatToRawIntBits(expected), Float.floatToRawIntBits(reader.readFloat()));
+ }
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ @Test
+ public void testBitWidthZeroPacking() throws Exception {
+ // All identical values should result in bitWidth=0 (no packed data)
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator());
+ for (int i = 0; i < 10; i++) {
+ writer.writeFloat(5.0f);
+ }
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(10, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ for (int i = 0; i < 10; i++) {
+ assertEquals(Float.floatToRawIntBits(5.0f), Float.floatToRawIntBits(reader.readFloat()));
+ }
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ @Test
+ public void testDoublePackingRoundTrip() throws Exception {
+ // Verify double packing round-trip with varying values
+ AlpValuesWriter.DoubleAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.DoubleAlpValuesWriter(512, 512, new DirectByteBufferAllocator());
+ double[] values = {1.0, -1.0, 2.0, 100.5, 0.0, 3.14, 42.0, 99.99, 0.001, 7.77};
+ for (double v : values) {
+ writer.writeDouble(v);
+ }
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
+ reader.initFromPage(values.length, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ for (double expected : values) {
+ assertEquals(Double.doubleToRawLongBits(expected), Double.doubleToRawLongBits(reader.readDouble()));
+ }
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+
+ @Test
+ public void testExactEightValues() throws Exception {
+ // Exactly 8 values = one full packing group, no partial group
+ AlpValuesWriter.FloatAlpValuesWriter writer = null;
+ try {
+ writer = new AlpValuesWriter.FloatAlpValuesWriter(256, 256, new DirectByteBufferAllocator());
+ float[] values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+ for (float v : values) {
+ writer.writeFloat(v);
+ }
+
+ BytesInput input = writer.getBytes();
+ AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
+ reader.initFromPage(values.length, ByteBufferInputStream.wrap(input.toByteBuffer()));
+
+ for (float expected : values) {
+ assertEquals(Float.floatToRawIntBits(expected), Float.floatToRawIntBits(reader.readFloat()));
+ }
+ } finally {
+ if (writer != null) {
+ writer.reset();
+ writer.close();
+ }
+ }
+ }
+}
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpEncoderDecoderTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpEncoderDecoderTest.java
new file mode 100644
index 0000000000..b07ed5ede8
--- /dev/null
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpEncoderDecoderTest.java
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.values.alp;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+/**
+ * Tests for the core ALP encoder/decoder logic.
+ */
+public class AlpEncoderDecoderTest {
+
+ // ========== Float Encoding/Decoding Tests ==========
+
+ @Test
+ public void testFloatRoundTrip() {
+ float[] testValues = {0.0f, 1.0f, -1.0f, 3.14159f, 100.5f, 0.001f, 1234567.0f};
+
+ for (float value : testValues) {
+ for (int exponent = 0; exponent <= AlpConstants.FLOAT_MAX_EXPONENT; exponent++) {
+ for (int factor = 0; factor <= exponent; factor++) {
+ if (!AlpEncoderDecoder.isFloatException(value, exponent, factor)) {
+ int encoded = AlpEncoderDecoder.encodeFloat(value, exponent, factor);
+ float decoded = AlpEncoderDecoder.decodeFloat(encoded, exponent, factor);
+ assertEquals(
+ "Round-trip failed for value=" + value + ", exponent=" + exponent + ", factor="
+ + factor,
+ Float.floatToRawIntBits(value),
+ Float.floatToRawIntBits(decoded));
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testFloatExceptionDetection() {
+ assertTrue("NaN should be an exception", AlpEncoderDecoder.isFloatException(Float.NaN));
+ assertTrue(
+ "Positive infinity should be an exception",
+ AlpEncoderDecoder.isFloatException(Float.POSITIVE_INFINITY));
+ assertTrue(
+ "Negative infinity should be an exception",
+ AlpEncoderDecoder.isFloatException(Float.NEGATIVE_INFINITY));
+ assertTrue("Negative zero should be an exception", AlpEncoderDecoder.isFloatException(-0.0f));
+
+ assertFalse("1.0f should not be a basic exception", AlpEncoderDecoder.isFloatException(1.0f));
+ assertFalse("0.0f should not be a basic exception", AlpEncoderDecoder.isFloatException(0.0f));
+ }
+
+ @Test
+ public void testFloatEncoding() {
+ assertEquals(123, AlpEncoderDecoder.encodeFloat(1.23f, 2, 0));
+ assertEquals(123, AlpEncoderDecoder.encodeFloat(12.3f, 2, 1));
+ assertEquals(0, AlpEncoderDecoder.encodeFloat(0.0f, 5, 0));
+ }
+
+ @Test
+ public void testFloatDecoding() {
+ assertEquals(1.23f, AlpEncoderDecoder.decodeFloat(123, 2, 0), 1e-6f);
+ assertEquals(12.3f, AlpEncoderDecoder.decodeFloat(123, 2, 1), 1e-6f);
+ assertEquals(0.0f, AlpEncoderDecoder.decodeFloat(0, 5, 0), 0.0f);
+ }
+
+ @Test
+ public void testFloatEncodeRounding() {
+ // Verify rounding behavior (magic number trick rounds to nearest)
+ assertEquals(5, AlpEncoderDecoder.encodeFloat(5.4f, 0, 0));
+ assertEquals(6, AlpEncoderDecoder.encodeFloat(5.6f, 0, 0));
+ assertEquals(-5, AlpEncoderDecoder.encodeFloat(-5.4f, 0, 0));
+ assertEquals(-6, AlpEncoderDecoder.encodeFloat(-5.6f, 0, 0));
+ assertEquals(0, AlpEncoderDecoder.encodeFloat(0.0f, 0, 0));
+ }
+
+ @Test
+ public void testFloatEncodeDecodeWithFactor() {
+ // Verify that encode/decode with non-zero factor works correctly.
+ // The key correctness property: encode uses value * POW10[e] * POW10_NEGATIVE[f],
+ // and decode uses encoded * POW10[f] * POW10_NEGATIVE[e].
+ float value = 12.3f;
+ int encoded = AlpEncoderDecoder.encodeFloat(value, 2, 1);
+ assertEquals(123, encoded); // 12.3 * 100 * 0.1 = 123
+ float decoded = AlpEncoderDecoder.decodeFloat(encoded, 2, 1);
+ assertEquals(Float.floatToRawIntBits(value), Float.floatToRawIntBits(decoded));
+ }
+
+ // ========== Double Encoding/Decoding Tests ==========
+
+ @Test
+ public void testDoubleRoundTrip() {
+ double[] testValues = {0.0, 1.0, -1.0, 3.14159265358979, 100.5, 0.001, 12345678901234.0};
+
+ for (double value : testValues) {
+ for (int exponent = 0; exponent <= Math.min(AlpConstants.DOUBLE_MAX_EXPONENT, 10); exponent++) {
+ for (int factor = 0; factor <= exponent; factor++) {
+ if (!AlpEncoderDecoder.isDoubleException(value, exponent, factor)) {
+ long encoded = AlpEncoderDecoder.encodeDouble(value, exponent, factor);
+ double decoded = AlpEncoderDecoder.decodeDouble(encoded, exponent, factor);
+ assertEquals(
+ "Round-trip failed for value=" + value + ", exponent=" + exponent + ", factor="
+ + factor,
+ Double.doubleToRawLongBits(value),
+ Double.doubleToRawLongBits(decoded));
+ }
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testDoubleExceptionDetection() {
+ assertTrue("NaN should be an exception", AlpEncoderDecoder.isDoubleException(Double.NaN));
+ assertTrue(
+ "Positive infinity should be an exception",
+ AlpEncoderDecoder.isDoubleException(Double.POSITIVE_INFINITY));
+ assertTrue(
+ "Negative infinity should be an exception",
+ AlpEncoderDecoder.isDoubleException(Double.NEGATIVE_INFINITY));
+ assertTrue("Negative zero should be an exception", AlpEncoderDecoder.isDoubleException(-0.0));
+
+ assertFalse("1.0 should not be a basic exception", AlpEncoderDecoder.isDoubleException(1.0));
+ assertFalse("0.0 should not be a basic exception", AlpEncoderDecoder.isDoubleException(0.0));
+ }
+
+ @Test
+ public void testDoubleEncoding() {
+ assertEquals(123L, AlpEncoderDecoder.encodeDouble(1.23, 2, 0));
+ assertEquals(123L, AlpEncoderDecoder.encodeDouble(12.3, 2, 1));
+ assertEquals(0L, AlpEncoderDecoder.encodeDouble(0.0, 5, 0));
+ }
+
+ @Test
+ public void testDoubleDecoding() {
+ assertEquals(1.23, AlpEncoderDecoder.decodeDouble(123, 2, 0), 1e-10);
+ assertEquals(12.3, AlpEncoderDecoder.decodeDouble(123, 2, 1), 1e-10);
+ assertEquals(0.0, AlpEncoderDecoder.decodeDouble(0, 5, 0), 0.0);
+ }
+
+ @Test
+ public void testDoubleEncodeRounding() {
+ // Verify rounding behavior (magic number trick rounds to nearest)
+ assertEquals(5L, AlpEncoderDecoder.encodeDouble(5.4, 0, 0));
+ assertEquals(6L, AlpEncoderDecoder.encodeDouble(5.6, 0, 0));
+ assertEquals(-5L, AlpEncoderDecoder.encodeDouble(-5.4, 0, 0));
+ assertEquals(-6L, AlpEncoderDecoder.encodeDouble(-5.6, 0, 0));
+ assertEquals(0L, AlpEncoderDecoder.encodeDouble(0.0, 0, 0));
+ }
+
+ @Test
+ public void testDoubleEncodeDecodeWithFactor() {
+ // Verify that encode/decode with non-zero factor works correctly.
+ // The key correctness property: encode uses value * POW10[e] * POW10_NEGATIVE[f],
+ // and decode uses encoded * POW10[f] * POW10_NEGATIVE[e].
+ double value = 12.3;
+ long encoded = AlpEncoderDecoder.encodeDouble(value, 2, 1);
+ assertEquals(123L, encoded); // 12.3 * 100 * 0.1 = 123
+ double decoded = AlpEncoderDecoder.decodeDouble(encoded, 2, 1);
+ assertEquals(Double.doubleToRawLongBits(value), Double.doubleToRawLongBits(decoded));
+ }
+
+ @Test
+ public void testDoubleEncodeDecodeArithmeticOrder() {
+ // This test verifies that the exact order of operations in encode/decode
+ // is critical for IEEE 754 correctness. The encode uses
+ // fastRound(value * POW10[e] * POW10_NEGATIVE[f]) and decode uses
+ // (encoded * POW10[f] * POW10_NEGATIVE[e]), both as single expressions.
+ // Splitting the multiplies or reordering changes rounding.
+ double[] testValues = {0.123456789, 1.23456789, 12.3456789, 123.456789, 1234.56789};
+ for (double value : testValues) {
+ for (int e = 0; e <= 10; e++) {
+ for (int f = 0; f <= e; f++) {
+ if (!AlpEncoderDecoder.isDoubleException(value, e, f)) {
+ long encoded = AlpEncoderDecoder.encodeDouble(value, e, f);
+ double decoded = AlpEncoderDecoder.decodeDouble(encoded, e, f);
+ assertEquals(
+ "Roundtrip failed for " + value + " (e=" + e + ", f=" + f + ")",
+ Double.doubleToRawLongBits(value),
+ Double.doubleToRawLongBits(decoded));
+ }
+ }
+ }
+ }
+ }
+
+ // ========== Bit Width Tests (renamed methods) ==========
+
+ @Test
+ public void testBitWidthForInt() {
+ assertEquals(0, AlpEncoderDecoder.bitWidthForInt(0));
+ assertEquals(1, AlpEncoderDecoder.bitWidthForInt(1));
+ assertEquals(2, AlpEncoderDecoder.bitWidthForInt(2));
+ assertEquals(2, AlpEncoderDecoder.bitWidthForInt(3));
+ assertEquals(3, AlpEncoderDecoder.bitWidthForInt(4));
+ assertEquals(8, AlpEncoderDecoder.bitWidthForInt(255));
+ assertEquals(9, AlpEncoderDecoder.bitWidthForInt(256));
+ assertEquals(16, AlpEncoderDecoder.bitWidthForInt(65535));
+ assertEquals(31, AlpEncoderDecoder.bitWidthForInt(Integer.MAX_VALUE));
+ }
+
+ @Test
+ public void testBitWidthForLong() {
+ assertEquals(0, AlpEncoderDecoder.bitWidthForLong(0L));
+ assertEquals(1, AlpEncoderDecoder.bitWidthForLong(1L));
+ assertEquals(2, AlpEncoderDecoder.bitWidthForLong(2L));
+ assertEquals(2, AlpEncoderDecoder.bitWidthForLong(3L));
+ assertEquals(3, AlpEncoderDecoder.bitWidthForLong(4L));
+ assertEquals(8, AlpEncoderDecoder.bitWidthForLong(255L));
+ assertEquals(9, AlpEncoderDecoder.bitWidthForLong(256L));
+ assertEquals(16, AlpEncoderDecoder.bitWidthForLong(65535L));
+ assertEquals(31, AlpEncoderDecoder.bitWidthForLong((long) Integer.MAX_VALUE));
+ assertEquals(63, AlpEncoderDecoder.bitWidthForLong(Long.MAX_VALUE));
+ }
+
+ // ========== Best Parameters Tests ==========
+
+ @Test
+ public void testFindBestFloatParams() {
+ float[] values = {1.23f, 4.56f, 7.89f, 10.11f, 12.13f};
+ AlpEncoderDecoder.EncodingParams params = AlpEncoderDecoder.findBestFloatParams(values, 0, values.length);
+
+ assertNotNull(params);
+ assertTrue(params.exponent >= 0 && params.exponent <= AlpConstants.FLOAT_MAX_EXPONENT);
+ assertTrue(params.factor >= 0 && params.factor <= params.exponent);
+
+ for (float v : values) {
+ if (!AlpEncoderDecoder.isFloatException(v, params.exponent, params.factor)) {
+ int encoded = AlpEncoderDecoder.encodeFloat(v, params.exponent, params.factor);
+ float decoded = AlpEncoderDecoder.decodeFloat(encoded, params.exponent, params.factor);
+ assertEquals(Float.floatToRawIntBits(v), Float.floatToRawIntBits(decoded));
+ }
+ }
+ }
+
+ @Test
+ public void testFindBestFloatParamsWithAllExceptions() {
+ float[] values = {Float.NaN, Float.NaN, Float.NaN};
+ AlpEncoderDecoder.EncodingParams params = AlpEncoderDecoder.findBestFloatParams(values, 0, values.length);
+
+ assertNotNull(params);
+ assertEquals(values.length, params.numExceptions);
+ }
+
+ @Test
+ public void testFindBestDoubleParams() {
+ double[] values = {1.23, 4.56, 7.89, 10.11, 12.13};
+ AlpEncoderDecoder.EncodingParams params = AlpEncoderDecoder.findBestDoubleParams(values, 0, values.length);
+
+ assertNotNull(params);
+ assertTrue(params.exponent >= 0 && params.exponent <= AlpConstants.DOUBLE_MAX_EXPONENT);
+ assertTrue(params.factor >= 0 && params.factor <= params.exponent);
+
+ for (double v : values) {
+ if (!AlpEncoderDecoder.isDoubleException(v, params.exponent, params.factor)) {
+ long encoded = AlpEncoderDecoder.encodeDouble(v, params.exponent, params.factor);
+ double decoded = AlpEncoderDecoder.decodeDouble(encoded, params.exponent, params.factor);
+ assertEquals(Double.doubleToRawLongBits(v), Double.doubleToRawLongBits(decoded));
+ }
+ }
+ }
+
+ @Test
+ public void testFindBestDoubleParamsWithAllExceptions() {
+ double[] values = {Double.NaN, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY};
+ AlpEncoderDecoder.EncodingParams params = AlpEncoderDecoder.findBestDoubleParams(values, 0, values.length);
+
+ assertNotNull(params);
+ assertEquals(values.length, params.numExceptions);
+ }
+
+ @Test
+ public void testFindBestParamsWithOffset() {
+ float[] values = {Float.NaN, Float.NaN, 1.23f, 4.56f, 7.89f, Float.NaN};
+ AlpEncoderDecoder.EncodingParams params = AlpEncoderDecoder.findBestFloatParams(values, 2, 3);
+
+ assertNotNull(params);
+ assertEquals(0, params.numExceptions);
+ }
+
+ // ========== Preset-Based Parameter Search Tests ==========
+
+ @Test
+ public void testFindBestFloatParamsWithPresets() {
+ float[] values = {1.23f, 4.56f, 7.89f, 10.11f, 12.13f};
+ int[][] presets = {{2, 0}, {3, 0}, {4, 1}};
+ AlpEncoderDecoder.EncodingParams params =
+ AlpEncoderDecoder.findBestFloatParamsWithPresets(values, 0, values.length, presets);
+
+ assertNotNull(params);
+ // Should select one of the preset combinations
+ boolean foundMatch = false;
+ for (int[] preset : presets) {
+ if (params.exponent == preset[0] && params.factor == preset[1]) {
+ foundMatch = true;
+ break;
+ }
+ }
+ assertTrue("Result should be one of the preset combinations", foundMatch);
+ }
+
+ @Test
+ public void testFindBestDoubleParamsWithPresets() {
+ double[] values = {1.23, 4.56, 7.89, 10.11, 12.13};
+ int[][] presets = {{2, 0}, {3, 0}, {4, 1}};
+ AlpEncoderDecoder.EncodingParams params =
+ AlpEncoderDecoder.findBestDoubleParamsWithPresets(values, 0, values.length, presets);
+
+ assertNotNull(params);
+ boolean foundMatch = false;
+ for (int[] preset : presets) {
+ if (params.exponent == preset[0] && params.factor == preset[1]) {
+ foundMatch = true;
+ break;
+ }
+ }
+ assertTrue("Result should be one of the preset combinations", foundMatch);
+ }
+
+ @Test
+ public void testPresetsProduceSameResultAsFullSearch() {
+ float[] values = {1.23f, 4.56f, 7.89f};
+ AlpEncoderDecoder.EncodingParams fullResult = AlpEncoderDecoder.findBestFloatParams(values, 0, values.length);
+
+ // Include the best params in presets
+ int[][] presets = {{fullResult.exponent, fullResult.factor}, {0, 0}, {1, 0}};
+ AlpEncoderDecoder.EncodingParams presetResult =
+ AlpEncoderDecoder.findBestFloatParamsWithPresets(values, 0, values.length, presets);
+
+ assertTrue(
+ "Preset result should be at least as good as full search",
+ presetResult.numExceptions <= fullResult.numExceptions);
+ }
+}
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpExceptionCountTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpExceptionCountTest.java
new file mode 100644
index 0000000000..11d11401dc
--- /dev/null
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpExceptionCountTest.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.column.values.alp;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assume.assumeTrue;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.junit.Test;
+
+/**
+ * Measures exception counts and compression ratios using the real Spotify and Arade datasets from
+ * the parquet-testing repository (Pratik's alpFloatingPointDataset branch).
+ *
+ *
+ * ./mvnw test -pl parquet-column -Dtest=AlpExceptionCountTest
+ *
+ *
+ *
+ * ./mvnw test -pl parquet-column -Dtest=AlpExceptionCountTest \
+ * -DALP_TEST_DATA_DIR=/path/to/alp-test-data
+ *
+ */
+public class AlpExceptionCountTest {
+
+ private static final int VECTOR_SIZE = AlpConstants.DEFAULT_VECTOR_SIZE;
+
+ private File getDataDir() {
+ String dir = System.getProperty("ALP_TEST_DATA_DIR");
+ if (dir == null) dir = System.getenv("ALP_TEST_DATA_DIR");
+ if (dir != null && new File(dir).isDirectory()) return new File(dir);
+ // Default: alp-test-data/ at project root (three levels up from module target)
+ File candidate = new File(System.getProperty("user.dir")).getParentFile();
+ if (candidate != null) {
+ File d = new File(candidate, "alp-test-data");
+ if (d.isDirectory()) return d;
+ }
+ // Also try relative to cwd
+ File d2 = new File(System.getProperty("user.dir"), "alp-test-data");
+ return d2.isDirectory() ? d2 : null;
+ }
+
+ /** Reads a CSV file into per-column double arrays. Handles both ',' and '|' delimiters. */
+ private List> cols = new ArrayList<>();
+ String delim = null;
+ try (BufferedReader br = new BufferedReader(new FileReader(file))) {
+ String line;
+ boolean header = true;
+ while ((line = br.readLine()) != null) {
+ if (delim == null) delim = line.contains("|") ? "\\|" : ",";
+ String[] parts = line.split(delim);
+ if (header) {
+ for (String ignored : parts) cols.add(new ArrayList<>());
+ header = false;
+ continue;
+ }
+ for (int i = 0; i < parts.length && i < cols.size(); i++) {
+ try {
+ cols.get(i).add(Double.parseDouble(parts[i].trim()));
+ } catch (NumberFormatException e) {
+ cols.get(i).add(0.0);
+ }
+ }
+ }
+ }
+ List
Reads ALP-encoded parquet files generated by Arrow C++ and verifies that the Java + * implementation decodes them correctly. + * + *
Set ALP_TEST_FILE (env var or system property) to a single file, or use the
+ * ALP_TEST_DATA_DIR property pointing to the alp-test-data/ directory.
+ *
+ * @see Arrow C++ ALP PR
+ * @see parquet-testing ALP PR
+ */
+public class TestInterOpReadAlp {
+ private static final Logger LOG = LoggerFactory.getLogger(TestInterOpReadAlp.class);
+
+ @Rule
+ public TemporaryFolder temp = new TemporaryFolder();
+
+ private static final String[] CPP_DOUBLE_FILES = {"alp_spotify1.parquet", "alp_arade.parquet"};
+ private static final String[] CPP_FLOAT_FILES = {
+ "alp_float_spotify1.parquet", "alp_float_arade.parquet"
+ };
+
+ private java.nio.file.Path getTestDataDir() {
+ String dir = System.getProperty("ALP_TEST_DATA_DIR");
+ if (dir == null) dir = System.getenv("ALP_TEST_DATA_DIR");
+ if (dir != null && new File(dir).isDirectory()) return Paths.get(dir);
+ // Default: alp-test-data/ relative to project root (two levels up from target/test-classes)
+ java.nio.file.Path candidate =
+ Paths.get(System.getProperty("user.dir")).resolve("alp-test-data");
+ return candidate.toFile().isDirectory() ? candidate : null;
+ }
+
+ private java.nio.file.Path getSingleTestFile() {
+ String filePath = System.getProperty("ALP_TEST_FILE");
+ if (filePath == null) filePath = System.getenv("ALP_TEST_FILE");
+ if (filePath != null && new File(filePath).exists()) return Paths.get(filePath);
+ return null;
+ }
+
+ /** Read all rows from a parquet file using LocalInputFile (no Hadoop FileSystem). */
+ private List This test addresses cross-language compatibility: Java writes, another implementation reads.
+ */
+ @Test
+ public void testJavaWrittenAlpFilesReadableByPyarrow() throws IOException, InterruptedException {
+ // Check python3 is available
+ Process check = new ProcessBuilder("python3", "-c", "import pyarrow.parquet")
+ .redirectErrorStream(true)
+ .start();
+ assumeTrue("python3/pyarrow not available, skipping cross-language test", check.waitFor() == 0);
+
+ for (WriterVersion version : new WriterVersion[] {WriterVersion.PARQUET_1_0, WriterVersion.PARQUET_2_0}) {
+ MessageType schema = MessageTypeParser.parseMessageType(ALP_SCHEMA);
+ java.nio.file.Path outPath =
+ temp.newFolder().toPath().resolve("alp_java_xcompat_" + version.name().toLowerCase() + ".parquet");
+
+ try (ParquetWriter Page version is orthogonal to ALP encoding (the page version difference lives in the
+ * parquet protocol layer, not in the ALP payload), but covering both axes makes the fixture
+ * set fully symmetric for cross-language compatibility — every reader can verify it handles
+ * Java-written ALP regardless of how the surrounding pages are framed.
+ *
+ * To run: clone https://github.com/prtkgaur/parquet-testing/tree/alpFloatingPointDataset
+ * and point ALP_TEST_DATA_DIR at its {@code data/} directory. Set ALP_OUTPUT_DIR to choose
+ * where to write the generated fixtures; if unset, this test skips.
+ */
+ @Test
+ public void generateAlpFixturesAtMultipleVectorSizes() throws IOException {
+ java.nio.file.Path sourceDir = getTestDataDir();
+ assumeTrue("ALP source dir not found, skipping fixture generator", sourceDir != null);
+
+ java.nio.file.Path outDir = getOutputDir();
+ assumeTrue("ALP_OUTPUT_DIR not set, skipping fixture generator", outDir != null);
+ LOG.info("Generating ALP fixtures to {}", outDir);
+
+ int expectedFiles =
+ SOURCE_FILES.length * GENERATOR_PAGE_VERSIONS.length * GENERATOR_VECTOR_SIZES.length;
+ int generated = 0;
+ for (String sourceFile : SOURCE_FILES) {
+ java.nio.file.Path source = sourceDir.resolve(sourceFile);
+ assumeTrue("Source missing: " + source, source.toFile().exists());
+
+ MessageType[] schemaHolder = new MessageType[1];
+ List Conventions:
+ *
+ *
+ */
+ private void writeCornerCaseCsvTruth(java.nio.file.Path csvPath, List