From 8feb495295a88668228a700d2c024e382f690d85 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Fri, 26 Jun 2026 10:06:52 +0000 Subject: [PATCH 01/10] [SYSTEMDS-3949] Add native Delta Lake frame read/write via Delta Kernel Extend the native Delta Lake support from matrices to frames, reading and writing Delta Lake tables through the Spark-free Delta Kernel library on the single-node CP path. DML read/write with format="delta" now works for frames, discovering schema, column names, and dimensions directly from the table. - Add FrameReaderDelta, FrameReaderDeltaParallel and FrameWriterDelta - Wire DELTA into the frame reader and writer factories - Refresh cached frame metadata and schema after a Delta read - Broaden Delta frame component IO coverage Stacked on the matrix Delta support; append/overwrite semantics, distributed execution, and time travel remain out of scope. --- .../controlprogram/caching/FrameObject.java | 11 +- .../sysds/runtime/io/FrameReaderDelta.java | 309 +++++++++ .../runtime/io/FrameReaderDeltaParallel.java | 360 ++++++++++ .../sysds/runtime/io/FrameReaderFactory.java | 2 + .../sysds/runtime/io/FrameWriterDelta.java | 251 +++++++ .../sysds/runtime/io/FrameWriterFactory.java | 2 + .../component/io/DeltaFrameReadWriteTest.java | 624 ++++++++++++++++++ .../io/delta/FrameDeltaReadWriteTest.java | 123 ++++ .../io/delta/FrameDeltaReadCompare.dml | 35 + .../functions/io/delta/FrameDeltaWrite.dml | 32 + 10 files changed, 1748 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java create mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java create mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java create mode 100644 src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java create mode 100644 src/test/java/org/apache/sysds/test/functions/io/delta/FrameDeltaReadWriteTest.java create mode 100644 
src/test/scripts/functions/io/delta/FrameDeltaReadCompare.dml create mode 100644 src/test/scripts/functions/io/delta/FrameDeltaWrite.dml diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java index 7151d87211c..219f954cc52 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java @@ -23,6 +23,7 @@ import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.mutable.MutableBoolean; import org.apache.commons.lang3.tuple.Pair; +import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types.DataType; import org.apache.sysds.common.Types.FileFormat; import org.apache.sysds.common.Types.ValueType; @@ -203,9 +204,14 @@ protected FrameBlock readBlobFromHDFS(String fname, long[] dims) throws IOExcept .createFrameReader(iimd.getFileFormat(), getFileFormatProperties()) .readFrameFromHDFS(fname, lschema, dc.getRows(), dc.getCols()); - if(iimd.getFileFormat() == FileFormat.CSV) + //Delta and CSV discover dimensions (and Delta also schema) at read time, so + //refresh the cached metadata to reflect the materialized frame block. + if(iimd.getFileFormat() == FileFormat.CSV || iimd.getFileFormat() == FileFormat.DELTA) { _metaData = _metaData instanceof MetaDataFormat ? 
new MetaDataFormat(data.getDataCharacteristics(), iimd.getFileFormat()) : new MetaData(data.getDataCharacteristics()); + if(iimd.getFileFormat() == FileFormat.DELTA) + _schema = data.getSchema(); + } // sanity check correct output if(data == null) @@ -293,6 +299,9 @@ protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatPro FrameWriter writer = FrameWriterFactory.createFrameWriter(fmt, fprop); writer.writeFrameToHDFS(_data, fname, getNumRows(), getNumColumns()); + + if(DMLScript.STATISTICS) + CacheStatistics.incrementHDFSWrites(); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java new file mode 100644 index 00000000000..9d5df380552 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.io; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; + +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.frame.data.columns.Array; +import org.apache.sysds.runtime.frame.data.columns.ArrayFactory; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.engine.Engine; +import io.delta.kernel.types.BooleanType; +import io.delta.kernel.types.ByteType; +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.DoubleType; +import io.delta.kernel.types.FloatType; +import io.delta.kernel.types.IntegerType; +import io.delta.kernel.types.LongType; +import io.delta.kernel.types.ShortType; +import io.delta.kernel.types.StringType; +/** + * Single-threaded native Delta Lake reader for frames, built on the Spark-free + * Delta Kernel library. It opens the latest snapshot of a Delta table, reads + * its parquet data files through the kernel's default engine (honoring deletion + * vectors), and materializes the columns into a {@link FrameBlock} whose schema + * and column names are derived from the Delta table schema. + * + *

<p>Data is extracted column-at-a-time into primitive arrays (no per-cell + * boxing or {@code FrameBlock.set} dispatch) and the frame is constructed + * directly from typed column {@link Array}s. Supported Delta column types are + * double, float, long, int, short, byte, boolean, and string; short and byte + * columns are coerced to INT32. Neither the schema nor the dimensions need to be + * supplied; they are discovered from the table.</p>

+ */ +public class FrameReaderDelta extends FrameReader { + + //per-column read codes (how to pull a value out of the Delta column vector); + //package visible so the parallel reader can reuse the same dispatch. + static final int R_DOUBLE = 0, R_FLOAT = 1, R_LONG = 2, R_INT = 3, + R_SHORT = 4, R_BYTE = 5, R_BOOLEAN = 6, R_STRING = 7; + + @Override + public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) + throws IOException, DMLRuntimeException + { + Engine engine = DeltaKernelUtils.createEngine(); + String tablePath = DeltaKernelUtils.qualify(fname); + + //per-batch, per-column extracted arrays (boxing free) + ArrayList batchCols = new ArrayList<>(); + ArrayList batchSizes = new ArrayList<>(); + int[] nrowH = new int[1]; + ValueType[][] vtH = new ValueType[1][]; + String[][] nameH = new String[1][]; + int[][] readCodeH = new int[1][]; + + DeltaKernelUtils.scan(engine, tablePath, sch -> { + int ncol = sch.length(); + int[] readCode = new int[ncol]; + ValueType[] vt = new ValueType[ncol]; + String[] cnames = new String[ncol]; + for( int c=0; c { + int n = DeltaKernelUtils.countSelected(size, selected); + Object[] extracted = new Object[ncol]; + for( int c=0; c[] columns = new Array[ncol]; + for( int c=0; c buildColumn(ValueType vt, int nrow, ArrayList batchCols, + ArrayList batchSizes, int c) + { + switch( vt ) { + case FP64: { + double[] all = new double[nrow]; + int off = 0; + for( int b=0; bIt mirrors {@link ReaderDeltaParallel} (the matrix variant) but produces + * typed column {@link Array}s instead of a dense {@code double[]}. As with the + * matrix reader, the expensive part of a Delta read is the per-file parquet + * decode, so parallelizing across data files is the natural speedup. A table + * backed by a single data file cannot be split this way, so the reader + * transparently falls back to the sequential {@link FrameReaderDelta}.

+ */ +public class FrameReaderDeltaParallel extends FrameReaderDelta { + + private final int _numThreads; + + public FrameReaderDeltaParallel() { + _numThreads = OptimizerUtils.getParallelBinaryReadParallelism(); + } + + @Override + public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) + throws IOException, DMLRuntimeException + { + Engine engine = DeltaKernelUtils.createEngine(); + String tablePath = DeltaKernelUtils.qualify(fname); + DeltaKernelUtils.ScanHandle handle = DeltaKernelUtils.openScan(engine, tablePath); + + final int nfiles = handle.scanFiles.size(); + //nothing to gain from parallelism for single-file (or empty) tables + if( _numThreads <= 1 || nfiles <= 1 ) + return super.readFrameFromHDFS(fname, schema, names, rlen, clen); + + //derive per-column read codes, value types and names once from the schema + final int ncol = handle.schema.length(); + final int[] readCodes = new int[ncol]; + final ValueType[] vt = new ValueType[ncol]; + final String[] cnames = new String[ncol]; + for( int c=0; c pre-size + //one typed array per column and let each thread decode directly into its + //row offset (no intermediate buffers, no serial concatenation). + if( useDirectPath(handle) ) { + long total = 0; + for( long r : handle.numRecords ) + total += r; + if( total > 0 && total <= Integer.MAX_VALUE ) + return readDirect(fname, handle, ncol, readCodes, vt, cnames, (int) total); + } + + return readBuffered(fname, handle, ncol, readCodes, vt, cnames); + } + + /** + * Whether the metadata-driven direct-write fast path can be used for this + * table (exact per-file row counts and no deletion vectors). Visible for + * testing: the buffered fallback is otherwise only reachable for tables + * lacking row statistics or carrying deletion vectors, which the SystemDS + * Delta writer never produces. 
+ * + * @param handle the opened scan handle + * @return true if the direct path is applicable + */ + protected boolean useDirectPath(DeltaKernelUtils.ScanHandle handle) { + return handle.hasExactRowCounts(); + } + + /** + * Fast path: each thread decodes one data file straight into the final typed + * column arrays at a metadata-derived row offset. Single allocation per + * column, fully parallel. + */ + private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, + int ncol, int[] readCodes, ValueType[] vt, String[] cnames, int nrow) throws IOException + { + final int nfiles = handle.scanFiles.size(); + final int[] rowOffset = new int[nfiles]; + int acc = 0; + for( int i=0; i> tasks = new ArrayList<>(nfiles); + for( int i=0; i { + int[] cur = new int[] {base}; + Engine eng = DeltaKernelUtils.createEngine(); + DeltaKernelUtils.readScanFile(eng, handle.scanState, handle.physicalReadSchema, scanFileRow, + (cols, size, selected) -> { + for( int c=0; c f : pool.invokeAll(tasks) ) + f.get(); + } + catch(Exception ex) { + throw new IOException("Failed parallel read of Delta table: " + fname, ex); + } + finally { + pool.shutdown(); + } + + Array[] columns = new Array[ncol]; + for( int c=0; c[] fileCols = new ArrayList[nfiles]; + @SuppressWarnings("unchecked") + final ArrayList[] fileSizes = new ArrayList[nfiles]; + final ExecutorService pool = CommonThreadPool.get(_numThreads); + try { + ArrayList> tasks = new ArrayList<>(nfiles); + for( int i=0; i { + ArrayList bcs = new ArrayList<>(); + ArrayList bss = new ArrayList<>(); + Engine eng = DeltaKernelUtils.createEngine(); + DeltaKernelUtils.readScanFile(eng, handle.scanState, handle.physicalReadSchema, scanFileRow, + (cols, size, selected) -> { + int n = DeltaKernelUtils.countSelected(size, selected); + Object[] extracted = new Object[ncol]; + for( int c=0; c f : pool.invokeAll(tasks) ) + f.get(); + } + catch(Exception ex) { + throw new IOException("Failed parallel read of Delta table: " + fname, ex); + 
} + finally { + pool.shutdown(); + } + + //flatten the per-file batches in file order and concatenate per column + ArrayList batchCols = new ArrayList<>(); + ArrayList batchSizes = new ArrayList<>(); + int nrow = 0; + for( int i=0; i[] columns = new Array[ncol]; + for( int c=0; c createColumn(ValueType vt, Object full) { + switch( vt ) { + case FP64: return ArrayFactory.create((double[]) full); + case FP32: return ArrayFactory.create((float[]) full); + case INT64: return ArrayFactory.create((long[]) full); + case INT32: return ArrayFactory.create((int[]) full); + case BOOLEAN: return ArrayFactory.create((boolean[]) full); + default: return ArrayFactory.create((String[]) full); // STRING + } + } + + /** + * Decode the live (selected, after deletion vector) rows of one column batch + * directly into a pre-sized typed array starting at absolute row {@code destOff}. + * Null numeric cells keep the array default (0); string nulls are stored as null. + */ + private static void extractColumnInto(ColumnVector col, int size, boolean[] selected, + int readCode, Object dest, int destOff) + { + switch( readCode ) { + case R_DOUBLE: { + double[] a = (double[]) dest; + int lr = destOff; + for( int r=0; r[] cols = new Array[ncol]; + boolean[] nullable = new boolean[ncol]; + for( int c=0; c { + private final Array[] _cols; + private final boolean[] _nullable; + private final StructType _schema; + private final int _nrow; + private final int _ncol; + private int _pos = 0; + + FrameBatchIterator(Array[] cols, boolean[] nullable, StructType schema, int nrow, int ncol) { + _cols = cols; + _nullable = nullable; + _schema = schema; + _nrow = nrow; + _ncol = ncol; + } + + @Override + public boolean hasNext() { + return _pos < _nrow; + } + + @Override + public FilteredColumnarBatch next() { + if( !hasNext() ) + throw new NoSuchElementException(); + int size = Math.min(BATCH_ROWS, _nrow - _pos); + ColumnarBatch batch = new FrameColumnarBatch(_cols, _nullable, _schema, _pos, size, _ncol); + 
_pos += size; + return new FilteredColumnarBatch(batch, Optional.empty()); + } + + @Override + public void close() { + //nothing to release + } + } + + /** Read-only view of a row range of the frame columns as a Delta Kernel columnar batch. */ + private static class FrameColumnarBatch implements ColumnarBatch { + private final Array[] _cols; + private final boolean[] _nullable; + private final StructType _schema; + private final int _rowStart; + private final int _size; + private final int _ncol; + + FrameColumnarBatch(Array[] cols, boolean[] nullable, StructType schema, int rowStart, int size, int ncol) { + _cols = cols; + _nullable = nullable; + _schema = schema; + _rowStart = rowStart; + _size = size; + _ncol = ncol; + } + + @Override + public StructType getSchema() { + return _schema; + } + + @Override + public ColumnVector getColumnVector(int ordinal) { + if( ordinal < 0 || ordinal >= _ncol ) + throw new IndexOutOfBoundsException("column ordinal " + ordinal); + return new FrameColumnVector(_cols[ordinal], _nullable[ordinal], + _schema.at(ordinal).getDataType(), _rowStart, _size); + } + + @Override + public int getSize() { + return _size; + } + } + + /** + * Read-only typed column view over one column {@link Array} row range. Numeric + * values are read through {@link Array#getAsDouble(int)} to avoid boxing, and + * non-nullable columns short-circuit {@code isNullAt} so the kernel never pays + * for a redundant boxed fetch. 
+ */ + private static class FrameColumnVector implements ColumnVector { + private final Array _col; + private final boolean _nullable; + private final DataType _type; + private final int _rowStart; + private final int _size; + + FrameColumnVector(Array col, boolean nullable, DataType type, int rowStart, int size) { + _col = col; + _nullable = nullable; + _type = type; + _rowStart = rowStart; + _size = size; + } + + @Override + public DataType getDataType() { + return _type; + } + + @Override + public int getSize() { + return _size; + } + + @Override + public boolean isNullAt(int rowId) { + return _nullable && _col.get(_rowStart + rowId) == null; + } + + @Override + public String getString(int rowId) { + Object v = _col.get(_rowStart + rowId); + return (v == null) ? null : v.toString(); + } + + @Override + public boolean getBoolean(int rowId) { + return _col.getAsDouble(_rowStart + rowId) != 0; + } + + @Override + public double getDouble(int rowId) { + return _col.getAsDouble(_rowStart + rowId); + } + + @Override + public float getFloat(int rowId) { + return (float) _col.getAsDouble(_rowStart + rowId); + } + + @Override + public long getLong(int rowId) { + //exact for INT64 (getAsDouble would lose precision beyond 2^53) + return ((Number) _col.get(_rowStart + rowId)).longValue(); + } + + @Override + public int getInt(int rowId) { + return (int) _col.getAsDouble(_rowStart + rowId); + } + + @Override + public void close() { + //nothing to release + } + } +} diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameWriterFactory.java b/src/main/java/org/apache/sysds/runtime/io/FrameWriterFactory.java index 3fb3968c96f..ff38eb395dd 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameWriterFactory.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameWriterFactory.java @@ -50,6 +50,8 @@ public static FrameWriter createFrameWriter(FileFormat fmt, FileFormatProperties return binaryParallel ? 
new FrameWriterBinaryBlockParallel() : new FrameWriterBinaryBlock(); case PROTO: return new FrameWriterProto(); + case DELTA: + return new FrameWriterDelta(); default: throw new DMLRuntimeException("Failed to create frame writer for unknown format: " + fmt.toString()); } diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java new file mode 100644 index 00000000000..1c1d33a80d6 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java @@ -0,0 +1,624 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.component.io; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.NoSuchElementException; +import java.util.Objects; +import java.util.Optional; +import java.util.Random; + +import org.apache.commons.io.FileUtils; +import org.apache.sysds.common.Types.FileFormat; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.conf.CompilerConfig; +import org.apache.sysds.conf.CompilerConfig.ConfigType; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.conf.DMLConfig; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.io.DeltaKernelUtils; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.io.FrameReaderDelta; +import org.apache.sysds.runtime.io.FrameReaderDeltaParallel; +import org.apache.sysds.runtime.io.FrameReaderFactory; +import org.apache.sysds.runtime.io.FrameWriterDelta; +import org.junit.Test; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.data.FilteredColumnarBatch; +import io.delta.kernel.engine.Engine; +import io.delta.kernel.types.ByteType; +import io.delta.kernel.types.DataType; +import io.delta.kernel.types.DateType; +import io.delta.kernel.types.DoubleType; +import io.delta.kernel.types.LongType; +import io.delta.kernel.types.ShortType; +import io.delta.kernel.types.StringType; +import io.delta.kernel.types.StructType; +import io.delta.kernel.utils.CloseableIterator; + +/** + * Direct (no DML) round-trip tests for the native Delta Kernel based frame + * reader/writer. 
Each test writes a FrameBlock to a fresh local Delta table + * directory and reads it back, asserting the discovered schema, column names, + * dimensions, and per-cell values match. Several tests additionally assert that + * the parallel reader ({@link FrameReaderDeltaParallel}) agrees with the serial + * reader cell-for-cell across a multi-file table (both its direct and buffered + * paths). + */ +public class DeltaFrameReadWriteTest { + + //nonsense schema/dims handed to the reader to confirm it discovers everything + private static final ValueType[] NO_SCHEMA = new ValueType[] {ValueType.STRING}; + private static final String[] NO_NAMES = new String[] {"x"}; + + //small target file size + enough random rows so the writer rolls multiple + //data files, exercising the per-file parallel read path rather than the + //single-file serial fallback. + private static final long SMALL_TARGET_FILE_SIZE = 512L * 1024; + private static final int ROWS_MULTI_FILE = 150_000; + + private static FrameBlock writeThenRead(FrameBlock in) throws Exception { + Path dir = Files.createTempDirectory("sysds_delta_frame_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + //pass nonsense schema/dims: the reader must discover everything from the table + return new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); + } + finally { + FileUtils.deleteQuietly(dir.toFile()); + } + } + + private static FrameBlock alloc(ValueType[] schema, String[] names, int nrow) { + FrameBlock fb = new FrameBlock(schema, names); + fb.ensureAllocatedColumns(nrow); + return fb; + } + + @Test + public void roundTripMixedTypes() throws Exception { + ValueType[] schema = {ValueType.STRING, ValueType.INT64, ValueType.FP64, + ValueType.BOOLEAN, ValueType.INT32, ValueType.FP32}; + String[] names = {"name", "id", "score", "active", "count", "ratio"}; + int nrow = 5; + 
FrameBlock in = alloc(schema, names, nrow); + for( int r=0; r readBuffered() + FrameBlock buffered = new FrameReaderDeltaParallel() { + @Override protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { return false; } + }.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); + + assertFramesEqual(serial, buffered); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + + @Test + public void factoryRoutesDeltaToParallelWhenEnabled() { + //the factory must pick the parallel frame reader iff parallel CP read is enabled + CompilerConfig cc = ConfigurationManager.getCompilerConfig(); + try { + cc.set(ConfigType.PARALLEL_CP_READ_TEXTFORMATS, true); + ConfigurationManager.setLocalConfig(cc); + FrameReader par = FrameReaderFactory.createFrameReader(FileFormat.DELTA); + assertTrue("expected FrameReaderDeltaParallel when parallel read enabled", + par instanceof FrameReaderDeltaParallel); + + cc.set(ConfigType.PARALLEL_CP_READ_TEXTFORMATS, false); + ConfigurationManager.setLocalConfig(cc); + FrameReader ser = FrameReaderFactory.createFrameReader(FileFormat.DELTA); + assertTrue("expected serial FrameReaderDelta when parallel read disabled", + ser instanceof FrameReaderDelta && !(ser instanceof FrameReaderDeltaParallel)); + } + finally { + ConfigurationManager.clearLocalConfigs(); + } + } + + @Test + public void readerBatchSizeConfigRoundTrips() throws Exception { + //a non-default reader batch size must not change the result (more, smaller + //batches exercise the per-batch extract/concatenate loop more often). 
+ DMLConfig conf = new DMLConfig(); + conf.setTextValue(DMLConfig.DELTA_READER_BATCH_SIZE, "128"); + ConfigurationManager.setLocalConfig(conf); + Path dir = Files.createTempDirectory("sysds_delta_frame_bs_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + assertEquals("config getter reflects the override", + 128, ConfigurationManager.getDeltaReaderBatchSize()); + + FrameBlock in = genMixedFrame(5000, 31); + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + FrameBlock out = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); + assertFramesEqual(in, out); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + + @Test + public void writerTargetFileSizeConfigProducesMoreFiles() throws Exception { + //a smaller configured target file size must make the writer roll more + //data files for the same frame (the lever the parallel reader relies on). 
+ DMLConfig conf = new DMLConfig(); + conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); + ConfigurationManager.setLocalConfig(conf); + Path dir = Files.createTempDirectory("sysds_delta_frame_cfg_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + assertEquals("config getter reflects the override", + SMALL_TARGET_FILE_SIZE, ConfigurationManager.getDeltaWriterTargetFileSize()); + + FrameBlock in = genMixedFrame(ROWS_MULTI_FILE, 41); + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + assertMultiFile(tablePath); + + //data still round-trips correctly with the custom layout + FrameBlock out = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); + assertFramesEqual(in, out); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + + @Test + public void emptyFrameRoundTrip() throws Exception { + //a schema-only Delta table (no data files, 0 rows); the reader must + //rebuild empty typed columns and discover the schema/names from the table. 
+ ValueType[] schema = {ValueType.STRING, ValueType.FP64, ValueType.INT64}; + String[] names = {"s", "d", "k"}; + DataType[] dtypes = {StringType.STRING, DoubleType.DOUBLE, LongType.LONG}; + + Path dir = Files.createTempDirectory("sysds_delta_frame_empty_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + writeEmptyTable(tablePath, names, dtypes); + FrameBlock out = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); + assertEquals("rows", 0, out.getNumRows()); + assertEquals("cols", schema.length, out.getNumColumns()); + for( int c=0; c writer must reject + new FrameWriterDelta().writeFrameToHDFS(fb, tablePath, + fb.getNumRows() + 1, fb.getNumColumns()); + fail("expected an IOException for a frame/metadata dimension mismatch"); + } + catch(IOException ex) { + assertTrue("message should mention the dimension mismatch, got: " + ex.getMessage(), + ex.getMessage() != null && ex.getMessage().contains("dimensions mismatch")); + } + finally { + FileUtils.deleteQuietly(dir.toFile()); + } + } + + @Test + public void readFromInputStreamUnsupported() throws Exception { + //Delta is a directory-based table format; stream reads are not supported + try { + new FrameReaderDelta().readFrameFromInputStream(null, NO_SCHEMA, NO_NAMES, -1, -1); + fail("expected UnsupportedOperationException for a Delta input-stream read"); + } + catch(UnsupportedOperationException ex) { + //expected: must throw before touching the (null) stream + } + } + + @Test + public void parallelReadStringNullsMatchSerialMultiFile() throws Exception { + //string nulls across a multi-file table: the parallel direct path must + //reproduce the serial read cell-for-cell (assertFramesEqual uses + //Objects.equals, so nulls are compared faithfully). 
+ DMLConfig conf = new DMLConfig(); + conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); + ConfigurationManager.setLocalConfig(conf); + Path dir = Files.createTempDirectory("sysds_delta_frame_parnull_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + ValueType[] schema = {ValueType.STRING, ValueType.INT64}; + String[] names = {"s", "k"}; + int nrow = ROWS_MULTI_FILE; + FrameBlock in = alloc(schema, names, nrow); + for( int r=0; r s = Files.walk(new File(tablePath).toPath()) ) { + files = s.filter(p -> p.toString().endsWith(".parquet")).count(); + } + assertTrue("expected a multi-file Delta table to exercise the parallel path, got " + files, + files > 1); + } + + private static void assertFramesEqual(FrameBlock expected, FrameBlock actual) { + assertEquals("rows", expected.getNumRows(), actual.getNumRows()); + assertEquals("cols", expected.getNumColumns(), actual.getNumColumns()); + int ncol = expected.getNumColumns(); + for( int c=0; c empty() { + return new CloseableIterator() { + @Override public boolean hasNext() { return false; } + @Override public FilteredColumnarBatch next() { throw new NoSuchElementException(); } + @Override public void close() {} + }; + } + + /** Writes a single date column (kernel stores dates as INT32 days) used to + * assert the frame reader rejects a non-mappable column type. 
*/ + private static void writeDateColumn(String tablePath, int[] days) throws Exception { + Engine engine = DeltaKernelUtils.createEngine(); + final StructType schema = new StructType().add("d", DateType.DATE, false); + ColumnarBatch batch = new ColumnarBatch() { + @Override public StructType getSchema() { return schema; } + @Override public int getSize() { return days.length; } + @Override public ColumnVector getColumnVector(int ordinal) { return new DateVector(days); } + }; + FilteredColumnarBatch fcb = new FilteredColumnarBatch(batch, Optional.empty()); + DeltaKernelUtils.commit(engine, DeltaKernelUtils.qualify(tablePath), schema, singleton(fcb)); + } + + /** Writes a short column and a byte column (kernel stores these as 16/8-bit + * integers) used to assert the frame reader coerces both to INT32. */ + private static void writeShortByteColumns(String tablePath, short[] shorts, byte[] bytes) throws Exception { + Engine engine = DeltaKernelUtils.createEngine(); + final StructType schema = new StructType() + .add("sh", ShortType.SHORT, false) + .add("by", ByteType.BYTE, false); + ColumnarBatch batch = new ColumnarBatch() { + @Override public StructType getSchema() { return schema; } + @Override public int getSize() { return shorts.length; } + @Override public ColumnVector getColumnVector(int ordinal) { + return (ordinal == 0) ? 
new ShortVector(shorts) : new ByteVector(bytes); + } + }; + FilteredColumnarBatch fcb = new FilteredColumnarBatch(batch, Optional.empty()); + DeltaKernelUtils.commit(engine, DeltaKernelUtils.qualify(tablePath), schema, singleton(fcb)); + } + + private static CloseableIterator singleton(FilteredColumnarBatch fcb) { + return new CloseableIterator() { + private boolean _done = false; + @Override public boolean hasNext() { return !_done; } + @Override public FilteredColumnarBatch next() { + if( _done ) throw new NoSuchElementException(); + _done = true; + return fcb; + } + @Override public void close() {} + }; + } + + /** Column view exposing an int[] as a Delta date column. */ + private static class DateVector implements ColumnVector { + private final int[] _days; + DateVector(int[] days) { _days = days; } + @Override public DataType getDataType() { return DateType.DATE; } + @Override public int getSize() { return _days.length; } + @Override public boolean isNullAt(int rowId) { return false; } + @Override public int getInt(int rowId) { return _days[rowId]; } + @Override public void close() {} + } + + /** Column view exposing a short[] as a Delta short column. */ + private static class ShortVector implements ColumnVector { + private final short[] _vals; + ShortVector(short[] vals) { _vals = vals; } + @Override public DataType getDataType() { return ShortType.SHORT; } + @Override public int getSize() { return _vals.length; } + @Override public boolean isNullAt(int rowId) { return false; } + @Override public short getShort(int rowId) { return _vals[rowId]; } + @Override public void close() {} + } + + /** Column view exposing a byte[] as a Delta byte column. 
*/ + private static class ByteVector implements ColumnVector { + private final byte[] _vals; + ByteVector(byte[] vals) { _vals = vals; } + @Override public DataType getDataType() { return ByteType.BYTE; } + @Override public int getSize() { return _vals.length; } + @Override public boolean isNullAt(int rowId) { return false; } + @Override public byte getByte(int rowId) { return _vals[rowId]; } + @Override public void close() {} + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/io/delta/FrameDeltaReadWriteTest.java b/src/test/java/org/apache/sysds/test/functions/io/delta/FrameDeltaReadWriteTest.java new file mode 100644 index 00000000000..0a6bba5a163 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/io/delta/FrameDeltaReadWriteTest.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.io.delta; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.util.HashMap; + +import org.apache.sysds.runtime.controlprogram.caching.CacheStatistics; +import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +/** + * End-to-end DML test of the native Delta frame read/write path. + * + *

<p>As in the matrix variant, the write and the read run as two separate + * SystemDS executions so the read is a genuine disk read rather than an + * in-memory cache hit. We additionally assert via {@link CacheStatistics} that + * the write run wrote (delta + text reference) and the read run read (delta + + * text reference) from HDFS, so a short-circuited path would fail the test.</p>

+ */ +public class FrameDeltaReadWriteTest extends AutomatedTestBase { + + private final static String TEST_DIR = "functions/io/delta/"; + private final static String TEST_CLASS_DIR = TEST_DIR + FrameDeltaReadWriteTest.class.getSimpleName() + "/"; + private final static String WRITE_NAME = "FrameDeltaWrite"; + private final static String READ_NAME = "FrameDeltaReadCompare"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(WRITE_NAME, + new TestConfiguration(TEST_CLASS_DIR, WRITE_NAME, new String[] { "ref" })); + addTestConfiguration(READ_NAME, + new TestConfiguration(TEST_CLASS_DIR, READ_NAME, new String[] { "R" })); + } + + @Test + public void testDenseRoundTrip() { + runFrameDeltaRoundTrip(200, 12, 1.0); + } + + @Test + public void testSparseRoundTrip() { + runFrameDeltaRoundTrip(640, 8, 0.2); + } + + @Test + public void testMultiBatchRoundTrip() { + runFrameDeltaRoundTrip(9000, 4, 1.0); + } + + private void runFrameDeltaRoundTrip(int rows, int cols, double sparsity) { + try { + String HOME = SCRIPT_DIR + TEST_DIR; + + // ---- phase 1: write the frame as a Delta table + text reference ---- + getAndLoadTestConfiguration(WRITE_NAME); + String deltaPath = output("deltaTable"); + String refPath = output("ref"); + fullDMLScriptName = HOME + WRITE_NAME + ".dml"; + programArgs = new String[] { "-stats", "-args", + String.valueOf(rows), String.valueOf(cols), String.valueOf(sparsity), + deltaPath, refPath }; + runTest(true, false, null, -1); + + //the write run must materialize two objects to disk: the frame Delta + //table under test + the matrix text reference. FrameWriterDelta genuinely + //hitting HDFS is what produces the frame-side write statistic. 
+ long hdfsWrites = CacheStatistics.getHDFSWrites(); + assertTrue("expected >= 2 HDFS writes in the write run (delta frame + reference), got " + + hdfsWrites, hdfsWrites >= 2); + //and a real Delta table (transaction log) must have been created + assertTrue("missing Delta transaction log under " + deltaPath, + new File(deltaPath, "_delta_log").isDirectory()); + + // ---- phase 2: fresh execution reads the Delta frame and compares ---- + getAndLoadTestConfiguration(READ_NAME); + fullDMLScriptName = HOME + READ_NAME + ".dml"; + programArgs = new String[] { "-stats", "-args", + deltaPath, refPath, output("R") }; + runTest(true, false, null, -1); + + long hdfsReads = CacheStatistics.getHDFSHits(); + assertTrue("expected >= 2 HDFS reads in the read run (delta + reference), got " + + hdfsReads, hdfsReads >= 2); + + HashMap<CellIndex, Double> R = readDMLMatrixFromOutputDir("R"); + double diff = R.getOrDefault(new CellIndex(1, 1), 0.0); + double nrow = R.getOrDefault(new CellIndex(1, 2), 0.0); + double ncol = R.getOrDefault(new CellIndex(1, 3), 0.0); + + assertEquals("reconstruction error", 0.0, diff, 1e-12); + assertEquals("discovered rows", rows, (int) nrow); + assertEquals("discovered cols", cols, (int) ncol); + } + catch(Exception ex) { + throw new RuntimeException(ex); + } + } +} diff --git a/src/test/scripts/functions/io/delta/FrameDeltaReadCompare.dml b/src/test/scripts/functions/io/delta/FrameDeltaReadCompare.dml new file mode 100644 index 00000000000..cdf1f0794fc --- /dev/null +++ b/src/test/scripts/functions/io/delta/FrameDeltaReadCompare.dml @@ -0,0 +1,35 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Reader side of the native Delta frame round-trip test. Reads the Delta table +# as a frame (schema + dimensions discovered from the transaction log) and the +# text matrix reference, both genuine HDFS reads in a fresh process, then +# reports the elementwise reconstruction error and the discovered dimensions. + +Y = read($1, data_type="frame", format="delta") +Xref = read($2, format="text") + +M = as.matrix(Y) +R = matrix(0, rows=1, cols=3) +R[1,1] = sum(abs(Xref - M)) # 0 if FrameReaderDelta reconstructed the frame exactly +R[1,2] = nrow(Y) # discovered row count +R[1,3] = ncol(Y) # discovered column count +write(R, $3) diff --git a/src/test/scripts/functions/io/delta/FrameDeltaWrite.dml b/src/test/scripts/functions/io/delta/FrameDeltaWrite.dml new file mode 100644 index 00000000000..5e152dde013 --- /dev/null +++ b/src/test/scripts/functions/io/delta/FrameDeltaWrite.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Writer side of the native Delta frame round-trip test. Generates a matrix, +# converts it to a frame, and materializes it as a Delta table (under test). +# The same matrix is also written as a plain text reference. Running the +# read/compare in a SEPARATE process prevents SystemDS from short-circuiting +# the subsequent read against the in-memory frame, so FrameReaderDelta is +# actually exercised. + +X = rand(rows=$1, cols=$2, min=-5, max=5, seed=7, sparsity=$3) +F = as.frame(X) +write(F, $4, format="delta") +write(X, $5, format="text") From a376168dab7e042b5fff94bf33a067c8a5bbbded Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Sun, 28 Jun 2026 21:43:24 +0000 Subject: [PATCH 02/10] Size native Delta data files adaptively for parallel reads The native Delta read decode is CPU-bound and parallelizes per data file, so a table written as one large file cannot use more than one reader thread. Size data files toward roughly one file per expected parallel reader, capped by the configured target and floored to avoid tiny-file proliferation. This materially improves parallel-read throughput for both matrix and frame tables. 
- Add the sysds.io.delta.writer.adaptivefilesize config (default true) plus adaptiveWriterTargetFileSize/createWriteEngine helpers in DeltaKernelUtils, and document the target file size as an upper bound - Wire FrameWriterDelta and WriterDelta to size files from the block's estimated bytes (dense double footprint for matrices) - Use the configurable DELTA_WRITER_BATCH_SIZE in FrameWriterDelta instead of a hardcoded batch size, matching the matrix writer --- .../sysds/conf/ConfigurationManager.java | 7 ++- .../java/org/apache/sysds/conf/DMLConfig.java | 6 ++- .../sysds/runtime/io/DeltaKernelUtils.java | 51 +++++++++++++++++-- .../sysds/runtime/io/FrameWriterDelta.java | 16 +++--- .../apache/sysds/runtime/io/WriterDelta.java | 6 ++- 5 files changed, 71 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java index 83676da47a7..8b0f5fe06b9 100644 --- a/src/main/java/org/apache/sysds/conf/ConfigurationManager.java +++ b/src/main/java/org/apache/sysds/conf/ConfigurationManager.java @@ -268,11 +268,16 @@ public static int getDeltaWriterBatchSize() { return getDMLConfig().getIntValue(DMLConfig.DELTA_WRITER_BATCH_SIZE); } - /** @return target data-file size (bytes) for the native Delta writer */ + /** @return upper bound (bytes) on the native Delta writer's target data-file size */ public static long getDeltaWriterTargetFileSize() { return Long.parseLong(getDMLConfig().getTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE)); } + /** @return whether the native Delta writer adaptively sizes data files for parallel reads */ + public static boolean isDeltaWriterAdaptiveFileSize() { + return getDMLConfig().getBooleanValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE); + } + public static boolean isFederatedSSL(){ return getDMLConfig().getBooleanValue(DMLConfig.USE_SSL_FEDERATED_COMMUNICATION); } diff --git a/src/main/java/org/apache/sysds/conf/DMLConfig.java 
b/src/main/java/org/apache/sysds/conf/DMLConfig.java index e06b58b07c8..d114ccf69b9 100644 --- a/src/main/java/org/apache/sysds/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysds/conf/DMLConfig.java @@ -73,7 +73,8 @@ public class DMLConfig public static final String IO_COMPRESSION_CODEC = "sysds.io.compression.encoding"; public static final String DELTA_READER_BATCH_SIZE = "sysds.io.delta.reader.batchsize"; // int: rows per parquet read batch public static final String DELTA_WRITER_BATCH_SIZE = "sysds.io.delta.writer.batchsize"; // int: matrix rows materialized per columnar batch handed to the engine - public static final String DELTA_WRITER_TARGET_FILE_SIZE = "sysds.io.delta.writer.targetfilesize"; // long: target data-file size in bytes (smaller -> more files -> more parallel-read throughput) + public static final String DELTA_WRITER_TARGET_FILE_SIZE = "sysds.io.delta.writer.targetfilesize"; // long: upper bound on target data-file size in bytes; adaptive sizing may pick smaller -> more files -> more parallel-read throughput + public static final String DELTA_WRITER_ADAPTIVE_FILE_SIZE = "sysds.io.delta.writer.adaptivefilesize"; // boolean: size data files toward one per parallel reader (capped by targetfilesize) public static final String PARALLEL_ENCODE = "sysds.parallel.encode"; // boolean: enable multi-threaded transformencode and apply public static final String PARALLEL_ENCODE_STAGED = "sysds.parallel.encode.staged"; public static final String PARALLEL_ENCODE_APPLY_BLOCKS = "sysds.parallel.encode.applyBlocks"; @@ -163,7 +164,8 @@ public class DMLConfig _defaultVals.put(IO_COMPRESSION_CODEC, "none"); _defaultVals.put(DELTA_READER_BATCH_SIZE, "4096"); // rows per parquet read batch (Delta Kernel default 1024) _defaultVals.put(DELTA_WRITER_BATCH_SIZE, "4096"); // matrix rows materialized per columnar batch handed to the engine - _defaultVals.put(DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(64L * 1024 * 1024)); // 64MB target data-file size (Delta Kernel 
default 128MB) -> more files -> more parallel-read throughput + _defaultVals.put(DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(64L * 1024 * 1024)); // 64MB cap on target data-file size; adaptive sizing may pick smaller -> more files -> more parallel-read throughput + _defaultVals.put(DELTA_WRITER_ADAPTIVE_FILE_SIZE, "true"); // size data files toward one per parallel reader _defaultVals.put(PARALLEL_TOKENIZE, "false"); _defaultVals.put(PARALLEL_TOKENIZE_NUM_BLOCKS, "64"); _defaultVals.put(FRAME_TO_MATRIX_WARN_CAST, "false"); diff --git a/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java b/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java index 1e06f9acb56..9320bb99b50 100644 --- a/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java +++ b/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java @@ -29,6 +29,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.hops.OptimizerUtils; import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.util.HDFSTool; @@ -157,6 +158,17 @@ public static int countSelected(int size, boolean[] selected) { return n; } + /** Floor on the adaptive writer target file size. Below this the per-file metadata/open + * overhead (and tiny-file proliferation) outweighs the extra read parallelism. 
*/ + public static final long ADAPTIVE_WRITER_MIN_FILE_SIZE = 4L * 1024 * 1024; + + private static Configuration buildConf(Configuration base, int batchSize, long targetFileSize) { + Configuration c = new Configuration(base); + c.setInt(CONF_READER_BATCH_SIZE, batchSize); + c.setLong(CONF_WRITER_TARGET_FILE_SIZE, targetFileSize); + return c; + } + private static synchronized Configuration deltaConf() { Configuration base = ConfigurationManager.getCachedJobConf(); int batchSize = ConfigurationManager.getDeltaReaderBatchSize(); @@ -164,10 +176,7 @@ private static synchronized Configuration deltaConf() { if(cachedConf == null || cachedConfBase != base || cachedBatchSize != batchSize || cachedTargetFileSize != targetFileSize) { - Configuration c = new Configuration(base); - c.setInt(CONF_READER_BATCH_SIZE, batchSize); - c.setLong(CONF_WRITER_TARGET_FILE_SIZE, targetFileSize); - cachedConf = c; + cachedConf = buildConf(base, batchSize, targetFileSize); cachedConfBase = base; cachedBatchSize = batchSize; cachedTargetFileSize = targetFileSize; @@ -179,6 +188,40 @@ public static Engine createEngine() { return DefaultEngine.create(deltaConf()); } + /** + * Compute the parquet target data-file size (bytes) for writing a table of the given + * estimated size. With adaptive sizing enabled the writer aims for roughly one data + * file per expected parallel reader (so the native per-file parallel read can use all + * threads), clamped to {@code [ADAPTIVE_WRITER_MIN_FILE_SIZE, configuredTarget]} so it + * never exceeds the configured/expected target nor produces excessively tiny files. 
+ * + * @param estimatedBytes estimate of the table's size (the block in-memory size is a fine proxy) + * @return the target max parquet data-file size in bytes + */ + public static long adaptiveWriterTargetFileSize(long estimatedBytes) { + long configured = ConfigurationManager.getDeltaWriterTargetFileSize(); + if(!ConfigurationManager.isDeltaWriterAdaptiveFileSize() || estimatedBytes <= 0) + return configured; + int par = Math.max(1, OptimizerUtils.getParallelBinaryReadParallelism()); + long perReader = Math.max(1, estimatedBytes / par); + //never above the configured cap, never below the floor (unless the cap itself is lower) + return Math.min(configured, Math.max(ADAPTIVE_WRITER_MIN_FILE_SIZE, perReader)); + } + + /** + * Create an engine for writing a table of the given estimated size, configured with an + * adaptive target data-file size (see {@link #adaptiveWriterTargetFileSize(long)}). A fresh + * (uncached) configuration is built since writes happen once per table, not per data file. + * + * @param estimatedBytes estimate of the table's size (the block in-memory size is a fine proxy) + * @return a Delta Kernel engine for the write + */ + public static Engine createWriteEngine(long estimatedBytes) { + Configuration c = buildConf(ConfigurationManager.getCachedJobConf(), + ConfigurationManager.getDeltaReaderBatchSize(), adaptiveWriterTargetFileSize(estimatedBytes)); + return DefaultEngine.create(c); + } + /** * Resolve a (possibly relative) path to a fully-qualified URI so the * kernel's default engine can locate the table on the right filesystem. 
diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java index 5980f8b938b..99fea2e0e47 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java @@ -24,6 +24,7 @@ import java.util.Optional; import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.conf.ConfigurationManager; import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.frame.data.FrameBlock; import org.apache.sysds.runtime.frame.data.columns.Array; @@ -51,9 +52,6 @@ */ public class FrameWriterDelta extends FrameWriter { - /** Number of frame rows materialized per columnar batch handed to the engine. */ - private static final int BATCH_ROWS = 4096; - @Override public void writeFrameToHDFS(FrameBlock src, String fname, long rlen, long clen) throws IOException, DMLRuntimeException @@ -75,9 +73,11 @@ public void writeFrameToHDFS(FrameBlock src, String fname, long rlen, long clen) nullable[c] = cols[c].containsNull(); } - Engine engine = DeltaKernelUtils.createEngine(); + int batchRows = ConfigurationManager.getDeltaWriterBatchSize(); + //size data files adaptively (toward one file per parallel reader) for faster parallel reads + Engine engine = DeltaKernelUtils.createWriteEngine(src.getInMemorySize()); DeltaKernelUtils.commit(engine, DeltaKernelUtils.qualify(fname), schema, - new FrameBatchIterator(cols, nullable, schema, nrow, ncol)); + new FrameBatchIterator(cols, nullable, schema, nrow, ncol, batchRows)); } private static StructType buildSchema(ValueType[] vtSchema, String[] names, int ncol) { @@ -107,14 +107,16 @@ private static class FrameBatchIterator implements CloseableIterator[] cols, boolean[] nullable, StructType schema, int nrow, int ncol) { + FrameBatchIterator(Array[] cols, boolean[] nullable, StructType schema, int nrow, int ncol, int batchRows) { _cols = cols; _nullable = 
nullable; _schema = schema; _nrow = nrow; _ncol = ncol; + _batchRows = batchRows; } @Override @@ -126,7 +128,7 @@ public boolean hasNext() { public FilteredColumnarBatch next() { if( !hasNext() ) throw new NoSuchElementException(); - int size = Math.min(BATCH_ROWS, _nrow - _pos); + int size = Math.min(_batchRows, _nrow - _pos); ColumnarBatch batch = new FrameColumnarBatch(_cols, _nullable, _schema, _pos, size, _ncol); _pos += size; return new FilteredColumnarBatch(batch, Optional.empty()); diff --git a/src/main/java/org/apache/sysds/runtime/io/WriterDelta.java b/src/main/java/org/apache/sysds/runtime/io/WriterDelta.java index 0f08bf5517d..55ea8a54297 100644 --- a/src/main/java/org/apache/sysds/runtime/io/WriterDelta.java +++ b/src/main/java/org/apache/sysds/runtime/io/WriterDelta.java @@ -62,7 +62,11 @@ public void writeMatrixToHDFS(MatrixBlock src, String fname, long rlen, long cle //from the backing double[] (avoids per-cell MatrixBlock.get dispatch). double[] dense = (!src.isInSparseFormat() && src.getDenseBlock() != null && src.getDenseBlock().isContiguous()) ? src.getDenseBlockValues() : null; - Engine engine = DeltaKernelUtils.createEngine(); + //size data files adaptively (toward one file per parallel reader) for faster parallel reads. + //Delta writes every cell as a double, so size by the dense footprint rather than the (possibly + //sparse) in-memory size, which would understate the on-disk table for sparse inputs. 
+ long estimatedBytes = (long) nrow * ncol * 8L; + Engine engine = DeltaKernelUtils.createWriteEngine(estimatedBytes); DeltaKernelUtils.commit(engine, DeltaKernelUtils.qualify(fname), buildSchema(ncol), new MatrixBatchIterator(src, dense, nrow, ncol, batchRows)); } From f26b5b6e4d41288669e8da416a76e10f0f445b71 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Wed, 1 Jul 2026 16:19:37 +0000 Subject: [PATCH 03/10] Guard parallel Delta frame reads and reuse shared type mapping The parallel frame reader's metadata-direct path wrote each data file's rows into shared per-column arrays at a fixed offset without bounding the row count, so a table whose per-file numRecords statistic under-counts the actual rows (possible for externally written Delta tables) could overrun its slice into the next file's region under concurrent writes. - Add the per-file row-count overflow guard in FrameReaderDeltaParallel .readDirect, matching the matrix reader: fail fast with a clear message instead of risking overlapping concurrent writes or an array overrun - Reuse DeltaKernelUtils.typeCode/T_* in FrameReaderDelta instead of a forked R_* table and instanceof cascade, keeping the frame and matrix type dispatch in lockstep; drop the now-unused type imports - Extract awaitFileTasks in FrameReaderDeltaParallel to share the pool lifecycle across both read paths and restore the interrupt flag when a parallel read is cancelled - Add a unit test covering the adaptive target-file-size flag on/off and the floor/cap clamp boundaries - Clarify the adaptive-size javadoc floor wording, the createWriteEngine batch-size comment, and rename opaque locals (names2, bcs/bss) --- .../sysds/runtime/io/DeltaKernelUtils.java | 7 +- .../sysds/runtime/io/FrameReaderDelta.java | 41 +++--- .../runtime/io/FrameReaderDeltaParallel.java | 133 ++++++++++-------- .../component/io/DeltaFrameReadWriteTest.java | 28 ++++ 4 files changed, 121 insertions(+), 88 deletions(-) diff --git 
a/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java b/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java index 9320bb99b50..33d1700fec9 100644 --- a/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java +++ b/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java @@ -192,8 +192,9 @@ public static Engine createEngine() { * Compute the parquet target data-file size (bytes) for writing a table of the given * estimated size. With adaptive sizing enabled the writer aims for roughly one data * file per expected parallel reader (so the native per-file parallel read can use all - * threads), clamped to {@code [ADAPTIVE_WRITER_MIN_FILE_SIZE, configuredTarget]} so it - * never exceeds the configured/expected target nor produces excessively tiny files. + * threads): never above the configured target, and never below + * {@code ADAPTIVE_WRITER_MIN_FILE_SIZE} unless the configured target is itself smaller + * than that floor (in which case the configured target wins). * * @param estimatedBytes estimate of the table's size (the block in-memory size is a fine proxy) * @return the target max parquet data-file size in bytes @@ -217,6 +218,8 @@ public static long adaptiveWriterTargetFileSize(long estimatedBytes) { * @return a Delta Kernel engine for the write */ public static Engine createWriteEngine(long estimatedBytes) { + //the reader batch size is irrelevant on the write path but is set to keep the + //conf shape identical to deltaConf(); only the target file size matters here. 
Configuration c = buildConf(ConfigurationManager.getCachedJobConf(), ConfigurationManager.getDeltaReaderBatchSize(), adaptiveWriterTargetFileSize(estimatedBytes)); return DefaultEngine.create(c); diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java index 9d5df380552..be1a538419b 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java @@ -31,15 +31,8 @@ import io.delta.kernel.data.ColumnVector; import io.delta.kernel.engine.Engine; -import io.delta.kernel.types.BooleanType; -import io.delta.kernel.types.ByteType; import io.delta.kernel.types.DataType; -import io.delta.kernel.types.DoubleType; -import io.delta.kernel.types.FloatType; -import io.delta.kernel.types.IntegerType; -import io.delta.kernel.types.LongType; -import io.delta.kernel.types.ShortType; -import io.delta.kernel.types.StringType; + /** * Single-threaded native Delta Lake reader for frames, built on the Spark-free * Delta Kernel library. It opens the latest snapshot of a Delta table, reads @@ -57,9 +50,12 @@ public class FrameReaderDelta extends FrameReader { //per-column read codes (how to pull a value out of the Delta column vector); - //package visible so the parallel reader can reuse the same dispatch. - static final int R_DOUBLE = 0, R_FLOAT = 1, R_LONG = 2, R_INT = 3, - R_SHORT = 4, R_BYTE = 5, R_BOOLEAN = 6, R_STRING = 7; + //aliases of the shared codes in DeltaKernelUtils so the frame read dispatch stays + //in lockstep with the matrix reader's type mapping. Package visible so the parallel + //reader can reuse the same dispatch. 
+ static final int R_DOUBLE = DeltaKernelUtils.T_DOUBLE, R_FLOAT = DeltaKernelUtils.T_FLOAT, + R_LONG = DeltaKernelUtils.T_LONG, R_INT = DeltaKernelUtils.T_INT, R_SHORT = DeltaKernelUtils.T_SHORT, + R_BYTE = DeltaKernelUtils.T_BYTE, R_BOOLEAN = DeltaKernelUtils.T_BOOLEAN, R_STRING = DeltaKernelUtils.T_STRING; @Override public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) @@ -102,14 +98,14 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n }); ValueType[] vt = vtH[0]; - String[] names2 = nameH[0]; + String[] discoveredNames = nameH[0]; int ncol = vt.length; int nrow = nrowH[0]; //empty table: the typed column arrays cannot be zero-length, so return a //schema-only frame with the discovered schema/names and zero rows. if( nrow == 0 ) - return new FrameBlock(vt, names2, 0); + return new FrameBlock(vt, discoveredNames, 0); //concatenate the per-batch column arrays into one typed array per column Array[] columns = new Array[ncol]; @@ -117,7 +113,7 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n columns[c] = buildColumn(vt[c], nrow, batchCols, batchSizes, c); FrameBlock ret = new FrameBlock(columns); - ret.setColumnNames(names2); + ret.setColumnNames(discoveredNames); return ret; } @@ -274,16 +270,13 @@ static Array buildColumn(ValueType vt, int nrow, ArrayList batchCol } static int readCode(DataType dt, String name) { - if( dt instanceof StringType ) return R_STRING; - if( dt instanceof DoubleType ) return R_DOUBLE; - if( dt instanceof FloatType ) return R_FLOAT; - if( dt instanceof LongType ) return R_LONG; - if( dt instanceof IntegerType ) return R_INT; - if( dt instanceof ShortType ) return R_SHORT; - if( dt instanceof ByteType ) return R_BYTE; - if( dt instanceof BooleanType ) return R_BOOLEAN; - throw new DMLRuntimeException("Unsupported non-mappable Delta column '" + name - + "' of type " + dt + " for frame read."); + //reuse the shared Delta 
type -> code mapping; frames additionally reject the + //types the matrix reader also cannot map (typeCode returns -1) + int code = DeltaKernelUtils.typeCode(dt); + if( code < 0 ) + throw new DMLRuntimeException("Unsupported non-mappable Delta column '" + name + + "' of type " + dt + " for frame read."); + return code; } static ValueType valueType(int readCode) { diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java index 81ab88c0c70..209f32eb31a 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -133,37 +134,30 @@ private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, for( int c=0; c> tasks = new ArrayList<>(nfiles); - for( int i=0; i { - int[] cur = new int[] {base}; - Engine eng = DeltaKernelUtils.createEngine(); - DeltaKernelUtils.readScanFile(eng, handle.scanState, handle.physicalReadSchema, scanFileRow, - (cols, size, selected) -> { - for( int c=0; c f : pool.invokeAll(tasks) ) - f.get(); - } - catch(Exception ex) { - throw new IOException("Failed parallel read of Delta table: " + fname, ex); - } - finally { - pool.shutdown(); + ArrayList> tasks = new ArrayList<>(nfiles); + for( int i=0; i { + int[] cur = new int[] {base}; + Engine eng = DeltaKernelUtils.createEngine(); + DeltaKernelUtils.readScanFile(eng, handle.scanState, handle.physicalReadSchema, scanFileRow, + (cols, size, selected) -> { + if( cur[0] + DeltaKernelUtils.countSelected(size, selected) > limit ) + throw new DMLRuntimeException("Delta file produced more rows than its " + + "numRecords statistic; refusing parallel direct read of " + fname); 
+ for( int c=0; c[] columns = new Array[ncol]; for( int c=0; c[] fileCols = new ArrayList[nfiles]; @SuppressWarnings("unchecked") final ArrayList[] fileSizes = new ArrayList[nfiles]; - final ExecutorService pool = CommonThreadPool.get(_numThreads); - try { - ArrayList> tasks = new ArrayList<>(nfiles); - for( int i=0; i { - ArrayList bcs = new ArrayList<>(); - ArrayList bss = new ArrayList<>(); - Engine eng = DeltaKernelUtils.createEngine(); - DeltaKernelUtils.readScanFile(eng, handle.scanState, handle.physicalReadSchema, scanFileRow, - (cols, size, selected) -> { - int n = DeltaKernelUtils.countSelected(size, selected); - Object[] extracted = new Object[ncol]; - for( int c=0; c f : pool.invokeAll(tasks) ) - f.get(); - } - catch(Exception ex) { - throw new IOException("Failed parallel read of Delta table: " + fname, ex); - } - finally { - pool.shutdown(); + ArrayList> tasks = new ArrayList<>(nfiles); + for( int i=0; i { + ArrayList fileBatchCols = new ArrayList<>(); + ArrayList fileBatchSizes = new ArrayList<>(); + Engine eng = DeltaKernelUtils.createEngine(); + DeltaKernelUtils.readScanFile(eng, handle.scanState, handle.physicalReadSchema, scanFileRow, + (cols, size, selected) -> { + int n = DeltaKernelUtils.countSelected(size, selected); + Object[] extracted = new Object[ncol]; + for( int c=0; c batchCols = new ArrayList<>(); @@ -242,6 +226,31 @@ private FrameBlock readBuffered(String fname, DeltaKernelUtils.ScanHandle handle return ret; } + /** + * Run one decode task per data file on the shared common thread pool and await + * completion. Full parallelism is requested (the task count, one per data file, + * naturally caps concurrency); this avoids the per-thread pool-size caching in + * {@code CommonThreadPool.get(k)} that could otherwise throttle this reader to a + * smaller pool created earlier on the same thread. 
+ */ + private void awaitFileTasks(List> tasks, String fname) throws IOException { + ExecutorService pool = CommonThreadPool.get(_numThreads); + try { + for( Future f : pool.invokeAll(tasks) ) + f.get(); + } + catch(InterruptedException ex) { + Thread.currentThread().interrupt(); + throw new IOException("Interrupted during parallel read of Delta table: " + fname, ex); + } + catch(Exception ex) { + throw new IOException("Failed parallel read of Delta table: " + fname, ex); + } + finally { + pool.shutdown(); + } + } + /** Allocate a pre-sized typed column array matching the target value type. */ private static Object allocColumn(ValueType vt, int n) { switch( vt ) { diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java index 1c1d33a80d6..9d561432d50 100644 --- a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java @@ -235,6 +235,34 @@ public void parallelBufferedPathMatchesSerialMultiFile() throws Exception { } } + @Test + public void adaptiveTargetFileSizeClampsAndRespectsFlag() { + //cap chosen above the 4MB floor so both clamp directions are observable + final long cap = 64L * 1024 * 1024; + DMLConfig conf = new DMLConfig(); + conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(cap)); + conf.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "true"); + ConfigurationManager.setLocalConfig(conf); + try { + assertEquals("estimatedBytes<=0 -> configured cap", cap, + DeltaKernelUtils.adaptiveWriterTargetFileSize(0)); + assertEquals("negative estimate -> configured cap", cap, + DeltaKernelUtils.adaptiveWriterTargetFileSize(-1)); + assertEquals("huge table -> never above the configured cap", cap, + DeltaKernelUtils.adaptiveWriterTargetFileSize(Long.MAX_VALUE / 2)); + assertEquals("tiny table -> never below the floor", + 
DeltaKernelUtils.ADAPTIVE_WRITER_MIN_FILE_SIZE, + DeltaKernelUtils.adaptiveWriterTargetFileSize(1)); + + conf.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "false"); + assertEquals("flag OFF -> always the configured cap regardless of size", cap, + DeltaKernelUtils.adaptiveWriterTargetFileSize(1)); + } + finally { + ConfigurationManager.clearLocalConfigs(); + } + } + @Test public void factoryRoutesDeltaToParallelWhenEnabled() { //the factory must pick the parallel frame reader iff parallel CP read is enabled From ed87484ef870de06de43a45c68e78c239a185a83 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Wed, 1 Jul 2026 17:07:19 +0000 Subject: [PATCH 04/10] Decode serial Delta frame reads directly into pre-sized columns The single-threaded frame reader extracted each batch into temporary per-batch arrays and then concatenated them into the final column arrays, allocating and copying every column twice. Delta's per-file numRecords statistic already provides exact row counts from metadata, so the output can be sized up front and decoded in a single pass with no intermediate buffers. 
- Rewrite FrameReaderDelta.readFrameFromHDFS to pre-size one typed array per column from the metadata row counts and decode each data file straight into its row offset (with the same per-file overflow guard as the parallel reader); fall back to the buffered extract-then-concatenate path only when exact counts are unavailable (missing statistics or deletion vectors present) - Move allocColumn/createColumn/extractColumnInto up to FrameReaderDelta so the serial and parallel readers share one copy instead of duplicating the per-type column dispatch - The parallel reader's single-file/low-thread fallback now also decodes in a single direct pass --- .../sysds/runtime/io/FrameReaderDelta.java | 257 +++++++++++++++--- .../runtime/io/FrameReaderDeltaParallel.java | 117 -------- 2 files changed, 215 insertions(+), 159 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java index be1a538419b..53033fbc89b 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java @@ -30,6 +30,7 @@ import org.apache.sysds.runtime.frame.data.columns.ArrayFactory; import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.Row; import io.delta.kernel.engine.Engine; import io.delta.kernel.types.DataType; @@ -63,57 +64,113 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n { Engine engine = DeltaKernelUtils.createEngine(); String tablePath = DeltaKernelUtils.qualify(fname); + DeltaKernelUtils.ScanHandle handle = DeltaKernelUtils.openScan(engine, tablePath); - //per-batch, per-column extracted arrays (boxing free) - ArrayList batchCols = new ArrayList<>(); - ArrayList batchSizes = new ArrayList<>(); - int[] nrowH = new int[1]; - ValueType[][] vtH = new ValueType[1][]; - String[][] nameH = new String[1][]; - int[][] readCodeH = new int[1][]; + //derive per-column read 
codes, value types and names once from the schema + final int ncol = handle.schema.length(); + final int[] readCodes = new int[ncol]; + final ValueType[] vt = new ValueType[ncol]; + final String[] cnames = new String[ncol]; + for( int c=0; c { - int ncol = sch.length(); - int[] readCode = new int[ncol]; - ValueType[] vt = new ValueType[ncol]; - String[] cnames = new String[ncol]; - for( int c=0; c { - int n = DeltaKernelUtils.countSelected(size, selected); - Object[] extracted = new Object[ncol]; - for( int c=0; c pre-size one typed array per column and decode each file + //straight into its row offset, avoiding the per-batch extract + concatenate. + if( handle.hasExactRowCounts() ) { + long total = 0; + for( long r : handle.numRecords ) + total += r; + //empty table: the typed column arrays cannot be zero-length, so return a + //schema-only frame with the discovered schema/names and zero rows. + if( total == 0 ) + return new FrameBlock(vt, cnames, 0); + if( total <= Integer.MAX_VALUE ) + return readDirect(fname, engine, handle, ncol, readCodes, vt, cnames, (int) total); + } - ValueType[] vt = vtH[0]; - String[] discoveredNames = nameH[0]; - int ncol = vt.length; - int nrow = nrowH[0]; + //fallback: row counts unknown or deletion vectors present -> decode into + //per-batch arrays and concatenate per column in file order. + return readBuffered(engine, handle, ncol, readCodes, vt, cnames); + } - //empty table: the typed column arrays cannot be zero-length, so return a - //schema-only frame with the discovered schema/names and zero rows. - if( nrow == 0 ) - return new FrameBlock(vt, discoveredNames, 0); + /** + * Fast path: decode each data file straight into pre-sized typed column arrays + * at a metadata-derived row offset. One allocation per column, single pass, no + * intermediate per-batch buffers or serial concatenation. 
+ */ + private FrameBlock readDirect(String fname, Engine engine, DeltaKernelUtils.ScanHandle handle, + int ncol, int[] readCodes, ValueType[] vt, String[] cnames, int nrow) throws IOException + { + final Object[] dest = new Object[ncol]; + for( int c=0; c { + if( cur[0] + DeltaKernelUtils.countSelected(size, selected) > limit ) + throw new DMLRuntimeException("Delta file produced more rows than its " + + "numRecords statistic; refusing direct read of " + fname); + for( int c=0; c[] columns = new Array[ncol]; for( int c=0; c batchCols = new ArrayList<>(); + final ArrayList batchSizes = new ArrayList<>(); + final int[] nrowH = new int[1]; + for( Row scanFileRow : handle.scanFiles ) { + DeltaKernelUtils.readScanFile(engine, handle.scanState, handle.physicalReadSchema, scanFileRow, + (cols, size, selected) -> { + int n = DeltaKernelUtils.countSelected(size, selected); + Object[] extracted = new Object[ncol]; + for( int c=0; c[] columns = new Array[ncol]; + for( int c=0; c createColumn(ValueType vt, Object full) { + switch( vt ) { + case FP64: return ArrayFactory.create((double[]) full); + case FP32: return ArrayFactory.create((float[]) full); + case INT64: return ArrayFactory.create((long[]) full); + case INT32: return ArrayFactory.create((int[]) full); + case BOOLEAN: return ArrayFactory.create((boolean[]) full); + default: return ArrayFactory.create((String[]) full); // STRING + } + } + + /** + * Decode the live (selected, after deletion vector) rows of one column batch + * directly into a pre-sized typed array starting at absolute row {@code destOff}. + * Null numeric cells keep the array default (0); string nulls are stored as null. 
+ */ + static void extractColumnInto(ColumnVector col, int size, boolean[] selected, + int readCode, Object dest, int destOff) + { + switch( readCode ) { + case R_DOUBLE: { + double[] a = (double[]) dest; + int lr = destOff; + for( int r=0; r> tasks, String fname) throws I } } - /** Allocate a pre-sized typed column array matching the target value type. */ - private static Object allocColumn(ValueType vt, int n) { - switch( vt ) { - case FP64: return new double[n]; - case FP32: return new float[n]; - case INT64: return new long[n]; - case INT32: return new int[n]; - case BOOLEAN: return new boolean[n]; - default: return new String[n]; // STRING - } - } - - /** Wrap a fully populated typed column array into a frame {@link Array}. */ - private static Array createColumn(ValueType vt, Object full) { - switch( vt ) { - case FP64: return ArrayFactory.create((double[]) full); - case FP32: return ArrayFactory.create((float[]) full); - case INT64: return ArrayFactory.create((long[]) full); - case INT32: return ArrayFactory.create((int[]) full); - case BOOLEAN: return ArrayFactory.create((boolean[]) full); - default: return ArrayFactory.create((String[]) full); // STRING - } - } - - /** - * Decode the live (selected, after deletion vector) rows of one column batch - * directly into a pre-sized typed array starting at absolute row {@code destOff}. - * Null numeric cells keep the array default (0); string nulls are stored as null. 
- */ - private static void extractColumnInto(ColumnVector col, int size, boolean[] selected, - int readCode, Object dest, int destOff) - { - switch( readCode ) { - case R_DOUBLE: { - double[] a = (double[]) dest; - int lr = destOff; - for( int r=0; r Date: Wed, 1 Jul 2026 21:05:06 +0000 Subject: [PATCH 05/10] Consolidate Delta frame column materialization via ArrayFactory Route the Delta frame readers' column allocation and wrapping through ArrayFactory instead of reader-local per-type switches: - Add ArrayFactory.allocateBacking(ValueType,int) as the single ValueType -> raw backing array mapping (the inverse of create(ValueType,Object)), and remove the duplicate allocColumn / createColumn / extractColumn / buildColumn switches from FrameReaderDelta and FrameReaderDeltaParallel. The buffered fallback now reuses the same alloc + extractColumnInto + concatColumn primitives as the direct path. - Make create(ValueType,Object) bit-pack boolean columns above the switch point into a BitSetArray (mirroring allocateBoolean), so Delta reads produce the same compact representation as every other frame reader instead of a byte-backed BooleanArray. - Simplify allocate(ValueType,int) to compose create + allocateBacking, keeping only the boolean special case (empty BitSet backing) and moving the UINT4/UINT8 fallback warning into allocateBacking. - Move useDirectPath to FrameReaderDelta so both readers share it. 
--- .../frame/data/columns/ArrayFactory.java | 102 ++++++-- .../sysds/runtime/io/FrameReaderDelta.java | 224 ++++-------------- .../runtime/io/FrameReaderDeltaParallel.java | 32 +-- .../component/io/DeltaFrameReadWriteTest.java | 29 +++ 4 files changed, 170 insertions(+), 217 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/frame/data/columns/ArrayFactory.java b/src/main/java/org/apache/sysds/runtime/frame/data/columns/ArrayFactory.java index 5f2d08a122f..80a5d699dfa 100644 --- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/ArrayFactory.java +++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/ArrayFactory.java @@ -123,6 +123,87 @@ public static RaggedArray create(T[] col, int m) { return new RaggedArray<>(col, m); } + /** + * Wrap a fully populated raw typed column array into an {@link Array} of the given value type. The runtime type of + * {@code col} must match the primitive backing type of {@code vt} (e.g. {@code double[]} for {@link ValueType#FP64}, + * {@code String[]} for {@link ValueType#STRING}). + * + *

For {@link ValueType#BOOLEAN} this mirrors {@link #allocateBoolean(int)}: a {@code boolean[]} longer than + * {@link #bitSetSwitchPoint} is bit-packed into a compact {@link BitSetArray} (so a bulk decoder that fills a + * plain {@code boolean[]} still ends up with the same representation as every other frame allocation path), + * while columns of at most that length stay a plain {@link BooleanArray}.

+ * + * @param vt the value type of the column + * @param col the backing array to wrap + * @return an {@link Array} view over {@code col} (boolean columns may be bit-packed rather than wrapped in place) + */ + public static Array create(ValueType vt, Object col) { + switch(vt) { + case FP64: + return create((double[]) col); + case FP32: + return create((float[]) col); + case INT64: + return create((long[]) col); + case UINT4: + case UINT8: + case INT32: + return create((int[]) col); + case BOOLEAN: { + boolean[] b = (boolean[]) col; + return b.length > bitSetSwitchPoint ? new BitSetArray(b) : create(b); + } + case CHARACTER: + return create((char[]) col); + case HASH64: + return createHash64((long[]) col); + case HASH32: + return createHash32((int[]) col); + case UNKNOWN: + case STRING: + default: + return create((String[]) col); + } + } + + /** + * Allocate the raw backing array for a column of the given value type: the inverse of + * {@link #create(ValueType, Object)}. Returns {@code double[]} for {@link ValueType#FP64}, + * {@code int[]} for INT32/UINT/HASH32, {@code long[]} for INT64/HASH64, {@code String[]} for STRING, etc. The + * runtime array type matches what {@link #create(ValueType, Object)} expects, so a bulk decoder can fill this + * primitive array directly and then wrap it via {@code create(vt, backing)}. 
+ * + * @param vt the value type of the column + * @param nRow the number of rows to allocate + * @return a freshly allocated raw backing array of the matching primitive/object type + */ + public static Object allocateBacking(ValueType vt, int nRow) { + switch(vt) { + case FP64: + return new double[nRow]; + case FP32: + return new float[nRow]; + case INT64: + case HASH64: + return new long[nRow]; + case UINT4: + case UINT8: + LOG.warn("Not supported allocation of UInt 4 or 8 array: defaulting to Int32"); + // fall through: UINT4/UINT8 are backed by int[] (wrapped as Int32) + case INT32: + case HASH32: + return new int[nRow]; + case BOOLEAN: + return new boolean[nRow]; + case CHARACTER: + return new char[nRow]; + case UNKNOWN: + case STRING: + default: + return new String[nRow]; + } + } + public static long getInMemorySize(ValueType type, int _numRows, boolean containsNull) { if(containsNull) { switch(type) { @@ -221,27 +302,8 @@ public static Array allocate(ValueType v, int nRow) { switch(v) { case BOOLEAN: return allocateBoolean(nRow); - case UINT4: - case UINT8: - LOG.warn("Not supported allocation of UInt 4 or 8 array: defaulting to Int32"); - case INT32: - return new IntegerArray(new int[nRow]); - case INT64: - return new LongArray(new long[nRow]); - case FP32: - return new FloatArray(new float[nRow]); - case FP64: - return new DoubleArray(new double[nRow]); - case CHARACTER: - return new CharArray(new char[nRow]); - case HASH64: - return new HashLongArray(new long[nRow]); - case HASH32: - return new HashIntegerArray(new int[nRow]); - case UNKNOWN: - case STRING: default: - return new StringArray(new String[nRow]); + return create(v, allocateBacking(v, nRow)); } } diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java index 53033fbc89b..bb60d30bec2 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java +++ 
b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java @@ -81,7 +81,7 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n //fast path: exact per-file row counts are known from metadata (no deletion //vectors) -> pre-size one typed array per column and decode each file //straight into its row offset, avoiding the per-batch extract + concatenate. - if( handle.hasExactRowCounts() ) { + if( useDirectPath(handle) ) { long total = 0; for( long r : handle.numRecords ) total += r; @@ -98,6 +98,21 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n return readBuffered(engine, handle, ncol, readCodes, vt, cnames); } + /** + * Whether the metadata-driven direct read fast path can be used for this table + * (exact per-file row counts and no deletion vectors, so the output can be + * pre-sized and each file decoded straight into its row offset). Visible for + * testing: the buffered fallback is otherwise only reachable for tables lacking + * row statistics or carrying deletion vectors, which the SystemDS Delta writer + * never produces. + * + * @param handle the opened scan handle + * @return true if the direct path is applicable + */ + protected boolean useDirectPath(DeltaKernelUtils.ScanHandle handle) { + return handle.hasExactRowCounts(); + } + /** * Fast path: decode each data file straight into pre-sized typed column arrays * at a metadata-derived row offset. 
One allocation per column, single pass, no @@ -108,7 +123,7 @@ private FrameBlock readDirect(String fname, Engine engine, DeltaKernelUtils.Scan { final Object[] dest = new Object[ncol]; for( int c=0; c[] columns = new Array[ncol]; for( int c=0; c { int n = DeltaKernelUtils.countSelected(size, selected); Object[] extracted = new Object[ncol]; for( int c=0; c[] columns = new Array[ncol]; for( int c=0; c buildColumn(ValueType vt, int nrow, ArrayList batchCols, + /** + * Concatenate the per-batch typed arrays of one column (in file/batch order) + * into a single pre-sized array and wrap it as a frame {@link Array}. The copy + * is type-agnostic ({@link System#arraycopy} works on any primitive or + * object array), so there is no per-type dispatch here: allocation and + * wrapping reuse {@link ArrayFactory#allocateBacking(ValueType, int)} and + * {@link ArrayFactory#create(ValueType, Object)}, the same primitives the + * single-pass direct path uses. + * + *

Only the buffered fallback needs this concatenation; the default direct + * path decodes straight into one pre-sized array per column with no + * intermediate per-batch arrays.

+ */ + static Array concatColumn(ValueType vt, int nrow, ArrayList batchCols, ArrayList batchSizes, int c) { - switch( vt ) { - case FP64: { - double[] all = new double[nrow]; - int off = 0; - for( int b=0; b createColumn(ValueType vt, Object full) { - switch( vt ) { - case FP64: return ArrayFactory.create((double[]) full); - case FP32: return ArrayFactory.create((float[]) full); - case INT64: return ArrayFactory.create((long[]) full); - case INT32: return ArrayFactory.create((int[]) full); - case BOOLEAN: return ArrayFactory.create((boolean[]) full); - default: return ArrayFactory.create((String[]) full); // STRING - } - } - /** * Decode the live (selected, after deletion vector) rows of one column batch * directly into a pre-sized typed array starting at absolute row {@code destOff}. diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java index 957d53ccc31..05d7ec8edec 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java @@ -31,6 +31,7 @@ import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.frame.data.FrameBlock; import org.apache.sysds.runtime.frame.data.columns.Array; +import org.apache.sysds.runtime.frame.data.columns.ArrayFactory; import org.apache.sysds.runtime.util.CommonThreadPool; import io.delta.kernel.data.Row; @@ -97,20 +98,6 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n return readBuffered(fname, handle, ncol, readCodes, vt, cnames); } - /** - * Whether the metadata-driven direct-write fast path can be used for this - * table (exact per-file row counts and no deletion vectors). Visible for - * testing: the buffered fallback is otherwise only reachable for tables - * lacking row statistics or carrying deletion vectors, which the SystemDS - * Delta writer never produces. 
- * - * @param handle the opened scan handle - * @return true if the direct path is applicable - */ - protected boolean useDirectPath(DeltaKernelUtils.ScanHandle handle) { - return handle.hasExactRowCounts(); - } - /** * Fast path: each thread decodes one data file straight into the final typed * column arrays at a metadata-derived row offset. Single allocation per @@ -130,7 +117,7 @@ private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, //pre-size one typed array per column for the whole table final Object[] dest = new Object[ncol]; for( int c=0; c> tasks = new ArrayList<>(nfiles); for( int i=0; i[] columns = new Array[ncol]; for( int c=0; c { int n = DeltaKernelUtils.countSelected(size, selected); Object[] extracted = new Object[ncol]; - for( int c=0; c[] columns = new Array[ncol]; for( int c=0; c buffered extract+concat + FrameBlock buffered = new FrameReaderDelta() { + @Override protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { return false; } + }.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); + + assertFramesEqual(direct, buffered); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + @Test public void adaptiveTargetFileSizeClampsAndRespectsFlag() { //cap chosen above the 4MB floor so both clamp directions are observable From 35c8a91b117683c750c3c830411502ef803efab8 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Wed, 1 Jul 2026 21:05:20 +0000 Subject: [PATCH 06/10] Add Delta frame read benchmarks Add timing/throughput benchmarks for the serial and parallel Delta frame readers: - DeltaFrameRead in the performance suite (dispatched as Main id 18): writes a random frame to a temp Delta table once as untimed setup, then repeatedly reads it back under timing for serial/parallel/both modes with a configurable writer target file size. Suitable for running under async-profiler. 
- DeltaFrameReadPerf: JUnit-based manual micro-benchmarks (all @Ignore so they never run in the normal build) covering direct-vs-buffered serial reads, adaptive file sizing, target-size and batch-size sweeps, and schema-composition breakdowns. --- .../org/apache/sysds/performance/Main.java | 19 + .../performance/frame/DeltaFrameRead.java | 179 +++++++ .../test/component/io/DeltaFrameReadPerf.java | 441 ++++++++++++++++++ 3 files changed, 639 insertions(+) create mode 100644 src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java create mode 100644 src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java diff --git a/src/test/java/org/apache/sysds/performance/Main.java b/src/test/java/org/apache/sysds/performance/Main.java index f8d0bbea852..0622e789baa 100644 --- a/src/test/java/org/apache/sysds/performance/Main.java +++ b/src/test/java/org/apache/sysds/performance/Main.java @@ -24,6 +24,7 @@ import org.apache.sysds.performance.compression.Serialize; import org.apache.sysds.performance.compression.StreamCompress; import org.apache.sysds.performance.compression.TransformPerf; +import org.apache.sysds.performance.frame.DeltaFrameRead; import org.apache.sysds.performance.frame.Transform; import org.apache.sysds.performance.generators.ConstMatrix; import org.apache.sysds.performance.generators.FrameFile; @@ -113,6 +114,9 @@ private static void exec(int prog, String[] args) throws Exception { case 17: run17(args); break; + case 18: + run18(args); + break; case 1000: run1000(args); break; @@ -238,6 +242,21 @@ private static void run17(String[] args) throws Exception { new MatrixReplacePerf(100, g, k).run(); } + /** + * Repeatedly read the same on-disk Delta frame table (written once as setup). + * Args: {@code 18 rows k n [mode] [targetFileSizeMB]} + * where mode is one of serial|parallel|both (default parallel) and an omitted + * target file size uses the adaptive default sizing.
+ */ + private static void run18(String[] args) throws Exception { + int rows = Integer.parseInt(args[1]); + int k = Integer.parseInt(args[2]); + int n = Integer.parseInt(args[3]); + String mode = (args.length > 4) ? args[4] : "parallel"; + long targetFileSize = (args.length > 5) ? Long.parseLong(args[5]) * 1024 * 1024 : -1; + new DeltaFrameRead(n, DeltaFrameRead.mixedFrame(rows, 7), k, mode, targetFileSize).run(); + } + private static void run1000(String[] args) { MMSparsityPerformance perf; if (args.length < 3) { diff --git a/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java b/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java new file mode 100644 index 00000000000..f525ae438de --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.performance.frame; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.stream.Stream; + +import org.apache.commons.io.FileUtils; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.conf.DMLConfig; +import org.apache.sysds.performance.compression.APerfTest; +import org.apache.sysds.performance.generators.ConstFrame; +import org.apache.sysds.performance.generators.IGenerate; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.io.FrameReaderDelta; +import org.apache.sysds.runtime.io.FrameReaderDeltaParallel; +import org.apache.sysds.runtime.io.FrameWriterDelta; +import org.apache.sysds.test.TestUtils; + +/** + * Reads the SAME native Delta frame table from disk repeatedly and reports read + * throughput. The table is written to a temporary directory ONCE as (untimed) + * setup; every timed repetition re-opens the latest snapshot and materializes a + * fresh {@link FrameBlock}, so the numbers reflect the read path only (parquet + * decode + column materialization), not the write. + * + *

This is the target for an async-profiler run: launch the perf jar under the + * profiler agent and this loop provides a long, steady-state read workload to + * sample. See {@code src/test/java/org/apache/sysds/performance/README.md} and + * the {@code delta-async-profiler} cursor rule.

+ * + *

Dispatched from {@link org.apache.sysds.performance.Main} (program id 18).

+ */ +public class DeltaFrameRead extends APerfTest { + + //the Delta reader derives schema/names from the table metadata, so the values + //passed here are placeholders (a single detect column) and are ignored. + private static final ValueType[] DETECT_SCHEMA = new ValueType[] {ValueType.STRING}; + private static final String[] DETECT_NAMES = new String[] {"x"}; + + private final int k; + private final String mode; + private final long targetFileSize; //<=0 -> adaptive default sizing + + private String tablePath; + private Path tableDir; + private long inMemSize; + private long files; + + public DeltaFrameRead(int N, IGenerate gen, int k, String mode, long targetFileSize) { + super(N, gen); + this.k = k; + this.mode = mode; + this.targetFileSize = targetFileSize; + } + + public void run() throws Exception { + try { + setup(); + System.out.println(this); + System.out.printf("table: %s%n", tablePath); + System.out.printf("layout: files=%d, in-memory=%.1f MB, target=%s%n", + files, inMemSize / 1048576.0, + targetFileSize > 0 ? (targetFileSize / 1048576) + "MB(fixed)" : "adaptive"); + + if( mode.equals("serial") || mode.equals("both") ) + execute(() -> readSerial(), "Delta read serial"); + if( mode.equals("parallel") || mode.equals("both") ) + execute(() -> readParallel(), "Delta read parallel(k=" + k + ")"); + } + finally { + ConfigurationManager.clearLocalConfigs(); + if( tableDir != null ) + FileUtils.deleteQuietly(tableDir.toFile()); + } + } + + /** Untimed: materialize the source frame and write it to a temp Delta table once. 
*/ + private void setup() throws Exception { + FrameBlock fb = gen.take(); + inMemSize = fb.getInMemorySize(); + + DMLConfig c = new DMLConfig(); + if( targetFileSize > 0 ) { + c.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "false"); + c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(targetFileSize)); + } + ConfigurationManager.setLocalConfig(c); + + tableDir = Files.createTempDirectory("sysds_delta_frame_read_"); + tablePath = new File(tableDir.toFile(), "table").getAbsolutePath(); + new FrameWriterDelta().writeFrameToHDFS(fb, tablePath, fb.getNumRows(), fb.getNumColumns()); + files = countParquet(tablePath); + } + + private void readSerial() { + try { + FrameBlock fb = new FrameReaderDelta() + .readFrameFromHDFS(tablePath, DETECT_SCHEMA, DETECT_NAMES, -1, -1); + ret.add(fb.getInMemorySize()); + } + catch(Exception e) { + throw new RuntimeException(e); + } + } + + private void readParallel() { + try { + FrameBlock fb = new FrameReaderDeltaParallel() + .readFrameFromHDFS(tablePath, DETECT_SCHEMA, DETECT_NAMES, -1, -1); + ret.add(fb.getInMemorySize()); + } + catch(Exception e) { + throw new RuntimeException(e); + } + } + + private static long countParquet(String tablePath) throws Exception { + try( Stream s = Files.walk(new File(tablePath).toPath()) ) { + return s.filter(p -> p.toString().endsWith(".parquet")).count(); + } + } + + @Override + protected String makeResString() { + throw new RuntimeException("Do not call"); + } + + @Override + protected String makeResString(double[] times) { + double meanMs = trimmedMean(times); + double mbPerSec = (inMemSize / 1048576.0) / (meanMs / 1000.0); + return String.format("%8.1f MB/s", mbPerSec); + } + + /** 5%-trimmed mean, matching the trimming used by the framework statistics. 
*/ + private static double trimmedMean(double[] times) { + double[] v = times.clone(); + java.util.Arrays.sort(v); + int remove = (int) Math.floor(v.length * 0.05); + double total = 0; + int el = v.length - remove * 2; + for( int i = remove; i < v.length - remove; i++ ) + total += v[i]; + return total / Math.max(el, 1); + } + + @Override + public String toString() { + return super.toString() + " mode: " + mode + ", threads: " + k; + } + + /** Build a representative mixed-schema frame (string + numeric columns). */ + public static IGenerate mixedFrame(int rows, long seed) { + ValueType[] schema = new ValueType[] {ValueType.STRING, ValueType.INT64, ValueType.FP64, + ValueType.BOOLEAN, ValueType.INT32, ValueType.FP32}; + return new ConstFrame(TestUtils.generateRandomFrameBlock(rows, schema, seed)); + } +} diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java new file mode 100644 index 00000000000..d09219cc323 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java @@ -0,0 +1,441 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.component.io; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Random; + +import org.apache.commons.io.FileUtils; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.conf.DMLConfig; +import org.apache.sysds.hops.OptimizerUtils; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.io.DeltaKernelUtils; +import org.apache.sysds.runtime.io.FrameReaderDelta; +import org.apache.sysds.runtime.io.FrameReaderDeltaParallel; +import org.apache.sysds.runtime.io.FrameWriterDelta; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Manual micro-benchmark comparing the serial {@link FrameReaderDelta} against + * the parallel {@link FrameReaderDeltaParallel} on multi-file Delta frame + * tables. Not a correctness test (those live in {@link DeltaFrameReadWriteTest}); + * it just prints timing/throughput numbers and is {@link Ignore}d so it does not + * run in the normal build. + * + *

The parallel reader runs one decode task per parquet data file, so the speedup + * scales with the number of files (controlled here via the writer target file + * size). Run it on a JVM with a realistically sized heap; under a tiny young + * generation (e.g. the Surefire fork's {@code -Xmn300m}) the higher allocation + * rate of the concurrent decode lets young-GC pauses dominate the runtime, so the numbers are not + * representative of a normal SystemDS process.

+ * + *

Run explicitly: either remove {@link Ignore} and use e.g. {@code mvn -q test -Dtest=DeltaFrameReadPerf -DfailIfNoTests=false}, + * or run the compiled class directly (its {@code main} runs only {@code serialDirectVsBuffered}).

+ */ +public class DeltaFrameReadPerf { + + private static final ValueType[] NO_SCHEMA = new ValueType[] {ValueType.STRING}; + private static final String[] NO_NAMES = new String[] {"x"}; + + private static final long MB = 1024L * 1024; + private static final int WARMUP = 2; + private static final int REPS = 7; + + /** Entry point so the (otherwise {@code @Ignore}d) benchmarks can be run directly. */ + public static void main(String[] args) throws Exception { + DeltaFrameReadPerf p = new DeltaFrameReadPerf(); + p.serialDirectVsBuffered(); + } + + /** + * Isolates the serial-reader change: compares the new direct (pre-sized, + * metadata-driven, single-pass) read against the old buffered (per-batch + * extract + concatenate) read on the SAME single-file table, so the only + * difference is the extra allocation + concatenation copy. Single file => + * no file-level parallelism involved, pure serial decode cost. + */ + @Test + @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") + public void serialDirectVsBuffered() throws Exception { + System.out.println("\n=== serial direct vs buffered (single file, 4M rows) ==="); + System.out.printf("%-9s %11s %11s %9s%n", "schema", "direct(ms)", "buffered(ms)", "speedup"); + for( String kind : new String[] {"numeric", "mixed", "string"} ) { + //force a single data file: disable adaptive sizing, huge target + DMLConfig c = new DMLConfig(); + c.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "false"); + c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(4L * 1024 * MB)); + ConfigurationManager.setLocalConfig(c); + Path dir = Files.createTempDirectory("sysds_delta_frame_ab_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + FrameBlock in = genFrame(kind, 4_000_000, 7); + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + //direct = default serial reader; buffered = force the fallback path + 
FrameReaderDelta direct = new FrameReaderDelta(); + FrameReaderDelta buffered = new FrameReaderDelta() { + @Override protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { return false; } + }; + for( int i=0; i direct.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + tb[i] = time(() -> buffered.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + } + double md = median(td), mb = median(tb); + long ad = allocBytes(() -> direct.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + long ab = allocBytes(() -> buffered.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + System.out.printf("%-9s %11.2f %11.2f %8.2fx alloc: %6.0f / %6.0f MB (%.2fx)%n", + kind, md, mb, mb / md, ad / (double) MB, ab / (double) MB, ab / (double) ad); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + } + + /** + * End-to-end check of adaptive writer file sizing with NO explicit target size configured + * (the real default): the table should now be split into ~one file per reader and read fast, + * versus the single/few-file layout the fixed 64MB default produced. 
+ */ + @Test + @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") + public void adaptiveCheck() throws Exception { + System.out.println("\n=== adaptive writer file sizing (default config, no target set) ==="); + System.out.println("threads = " + OptimizerUtils.getParallelBinaryReadParallelism()); + System.out.printf("%-9s %-8s %-6s %11s %11s %9s%n", + "rows", "adaptive", "files", "serial(ms)", "par(ms)", "speedup"); + for( int rows : new int[] {1_000_000, 4_000_000} ) { + //default config => 64MB cap, adaptive sizing enabled + ConfigurationManager.setLocalConfig(new DMLConfig()); + Path dir = Files.createTempDirectory("sysds_delta_frame_adp_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + FrameBlock in = genFrame("mixed", rows, 7); + long est = in.getInMemorySize(); + long target = DeltaKernelUtils.adaptiveWriterTargetFileSize(est); + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + long files = countParquet(tablePath); + double[] r = measure(tablePath); + System.out.printf("%-9d %-8s %-6d %11.2f %11.2f %8.2fx%n", + rows, (target / MB) + "MB", files, r[0], r[1], r[0] / r[1]); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + } + + /** + * Sweep the writer target file size ({@link DMLConfig#DELTA_WRITER_TARGET_FILE_SIZE}, the + * one public Delta knob that affects read parallelism) to find where the per-file parallel + * read stops improving, i.e. a good default for read-heavy use. 
+ */ + @Test + @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") + public void targetSizeSweep() throws Exception { + final int rows = 4_000_000; + final long[] sizesMB = {128, 64, 32, 16, 8, 4, 2}; + System.out.println("\n=== writer target-size sweep (mixed, " + rows + " rows, " + + OptimizerUtils.getParallelBinaryReadParallelism() + " threads) ==="); + System.out.printf("%-9s %-6s %11s %11s %9s%n", "targetMB", "files", "serial(ms)", "par(ms)", "speedup"); + for( long mb : sizesMB ) { + DMLConfig c = new DMLConfig(); + c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(mb * MB)); + ConfigurationManager.setLocalConfig(c); + Path dir = Files.createTempDirectory("sysds_delta_frame_ts_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + FrameBlock in = genFrame("mixed", rows, 7); + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + long files = countParquet(tablePath); + double[] r = measure(tablePath); + System.out.printf("%-9d %-6d %11.2f %11.2f %8.2fx%n", mb, files, r[0], r[1], r[0] / r[1]); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + } + + /** + * Sweep the parquet reader batch size ({@link DMLConfig#DELTA_READER_BATCH_SIZE}, a + * public Delta Kernel knob) on a fixed multi-file table, with and without quieting the + * parquet/delta loggers. Pure "how we call the public API" tuning. 
+ */ + @Test + @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") + public void batchSizeSweep() throws Exception { + final int rows = 2_000_000; + final long fileSize = 8 * MB; + final int[] batches = {1024, 4096, 8192, 16384, 32768, 65536, 131072}; + System.out.println("\n=== reader batch-size sweep (mixed, " + rows + " rows, 8MB files) ==="); + + //write the table ONCE; the batch size only affects the read path + DMLConfig wconf = new DMLConfig(); + wconf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(fileSize)); + ConfigurationManager.setLocalConfig(wconf); + Path dir = Files.createTempDirectory("sysds_delta_frame_bs_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + FrameBlock in = genFrame("mixed", rows, 7); + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + System.out.println("files = " + countParquet(tablePath)); + + for( boolean quietLog : new boolean[] {false, true} ) { + if( quietLog ) + silenceParquetLogging(); + System.out.println(quietLog ? "-- parquet/delta logging -> ERROR --" : "-- default logging --"); + System.out.printf("%-9s %11s %11s%n", "batch", "serial(ms)", "par(ms)"); + for( int bs : batches ) { + DMLConfig c = new DMLConfig(); + c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(fileSize)); + c.setTextValue(DMLConfig.DELTA_READER_BATCH_SIZE, String.valueOf(bs)); + ConfigurationManager.setLocalConfig(c); + double[] r = measure(tablePath); + System.out.printf("%-9d %11.2f %11.2f%n", bs, r[0], r[1]); + } + } + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + + /** Median serial and parallel read time (ms) for a fixed table under the current config. 
*/ + private double[] measure(String tablePath) throws Exception { + FrameReaderDelta serial = new FrameReaderDelta(); + FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); + for( int i=0; i serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + } + return new double[] {median(ts), median(tp)}; + } + + @Test + @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") + public void benchmark() throws Exception { + System.out.println("\n=== Delta frame reader benchmark ==="); + System.out.println("parallel read threads = " + OptimizerUtils.getParallelBinaryReadParallelism() + + ", processors = " + Runtime.getRuntime().availableProcessors()); + System.out.printf("%-9s %-7s %-6s %11s %11s %9s%n", + "rows", "fileMB", "files", "serial(ms)", "par(ms)", "speedup"); + runCase(1_000_000, 4 * MB); + runCase(1_000_000, 16 * MB); + runCase(4_000_000, 8 * MB); + runCase(4_000_000, 64 * MB); + } + + @Test + @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") + public void schemaBreakdown() throws Exception { + System.out.println("\n=== schema composition breakdown (2M rows, 8MB files) ==="); + System.out.printf("%-10s %-6s %11s %11s %9s%n", "schema", "files", "serial(ms)", "par(ms)", "speedup"); + int rows = 2_000_000; + for( boolean quietLog : new boolean[] {false, true} ) { + if( quietLog ) + silenceParquetLogging(); + System.out.println(quietLog ? 
"-- parquet/delta logging -> ERROR --" : "-- default logging --"); + runSchema("numeric", rows, 8 * MB); + runSchema("mixed", rows, 8 * MB); + runSchema("string", rows, 8 * MB); + } + } + + private static void silenceParquetLogging() { + org.apache.log4j.Logger.getLogger("org.apache.parquet").setLevel(org.apache.log4j.Level.ERROR); + org.apache.log4j.Logger.getLogger("io.delta").setLevel(org.apache.log4j.Level.ERROR); + org.apache.log4j.Logger.getLogger("shaded.parquet").setLevel(org.apache.log4j.Level.ERROR); + } + + private void runSchema(String kind, int rows, long targetFileSize) throws Exception { + DMLConfig conf = new DMLConfig(); + conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(targetFileSize)); + ConfigurationManager.setLocalConfig(conf); + Path dir = Files.createTempDirectory("sysds_delta_frame_perf_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + FrameBlock in = genFrame(kind, rows, 7); + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + long files = countParquet(tablePath); + FrameReaderDelta serial = new FrameReaderDelta(); + FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); + for( int i=0; i serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + } + double ms = median(ts), mp = median(tp); + System.out.printf("%-10s %-6d %11.2f %11.2f %8.2fx%n", kind, files, ms, mp, ms / mp); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + + private void runCase(int rows, long targetFileSize) throws Exception { + DMLConfig conf = new DMLConfig(); + conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(targetFileSize)); + ConfigurationManager.setLocalConfig(conf); + Path dir = Files.createTempDirectory("sysds_delta_frame_perf_"); + String tablePath = new 
File(dir.toFile(), "table").getAbsolutePath(); + try { + FrameBlock in = genMixedFrame(rows, 7); + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + long files = countParquet(tablePath); + + FrameReaderDelta serial = new FrameReaderDelta(); + FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); + + for( int i=0; i serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); + } + double ms = median(ts), mp = median(tp); + System.out.printf("%-9d %-7d %-6d %11.2f %11.2f %8.2fx%n", + rows, targetFileSize / MB, files, ms, mp, ms / mp); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + + private static FrameBlock genFrame(String kind, int nrow, int seed) { + ValueType[] schema; + switch( kind ) { + case "numeric": + schema = new ValueType[] {ValueType.INT64, ValueType.FP64, ValueType.INT32, + ValueType.FP32, ValueType.BOOLEAN, ValueType.INT64}; + break; + case "string": + schema = new ValueType[] {ValueType.STRING, ValueType.STRING, ValueType.STRING, + ValueType.STRING, ValueType.STRING, ValueType.STRING}; + break; + default: // mixed + schema = new ValueType[] {ValueType.STRING, ValueType.INT64, ValueType.FP64, + ValueType.BOOLEAN, ValueType.INT32, ValueType.FP32}; + } + String[] names = {"c0", "c1", "c2", "c3", "c4", "c5"}; + FrameBlock fb = new FrameBlock(schema, names); + fb.ensureAllocatedColumns(nrow); + Random rnd = new Random(seed); + for( int r=0; r s = Files.walk(new File(tablePath).toPath()) ) { + return s.filter(p -> p.toString().endsWith(".parquet")).count(); + } + } + + private static FrameBlock genMixedFrame(int nrow, int seed) { + ValueType[] schema = {ValueType.STRING, ValueType.INT64, ValueType.FP64, + ValueType.BOOLEAN, ValueType.INT32, ValueType.FP32}; + String[] names = {"name", "id", "score", "active", "count", "ratio"}; + FrameBlock 
fb = new FrameBlock(schema, names); + fb.ensureAllocatedColumns(nrow); + Random rnd = new Random(seed); + for( int r=0; r Date: Wed, 1 Jul 2026 21:52:06 +0000 Subject: [PATCH 07/10] Apply Eclipse code formatter to new Delta frame files Reformat the new Delta frame reader/writer and test files with the project Eclipse style (dev/CodeStyle_eclipse.xml). Whitespace and layout only, no behavioral changes. --- .../sysds/runtime/io/FrameReaderDelta.java | 255 ++++++------ .../runtime/io/FrameReaderDeltaParallel.java | 107 +++-- .../sysds/runtime/io/FrameWriterDelta.java | 71 ++-- .../performance/frame/DeltaFrameRead.java | 54 +-- .../test/component/io/DeltaFrameReadPerf.java | 180 +++++---- .../component/io/DeltaFrameReadWriteTest.java | 374 +++++++++++------- .../io/delta/FrameDeltaReadWriteTest.java | 41 +- 7 files changed, 603 insertions(+), 479 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java index bb60d30bec2..97d54faf6cd 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java @@ -35,76 +35,72 @@ import io.delta.kernel.types.DataType; /** - * Single-threaded native Delta Lake reader for frames, built on the Spark-free - * Delta Kernel library. It opens the latest snapshot of a Delta table, reads - * its parquet data files through the kernel's default engine (honoring deletion - * vectors), and materializes the columns into a {@link FrameBlock} whose schema - * and column names are derived from the Delta table schema. + * Single-threaded native Delta Lake reader for frames, built on the Spark-free Delta Kernel library. 
It opens the + * latest snapshot of a Delta table, reads its parquet data files through the kernel's default engine (honoring deletion + * vectors), and materializes the columns into a {@link FrameBlock} whose schema and column names are derived from the + * Delta table schema. * - *

Data is extracted column-at-a-time into primitive arrays (no per-cell - * boxing or {@code FrameBlock.set} dispatch) and the frame is constructed - * directly from typed column {@link Array}s. Supported column types map to - * SystemDS value types: double, float, long, int, short, byte, boolean, and - * string. Neither the schema nor the dimensions need to be supplied; they are - * discovered from the table.

+ *

+ * Data is extracted column-at-a-time into primitive arrays (no per-cell boxing or {@code FrameBlock.set} dispatch) and + * the frame is constructed directly from typed column {@link Array}s. Supported column types map to SystemDS value + * types: double, float, long, int, short, byte, boolean, and string. Neither the schema nor the dimensions need to be + * supplied; they are discovered from the table. + *

*/ public class FrameReaderDelta extends FrameReader { - //per-column read codes (how to pull a value out of the Delta column vector); - //aliases of the shared codes in DeltaKernelUtils so the frame read dispatch stays - //in lockstep with the matrix reader's type mapping. Package visible so the parallel - //reader can reuse the same dispatch. + // per-column read codes (how to pull a value out of the Delta column vector); + // aliases of the shared codes in DeltaKernelUtils so the frame read dispatch stays + // in lockstep with the matrix reader's type mapping. Package visible so the parallel + // reader can reuse the same dispatch. static final int R_DOUBLE = DeltaKernelUtils.T_DOUBLE, R_FLOAT = DeltaKernelUtils.T_FLOAT, R_LONG = DeltaKernelUtils.T_LONG, R_INT = DeltaKernelUtils.T_INT, R_SHORT = DeltaKernelUtils.T_SHORT, R_BYTE = DeltaKernelUtils.T_BYTE, R_BOOLEAN = DeltaKernelUtils.T_BOOLEAN, R_STRING = DeltaKernelUtils.T_STRING; @Override public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) - throws IOException, DMLRuntimeException - { + throws IOException, DMLRuntimeException { Engine engine = DeltaKernelUtils.createEngine(); String tablePath = DeltaKernelUtils.qualify(fname); DeltaKernelUtils.ScanHandle handle = DeltaKernelUtils.openScan(engine, tablePath); - //derive per-column read codes, value types and names once from the schema + // derive per-column read codes, value types and names once from the schema final int ncol = handle.schema.length(); final int[] readCodes = new int[ncol]; final ValueType[] vt = new ValueType[ncol]; final String[] cnames = new String[ncol]; - for( int c=0; c pre-size one typed array per column and decode each file - //straight into its row offset, avoiding the per-batch extract + concatenate. 
- if( useDirectPath(handle) ) { + // fast path: exact per-file row counts are known from metadata (no deletion + // vectors) -> pre-size one typed array per column and decode each file + // straight into its row offset, avoiding the per-batch extract + concatenate. + if(useDirectPath(handle)) { long total = 0; - for( long r : handle.numRecords ) + for(long r : handle.numRecords) total += r; - //empty table: the typed column arrays cannot be zero-length, so return a - //schema-only frame with the discovered schema/names and zero rows. - if( total == 0 ) + // empty table: the typed column arrays cannot be zero-length, so return a + // schema-only frame with the discovered schema/names and zero rows. + if(total == 0) return new FrameBlock(vt, cnames, 0); - if( total <= Integer.MAX_VALUE ) + if(total <= Integer.MAX_VALUE) return readDirect(fname, engine, handle, ncol, readCodes, vt, cnames, (int) total); } - //fallback: row counts unknown or deletion vectors present -> decode into - //per-batch arrays and concatenate per column in file order. + // fallback: row counts unknown or deletion vectors present -> decode into + // per-batch arrays and concatenate per column in file order. return readBuffered(engine, handle, ncol, readCodes, vt, cnames); } /** - * Whether the metadata-driven direct read fast path can be used for this table - * (exact per-file row counts and no deletion vectors, so the output can be - * pre-sized and each file decoded straight into its row offset). Visible for - * testing: the buffered fallback is otherwise only reachable for tables lacking - * row statistics or carrying deletion vectors, which the SystemDS Delta writer - * never produces. + * Whether the metadata-driven direct read fast path can be used for this table (exact per-file row counts and no + * deletion vectors, so the output can be pre-sized and each file decoded straight into its row offset). 
Visible for + * testing: the buffered fallback is otherwise only reachable for tables lacking row statistics or carrying deletion + * vectors, which the SystemDS Delta writer never produces. * * @param handle the opened scan handle * @return true if the direct path is applicable @@ -114,30 +110,28 @@ protected boolean useDirectPath(DeltaKernelUtils.ScanHandle handle) { } /** - * Fast path: decode each data file straight into pre-sized typed column arrays - * at a metadata-derived row offset. One allocation per column, single pass, no - * intermediate per-batch buffers or serial concatenation. + * Fast path: decode each data file straight into pre-sized typed column arrays at a metadata-derived row offset. + * One allocation per column, single pass, no intermediate per-batch buffers or serial concatenation. */ - private FrameBlock readDirect(String fname, Engine engine, DeltaKernelUtils.ScanHandle handle, - int ncol, int[] readCodes, ValueType[] vt, String[] cnames, int nrow) throws IOException - { + private FrameBlock readDirect(String fname, Engine engine, DeltaKernelUtils.ScanHandle handle, int ncol, + int[] readCodes, ValueType[] vt, String[] cnames, int nrow) throws IOException { final Object[] dest = new Object[ncol]; - for( int c=0; c { - if( cur[0] + DeltaKernelUtils.countSelected(size, selected) > limit ) + DeltaKernelUtils.readScanFile(engine, handle.scanState, handle.physicalReadSchema, handle.scanFiles.get(i), + (cols, size, selected) -> { + if(cur[0] + DeltaKernelUtils.countSelected(size, selected) > limit) throw new DMLRuntimeException("Delta file produced more rows than its " + "numRecords statistic; refusing direct read of " + fname); - for( int c=0; c[] columns = new Array[ncol]; - for( int c=0; c batchCols = new ArrayList<>(); final ArrayList batchSizes = new ArrayList<>(); final int[] nrowH = new int[1]; - for( Row scanFileRow : handle.scanFiles ) { + for(Row scanFileRow : handle.scanFiles) { DeltaKernelUtils.readScanFile(engine, handle.scanState, 
handle.physicalReadSchema, scanFileRow, (cols, size, selected) -> { int n = DeltaKernelUtils.countSelected(size, selected); Object[] extracted = new Object[ncol]; - for( int c=0; c[] columns = new Array[ncol]; - for( int c=0; cOnly the buffered fallback needs this concatenation; the default direct - * path decodes straight into one pre-sized array per column with no - * intermediate per-batch arrays.

+ *

+ * Only the buffered fallback needs this concatenation; the default direct path decodes straight into one pre-sized + * array per column with no intermediate per-batch arrays. + *

*/ - static Array concatColumn(ValueType vt, int nrow, ArrayList batchCols, - ArrayList batchSizes, int c) - { + static Array concatColumn(ValueType vt, int nrow, ArrayList batchCols, ArrayList batchSizes, + int c) { Object full = ArrayFactory.allocateBacking(vt, nrow); int off = 0; - for( int b=0; b concatColumn(ValueType vt, int nrow, ArrayList batchCo } static int readCode(DataType dt, String name) { - //reuse the shared Delta type -> code mapping; frames additionally reject the - //types the matrix reader also cannot map (typeCode returns -1) + // reuse the shared Delta type -> code mapping; frames additionally reject the + // types the matrix reader also cannot map (typeCode returns -1) int code = DeltaKernelUtils.typeCode(dt); - if( code < 0 ) - throw new DMLRuntimeException("Unsupported non-mappable Delta column '" + name - + "' of type " + dt + " for frame read."); + if(code < 0) + throw new DMLRuntimeException( + "Unsupported non-mappable Delta column '" + name + "' of type " + dt + " for frame read."); return code; } static ValueType valueType(int readCode) { - switch( readCode ) { - case R_DOUBLE: return ValueType.FP64; - case R_FLOAT: return ValueType.FP32; - case R_LONG: return ValueType.INT64; + switch(readCode) { + case R_DOUBLE: + return ValueType.FP64; + case R_FLOAT: + return ValueType.FP32; + case R_LONG: + return ValueType.INT64; case R_INT: case R_SHORT: - case R_BYTE: return ValueType.INT32; - case R_BOOLEAN: return ValueType.BOOLEAN; - default: return ValueType.STRING; + case R_BYTE: + return ValueType.INT32; + case R_BOOLEAN: + return ValueType.BOOLEAN; + default: + return ValueType.STRING; } } /** - * Decode the live (selected, after deletion vector) rows of one column batch - * directly into a pre-sized typed array starting at absolute row {@code destOff}. - * Null numeric cells keep the array default (0); string nulls are stored as null. 
+ * Decode the live (selected, after deletion vector) rows of one column batch directly into a pre-sized typed array + * starting at absolute row {@code destOff}. Null numeric cells keep the array default (0); string nulls are stored + * as null. */ - static void extractColumnInto(ColumnVector col, int size, boolean[] selected, - int readCode, Object dest, int destOff) - { - switch( readCode ) { + static void extractColumnInto(ColumnVector col, int size, boolean[] selected, int readCode, Object dest, + int destOff) { + switch(readCode) { case R_DOUBLE: { double[] a = (double[]) dest; int lr = destOff; - for( int r=0; rIt mirrors {@link ReaderDeltaParallel} (the matrix variant) but produces - * typed column {@link Array}s instead of a dense {@code double[]}. As with the - * matrix reader, the expensive part of a Delta read is the per-file parquet - * decode, so parallelizing across data files is the natural speedup. A table - * backed by a single data file cannot be split this way, so the reader - * transparently falls back to the sequential {@link FrameReaderDelta}.

+ *

+ * It mirrors {@link ReaderDeltaParallel} (the matrix variant) but produces typed column {@link Array}s instead of a + * dense {@code double[]}. As with the matrix reader, the expensive part of a Delta read is the per-file parquet decode, + * so parallelizing across data files is the natural speedup. A table backed by a single data file cannot be split this + * way, so the reader transparently falls back to the sequential {@link FrameReaderDelta}. + *

*/ public class FrameReaderDeltaParallel extends FrameReaderDelta { @@ -61,37 +60,36 @@ public FrameReaderDeltaParallel() { @Override public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) - throws IOException, DMLRuntimeException - { + throws IOException, DMLRuntimeException { Engine engine = DeltaKernelUtils.createEngine(); String tablePath = DeltaKernelUtils.qualify(fname); DeltaKernelUtils.ScanHandle handle = DeltaKernelUtils.openScan(engine, tablePath); final int nfiles = handle.scanFiles.size(); - //nothing to gain from parallelism for single-file (or empty) tables - if( _numThreads <= 1 || nfiles <= 1 ) + // nothing to gain from parallelism for single-file (or empty) tables + if(_numThreads <= 1 || nfiles <= 1) return super.readFrameFromHDFS(fname, schema, names, rlen, clen); - //derive per-column read codes, value types and names once from the schema + // derive per-column read codes, value types and names once from the schema final int ncol = handle.schema.length(); final int[] readCodes = new int[ncol]; final ValueType[] vt = new ValueType[ncol]; final String[] cnames = new String[ncol]; - for( int c=0; c pre-size - //one typed array per column and let each thread decode directly into its - //row offset (no intermediate buffers, no serial concatenation). - if( useDirectPath(handle) ) { + // fast path: exact per-file row counts are known from metadata -> pre-size + // one typed array per column and let each thread decode directly into its + // row offset (no intermediate buffers, no serial concatenation). 
+ if(useDirectPath(handle)) { long total = 0; - for( long r : handle.numRecords ) + for(long r : handle.numRecords) total += r; - if( total > 0 && total <= Integer.MAX_VALUE ) + if(total > 0 && total <= Integer.MAX_VALUE) return readDirect(fname, handle, ncol, readCodes, vt, cnames, (int) total); } @@ -99,43 +97,41 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n } /** - * Fast path: each thread decodes one data file straight into the final typed - * column arrays at a metadata-derived row offset. Single allocation per - * column, fully parallel. + * Fast path: each thread decodes one data file straight into the final typed column arrays at a metadata-derived + * row offset. Single allocation per column, fully parallel. */ - private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, - int ncol, int[] readCodes, ValueType[] vt, String[] cnames, int nrow) throws IOException - { + private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, int ncol, int[] readCodes, + ValueType[] vt, String[] cnames, int nrow) throws IOException { final int nfiles = handle.scanFiles.size(); final int[] rowOffset = new int[nfiles]; int acc = 0; - for( int i=0; i> tasks = new ArrayList<>(nfiles); - for( int i=0; i { int[] cur = new int[] {base}; Engine eng = DeltaKernelUtils.createEngine(); DeltaKernelUtils.readScanFile(eng, handle.scanState, handle.physicalReadSchema, scanFileRow, (cols, size, selected) -> { - if( cur[0] + DeltaKernelUtils.countSelected(size, selected) > limit ) + if(cur[0] + DeltaKernelUtils.countSelected(size, selected) > limit) throw new DMLRuntimeException("Delta file produced more rows than its " + "numRecords statistic; refusing parallel direct read of " + fname); - for( int c=0; c[] columns = new Array[ncol]; - for( int c=0; c[] fileCols = new ArrayList[nfiles]; @SuppressWarnings("unchecked") final ArrayList[] fileSizes = new ArrayList[nfiles]; ArrayList> tasks = new ArrayList<>(nfiles); 
- for( int i=0; i { @@ -179,9 +173,9 @@ private FrameBlock readBuffered(String fname, DeltaKernelUtils.ScanHandle handle (cols, size, selected) -> { int n = DeltaKernelUtils.countSelected(size, selected); Object[] extracted = new Object[ncol]; - for( int c=0; c batchCols = new ArrayList<>(); ArrayList batchSizes = new ArrayList<>(); int nrow = 0; - for( int i=0; i[] columns = new Array[ncol]; - for( int c=0; c> tasks, String fname) throws IOException { ExecutorService pool = CommonThreadPool.get(_numThreads); try { - for( Future f : pool.invokeAll(tasks) ) + for(Future f : pool.invokeAll(tasks)) f.get(); } catch(InterruptedException ex) { diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java index 99fea2e0e47..01f90259598 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java @@ -44,37 +44,35 @@ import io.delta.kernel.utils.CloseableIterator; /** - * Single-threaded native Delta Lake writer for frames, built on the Spark-free - * Delta Kernel library. It creates (or recreates) a Delta table whose schema - * mirrors the frame schema (per-column {@link ValueType} mapped to a Delta - * type and the frame column names), streams the {@link FrameBlock} rows as - * columnar batches into parquet data files, and commits the add-file actions. + * Single-threaded native Delta Lake writer for frames, built on the Spark-free Delta Kernel library. It creates (or + * recreates) a Delta table whose schema mirrors the frame schema (per-column {@link ValueType} mapped to a Delta type + * and the frame column names), streams the {@link FrameBlock} rows as columnar batches into parquet data files, and + * commits the add-file actions. 
*/ public class FrameWriterDelta extends FrameWriter { @Override public void writeFrameToHDFS(FrameBlock src, String fname, long rlen, long clen) - throws IOException, DMLRuntimeException - { - if( src.getNumRows() != rlen || src.getNumColumns() != clen ) - throw new IOException("Frame dimensions mismatch with metadata: (" - + src.getNumRows() + "x" + src.getNumColumns() + ") vs (" + rlen + "x" + clen + ")."); + throws IOException, DMLRuntimeException { + if(src.getNumRows() != rlen || src.getNumColumns() != clen) + throw new IOException("Frame dimensions mismatch with metadata: (" + src.getNumRows() + "x" + + src.getNumColumns() + ") vs (" + rlen + "x" + clen + ")."); int ncol = (int) clen; int nrow = (int) rlen; StructType schema = buildSchema(src.getSchema(), src.getColumnNames(), ncol); - //snapshot the typed column arrays + per-column nullability once, so the - //hot per-cell path can read primitives directly (no boxing) and skip - //null-checks on non-nullable columns. + // snapshot the typed column arrays + per-column nullability once, so the + // hot per-cell path can read primitives directly (no boxing) and skip + // null-checks on non-nullable columns. Array[] cols = new Array[ncol]; boolean[] nullable = new boolean[ncol]; - for( int c=0; c= _ncol ) + if(ordinal < 0 || ordinal >= _ncol) throw new IndexOutOfBoundsException("column ordinal " + ordinal); - return new FrameColumnVector(_cols[ordinal], _nullable[ordinal], - _schema.at(ordinal).getDataType(), _rowStart, _size); + return new FrameColumnVector(_cols[ordinal], _nullable[ordinal], _schema.at(ordinal).getDataType(), + _rowStart, _size); } @Override @@ -178,10 +182,9 @@ public int getSize() { } /** - * Read-only typed column view over one column {@link Array} row range. Numeric - * values are read through {@link Array#getAsDouble(int)} to avoid boxing, and - * non-nullable columns short-circuit {@code isNullAt} so the kernel never pays - * for a redundant boxed fetch. 
+ * Read-only typed column view over one column {@link Array} row range. Numeric values are read through + * {@link Array#getAsDouble(int)} to avoid boxing, and non-nullable columns short-circuit {@code isNullAt} so the + * kernel never pays for a redundant boxed fetch. */ private static class FrameColumnVector implements ColumnVector { private final Array _col; @@ -236,7 +239,7 @@ public float getFloat(int rowId) { @Override public long getLong(int rowId) { - //exact for INT64 (getAsDouble would lose precision beyond 2^53) + // exact for INT64 (getAsDouble would lose precision beyond 2^53) return ((Number) _col.get(_rowStart + rowId)).longValue(); } @@ -247,7 +250,7 @@ public int getInt(int rowId) { @Override public void close() { - //nothing to release + // nothing to release } } } diff --git a/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java b/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java index f525ae438de..e326efa0926 100644 --- a/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java +++ b/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java @@ -38,29 +38,31 @@ import org.apache.sysds.test.TestUtils; /** - * Reads the SAME native Delta frame table from disk repeatedly and reports read - * throughput. The table is written to a temporary directory ONCE as (untimed) - * setup; every timed repetition re-opens the latest snapshot and materializes a - * fresh {@link FrameBlock}, so the numbers reflect the read path only (parquet - * decode + column materialization), not the write. + * Reads the SAME native Delta frame table from disk repeatedly and reports read throughput. The table is written to a + * temporary directory ONCE as (untimed) setup; every timed repetition re-opens the latest snapshot and materializes a + * fresh {@link FrameBlock}, so the numbers reflect the read path only (parquet decode + column materialization), not + * the write. * - *

This is the target for an async-profiler run: launch the perf jar under the - * profiler agent and this loop provides a long, steady-state read workload to - * sample. See {@code src/test/java/org/apache/sysds/performance/README.md} and - * the {@code delta-async-profiler} cursor rule.

+ *

+ * This is the target for an async-profiler run: launch the perf jar under the profiler agent and this loop provides a + * long, steady-state read workload to sample. See {@code src/test/java/org/apache/sysds/performance/README.md} and the + * {@code delta-async-profiler} cursor rule. + *

* - *

Dispatched from {@link org.apache.sysds.performance.Main} (program id 18).

+ *

+ * Dispatched from {@link org.apache.sysds.performance.Main} (program id 18). + *

*/ public class DeltaFrameRead extends APerfTest { - //the Delta reader derives schema/names from the table metadata, so the values - //passed here are placeholders (a single detect column) and are ignored. + // the Delta reader derives schema/names from the table metadata, so the values + // passed here are placeholders (a single detect column) and are ignored. private static final ValueType[] DETECT_SCHEMA = new ValueType[] {ValueType.STRING}; private static final String[] DETECT_NAMES = new String[] {"x"}; private final int k; private final String mode; - private final long targetFileSize; //<=0 -> adaptive default sizing + private final long targetFileSize; // <=0 -> adaptive default sizing private String tablePath; private Path tableDir; @@ -79,18 +81,17 @@ public void run() throws Exception { setup(); System.out.println(this); System.out.printf("table: %s%n", tablePath); - System.out.printf("layout: files=%d, in-memory=%.1f MB, target=%s%n", - files, inMemSize / 1048576.0, + System.out.printf("layout: files=%d, in-memory=%.1f MB, target=%s%n", files, inMemSize / 1048576.0, targetFileSize > 0 ? 
(targetFileSize / 1048576) + "MB(fixed)" : "adaptive"); - if( mode.equals("serial") || mode.equals("both") ) + if(mode.equals("serial") || mode.equals("both")) execute(() -> readSerial(), "Delta read serial"); - if( mode.equals("parallel") || mode.equals("both") ) + if(mode.equals("parallel") || mode.equals("both")) execute(() -> readParallel(), "Delta read parallel(k=" + k + ")"); } finally { ConfigurationManager.clearLocalConfigs(); - if( tableDir != null ) + if(tableDir != null) FileUtils.deleteQuietly(tableDir.toFile()); } } @@ -101,7 +102,7 @@ private void setup() throws Exception { inMemSize = fb.getInMemorySize(); DMLConfig c = new DMLConfig(); - if( targetFileSize > 0 ) { + if(targetFileSize > 0) { c.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "false"); c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(targetFileSize)); } @@ -115,8 +116,7 @@ private void setup() throws Exception { private void readSerial() { try { - FrameBlock fb = new FrameReaderDelta() - .readFrameFromHDFS(tablePath, DETECT_SCHEMA, DETECT_NAMES, -1, -1); + FrameBlock fb = new FrameReaderDelta().readFrameFromHDFS(tablePath, DETECT_SCHEMA, DETECT_NAMES, -1, -1); ret.add(fb.getInMemorySize()); } catch(Exception e) { @@ -126,8 +126,8 @@ private void readSerial() { private void readParallel() { try { - FrameBlock fb = new FrameReaderDeltaParallel() - .readFrameFromHDFS(tablePath, DETECT_SCHEMA, DETECT_NAMES, -1, -1); + FrameBlock fb = new FrameReaderDeltaParallel().readFrameFromHDFS(tablePath, DETECT_SCHEMA, DETECT_NAMES, -1, + -1); ret.add(fb.getInMemorySize()); } catch(Exception e) { @@ -136,7 +136,7 @@ private void readParallel() { } private static long countParquet(String tablePath) throws Exception { - try( Stream s = Files.walk(new File(tablePath).toPath()) ) { + try(Stream s = Files.walk(new File(tablePath).toPath())) { return s.filter(p -> p.toString().endsWith(".parquet")).count(); } } @@ -160,7 +160,7 @@ private static double trimmedMean(double[] 
times) { int remove = (int) Math.floor(v.length * 0.05); double total = 0; int el = v.length - remove * 2; - for( int i = remove; i < v.length - remove; i++ ) + for(int i = remove; i < v.length - remove; i++) total += v[i]; return total / Math.max(el, 1); } @@ -172,8 +172,8 @@ public String toString() { /** Build a representative mixed-schema frame (string + numeric columns). */ public static IGenerate mixedFrame(int rows, long seed) { - ValueType[] schema = new ValueType[] {ValueType.STRING, ValueType.INT64, ValueType.FP64, - ValueType.BOOLEAN, ValueType.INT32, ValueType.FP32}; + ValueType[] schema = new ValueType[] {ValueType.STRING, ValueType.INT64, ValueType.FP64, ValueType.BOOLEAN, + ValueType.INT32, ValueType.FP32}; return new ConstFrame(TestUtils.generateRandomFrameBlock(rows, schema, seed)); } } diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java index d09219cc323..ddd53c906d8 100644 --- a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java @@ -39,21 +39,22 @@ import org.junit.Test; /** - * Manual micro-benchmark comparing the serial {@link FrameReaderDelta} against - * the parallel {@link FrameReaderDeltaParallel} on multi-file Delta frame - * tables. Not a correctness test (those live in {@link DeltaFrameReadWriteTest}); - * it just prints timing/throughput numbers and is {@link Ignore}d so it does not - * run in the normal build. + * Manual micro-benchmark comparing the serial {@link FrameReaderDelta} against the parallel + * {@link FrameReaderDeltaParallel} on multi-file Delta frame tables. Not a correctness test (those live in + * {@link DeltaFrameReadWriteTest}); it just prints timing/throughput numbers and is {@link Ignore}d so it does not run + * in the normal build. * - *

The parallel reader decodes one task per parquet data file, so the speedup - * scales with the number of files (controlled here via the writer target file - * size). Run it on a JVM with a realistically sized heap; under a tiny young - * generation (e.g. the Surefire fork's {@code -Xmn300m}) the concurrent decode's - * higher allocation rate is dominated by young-GC pauses and the numbers are not - * representative of a normal SystemDS process.

+ *

+ * The parallel reader decodes one task per parquet data file, so the speedup scales with the number of files + * (controlled here via the writer target file size). Run it on a JVM with a realistically sized heap; under a tiny + * young generation (e.g. the Surefire fork's {@code -Xmn300m}) the concurrent decode's higher allocation rate is + * dominated by young-GC pauses and the numbers are not representative of a normal SystemDS process. + *

* - *

Run explicitly (remove {@link Ignore} or run the compiled class directly), - * e.g. {@code mvn -q test -Dtest=DeltaFrameReadPerf -DfailIfNoTests=false}.

+ *

+ * Run explicitly (remove {@link Ignore} or run the compiled class directly), e.g. + * {@code mvn -q test -Dtest=DeltaFrameReadPerf -DfailIfNoTests=false}. + *

*/ public class DeltaFrameReadPerf { @@ -71,19 +72,18 @@ public static void main(String[] args) throws Exception { } /** - * Isolates the serial-reader change: compares the new direct (pre-sized, - * metadata-driven, single-pass) read against the old buffered (per-batch - * extract + concatenate) read on the SAME single-file table, so the only - * difference is the extra allocation + concatenation copy. Single file => - * no file-level parallelism involved, pure serial decode cost. + * Isolates the serial-reader change: compares the new direct (pre-sized, metadata-driven, single-pass) read against + * the old buffered (per-batch extract + concatenate) read on the SAME single-file table, so the only difference is + * the extra allocation + concatenation copy. Single file => no file-level parallelism involved, pure serial decode + * cost. */ @Test @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") public void serialDirectVsBuffered() throws Exception { System.out.println("\n=== serial direct vs buffered (single file, 4M rows) ==="); System.out.printf("%-9s %11s %11s %9s%n", "schema", "direct(ms)", "buffered(ms)", "speedup"); - for( String kind : new String[] {"numeric", "mixed", "string"} ) { - //force a single data file: disable adaptive sizing, huge target + for(String kind : new String[] {"numeric", "mixed", "string"}) { + // force a single data file: disable adaptive sizing, huge target DMLConfig c = new DMLConfig(); c.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "false"); c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(4L * 1024 * MB)); @@ -93,25 +93,28 @@ public void serialDirectVsBuffered() throws Exception { try { FrameBlock in = genFrame(kind, 4_000_000, 7); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - //direct = default serial reader; buffered = force the fallback path + // direct = default serial reader; buffered = force the fallback 
path FrameReaderDelta direct = new FrameReaderDelta(); FrameReaderDelta buffered = new FrameReaderDelta() { - @Override protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { return false; } + @Override + protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { + return false; + } }; - for( int i=0; i direct.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); tb[i] = time(() -> buffered.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); } double md = median(td), mb = median(tb); long ad = allocBytes(() -> direct.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); long ab = allocBytes(() -> buffered.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - System.out.printf("%-9s %11.2f %11.2f %8.2fx alloc: %6.0f / %6.0f MB (%.2fx)%n", - kind, md, mb, mb / md, ad / (double) MB, ab / (double) MB, ab / (double) ad); + System.out.printf("%-9s %11.2f %11.2f %8.2fx alloc: %6.0f / %6.0f MB (%.2fx)%n", kind, md, mb, + mb / md, ad / (double) MB, ab / (double) MB, ab / (double) ad); } finally { ConfigurationManager.clearLocalConfigs(); @@ -121,19 +124,19 @@ public void serialDirectVsBuffered() throws Exception { } /** - * End-to-end check of adaptive writer file sizing with NO explicit target size configured - * (the real default): the table should now be split into ~one file per reader and read fast, - * versus the single/few-file layout the fixed 64MB default produced. + * End-to-end check of adaptive writer file sizing with NO explicit target size configured (the real default): the + * table should now be split into ~one file per reader and read fast, versus the single/few-file layout the fixed + * 64MB default produced. 
*/ @Test @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") public void adaptiveCheck() throws Exception { System.out.println("\n=== adaptive writer file sizing (default config, no target set) ==="); System.out.println("threads = " + OptimizerUtils.getParallelBinaryReadParallelism()); - System.out.printf("%-9s %-8s %-6s %11s %11s %9s%n", - "rows", "adaptive", "files", "serial(ms)", "par(ms)", "speedup"); - for( int rows : new int[] {1_000_000, 4_000_000} ) { - //default config => 64MB cap, adaptive sizing enabled + System.out.printf("%-9s %-8s %-6s %11s %11s %9s%n", "rows", "adaptive", "files", "serial(ms)", "par(ms)", + "speedup"); + for(int rows : new int[] {1_000_000, 4_000_000}) { + // default config => 64MB cap, adaptive sizing enabled ConfigurationManager.setLocalConfig(new DMLConfig()); Path dir = Files.createTempDirectory("sysds_delta_frame_adp_"); String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); @@ -144,8 +147,8 @@ public void adaptiveCheck() throws Exception { new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); long files = countParquet(tablePath); double[] r = measure(tablePath); - System.out.printf("%-9d %-8s %-6d %11.2f %11.2f %8.2fx%n", - rows, (target / MB) + "MB", files, r[0], r[1], r[0] / r[1]); + System.out.printf("%-9d %-8s %-6d %11.2f %11.2f %8.2fx%n", rows, (target / MB) + "MB", files, r[0], + r[1], r[0] / r[1]); } finally { ConfigurationManager.clearLocalConfigs(); @@ -155,9 +158,9 @@ public void adaptiveCheck() throws Exception { } /** - * Sweep the writer target file size ({@link DMLConfig#DELTA_WRITER_TARGET_FILE_SIZE}, the - * one public Delta knob that affects read parallelism) to find where the per-file parallel - * read stops improving, i.e. a good default for read-heavy use. 
+ * Sweep the writer target file size ({@link DMLConfig#DELTA_WRITER_TARGET_FILE_SIZE}, the one public Delta knob + * that affects read parallelism) to find where the per-file parallel read stops improving, i.e. a good default for + * read-heavy use. */ @Test @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") @@ -167,7 +170,7 @@ public void targetSizeSweep() throws Exception { System.out.println("\n=== writer target-size sweep (mixed, " + rows + " rows, " + OptimizerUtils.getParallelBinaryReadParallelism() + " threads) ==="); System.out.printf("%-9s %-6s %11s %11s %9s%n", "targetMB", "files", "serial(ms)", "par(ms)", "speedup"); - for( long mb : sizesMB ) { + for(long mb : sizesMB) { DMLConfig c = new DMLConfig(); c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(mb * MB)); ConfigurationManager.setLocalConfig(c); @@ -188,9 +191,9 @@ public void targetSizeSweep() throws Exception { } /** - * Sweep the parquet reader batch size ({@link DMLConfig#DELTA_READER_BATCH_SIZE}, a - * public Delta Kernel knob) on a fixed multi-file table, with and without quieting the - * parquet/delta loggers. Pure "how we call the public API" tuning. + * Sweep the parquet reader batch size ({@link DMLConfig#DELTA_READER_BATCH_SIZE}, a public Delta Kernel knob) on a + * fixed multi-file table, with and without quieting the parquet/delta loggers. Pure "how we call the public API" + * tuning. 
*/ @Test @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") @@ -200,7 +203,7 @@ public void batchSizeSweep() throws Exception { final int[] batches = {1024, 4096, 8192, 16384, 32768, 65536, 131072}; System.out.println("\n=== reader batch-size sweep (mixed, " + rows + " rows, 8MB files) ==="); - //write the table ONCE; the batch size only affects the read path + // write the table ONCE; the batch size only affects the read path DMLConfig wconf = new DMLConfig(); wconf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(fileSize)); ConfigurationManager.setLocalConfig(wconf); @@ -211,12 +214,12 @@ public void batchSizeSweep() throws Exception { new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); System.out.println("files = " + countParquet(tablePath)); - for( boolean quietLog : new boolean[] {false, true} ) { - if( quietLog ) + for(boolean quietLog : new boolean[] {false, true}) { + if(quietLog) silenceParquetLogging(); System.out.println(quietLog ? 
"-- parquet/delta logging -> ERROR --" : "-- default logging --"); System.out.printf("%-9s %11s %11s%n", "batch", "serial(ms)", "par(ms)"); - for( int bs : batches ) { + for(int bs : batches) { DMLConfig c = new DMLConfig(); c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(fileSize)); c.setTextValue(DMLConfig.DELTA_READER_BATCH_SIZE, String.valueOf(bs)); @@ -236,12 +239,12 @@ public void batchSizeSweep() throws Exception { private double[] measure(String tablePath) throws Exception { FrameReaderDelta serial = new FrameReaderDelta(); FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); - for( int i=0; i serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); } @@ -254,8 +257,8 @@ public void benchmark() throws Exception { System.out.println("\n=== Delta frame reader benchmark ==="); System.out.println("parallel read threads = " + OptimizerUtils.getParallelBinaryReadParallelism() + ", processors = " + Runtime.getRuntime().availableProcessors()); - System.out.printf("%-9s %-7s %-6s %11s %11s %9s%n", - "rows", "fileMB", "files", "serial(ms)", "par(ms)", "speedup"); + System.out.printf("%-9s %-7s %-6s %11s %11s %9s%n", "rows", "fileMB", "files", "serial(ms)", "par(ms)", + "speedup"); runCase(1_000_000, 4 * MB); runCase(1_000_000, 16 * MB); runCase(4_000_000, 8 * MB); @@ -268,8 +271,8 @@ public void schemaBreakdown() throws Exception { System.out.println("\n=== schema composition breakdown (2M rows, 8MB files) ==="); System.out.printf("%-10s %-6s %11s %11s %9s%n", "schema", "files", "serial(ms)", "par(ms)", "speedup"); int rows = 2_000_000; - for( boolean quietLog : new boolean[] {false, true} ) { - if( quietLog ) + for(boolean quietLog : new boolean[] {false, true}) { + if(quietLog) silenceParquetLogging(); System.out.println(quietLog ? 
"-- parquet/delta logging -> ERROR --" : "-- default logging --"); runSchema("numeric", rows, 8 * MB); @@ -296,12 +299,12 @@ private void runSchema(String kind, int rows, long targetFileSize) throws Except long files = countParquet(tablePath); FrameReaderDelta serial = new FrameReaderDelta(); FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); - for( int i=0; i serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); } @@ -328,19 +331,19 @@ private void runCase(int rows, long targetFileSize) throws Exception { FrameReaderDelta serial = new FrameReaderDelta(); FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); - for( int i=0; i serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); } double ms = median(ts), mp = median(tp); - System.out.printf("%-9d %-7d %-6d %11.2f %11.2f %8.2fx%n", - rows, targetFileSize / MB, files, ms, mp, ms / mp); + System.out.printf("%-9d %-7d %-6d %11.2f %11.2f %8.2fx%n", rows, targetFileSize / MB, files, ms, mp, + ms / mp); } finally { ConfigurationManager.clearLocalConfigs(); @@ -350,52 +353,61 @@ private void runCase(int rows, long targetFileSize) throws Exception { private static FrameBlock genFrame(String kind, int nrow, int seed) { ValueType[] schema; - switch( kind ) { + switch(kind) { case "numeric": - schema = new ValueType[] {ValueType.INT64, ValueType.FP64, ValueType.INT32, - ValueType.FP32, ValueType.BOOLEAN, ValueType.INT64}; + schema = new ValueType[] {ValueType.INT64, ValueType.FP64, ValueType.INT32, ValueType.FP32, + ValueType.BOOLEAN, ValueType.INT64}; break; case "string": - schema = new ValueType[] {ValueType.STRING, ValueType.STRING, ValueType.STRING, - ValueType.STRING, ValueType.STRING, ValueType.STRING}; + schema = new ValueType[] {ValueType.STRING, ValueType.STRING, ValueType.STRING, 
ValueType.STRING, + ValueType.STRING, ValueType.STRING}; break; default: // mixed - schema = new ValueType[] {ValueType.STRING, ValueType.INT64, ValueType.FP64, - ValueType.BOOLEAN, ValueType.INT32, ValueType.FP32}; + schema = new ValueType[] {ValueType.STRING, ValueType.INT64, ValueType.FP64, ValueType.BOOLEAN, + ValueType.INT32, ValueType.FP32}; } String[] names = {"c0", "c1", "c2", "c3", "c4", "c5"}; FrameBlock fb = new FrameBlock(schema, names); fb.ensureAllocatedColumns(nrow); Random rnd = new Random(seed); - for( int r=0; r s = Files.walk(new File(tablePath).toPath()) ) { + try(java.util.stream.Stream s = Files.walk(new File(tablePath).toPath())) { return s.filter(p -> p.toString().endsWith(".parquet")).count(); } } private static FrameBlock genMixedFrame(int nrow, int seed) { - ValueType[] schema = {ValueType.STRING, ValueType.INT64, ValueType.FP64, - ValueType.BOOLEAN, ValueType.INT32, ValueType.FP32}; + ValueType[] schema = {ValueType.STRING, ValueType.INT64, ValueType.FP64, ValueType.BOOLEAN, ValueType.INT32, + ValueType.FP32}; String[] names = {"name", "id", "score", "active", "count", "ratio"}; FrameBlock fb = new FrameBlock(schema, names); fb.ensureAllocatedColumns(nrow); Random rnd = new Random(seed); - for( int r=0; r readBuffered() + // subclass that always declines the direct path -> readBuffered() FrameBlock buffered = new FrameReaderDeltaParallel() { - @Override protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { return false; } + @Override + protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { + return false; + } }.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); assertFramesEqual(serial, buffered); @@ -237,9 +242,9 @@ public void parallelBufferedPathMatchesSerialMultiFile() throws Exception { @Test public void serialBufferedPathMatchesDirectMultiFile() throws Exception { - //the direct (pre-sized, metadata-driven) path is always taken for SystemDS- - //written tables; force the serial buffered fallback 
(per-batch extract + - //concatenate) to exercise it and assert it matches the direct read. + // the direct (pre-sized, metadata-driven) path is always taken for SystemDS- + // written tables; force the serial buffered fallback (per-batch extract + + // concatenate) to exercise it and assert it matches the direct read. DMLConfig conf = new DMLConfig(); conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); ConfigurationManager.setLocalConfig(conf); @@ -251,9 +256,12 @@ public void serialBufferedPathMatchesDirectMultiFile() throws Exception { assertMultiFile(tablePath); FrameBlock direct = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - //subclass that always declines the direct path -> buffered extract+concat + // subclass that always declines the direct path -> buffered extract+concat FrameBlock buffered = new FrameReaderDelta() { - @Override protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { return false; } + @Override + protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { + return false; + } }.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); assertFramesEqual(direct, buffered); @@ -266,21 +274,18 @@ public void serialBufferedPathMatchesDirectMultiFile() throws Exception { @Test public void adaptiveTargetFileSizeClampsAndRespectsFlag() { - //cap chosen above the 4MB floor so both clamp directions are observable + // cap chosen above the 4MB floor so both clamp directions are observable final long cap = 64L * 1024 * 1024; DMLConfig conf = new DMLConfig(); conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(cap)); conf.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "true"); ConfigurationManager.setLocalConfig(conf); try { - assertEquals("estimatedBytes<=0 -> configured cap", cap, - DeltaKernelUtils.adaptiveWriterTargetFileSize(0)); - assertEquals("negative estimate -> configured cap", cap, - 
DeltaKernelUtils.adaptiveWriterTargetFileSize(-1)); + assertEquals("estimatedBytes<=0 -> configured cap", cap, DeltaKernelUtils.adaptiveWriterTargetFileSize(0)); + assertEquals("negative estimate -> configured cap", cap, DeltaKernelUtils.adaptiveWriterTargetFileSize(-1)); assertEquals("huge table -> never above the configured cap", cap, DeltaKernelUtils.adaptiveWriterTargetFileSize(Long.MAX_VALUE / 2)); - assertEquals("tiny table -> never below the floor", - DeltaKernelUtils.ADAPTIVE_WRITER_MIN_FILE_SIZE, + assertEquals("tiny table -> never below the floor", DeltaKernelUtils.ADAPTIVE_WRITER_MIN_FILE_SIZE, DeltaKernelUtils.adaptiveWriterTargetFileSize(1)); conf.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "false"); @@ -294,7 +299,7 @@ public void adaptiveTargetFileSizeClampsAndRespectsFlag() { @Test public void factoryRoutesDeltaToParallelWhenEnabled() { - //the factory must pick the parallel frame reader iff parallel CP read is enabled + // the factory must pick the parallel frame reader iff parallel CP read is enabled CompilerConfig cc = ConfigurationManager.getCompilerConfig(); try { cc.set(ConfigType.PARALLEL_CP_READ_TEXTFORMATS, true); @@ -316,16 +321,15 @@ public void factoryRoutesDeltaToParallelWhenEnabled() { @Test public void readerBatchSizeConfigRoundTrips() throws Exception { - //a non-default reader batch size must not change the result (more, smaller - //batches exercise the per-batch extract/concatenate loop more often). + // a non-default reader batch size must not change the result (more, smaller + // batches exercise the per-batch extract/concatenate loop more often). 
DMLConfig conf = new DMLConfig(); conf.setTextValue(DMLConfig.DELTA_READER_BATCH_SIZE, "128"); ConfigurationManager.setLocalConfig(conf); Path dir = Files.createTempDirectory("sysds_delta_frame_bs_"); String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); try { - assertEquals("config getter reflects the override", - 128, ConfigurationManager.getDeltaReaderBatchSize()); + assertEquals("config getter reflects the override", 128, ConfigurationManager.getDeltaReaderBatchSize()); FrameBlock in = genMixedFrame(5000, 31); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); @@ -340,22 +344,22 @@ public void readerBatchSizeConfigRoundTrips() throws Exception { @Test public void writerTargetFileSizeConfigProducesMoreFiles() throws Exception { - //a smaller configured target file size must make the writer roll more - //data files for the same frame (the lever the parallel reader relies on). + // a smaller configured target file size must make the writer roll more + // data files for the same frame (the lever the parallel reader relies on). 
DMLConfig conf = new DMLConfig(); conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); ConfigurationManager.setLocalConfig(conf); Path dir = Files.createTempDirectory("sysds_delta_frame_cfg_"); String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); try { - assertEquals("config getter reflects the override", - SMALL_TARGET_FILE_SIZE, ConfigurationManager.getDeltaWriterTargetFileSize()); + assertEquals("config getter reflects the override", SMALL_TARGET_FILE_SIZE, + ConfigurationManager.getDeltaWriterTargetFileSize()); FrameBlock in = genMixedFrame(ROWS_MULTI_FILE, 41); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); assertMultiFile(tablePath); - //data still round-trips correctly with the custom layout + // data still round-trips correctly with the custom layout FrameBlock out = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); assertFramesEqual(in, out); } @@ -367,8 +371,8 @@ public void writerTargetFileSizeConfigProducesMoreFiles() throws Exception { @Test public void emptyFrameRoundTrip() throws Exception { - //a schema-only Delta table (no data files, 0 rows); the reader must - //rebuild empty typed columns and discover the schema/names from the table. + // a schema-only Delta table (no data files, 0 rows); the reader must + // rebuild empty typed columns and discover the schema/names from the table. 
ValueType[] schema = {ValueType.STRING, ValueType.FP64, ValueType.INT64}; String[] names = {"s", "d", "k"}; DataType[] dtypes = {StringType.STRING, DoubleType.DOUBLE, LongType.LONG}; @@ -380,7 +384,7 @@ public void emptyFrameRoundTrip() throws Exception { FrameBlock out = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); assertEquals("rows", 0, out.getNumRows()); assertEquals("cols", schema.length, out.getNumColumns()); - for( int c=0; c writer must reject - new FrameWriterDelta().writeFrameToHDFS(fb, tablePath, - fb.getNumRows() + 1, fb.getNumColumns()); + // declare one more row than the frame actually has -> writer must reject + new FrameWriterDelta().writeFrameToHDFS(fb, tablePath, fb.getNumRows() + 1, fb.getNumColumns()); fail("expected an IOException for a frame/metadata dimension mismatch"); } catch(IOException ex) { @@ -499,21 +502,21 @@ public void writerRejectsDimensionMismatch() throws Exception { @Test public void readFromInputStreamUnsupported() throws Exception { - //Delta is a directory-based table format; stream reads are not supported + // Delta is a directory-based table format; stream reads are not supported try { new FrameReaderDelta().readFrameFromInputStream(null, NO_SCHEMA, NO_NAMES, -1, -1); fail("expected UnsupportedOperationException for a Delta input-stream read"); } catch(UnsupportedOperationException ex) { - //expected: must throw before touching the (null) stream + // expected: must throw before touching the (null) stream } } @Test public void parallelReadStringNullsMatchSerialMultiFile() throws Exception { - //string nulls across a multi-file table: the parallel direct path must - //reproduce the serial read cell-for-cell (assertFramesEqual uses - //Objects.equals, so nulls are compared faithfully). + // string nulls across a multi-file table: the parallel direct path must + // reproduce the serial read cell-for-cell (assertFramesEqual uses + // Objects.equals, so nulls are compared faithfully). 
DMLConfig conf = new DMLConfig(); conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); ConfigurationManager.setLocalConfig(conf); @@ -524,8 +527,8 @@ public void parallelReadStringNullsMatchSerialMultiFile() throws Exception { String[] names = {"s", "k"}; int nrow = ROWS_MULTI_FILE; FrameBlock in = alloc(schema, names, nrow); - for( int r=0; r s = Files.walk(new File(tablePath).toPath()) ) { + try(java.util.stream.Stream s = Files.walk(new File(tablePath).toPath())) { files = s.filter(p -> p.toString().endsWith(".parquet")).count(); } - assertTrue("expected a multi-file Delta table to exercise the parallel path, got " + files, - files > 1); + assertTrue("expected a multi-file Delta table to exercise the parallel path, got " + files, files > 1); } private static void assertFramesEqual(FrameBlock expected, FrameBlock actual) { assertEquals("rows", expected.getNumRows(), actual.getNumRows()); assertEquals("cols", expected.getNumColumns(), actual.getNumColumns()); int ncol = expected.getNumColumns(); - for( int c=0; c empty() { return new CloseableIterator() { - @Override public boolean hasNext() { return false; } - @Override public FilteredColumnarBatch next() { throw new NoSuchElementException(); } - @Override public void close() {} + @Override + public boolean hasNext() { + return false; + } + + @Override + public FilteredColumnarBatch next() { + throw new NoSuchElementException(); + } + + @Override + public void close() { + } }; } - /** Writes a single date column (kernel stores dates as INT32 days) used to - * assert the frame reader rejects a non-mappable column type. */ + /** + * Writes a single date column (kernel stores dates as INT32 days) used to assert the frame reader rejects a + * non-mappable column type. 
+ */ private static void writeDateColumn(String tablePath, int[] days) throws Exception { Engine engine = DeltaKernelUtils.createEngine(); final StructType schema = new StructType().add("d", DateType.DATE, false); ColumnarBatch batch = new ColumnarBatch() { - @Override public StructType getSchema() { return schema; } - @Override public int getSize() { return days.length; } - @Override public ColumnVector getColumnVector(int ordinal) { return new DateVector(days); } + @Override + public StructType getSchema() { + return schema; + } + + @Override + public int getSize() { + return days.length; + } + + @Override + public ColumnVector getColumnVector(int ordinal) { + return new DateVector(days); + } }; FilteredColumnarBatch fcb = new FilteredColumnarBatch(batch, Optional.empty()); DeltaKernelUtils.commit(engine, DeltaKernelUtils.qualify(tablePath), schema, singleton(fcb)); } - /** Writes a short column and a byte column (kernel stores these as 16/8-bit - * integers) used to assert the frame reader coerces both to INT32. */ + /** + * Writes a short column and a byte column (kernel stores these as 16/8-bit integers) used to assert the frame + * reader coerces both to INT32. 
+ */ private static void writeShortByteColumns(String tablePath, short[] shorts, byte[] bytes) throws Exception { Engine engine = DeltaKernelUtils.createEngine(); - final StructType schema = new StructType() - .add("sh", ShortType.SHORT, false) - .add("by", ByteType.BYTE, false); + final StructType schema = new StructType().add("sh", ShortType.SHORT, false).add("by", ByteType.BYTE, false); ColumnarBatch batch = new ColumnarBatch() { - @Override public StructType getSchema() { return schema; } - @Override public int getSize() { return shorts.length; } - @Override public ColumnVector getColumnVector(int ordinal) { + @Override + public StructType getSchema() { + return schema; + } + + @Override + public int getSize() { + return shorts.length; + } + + @Override + public ColumnVector getColumnVector(int ordinal) { return (ordinal == 0) ? new ShortVector(shorts) : new ByteVector(bytes); } }; @@ -636,46 +670,122 @@ private static void writeShortByteColumns(String tablePath, short[] shorts, byte private static CloseableIterator singleton(FilteredColumnarBatch fcb) { return new CloseableIterator() { private boolean _done = false; - @Override public boolean hasNext() { return !_done; } - @Override public FilteredColumnarBatch next() { - if( _done ) throw new NoSuchElementException(); + + @Override + public boolean hasNext() { + return !_done; + } + + @Override + public FilteredColumnarBatch next() { + if(_done) + throw new NoSuchElementException(); _done = true; return fcb; } - @Override public void close() {} + + @Override + public void close() { + } }; } /** Column view exposing an int[] as a Delta date column. 
*/ private static class DateVector implements ColumnVector { private final int[] _days; - DateVector(int[] days) { _days = days; } - @Override public DataType getDataType() { return DateType.DATE; } - @Override public int getSize() { return _days.length; } - @Override public boolean isNullAt(int rowId) { return false; } - @Override public int getInt(int rowId) { return _days[rowId]; } - @Override public void close() {} + + DateVector(int[] days) { + _days = days; + } + + @Override + public DataType getDataType() { + return DateType.DATE; + } + + @Override + public int getSize() { + return _days.length; + } + + @Override + public boolean isNullAt(int rowId) { + return false; + } + + @Override + public int getInt(int rowId) { + return _days[rowId]; + } + + @Override + public void close() { + } } /** Column view exposing a short[] as a Delta short column. */ private static class ShortVector implements ColumnVector { private final short[] _vals; - ShortVector(short[] vals) { _vals = vals; } - @Override public DataType getDataType() { return ShortType.SHORT; } - @Override public int getSize() { return _vals.length; } - @Override public boolean isNullAt(int rowId) { return false; } - @Override public short getShort(int rowId) { return _vals[rowId]; } - @Override public void close() {} + + ShortVector(short[] vals) { + _vals = vals; + } + + @Override + public DataType getDataType() { + return ShortType.SHORT; + } + + @Override + public int getSize() { + return _vals.length; + } + + @Override + public boolean isNullAt(int rowId) { + return false; + } + + @Override + public short getShort(int rowId) { + return _vals[rowId]; + } + + @Override + public void close() { + } } /** Column view exposing a byte[] as a Delta byte column. 
*/ private static class ByteVector implements ColumnVector { private final byte[] _vals; - ByteVector(byte[] vals) { _vals = vals; } - @Override public DataType getDataType() { return ByteType.BYTE; } - @Override public int getSize() { return _vals.length; } - @Override public boolean isNullAt(int rowId) { return false; } - @Override public byte getByte(int rowId) { return _vals[rowId]; } - @Override public void close() {} + + ByteVector(byte[] vals) { + _vals = vals; + } + + @Override + public DataType getDataType() { + return ByteType.BYTE; + } + + @Override + public int getSize() { + return _vals.length; + } + + @Override + public boolean isNullAt(int rowId) { + return false; + } + + @Override + public byte getByte(int rowId) { + return _vals[rowId]; + } + + @Override + public void close() { + } } } diff --git a/src/test/java/org/apache/sysds/test/functions/io/delta/FrameDeltaReadWriteTest.java b/src/test/java/org/apache/sysds/test/functions/io/delta/FrameDeltaReadWriteTest.java index 0a6bba5a163..2c4799d16f0 100644 --- a/src/test/java/org/apache/sysds/test/functions/io/delta/FrameDeltaReadWriteTest.java +++ b/src/test/java/org/apache/sysds/test/functions/io/delta/FrameDeltaReadWriteTest.java @@ -35,11 +35,12 @@ /** * End-to-end DML test of the native Delta frame read/write path. * - *

As in the matrix variant, the write and the read run as two separate - * SystemDS executions so the read is a genuine disk read rather than an - * in-memory cache hit. We additionally assert via {@link CacheStatistics} that - * the write run wrote (delta + text reference) and the read run read (delta + - * text reference) from HDFS, so a short-circuited path would fail the test.

+ *

+ * As in the matrix variant, the write and the read run as two separate SystemDS executions so the read is a genuine + * disk read rather than an in-memory cache hit. We additionally assert via {@link CacheStatistics} that the write run + * wrote (delta + text reference) and the read run read (delta + text reference) from HDFS, so a short-circuited path + * would fail the test. + *

*/ public class FrameDeltaReadWriteTest extends AutomatedTestBase { @@ -51,10 +52,8 @@ public class FrameDeltaReadWriteTest extends AutomatedTestBase { @Override public void setUp() { TestUtils.clearAssertionInformation(); - addTestConfiguration(WRITE_NAME, - new TestConfiguration(TEST_CLASS_DIR, WRITE_NAME, new String[] { "ref" })); - addTestConfiguration(READ_NAME, - new TestConfiguration(TEST_CLASS_DIR, READ_NAME, new String[] { "R" })); + addTestConfiguration(WRITE_NAME, new TestConfiguration(TEST_CLASS_DIR, WRITE_NAME, new String[] {"ref"})); + addTestConfiguration(READ_NAME, new TestConfiguration(TEST_CLASS_DIR, READ_NAME, new String[] {"R"})); } @Test @@ -81,31 +80,29 @@ private void runFrameDeltaRoundTrip(int rows, int cols, double sparsity) { String deltaPath = output("deltaTable"); String refPath = output("ref"); fullDMLScriptName = HOME + WRITE_NAME + ".dml"; - programArgs = new String[] { "-stats", "-args", - String.valueOf(rows), String.valueOf(cols), String.valueOf(sparsity), - deltaPath, refPath }; + programArgs = new String[] {"-stats", "-args", String.valueOf(rows), String.valueOf(cols), + String.valueOf(sparsity), deltaPath, refPath}; runTest(true, false, null, -1); - //the write run must materialize two objects to disk: the frame Delta - //table under test + the matrix text reference. FrameWriterDelta genuinely - //hitting HDFS is what produces the frame-side write statistic. + // the write run must materialize two objects to disk: the frame Delta + // table under test + the matrix text reference. FrameWriterDelta genuinely + // hitting HDFS is what produces the frame-side write statistic. 
long hdfsWrites = CacheStatistics.getHDFSWrites(); - assertTrue("expected >= 2 HDFS writes in the write run (delta frame + reference), got " - + hdfsWrites, hdfsWrites >= 2); - //and a real Delta table (transaction log) must have been created + assertTrue("expected >= 2 HDFS writes in the write run (delta frame + reference), got " + hdfsWrites, + hdfsWrites >= 2); + // and a real Delta table (transaction log) must have been created assertTrue("missing Delta transaction log under " + deltaPath, new File(deltaPath, "_delta_log").isDirectory()); // ---- phase 2: fresh execution reads the Delta frame and compares ---- getAndLoadTestConfiguration(READ_NAME); fullDMLScriptName = HOME + READ_NAME + ".dml"; - programArgs = new String[] { "-stats", "-args", - deltaPath, refPath, output("R") }; + programArgs = new String[] {"-stats", "-args", deltaPath, refPath, output("R")}; runTest(true, false, null, -1); long hdfsReads = CacheStatistics.getHDFSHits(); - assertTrue("expected >= 2 HDFS reads in the read run (delta + reference), got " - + hdfsReads, hdfsReads >= 2); + assertTrue("expected >= 2 HDFS reads in the read run (delta + reference), got " + hdfsReads, + hdfsReads >= 2); HashMap R = readDMLMatrixFromOutputDir("R"); double diff = R.getOrDefault(new CellIndex(1, 1), 0.0); From 7e01725aecf14325383887803073088400381508 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Wed, 1 Jul 2026 22:23:32 +0000 Subject: [PATCH 08/10] Reuse TestUtils for random frames and add Spark/Delta frame interop test - Replace the bespoke genMixedFrame generator in DeltaFrameReadWriteTest with TestUtils.generateRandomFrameBlock, removing duplicated random frame generation code. 
- Add DeltaFrameSparkInteropTest exercising cross-engine round-trips against the reference Delta Spark connector: SystemDS-written multi-file frame read by Spark, Spark-written multi-file frame read by the serial and parallel SystemDS readers, and a Spark table with deletion vectors read via the buffered selection-mask path. Comparisons are keyed by a unique id column so no row order is assumed. --- .../component/io/DeltaFrameReadWriteTest.java | 33 +-- .../io/DeltaFrameSparkInteropTest.java | 275 ++++++++++++++++++ 2 files changed, 286 insertions(+), 22 deletions(-) create mode 100644 src/test/java/org/apache/sysds/test/component/io/DeltaFrameSparkInteropTest.java diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java index be1c782f6e1..28775ed8e3f 100644 --- a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java @@ -48,6 +48,7 @@ import org.apache.sysds.runtime.io.FrameReaderDeltaParallel; import org.apache.sysds.runtime.io.FrameReaderFactory; import org.apache.sysds.runtime.io.FrameWriterDelta; +import org.apache.sysds.test.TestUtils; import org.junit.Test; import io.delta.kernel.data.ColumnVector; @@ -83,6 +84,11 @@ public class DeltaFrameReadWriteTest { private static final long SMALL_TARGET_FILE_SIZE = 512L * 1024; private static final int ROWS_MULTI_FILE = 150_000; + // mixed-type schema used by the multi-file round-trip tests; random data is + // generated via TestUtils rather than a bespoke per-test generator. 
+ private static final ValueType[] MIXED_SCHEMA = {ValueType.STRING, ValueType.INT64, ValueType.FP64, + ValueType.BOOLEAN, ValueType.INT32, ValueType.FP32}; + private static FrameBlock writeThenRead(FrameBlock in) throws Exception { Path dir = Files.createTempDirectory("sysds_delta_frame_"); String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); @@ -192,7 +198,7 @@ public void parallelReadMatchesSerialMultiFile() throws Exception { Path dir = Files.createTempDirectory("sysds_delta_frame_par_"); String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); try { - FrameBlock in = genMixedFrame(ROWS_MULTI_FILE, 13); + FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 13); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); assertMultiFile(tablePath); @@ -219,7 +225,7 @@ public void parallelBufferedPathMatchesSerialMultiFile() throws Exception { Path dir = Files.createTempDirectory("sysds_delta_frame_buf_"); String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); try { - FrameBlock in = genMixedFrame(ROWS_MULTI_FILE, 23); + FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 23); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); assertMultiFile(tablePath); @@ -251,7 +257,7 @@ public void serialBufferedPathMatchesDirectMultiFile() throws Exception { Path dir = Files.createTempDirectory("sysds_delta_frame_sbuf_"); String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); try { - FrameBlock in = genMixedFrame(ROWS_MULTI_FILE, 29); + FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 29); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); assertMultiFile(tablePath); @@ -331,7 +337,7 @@ public void readerBatchSizeConfigRoundTrips() throws Exception { try { assertEquals("config getter reflects the override", 
128, ConfigurationManager.getDeltaReaderBatchSize()); - FrameBlock in = genMixedFrame(5000, 31); + FrameBlock in = TestUtils.generateRandomFrameBlock(5000, MIXED_SCHEMA, 31); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); FrameBlock out = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); assertFramesEqual(in, out); @@ -355,7 +361,7 @@ public void writerTargetFileSizeConfigProducesMoreFiles() throws Exception { assertEquals("config getter reflects the override", SMALL_TARGET_FILE_SIZE, ConfigurationManager.getDeltaWriterTargetFileSize()); - FrameBlock in = genMixedFrame(ROWS_MULTI_FILE, 41); + FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 41); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); assertMultiFile(tablePath); @@ -547,23 +553,6 @@ public void parallelReadStringNullsMatchSerialMultiFile() throws Exception { } } - private static FrameBlock genMixedFrame(int nrow, int seed) { - ValueType[] schema = {ValueType.STRING, ValueType.INT64, ValueType.FP64, ValueType.BOOLEAN, ValueType.INT32, - ValueType.FP32}; - String[] names = {"name", "id", "score", "active", "count", "ratio"}; - FrameBlock fb = alloc(schema, names, nrow); - Random rnd = new Random(seed); - for(int r = 0; r < nrow; r++) { - fb.set(r, 0, "row" + rnd.nextInt(1_000_000)); - fb.set(r, 1, (long) rnd.nextInt()); - fb.set(r, 2, rnd.nextDouble() * 200 - 100); - fb.set(r, 3, rnd.nextBoolean()); - fb.set(r, 4, rnd.nextInt()); - fb.set(r, 5, rnd.nextFloat()); - } - return fb; - } - private static void assertMultiFile(String tablePath) throws Exception { long files; try(java.util.stream.Stream s = Files.walk(new File(tablePath).toPath())) { diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameSparkInteropTest.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameSparkInteropTest.java new file mode 100644 index 
00000000000..73f0b9834c0 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameSparkInteropTest.java @@ -0,0 +1,275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.io; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.conf.DMLConfig; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.io.FrameReaderDelta; +import org.apache.sysds.runtime.io.FrameReaderDeltaParallel; +import org.apache.sysds.runtime.io.FrameWriterDelta; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import 
org.apache.spark.sql.types.StructType; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Cross-engine interoperability tests for the native (Delta Kernel based) frame reader/writer against the reference + * Delta implementation (Delta's Spark connector, {@code delta-spark}, pulled in test-only). + * + *

+ * The other Delta frame tests round-trip exclusively through SystemDS' own Kernel-based read/write paths, so they + * cannot catch a table that SystemDS writes in a way other Delta engines reject (or vice versa). These tests close that + * gap by routing a mixed-type (long/double/string/boolean) frame through two independent engines: + *

    + *
  • SystemDS writes -> Spark/Delta reads (our output is spec-compliant), and
  • + *
  • Spark/Delta writes -> SystemDS reads, including a multi-file layout and a table with deletion vectors / a + * second commit that the SystemDS writer never produces itself.
  • + *
+ * + *

+ * Row order is never assumed: every table carries a unique id in column 0 and comparisons are keyed by that id, since + * neither engine guarantees row order across files. + */ +@net.jcip.annotations.NotThreadSafe +public class DeltaFrameSparkInteropTest { + + // nonsense schema/dims handed to the reader to confirm it discovers everything from the table + private static final ValueType[] NO_SCHEMA = new ValueType[] {ValueType.STRING}; + private static final String[] NO_NAMES = new String[] {"x"}; + + private static SparkSession spark; + + @BeforeClass + public static void startSpark() { + // each test class runs in its own fork (surefire reuseForks=false), so this + // is the only SparkSession in the JVM and gets the Delta extensions injected. + SparkSession.clearActiveSession(); + SparkSession.clearDefaultSession(); + spark = SparkSession.builder().appName("sysds-delta-frame-interop").master("local[2]") + .config("spark.ui.enabled", "false").config("spark.sql.shuffle.partitions", "2") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog").getOrCreate(); + } + + @AfterClass + public static void stopSpark() { + if(spark != null) + spark.stop(); + SparkSession.clearActiveSession(); + SparkSession.clearDefaultSession(); + spark = null; + } + + @Test + public void systemdsWriteSparkReadMultiFile() throws Exception { + // SystemDS writes a (forced) multi-file mixed-type frame Delta table; the + // reference Delta engine (Spark) must read every data file back with + // matching values across all four column types. + int rows = 20_000, cols = 4; + FrameBlock in = indexedFrame(rows); + + // small target file size -> multiple parquet data files (exercise that an + // external reader stitches all of our data files, not just the first). 
+ DMLConfig conf = new DMLConfig(); + conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(16L * 1024)); + ConfigurationManager.setLocalConfig(conf); + Path dir = Files.createTempDirectory("sysds_delta_frame_s2s_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, rows, cols); + assertTrue("writer should have produced a multi-file table", countParquet(tablePath) > 1); + + Dataset df = spark.read().format("delta").load(tablePath); + assertEquals("rows", rows, df.count()); + assertEquals("cols", cols, df.schema().fields().length); + + List read = df.collectAsList(); + assertEquals(rows, read.size()); + boolean[] seen = new boolean[rows]; + for(Row r : read) { + int id = (int) r.getLong(0); + assertTrue("id in range and unique: " + id, id >= 0 && id < rows && !seen[id]); + seen[id] = true; + assertEquals("id" + id + " c1", dval(id), r.getDouble(1), 1e-9); + assertEquals("id" + id + " c2", sval(id), r.getString(2)); + assertEquals("id" + id + " c3", Boolean.valueOf(bval(id)), Boolean.valueOf(r.getBoolean(3))); + } + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + + @Test + public void sparkWriteSystemdsReadMultiFile() throws Exception { + // the reference Delta engine writes a multi-file mixed-type table; both the + // serial and parallel SystemDS frame readers must reconstruct it cell-for-cell. 
+ int rows = 600; + Dataset df = indexedDataFrame(rows).repartition(3); // -> multiple data files + Path dir = Files.createTempDirectory("sysds_delta_frame_p2s_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + df.write().format("delta").save(tablePath); + assertTrue("spark should have written a multi-file table", countParquet(tablePath) > 1); + + Set expected = idRange(0, rows); + assertFrameMatchesIds(new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1), + expected, "serial"); + assertFrameMatchesIds( + new FrameReaderDeltaParallel().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1), expected, + "parallel"); + } + finally { + FileUtils.deleteQuietly(dir.toFile()); + } + } + + @Test + public void sparkDeletionVectorsSystemdsRead() throws Exception { + // a Delta table with deletion vectors + a second commit (the DELETE) is a + // layout the SystemDS writer never emits; the frame readers must honor the DV + // and return only the surviving rows. With DVs present hasExactRowCounts() is + // false, so this drives the buffered (selection-mask) frame read path. + int rows = 400, deleteBelow = 50; + Path dir = Files.createTempDirectory("sysds_delta_frame_dv_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + // enable deletion vectors for tables created in this block, then delete a + // row range so Delta records a DV rather than rewriting the data files. 
+ spark.conf().set(DV_DEFAULT, "true"); + indexedDataFrame(rows).write().format("delta").save(tablePath); + spark.sql("DELETE FROM delta.`" + tablePath + "` WHERE c0 < " + deleteBelow); + + Set expected = idRange(deleteBelow, rows); + + FrameBlock serial = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); + assertEquals("surviving rows (serial)", rows - deleteBelow, serial.getNumRows()); + assertFrameMatchesIds(serial, expected, "serial-dv"); + + FrameBlock parallel = new FrameReaderDeltaParallel().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, + -1); + assertEquals("surviving rows (parallel)", rows - deleteBelow, parallel.getNumRows()); + assertFrameMatchesIds(parallel, expected, "parallel-dv"); + } + finally { + // fresh fork per test class, so simply clearing the override is enough + spark.conf().unset(DV_DEFAULT); + FileUtils.deleteQuietly(dir.toFile()); + } + } + + private static final String DV_DEFAULT = "spark.databricks.delta.properties.defaults.enableDeletionVectors"; + + // deterministic, exactly-representable cell values keyed by the row id in column 0 + private static double dval(int id) { + return id * 0.5 - 1.0; + } + + private static String sval(int id) { + return "s" + id; + } + + private static boolean bval(int id) { + return id % 2 == 0; + } + + /** Frame whose column 0 is the row id and the remaining columns are exact per-id values. */ + private static FrameBlock indexedFrame(int rows) { + ValueType[] schema = {ValueType.INT64, ValueType.FP64, ValueType.STRING, ValueType.BOOLEAN}; + String[] names = {"c0", "c1", "c2", "c3"}; + FrameBlock fb = new FrameBlock(schema, names); + fb.ensureAllocatedColumns(rows); + for(int r = 0; r < rows; r++) { + fb.set(r, 0, (long) r); + fb.set(r, 1, dval(r)); + fb.set(r, 2, sval(r)); + fb.set(r, 3, bval(r)); + } + return fb; + } + + /** Spark DataFrame mirroring {@link #indexedFrame} with columns c0..c3 (long/double/string/boolean). 
*/ + private Dataset indexedDataFrame(int rows) { + StructType schema = DataTypes + .createStructType(new StructField[] {DataTypes.createStructField("c0", DataTypes.LongType, false), + DataTypes.createStructField("c1", DataTypes.DoubleType, false), + DataTypes.createStructField("c2", DataTypes.StringType, false), + DataTypes.createStructField("c3", DataTypes.BooleanType, false)}); + + List data = new ArrayList<>(rows); + for(int r = 0; r < rows; r++) + data.add(RowFactory.create((long) r, dval(r), sval(r), bval(r))); + return spark.createDataFrame(data, schema); + } + + private static Set idRange(int fromInclusive, int toExclusive) { + Set ids = new LinkedHashSet<>(toExclusive - fromInclusive); + for(int id = fromInclusive; id < toExclusive; id++) + ids.add(id); + return ids; + } + + /** Asserts every row of {@code out} (keyed by its column-0 id) is expected and carries the exact per-id values. */ + private static void assertFrameMatchesIds(FrameBlock out, Set expectedIds, String tag) { + assertEquals(tag + " rows", expectedIds.size(), out.getNumRows()); + assertEquals(tag + " cols", 4, out.getNumColumns()); + // discovered types: long->INT64, double->FP64, string->STRING, boolean->BOOLEAN + assertEquals(tag + " c0 type", ValueType.INT64, out.getSchema()[0]); + assertEquals(tag + " c1 type", ValueType.FP64, out.getSchema()[1]); + assertEquals(tag + " c2 type", ValueType.STRING, out.getSchema()[2]); + assertEquals(tag + " c3 type", ValueType.BOOLEAN, out.getSchema()[3]); + Set seen = new HashSet<>(); + for(int r = 0; r < out.getNumRows(); r++) { + int id = ((Number) out.get(r, 0)).intValue(); + assertTrue(tag + ": unexpected/duplicate id " + id, expectedIds.contains(id) && seen.add(id)); + assertEquals(tag + " id" + id + " c1", dval(id), ((Number) out.get(r, 1)).doubleValue(), 1e-9); + assertEquals(tag + " id" + id + " c2", sval(id), out.get(r, 2).toString()); + assertEquals(tag + " id" + id + " c3", Boolean.valueOf(bval(id)), out.get(r, 3)); + } + } + + private 
static long countParquet(String tablePath) throws Exception { + try(java.util.stream.Stream s = Files.walk(new File(tablePath).toPath())) { + return s.filter(p -> p.toString().endsWith(".parquet")).count(); + } + } +} From a71c4272e8e32813486cd3d728f2d33f945f3bc6 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 2 Jul 2026 12:59:13 +0000 Subject: [PATCH 09/10] Harden Delta frame reads and consolidate test helpers - Fail loud when a data file decodes fewer rows than its numRecords statistic (silent underflow) in both the serial and parallel direct read paths, alongside the existing overflow guard, and compute the selected-row count once per batch. - Extract a shared ReadPlan (read codes, value types, names) and a readWithHandle entry point so the schema derivation lives in one place and the parallel reader reuses its already-opened scan handle for the single-file fallback instead of re-opening the snapshot. - Move the null-result check above the metadata refresh in FrameObject so it can actually fire before data is dereferenced. - Log the adaptive writer file-size decision at debug level and document the INT64 boxing and null precondition on the frame writer. - Consolidate duplicated countParquet test logic into DeltaFrameTestUtils, add a loan-pattern helper for multi-file tables, assert on exception messages, and use assertEquals for per-cell comparisons. - Remove the redundant DeltaFrameReadPerf micro-benchmark. 
--- .../controlprogram/caching/FrameObject.java | 169 +++---- .../sysds/runtime/io/DeltaKernelUtils.java | 267 ++++++----- .../sysds/runtime/io/FrameReaderDelta.java | 109 +++-- .../runtime/io/FrameReaderDeltaParallel.java | 55 +-- .../sysds/runtime/io/FrameWriterDelta.java | 5 +- .../performance/frame/DeltaFrameRead.java | 16 +- .../test/component/io/DeltaFrameReadPerf.java | 453 ------------------ .../component/io/DeltaFrameReadWriteTest.java | 161 +++---- .../io/DeltaFrameSparkInteropTest.java | 4 +- .../component/io/DeltaFrameTestUtils.java | 40 ++ 10 files changed, 430 insertions(+), 849 deletions(-) delete mode 100644 src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java create mode 100644 src/test/java/org/apache/sysds/test/component/io/DeltaFrameTestUtils.java diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java index 219f954cc52..c8eabc9aac6 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java @@ -19,7 +19,6 @@ package org.apache.sysds.runtime.controlprogram.caching; - import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.mutable.MutableBoolean; import org.apache.commons.lang3.tuple.Pair; @@ -55,13 +54,11 @@ import java.util.List; import java.util.concurrent.Future; - -public class FrameObject extends CacheableData -{ +public class FrameObject extends CacheableData { private static final long serialVersionUID = 1755082174281927785L; private ValueType[] _schema = null; - + protected FrameObject() { super(DataType.FRAME, ValueType.STRING); } @@ -83,22 +80,21 @@ public FrameObject(String fname, MetaData meta, ValueType[] schema) { setMetaData(meta); setSchema(schema); } - + /** * Copy constructor that copies meta data but NO data. 
- * + * * @param fo frame object */ public FrameObject(FrameObject fo) { super(fo); - + MetaDataFormat metaOld = (MetaDataFormat) fo.getMetaData(); - _metaData = new MetaDataFormat( - new MatrixCharacteristics(metaOld.getDataCharacteristics()), + _metaData = new MetaDataFormat(new MatrixCharacteristics(metaOld.getDataCharacteristics()), metaOld.getFileFormat()); _schema = fo._schema.clone(); } - + @Override public ValueType[] getSchema() { return _schema; @@ -106,43 +102,43 @@ public ValueType[] getSchema() { /** * Obtain schema of value types - * + * * @param cl column lower bound, inclusive * @param cu column upper bound, inclusive * @return schema of value types */ public ValueType[] getSchema(int cl, int cu) { - return (_schema!=null && _schema.length>cu) ? Arrays.copyOfRange(_schema, cl, cu+1) : - UtilFunctions.nCopies(cu-cl+1, ValueType.STRING); + return (_schema != null && _schema.length > cu) ? Arrays.copyOfRange(_schema, cl, cu + 1) : UtilFunctions + .nCopies(cu - cl + 1, ValueType.STRING); } - + /** - * Creates a new collection which contains the schema of the current - * frame object concatenated with the schema of the passed frame object. - * + * Creates a new collection which contains the schema of the current frame object concatenated with the schema of + * the passed frame object. + * * @param fo frame object * @return schema of value types */ public ValueType[] mergeSchemas(FrameObject fo) { return ArrayUtils.addAll( - (_schema!=null) ? _schema : UtilFunctions.nCopies((int)getNumColumns(), ValueType.STRING), - (fo._schema!=null) ? fo._schema : UtilFunctions.nCopies((int)fo.getNumColumns(), ValueType.STRING)); - } - + (_schema != null) ? _schema : UtilFunctions.nCopies((int) getNumColumns(), ValueType.STRING), + (fo._schema != null) ? 
fo._schema : UtilFunctions.nCopies((int) fo.getNumColumns(), ValueType.STRING)); + } + public void setSchema(String schema) { - if( schema.equals("*") ) { - //populate default schema + if(schema.equals("*")) { + // populate default schema int clen = (int) getNumColumns(); - if( clen >= 0 ) //known number of cols + if(clen >= 0) // known number of cols _schema = UtilFunctions.nCopies(clen, ValueType.STRING); } - else + else _schema = parseSchema(schema); } public static ValueType[] parseSchema(String schema) { if(schema == null) - return new ValueType[]{ValueType.STRING}; + return new ValueType[] {ValueType.STRING}; // parse given schema String[] parts = schema.split(DataExpression.DEFAULT_DELIM_DELIMITER); ValueType[] ret = new ValueType[parts.length]; @@ -150,22 +146,22 @@ public static ValueType[] parseSchema(String schema) { ret[i] = ValueType.fromExternalString(parts[i].toUpperCase()); return ret; } - + public void setSchema(ValueType[] schema) { _schema = schema; } - + @Override public void refreshMetaData() { - if ( _data == null || _metaData ==null ) //refresh only for existing data - throw new DMLRuntimeException("Cannot refresh meta data because there is no data or meta data. "); + if(_data == null || _metaData == null) // refresh only for existing data + throw new DMLRuntimeException("Cannot refresh meta data because there is no data or meta data. 
"); - //update matrix characteristics + // update matrix characteristics DataCharacteristics dc = _metaData.getDataCharacteristics(); - dc.setDimension( _data.getNumRows(),_data.getNumColumns() ); - dc.setNonZeros(_data.getNumRows()*_data.getNumColumns()); - - //update schema information + dc.setDimension(_data.getNumRows(), _data.getNumColumns()); + dc.setNonZeros(_data.getNumRows() * _data.getNumColumns()); + + // update schema information _schema = _data.getSchema(); } @@ -178,14 +174,14 @@ public long getNumColumns() { DataCharacteristics dc = getDataCharacteristics(); return dc.getCols(); } - + @Override protected FrameBlock readBlobFromCache(String fname) throws IOException { FrameBlock fb = null; - if (OptimizerUtils.isUMMEnabled()) + if(OptimizerUtils.isUMMEnabled()) fb = (FrameBlock) UnifiedMemoryManager.readBlock(fname, false); else - fb = (FrameBlock)LazyWriteBuffer.readBlock(fname, false); + fb = (FrameBlock) LazyWriteBuffer.readBlock(fname, false); return fb; } @@ -204,8 +200,12 @@ protected FrameBlock readBlobFromHDFS(String fname, long[] dims) throws IOExcept .createFrameReader(iimd.getFileFormat(), getFileFormatProperties()) .readFrameFromHDFS(fname, lschema, dc.getRows(), dc.getCols()); - //Delta and CSV discover dimensions (and Delta also schema) at read time, so - //refresh the cached metadata to reflect the materialized frame block. + // sanity check correct output (before dereferencing data below) + if(data == null) + throw new IOException("Unable to load frame from file: " + fname); + + // Delta and CSV discover dimensions (and Delta also schema) at read time, so + // refresh the cached metadata to reflect the materialized frame block. if(iimd.getFileFormat() == FileFormat.CSV || iimd.getFileFormat() == FileFormat.DELTA) { _metaData = _metaData instanceof MetaDataFormat ? 
new MetaDataFormat(data.getDataCharacteristics(), iimd.getFileFormat()) : new MetaData(data.getDataCharacteristics()); @@ -213,57 +213,50 @@ protected FrameBlock readBlobFromHDFS(String fname, long[] dims) throws IOExcept _schema = data.getSchema(); } - // sanity check correct output - if(data == null) - throw new IOException("Unable to load frame from file: " + fname); return data; } @Override - protected FrameBlock readBlobFromRDD(RDDObject rdd, MutableBoolean status) - throws IOException - { - //note: the read of a frame block from an RDD might trigger - //lazy evaluation of pending transformations. + protected FrameBlock readBlobFromRDD(RDDObject rdd, MutableBoolean status) throws IOException { + // note: the read of a frame block from an RDD might trigger + // lazy evaluation of pending transformations. RDDObject lrdd = rdd; - //prepare return status (by default only collect) + // prepare return status (by default only collect) status.setValue(false); - + MetaDataFormat iimd = (MetaDataFormat) _metaData; DataCharacteristics dc = iimd.getDataCharacteristics(); - int rlen = (int)dc.getRows(); - int clen = (int)dc.getCols(); - - //handle missing schema if necessary - ValueType[] lschema = (_schema!=null) ? _schema : - UtilFunctions.nCopies(clen>=1 ? (int)clen : 1, ValueType.STRING); - + int rlen = (int) dc.getRows(); + int clen = (int) dc.getCols(); + + // handle missing schema if necessary + ValueType[] lschema = (_schema != null) ? _schema : UtilFunctions.nCopies(clen >= 1 ? 
(int) clen : 1, + ValueType.STRING); + FrameBlock fb = null; - try { - //prevent unnecessary collect through rdd checkpoint - if( rdd.allowsShortCircuitCollect() ) { - lrdd = (RDDObject)rdd.getLineageChilds().get(0); + try { + // prevent unnecessary collect through rdd checkpoint + if(rdd.allowsShortCircuitCollect()) { + lrdd = (RDDObject) rdd.getLineageChilds().get(0); } - - //collect frame block from binary block RDD - fb = SparkExecutionContext.toFrameBlock(lrdd, lschema, rlen, clen); + + // collect frame block from binary block RDD + fb = SparkExecutionContext.toFrameBlock(lrdd, lschema, rlen, clen); } catch(DMLRuntimeException ex) { throw new IOException(ex); } - - //sanity check correct output - if( fb == null ) + + // sanity check correct output + if(fb == null) throw new IOException("Unable to load frame from rdd."); - + return fb; } - + @Override - protected FrameBlock readBlobFromFederated(FederationMap fedMap, long[] dims) - throws IOException - { + protected FrameBlock readBlobFromFederated(FederationMap fedMap, long[] dims) throws IOException { FrameBlock ret = new FrameBlock(_schema); // provide long support? 
ret.ensureAllocatedColumns((int) dims[0]); @@ -274,8 +267,8 @@ protected FrameBlock readBlobFromFederated(FederationMap fedMap, long[] dims) FederatedResponse response = readResponse.getRight().get(); // add result FrameBlock multRes = (FrameBlock) response.getData()[0]; - for (int r = 0; r < multRes.getNumRows(); r++) { - for (int c = 0; c < multRes.getNumColumns(); c++) { + for(int r = 0; r < multRes.getNumRows(); r++) { + for(int c = 0; c < multRes.getNumColumns(); c++) { int destRow = range.getBeginDimsInt()[0] + r; int destCol = range.getBeginDimsInt()[1] + c; ret.set(destRow, destCol, multRes.get(r, c)); @@ -286,17 +279,16 @@ protected FrameBlock readBlobFromFederated(FederationMap fedMap, long[] dims) catch(Exception e) { throw new DMLRuntimeException("Federated Frame read failed.", e); } - + return ret; } @Override protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop) - throws IOException, DMLRuntimeException - { + throws IOException, DMLRuntimeException { MetaDataFormat iimd = (MetaDataFormat) _metaData; FileFormat fmt = (ofmt != null ? 
FileFormat.safeValueOf(ofmt) : iimd.getFileFormat()); - + FrameWriter writer = FrameWriterFactory.createFrameWriter(fmt, fprop); writer.writeFrameToHDFS(_data, fname, getNumRows(), getNumColumns()); @@ -306,21 +298,18 @@ protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatPro @Override protected long writeStreamToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop) - throws IOException, DMLRuntimeException - { + throws IOException, DMLRuntimeException { throw new UnsupportedOperationException(); } - @Override protected void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String ofmt) - throws IOException, DMLRuntimeException - { - //prepare output info + throws IOException, DMLRuntimeException { + // prepare output info MetaDataFormat iimd = (MetaDataFormat) _metaData; - //note: the write of an RDD to HDFS might trigger - //lazy evaluation of pending transformations. + // note: the write of an RDD to HDFS might trigger + // lazy evaluation of pending transformations. 
SparkExecutionContext.writeFrameRDDtoHDFS(rdd, fname, iimd.getFileFormat()); } @@ -329,11 +318,9 @@ protected FrameBlock readBlobFromStream(OOCStream stream) th // TODO Auto-generated method stub return null; } - + @Override protected FrameBlock reconstructByLineage(LineageItem li) throws IOException { - return ((FrameObject) LineageRecomputeUtils - .parseNComputeLineageTrace(li.getData())) - .acquireReadAndRelease(); + return ((FrameObject) LineageRecomputeUtils.parseNComputeLineageTrace(li.getData())).acquireReadAndRelease(); } } diff --git a/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java b/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java index 33d1700fec9..25a1572223c 100644 --- a/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java +++ b/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java @@ -26,6 +26,8 @@ import java.util.Optional; import java.util.function.Function; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.sysds.conf.ConfigurationManager; @@ -70,53 +72,60 @@ import io.delta.kernel.utils.FileStatus; /** - * Shared helpers for the native (Spark-free) Delta Lake read/write paths used - * by both the matrix and frame readers/writers. Centralizes engine creation, - * path qualification, the scan loop (snapshot -> data files -> logical - * columnar batches, honoring deletion vectors), and the write transaction - * (logical data -> parquet -> commit). + * Shared helpers for the native (Spark-free) Delta Lake read/write paths used by both the matrix and frame + * readers/writers. Centralizes engine creation, path qualification, the scan loop (snapshot -> data files -> + * logical columnar batches, honoring deletion vectors), and the write transaction (logical data -> parquet -> + * commit). 
*/ public class DeltaKernelUtils { + private static final Log LOG = LogFactory.getLog(DeltaKernelUtils.class.getName()); + private static final String ENGINE_INFO = "Apache SystemDS"; /** Reused thread-safe JSON reader for the per-file Delta stats (numRecords). */ private static final ObjectMapper JSON_MAPPER = new ObjectMapper(); - /** Delta Kernel config key: number of rows per parquet read batch, overridable via - * {@link org.apache.sysds.conf.DMLConfig#DELTA_READER_BATCH_SIZE}. */ + /** + * Delta Kernel config key: number of rows per parquet read batch, overridable via + * {@link org.apache.sysds.conf.DMLConfig#DELTA_READER_BATCH_SIZE}. + */ private static final String CONF_READER_BATCH_SIZE = "delta.kernel.default.parquet.reader.batch-size"; - /** Delta Kernel config key: target size (bytes) at which the writer rolls a new data file, overridable via - * {@link org.apache.sysds.conf.DMLConfig#DELTA_WRITER_TARGET_FILE_SIZE}. */ + /** + * Delta Kernel config key: target size (bytes) at which the writer rolls a new data file, overridable via + * {@link org.apache.sysds.conf.DMLConfig#DELTA_WRITER_TARGET_FILE_SIZE}. + */ private static final String CONF_WRITER_TARGET_FILE_SIZE = "delta.kernel.default.parquet.writer.targetMaxFileSize"; - /** Internal Delta column type codes shared by the matrix and frame readers to - * dispatch boxing-free primitive column access. */ - public static final int T_DOUBLE = 0; - public static final int T_FLOAT = 1; - public static final int T_LONG = 2; - public static final int T_INT = 3; - public static final int T_SHORT = 4; - public static final int T_BYTE = 5; + /** + * Internal Delta column type codes shared by the matrix and frame readers to dispatch boxing-free primitive column + * access. 
+ */ + public static final int T_DOUBLE = 0; + public static final int T_FLOAT = 1; + public static final int T_LONG = 2; + public static final int T_INT = 3; + public static final int T_SHORT = 4; + public static final int T_BYTE = 5; public static final int T_BOOLEAN = 6; - public static final int T_STRING = 7; + public static final int T_STRING = 7; - //derived configuration cached to avoid copying the (large) base conf on every - //engine creation (createEngine is called once per data file in parallel reads); - //rebuilt whenever the base conf or the relevant SystemDS settings change. + // derived configuration cached to avoid copying the (large) base conf on every + // engine creation (createEngine is called once per data file in parallel reads); + // rebuilt whenever the base conf or the relevant SystemDS settings change. private static Configuration cachedConf; private static Configuration cachedConfBase; private static int cachedBatchSize; private static long cachedTargetFileSize; - private DeltaKernelUtils() {} + private DeltaKernelUtils() { + } /** - * Consumes a whole columnar batch. {@code selected} is {@code null} when all - * {@code size} rows are live; otherwise {@code selected[r]} indicates whether - * row {@code r} survived the deletion/selection vector. Batch-level consumption - * lets callers extract data column-at-a-time (cache friendly, boxing free) - * instead of paying a per-row callback. + * Consumes a whole columnar batch. {@code selected} is {@code null} when all {@code size} rows are live; otherwise + * {@code selected[r]} indicates whether row {@code r} survived the deletion/selection vector. Batch-level + * consumption lets callers extract data column-at-a-time (cache friendly, boxing free) instead of paying a per-row + * callback. 
*/ @FunctionalInterface public interface BatchConsumer { @@ -124,22 +133,29 @@ public interface BatchConsumer { } /** - * Map a Delta Kernel {@link DataType} to an internal type code (see the - * {@code T_*} constants). Returned once per column so the per-cell read loop - * can switch on a primitive int instead of repeating {@code instanceof} checks. + * Map a Delta Kernel {@link DataType} to an internal type code (see the {@code T_*} constants). Returned once per + * column so the per-cell read loop can switch on a primitive int instead of repeating {@code instanceof} checks. * * @param dt the Delta column data type * @return the matching {@code T_*} code, or {@code -1} if the type is not supported */ public static int typeCode(DataType dt) { - if( dt instanceof DoubleType ) return T_DOUBLE; - if( dt instanceof FloatType ) return T_FLOAT; - if( dt instanceof LongType ) return T_LONG; - if( dt instanceof IntegerType ) return T_INT; - if( dt instanceof ShortType ) return T_SHORT; - if( dt instanceof ByteType ) return T_BYTE; - if( dt instanceof BooleanType ) return T_BOOLEAN; - if( dt instanceof StringType ) return T_STRING; + if(dt instanceof DoubleType) + return T_DOUBLE; + if(dt instanceof FloatType) + return T_FLOAT; + if(dt instanceof LongType) + return T_LONG; + if(dt instanceof IntegerType) + return T_INT; + if(dt instanceof ShortType) + return T_SHORT; + if(dt instanceof ByteType) + return T_BYTE; + if(dt instanceof BooleanType) + return T_BOOLEAN; + if(dt instanceof StringType) + return T_STRING; return -1; } @@ -158,8 +174,10 @@ public static int countSelected(int size, boolean[] selected) { return n; } - /** Floor on the adaptive writer target file size. Below this the per-file metadata/open - * overhead (and tiny-file proliferation) outweighs the extra read parallelism. */ + /** + * Floor on the adaptive writer target file size. Below this the per-file metadata/open overhead (and tiny-file + * proliferation) outweighs the extra read parallelism. 
+ */ public static final long ADAPTIVE_WRITER_MIN_FILE_SIZE = 4L * 1024 * 1024; private static Configuration buildConf(Configuration base, int batchSize, long targetFileSize) { @@ -173,9 +191,8 @@ private static synchronized Configuration deltaConf() { Configuration base = ConfigurationManager.getCachedJobConf(); int batchSize = ConfigurationManager.getDeltaReaderBatchSize(); long targetFileSize = ConfigurationManager.getDeltaWriterTargetFileSize(); - if(cachedConf == null || cachedConfBase != base - || cachedBatchSize != batchSize || cachedTargetFileSize != targetFileSize) - { + if(cachedConf == null || cachedConfBase != base || cachedBatchSize != batchSize || + cachedTargetFileSize != targetFileSize) { cachedConf = buildConf(base, batchSize, targetFileSize); cachedConfBase = base; cachedBatchSize = batchSize; @@ -189,12 +206,11 @@ public static Engine createEngine() { } /** - * Compute the parquet target data-file size (bytes) for writing a table of the given - * estimated size. With adaptive sizing enabled the writer aims for roughly one data - * file per expected parallel reader (so the native per-file parallel read can use all - * threads): never above the configured target, and never below - * {@code ADAPTIVE_WRITER_MIN_FILE_SIZE} unless the configured target is itself smaller - * than that floor (in which case the configured target wins). + * Compute the parquet target data-file size (bytes) for writing a table of the given estimated size. With adaptive + * sizing enabled the writer aims for roughly one data file per expected parallel reader (so the native per-file + * parallel read can use all threads): never above the configured target, and never below + * {@code ADAPTIVE_WRITER_MIN_FILE_SIZE} unless the configured target is itself smaller than that floor (in which + * case the configured target wins). 
* * @param estimatedBytes estimate of the table's size (the block in-memory size is a fine proxy) * @return the target max parquet data-file size in bytes @@ -205,29 +221,33 @@ public static long adaptiveWriterTargetFileSize(long estimatedBytes) { return configured; int par = Math.max(1, OptimizerUtils.getParallelBinaryReadParallelism()); long perReader = Math.max(1, estimatedBytes / par); - //never above the configured cap, never below the floor (unless the cap itself is lower) - return Math.min(configured, Math.max(ADAPTIVE_WRITER_MIN_FILE_SIZE, perReader)); + // never above the configured cap, never below the floor (unless the cap itself is lower) + long target = Math.min(configured, Math.max(ADAPTIVE_WRITER_MIN_FILE_SIZE, perReader)); + if(LOG.isDebugEnabled()) + LOG.debug("Delta adaptive file size: est=" + estimatedBytes + "B par=" + par + " -> target=" + target + + "B (cap=" + configured + "B, floor=" + ADAPTIVE_WRITER_MIN_FILE_SIZE + "B)"); + return target; } /** - * Create an engine for writing a table of the given estimated size, configured with an - * adaptive target data-file size (see {@link #adaptiveWriterTargetFileSize(long)}). A fresh - * (uncached) configuration is built since writes happen once per table, not per data file. + * Create an engine for writing a table of the given estimated size, configured with an adaptive target data-file + * size (see {@link #adaptiveWriterTargetFileSize(long)}). A fresh (uncached) configuration is built since writes + * happen once per table, not per data file. * * @param estimatedBytes estimate of the table's size (the block in-memory size is a fine proxy) * @return a Delta Kernel engine for the write */ public static Engine createWriteEngine(long estimatedBytes) { - //the reader batch size is irrelevant on the write path but is set to keep the - //conf shape identical to deltaConf(); only the target file size matters here. 
+ // the reader batch size is irrelevant on the write path but is set to keep the + // conf shape identical to deltaConf(); only the target file size matters here. Configuration c = buildConf(ConfigurationManager.getCachedJobConf(), ConfigurationManager.getDeltaReaderBatchSize(), adaptiveWriterTargetFileSize(estimatedBytes)); return DefaultEngine.create(c); } /** - * Resolve a (possibly relative) path to a fully-qualified URI so the - * kernel's default engine can locate the table on the right filesystem. + * Resolve a (possibly relative) path to a fully-qualified URI so the kernel's default engine can locate the table + * on the right filesystem. * * @param fname input path * @return fully-qualified table path @@ -244,11 +264,9 @@ public static String qualify(String fname) { } /** - * Opened latest snapshot of a Delta table: the logical schema plus everything - * needed to (re)read its data files, including the list of per-data-file scan - * rows. Delta Kernel scan-file rows are self-contained (the kernel's - * distributed design serializes them to workers), so they can be retained and - * read independently / in parallel. + * Opened latest snapshot of a Delta table: the logical schema plus everything needed to (re)read its data files, + * including the list of per-data-file scan rows. Delta Kernel scan-file rows are self-contained (the kernel's + * distributed design serializes them to workers), so they can be retained and read independently / in parallel. */ public static final class ScanHandle { public final StructType schema; @@ -256,19 +274,18 @@ public static final class ScanHandle { public final StructType physicalReadSchema; public final List scanFiles; /** - * Per-file record counts taken from the Delta {@code numRecords} statistic, - * aligned with {@link #scanFiles}; {@code -1} where the statistic is absent. 
+ * Per-file record counts taken from the Delta {@code numRecords} statistic, aligned with {@link #scanFiles}; + * {@code -1} where the statistic is absent. */ public final long[] numRecords; /** - * Per-file flag indicating a deletion vector is present (so the live row - * count differs from {@link #numRecords}), aligned with {@link #scanFiles}. + * Per-file flag indicating a deletion vector is present (so the live row count differs from + * {@link #numRecords}), aligned with {@link #scanFiles}. */ public final boolean[] hasDeletionVector; - private ScanHandle(StructType schema, Row scanState, StructType physicalReadSchema, - List scanFiles, long[] numRecords, boolean[] hasDeletionVector) - { + private ScanHandle(StructType schema, Row scanState, StructType physicalReadSchema, List scanFiles, + long[] numRecords, boolean[] hasDeletionVector) { this.schema = schema; this.scanState = scanState; this.physicalReadSchema = physicalReadSchema; @@ -278,13 +295,12 @@ private ScanHandle(StructType schema, Row scanState, StructType physicalReadSche } /** - * @return true iff every data file carries a {@code numRecords} statistic - * and none has a deletion vector, i.e. exact per-file row offsets - * can be derived from metadata without reading the data. + * @return true iff every data file carries a {@code numRecords} statistic and none has a deletion vector, i.e. + * exact per-file row offsets can be derived from metadata without reading the data. */ public boolean hasExactRowCounts() { - for( int i=0; i scanFileIter = (scan instanceof ScanImpl) - ? ((ScanImpl) scan).getScanFiles(engine, true) - : scan.getScanFiles(engine); + // request the scan files WITH per-file statistics (numRecords) so callers can + // pre-size output and place rows without reading the data; harmless extra + // column for the data-read path. Fall back to the stats-less iterator if the + // concrete scan does not support it. + CloseableIterator scanFileIter = (scan instanceof ScanImpl) ? 
((ScanImpl) scan) + .getScanFiles(engine, true) : scan.getScanFiles(engine); List files = new ArrayList<>(); List recs = new ArrayList<>(); List dvs = new ArrayList<>(); - try( CloseableIterator scanFiles = scanFileIter ) { - while( scanFiles.hasNext() ) { + try(CloseableIterator scanFiles = scanFileIter) { + while(scanFiles.hasNext()) { FilteredColumnarBatch scanFileBatch = scanFiles.next(); - try( CloseableIterator scanFileRows = scanFileBatch.getRows() ) { - while( scanFileRows.hasNext() ) { + try(CloseableIterator scanFileRows = scanFileBatch.getRows()) { + while(scanFileRows.hasNext()) { Row scanFileRow = scanFileRows.next(); files.add(scanFileRow); recs.add(numRecords(scanFileRow)); @@ -334,7 +348,7 @@ public static ScanHandle openScan(Engine engine, String tablePath) throws IOExce } long[] numRecords = new long[recs.size()]; boolean[] hasDv = new boolean[dvs.size()]; - for( int i=0; i physicalData = engine.getParquetHandler() .readParquetFiles(Utils.singletonCloseableIterator(dataFile), physicalReadSchema, Optional.empty()); - try( CloseableIterator logicalData = - Scan.transformPhysicalData(engine, scanState, scanFileRow, physicalData) ) - { - while( logicalData.hasNext() ) + try(CloseableIterator logicalData = Scan.transformPhysicalData(engine, scanState, + scanFileRow, physicalData)) { + while(logicalData.hasNext()) consumeBatch(logicalData.next(), consumer); } } /** - * Scan the latest snapshot of a Delta table sequentially, invoking the batch - * consumer for every data batch. The consumer is created lazily from the table - * schema (so callers can size buffers / derive per-column types up front). + * Scan the latest snapshot of a Delta table sequentially, invoking the batch consumer for every data batch. The + * consumer is created lazily from the table schema (so callers can size buffers / derive per-column types up + * front). 
* * @param engine delta kernel engine * @param tablePath fully-qualified table path @@ -402,11 +412,10 @@ public static void readScanFile(Engine engine, Row scanState, StructType physica * @throws IOException on read failure */ public static StructType scan(Engine engine, String tablePath, Function consumerFactory) - throws IOException - { + throws IOException { ScanHandle h = openScan(engine, tablePath); BatchConsumer consumer = consumerFactory.apply(h.schema); - for( Row scanFileRow : h.scanFiles ) + for(Row scanFileRow : h.scanFiles) readScanFile(engine, h.scanState, h.physicalReadSchema, scanFileRow, consumer); return h.schema; } @@ -415,28 +424,27 @@ private static void consumeBatch(FilteredColumnarBatch fcb, BatchConsumer consum ColumnarBatch batch = fcb.getData(); int ncol = batch.getSchema().length(); ColumnVector[] cols = new ColumnVector[ncol]; - for( int c=0; c all rows live) + // materialize the deletion/selection mask once (null => all rows live) Optional selVector = fcb.getSelectionVector(); boolean[] selected = null; - if( selVector.isPresent() ) { + if(selVector.isPresent()) { ColumnVector sv = selVector.get(); selected = new boolean[size]; - for( int r=0; r logicalData) throws IOException - { - //replace any existing table at the path (the other SystemDS writers delete - //the output first; the caching layer does not do it on our behalf) + CloseableIterator logicalData) throws IOException { + // replace any existing table at the path (the other SystemDS writers delete + // the output first; the caching layer does not do it on our behalf) HDFSTool.deleteFileIfExistOnHDFS(tablePath); Table table = Table.forPath(engine, tablePath); - TransactionBuilder txnBuilder = table - .createTransactionBuilder(engine, ENGINE_INFO, Operation.CREATE_TABLE) + TransactionBuilder txnBuilder = table.createTransactionBuilder(engine, ENGINE_INFO, Operation.CREATE_TABLE) .withSchema(engine, schema); Transaction txn = txnBuilder.build(engine); Row txnState = 
txn.getTransactionState(engine); - CloseableIterator physicalData = - Transaction.transformLogicalData(engine, txnState, logicalData, Collections.emptyMap()); - DataWriteContext writeContext = - Transaction.getWriteContext(engine, txnState, Collections.emptyMap()); + CloseableIterator physicalData = Transaction.transformLogicalData(engine, txnState, + logicalData, Collections.emptyMap()); + DataWriteContext writeContext = Transaction.getWriteContext(engine, txnState, Collections.emptyMap()); CloseableIterator dataFiles = engine.getParquetHandler() .writeParquetFiles(writeContext.getTargetDirectory(), physicalData, writeContext.getStatisticsColumns()); - CloseableIterator appendActions = - Transaction.generateAppendActions(engine, txnState, dataFiles, writeContext); + CloseableIterator appendActions = Transaction.generateAppendActions(engine, txnState, dataFiles, + writeContext); txn.commit(engine, CloseableIterable.inMemoryIterable(appendActions)); } } diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java index 97d54faf6cd..9e8823f7ecf 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDelta.java @@ -63,18 +63,22 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n Engine engine = DeltaKernelUtils.createEngine(); String tablePath = DeltaKernelUtils.qualify(fname); DeltaKernelUtils.ScanHandle handle = DeltaKernelUtils.openScan(engine, tablePath); + return readWithHandle(fname, engine, handle); + } - // derive per-column read codes, value types and names once from the schema - final int ncol = handle.schema.length(); - final int[] readCodes = new int[ncol]; - final ValueType[] vt = new ValueType[ncol]; - final String[] cnames = new String[ncol]; - for(int c = 0; c < ncol; c++) { - DataType dt = handle.schema.at(c).getDataType(); - readCodes[c] = readCode(dt, 
handle.schema.at(c).getName()); - vt[c] = valueType(readCodes[c]); - cnames[c] = handle.schema.at(c).getName(); - } + /** + * Materialize the frame from an already-opened engine and scan handle. Factored out so the parallel reader can + * reuse a handle it already opened for its single-file/single-thread fallback instead of re-opening the (expensive) + * Delta snapshot a second time. + * + * @param fname the table path (for error messages) + * @param engine the Delta Kernel engine + * @param handle the opened scan handle + * @return the materialized frame block + */ + protected FrameBlock readWithHandle(String fname, Engine engine, DeltaKernelUtils.ScanHandle handle) + throws IOException { + final ReadPlan plan = planColumns(handle); // fast path: exact per-file row counts are known from metadata (no deletion // vectors) -> pre-size one typed array per column and decode each file @@ -86,14 +90,48 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n // empty table: the typed column arrays cannot be zero-length, so return a // schema-only frame with the discovered schema/names and zero rows. if(total == 0) - return new FrameBlock(vt, cnames, 0); + return new FrameBlock(plan.vt, plan.cnames, 0); if(total <= Integer.MAX_VALUE) - return readDirect(fname, engine, handle, ncol, readCodes, vt, cnames, (int) total); + return readDirect(fname, engine, handle, plan, (int) total); } // fallback: row counts unknown or deletion vectors present -> decode into // per-batch arrays and concatenate per column in file order. - return readBuffered(engine, handle, ncol, readCodes, vt, cnames); + return readBuffered(engine, handle, plan); + } + + /** + * Immutable per-column read plan derived once from the Delta table schema: how to pull each column out of the + * kernel column vector ({@code readCodes}), the resulting SystemDS value types, and the column names. 
Shared by the + * serial and parallel readers so the schema-to-column mapping lives in exactly one place. + */ + protected static final class ReadPlan { + final int ncol; + final int[] readCodes; + final ValueType[] vt; + final String[] cnames; + + private ReadPlan(int ncol, int[] readCodes, ValueType[] vt, String[] cnames) { + this.ncol = ncol; + this.readCodes = readCodes; + this.vt = vt; + this.cnames = cnames; + } + } + + /** Derive the {@link ReadPlan} (read codes, value types, names) from the opened scan handle's schema. */ + protected static ReadPlan planColumns(DeltaKernelUtils.ScanHandle handle) { + final int ncol = handle.schema.length(); + final int[] readCodes = new int[ncol]; + final ValueType[] vt = new ValueType[ncol]; + final String[] cnames = new String[ncol]; + for(int c = 0; c < ncol; c++) { + DataType dt = handle.schema.at(c).getDataType(); + readCodes[c] = readCode(dt, handle.schema.at(c).getName()); + vt[c] = valueType(readCodes[c]); + cnames[c] = handle.schema.at(c).getName(); + } + return new ReadPlan(ncol, readCodes, vt, cnames); } /** @@ -113,11 +151,13 @@ protected boolean useDirectPath(DeltaKernelUtils.ScanHandle handle) { * Fast path: decode each data file straight into pre-sized typed column arrays at a metadata-derived row offset. * One allocation per column, single pass, no intermediate per-batch buffers or serial concatenation. 
*/ - private FrameBlock readDirect(String fname, Engine engine, DeltaKernelUtils.ScanHandle handle, int ncol, - int[] readCodes, ValueType[] vt, String[] cnames, int nrow) throws IOException { + private FrameBlock readDirect(String fname, Engine engine, DeltaKernelUtils.ScanHandle handle, ReadPlan plan, + int nrow) throws IOException { + final int ncol = plan.ncol; + final int[] readCodes = plan.readCodes; final Object[] dest = new Object[ncol]; for(int c = 0; c < ncol; c++) - dest[c] = ArrayFactory.allocateBacking(vt[c], nrow); + dest[c] = ArrayFactory.allocateBacking(plan.vt[c], nrow); int base = 0; for(int i = 0; i < handle.scanFiles.size(); i++) { @@ -128,21 +168,28 @@ private FrameBlock readDirect(String fname, Engine engine, DeltaKernelUtils.Scan final int[] cur = new int[] {base}; DeltaKernelUtils.readScanFile(engine, handle.scanState, handle.physicalReadSchema, handle.scanFiles.get(i), (cols, size, selected) -> { - if(cur[0] + DeltaKernelUtils.countSelected(size, selected) > limit) + int n = DeltaKernelUtils.countSelected(size, selected); + if(cur[0] + n > limit) throw new DMLRuntimeException("Delta file produced more rows than its " + "numRecords statistic; refusing direct read of " + fname); for(int c = 0; c < ncol; c++) extractColumnInto(cols[c], size, selected, readCodes[c], dest[c], cur[0]); - cur[0] += DeltaKernelUtils.countSelected(size, selected); + cur[0] += n; }); + // also fail loud on underflow: a file decoding fewer rows than its + // numRecords statistic would leave the tail of the slice at the array + // default (0/null) while nrow still reports the (inflated) statistic. 
+ if(cur[0] != limit) + throw new DMLRuntimeException("Delta file produced " + (cur[0] - base) + " rows, expected " + + (limit - base) + " from its numRecords statistic; refusing direct read of " + fname); base = limit; } Array[] columns = new Array[ncol]; for(int c = 0; c < ncol; c++) - columns[c] = ArrayFactory.create(vt[c], dest[c]); + columns[c] = ArrayFactory.create(plan.vt[c], dest[c]); FrameBlock ret = new FrameBlock(columns); - ret.setColumnNames(cnames); + ret.setColumnNames(plan.cnames); return ret; } @@ -151,11 +198,13 @@ private FrameBlock readDirect(String fname, Engine engine, DeltaKernelUtils.Scan * when exact per-file row counts are not available (missing statistics or deletion vectors present), so the output * cannot be pre-sized up front. */ - private FrameBlock readBuffered(Engine engine, DeltaKernelUtils.ScanHandle handle, int ncol, int[] readCodes, - ValueType[] vt, String[] cnames) throws IOException { + private FrameBlock readBuffered(Engine engine, DeltaKernelUtils.ScanHandle handle, ReadPlan plan) + throws IOException { + final int ncol = plan.ncol; + final int[] readCodes = plan.readCodes; final ArrayList batchCols = new ArrayList<>(); final ArrayList batchSizes = new ArrayList<>(); - final int[] nrowH = new int[1]; + final int[] nrowHolder = new int[1]; for(Row scanFileRow : handle.scanFiles) { DeltaKernelUtils.readScanFile(engine, handle.scanState, handle.physicalReadSchema, scanFileRow, (cols, size, selected) -> { @@ -164,25 +213,25 @@ private FrameBlock readBuffered(Engine engine, DeltaKernelUtils.ScanHandle handl for(int c = 0; c < ncol; c++) { // decode into a fresh per-batch array via the shared alloc + // decode primitives (the same ones the direct path uses) - Object col = ArrayFactory.allocateBacking(vt[c], n); + Object col = ArrayFactory.allocateBacking(plan.vt[c], n); extractColumnInto(cols[c], size, selected, readCodes[c], col, 0); extracted[c] = col; } batchCols.add(extracted); batchSizes.add(n); - nrowH[0] += n; + 
nrowHolder[0] += n; }); } - int nrow = nrowH[0]; + int nrow = nrowHolder[0]; // empty table: return a schema-only frame with the discovered schema/names. if(nrow == 0) - return new FrameBlock(vt, cnames, 0); + return new FrameBlock(plan.vt, plan.cnames, 0); Array[] columns = new Array[ncol]; for(int c = 0; c < ncol; c++) - columns[c] = concatColumn(vt[c], nrow, batchCols, batchSizes, c); + columns[c] = concatColumn(plan.vt[c], nrow, batchCols, batchSizes, c); FrameBlock ret = new FrameBlock(columns); - ret.setColumnNames(cnames); + ret.setColumnNames(plan.cnames); return ret; } diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java index 351369a8f14..106264afe6c 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderDeltaParallel.java @@ -36,7 +36,6 @@ import io.delta.kernel.data.Row; import io.delta.kernel.engine.Engine; -import io.delta.kernel.types.DataType; /** * Parallel native Delta Lake frame reader. Delta tables are stored as one or more parquet data files; this reader @@ -66,21 +65,13 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n DeltaKernelUtils.ScanHandle handle = DeltaKernelUtils.openScan(engine, tablePath); final int nfiles = handle.scanFiles.size(); - // nothing to gain from parallelism for single-file (or empty) tables + // nothing to gain from parallelism for single-file (or empty) tables: reuse + // the already-opened engine + scan handle instead of re-opening the snapshot. 
if(_numThreads <= 1 || nfiles <= 1) - return super.readFrameFromHDFS(fname, schema, names, rlen, clen); + return readWithHandle(fname, engine, handle); // derive per-column read codes, value types and names once from the schema - final int ncol = handle.schema.length(); - final int[] readCodes = new int[ncol]; - final ValueType[] vt = new ValueType[ncol]; - final String[] cnames = new String[ncol]; - for(int c = 0; c < ncol; c++) { - DataType dt = handle.schema.at(c).getDataType(); - readCodes[c] = readCode(dt, handle.schema.at(c).getName()); - vt[c] = valueType(readCodes[c]); - cnames[c] = handle.schema.at(c).getName(); - } + final ReadPlan plan = planColumns(handle); // fast path: exact per-file row counts are known from metadata -> pre-size // one typed array per column and let each thread decode directly into its @@ -90,18 +81,20 @@ public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] n for(long r : handle.numRecords) total += r; if(total > 0 && total <= Integer.MAX_VALUE) - return readDirect(fname, handle, ncol, readCodes, vt, cnames, (int) total); + return readDirect(fname, handle, plan, (int) total); } - return readBuffered(fname, handle, ncol, readCodes, vt, cnames); + return readBuffered(fname, handle, plan); } /** * Fast path: each thread decodes one data file straight into the final typed column arrays at a metadata-derived * row offset. Single allocation per column, fully parallel. 
*/ - private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, int ncol, int[] readCodes, - ValueType[] vt, String[] cnames, int nrow) throws IOException { + private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, ReadPlan plan, int nrow) + throws IOException { + final int ncol = plan.ncol; + final int[] readCodes = plan.readCodes; final int nfiles = handle.scanFiles.size(); final int[] rowOffset = new int[nfiles]; int acc = 0; @@ -113,7 +106,7 @@ private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, // pre-size one typed array per column for the whole table final Object[] dest = new Object[ncol]; for(int c = 0; c < ncol; c++) - dest[c] = ArrayFactory.allocateBacking(vt[c], nrow); + dest[c] = ArrayFactory.allocateBacking(plan.vt[c], nrow); ArrayList> tasks = new ArrayList<>(nfiles); for(int i = 0; i < nfiles; i++) { @@ -128,13 +121,19 @@ private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, Engine eng = DeltaKernelUtils.createEngine(); DeltaKernelUtils.readScanFile(eng, handle.scanState, handle.physicalReadSchema, scanFileRow, (cols, size, selected) -> { - if(cur[0] + DeltaKernelUtils.countSelected(size, selected) > limit) + int n = DeltaKernelUtils.countSelected(size, selected); + if(cur[0] + n > limit) throw new DMLRuntimeException("Delta file produced more rows than its " + "numRecords statistic; refusing parallel direct read of " + fname); for(int c = 0; c < ncol; c++) extractColumnInto(cols[c], size, selected, readCodes[c], dest[c], cur[0]); - cur[0] += DeltaKernelUtils.countSelected(size, selected); + cur[0] += n; }); + // fail loud on underflow too: fewer decoded rows than the statistic + // would leave this slice's tail at the array default (0/null). 
+ if(cur[0] != limit) + throw new DMLRuntimeException("Delta file produced " + (cur[0] - base) + " rows, expected " + + (limit - base) + " from its numRecords statistic; refusing parallel direct read of " + fname); return null; }); } @@ -142,10 +141,10 @@ private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, Array[] columns = new Array[ncol]; for(int c = 0; c < ncol; c++) - columns[c] = ArrayFactory.create(vt[c], dest[c]); + columns[c] = ArrayFactory.create(plan.vt[c], dest[c]); FrameBlock ret = new FrameBlock(columns); - ret.setColumnNames(cnames); + ret.setColumnNames(plan.cnames); return ret; } @@ -154,8 +153,10 @@ private FrameBlock readDirect(String fname, DeltaKernelUtils.ScanHandle handle, * unknown or deletion vectors are present), then concatenate per column in file order via the shared * {@link FrameReaderDelta#concatColumn} helper. */ - private FrameBlock readBuffered(String fname, DeltaKernelUtils.ScanHandle handle, int ncol, int[] readCodes, - ValueType[] vt, String[] cnames) throws IOException { + private FrameBlock readBuffered(String fname, DeltaKernelUtils.ScanHandle handle, ReadPlan plan) + throws IOException { + final int ncol = plan.ncol; + final int[] readCodes = plan.readCodes; final int nfiles = handle.scanFiles.size(); @SuppressWarnings("unchecked") final ArrayList[] fileCols = new ArrayList[nfiles]; @@ -176,7 +177,7 @@ private FrameBlock readBuffered(String fname, DeltaKernelUtils.ScanHandle handle for(int c = 0; c < ncol; c++) { // decode into a fresh per-batch array via the shared alloc + // decode primitives (the same ones the direct path uses) - Object col = ArrayFactory.allocateBacking(vt[c], n); + Object col = ArrayFactory.allocateBacking(plan.vt[c], n); extractColumnInto(cols[c], size, selected, readCodes[c], col, 0); extracted[c] = col; } @@ -203,10 +204,10 @@ private FrameBlock readBuffered(String fname, DeltaKernelUtils.ScanHandle handle Array[] columns = new Array[ncol]; for(int c = 0; c < ncol; c++) - 
columns[c] = concatColumn(vt[c], nrow, batchCols, batchSizes, c); + columns[c] = concatColumn(plan.vt[c], nrow, batchCols, batchSizes, c); FrameBlock ret = new FrameBlock(columns); - ret.setColumnNames(cnames); + ret.setColumnNames(plan.cnames); return ret; } diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java b/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java index 01f90259598..fe66a7d195f 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameWriterDelta.java @@ -239,7 +239,10 @@ public float getFloat(int rowId) { @Override public long getLong(int rowId) { - // exact for INT64 (getAsDouble would lose precision beyond 2^53) + // exact for INT64 (getAsDouble would lose precision beyond 2^53). This boxes one + // Number per cell because Array exposes no primitive getAsLong; a boxing-free + // getAsLong on Array would remove this write-path allocation (follow-up). The + // kernel only calls this after isNullAt() is false, so the cell is never null here. 
return ((Number) _col.get(_rowStart + rowId)).longValue(); } diff --git a/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java b/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java index e326efa0926..ea76fef51b1 100644 --- a/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java +++ b/src/test/java/org/apache/sysds/performance/frame/DeltaFrameRead.java @@ -22,7 +22,6 @@ import java.io.File; import java.nio.file.Files; import java.nio.file.Path; -import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.sysds.common.Types.ValueType; @@ -36,6 +35,7 @@ import org.apache.sysds.runtime.io.FrameReaderDeltaParallel; import org.apache.sysds.runtime.io.FrameWriterDelta; import org.apache.sysds.test.TestUtils; +import org.apache.sysds.test.component.io.DeltaFrameTestUtils; /** * Reads the SAME native Delta frame table from disk repeatedly and reports read throughput. The table is written to a @@ -45,8 +45,8 @@ * *

* This is the target for an async-profiler run: launch the perf jar under the profiler agent and this loop provides a - * long, steady-state read workload to sample. See {@code src/test/java/org/apache/sysds/performance/README.md} and the - * {@code delta-async-profiler} cursor rule. + * long, steady-state read workload to sample. See {@code src/test/java/org/apache/sysds/performance/README.md} for how + * to run this under async-profiler. *

* *

@@ -111,7 +111,7 @@ private void setup() throws Exception { tableDir = Files.createTempDirectory("sysds_delta_frame_read_"); tablePath = new File(tableDir.toFile(), "table").getAbsolutePath(); new FrameWriterDelta().writeFrameToHDFS(fb, tablePath, fb.getNumRows(), fb.getNumColumns()); - files = countParquet(tablePath); + files = DeltaFrameTestUtils.countParquet(tablePath); } private void readSerial() { @@ -135,15 +135,9 @@ private void readParallel() { } } - private static long countParquet(String tablePath) throws Exception { - try(Stream s = Files.walk(new File(tablePath).toPath())) { - return s.filter(p -> p.toString().endsWith(".parquet")).count(); - } - } - @Override protected String makeResString() { - throw new RuntimeException("Do not call"); + throw new UnsupportedOperationException("Use makeResString(double[]) with the timed measurements instead."); } @Override diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java deleted file mode 100644 index ddd53c906d8..00000000000 --- a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadPerf.java +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.component.io; - -import java.io.File; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Random; - -import org.apache.commons.io.FileUtils; -import org.apache.sysds.common.Types.ValueType; -import org.apache.sysds.conf.ConfigurationManager; -import org.apache.sysds.conf.DMLConfig; -import org.apache.sysds.hops.OptimizerUtils; -import org.apache.sysds.runtime.frame.data.FrameBlock; -import org.apache.sysds.runtime.io.DeltaKernelUtils; -import org.apache.sysds.runtime.io.FrameReaderDelta; -import org.apache.sysds.runtime.io.FrameReaderDeltaParallel; -import org.apache.sysds.runtime.io.FrameWriterDelta; -import org.junit.Ignore; -import org.junit.Test; - -/** - * Manual micro-benchmark comparing the serial {@link FrameReaderDelta} against the parallel - * {@link FrameReaderDeltaParallel} on multi-file Delta frame tables. Not a correctness test (those live in - * {@link DeltaFrameReadWriteTest}); it just prints timing/throughput numbers and is {@link Ignore}d so it does not run - * in the normal build. - * - *

- * The parallel reader decodes one task per parquet data file, so the speedup scales with the number of files - * (controlled here via the writer target file size). Run it on a JVM with a realistically sized heap; under a tiny - * young generation (e.g. the Surefire fork's {@code -Xmn300m}) the concurrent decode's higher allocation rate is - * dominated by young-GC pauses and the numbers are not representative of a normal SystemDS process. - *

- * - *

- * Run explicitly (remove {@link Ignore} or run the compiled class directly), e.g. - * {@code mvn -q test -Dtest=DeltaFrameReadPerf -DfailIfNoTests=false}. - *

- */ -public class DeltaFrameReadPerf { - - private static final ValueType[] NO_SCHEMA = new ValueType[] {ValueType.STRING}; - private static final String[] NO_NAMES = new String[] {"x"}; - - private static final long MB = 1024L * 1024; - private static final int WARMUP = 2; - private static final int REPS = 7; - - /** Entry point so the (otherwise {@code @Ignore}d) benchmarks can be run directly. */ - public static void main(String[] args) throws Exception { - DeltaFrameReadPerf p = new DeltaFrameReadPerf(); - p.serialDirectVsBuffered(); - } - - /** - * Isolates the serial-reader change: compares the new direct (pre-sized, metadata-driven, single-pass) read against - * the old buffered (per-batch extract + concatenate) read on the SAME single-file table, so the only difference is - * the extra allocation + concatenation copy. Single file => no file-level parallelism involved, pure serial decode - * cost. - */ - @Test - @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") - public void serialDirectVsBuffered() throws Exception { - System.out.println("\n=== serial direct vs buffered (single file, 4M rows) ==="); - System.out.printf("%-9s %11s %11s %9s%n", "schema", "direct(ms)", "buffered(ms)", "speedup"); - for(String kind : new String[] {"numeric", "mixed", "string"}) { - // force a single data file: disable adaptive sizing, huge target - DMLConfig c = new DMLConfig(); - c.setTextValue(DMLConfig.DELTA_WRITER_ADAPTIVE_FILE_SIZE, "false"); - c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(4L * 1024 * MB)); - ConfigurationManager.setLocalConfig(c); - Path dir = Files.createTempDirectory("sysds_delta_frame_ab_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = genFrame(kind, 4_000_000, 7); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - // direct = default serial reader; buffered = force the fallback path - 
FrameReaderDelta direct = new FrameReaderDelta(); - FrameReaderDelta buffered = new FrameReaderDelta() { - @Override - protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { - return false; - } - }; - for(int i = 0; i < WARMUP; i++) { - direct.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - buffered.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - } - double[] td = new double[REPS], tb = new double[REPS]; - for(int i = 0; i < REPS; i++) { - td[i] = time(() -> direct.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - tb[i] = time(() -> buffered.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - } - double md = median(td), mb = median(tb); - long ad = allocBytes(() -> direct.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - long ab = allocBytes(() -> buffered.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - System.out.printf("%-9s %11.2f %11.2f %8.2fx alloc: %6.0f / %6.0f MB (%.2fx)%n", kind, md, mb, - mb / md, ad / (double) MB, ab / (double) MB, ab / (double) ad); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } - } - } - - /** - * End-to-end check of adaptive writer file sizing with NO explicit target size configured (the real default): the - * table should now be split into ~one file per reader and read fast, versus the single/few-file layout the fixed - * 64MB default produced. 
- */ - @Test - @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") - public void adaptiveCheck() throws Exception { - System.out.println("\n=== adaptive writer file sizing (default config, no target set) ==="); - System.out.println("threads = " + OptimizerUtils.getParallelBinaryReadParallelism()); - System.out.printf("%-9s %-8s %-6s %11s %11s %9s%n", "rows", "adaptive", "files", "serial(ms)", "par(ms)", - "speedup"); - for(int rows : new int[] {1_000_000, 4_000_000}) { - // default config => 64MB cap, adaptive sizing enabled - ConfigurationManager.setLocalConfig(new DMLConfig()); - Path dir = Files.createTempDirectory("sysds_delta_frame_adp_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = genFrame("mixed", rows, 7); - long est = in.getInMemorySize(); - long target = DeltaKernelUtils.adaptiveWriterTargetFileSize(est); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - long files = countParquet(tablePath); - double[] r = measure(tablePath); - System.out.printf("%-9d %-8s %-6d %11.2f %11.2f %8.2fx%n", rows, (target / MB) + "MB", files, r[0], - r[1], r[0] / r[1]); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } - } - } - - /** - * Sweep the writer target file size ({@link DMLConfig#DELTA_WRITER_TARGET_FILE_SIZE}, the one public Delta knob - * that affects read parallelism) to find where the per-file parallel read stops improving, i.e. a good default for - * read-heavy use. 
- */ - @Test - @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") - public void targetSizeSweep() throws Exception { - final int rows = 4_000_000; - final long[] sizesMB = {128, 64, 32, 16, 8, 4, 2}; - System.out.println("\n=== writer target-size sweep (mixed, " + rows + " rows, " - + OptimizerUtils.getParallelBinaryReadParallelism() + " threads) ==="); - System.out.printf("%-9s %-6s %11s %11s %9s%n", "targetMB", "files", "serial(ms)", "par(ms)", "speedup"); - for(long mb : sizesMB) { - DMLConfig c = new DMLConfig(); - c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(mb * MB)); - ConfigurationManager.setLocalConfig(c); - Path dir = Files.createTempDirectory("sysds_delta_frame_ts_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = genFrame("mixed", rows, 7); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - long files = countParquet(tablePath); - double[] r = measure(tablePath); - System.out.printf("%-9d %-6d %11.2f %11.2f %8.2fx%n", mb, files, r[0], r[1], r[0] / r[1]); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } - } - } - - /** - * Sweep the parquet reader batch size ({@link DMLConfig#DELTA_READER_BATCH_SIZE}, a public Delta Kernel knob) on a - * fixed multi-file table, with and without quieting the parquet/delta loggers. Pure "how we call the public API" - * tuning. 
- */ - @Test - @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") - public void batchSizeSweep() throws Exception { - final int rows = 2_000_000; - final long fileSize = 8 * MB; - final int[] batches = {1024, 4096, 8192, 16384, 32768, 65536, 131072}; - System.out.println("\n=== reader batch-size sweep (mixed, " + rows + " rows, 8MB files) ==="); - - // write the table ONCE; the batch size only affects the read path - DMLConfig wconf = new DMLConfig(); - wconf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(fileSize)); - ConfigurationManager.setLocalConfig(wconf); - Path dir = Files.createTempDirectory("sysds_delta_frame_bs_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = genFrame("mixed", rows, 7); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - System.out.println("files = " + countParquet(tablePath)); - - for(boolean quietLog : new boolean[] {false, true}) { - if(quietLog) - silenceParquetLogging(); - System.out.println(quietLog ? "-- parquet/delta logging -> ERROR --" : "-- default logging --"); - System.out.printf("%-9s %11s %11s%n", "batch", "serial(ms)", "par(ms)"); - for(int bs : batches) { - DMLConfig c = new DMLConfig(); - c.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(fileSize)); - c.setTextValue(DMLConfig.DELTA_READER_BATCH_SIZE, String.valueOf(bs)); - ConfigurationManager.setLocalConfig(c); - double[] r = measure(tablePath); - System.out.printf("%-9d %11.2f %11.2f%n", bs, r[0], r[1]); - } - } - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } - } - - /** Median serial and parallel read time (ms) for a fixed table under the current config. 
*/ - private double[] measure(String tablePath) throws Exception { - FrameReaderDelta serial = new FrameReaderDelta(); - FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); - for(int i = 0; i < WARMUP; i++) { - serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - } - double[] ts = new double[REPS], tp = new double[REPS]; - for(int i = 0; i < REPS; i++) { - ts[i] = time(() -> serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - } - return new double[] {median(ts), median(tp)}; - } - - @Test - @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") - public void benchmark() throws Exception { - System.out.println("\n=== Delta frame reader benchmark ==="); - System.out.println("parallel read threads = " + OptimizerUtils.getParallelBinaryReadParallelism() - + ", processors = " + Runtime.getRuntime().availableProcessors()); - System.out.printf("%-9s %-7s %-6s %11s %11s %9s%n", "rows", "fileMB", "files", "serial(ms)", "par(ms)", - "speedup"); - runCase(1_000_000, 4 * MB); - runCase(1_000_000, 16 * MB); - runCase(4_000_000, 8 * MB); - runCase(4_000_000, 64 * MB); - } - - @Test - @Ignore("manual benchmark; remove @Ignore (or run the compiled class directly) to run") - public void schemaBreakdown() throws Exception { - System.out.println("\n=== schema composition breakdown (2M rows, 8MB files) ==="); - System.out.printf("%-10s %-6s %11s %11s %9s%n", "schema", "files", "serial(ms)", "par(ms)", "speedup"); - int rows = 2_000_000; - for(boolean quietLog : new boolean[] {false, true}) { - if(quietLog) - silenceParquetLogging(); - System.out.println(quietLog ? 
"-- parquet/delta logging -> ERROR --" : "-- default logging --"); - runSchema("numeric", rows, 8 * MB); - runSchema("mixed", rows, 8 * MB); - runSchema("string", rows, 8 * MB); - } - } - - private static void silenceParquetLogging() { - org.apache.log4j.Logger.getLogger("org.apache.parquet").setLevel(org.apache.log4j.Level.ERROR); - org.apache.log4j.Logger.getLogger("io.delta").setLevel(org.apache.log4j.Level.ERROR); - org.apache.log4j.Logger.getLogger("shaded.parquet").setLevel(org.apache.log4j.Level.ERROR); - } - - private void runSchema(String kind, int rows, long targetFileSize) throws Exception { - DMLConfig conf = new DMLConfig(); - conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(targetFileSize)); - ConfigurationManager.setLocalConfig(conf); - Path dir = Files.createTempDirectory("sysds_delta_frame_perf_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = genFrame(kind, rows, 7); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - long files = countParquet(tablePath); - FrameReaderDelta serial = new FrameReaderDelta(); - FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); - for(int i = 0; i < WARMUP; i++) { - serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - } - double[] ts = new double[REPS], tp = new double[REPS]; - for(int i = 0; i < REPS; i++) { - ts[i] = time(() -> serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - } - double ms = median(ts), mp = median(tp); - System.out.printf("%-10s %-6d %11.2f %11.2f %8.2fx%n", kind, files, ms, mp, ms / mp); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } - } - - private void runCase(int rows, long targetFileSize) throws Exception { - 
DMLConfig conf = new DMLConfig(); - conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(targetFileSize)); - ConfigurationManager.setLocalConfig(conf); - Path dir = Files.createTempDirectory("sysds_delta_frame_perf_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = genMixedFrame(rows, 7); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - long files = countParquet(tablePath); - - FrameReaderDelta serial = new FrameReaderDelta(); - FrameReaderDeltaParallel parallel = new FrameReaderDeltaParallel(); - - for(int i = 0; i < WARMUP; i++) { - serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - } - - double[] ts = new double[REPS], tp = new double[REPS]; - for(int i = 0; i < REPS; i++) { - ts[i] = time(() -> serial.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - tp[i] = time(() -> parallel.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1)); - } - double ms = median(ts), mp = median(tp); - System.out.printf("%-9d %-7d %-6d %11.2f %11.2f %8.2fx%n", rows, targetFileSize / MB, files, ms, mp, - ms / mp); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } - } - - private static FrameBlock genFrame(String kind, int nrow, int seed) { - ValueType[] schema; - switch(kind) { - case "numeric": - schema = new ValueType[] {ValueType.INT64, ValueType.FP64, ValueType.INT32, ValueType.FP32, - ValueType.BOOLEAN, ValueType.INT64}; - break; - case "string": - schema = new ValueType[] {ValueType.STRING, ValueType.STRING, ValueType.STRING, ValueType.STRING, - ValueType.STRING, ValueType.STRING}; - break; - default: // mixed - schema = new ValueType[] {ValueType.STRING, ValueType.INT64, ValueType.FP64, ValueType.BOOLEAN, - ValueType.INT32, ValueType.FP32}; - } - String[] names = {"c0", "c1", "c2", "c3", "c4", "c5"}; - 
FrameBlock fb = new FrameBlock(schema, names); - fb.ensureAllocatedColumns(nrow); - Random rnd = new Random(seed); - for(int r = 0; r < nrow; r++) - for(int c = 0; c < schema.length; c++) - fb.set(r, c, randVal(schema[c], rnd, r)); - return fb; - } - - private static Object randVal(ValueType vt, Random rnd, int r) { - switch(vt) { - case STRING: - return "row" + rnd.nextInt(1_000_000); - case INT64: - return (long) rnd.nextInt(); - case FP64: - return rnd.nextDouble() * 200 - 100; - case INT32: - return rnd.nextInt(); - case FP32: - return rnd.nextFloat(); - case BOOLEAN: - return rnd.nextBoolean(); - default: - return null; - } - } - - private interface IORun { - FrameBlock run() throws Exception; - } - - /** Bytes allocated by the calling (single) thread during one read. */ - private static long allocBytes(IORun r) throws Exception { - com.sun.management.ThreadMXBean tb = (com.sun.management.ThreadMXBean) java.lang.management.ManagementFactory - .getThreadMXBean(); - long id = Thread.currentThread().getId(); - long a0 = tb.getThreadAllocatedBytes(id); - FrameBlock fb = r.run(); - long alloc = tb.getThreadAllocatedBytes(id) - a0; - if(fb.getNumRows() <= 0) - throw new IllegalStateException("empty read"); - return alloc; - } - - private static double time(IORun r) throws Exception { - long t0 = System.nanoTime(); - FrameBlock fb = r.run(); - long t1 = System.nanoTime(); - if(fb.getNumRows() <= 0) - throw new IllegalStateException("empty read"); - return (t1 - t0) / 1e6; - } - - private static double median(double[] v) { - double[] c = v.clone(); - Arrays.sort(c); - return c[c.length / 2]; - } - - private static long countParquet(String tablePath) throws Exception { - try(java.util.stream.Stream s = Files.walk(new File(tablePath).toPath())) { - return s.filter(p -> p.toString().endsWith(".parquet")).count(); - } - } - - private static FrameBlock genMixedFrame(int nrow, int seed) { - ValueType[] schema = {ValueType.STRING, ValueType.INT64, ValueType.FP64, 
ValueType.BOOLEAN, ValueType.INT32, - ValueType.FP32}; - String[] names = {"name", "id", "score", "active", "count", "ratio"}; - FrameBlock fb = new FrameBlock(schema, names); - fb.ensureAllocatedColumns(nrow); - Random rnd = new Random(seed); - for(int r = 0; r < nrow; r++) { - fb.set(r, 0, "row" + rnd.nextInt(1_000_000)); - fb.set(r, 1, (long) rnd.nextInt()); - fb.set(r, 2, rnd.nextDouble() * 200 - 100); - fb.set(r, 3, rnd.nextBoolean()); - fb.set(r, 4, rnd.nextInt()); - fb.set(r, 5, rnd.nextFloat()); - } - return fb; - } -} diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java index 28775ed8e3f..7012be44426 100644 --- a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameReadWriteTest.java @@ -29,7 +29,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.NoSuchElementException; -import java.util.Objects; import java.util.Optional; import java.util.Random; @@ -108,6 +107,33 @@ private static FrameBlock alloc(ValueType[] schema, String[] names, int nrow) { return fb; } + @FunctionalInterface + private interface TableTest { + void accept(FrameBlock in, String tablePath) throws Exception; + } + + /** + * Write {@code in} to a fresh temp Delta table with a small target file size (so the writer rolls multiple data + * files), assert the layout really is multi-file, then run {@code body} against the table. Local config and the + * temp directory are always cleaned up. 
+ */ + private static void withSmallTargetTable(FrameBlock in, TableTest body) throws Exception { + DMLConfig conf = new DMLConfig(); + conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); + ConfigurationManager.setLocalConfig(conf); + Path dir = Files.createTempDirectory("sysds_delta_frame_mf_"); + String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); + try { + new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); + assertMultiFile(tablePath); + body.accept(in, tablePath); + } + finally { + ConfigurationManager.clearLocalConfigs(); + FileUtils.deleteQuietly(dir.toFile()); + } + } + @Test public void roundTripMixedTypes() throws Exception { ValueType[] schema = {ValueType.STRING, ValueType.INT64, ValueType.FP64, ValueType.BOOLEAN, ValueType.INT32, @@ -192,26 +218,13 @@ public void roundTripWithStringNulls() throws Exception { @Test public void parallelReadMatchesSerialMultiFile() throws Exception { - DMLConfig conf = new DMLConfig(); - conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); - ConfigurationManager.setLocalConfig(conf); - Path dir = Files.createTempDirectory("sysds_delta_frame_par_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 13); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - assertMultiFile(tablePath); - + FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 13); + withSmallTargetTable(in, (frame, tablePath) -> { FrameBlock serial = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); FrameBlock parallel = new FrameReaderDeltaParallel().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - assertFramesEqual(serial, parallel); - } - finally { - 
ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } + }); } @Test @@ -219,16 +232,8 @@ public void parallelBufferedPathMatchesSerialMultiFile() throws Exception { // the direct fast path is always taken for SystemDS-written tables (exact // row stats, no deletion vectors); force the buffered fallback to exercise // its per-file decode + serial concatenation and assert it matches serial. - DMLConfig conf = new DMLConfig(); - conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); - ConfigurationManager.setLocalConfig(conf); - Path dir = Files.createTempDirectory("sysds_delta_frame_buf_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 23); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - assertMultiFile(tablePath); - + FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 23); + withSmallTargetTable(in, (frame, tablePath) -> { FrameBlock serial = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); // subclass that always declines the direct path -> readBuffered() FrameBlock buffered = new FrameReaderDeltaParallel() { @@ -237,13 +242,8 @@ protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { return false; } }.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - assertFramesEqual(serial, buffered); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } + }); } @Test @@ -251,16 +251,8 @@ public void serialBufferedPathMatchesDirectMultiFile() throws Exception { // the direct (pre-sized, metadata-driven) path is always taken for SystemDS- // written tables; force the serial buffered fallback (per-batch extract + // concatenate) to exercise it and assert it matches the direct read. 
- DMLConfig conf = new DMLConfig(); - conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); - ConfigurationManager.setLocalConfig(conf); - Path dir = Files.createTempDirectory("sysds_delta_frame_sbuf_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 29); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - assertMultiFile(tablePath); - + FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 29); + withSmallTargetTable(in, (frame, tablePath) -> { FrameBlock direct = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); // subclass that always declines the direct path -> buffered extract+concat FrameBlock buffered = new FrameReaderDelta() { @@ -269,13 +261,8 @@ protected boolean useDirectPath(DeltaKernelUtils.ScanHandle h) { return false; } }.readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - assertFramesEqual(direct, buffered); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } + }); } @Test @@ -335,8 +322,6 @@ public void readerBatchSizeConfigRoundTrips() throws Exception { Path dir = Files.createTempDirectory("sysds_delta_frame_bs_"); String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); try { - assertEquals("config getter reflects the override", 128, ConfigurationManager.getDeltaReaderBatchSize()); - FrameBlock in = TestUtils.generateRandomFrameBlock(5000, MIXED_SCHEMA, 31); new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); FrameBlock out = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); @@ -351,28 +336,14 @@ public void readerBatchSizeConfigRoundTrips() throws Exception { @Test public void writerTargetFileSizeConfigProducesMoreFiles() throws 
Exception { // a smaller configured target file size must make the writer roll more - // data files for the same frame (the lever the parallel reader relies on). - DMLConfig conf = new DMLConfig(); - conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); - ConfigurationManager.setLocalConfig(conf); - Path dir = Files.createTempDirectory("sysds_delta_frame_cfg_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - assertEquals("config getter reflects the override", SMALL_TARGET_FILE_SIZE, - ConfigurationManager.getDeltaWriterTargetFileSize()); - - FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 41); - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - assertMultiFile(tablePath); - + // data files for the same frame (the lever the parallel reader relies on); + // the multi-file layout is asserted inside withSmallTargetTable. + FrameBlock in = TestUtils.generateRandomFrameBlock(ROWS_MULTI_FILE, MIXED_SCHEMA, 41); + withSmallTargetTable(in, (frame, tablePath) -> { // data still round-trips correctly with the custom layout FrameBlock out = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - assertFramesEqual(in, out); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } + assertFramesEqual(frame, out); + }); } @Test @@ -514,7 +485,9 @@ public void readFromInputStreamUnsupported() throws Exception { fail("expected UnsupportedOperationException for a Delta input-stream read"); } catch(UnsupportedOperationException ex) { - // expected: must throw before touching the (null) stream + // must throw before touching the (null) stream, for the documented reason + assertTrue("message should mention input stream, got: " + ex.getMessage(), + ex.getMessage() != null && ex.getMessage().contains("input stream")); } } @@ -522,42 +495,26 @@ public void 
readFromInputStreamUnsupported() throws Exception { public void parallelReadStringNullsMatchSerialMultiFile() throws Exception { // string nulls across a multi-file table: the parallel direct path must // reproduce the serial read cell-for-cell (assertFramesEqual uses - // Objects.equals, so nulls are compared faithfully). - DMLConfig conf = new DMLConfig(); - conf.setTextValue(DMLConfig.DELTA_WRITER_TARGET_FILE_SIZE, String.valueOf(SMALL_TARGET_FILE_SIZE)); - ConfigurationManager.setLocalConfig(conf); - Path dir = Files.createTempDirectory("sysds_delta_frame_parnull_"); - String tablePath = new File(dir.toFile(), "table").getAbsolutePath(); - try { - ValueType[] schema = {ValueType.STRING, ValueType.INT64}; - String[] names = {"s", "k"}; - int nrow = ROWS_MULTI_FILE; - FrameBlock in = alloc(schema, names, nrow); - for(int r = 0; r < nrow; r++) { - // interspersed string nulls (every 7th row) plus a numeric column - in.set(r, 0, (r % 7 == 0) ? null : "s" + r); - in.set(r, 1, (long) r); - } - new FrameWriterDelta().writeFrameToHDFS(in, tablePath, in.getNumRows(), in.getNumColumns()); - assertMultiFile(tablePath); - + // assertEquals, so nulls are compared faithfully). + ValueType[] schema = {ValueType.STRING, ValueType.INT64}; + String[] names = {"s", "k"}; + int nrow = ROWS_MULTI_FILE; + FrameBlock in = alloc(schema, names, nrow); + for(int r = 0; r < nrow; r++) { + // interspersed string nulls (every 7th row) plus a numeric column + in.set(r, 0, (r % 7 == 0) ? 
null : "s" + r); + in.set(r, 1, (long) r); + } + withSmallTargetTable(in, (frame, tablePath) -> { FrameBlock serial = new FrameReaderDelta().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); FrameBlock parallel = new FrameReaderDeltaParallel().readFrameFromHDFS(tablePath, NO_SCHEMA, NO_NAMES, -1, -1); - assertFramesEqual(serial, parallel); - } - finally { - ConfigurationManager.clearLocalConfigs(); - FileUtils.deleteQuietly(dir.toFile()); - } + }); } private static void assertMultiFile(String tablePath) throws Exception { - long files; - try(java.util.stream.Stream s = Files.walk(new File(tablePath).toPath())) { - files = s.filter(p -> p.toString().endsWith(".parquet")).count(); - } + long files = DeltaFrameTestUtils.countParquet(tablePath); assertTrue("expected a multi-file Delta table to exercise the parallel path, got " + files, files > 1); } @@ -572,7 +529,7 @@ private static void assertFramesEqual(FrameBlock expected, FrameBlock actual) { int nrow = expected.getNumRows(); for(int r = 0; r < nrow; r++) for(int c = 0; c < ncol; c++) - assertTrue("cell (" + r + "," + c + ")", Objects.equals(expected.get(r, c), actual.get(r, c))); + assertEquals("cell (" + r + "," + c + ")", expected.get(r, c), actual.get(r, c)); } /** Commits a schema-only Delta table (no data files) to exercise the 0-row read path. 
*/ diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameSparkInteropTest.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameSparkInteropTest.java index 73f0b9834c0..d17d9ae4005 100644 --- a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameSparkInteropTest.java +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameSparkInteropTest.java @@ -268,8 +268,6 @@ private static void assertFrameMatchesIds(FrameBlock out, Set expectedI } private static long countParquet(String tablePath) throws Exception { - try(java.util.stream.Stream s = Files.walk(new File(tablePath).toPath())) { - return s.filter(p -> p.toString().endsWith(".parquet")).count(); - } + return DeltaFrameTestUtils.countParquet(tablePath); } } diff --git a/src/test/java/org/apache/sysds/test/component/io/DeltaFrameTestUtils.java b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameTestUtils.java new file mode 100644 index 00000000000..fe025c40eae --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/io/DeltaFrameTestUtils.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.component.io; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.stream.Stream; + +/** Shared helpers for the native Delta frame read/write tests and benchmarks. */ +public class DeltaFrameTestUtils { + + private DeltaFrameTestUtils() { + // utility class + } + + /** Count the parquet data files under a Delta table directory. */ + public static long countParquet(String tablePath) throws Exception { + try(Stream s = Files.walk(new File(tablePath).toPath())) { + return s.filter(p -> p.toString().endsWith(".parquet")).count(); + } + } +} From 682a208c2f739a6224084c7cceb5fb0fa57f8312 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 2 Jul 2026 16:05:04 +0000 Subject: [PATCH 10/10] Revert unrelated whole-file reformatting in FrameObject and DeltaKernelUtils The prior commit ran the Eclipse formatter over the entire files, reflowing many lines this PR does not touch. Restore both files to a minimal diff that carries only the intended changes: the null-result check moved above the metadata refresh in FrameObject, and the debug log for the adaptive writer file-size decision in DeltaKernelUtils. 
--- .../controlprogram/caching/FrameObject.java | 162 ++++++----- .../sysds/runtime/io/DeltaKernelUtils.java | 257 +++++++++--------- 2 files changed, 218 insertions(+), 201 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java index c8eabc9aac6..87d14dbf87e 100644 --- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java +++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java @@ -19,6 +19,7 @@ package org.apache.sysds.runtime.controlprogram.caching; + import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.mutable.MutableBoolean; import org.apache.commons.lang3.tuple.Pair; @@ -54,11 +55,13 @@ import java.util.List; import java.util.concurrent.Future; -public class FrameObject extends CacheableData { + +public class FrameObject extends CacheableData +{ private static final long serialVersionUID = 1755082174281927785L; private ValueType[] _schema = null; - + protected FrameObject() { super(DataType.FRAME, ValueType.STRING); } @@ -80,21 +83,22 @@ public FrameObject(String fname, MetaData meta, ValueType[] schema) { setMetaData(meta); setSchema(schema); } - + /** * Copy constructor that copies meta data but NO data. 
- * + * * @param fo frame object */ public FrameObject(FrameObject fo) { super(fo); - + MetaDataFormat metaOld = (MetaDataFormat) fo.getMetaData(); - _metaData = new MetaDataFormat(new MatrixCharacteristics(metaOld.getDataCharacteristics()), + _metaData = new MetaDataFormat( + new MatrixCharacteristics(metaOld.getDataCharacteristics()), metaOld.getFileFormat()); _schema = fo._schema.clone(); } - + @Override public ValueType[] getSchema() { return _schema; @@ -102,43 +106,43 @@ public ValueType[] getSchema() { /** * Obtain schema of value types - * + * * @param cl column lower bound, inclusive * @param cu column upper bound, inclusive * @return schema of value types */ public ValueType[] getSchema(int cl, int cu) { - return (_schema != null && _schema.length > cu) ? Arrays.copyOfRange(_schema, cl, cu + 1) : UtilFunctions - .nCopies(cu - cl + 1, ValueType.STRING); + return (_schema!=null && _schema.length>cu) ? Arrays.copyOfRange(_schema, cl, cu+1) : + UtilFunctions.nCopies(cu-cl+1, ValueType.STRING); } - + /** - * Creates a new collection which contains the schema of the current frame object concatenated with the schema of - * the passed frame object. - * + * Creates a new collection which contains the schema of the current + * frame object concatenated with the schema of the passed frame object. + * * @param fo frame object * @return schema of value types */ public ValueType[] mergeSchemas(FrameObject fo) { return ArrayUtils.addAll( - (_schema != null) ? _schema : UtilFunctions.nCopies((int) getNumColumns(), ValueType.STRING), - (fo._schema != null) ? fo._schema : UtilFunctions.nCopies((int) fo.getNumColumns(), ValueType.STRING)); - } - + (_schema!=null) ? _schema : UtilFunctions.nCopies((int)getNumColumns(), ValueType.STRING), + (fo._schema!=null) ? 
fo._schema : UtilFunctions.nCopies((int)fo.getNumColumns(), ValueType.STRING)); + } + public void setSchema(String schema) { - if(schema.equals("*")) { - // populate default schema + if( schema.equals("*") ) { + //populate default schema int clen = (int) getNumColumns(); - if(clen >= 0) // known number of cols + if( clen >= 0 ) //known number of cols _schema = UtilFunctions.nCopies(clen, ValueType.STRING); } - else + else _schema = parseSchema(schema); } public static ValueType[] parseSchema(String schema) { if(schema == null) - return new ValueType[] {ValueType.STRING}; + return new ValueType[]{ValueType.STRING}; // parse given schema String[] parts = schema.split(DataExpression.DEFAULT_DELIM_DELIMITER); ValueType[] ret = new ValueType[parts.length]; @@ -146,22 +150,22 @@ public static ValueType[] parseSchema(String schema) { ret[i] = ValueType.fromExternalString(parts[i].toUpperCase()); return ret; } - + public void setSchema(ValueType[] schema) { _schema = schema; } - + @Override public void refreshMetaData() { - if(_data == null || _metaData == null) // refresh only for existing data - throw new DMLRuntimeException("Cannot refresh meta data because there is no data or meta data. "); + if ( _data == null || _metaData ==null ) //refresh only for existing data + throw new DMLRuntimeException("Cannot refresh meta data because there is no data or meta data. 
"); - // update matrix characteristics + //update matrix characteristics DataCharacteristics dc = _metaData.getDataCharacteristics(); - dc.setDimension(_data.getNumRows(), _data.getNumColumns()); - dc.setNonZeros(_data.getNumRows() * _data.getNumColumns()); - - // update schema information + dc.setDimension( _data.getNumRows(),_data.getNumColumns() ); + dc.setNonZeros(_data.getNumRows()*_data.getNumColumns()); + + //update schema information _schema = _data.getSchema(); } @@ -174,14 +178,14 @@ public long getNumColumns() { DataCharacteristics dc = getDataCharacteristics(); return dc.getCols(); } - + @Override protected FrameBlock readBlobFromCache(String fname) throws IOException { FrameBlock fb = null; - if(OptimizerUtils.isUMMEnabled()) + if (OptimizerUtils.isUMMEnabled()) fb = (FrameBlock) UnifiedMemoryManager.readBlock(fname, false); else - fb = (FrameBlock) LazyWriteBuffer.readBlock(fname, false); + fb = (FrameBlock)LazyWriteBuffer.readBlock(fname, false); return fb; } @@ -204,8 +208,8 @@ protected FrameBlock readBlobFromHDFS(String fname, long[] dims) throws IOExcept if(data == null) throw new IOException("Unable to load frame from file: " + fname); - // Delta and CSV discover dimensions (and Delta also schema) at read time, so - // refresh the cached metadata to reflect the materialized frame block. + //Delta and CSV discover dimensions (and Delta also schema) at read time, so + //refresh the cached metadata to reflect the materialized frame block. if(iimd.getFileFormat() == FileFormat.CSV || iimd.getFileFormat() == FileFormat.DELTA) { _metaData = _metaData instanceof MetaDataFormat ? 
new MetaDataFormat(data.getDataCharacteristics(), iimd.getFileFormat()) : new MetaData(data.getDataCharacteristics()); @@ -217,46 +221,50 @@ protected FrameBlock readBlobFromHDFS(String fname, long[] dims) throws IOExcept } @Override - protected FrameBlock readBlobFromRDD(RDDObject rdd, MutableBoolean status) throws IOException { - // note: the read of a frame block from an RDD might trigger - // lazy evaluation of pending transformations. + protected FrameBlock readBlobFromRDD(RDDObject rdd, MutableBoolean status) + throws IOException + { + //note: the read of a frame block from an RDD might trigger + //lazy evaluation of pending transformations. RDDObject lrdd = rdd; - // prepare return status (by default only collect) + //prepare return status (by default only collect) status.setValue(false); - + MetaDataFormat iimd = (MetaDataFormat) _metaData; DataCharacteristics dc = iimd.getDataCharacteristics(); - int rlen = (int) dc.getRows(); - int clen = (int) dc.getCols(); - - // handle missing schema if necessary - ValueType[] lschema = (_schema != null) ? _schema : UtilFunctions.nCopies(clen >= 1 ? (int) clen : 1, - ValueType.STRING); - + int rlen = (int)dc.getRows(); + int clen = (int)dc.getCols(); + + //handle missing schema if necessary + ValueType[] lschema = (_schema!=null) ? _schema : + UtilFunctions.nCopies(clen>=1 ? 
(int)clen : 1, ValueType.STRING); + FrameBlock fb = null; - try { - // prevent unnecessary collect through rdd checkpoint - if(rdd.allowsShortCircuitCollect()) { - lrdd = (RDDObject) rdd.getLineageChilds().get(0); + try { + //prevent unnecessary collect through rdd checkpoint + if( rdd.allowsShortCircuitCollect() ) { + lrdd = (RDDObject)rdd.getLineageChilds().get(0); } - - // collect frame block from binary block RDD - fb = SparkExecutionContext.toFrameBlock(lrdd, lschema, rlen, clen); + + //collect frame block from binary block RDD + fb = SparkExecutionContext.toFrameBlock(lrdd, lschema, rlen, clen); } catch(DMLRuntimeException ex) { throw new IOException(ex); } - - // sanity check correct output - if(fb == null) + + //sanity check correct output + if( fb == null ) throw new IOException("Unable to load frame from rdd."); - + return fb; } - + @Override - protected FrameBlock readBlobFromFederated(FederationMap fedMap, long[] dims) throws IOException { + protected FrameBlock readBlobFromFederated(FederationMap fedMap, long[] dims) + throws IOException + { FrameBlock ret = new FrameBlock(_schema); // provide long support? 
ret.ensureAllocatedColumns((int) dims[0]); @@ -267,8 +275,8 @@ protected FrameBlock readBlobFromFederated(FederationMap fedMap, long[] dims) th FederatedResponse response = readResponse.getRight().get(); // add result FrameBlock multRes = (FrameBlock) response.getData()[0]; - for(int r = 0; r < multRes.getNumRows(); r++) { - for(int c = 0; c < multRes.getNumColumns(); c++) { + for (int r = 0; r < multRes.getNumRows(); r++) { + for (int c = 0; c < multRes.getNumColumns(); c++) { int destRow = range.getBeginDimsInt()[0] + r; int destCol = range.getBeginDimsInt()[1] + c; ret.set(destRow, destCol, multRes.get(r, c)); @@ -279,16 +287,17 @@ protected FrameBlock readBlobFromFederated(FederationMap fedMap, long[] dims) th catch(Exception e) { throw new DMLRuntimeException("Federated Frame read failed.", e); } - + return ret; } @Override protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop) - throws IOException, DMLRuntimeException { + throws IOException, DMLRuntimeException + { MetaDataFormat iimd = (MetaDataFormat) _metaData; FileFormat fmt = (ofmt != null ? 
FileFormat.safeValueOf(ofmt) : iimd.getFileFormat()); - + FrameWriter writer = FrameWriterFactory.createFrameWriter(fmt, fprop); writer.writeFrameToHDFS(_data, fname, getNumRows(), getNumColumns()); @@ -298,18 +307,21 @@ protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatPro @Override protected long writeStreamToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop) - throws IOException, DMLRuntimeException { + throws IOException, DMLRuntimeException + { throw new UnsupportedOperationException(); } + @Override protected void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String ofmt) - throws IOException, DMLRuntimeException { - // prepare output info + throws IOException, DMLRuntimeException + { + //prepare output info MetaDataFormat iimd = (MetaDataFormat) _metaData; - // note: the write of an RDD to HDFS might trigger - // lazy evaluation of pending transformations. + //note: the write of an RDD to HDFS might trigger + //lazy evaluation of pending transformations. 
SparkExecutionContext.writeFrameRDDtoHDFS(rdd, fname, iimd.getFileFormat()); } @@ -318,9 +330,11 @@ protected FrameBlock readBlobFromStream(OOCStream stream) th // TODO Auto-generated method stub return null; } - + @Override protected FrameBlock reconstructByLineage(LineageItem li) throws IOException { - return ((FrameObject) LineageRecomputeUtils.parseNComputeLineageTrace(li.getData())).acquireReadAndRelease(); + return ((FrameObject) LineageRecomputeUtils + .parseNComputeLineageTrace(li.getData())) + .acquireReadAndRelease(); } } diff --git a/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java b/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java index 25a1572223c..bbca857a1cd 100644 --- a/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java +++ b/src/main/java/org/apache/sysds/runtime/io/DeltaKernelUtils.java @@ -72,10 +72,11 @@ import io.delta.kernel.utils.FileStatus; /** - * Shared helpers for the native (Spark-free) Delta Lake read/write paths used by both the matrix and frame - * readers/writers. Centralizes engine creation, path qualification, the scan loop (snapshot -> data files -> - * logical columnar batches, honoring deletion vectors), and the write transaction (logical data -> parquet -> - * commit). + * Shared helpers for the native (Spark-free) Delta Lake read/write paths used + * by both the matrix and frame readers/writers. Centralizes engine creation, + * path qualification, the scan loop (snapshot -> data files -> logical + * columnar batches, honoring deletion vectors), and the write transaction + * (logical data -> parquet -> commit). */ public class DeltaKernelUtils { @@ -86,46 +87,40 @@ public class DeltaKernelUtils { /** Reused thread-safe JSON reader for the per-file Delta stats (numRecords). 
*/ private static final ObjectMapper JSON_MAPPER = new ObjectMapper(); - /** - * Delta Kernel config key: number of rows per parquet read batch, overridable via - * {@link org.apache.sysds.conf.DMLConfig#DELTA_READER_BATCH_SIZE}. - */ + /** Delta Kernel config key: number of rows per parquet read batch, overridable via + * {@link org.apache.sysds.conf.DMLConfig#DELTA_READER_BATCH_SIZE}. */ private static final String CONF_READER_BATCH_SIZE = "delta.kernel.default.parquet.reader.batch-size"; - /** - * Delta Kernel config key: target size (bytes) at which the writer rolls a new data file, overridable via - * {@link org.apache.sysds.conf.DMLConfig#DELTA_WRITER_TARGET_FILE_SIZE}. - */ + /** Delta Kernel config key: target size (bytes) at which the writer rolls a new data file, overridable via + * {@link org.apache.sysds.conf.DMLConfig#DELTA_WRITER_TARGET_FILE_SIZE}. */ private static final String CONF_WRITER_TARGET_FILE_SIZE = "delta.kernel.default.parquet.writer.targetMaxFileSize"; - /** - * Internal Delta column type codes shared by the matrix and frame readers to dispatch boxing-free primitive column - * access. - */ - public static final int T_DOUBLE = 0; - public static final int T_FLOAT = 1; - public static final int T_LONG = 2; - public static final int T_INT = 3; - public static final int T_SHORT = 4; - public static final int T_BYTE = 5; + /** Internal Delta column type codes shared by the matrix and frame readers to + * dispatch boxing-free primitive column access. 
*/ + public static final int T_DOUBLE = 0; + public static final int T_FLOAT = 1; + public static final int T_LONG = 2; + public static final int T_INT = 3; + public static final int T_SHORT = 4; + public static final int T_BYTE = 5; public static final int T_BOOLEAN = 6; - public static final int T_STRING = 7; + public static final int T_STRING = 7; - // derived configuration cached to avoid copying the (large) base conf on every - // engine creation (createEngine is called once per data file in parallel reads); - // rebuilt whenever the base conf or the relevant SystemDS settings change. + //derived configuration cached to avoid copying the (large) base conf on every + //engine creation (createEngine is called once per data file in parallel reads); + //rebuilt whenever the base conf or the relevant SystemDS settings change. private static Configuration cachedConf; private static Configuration cachedConfBase; private static int cachedBatchSize; private static long cachedTargetFileSize; - private DeltaKernelUtils() { - } + private DeltaKernelUtils() {} /** - * Consumes a whole columnar batch. {@code selected} is {@code null} when all {@code size} rows are live; otherwise - * {@code selected[r]} indicates whether row {@code r} survived the deletion/selection vector. Batch-level - * consumption lets callers extract data column-at-a-time (cache friendly, boxing free) instead of paying a per-row - * callback. + * Consumes a whole columnar batch. {@code selected} is {@code null} when all + * {@code size} rows are live; otherwise {@code selected[r]} indicates whether + * row {@code r} survived the deletion/selection vector. Batch-level consumption + * lets callers extract data column-at-a-time (cache friendly, boxing free) + * instead of paying a per-row callback. 
*/ @FunctionalInterface public interface BatchConsumer { @@ -133,29 +128,22 @@ public interface BatchConsumer { } /** - * Map a Delta Kernel {@link DataType} to an internal type code (see the {@code T_*} constants). Returned once per - * column so the per-cell read loop can switch on a primitive int instead of repeating {@code instanceof} checks. + * Map a Delta Kernel {@link DataType} to an internal type code (see the + * {@code T_*} constants). Returned once per column so the per-cell read loop + * can switch on a primitive int instead of repeating {@code instanceof} checks. * * @param dt the Delta column data type * @return the matching {@code T_*} code, or {@code -1} if the type is not supported */ public static int typeCode(DataType dt) { - if(dt instanceof DoubleType) - return T_DOUBLE; - if(dt instanceof FloatType) - return T_FLOAT; - if(dt instanceof LongType) - return T_LONG; - if(dt instanceof IntegerType) - return T_INT; - if(dt instanceof ShortType) - return T_SHORT; - if(dt instanceof ByteType) - return T_BYTE; - if(dt instanceof BooleanType) - return T_BOOLEAN; - if(dt instanceof StringType) - return T_STRING; + if( dt instanceof DoubleType ) return T_DOUBLE; + if( dt instanceof FloatType ) return T_FLOAT; + if( dt instanceof LongType ) return T_LONG; + if( dt instanceof IntegerType ) return T_INT; + if( dt instanceof ShortType ) return T_SHORT; + if( dt instanceof ByteType ) return T_BYTE; + if( dt instanceof BooleanType ) return T_BOOLEAN; + if( dt instanceof StringType ) return T_STRING; return -1; } @@ -174,10 +162,8 @@ public static int countSelected(int size, boolean[] selected) { return n; } - /** - * Floor on the adaptive writer target file size. Below this the per-file metadata/open overhead (and tiny-file - * proliferation) outweighs the extra read parallelism. - */ + /** Floor on the adaptive writer target file size. Below this the per-file metadata/open + * overhead (and tiny-file proliferation) outweighs the extra read parallelism. 
*/ public static final long ADAPTIVE_WRITER_MIN_FILE_SIZE = 4L * 1024 * 1024; private static Configuration buildConf(Configuration base, int batchSize, long targetFileSize) { @@ -191,8 +177,9 @@ private static synchronized Configuration deltaConf() { Configuration base = ConfigurationManager.getCachedJobConf(); int batchSize = ConfigurationManager.getDeltaReaderBatchSize(); long targetFileSize = ConfigurationManager.getDeltaWriterTargetFileSize(); - if(cachedConf == null || cachedConfBase != base || cachedBatchSize != batchSize || - cachedTargetFileSize != targetFileSize) { + if(cachedConf == null || cachedConfBase != base + || cachedBatchSize != batchSize || cachedTargetFileSize != targetFileSize) + { cachedConf = buildConf(base, batchSize, targetFileSize); cachedConfBase = base; cachedBatchSize = batchSize; @@ -206,11 +193,12 @@ public static Engine createEngine() { } /** - * Compute the parquet target data-file size (bytes) for writing a table of the given estimated size. With adaptive - * sizing enabled the writer aims for roughly one data file per expected parallel reader (so the native per-file - * parallel read can use all threads): never above the configured target, and never below - * {@code ADAPTIVE_WRITER_MIN_FILE_SIZE} unless the configured target is itself smaller than that floor (in which - * case the configured target wins). + * Compute the parquet target data-file size (bytes) for writing a table of the given + * estimated size. With adaptive sizing enabled the writer aims for roughly one data + * file per expected parallel reader (so the native per-file parallel read can use all + * threads): never above the configured target, and never below + * {@code ADAPTIVE_WRITER_MIN_FILE_SIZE} unless the configured target is itself smaller + * than that floor (in which case the configured target wins). 
* * @param estimatedBytes estimate of the table's size (the block in-memory size is a fine proxy) * @return the target max parquet data-file size in bytes @@ -221,7 +209,7 @@ public static long adaptiveWriterTargetFileSize(long estimatedBytes) { return configured; int par = Math.max(1, OptimizerUtils.getParallelBinaryReadParallelism()); long perReader = Math.max(1, estimatedBytes / par); - // never above the configured cap, never below the floor (unless the cap itself is lower) + //never above the configured cap, never below the floor (unless the cap itself is lower) long target = Math.min(configured, Math.max(ADAPTIVE_WRITER_MIN_FILE_SIZE, perReader)); if(LOG.isDebugEnabled()) LOG.debug("Delta adaptive file size: est=" + estimatedBytes + "B par=" + par + " -> target=" + target @@ -230,24 +218,24 @@ public static long adaptiveWriterTargetFileSize(long estimatedBytes) { } /** - * Create an engine for writing a table of the given estimated size, configured with an adaptive target data-file - * size (see {@link #adaptiveWriterTargetFileSize(long)}). A fresh (uncached) configuration is built since writes - * happen once per table, not per data file. + * Create an engine for writing a table of the given estimated size, configured with an + * adaptive target data-file size (see {@link #adaptiveWriterTargetFileSize(long)}). A fresh + * (uncached) configuration is built since writes happen once per table, not per data file. * * @param estimatedBytes estimate of the table's size (the block in-memory size is a fine proxy) * @return a Delta Kernel engine for the write */ public static Engine createWriteEngine(long estimatedBytes) { - // the reader batch size is irrelevant on the write path but is set to keep the - // conf shape identical to deltaConf(); only the target file size matters here. + //the reader batch size is irrelevant on the write path but is set to keep the + //conf shape identical to deltaConf(); only the target file size matters here. 
Configuration c = buildConf(ConfigurationManager.getCachedJobConf(), ConfigurationManager.getDeltaReaderBatchSize(), adaptiveWriterTargetFileSize(estimatedBytes)); return DefaultEngine.create(c); } /** - * Resolve a (possibly relative) path to a fully-qualified URI so the kernel's default engine can locate the table - * on the right filesystem. + * Resolve a (possibly relative) path to a fully-qualified URI so the + * kernel's default engine can locate the table on the right filesystem. * * @param fname input path * @return fully-qualified table path @@ -264,9 +252,11 @@ public static String qualify(String fname) { } /** - * Opened latest snapshot of a Delta table: the logical schema plus everything needed to (re)read its data files, - * including the list of per-data-file scan rows. Delta Kernel scan-file rows are self-contained (the kernel's - * distributed design serializes them to workers), so they can be retained and read independently / in parallel. + * Opened latest snapshot of a Delta table: the logical schema plus everything + * needed to (re)read its data files, including the list of per-data-file scan + * rows. Delta Kernel scan-file rows are self-contained (the kernel's + * distributed design serializes them to workers), so they can be retained and + * read independently / in parallel. */ public static final class ScanHandle { public final StructType schema; @@ -274,18 +264,19 @@ public static final class ScanHandle { public final StructType physicalReadSchema; public final List scanFiles; /** - * Per-file record counts taken from the Delta {@code numRecords} statistic, aligned with {@link #scanFiles}; - * {@code -1} where the statistic is absent. + * Per-file record counts taken from the Delta {@code numRecords} statistic, + * aligned with {@link #scanFiles}; {@code -1} where the statistic is absent. 
*/ public final long[] numRecords; /** - * Per-file flag indicating a deletion vector is present (so the live row count differs from - * {@link #numRecords}), aligned with {@link #scanFiles}. + * Per-file flag indicating a deletion vector is present (so the live row + * count differs from {@link #numRecords}), aligned with {@link #scanFiles}. */ public final boolean[] hasDeletionVector; - private ScanHandle(StructType schema, Row scanState, StructType physicalReadSchema, List scanFiles, - long[] numRecords, boolean[] hasDeletionVector) { + private ScanHandle(StructType schema, Row scanState, StructType physicalReadSchema, + List scanFiles, long[] numRecords, boolean[] hasDeletionVector) + { this.schema = schema; this.scanState = scanState; this.physicalReadSchema = physicalReadSchema; @@ -295,12 +286,13 @@ private ScanHandle(StructType schema, Row scanState, StructType physicalReadSche } /** - * @return true iff every data file carries a {@code numRecords} statistic and none has a deletion vector, i.e. - * exact per-file row offsets can be derived from metadata without reading the data. + * @return true iff every data file carries a {@code numRecords} statistic + * and none has a deletion vector, i.e. exact per-file row offsets + * can be derived from metadata without reading the data. */ public boolean hasExactRowCounts() { - for(int i = 0; i < numRecords.length; i++) - if(numRecords[i] < 0 || hasDeletionVector[i]) + for( int i=0; i scanFileIter = (scan instanceof ScanImpl) ? ((ScanImpl) scan) - .getScanFiles(engine, true) : scan.getScanFiles(engine); + //request the scan files WITH per-file statistics (numRecords) so callers can + //pre-size output and place rows without reading the data; harmless extra + //column for the data-read path. Fall back to the stats-less iterator if the + //concrete scan does not support it. + CloseableIterator scanFileIter = (scan instanceof ScanImpl) + ? 
((ScanImpl) scan).getScanFiles(engine, true) + : scan.getScanFiles(engine); List files = new ArrayList<>(); List recs = new ArrayList<>(); List dvs = new ArrayList<>(); - try(CloseableIterator scanFiles = scanFileIter) { - while(scanFiles.hasNext()) { + try( CloseableIterator scanFiles = scanFileIter ) { + while( scanFiles.hasNext() ) { FilteredColumnarBatch scanFileBatch = scanFiles.next(); - try(CloseableIterator scanFileRows = scanFileBatch.getRows()) { - while(scanFileRows.hasNext()) { + try( CloseableIterator scanFileRows = scanFileBatch.getRows() ) { + while( scanFileRows.hasNext() ) { Row scanFileRow = scanFileRows.next(); files.add(scanFileRow); recs.add(numRecords(scanFileRow)); @@ -348,7 +342,7 @@ public static ScanHandle openScan(Engine engine, String tablePath) throws IOExce } long[] numRecords = new long[recs.size()]; boolean[] hasDv = new boolean[dvs.size()]; - for(int i = 0; i < numRecords.length; i++) { + for( int i=0; i physicalData = engine.getParquetHandler() .readParquetFiles(Utils.singletonCloseableIterator(dataFile), physicalReadSchema, Optional.empty()); - try(CloseableIterator logicalData = Scan.transformPhysicalData(engine, scanState, - scanFileRow, physicalData)) { - while(logicalData.hasNext()) + try( CloseableIterator logicalData = + Scan.transformPhysicalData(engine, scanState, scanFileRow, physicalData) ) + { + while( logicalData.hasNext() ) consumeBatch(logicalData.next(), consumer); } } /** - * Scan the latest snapshot of a Delta table sequentially, invoking the batch consumer for every data batch. The - * consumer is created lazily from the table schema (so callers can size buffers / derive per-column types up - * front). + * Scan the latest snapshot of a Delta table sequentially, invoking the batch + * consumer for every data batch. The consumer is created lazily from the table + * schema (so callers can size buffers / derive per-column types up front). 
* * @param engine delta kernel engine * @param tablePath fully-qualified table path @@ -412,10 +410,11 @@ public static void readScanFile(Engine engine, Row scanState, StructType physica * @throws IOException on read failure */ public static StructType scan(Engine engine, String tablePath, Function consumerFactory) - throws IOException { + throws IOException + { ScanHandle h = openScan(engine, tablePath); BatchConsumer consumer = consumerFactory.apply(h.schema); - for(Row scanFileRow : h.scanFiles) + for( Row scanFileRow : h.scanFiles ) readScanFile(engine, h.scanState, h.physicalReadSchema, scanFileRow, consumer); return h.schema; } @@ -424,27 +423,28 @@ private static void consumeBatch(FilteredColumnarBatch fcb, BatchConsumer consum ColumnarBatch batch = fcb.getData(); int ncol = batch.getSchema().length(); ColumnVector[] cols = new ColumnVector[ncol]; - for(int c = 0; c < ncol; c++) + for( int c=0; c all rows live) + //materialize the deletion/selection mask once (null => all rows live) Optional selVector = fcb.getSelectionVector(); boolean[] selected = null; - if(selVector.isPresent()) { + if( selVector.isPresent() ) { ColumnVector sv = selVector.get(); selected = new boolean[size]; - for(int r = 0; r < size; r++) + for( int r=0; r logicalData) throws IOException { - // replace any existing table at the path (the other SystemDS writers delete - // the output first; the caching layer does not do it on our behalf) + CloseableIterator logicalData) throws IOException + { + //replace any existing table at the path (the other SystemDS writers delete + //the output first; the caching layer does not do it on our behalf) HDFSTool.deleteFileIfExistOnHDFS(tablePath); Table table = Table.forPath(engine, tablePath); - TransactionBuilder txnBuilder = table.createTransactionBuilder(engine, ENGINE_INFO, Operation.CREATE_TABLE) + TransactionBuilder txnBuilder = table + .createTransactionBuilder(engine, ENGINE_INFO, Operation.CREATE_TABLE) .withSchema(engine, schema); 
Transaction txn = txnBuilder.build(engine); Row txnState = txn.getTransactionState(engine); - CloseableIterator physicalData = Transaction.transformLogicalData(engine, txnState, - logicalData, Collections.emptyMap()); - DataWriteContext writeContext = Transaction.getWriteContext(engine, txnState, Collections.emptyMap()); + CloseableIterator physicalData = + Transaction.transformLogicalData(engine, txnState, logicalData, Collections.emptyMap()); + DataWriteContext writeContext = + Transaction.getWriteContext(engine, txnState, Collections.emptyMap()); CloseableIterator dataFiles = engine.getParquetHandler() .writeParquetFiles(writeContext.getTargetDirectory(), physicalData, writeContext.getStatisticsColumns()); - CloseableIterator appendActions = Transaction.generateAppendActions(engine, txnState, dataFiles, - writeContext); + CloseableIterator appendActions = + Transaction.generateAppendActions(engine, txnState, dataFiles, writeContext); txn.commit(engine, CloseableIterable.inMemoryIterable(appendActions)); } }