Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,14 @@ public class CompressionSettings {

public final double[] scaleFactors;

public final boolean preferDeltaEncoding;

protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary,
String transposeInput, int seed, boolean lossy, EnumSet<CompressionType> validCompressions,
boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage,
int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType,
double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) {
double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors,
boolean preferDeltaEncoding) {
this.samplingRatio = samplingRatio;
this.samplePower = samplePower;
this.allowSharedDictionary = allowSharedDictionary;
Expand All @@ -157,6 +160,7 @@ protected CompressionSettings(double samplingRatio, double samplePower, boolean
this.isInSparkInstruction = isInSparkInstruction;
this.sdcSortType = sdcSortType;
this.scaleFactors = scaleFactors;
this.preferDeltaEncoding = preferDeltaEncoding;

if(!printedStatus && LOG.isDebugEnabled()) {
printedStatus = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ public class CompressionSettingsBuilder {
private boolean isInSparkInstruction = false;
private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE;
private double[] scaleFactors = null;
private boolean preferDeltaEncoding = false;

public CompressionSettingsBuilder() {

Expand Down Expand Up @@ -101,6 +102,7 @@ public CompressionSettingsBuilder copySettings(CompressionSettings that) {
this.maxColGroupCoCode = that.maxColGroupCoCode;
this.coCodePercentage = that.coCodePercentage;
this.minimumSampleSize = that.minimumSampleSize;
this.preferDeltaEncoding = that.preferDeltaEncoding;
return this;
}

Expand Down Expand Up @@ -336,6 +338,19 @@ public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) {
return this;
}

/**
 * Enable or disable a preference for delta encoding when estimating compression.
 * <p>
 * With this flag on, the estimator bases its statistics on delta-encoded values
 * rather than the plain (non-delta) encoding statistics.
 *
 * @param preferDeltaEncoding true to bias estimation toward delta encoding
 * @return this builder, for call chaining
 */
public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) {
this.preferDeltaEncoding = preferDeltaEncoding;
return this;
}

/**
* Create the CompressionSettings object to use in the compression.
*
Expand All @@ -345,6 +360,6 @@ public CompressionSettings create() {
return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy,
validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage,
minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction,
sdcSortType, scaleFactors);
sdcSortType, scaleFactors, preferDeltaEncoding);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.DMLCompressionException;
import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
import org.apache.sysds.runtime.compress.colgroup.dictionary.DeltaDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
Expand All @@ -43,6 +44,9 @@
import org.apache.sysds.runtime.compress.colgroup.indexes.RangeIndex;
import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
import org.apache.sysds.runtime.compress.utils.ACount;
import org.apache.sysds.runtime.compress.utils.DblArray;
import org.apache.sysds.runtime.compress.utils.DblArrayCountHashMap;
import org.apache.sysds.runtime.compress.colgroup.offset.AOffsetIterator;
import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
import org.apache.sysds.runtime.compress.colgroup.scheme.DDCScheme;
Expand Down Expand Up @@ -77,7 +81,7 @@ public class ColGroupDDC extends APreAgg implements IMapToDataGroup {

static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;

private ColGroupDDC(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) {
protected ColGroupDDC(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) {
super(colIndexes, dict, cachedCounts);
_data = data;

Expand Down Expand Up @@ -1105,4 +1109,57 @@ protected boolean allowShallowIdentityRightMult() {
return true;
}

/**
 * Convert this DDC column group into a delta-encoded DDC group.
 * <p>
 * Materializes each row's values from the current dictionary via the row-to-dictionary
 * mapping, computes per-column deltas against the previous row (row 0 keeps its raw
 * values), deduplicates the delta rows into a new {@link DeltaDictionary}, and builds a
 * fresh row mapping over the deduplicated delta tuples.
 *
 * @return a {@code ColGroupDeltaDDC} over the same column indexes, or a
 *         {@code ColGroupEmpty} if no delta tuples were produced (only possible when
 *         there are zero rows).
 */
public AColGroup convertToDeltaDDC() {
int numCols = _colIndexes.size();
int numRows = _data.size();

// Deduplication map from delta tuple -> id. Sized at least 64 to avoid tiny initial capacity.
DblArrayCountHashMap map = new DblArrayCountHashMap(Math.max(numRows, 64));
// rowDelta is a reused scratch buffer; dblArray wraps it as the (reused) map key.
double[] rowDelta = new double[numCols];
double[] prevRow = new double[numCols];
DblArray dblArray = new DblArray(rowDelta);
// For each row, the id the map assigned to that row's delta tuple.
int[] rowToDictId = new int[numRows];

double[] dictVals = _dict.getValues();

for(int i = 0; i < numRows; i++) {
// Look up this row's dictionary tuple and compute its delta against the previous row.
int dictIdx = _data.getIndex(i);
int off = dictIdx * numCols;
for(int j = 0; j < numCols; j++) {
double val = dictVals[off + j];
if(i == 0) {
// First row has no predecessor: store the raw value as its "delta".
rowDelta[j] = val;
prevRow[j] = val;
} else {
rowDelta[j] = val - prevRow[j];
prevRow[j] = val;
}
}

// NOTE(review): dblArray wraps the mutated rowDelta buffer. This assumes
// DblArrayCountHashMap.increment copies the key on first insert and that DblArray's
// hash is recomputed (or reset) after the backing array changes — confirm against
// DblArray/DblArrayCountHashMap; a stale cached hash would break deduplication.
rowToDictId[i] = map.increment(dblArray);
}

// Only reachable when numRows == 0, since every iteration inserts a tuple.
if(map.size() == 0)
return new ColGroupEmpty(_colIndexes);

// Flatten the deduplicated delta tuples into a dense dictionary array, remembering
// how the map's ids translate to positions in the new dictionary.
ACount<DblArray>[] vals = map.extractValues();
final int nVals = vals.length;
final double[] dictValues = new double[nVals * numCols];
final int[] oldIdToNewId = new int[map.size()];
int idx = 0;
for(int i = 0; i < nVals; i++) {
final ACount<DblArray> dac = vals[i];
final double[] arrData = dac.key().getData();
System.arraycopy(arrData, 0, dictValues, idx, numCols);
oldIdToNewId[dac.id] = i;
idx += numCols;
}

// Rebuild the row mapping against the new (delta) dictionary ordering.
DeltaDictionary deltaDict = new DeltaDictionary(dictValues, numCols);
AMapToData newData = MapToFactory.create(numRows, nVals);
for(int i = 0; i < numRows; i++) {
newData.set(i, oldIdToNewId[rowToDictId[i]]);
}
return ColGroupDeltaDDC.create(_colIndexes, deltaDict, newData, null);
}

}
Loading
Loading