/*
 * Decompiled with CFR 0.152.
 */
package org.nd4j.linalg.jcublas;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import org.bytedeco.javacpp.BytePointer;
import org.bytedeco.javacpp.DoublePointer;
import org.bytedeco.javacpp.FloatPointer;
import org.bytedeco.javacpp.IntPointer;
import org.bytedeco.javacpp.Loader;
import org.bytedeco.javacpp.LongPointer;
import org.bytedeco.javacpp.Pointer;
import org.bytedeco.javacpp.PointerPointer;
import org.bytedeco.javacpp.ShortPointer;
import org.nd4j.jita.allocator.enums.CudaConstants;
import org.nd4j.jita.allocator.impl.AllocationPoint;
import org.nd4j.jita.allocator.impl.AtomicAllocator;
import org.nd4j.jita.allocator.pointers.CudaPointer;
import org.nd4j.jita.allocator.pointers.cuda.cudaStream_t;
import org.nd4j.jita.allocator.utils.AllocationUtils;
import org.nd4j.jita.conf.CudaEnvironment;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.memory.MemoryWorkspace;
import org.nd4j.linalg.api.memory.enums.MemoryKind;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.api.ops.executioner.GridExecutioner;
import org.nd4j.linalg.api.ops.performance.PerformanceTracker;
import org.nd4j.linalg.api.shape.options.ArrayOptionsHelper;
import org.nd4j.linalg.api.shape.options.ArrayType;
import org.nd4j.linalg.cache.TADManager;
import org.nd4j.linalg.compression.CompressedDataBuffer;
import org.nd4j.linalg.compression.CompressionDescriptor;
import org.nd4j.linalg.compression.CompressionType;
import org.nd4j.linalg.compression.CompressionUtils;
import org.nd4j.linalg.exception.ND4JIllegalStateException;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.jcublas.JCublasNDArray;
import org.nd4j.linalg.jcublas.JCusparseNDArrayCOO;
import org.nd4j.linalg.jcublas.blas.CudaBlas;
import org.nd4j.linalg.jcublas.blas.JcublasLapack;
import org.nd4j.linalg.jcublas.blas.JcublasLevel1;
import org.nd4j.linalg.jcublas.blas.JcublasLevel2;
import org.nd4j.linalg.jcublas.blas.JcublasLevel3;
import org.nd4j.linalg.jcublas.buffer.AddressRetriever;
import org.nd4j.linalg.jcublas.buffer.CudaDoubleDataBuffer;
import org.nd4j.linalg.jcublas.buffer.CudaIntDataBuffer;
import org.nd4j.linalg.jcublas.buffer.CudaLongDataBuffer;
import org.nd4j.linalg.jcublas.context.CudaContext;
import org.nd4j.linalg.memory.MemcpyDirection;
import org.nd4j.linalg.primitives.Pair;
import org.nd4j.linalg.util.ArrayUtil;
import org.nd4j.nativeblas.BaseNativeNDArrayFactory;
import org.nd4j.nativeblas.LongPointerWrapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class JCublasNDArrayFactory
extends BaseNativeNDArrayFactory {
    private static final Logger log = LoggerFactory.getLogger(JCublasNDArrayFactory.class);

    /**
     * Creates a factory using the superclass' no-argument constructor.
     */
    public JCublasNDArrayFactory() {
        super();
    }

    /**
     * Creates a factory, forwarding the data type and the boxed ordering to the superclass.
     *
     * @param dtype data type forwarded to {@link BaseNativeNDArrayFactory}
     * @param order ordering forwarded to {@link BaseNativeNDArrayFactory}
     */
    public JCublasNDArrayFactory(DataBuffer.Type dtype, Character order) {
        super(dtype, order);
    }

    /**
     * Creates a factory, forwarding the data type and the primitive ordering to the superclass.
     *
     * @param dtype data type forwarded to {@link BaseNativeNDArrayFactory}
     * @param order ordering forwarded to {@link BaseNativeNDArrayFactory}
     */
    public JCublasNDArrayFactory(DataBuffer.Type dtype, char order) {
        super(dtype, order);
        // The returned instance is intentionally discarded.
        // NOTE(review): presumably this forces the allocator to initialize early - confirm.
        AtomicAllocator.getInstance();
    }

    /**
     * Instantiates the CUDA BLAS wrapper and passes a table of cuBLAS/cuSOLVER symbol
     * addresses to the native ops.
     * <p>
     * Each address is resolved with {@link Loader#addressof(String)} and stored in a
     * {@link PointerPointer} at the index equal to the symbol's position in the list below.
     */
    public void createBlas() {
        this.blas = new CudaBlas();

        // Position in this array is the slot the symbol's address is stored at.
        final String[] symbols = {
            "cublasSgemv_v2",
            "cublasDgemv_v2",
            "cublasHgemm",
            "cublasSgemm_v2",
            "cublasDgemm_v2",
            "cublasSgemmEx",
            "cublasHgemmBatched",
            "cublasSgemmBatched",
            "cublasDgemmBatched",
            "cusolverDnSgesvd_bufferSize",
            "cusolverDnDgesvd_bufferSize",
            "cusolverDnSgesvd",
            "cusolverDnDgesvd"
        };

        PointerPointer table = new PointerPointer((long) symbols.length);
        for (int slot = 0; slot < symbols.length; slot++) {
            table.put((long) slot, Loader.addressof(symbols[slot]));
        }
        this.nativeOps.initializeFunctions(table);
    }

    /**
     * Installs a fresh {@link JcublasLevel1} instance as this factory's level-1 BLAS.
     */
    public void createLevel1() {
        JcublasLevel1 impl = new JcublasLevel1();
        this.level1 = impl;
    }

    /**
     * Installs a fresh {@link JcublasLevel2} instance as this factory's level-2 BLAS.
     */
    public void createLevel2() {
        JcublasLevel2 impl = new JcublasLevel2();
        this.level2 = impl;
    }

    /**
     * Installs a fresh {@link JcublasLevel3} instance as this factory's level-3 BLAS.
     */
    public void createLevel3() {
        JcublasLevel3 impl = new JcublasLevel3();
        this.level3 = impl;
    }

    /**
     * Installs a fresh {@link JcublasLapack} instance as this factory's LAPACK implementation.
     */
    public void createLapack() {
        JcublasLapack impl = new JcublasLapack();
        this.lapack = impl;
    }

    /**
     * Builds a {@link JCublasNDArray} from a shape and an existing buffer.
     *
     * @param shape  shape passed to the array constructor
     * @param buffer buffer passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(int[] shape, DataBuffer buffer) {
        INDArray created = new JCublasNDArray(shape, buffer);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from a two-dimensional double array.
     *
     * @param data values passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(double[][] data) {
        INDArray created = new JCublasNDArray(data);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from a two-dimensional double array with an explicit ordering.
     *
     * @param data     values passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(double[][] data, char ordering) {
        INDArray created = new JCublasNDArray(data, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} around an existing buffer.
     *
     * @param data buffer passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(DataBuffer data) {
        INDArray created = new JCublasNDArray(data);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} of shape {@code {rows, columns}} over an existing buffer,
     * using the ordering reported by {@link Nd4j#order()}.
     *
     * @param data    buffer passed to the array constructor
     * @param rows    first shape element
     * @param columns second shape element
     * @param stride  strides; converted to {@code long[]} before use
     * @param offset  offset passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(DataBuffer data, long rows, long columns, int[] stride, long offset) {
        long[] shape = {rows, columns};
        long[] longStride = ArrayUtil.toLongArray(stride);
        char defaultOrdering = Nd4j.order().charValue();
        return new JCublasNDArray(data, shape, longStride, offset, defaultOrdering);
    }

    /**
     * Builds a {@link JCublasNDArray} from a shape and an ordering.
     *
     * @param shape    shape passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(int[] shape, char ordering) {
        INDArray created = new JCublasNDArray(shape, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} for the given shape and ordering, passing {@code false}
     * as the constructor's last flag and {@code 0} as the offset.
     * Strides are derived via {@link Nd4j#getStrides}.
     *
     * @param shape    shape of the array
     * @param ordering ordering used for both the strides and the array
     * @return the newly constructed array
     */
    public INDArray createUninitialized(int[] shape, char ordering) {
        int[] strides = Nd4j.getStrides(shape, ordering);
        return new JCublasNDArray(shape, strides, 0L, ordering, false);
    }

    /**
     * Builds a {@link JCublasNDArray} for the given shape and ordering while no workspace is
     * set as current on the memory manager.
     * <p>
     * The current workspace is remembered, replaced by {@code null} for the duration of the
     * array construction, and put back afterwards. The restore happens in a {@code finally}
     * block so that a failure during construction cannot leave the memory manager without
     * its previous workspace.
     *
     * @param shape    shape of the array
     * @param ordering ordering used for both the strides and the array
     * @return the newly constructed array
     */
    public INDArray createUninitializedDetached(int[] shape, char ordering) {
        MemoryWorkspace previous = Nd4j.getMemoryManager().getCurrentWorkspace();
        Nd4j.getMemoryManager().setCurrentWorkspace(null);
        try {
            return new JCublasNDArray(shape, Nd4j.getStrides(shape, ordering), 0L, ordering, false);
        } finally {
            // Always restore the caller's workspace, even if the allocation above throws.
            Nd4j.getMemoryManager().setCurrentWorkspace(previous);
        }
    }

    /**
     * Builds a {@link JCublasNDArray} over an existing buffer with explicit shape, strides,
     * offset and ordering.
     *
     * @param data      buffer passed to the array constructor
     * @param newShape  shape passed to the array constructor
     * @param newStride strides passed to the array constructor
     * @param offset    offset passed to the array constructor
     * @param ordering  ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(DataBuffer data, int[] newShape, int[] newStride, long offset, char ordering) {
        INDArray created = new JCublasNDArray(data, newShape, newStride, offset, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from float data, a shape, an offset and a boxed ordering.
     *
     * @param data   values passed to the array constructor
     * @param shape  shape passed to the array constructor
     * @param offset offset passed to the array constructor
     * @param order  ordering; unboxed before use, so it must not be {@code null}
     * @return the newly constructed array
     */
    public INDArray create(float[] data, int[] shape, long offset, Character order) {
        char ordering = order.charValue();
        return new JCublasNDArray(data, shape, offset, ordering);
    }

    /**
     * Builds a {@link JCublasNDArray} of shape {@code {rows, columns}} from float data.
     *
     * @param data     values passed to the array constructor
     * @param rows     first shape element
     * @param columns  second shape element
     * @param stride   strides; converted to {@code long[]} before use
     * @param offset   offset passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(float[] data, long rows, long columns, int[] stride, long offset, char ordering) {
        long[] shape = {rows, columns};
        long[] longStride = ArrayUtil.toLongArray(stride);
        return new JCublasNDArray(data, shape, longStride, offset, ordering);
    }

    /**
     * Builds a {@link JCublasNDArray} from double data, an {@code int[]} shape and an ordering.
     *
     * @param data     values passed to the array constructor
     * @param shape    shape passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(double[] data, int[] shape, char ordering) {
        INDArray created = new JCublasNDArray(data, shape, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from double data, a {@code long[]} shape and an ordering.
     *
     * @param data     values passed to the array constructor
     * @param shape    shape passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(double[] data, long[] shape, char ordering) {
        INDArray created = new JCublasNDArray(data, shape, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from a list of arrays, a shape and an ordering.
     *
     * @param list     arrays passed to the array constructor
     * @param shape    shape passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(List<INDArray> list, int[] shape, char ordering) {
        INDArray created = new JCublasNDArray(list, shape, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from double data, a shape and an offset, using this
     * factory's ordering.
     * <p>
     * The earlier implementation narrowed {@code offset} to {@code char} and passed it as the
     * array ordering, which both dropped the offset and produced a meaningless ordering.
     * The offset is now forwarded as an offset; strides are derived from the shape and the
     * factory ordering via {@link Nd4j#getStrides}.
     *
     * @param data   values passed to the array constructor
     * @param shape  shape passed to the array constructor
     * @param offset offset passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(double[] data, int[] shape, long offset) {
        char ordering = this.order;
        int[] strides = Nd4j.getStrides(shape, ordering);
        return new JCublasNDArray(data, shape, strides, offset, ordering);
    }

    /**
     * Builds a {@link JCublasNDArray} from double data with explicit shape, strides, offset
     * and ordering.
     *
     * @param data     values passed to the array constructor
     * @param shape    shape passed to the array constructor
     * @param stride   strides passed to the array constructor
     * @param offset   offset passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(double[] data, int[] shape, int[] stride, long offset, char ordering) {
        INDArray created = new JCublasNDArray(data, shape, stride, offset, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from float data with explicit shape, strides and offset.
     *
     * @param data   values passed to the array constructor
     * @param shape  shape passed to the array constructor
     * @param stride strides passed to the array constructor
     * @param offset offset passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(float[] data, int[] shape, int[] stride, long offset) {
        INDArray created = new JCublasNDArray(data, shape, stride, offset);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from double data with explicit shape, strides and offset.
     *
     * @param data   values passed to the array constructor
     * @param shape  shape passed to the array constructor
     * @param stride strides passed to the array constructor
     * @param offset offset passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(double[] data, int[] shape, int[] stride, long offset) {
        INDArray created = new JCublasNDArray(data, shape, stride, offset);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} over an existing buffer with the given shape.
     *
     * @param data  buffer passed to the array constructor
     * @param shape shape passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(DataBuffer data, int[] shape) {
        INDArray created = new JCublasNDArray(data, shape);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} over an existing buffer with explicit shape, strides
     * and offset.
     *
     * @param data   buffer passed to the array constructor
     * @param shape  shape passed to the array constructor
     * @param stride strides passed to the array constructor
     * @param offset offset passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(DataBuffer data, int[] shape, int[] stride, long offset) {
        INDArray created = new JCublasNDArray(data, shape, stride, offset);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from a list of arrays and a shape.
     * <p>
     * When this factory's ordering is {@code 'f'}, Fortran strides computed from the shape
     * are passed to the constructor as well; otherwise only the list and shape are passed.
     *
     * @param list  arrays passed to the array constructor
     * @param shape shape passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(List<INDArray> list, int[] shape) {
        if (this.order != 'f') {
            return new JCublasNDArray(list, shape);
        }
        int[] fortranStrides = ArrayUtil.calcStridesFortran(shape);
        return new JCublasNDArray(list, shape, fortranStrides);
    }

    /**
     * Builds a {@link JCublasNDArray} from float data, a shape and an offset.
     *
     * @param data   values passed to the array constructor
     * @param shape  shape passed to the array constructor
     * @param offset offset passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(float[] data, int[] shape, long offset) {
        INDArray created = new JCublasNDArray(data, shape, offset);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from a two-dimensional float array.
     *
     * @param floats values passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(float[][] floats) {
        INDArray created = new JCublasNDArray(floats);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from a two-dimensional float array with an explicit ordering.
     *
     * @param data     values passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(float[][] data, char ordering) {
        INDArray created = new JCublasNDArray(data, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} from float data with explicit shape, strides, offset
     * and ordering.
     *
     * @param data     values passed to the array constructor
     * @param shape    shape passed to the array constructor
     * @param stride   strides passed to the array constructor
     * @param offset   offset passed to the array constructor
     * @param ordering ordering passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(float[] data, int[] shape, int[] stride, long offset, char ordering) {
        INDArray created = new JCublasNDArray(data, shape, stride, offset, ordering);
        return created;
    }

    /**
     * Builds a {@link JCublasNDArray} over an existing buffer with the given shape and offset.
     *
     * @param buffer buffer passed to the array constructor
     * @param shape  shape passed to the array constructor
     * @param offset offset passed to the array constructor
     * @return the newly constructed array
     */
    public INDArray create(DataBuffer buffer, int[] shape, long offset) {
        INDArray created = new JCublasNDArray(buffer, shape, offset);
        return created;
    }

    /**
     * Flattens the given arrays using this factory's ordering.
     *
     * @param matrices arrays to flatten
     * @return the result of {@link #toFlattened(char, Collection)} with {@code this.order()}
     */
    public INDArray toFlattened(Collection<INDArray> matrices) {
        char factoryOrder = this.order();
        return this.toFlattened(factoryOrder, matrices);
    }

    /**
     * Flattens the given arrays, in iteration order, into a single {@code 1 x N} array with
     * the requested ordering, where {@code N} is the sum of the input lengths.
     * <p>
     * An input whose ordering equals {@code order} and whose element-wise stride, like the
     * result's, is 1 is copied with {@code memcpyAsync} from its host pointer. Every other
     * input goes through the native {@code flattenDouble}/{@code flattenFloat}/{@code flattenHalf}
     * call selected by the input's data type.
     * <p>
     * Compared to the earlier implementation, the destination byte offset for the memcpy path
     * is computed in {@code long} arithmetic (it used to overflow {@code int} for large
     * results), and a combined length that does not fit into an {@code int} is rejected
     * instead of being silently truncated.
     *
     * @param order    ordering of the resulting array
     * @param matrices arrays to flatten
     * @return the flattened array
     * @throws ND4JIllegalStateException if the combined length exceeds {@link Integer#MAX_VALUE}
     */
    public INDArray toFlattened(char order, Collection<INDArray> matrices) {
        if (Nd4j.getExecutioner() instanceof GridExecutioner) {
            ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
        }

        long totalLength = 0L;
        for (INDArray m : matrices) {
            totalLength += m.length();
        }
        if (totalLength > Integer.MAX_VALUE) {
            throw new ND4JIllegalStateException("Can't flatten arrays: total length " + totalLength + " exceeds Integer.MAX_VALUE");
        }

        INDArray ret = Nd4j.create(new int[]{1, (int) totalLength}, order);
        // Number of elements already written into ret; bounded by totalLength, so it fits an int.
        int linearIndex = 0;
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        for (INDArray m : matrices) {
            CudaContext context = allocator.getFlowController().prepareAction(ret, m);
            DataBuffer.Type dtype = m.data().dataType();
            if (m.ordering() == order && ret.elementWiseStride() == m.elementWiseStride() && ret.elementWiseStride() == 1) {
                // Element width in bytes: 8 for DOUBLE, 4 for FLOAT, 2 for anything else.
                long elementSize = dtype == DataBuffer.Type.DOUBLE ? 8L : (dtype == DataBuffer.Type.FLOAT ? 4L : 2L);
                // long arithmetic: linearIndex * elementSize can exceed Integer.MAX_VALUE.
                long dstOffsetBytes = (long) linearIndex * elementSize;
                allocator.memcpyAsync(ret.data(), new CudaPointer(allocator.getHostPointer(m).address()), AllocationUtils.getRequiredMemory(AllocationUtils.buildAllocationShape(m)), dstOffsetBytes);
            } else {
                Pointer hostYShapeInfo = AddressRetriever.retrieveHostPointer(m.shapeInfoDataBuffer());
                PointerPointer extras = new PointerPointer(new Pointer[]{AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), context.getOldStream(), allocator.getDeviceIdPointer(), context.getBufferAllocation(), context.getBufferReduction(), context.getBufferScalar(), context.getBufferSpecial(), hostYShapeInfo, AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer())});
                if (dtype == DataBuffer.Type.DOUBLE) {
                    this.nativeOps.flattenDouble(extras, linearIndex, order, (DoublePointer) allocator.getPointer(ret, context), (LongPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (DoublePointer) allocator.getPointer(m, context), (LongPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
                } else if (dtype == DataBuffer.Type.FLOAT) {
                    this.nativeOps.flattenFloat(extras, linearIndex, order, (FloatPointer) allocator.getPointer(ret, context), (LongPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (FloatPointer) allocator.getPointer(m, context), (LongPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
                } else {
                    this.nativeOps.flattenHalf(extras, linearIndex, order, (ShortPointer) allocator.getPointer(ret, context), (LongPointer) allocator.getPointer(ret.shapeInfoDataBuffer(), context), (ShortPointer) allocator.getPointer(m, context), (LongPointer) allocator.getPointer(m.shapeInfoDataBuffer(), context));
                }
            }
            linearIndex = (int) (linearIndex + m.length());
            allocator.registerAction(context, ret, m);
        }
        return ret;
    }

    /**
     * Concatenates the given arrays along {@code dimension} using the native
     * {@code concatDouble}/{@code concatFloat}/{@code concatHalf} call selected by the
     * result's data type.
     * <p>
     * Compressed inputs are decompressed in place first. The result has the shape of the first
     * input with the size along {@code dimension} replaced by the sum of all inputs' sizes
     * along that dimension, and uses the ordering reported by {@link Nd4j#order()}. A single
     * input is returned as-is (not copied).
     * <p>
     * Compared to the earlier implementation: the summed size is kept as a {@code long}
     * (it was truncated to {@code int} although the shape is {@code long[]}), the redundant
     * second accumulation of that sum was removed, shapes are validated before the output is
     * allocated and the device action is prepared, and an empty input is rejected explicitly.
     *
     * @param dimension dimension to concatenate along
     * @param toConcat  arrays to concatenate
     * @return the concatenated array, or {@code toConcat[0]} when only one array is given
     * @throws IllegalArgumentException if no arrays are given, or if an input differs from the
     *                                  first input in any dimension other than {@code dimension}
     */
    public INDArray concat(int dimension, INDArray ... toConcat) {
        if (toConcat == null || toConcat.length == 0) {
            throw new IllegalArgumentException("No arrays were provided for concatenation");
        }
        if (Nd4j.getExecutioner() instanceof GridExecutioner) {
            ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
        }
        if (toConcat.length == 1) {
            return toConcat[0];
        }

        long sumAlongDim = 0L;
        for (int i = 0; i < toConcat.length; ++i) {
            if (toConcat[i].isCompressed()) {
                Nd4j.getCompressor().decompressi(toConcat[i]);
            }
            sumAlongDim += toConcat[i].size(dimension);
        }

        long[] outputShape = ArrayUtil.copy(toConcat[0].shape());
        // Validate up front so a bad input fails before anything is allocated or prepared.
        for (int i = 0; i < toConcat.length; ++i) {
            for (int j = 0; j < toConcat[i].rank(); ++j) {
                if (j == dimension || toConcat[i].size(j) == outputShape[j]) continue;
                throw new IllegalArgumentException("Illegal concatenation at array " + i + " and shape element " + j);
            }
        }
        outputShape[dimension] = sumAlongDim;

        INDArray ret = Nd4j.createUninitialized(outputShape, Nd4j.order().charValue());
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = allocator.getFlowController().prepareAction(ret, toConcat);

        // Raw addresses gathered per input; later copied into device-visible buffers.
        long[] shapeInfoPointers = new long[toConcat.length];
        long[] dataPointers = new long[toConcat.length];
        long[] tadPointers = new long[toConcat.length];
        long[] offsetsPointers = new long[toConcat.length];
        long[] hostShapeInfoPointers = new long[toConcat.length];
        TADManager tadManager = Nd4j.getExecutioner().getTADManager();
        for (int i = 0; i < toConcat.length; ++i) {
            shapeInfoPointers[i] = AddressRetriever.retrieveDeviceAddress(toConcat[i].shapeInfoDataBuffer(), context);
            dataPointers[i] = allocator.getPointer(toConcat[i], context).address();
            hostShapeInfoPointers[i] = allocator.getHostPointer(toConcat[i].shapeInfoDataBuffer()).address();
            Pair tadBuffers = tadManager.getTADOnlyShapeInfo(toConcat[i], new int[]{dimension});
            long devTadShapeInfo = allocator.getPointer((DataBuffer) tadBuffers.getFirst(), context).address();
            DataBuffer offsets = (DataBuffer) tadBuffers.getSecond();
            long devTadOffsets = allocator.getPointer(offsets, context).address();
            tadPointers[i] = devTadShapeInfo;
            offsetsPointers[i] = devTadOffsets;
        }

        Pair zBuffers = tadManager.getTADOnlyShapeInfo(ret, new int[]{dimension});
        Pointer dZ = allocator.getPointer(ret, context);
        Pointer dZShapeInfo = AddressRetriever.retrieveDevicePointer(ret.shapeInfoDataBuffer(), context);

        // Double buffers are used purely as 8-byte-per-element storage for the address tables.
        CudaDoubleDataBuffer tempData = new CudaDoubleDataBuffer(toConcat.length);
        CudaDoubleDataBuffer tempShapes = new CudaDoubleDataBuffer(toConcat.length);
        CudaDoubleDataBuffer tempTAD = new CudaDoubleDataBuffer(toConcat.length);
        CudaDoubleDataBuffer tempOffsets = new CudaDoubleDataBuffer(toConcat.length);
        allocator.memcpyBlocking(tempData, (Pointer) new LongPointer(dataPointers), dataPointers.length * 8, 0L);
        allocator.memcpyBlocking(tempShapes, (Pointer) new LongPointer(shapeInfoPointers), shapeInfoPointers.length * 8, 0L);
        allocator.memcpyBlocking(tempTAD, (Pointer) new LongPointer(tadPointers), tadPointers.length * 8, 0L);
        allocator.memcpyBlocking(tempOffsets, (Pointer) new LongPointer(offsetsPointers), offsetsPointers.length * 8, 0L);
        Pointer dataPointer = allocator.getPointer(tempData, context);
        Pointer shapesPointer = allocator.getPointer(tempShapes, context);
        Pointer tadPointer = allocator.getPointer(tempTAD, context);
        Pointer offsetPointer = allocator.getPointer(tempOffsets, context);

        PointerPointer extras = new PointerPointer(new Pointer[]{AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), context.getOldStream(), allocator.getDeviceIdPointer(), context.getBufferAllocation(), context.getBufferReduction(), context.getBufferScalar(), context.getBufferSpecial(), AddressRetriever.retrieveHostPointer(toConcat[0].shapeInfoDataBuffer()), AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), new LongPointer(hostShapeInfoPointers), allocator.getPointer((DataBuffer) zBuffers.getFirst(), context), allocator.getPointer((DataBuffer) zBuffers.getSecond(), context)});
        if (ret.data().dataType() == DataBuffer.Type.DOUBLE) {
            this.nativeOps.concatDouble(extras, dimension, toConcat.length, new PointerPointer(new Pointer[]{dataPointer}), new PointerPointer(new Pointer[]{shapesPointer}), (DoublePointer) dZ, (LongPointer) dZShapeInfo, new PointerPointer(new Pointer[]{tadPointer}), new PointerPointer(new Pointer[]{offsetPointer}));
        } else if (ret.data().dataType() == DataBuffer.Type.FLOAT) {
            this.nativeOps.concatFloat(extras, dimension, toConcat.length, new PointerPointer(new Pointer[]{dataPointer}), new PointerPointer(new Pointer[]{shapesPointer}), (FloatPointer) dZ, (LongPointer) dZShapeInfo, new PointerPointer(new Pointer[]{tadPointer}), new PointerPointer(new Pointer[]{offsetPointer}));
        } else {
            this.nativeOps.concatHalf(extras, dimension, toConcat.length, new PointerPointer(new Pointer[]{dataPointer}), new PointerPointer(new Pointer[]{shapesPointer}), (ShortPointer) dZ, (LongPointer) dZShapeInfo, new PointerPointer(new Pointer[]{tadPointer}), new PointerPointer(new Pointer[]{offsetPointer}));
        }
        allocator.registerAction(context, ret, toConcat);
        return ret;
    }

    /**
     * Concatenates the given arrays along {@code dimension} through the native
     * {@code specialConcat*} call, working on host pointers, and then copies the result
     * from host to device on the context's special stream.
     * <p>
     * Compressed inputs are decompressed in place and every input's host data is synchronized
     * before its host pointers are collected. A single input is returned as-is.
     *
     * @param dimension dimension to concatenate along
     * @param toConcat  arrays to concatenate
     * @return the concatenated array, or {@code toConcat[0]} when only one array is given
     * @throws IllegalArgumentException  if an input differs from the first input in any
     *                                   dimension other than {@code dimension}
     * @throws ND4JIllegalStateException if the result's data type is not DOUBLE, FLOAT or HALF
     */
    public INDArray specialConcat(int dimension, INDArray ... toConcat) {
        if (toConcat.length == 1) {
            return toConcat[0];
        }
        if (Nd4j.getExecutioner() instanceof GridExecutioner) {
            ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
        }

        final int count = toConcat.length;
        PointerPointer hostShapeInfos = new PointerPointer((long) count);
        PointerPointer hostBuffers = new PointerPointer((long) count);
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = (CudaContext) allocator.getDeviceContext().getContext();

        int concatSize = 0;
        long[] resultShape = ArrayUtil.copy(toConcat[0].shape());
        for (int idx = 0; idx < count; idx++) {
            INDArray current = toConcat[idx];
            if (current.isCompressed()) {
                Nd4j.getCompressor().decompressi(current);
            }
            allocator.synchronizeHostData(current);
            hostShapeInfos.put((long) idx, allocator.getHostPointer(current.shapeInfoDataBuffer()));
            hostBuffers.put((long) idx, allocator.getHostPointer(current.data()));
            // Compound assignment narrows the long sum back to int.
            concatSize += current.size(dimension);
            for (int axis = 0; axis < current.rank(); axis++) {
                if (axis != dimension && current.size(axis) != resultShape[axis]) {
                    throw new IllegalArgumentException("Illegal concatenation at array " + idx + " and shape element " + axis);
                }
            }
        }
        resultShape[dimension] = concatSize;

        PointerPointer dummy = new PointerPointer(new Pointer[]{null});
        INDArray ret = Nd4j.createUninitialized(resultShape, Nd4j.order().charValue());
        DataBuffer.Type dtype = ret.data().dataType();
        if (dtype == DataBuffer.Type.DOUBLE) {
            this.nativeOps.specialConcatDouble(dummy, dimension, count, hostBuffers, hostShapeInfos, (DoublePointer) ret.data().addressPointer(), (LongPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[]{null}), new PointerPointer(new Pointer[]{null}));
        } else if (dtype == DataBuffer.Type.FLOAT) {
            this.nativeOps.specialConcatFloat(dummy, dimension, count, hostBuffers, hostShapeInfos, (FloatPointer) ret.data().addressPointer(), (LongPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[]{null}), new PointerPointer(new Pointer[]{null}));
        } else if (dtype == DataBuffer.Type.HALF) {
            this.nativeOps.specialConcatHalf(dummy, dimension, count, hostBuffers, hostShapeInfos, (ShortPointer) ret.data().addressPointer(), (LongPointer) ret.shapeInfoDataBuffer().addressPointer(), new PointerPointer(new Pointer[]{null}), new PointerPointer(new Pointer[]{null}));
        } else {
            throw new ND4JIllegalStateException("Unknown dataType: " + ret.data().dataType());
        }

        // Push the host-side result to the device and wait for the copy to finish.
        AllocationPoint point = allocator.getAllocationPoint(ret);
        long perfD = PerformanceTracker.getInstance().helperStartTransaction();
        long byteCount = ret.lengthLong() * (long) Nd4j.sizeOfDataType(ret.data().dataType());
        this.nativeOps.memcpyAsync(point.getDevicePointer(), point.getHostPointer(), byteCount, CudaConstants.cudaMemcpyHostToDevice, (Pointer) context.getSpecialStream());
        context.getSpecialStream().synchronize();
        PerformanceTracker.getInstance().helperRegisterTransaction(point.getDeviceId(), perfD, point.getNumberOfBytes(), MemcpyDirection.HOST_TO_DEVICE);
        point.tickHostRead();
        point.tickDeviceWrite();
        return ret;
    }

    /**
     * Pulls rows using the ordering reported by {@link Nd4j#order()}.
     *
     * @param source          array to pull from
     * @param sourceDimension dimension argument forwarded to the ordering-aware overload
     * @param indexes         indexes forwarded to the ordering-aware overload
     * @return the result of {@link #pullRows(INDArray, int, int[], char)}
     */
    public INDArray pullRows(INDArray source, int sourceDimension, int[] indexes) {
        char defaultOrdering = Nd4j.order().charValue();
        return this.pullRows(source, sourceDimension, indexes, defaultOrdering);
    }

    /**
     * Pulls rows given {@code long} indexes by converting them with {@link ArrayUtil#toInts}
     * and delegating to the {@code int[]} overload.
     *
     * @param source          array to pull from
     * @param sourceDimension dimension argument forwarded to the {@code int[]} overload
     * @param indexes         indexes; converted to {@code int[]} before use
     * @return the result of {@link #pullRows(INDArray, int, int[])}
     */
    public INDArray pullRows(INDArray source, int sourceDimension, long[] indexes) {
        int[] intIndexes = ArrayUtil.toInts(indexes);
        return this.pullRows(source, sourceDimension, intIndexes);
    }

    /**
     * Allocates an uninitialized destination with the given ordering and delegates to
     * {@link #pullRows(INDArray, INDArray, int, int[])}.
     * <p>
     * The destination shape is {@code {indexes.length, source.shape()[1]}} when
     * {@code sourceDimension} is 1 and {@code {source.shape()[0], indexes.length}} when it is 0.
     *
     * @param source          array to pull from
     * @param sourceDimension must be 0 or 1
     * @param indexes         indexes to pull; must be non-null and non-empty
     * @param order           ordering of the destination array
     * @return the array returned by the delegate
     * @throws IllegalStateException         if {@code indexes} is null or empty
     * @throws UnsupportedOperationException if {@code sourceDimension} is neither 0 nor 1
     */
    public INDArray pullRows(INDArray source, int sourceDimension, int[] indexes, char order) {
        if (indexes == null || indexes.length < 1) {
            throw new IllegalStateException("Indexes can't be null or zero-length");
        }

        final long[] destinationShape;
        switch (sourceDimension) {
            case 1:
                destinationShape = new long[]{indexes.length, source.shape()[sourceDimension]};
                break;
            case 0:
                destinationShape = new long[]{source.shape()[sourceDimension], indexes.length};
                break;
            default:
                throw new UnsupportedOperationException("2D input is expected");
        }

        INDArray destination = Nd4j.createUninitialized(destinationShape, order);
        return this.pullRows(source, destination, sourceDimension, indexes);
    }

    /**
     * Pulls the requested indexes from {@code source} into {@code destination} through the
     * native {@code pullRowsDouble}/{@code pullRowsFloat}/{@code pullRowsHalf} call selected
     * by the destination's data type.
     * <p>
     * The expected destination shape is {@code {indexes.length, source.shape()[1]}} when
     * {@code sourceDimension} is 1 and {@code {source.shape()[0], indexes.length}} when it is 0.
     * When {@code destination} is {@code null}, an uninitialized array of that shape is
     * created with this factory's ordering.
     *
     * @param source          array to pull from
     * @param destination     array to write into, or {@code null} to allocate one
     * @param sourceDimension must be 0 or 1
     * @param indexes         indexes to pull; must be non-null and non-empty
     * @return the destination array
     * @throws IllegalStateException         if {@code indexes} is null or empty, or if a
     *                                       non-null destination has an unexpected shape
     * @throws UnsupportedOperationException if {@code sourceDimension} is neither 0 nor 1
     */
    public INDArray pullRows(INDArray source, INDArray destination, int sourceDimension, int[] indexes) {
        if (Nd4j.getExecutioner() instanceof GridExecutioner) {
            ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
        }
        if (indexes == null || indexes.length < 1) {
            throw new IllegalStateException("Indexes can't be null or zero-length");
        }

        final long[] expectedShape;
        if (sourceDimension == 1) {
            expectedShape = new long[]{indexes.length, source.shape()[sourceDimension]};
        } else if (sourceDimension == 0) {
            expectedShape = new long[]{source.shape()[sourceDimension], indexes.length};
        } else {
            throw new UnsupportedOperationException("2D input is expected");
        }

        final INDArray ret;
        if (destination == null) {
            ret = Nd4j.createUninitialized(expectedShape, this.order);
        } else {
            if (!Arrays.equals(expectedShape, destination.shape())) {
                throw new IllegalStateException("Cannot pull rows into destination array: expected destination array of shape " + Arrays.toString(expectedShape) + " but got destination array of shape " + Arrays.toString(destination.shape()));
            }
            ret = destination;
        }

        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = allocator.getFlowController().prepareAction(ret, source);

        Pointer x = allocator.getPointer(source, context);
        Pointer xShape = allocator.getPointer(source.shapeInfoDataBuffer(), context);
        Pointer z = allocator.getPointer(ret, context);
        Pointer zShape = allocator.getPointer(ret.shapeInfoDataBuffer(), context);
        PointerPointer extras = new PointerPointer(new Pointer[]{AddressRetriever.retrieveHostPointer(ret.shapeInfoDataBuffer()), context.getOldStream(), allocator.getDeviceIdPointer()});

        // Stage the indexes (widened to long) in a buffer the native call can read.
        final int indexCount = indexes.length;
        CudaLongDataBuffer indexBuffer = new CudaLongDataBuffer(indexCount);
        allocator.memcpyBlocking(indexBuffer, (Pointer) new LongPointer(ArrayUtil.toLongArray(indexes)), indexCount * 8, 0L);
        Pointer pIndex = allocator.getPointer(indexBuffer, context);

        TADManager tadManager = Nd4j.getExecutioner().getTADManager();
        Pair sourceTad = tadManager.getTADOnlyShapeInfo(source, new int[]{sourceDimension});
        Pair resultTad = tadManager.getTADOnlyShapeInfo(ret, new int[]{sourceDimension});
        Pointer tadShapeInfo = allocator.getPointer((DataBuffer) sourceTad.getFirst(), context);
        Pointer zTadShapeInfo = allocator.getPointer((DataBuffer) resultTad.getFirst(), context);
        DataBuffer sourceOffsets = (DataBuffer) sourceTad.getSecond();
        Pointer tadOffsets = allocator.getPointer(sourceOffsets, context);
        Pointer zTadOffsets = allocator.getPointer((DataBuffer) resultTad.getSecond(), context);

        DataBuffer.Type dtype = ret.data().dataType();
        if (dtype == DataBuffer.Type.DOUBLE) {
            this.nativeOps.pullRowsDouble(extras, (DoublePointer) x, (LongPointer) xShape, (DoublePointer) z, (LongPointer) zShape, (long) indexCount, (LongPointer) pIndex, (LongPointer) tadShapeInfo, (LongPointer) new LongPointerWrapper(tadOffsets), (LongPointer) zTadShapeInfo, (LongPointer) new LongPointerWrapper(zTadOffsets));
        } else if (dtype == DataBuffer.Type.FLOAT) {
            this.nativeOps.pullRowsFloat(extras, (FloatPointer) x, (LongPointer) xShape, (FloatPointer) z, (LongPointer) zShape, (long) indexCount, (LongPointer) pIndex, (LongPointer) tadShapeInfo, (LongPointer) new LongPointerWrapper(tadOffsets), (LongPointer) zTadShapeInfo, (LongPointer) new LongPointerWrapper(zTadOffsets));
        } else {
            this.nativeOps.pullRowsHalf(extras, (ShortPointer) x, (LongPointer) xShape, (ShortPointer) z, (LongPointer) zShape, (long) indexCount, (LongPointer) pIndex, (LongPointer) tadShapeInfo, (LongPointer) new LongPointerWrapper(tadOffsets), (LongPointer) zTadShapeInfo, (LongPointer) new LongPointerWrapper(zTadOffsets));
        }

        allocator.registerAction(context, ret, source);
        return ret;
    }

    /**
     * Runs the native {@code accumulateDouble}/{@code accumulateFloat}/{@code accumulateHalf}
     * call, selected by the target's data type, over {@code arrays}, with {@code target} as
     * the output.
     * <p>
     * With a single input the method simply assigns it to {@code target}. Otherwise one of
     * two paths is taken:
     * <ul>
     *   <li>when cross-device access is allowed by the configuration and P2P is available,
     *       device pointers of the inputs and the target are passed to the native call;</li>
     *   <li>otherwise the executioner is committed, inputs are auto-decompressed, host
     *       pointers are passed to the native call, and the target is ticked as written on
     *       the host.</li>
     * </ul>
     * NOTE(review): the validation messages mention "averaging" although this is the
     * accumulate path; they are kept unchanged here.
     *
     * @param target array receiving the result
     * @param arrays input arrays; each must have element-wise stride 1 and the target's length
     * @return {@code target} (or the result of {@code target.assign(arrays[0])} for one input)
     * @throws RuntimeException          if {@code arrays} is null or empty
     * @throws ND4JIllegalStateException if an input has an element-wise stride other than 1
     *                                   or a length different from the target's
     */
    public INDArray accumulate(INDArray target, INDArray ... arrays) {
        if (arrays == null || arrays.length == 0) {
            throw new RuntimeException("Input arrays are missing");
        }
        if (arrays.length == 1) {
            return target.assign(arrays[0]);
        }

        final int count = arrays.length;
        boolean useDevicePath = CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && this.nativeOps.isP2PAvailable();
        if (useDevicePath) {
            Nd4j.getExecutioner().push();
            long expectedLength = target.lengthLong();
            AtomicAllocator allocator = AtomicAllocator.getInstance();
            CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
            PointerPointer extras = new PointerPointer(new Pointer[]{null, context.getOldStream(), allocator.getDeviceIdPointer(), new CudaPointer(0L)});
            Pointer z = allocator.getPointer(target, context);

            long[] deviceAddresses = new long[count];
            for (int idx = 0; idx < count; idx++) {
                INDArray input = arrays[idx];
                if (input.elementWiseStride() != 1) {
                    throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
                }
                if (input.lengthLong() != expectedLength) {
                    throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
                }
                AllocationPoint point = allocator.getAllocationPoint(input);
                deviceAddresses[idx] = point.getPointers().getDevicePointer().address();
                point.tickDeviceWrite();
            }

            // A double buffer serves as 8-byte-per-element storage for the address table.
            CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(count);
            allocator.memcpyBlocking(tempX, (Pointer) new LongPointer(deviceAddresses), deviceAddresses.length * 8, 0L);
            PointerPointer x = new PointerPointer(allocator.getPointer(tempX, context));

            DataBuffer.Type dtype = target.data().dataType();
            if (dtype == DataBuffer.Type.DOUBLE) {
                this.nativeOps.accumulateDouble(extras, x, (DoublePointer) z, count, expectedLength);
            } else if (dtype == DataBuffer.Type.FLOAT) {
                this.nativeOps.accumulateFloat(extras, x, (FloatPointer) z, count, expectedLength);
            } else {
                this.nativeOps.accumulateHalf(extras, x, (ShortPointer) z, count, expectedLength);
            }
            allocator.getFlowController().registerAction(context, target, arrays);
            // NOTE(review): result unused; presumably keeps tempX reachable past the native call - confirm.
            tempX.address();
            return target;
        }

        long expectedLength = target.lengthLong();
        Nd4j.getExecutioner().commit();
        CudaContext context = (CudaContext) AtomicAllocator.getInstance().getDeviceContext().getContext();
        PointerPointer hostInputs = new PointerPointer((long) count);
        PointerPointer extras = new PointerPointer(new Pointer[]{null, context.getOldStream(), AtomicAllocator.getInstance().getDeviceIdPointer(), new CudaPointer(1L)});
        for (int idx = 0; idx < count; idx++) {
            INDArray input = arrays[idx];
            Nd4j.getCompressor().autoDecompress(input);
            if (input.elementWiseStride() != 1) {
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            }
            if (input.lengthLong() != expectedLength) {
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            }
            hostInputs.put((long) idx, AtomicAllocator.getInstance().getHostPointer(input));
        }

        DataBuffer.Type dtype = target.data().dataType();
        if (dtype == DataBuffer.Type.DOUBLE) {
            this.nativeOps.accumulateDouble(extras, hostInputs, (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), count, expectedLength);
        } else if (dtype == DataBuffer.Type.FLOAT) {
            this.nativeOps.accumulateFloat(extras, hostInputs, (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), count, expectedLength);
        } else {
            this.nativeOps.accumulateHalf(extras, hostInputs, (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), count, expectedLength);
        }
        AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
        return target;
    }

    /**
     * Averages {@code arrays} element-wise through the native averaging routines.
     *
     * <p>Two execution paths exist:
     * <ul>
     *   <li>when peer-to-peer access is available and cross-device access is allowed by the
     *       CUDA configuration, device pointers of all inputs are handed to the native op;</li>
     *   <li>otherwise the inputs are auto-decompressed and their host pointers are used.</li>
     * </ul>
     * Both paths require every input to have an element-wise stride of 1 and the same length
     * as {@code target} (or as {@code arrays[0]} when {@code target} is {@code null}).
     * The native routine is selected from the data type of {@code arrays[0]}.
     *
     * @param target array receiving the average; may be {@code null}
     * @param arrays arrays to average; must be non-null and non-empty
     * @return {@code target} (which is {@code null} when no target was supplied)
     * @throws RuntimeException if {@code arrays} is {@code null} or empty
     * @throws ND4JIllegalStateException if an input is not contiguous or has a different length
     */
    public INDArray average(INDArray target, INDArray[] arrays) {
        if (arrays == null || arrays.length == 0) {
            throw new RuntimeException("Input arrays are missing");
        }
        if (arrays.length == 1) {
            // A single input is already its own average. The multi-array paths accept a null
            // target and simply return it, so mirror that here instead of dereferencing null.
            return target == null ? null : target.assign(arrays[0]);
        }
        if (this.nativeOps.isP2PAvailable() && CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed()) {
            Nd4j.getExecutioner().push();
            long len = target != null ? target.lengthLong() : arrays[0].lengthLong();
            AtomicAllocator allocator = AtomicAllocator.getInstance();
            CudaContext context = allocator.getFlowController().prepareAction(target, arrays);
            PointerPointer extras = new PointerPointer(new Pointer[]{null, context.getOldStream(), allocator.getDeviceIdPointer(), new CudaPointer(0L)});
            Pointer z = target == null ? null : AtomicAllocator.getInstance().getPointer(target, context);
            // Raw device addresses of every input, validated as they are collected.
            long[] xPointers = new long[arrays.length];
            for (int i = 0; i < arrays.length; ++i) {
                if (arrays[i].elementWiseStride() != 1) {
                    throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
                }
                if (arrays[i].lengthLong() != len) {
                    throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
                }
                AllocationPoint point = allocator.getAllocationPoint(arrays[i]);
                xPointers[i] = point.getPointers().getDevicePointer().address();
                point.tickDeviceWrite();
            }
            // The double buffer is used as 8-byte-per-element scratch space for the address table.
            CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.length);
            allocator.memcpyBlocking(tempX, (Pointer)new LongPointer(xPointers), xPointers.length * 8, 0L);
            PointerPointer x = new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context));
            if (arrays[0].data().dataType() == DataBuffer.Type.DOUBLE) {
                this.nativeOps.averageDouble(extras, x, target == null ? null : (DoublePointer)z, arrays.length, len, true);
            } else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) {
                this.nativeOps.averageFloat(extras, x, target == null ? null : (FloatPointer)z, arrays.length, len, true);
            } else {
                this.nativeOps.averageHalf(extras, x, target == null ? null : (ShortPointer)z, arrays.length, len, true);
            }
            allocator.getFlowController().registerAction(context, target, arrays);
            // Result is discarded; presumably this only keeps tempX reachable until here - confirm.
            tempX.address();
            return target;
        }
        long len = target == null ? arrays[0].lengthLong() : target.lengthLong();
        CudaContext context = (CudaContext)AtomicAllocator.getInstance().getDeviceContext().getContext();
        PointerPointer dataPointers = new PointerPointer((long)arrays.length);
        PointerPointer extras = new PointerPointer(new Pointer[]{null, context.getOldStream(), AtomicAllocator.getInstance().getDeviceIdPointer(), new CudaPointer(1L)});
        for (int i = 0; i < arrays.length; ++i) {
            Nd4j.getCompressor().autoDecompress(arrays[i]);
            if (arrays[i].elementWiseStride() != 1) {
                throw new ND4JIllegalStateException("Native averaging is applicable only to continuous INDArrays");
            }
            if (arrays[i].lengthLong() != len) {
                throw new ND4JIllegalStateException("All arrays should have equal length for averaging");
            }
            dataPointers.put((long)i, AtomicAllocator.getInstance().getHostPointer(arrays[i]));
        }
        if (arrays[0].data().dataType() == DataBuffer.Type.DOUBLE) {
            this.nativeOps.averageDouble(extras, dataPointers, target == null ? null : (DoublePointer)AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
        } else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) {
            this.nativeOps.averageFloat(extras, dataPointers, target == null ? null : (FloatPointer)AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
        } else {
            this.nativeOps.averageHalf(extras, dataPointers, target == null ? null : (ShortPointer)AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
        }
        // Mark the host copies of the target and of every input as the most recently written.
        if (target != null) {
            AtomicAllocator.getInstance().getAllocationPoint(target).tickHostWrite();
        }
        for (int i = 0; i < arrays.length; ++i) {
            AtomicAllocator.getInstance().getAllocationPoint(arrays[i]).tickHostWrite();
        }
        return target;
    }

    /**
     * Averages a collection of arrays by copying it into an {@code INDArray[]} and
     * delegating to the array-based overload.
     *
     * @param arrays arrays to average
     * @return the array returned by the array-based overload
     */
    public INDArray average(Collection<INDArray> arrays) {
        INDArray[] inputs = arrays.toArray(new INDArray[0]);
        return this.average(inputs);
    }

    /**
     * Averages {@code arrays} into a freshly created, uninitialized array that takes its
     * shape and ordering from {@code arrays[0]}.
     *
     * @param arrays arrays to average; must be non-null and non-empty
     * @return the result of averaging into the new array
     * @throws RuntimeException if {@code arrays} is {@code null} or empty
     */
    public INDArray average(INDArray[] arrays) {
        boolean nothingToAverage = arrays == null || arrays.length == 0;
        if (nothingToAverage) {
            throw new RuntimeException("Input arrays are missing");
        }
        INDArray first = arrays[0];
        long[] resultShape = first.shape();
        char resultOrder = first.ordering();
        INDArray destination = Nd4j.createUninitialized(resultShape, resultOrder);
        return this.average(destination, arrays);
    }

    /**
     * Averages a collection of arrays into {@code target} by copying the collection into
     * an {@code INDArray[]} and delegating to the array-based overload.
     *
     * @param target array receiving the average
     * @param arrays arrays to average
     * @return the value returned by the array-based overload
     */
    public INDArray average(INDArray target, Collection<INDArray> arrays) {
        INDArray[] inputs = arrays.toArray(new INDArray[0]);
        return this.average(target, inputs);
    }

    /**
     * Shuffles a single array by wrapping it in a singleton list and delegating to the
     * multi-array overload that takes the same varargs dimensions.
     *
     * @param array     array to shuffle
     * @param rnd       source of randomness
     * @param dimension dimensions forwarded unchanged to the delegate
     */
    public void shuffle(INDArray array, Random rnd, int ... dimension) {
        List<INDArray> single = Collections.singletonList(array);
        this.shuffle(single, rnd, dimension);
    }

    /**
     * Shuffles the given arrays through the native shuffle routines, using one shared
     * shuffle map for all of them.
     *
     * <p>The number of TADs is derived from the first array and the first dimension set;
     * every array must yield that same number of TAD offsets. The native routine is chosen
     * from the global {@code Nd4j.dataType()}, not from the arrays' own buffers.
     *
     * @param arrays     arrays to shuffle; must be non-null and non-empty
     * @param rnd        random source used to build the shuffle map
     * @param dimensions either a single dimension set applied to every array, or one set per array
     * @throws RuntimeException if {@code dimensions} or {@code arrays} is {@code null} or empty
     * @throws IllegalStateException if more than one dimension set is given and the count
     *                               differs from the number of arrays
     * @throws ND4JIllegalStateException if an array's TAD offset count differs from the first array's TAD count
     */
    public void shuffle(List<INDArray> arrays, Random rnd, List<int[]> dimensions) {
        if (dimensions == null || dimensions.size() == 0) {
            throw new RuntimeException("Dimension can't be null or 0-length");
        }
        if (arrays == null || arrays.size() == 0) {
            throw new RuntimeException("No input arrays provided");
        }
        if (dimensions.size() > 1 && arrays.size() != dimensions.size()) {
            throw new IllegalStateException("Number of dimensions do not match number of arrays to shuffle");
        }
        Nd4j.getExecutioner().push();
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext context = null;
        // prepareAction is invoked for every array; only the context from the last call is kept.
        for (int x = 0; x < arrays.size(); ++x) {
            context = allocator.getFlowController().prepareAction(arrays.get(x), new INDArray[0]);
        }
        // TAD length = product of the first array's extents along the first dimension set.
        int tadLength = 1;
        for (int i = 0; i < dimensions.get(0).length; ++i) {
            tadLength = (int)((long)tadLength * arrays.get(0).shape()[dimensions.get(0)[i]]);
        }
        long numTads = arrays.get(0).length() / (long)tadLength;
        int[] map = ArrayUtil.buildInterleavedVector((Random)rnd, (int)((int)numTads));
        CudaIntDataBuffer shuffle = new CudaIntDataBuffer(map);
        Pointer shuffleMap = allocator.getPointer(shuffle, context);
        PointerPointer extras = new PointerPointer(new Pointer[]{null, context.getOldStream(), allocator.getDeviceIdPointer()});
        // Per-array raw addresses: data, shape info, TAD shape info and TAD offsets.
        long[] xPointers = new long[arrays.size()];
        long[] xShapes = new long[arrays.size()];
        long[] tadShapes = new long[arrays.size()];
        long[] tadOffsets = new long[arrays.size()];
        for (int i = 0; i < arrays.size(); ++i) {
            INDArray array = arrays.get(i);
            Pointer x = AtomicAllocator.getInstance().getPointer(array, context);
            Pointer xShapeInfo = AtomicAllocator.getInstance().getPointer(array.shapeInfoDataBuffer(), context);
            TADManager tadManager = Nd4j.getExecutioner().getTADManager();
            // A single dimension set is shared by all arrays; otherwise each array uses its own.
            int[] dimension = dimensions.size() > 1 ? dimensions.get(i) : dimensions.get(0);
            Pair tadBuffers = tadManager.getTADOnlyShapeInfo(array, dimension);
            Pointer tadShapeInfo = AtomicAllocator.getInstance().getPointer((DataBuffer)tadBuffers.getFirst(), context);
            DataBuffer offsets = (DataBuffer)tadBuffers.getSecond();
            if (offsets.length() != numTads) {
                throw new ND4JIllegalStateException("Can't symmetrically shuffle arrays with non-equal number of TADs");
            }
            Pointer tadOffset = AtomicAllocator.getInstance().getPointer(offsets, context);
            xPointers[i] = x.address();
            xShapes[i] = xShapeInfo.address();
            tadShapes[i] = tadShapeInfo.address();
            tadOffsets[i] = tadOffset.address();
        }
        // Double buffers serve as 8-byte-per-element scratch space for the address tables;
        // each copy below transfers xPointers.length * 8 bytes.
        CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(arrays.size());
        CudaDoubleDataBuffer tempShapes = new CudaDoubleDataBuffer(arrays.size());
        CudaDoubleDataBuffer tempTAD = new CudaDoubleDataBuffer(arrays.size());
        CudaDoubleDataBuffer tempOffsets = new CudaDoubleDataBuffer(arrays.size());
        AtomicAllocator.getInstance().memcpyBlocking(tempX, (Pointer)new LongPointer(xPointers), xPointers.length * 8, 0L);
        AtomicAllocator.getInstance().memcpyBlocking(tempShapes, (Pointer)new LongPointer(xShapes), xPointers.length * 8, 0L);
        AtomicAllocator.getInstance().memcpyBlocking(tempTAD, (Pointer)new LongPointer(tadShapes), xPointers.length * 8, 0L);
        AtomicAllocator.getInstance().memcpyBlocking(tempOffsets, (Pointer)new LongPointer(tadOffsets), xPointers.length * 8, 0L);
        // The same tempX/tempShapes tables are passed twice to the native call;
        // presumably as input and output, i.e. an in-place shuffle - confirm against NativeOps.
        if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
            this.nativeOps.shuffleDouble(extras, new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), arrays.size(), (IntPointer)shuffleMap, new PointerPointer(allocator.getPointer(tempTAD, context)), new PointerPointer(allocator.getPointer(tempOffsets, context)));
        } else if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
            this.nativeOps.shuffleFloat(extras, new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), arrays.size(), (IntPointer)shuffleMap, new PointerPointer(allocator.getPointer(tempTAD, context)), new PointerPointer(allocator.getPointer(tempOffsets, context)));
        } else {
            this.nativeOps.shuffleHalf(extras, new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), new PointerPointer(allocator.getPointer(tempX, context)), new PointerPointer(allocator.getPointer(tempShapes, context)), arrays.size(), (IntPointer)shuffleMap, new PointerPointer(allocator.getPointer(tempTAD, context)), new PointerPointer(allocator.getPointer(tempOffsets, context)));
        }
        for (int f = 0; f < arrays.size(); ++f) {
            allocator.getFlowController().registerAction(context, arrays.get(f), new INDArray[0]);
        }
        // Results of the calls below are discarded; presumably they only keep the temporary
        // buffers reachable until the native call has been issued - confirm.
        shuffle.address();
        tempX.dataType();
        tempShapes.dataType();
        tempOffsets.dataType();
        tempTAD.dataType();
    }

    /**
     * Shuffles a collection of arrays along one shared set of dimensions.
     * The collection is copied into a new list and the dimensions are wrapped in a
     * singleton list before delegating to the list-based overload.
     *
     * @param sourceArrays arrays to shuffle
     * @param rnd          source of randomness
     * @param dimension    dimension set applied to every array
     */
    public void shuffle(Collection<INDArray> sourceArrays, Random rnd, int ... dimension) {
        ArrayList<INDArray> arrayList = new ArrayList<INDArray>(sourceArrays);
        List<int[]> sharedDimensions = Collections.singletonList(dimension);
        this.shuffle(arrayList, rnd, sharedDimensions);
    }

    /**
     * Converts the data buffer of {@code source} from {@code typeSrc} to {@code typeDst},
     * installs the converted buffer on the array and updates its compressed flag.
     *
     * @param typeSrc current extended type of the data
     * @param source  array to convert in place; must not be a view
     * @param typeDst extended type to convert to
     * @return {@code source}, now backed by the converted buffer
     * @throws UnsupportedOperationException if {@code source} is a view
     */
    public INDArray convertDataEx(DataBuffer.TypeEx typeSrc, INDArray source, DataBuffer.TypeEx typeDst) {
        if (source.isView()) {
            throw new UnsupportedOperationException("Impossible to compress View. Consider using dup() before. ");
        }
        DataBuffer converted = this.convertDataEx(typeSrc, source.data(), typeDst);
        source.setData(converted);
        // The array counts as compressed exactly when the new buffer is a CompressedDataBuffer.
        source.markAsCompressed(converted instanceof CompressedDataBuffer);
        return source;
    }

    /**
     * Invokes the native type conversion on raw pointers, using the current device
     * context's old stream.
     *
     * @param typeSrc extended type of the source data (passed to native code by ordinal)
     * @param source  pointer to the source data
     * @param typeDst extended type of the target data (passed to native code by ordinal)
     * @param target  pointer receiving the converted data
     * @param length  length value forwarded to the native call
     */
    public void convertDataEx(DataBuffer.TypeEx typeSrc, Pointer source, DataBuffer.TypeEx typeDst, Pointer target, long length) {
        CudaContext deviceContext = (CudaContext)AtomicAllocator.getInstance().getDeviceContext().getContext();
        cudaStream_t oldStream = deviceContext.getOldStream();
        Pointer[] extraArgs = new Pointer[]{null, oldStream};
        this.nativeOps.convertTypes(new PointerPointer(extraArgs), typeSrc.ordinal(), source, length, typeDst.ordinal(), target);
    }

    /**
     * Converts data from a raw source pointer into a compressed buffer.
     *
     * <p>Temporary device allocations are made for the original and the compressed data,
     * the source is copied in, the native conversion runs, and the result is copied into
     * {@code buffer}'s address pointer. The stream is synchronized before the temporary
     * allocations are released.
     *
     * @param typeSrc extended type of the source data
     * @param source  pointer to the source data
     * @param typeDst extended type to convert to
     * @param buffer  destination; must be a {@link CompressedDataBuffer}
     * @throws UnsupportedOperationException if {@code buffer} is not a {@link CompressedDataBuffer}
     */
    public void convertDataEx(DataBuffer.TypeEx typeSrc, Pointer source, DataBuffer.TypeEx typeDst, DataBuffer buffer) {
        cudaStream_t stream = ((CudaContext)AtomicAllocator.getInstance().getDeviceContext().getContext()).getOldStream();
        if (!(buffer instanceof CompressedDataBuffer)) {
            throw new UnsupportedOperationException();
        }
        CompressedDataBuffer compressed = (CompressedDataBuffer)buffer;
        long compressedBytes = compressed.getCompressionDescriptor().getCompressedLength();
        long originalBytes = compressed.getCompressionDescriptor().getOriginalLength();
        Pointer deviceSrc = this.nativeOps.mallocDevice(originalBytes, null, 0);
        Pointer deviceDst = this.nativeOps.mallocDevice(compressedBytes, null, 0);
        this.nativeOps.memcpyAsync(deviceSrc, source, originalBytes, CudaConstants.cudaMemcpyHostToDevice, (Pointer)stream);
        this.convertDataEx(typeSrc, deviceSrc, typeDst, deviceDst, buffer.length());
        // NOTE(review): the copy-kind flag says host-to-host although deviceDst comes from
        // mallocDevice - confirm how the native memcpyAsync interprets this flag.
        this.nativeOps.memcpyAsync(buffer.addressPointer(), deviceDst, compressedBytes, CudaConstants.cudaMemcpyHostToHost, (Pointer)stream);
        stream.synchronize();
        this.nativeOps.freeDevice(deviceSrc, null);
        this.nativeOps.freeDevice(deviceDst, null);
    }

    /**
     * Converts {@code source} into {@code target} through the native type conversion.
     *
     * <p>Compressed buffers are staged through temporary memory: a compressed source is
     * copied into a temporary allocation before the conversion, and for a compressed target
     * the conversion writes into a temporary allocation that is copied into the target's
     * address pointer afterwards. Temporary memory comes from the current workspace when
     * one is active for this thread, otherwise from {@code mallocDevice} (and is freed at
     * the end). Buffers that are not compressed are accessed through the allocator's
     * pointer, with or without an active workspace.
     *
     * @param typeSrc extended type of the source data
     * @param source  buffer to convert from
     * @param typeDst extended type to convert to
     * @param target  buffer to convert into
     */
    public void convertDataEx(DataBuffer.TypeEx typeSrc, DataBuffer source, DataBuffer.TypeEx typeDst, DataBuffer target) {
        cudaStream_t stream = ((CudaContext)AtomicAllocator.getInstance().getDeviceContext().getContext()).getOldStream();
        Pointer srcPtr = null;
        Pointer dstPtr = null;
        if (Nd4j.getWorkspaceManager().anyWorkspaceActiveForCurrentThread()) {
            long size;
            MemoryWorkspace ws = Nd4j.getMemoryManager().getCurrentWorkspace();
            if (source instanceof CompressedDataBuffer) {
                size = ((CompressedDataBuffer)source).getCompressionDescriptor().getCompressedLength();
                srcPtr = ws.alloc(size, MemoryKind.DEVICE, DataBuffer.Type.HALF, false);
                this.nativeOps.memcpyAsync(srcPtr, source.addressPointer(), size, CudaConstants.cudaMemcpyHostToHost, (Pointer)stream);
            } else {
                // Previously left null here, which handed a null source to the native
                // conversion; use the allocator pointer as the non-workspace branch does.
                srcPtr = AtomicAllocator.getInstance().getPointer(source);
            }
            if (target instanceof CompressedDataBuffer) {
                size = ((CompressedDataBuffer)target).getCompressionDescriptor().getCompressedLength();
                dstPtr = ws.alloc(size, MemoryKind.DEVICE, DataBuffer.Type.HALF, false);
            } else {
                // Same fallback for an uncompressed target.
                dstPtr = AtomicAllocator.getInstance().getPointer(target);
            }
        } else {
            long size;
            if (source instanceof CompressedDataBuffer) {
                log.info("Replacing source ptr");
                size = ((CompressedDataBuffer)source).getCompressionDescriptor().getCompressedLength();
                srcPtr = this.nativeOps.mallocDevice(size, null, 0);
                this.nativeOps.memcpyAsync(srcPtr, source.addressPointer(), size, CudaConstants.cudaMemcpyHostToHost, (Pointer)stream);
                stream.synchronize();
            } else {
                srcPtr = AtomicAllocator.getInstance().getPointer(source);
            }
            if (target instanceof CompressedDataBuffer) {
                log.info("Replacing target ptr");
                size = ((CompressedDataBuffer)target).getCompressionDescriptor().getCompressedLength();
                dstPtr = this.nativeOps.mallocDevice(size, null, 0);
            } else {
                dstPtr = AtomicAllocator.getInstance().getPointer(target);
            }
        }
        this.convertDataEx(typeSrc, srcPtr, typeDst, dstPtr, target.length());
        Nd4j.getExecutioner().commit();
        if (target instanceof CompressedDataBuffer) {
            // Copy the converted data from the temporary allocation into the compressed target.
            this.nativeOps.memcpyAsync(target.addressPointer(), dstPtr, target.capacity(), CudaConstants.cudaMemcpyHostToHost, (Pointer)stream);
            // Workspace allocations are not freed here; only mallocDevice memory is.
            if (!Nd4j.getWorkspaceManager().anyWorkspaceActiveForCurrentThread()) {
                this.nativeOps.freeDevice(dstPtr, null);
            }
        }
        if (source instanceof CompressedDataBuffer && !Nd4j.getWorkspaceManager().anyWorkspaceActiveForCurrentThread()) {
            this.nativeOps.freeDevice(srcPtr, null);
        }
        Nd4j.getExecutioner().commit();
    }

    /**
     * Converts {@code source} to {@code typeDst} and returns the newly created buffer.
     *
     * <p>The element size of the target type is derived from the ordinal of {@code typeDst}:
     * ordinals 0-2 map to 1 byte, 3-5 to 2 bytes, 6 to 4 bytes and 7 to 8 bytes. When
     * {@code CompressionUtils.goingToCompress} reports a compression, a lossy
     * {@link CompressedDataBuffer} of {@code source.length() * elementSize} bytes is created;
     * otherwise {@code source} is treated as a {@link CompressedDataBuffer} and a regular
     * buffer with the descriptor's number of elements is created.
     *
     * @param typeSrc extended type of the source data
     * @param source  buffer to convert
     * @param typeDst extended type to convert to
     * @return the buffer holding the converted data
     * @throws UnsupportedOperationException if the ordinal of {@code typeDst} is above 7
     */
    public DataBuffer convertDataEx(DataBuffer.TypeEx typeSrc, DataBuffer source, DataBuffer.TypeEx typeDst) {
        final int dstOrdinal = typeDst.ordinal();
        final int bytesPerElement;
        if (dstOrdinal <= 2) {
            bytesPerElement = 1;
        } else if (dstOrdinal <= 5) {
            bytesPerElement = 2;
        } else if (dstOrdinal == 6) {
            bytesPerElement = 4;
        } else if (dstOrdinal == 7) {
            bytesPerElement = 8;
        } else {
            throw new UnsupportedOperationException("Unknown target TypeEx: " + typeDst.name());
        }
        Nd4j.getExecutioner().commit();
        boolean sourceIsCompressed = source instanceof CompressedDataBuffer;
        if (!sourceIsCompressed) {
            AtomicAllocator.getInstance().synchronizeHostData(source);
        }
        DataBuffer result;
        if (CompressionUtils.goingToCompress((DataBuffer.TypeEx)typeSrc, (DataBuffer.TypeEx)typeDst)) {
            long compressedBytes = source.length() * (long)bytesPerElement;
            BytePointer storage = new BytePointer(compressedBytes);
            CompressionDescriptor info = new CompressionDescriptor(source, typeDst.name());
            info.setCompressionType(CompressionType.LOSSY);
            info.setCompressedLength(compressedBytes);
            result = new CompressedDataBuffer((Pointer)storage, info);
        } else {
            CompressionDescriptor info = ((CompressedDataBuffer)source).getCompressionDescriptor();
            result = Nd4j.createBuffer((long)info.getNumberOfElements(), (boolean)false);
            // Flag the new buffer's device side as written before the conversion runs.
            AtomicAllocator.getInstance().getAllocationPoint(result).tickDeviceWrite();
        }
        this.convertDataEx(typeSrc, source, typeDst, result);
        return result;
    }

    /**
     * Splits {@code tensor} into its tensors-along-dimension (TADs) using the native tear routines.
     *
     * <p>A compressed tensor is decompressed in place first and {@code dimensions} is
     * sorted in place. One uninitialized result array is created per TAD, shaped by the
     * tensor's extents along {@code dimensions}. The native routine is chosen from the
     * global {@code Nd4j.dataType()}; when it is none of DOUBLE, FLOAT or HALF no native
     * call is made and the result arrays are returned as created.
     *
     * @param tensor     array to split
     * @param dimensions dimensions that define each TAD; sorted in place
     * @return one array per TAD
     */
    public INDArray[] tear(INDArray tensor, int ... dimensions) {
        if (tensor.isCompressed()) {
            Nd4j.getCompressor().decompressi(tensor);
        }
        Arrays.sort(dimensions);
        Pair tadBuffers = Nd4j.getExecutioner().getTADManager().getTADOnlyShapeInfo(tensor, dimensions);
        // TAD length is the product of the extents along the requested dimensions;
        // those extents also form the shape of every result array.
        long tadLength = 1L;
        long[] shape = new long[dimensions.length];
        for (int i = 0; i < dimensions.length; ++i) {
            tadLength *= tensor.shape()[dimensions[i]];
            shape[i] = tensor.shape()[dimensions[i]];
        }
        int numTads = (int)(tensor.lengthLong() / tadLength);
        INDArray[] result = new INDArray[numTads];
        long[] xPointers = new long[numTads];
        CudaContext context = AtomicAllocator.getInstance().getFlowController().prepareAction(null, tensor);
        // Allocate each output and record the raw address returned by the allocator.
        for (int x = 0; x < numTads; ++x) {
            result[x] = Nd4j.createUninitialized((long[])shape);
            context = AtomicAllocator.getInstance().getFlowController().prepareAction(result[x], new INDArray[0]);
            xPointers[x] = AtomicAllocator.getInstance().getPointer(result[x], context).address();
        }
        // The double buffer is used as 8-byte-per-element scratch space for the address table.
        CudaDoubleDataBuffer tempX = new CudaDoubleDataBuffer(numTads);
        AtomicAllocator.getInstance().memcpyBlocking(tempX, (Pointer)new LongPointer(xPointers), xPointers.length * 8, 0L);
        PointerPointer extraz = new PointerPointer(new Pointer[]{null, context.getOldStream(), AtomicAllocator.getInstance().getDeviceIdPointer()});
        // The shape info of result[0] is passed for all outputs (they all share the same shape).
        if (Nd4j.dataType() == DataBuffer.Type.DOUBLE) {
            this.nativeOps.tearDouble(extraz, (DoublePointer)AtomicAllocator.getInstance().getPointer(tensor, context), (LongPointer)AtomicAllocator.getInstance().getPointer(tensor.shapeInfoDataBuffer(), context), new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context)), (LongPointer)AtomicAllocator.getInstance().getPointer(result[0].shapeInfoDataBuffer(), context), (LongPointer)AtomicAllocator.getInstance().getPointer((DataBuffer)tadBuffers.getFirst(), context), (LongPointer)new LongPointerWrapper(AtomicAllocator.getInstance().getPointer((DataBuffer)tadBuffers.getSecond(), context)));
        } else if (Nd4j.dataType() == DataBuffer.Type.FLOAT) {
            this.nativeOps.tearFloat(extraz, (FloatPointer)AtomicAllocator.getInstance().getPointer(tensor, context), (LongPointer)AtomicAllocator.getInstance().getPointer(tensor.shapeInfoDataBuffer(), context), new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context)), (LongPointer)AtomicAllocator.getInstance().getPointer(result[0].shapeInfoDataBuffer(), context), (LongPointer)AtomicAllocator.getInstance().getPointer((DataBuffer)tadBuffers.getFirst(), context), (LongPointer)new LongPointerWrapper(AtomicAllocator.getInstance().getPointer((DataBuffer)tadBuffers.getSecond(), context)));
        } else if (Nd4j.dataType() == DataBuffer.Type.HALF) {
            this.nativeOps.tearHalf(extraz, (ShortPointer)AtomicAllocator.getInstance().getPointer(tensor, context), (LongPointer)AtomicAllocator.getInstance().getPointer(tensor.shapeInfoDataBuffer(), context), new PointerPointer(AtomicAllocator.getInstance().getPointer(tempX, context)), (LongPointer)AtomicAllocator.getInstance().getPointer(result[0].shapeInfoDataBuffer(), context), (LongPointer)AtomicAllocator.getInstance().getPointer((DataBuffer)tadBuffers.getFirst(), context), (LongPointer)new LongPointerWrapper(AtomicAllocator.getInstance().getPointer((DataBuffer)tadBuffers.getSecond(), context)));
        }
        AtomicAllocator.getInstance().getFlowController().registerActionAllWrite(context, result);
        AtomicAllocator.getInstance().getFlowController().registerAction(context, null, result);
        return result;
    }

    /**
     * Sorts {@code x} in place through the native sort routines.
     *
     * <p>Scalars are returned untouched. For non-view arrays longer than
     * {@code 0xA00000} elements the executioner is committed before the native call.
     * The native routine is selected from the data type of {@code x}'s buffer.
     *
     * @param x          array to sort in place
     * @param descending forwarded to the native sort
     * @return {@code x}
     * @throws UnsupportedOperationException if the buffer type is not FLOAT, DOUBLE or HALF
     */
    public INDArray sort(INDArray x, boolean descending) {
        if (x.isScalar()) {
            return x;
        }
        Nd4j.getExecutioner().push();
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext ctx = allocator.getFlowController().prepareAction(x, new INDArray[0]);
        Pointer hostShape = allocator.getHostPointer(x.shapeInfoDataBuffer());
        Pointer[] extraArgs = new Pointer[]{
            hostShape, ctx.getOldStream(), allocator.getDeviceIdPointer(),
            ctx.getBufferAllocation(), ctx.getBufferReduction(), ctx.getBufferScalar(), ctx.getBufferSpecial(),
            hostShape, allocator.getHostPointer(x.shapeInfoDataBuffer()),
            hostShape, hostShape, hostShape, hostShape, hostShape, hostShape, hostShape, hostShape, hostShape,
            new CudaPointer(0L)};
        PointerPointer extraz = new PointerPointer(extraArgs);
        if (!x.isView() && x.lengthLong() > 0xA00000L) {
            Nd4j.getExecutioner().commit();
        }
        DataBuffer.Type dtype = x.data().dataType();
        if (dtype != DataBuffer.Type.FLOAT && dtype != DataBuffer.Type.DOUBLE && dtype != DataBuffer.Type.HALF) {
            throw new UnsupportedOperationException("Unknown dataType " + dtype);
        }
        Pointer dataPtr = allocator.getPointer(x, ctx);
        LongPointer shapePtr = (LongPointer)allocator.getPointer(x.shapeInfoDataBuffer(), ctx);
        if (dtype == DataBuffer.Type.FLOAT) {
            this.nativeOps.sortFloat(extraz, (FloatPointer)dataPtr, shapePtr, descending);
        } else if (dtype == DataBuffer.Type.DOUBLE) {
            this.nativeOps.sortDouble(extraz, (DoublePointer)dataPtr, shapePtr, descending);
        } else {
            this.nativeOps.sortHalf(extraz, (ShortPointer)dataPtr, shapePtr, descending);
        }
        allocator.getFlowController().registerAction(ctx, x, new INDArray[0]);
        return x;
    }

    /**
     * Creates an array whose shape-information option bits carry {@code ArrayType.EMPTY}
     * and the given data type. The array is constructed with a {@code null} data buffer.
     *
     * @param type data type recorded in the shape-information options
     * @return the new {@link JCublasNDArray}
     */
    public INDArray empty(DataBuffer.Type type) {
        long optionBits = ArrayOptionsHelper.setOptionBit(
                ArrayOptionsHelper.setOptionBit((long)0L, (ArrayType)ArrayType.EMPTY),
                (DataBuffer.Type)type);
        Pair shapeInfo = Nd4j.getShapeInfoProvider().createShapeInformation(new int[0], new int[0], 0L, 1, 'c', optionBits);
        CudaLongDataBuffer shapeBuffer = (CudaLongDataBuffer)shapeInfo.getFirst();
        long[] shapeArray = (long[])shapeInfo.getSecond();
        return new JCublasNDArray(null, shapeBuffer, shapeArray);
    }

    /**
     * Sorts {@code x} in place along the given dimensions through the native TAD sort routines.
     *
     * <p>Scalars are returned untouched. {@code dimension} is sorted in place before use.
     * The native routine is selected from the data type of {@code x}'s buffer.
     *
     * @param x          array to sort in place
     * @param descending forwarded to the native sort
     * @param dimension  dimensions to sort along; sorted in place
     * @return {@code x}
     * @throws UnsupportedOperationException if the buffer type is not FLOAT, DOUBLE or HALF
     */
    public INDArray sort(INDArray x, boolean descending, int ... dimension) {
        if (x.isScalar()) {
            return x;
        }
        Arrays.sort(dimension);
        Nd4j.getExecutioner().push();
        Pair tadBuffers = Nd4j.getExecutioner().getTADManager().getTADOnlyShapeInfo(x, dimension);
        AtomicAllocator allocator = AtomicAllocator.getInstance();
        CudaContext ctx = allocator.getFlowController().prepareAction(x, new INDArray[0]);
        Pointer[] extraArgs = new Pointer[]{allocator.getHostPointer(x.shapeInfoDataBuffer()), ctx.getOldStream(), allocator.getDeviceIdPointer()};
        PointerPointer extraz = new PointerPointer(extraArgs);
        Pointer dimensionPointer = allocator.getPointer(allocator.getConstantBuffer(dimension), ctx);
        DataBuffer.Type dtype = x.data().dataType();
        if (dtype != DataBuffer.Type.FLOAT && dtype != DataBuffer.Type.DOUBLE && dtype != DataBuffer.Type.HALF) {
            throw new UnsupportedOperationException("Unknown dataType " + dtype);
        }
        // Pointers shared by all three typed native calls.
        Pointer dataPtr = allocator.getPointer(x, ctx);
        LongPointer shapePtr = (LongPointer)allocator.getPointer(x.shapeInfoDataBuffer(), ctx);
        LongPointer tadShapePtr = (LongPointer)allocator.getPointer((DataBuffer)tadBuffers.getFirst(), ctx);
        LongPointer tadOffsetsPtr = (LongPointer)new LongPointerWrapper(allocator.getPointer((DataBuffer)tadBuffers.getSecond(), ctx));
        if (dtype == DataBuffer.Type.FLOAT) {
            this.nativeOps.sortTadFloat(extraz, (FloatPointer)dataPtr, shapePtr, (IntPointer)dimensionPointer, dimension.length, tadShapePtr, tadOffsetsPtr, descending);
        } else if (dtype == DataBuffer.Type.DOUBLE) {
            this.nativeOps.sortTadDouble(extraz, (DoublePointer)dataPtr, shapePtr, (IntPointer)dimensionPointer, dimension.length, tadShapePtr, tadOffsetsPtr, descending);
        } else {
            this.nativeOps.sortTadHalf(extraz, (ShortPointer)dataPtr, shapePtr, (IntPointer)dimensionPointer, dimension.length, tadShapePtr, tadOffsetsPtr, descending);
        }
        allocator.getFlowController().registerAction(ctx, x, new INDArray[0]);
        return x;
    }

    /**
     * Creates a {@link JCublasNDArray} from float data using the global default ordering
     * returned by {@code Nd4j.order()}.
     *
     * @param data   element values
     * @param shape  array shape
     * @param stride array strides
     * @param offset offset into {@code data}
     * @return the new array
     */
    public INDArray create(float[] data, long[] shape, long[] stride, long offset) {
        char defaultOrder = Nd4j.order().charValue();
        return new JCublasNDArray(data, shape, stride, offset, defaultOrder);
    }

    /**
     * Creates a {@link JCublasNDArray} from double data using the global default ordering
     * returned by {@code Nd4j.order()}.
     *
     * @param data   element values
     * @param shape  array shape
     * @param stride array strides
     * @param offset offset into {@code data}
     * @return the new array
     */
    public INDArray create(double[] data, long[] shape, long[] stride, long offset) {
        char defaultOrder = Nd4j.order().charValue();
        return new JCublasNDArray(data, shape, stride, offset, defaultOrder);
    }

    /**
     * Creates a {@link JCublasNDArray} backed by {@code data} with the given shape.
     *
     * @param data  backing buffer
     * @param shape array shape
     * @return the new array
     */
    public INDArray create(DataBuffer data, long[] shape) {
        return new JCublasNDArray(data, shape);
    }

    /**
     * Creates a {@link JCublasNDArray} backed by {@code data} using the global default
     * ordering returned by {@code Nd4j.order()}.
     *
     * @param data   backing buffer
     * @param shape  array shape
     * @param stride array strides
     * @param offset offset into the buffer
     * @return the new array
     */
    public INDArray create(DataBuffer data, long[] shape, long[] stride, long offset) {
        char defaultOrder = Nd4j.order().charValue();
        return new JCublasNDArray(data, shape, stride, offset, defaultOrder);
    }

    /**
     * Creates a {@link JCublasNDArray} from a list of arrays and a target shape.
     *
     * @param list  source arrays
     * @param shape shape of the new array
     * @return the new array
     */
    public INDArray create(List<INDArray> list, long[] shape) {
        return new JCublasNDArray(list, shape);
    }

    /**
     * Creates a two-dimensional array of shape {@code {rows, columns}} using the global
     * default ordering returned by {@code Nd4j.order()}.
     *
     * @param rows    number of rows
     * @param columns number of columns
     * @param stride  array strides
     * @param offset  offset forwarded to the delegate
     * @return the array created by the shape-based overload
     */
    public INDArray create(long rows, long columns, long[] stride, long offset) {
        long[] matrixShape = new long[]{rows, columns};
        char defaultOrder = Nd4j.order().charValue();
        return this.create(matrixShape, stride, offset, defaultOrder);
    }

    /**
     * Creates a {@link JCublasNDArray} with the given shape and ordering, passing
     * {@code 0L} as the constructor's second argument.
     *
     * @param shape    array shape
     * @param ordering ordering character for the array
     * @return the new array
     */
    public INDArray create(long[] shape, char ordering) {
        return new JCublasNDArray(shape, 0L, ordering);
    }

    /**
     * Creates a {@link JCublasNDArray} with strides computed by {@code Nd4j.getStrides}
     * for the given shape and ordering, passing {@code false} as the constructor's last
     * argument.
     *
     * @param shape    array shape
     * @param ordering ordering character for the array
     * @return the new array
     */
    public INDArray createUninitialized(long[] shape, char ordering) {
        long[] strides = Nd4j.getStrides((long[])shape, (char)ordering);
        return new JCublasNDArray(shape, strides, 0L, ordering, false);
    }

    /**
     * Creates an array while no workspace is set as current on the memory manager.
     *
     * <p>The current workspace is remembered, cleared for the duration of the allocation
     * and then restored. Restoration happens in a {@code finally} block so that a failing
     * allocation does not leave the memory manager without its previous workspace.
     *
     * @param shape    array shape
     * @param ordering ordering character for the array
     * @return the new array
     */
    public INDArray createUninitializedDetached(long[] shape, char ordering) {
        MemoryWorkspace workspace = Nd4j.getMemoryManager().getCurrentWorkspace();
        Nd4j.getMemoryManager().setCurrentWorkspace(null);
        try {
            return new JCublasNDArray(shape, Nd4j.getStrides((long[])shape, (char)ordering), 0L, ordering, false);
        } finally {
            // Always put the caller's workspace back, even if the constructor threw.
            Nd4j.getMemoryManager().setCurrentWorkspace(workspace);
        }
    }

    /**
     * Creates a {@link JCublasNDArray} backed by {@code data} with explicit shape,
     * strides, offset and ordering.
     *
     * @param data      backing buffer
     * @param newShape  array shape
     * @param newStride array strides
     * @param offset    offset into the buffer
     * @param ordering  ordering character for the array
     * @return the new array
     */
    public INDArray create(DataBuffer data, long[] newShape, long[] newStride, long offset, char ordering) {
        return new JCublasNDArray(data, newShape, newStride, offset, ordering);
    }

    /**
     * Creates a {@link JCublasNDArray} from a list of arrays, a target shape and an ordering.
     *
     * @param list     source arrays
     * @param shape    shape of the new array
     * @param ordering ordering character for the array
     * @return the new array
     */
    public INDArray create(List<INDArray> list, long[] shape, char ordering) {
        return new JCublasNDArray(list, shape, ordering);
    }

    /**
     * Creates a {@link JCublasNDArray} from float data with explicit strides, ordering and offset.
     * Note that this overload takes {@code order} before {@code offset}, while the
     * constructor receives them as {@code offset, order}.
     *
     * @param data   element values
     * @param shape  array shape
     * @param stride array strides
     * @param order  ordering character for the array
     * @param offset offset into {@code data}
     * @return the new array
     */
    public INDArray create(float[] data, long[] shape, long[] stride, char order, long offset) {
        return new JCublasNDArray(data, shape, stride, offset, order);
    }

    /**
     * Creates a {@link JCublasNDArray} from float data with explicit strides, offset and ordering.
     *
     * @param data     element values
     * @param shape    array shape
     * @param stride   array strides
     * @param offset   offset into {@code data}
     * @param ordering ordering character for the array
     * @return the new array
     */
    public INDArray create(float[] data, long[] shape, long[] stride, long offset, char ordering) {
        return new JCublasNDArray(data, shape, stride, offset, ordering);
    }

    /**
     * Creates a {@link JCublasNDArray} from double data with explicit strides, offset and ordering.
     *
     * @param data     element values
     * @param shape    array shape
     * @param stride   array strides
     * @param offset   offset into {@code data}
     * @param ordering ordering character for the array
     * @return the new array
     */
    public INDArray create(double[] data, long[] shape, long[] stride, long offset, char ordering) {
        return new JCublasNDArray(data, shape, stride, offset, ordering);
    }

    /**
     * Creates a {@link JCublasNDArray} from float data, computing the strides with
     * {@code Nd4j.getStrides} for the given shape and ordering.
     *
     * @param data   element values
     * @param shape  array shape
     * @param offset offset into {@code data}
     * @param order  ordering character; unboxed, so it must not be {@code null}
     * @return the new array
     */
    public INDArray create(float[] data, long[] shape, long offset, Character order) {
        char orderChar = order.charValue();
        long[] strides = Nd4j.getStrides((long[])shape, (char)orderChar);
        return new JCublasNDArray(data, shape, strides, offset, orderChar);
    }

    /**
     * Creates a {@link JCublasNDArray} from double data, computing the strides with
     * {@code Nd4j.getStrides} for the given shape and ordering.
     *
     * @param data   element values
     * @param shape  array shape
     * @param offset offset into {@code data}
     * @param order  ordering character; unboxed, so it must not be {@code null}
     * @return the new array
     */
    public INDArray create(double[] data, long[] shape, long offset, Character order) {
        char orderChar = order.charValue();
        long[] strides = Nd4j.getStrides((long[])shape, (char)orderChar);
        return new JCublasNDArray(data, shape, strides, offset, orderChar);
    }

    /**
     * Creates a {@link JCublasNDArray} from float data with the requested ordering and a
     * zero offset.
     *
     * <p>The strides are computed for the same {@code ordering} the array is created with.
     * Previously they were derived from the factory's own {@code order} field, which
     * yields strides that do not match the array's ordering flag whenever the two differ.
     *
     * @param data     element values
     * @param shape    array shape
     * @param ordering ordering character used for both strides and the array
     * @return the new array
     */
    public INDArray create(float[] data, long[] shape, char ordering) {
        return new JCublasNDArray(data, shape, Nd4j.getStrides((long[])shape, (char)ordering), 0L, ordering);
    }

    /**
     * Not supported by this factory.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray createSparseCSR(double[] data, int[] columns, int[] pointerB, int[] pointerE, long[] shape) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this factory.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray createSparseCSR(float[] data, int[] columns, int[] pointerB, int[] pointerE, long[] shape) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this factory.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray createSparseCSR(DataBuffer data, int[] columns, int[] pointerB, int[] pointerE, long[] shape) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this factory for {@code long[][]} indices.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray createSparseCOO(double[] values, long[][] indices, long[] shape) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this factory for {@code long[][]} indices.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray createSparseCOO(float[] values, long[][] indices, long[] shape) {
        throw new UnsupportedOperationException();
    }

    /**
     * Creates a {@link JCusparseNDArrayCOO} from double values, {@code int[][]} indices and a shape.
     *
     * @param values  values passed to the COO array
     * @param indices indices passed to the COO array
     * @param shape   shape of the array
     * @return the new sparse array
     */
    public INDArray createSparseCOO(double[] values, int[][] indices, long[] shape) {
        return new JCusparseNDArrayCOO(values, indices, shape);
    }

    /**
     * Creates a {@link JCusparseNDArrayCOO} from float values, {@code int[][]} indices and a shape.
     *
     * @param values  values passed to the COO array
     * @param indices indices passed to the COO array
     * @param shape   shape of the array
     * @return the new sparse array
     */
    public INDArray createSparseCOO(float[] values, int[][] indices, long[] shape) {
        return new JCusparseNDArrayCOO(values, indices, shape);
    }

    /**
     * Not supported by this factory for buffer-based values and indices.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray createSparseCOO(DataBuffer values, DataBuffer indices, long[] shape) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this factory.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray createSparseCOO(DataBuffer values, DataBuffer indices, DataBuffer sparseInformation, long[] shape) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this factory.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray createSparseCOO(DataBuffer values, DataBuffer indices, long[] sparseOffsets, int[] flags, int[] hiddenDimensions, int underlyingRank, long[] shape) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this factory.
     *
     * @throws UnsupportedOperationException always
     */
    public INDArray sortCooIndices(INDArray x) {
        throw new UnsupportedOperationException();
    }
}

