From 2703f0bf49edf2f30b27b3adca30b474c1b2d64b Mon Sep 17 00:00:00 2001 From: JohnLCaron Date: Mon, 28 Jul 2025 10:19:06 -0600 Subject: [PATCH] minor cleanup and class renaming. --- Readme.md | 22 ++++++- .../kotlin/com/sunya/cdm/api/Netchdf.kt | 9 +-- .../com/sunya/cdm/array/ArrayStructureData.kt | 5 -- .../kotlin/com/sunya/cdm/util/Math.kt | 3 - .../com/sunya/netchdf/NetchdfFileFormat.kt | 2 +- .../com/sunya/netchdf/hdf5/BTree1data.kt | 3 +- .../com/sunya/netchdf/hdf5/FractalHeap.kt | 1 - .../com/sunya/netchdf/hdf5/H5TypeInfo.kt | 1 + ...ncurrent.kt => H5readChunkedConcurrent.kt} | 2 +- .../{H5chunkReader.kt => H5readerChunked.kt} | 61 +------------------ ...{H5dataReader.kt => H5readerNonChunked.kt} | 0 .../kotlin/com/sunya/netchdf/hdf5/Hdf5File.kt | 7 ++- .../com/sunya/netchdf/netcdf3/N3builder.kt | 2 +- .../com/sunya/netchdf/netcdf4/Netcdf4.kt | 2 +- .../com/sunya/netchdf/hdf5/Btree1dataTest.kt | 2 +- .../kotlin/com/sunya/netchdf/CountVersions.kt | 3 + .../netchdf/hdf5/H5readConcurrentTest.kt | 2 +- 17 files changed, 40 insertions(+), 87 deletions(-) rename core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/{H5chunkConcurrent.kt => H5readChunkedConcurrent.kt} (98%) rename core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/{H5chunkReader.kt => H5readerChunked.kt} (67%) rename core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/{H5dataReader.kt => H5readerNonChunked.kt} (100%) diff --git a/Readme.md b/Readme.md index 2e13e5ea..68febf6d 100644 --- a/Readme.md +++ b/Readme.md @@ -1,5 +1,5 @@ # netchdf -_last updated: 7/27/2025_ +_last updated: 7/28/2025_ This is a rewrite in Kotlin of parts of the devcdm and netcdf-java libraries. @@ -33,6 +33,8 @@ Please contact me if you'd like to help out. 
Especially needed are test datasets * [Compare with HDF5 data model](#compare-with-hdf5-data-model) * [Compare with HDF4 data model](#compare-with-hdf4-data-model) * [Compare with HDF-EOS data model](#compare-with-hdf-eos-data-model) + * [Implementation Notes](#implementation-notes) + * [Netcdf4 vs HDF5](#netcdf4-vs-hdf5) * [Elevator blurb](#elevator-blurb) @@ -305,6 +307,24 @@ Please carefully check results if you have this kind of data, and send us sample * The _StructMetadata_ ODL is gathered and applied to the file header metadata as well as possible. Contact us with example files if you see something we are missing. +## Implementation Notes + +### Netcdf4 vs HDF5 + +All Netcdf4 files are HDF5, but not all HDF5 files are Netcdf4. We'd like to be able to detect when a file was written +using the Netcdf-4 library, but it's not always possible to tell for certain. If any of the following are true, we set +isNetcdf4 = true. + + 1. If a group or variable has an attribute with name "_NCProperties", "_Netcdf4Coordinates", "_Netcdf4Dimid" or "_nc3_strict". + 2. If a variable name starts with "_nc4_non_coord_". + 3. If a variable has an attribute named "DIMENSION_LIST" with type vlen of reference. + 4. If a dimension name starts with "This is a netCDF dimension but not a netCDF variable" + +Other than trying to identify which library wrote the file, Netchdf does not do any special processing for Netcdf4 files, +except: + + 1. When testing, use the Netcdf4 C library when comparing data and metadata. + ## Elevator blurb An independent implementation of HDF4/HDF5/HDF-EOS in Kotlin. 
diff --git a/core/src/commonMain/kotlin/com/sunya/cdm/api/Netchdf.kt b/core/src/commonMain/kotlin/com/sunya/cdm/api/Netchdf.kt index 8806c1b2..218c9dfe 100644 --- a/core/src/commonMain/kotlin/com/sunya/cdm/api/Netchdf.kt +++ b/core/src/commonMain/kotlin/com/sunya/cdm/api/Netchdf.kt @@ -17,7 +17,7 @@ interface Netchdf : AutoCloseable { // TODO I think the output type is not always the input type fun readArrayData(v2: Variable, wantSection: SectionPartial? = null) : ArrayTyped - // iterate over all the chunks in section, order is arbitrary. TODO where is intersection with wantSection done ?? + // iterate over all the chunks in section, order is arbitrary. fun chunkIterator(v2: Variable, wantSection: SectionPartial? = null, maxElements : Int? = null) : Iterator> // iterate over all the chunks in section, order is arbitrary, callbacks are in multiple threads. @@ -31,9 +31,4 @@ interface Netchdf : AutoCloseable { } // the section describes the array chunk reletive to the variable's shape. -data class ArraySection(val array : ArrayTyped, val chunkSection : Section) { - fun intersect(wantSection: SectionPartial) : ArrayTyped { - // TODO ?? - return array - } -} \ No newline at end of file +data class ArraySection(val array : ArrayTyped, val chunkSection : Section) \ No newline at end of file diff --git a/core/src/commonMain/kotlin/com/sunya/cdm/array/ArrayStructureData.kt b/core/src/commonMain/kotlin/com/sunya/cdm/array/ArrayStructureData.kt index 12f4dc1a..88049f64 100644 --- a/core/src/commonMain/kotlin/com/sunya/cdm/array/ArrayStructureData.kt +++ b/core/src/commonMain/kotlin/com/sunya/cdm/array/ArrayStructureData.kt @@ -22,13 +22,8 @@ class ArrayStructureData(shape : IntArray, val ba : ByteArray, val isBE: Boolean } private val heap = mutableMapOf() - // private var heapIndex = 0 internal fun putOnHeap(offset: Int, value: Any) { heap[offset] = value - // ba.putInt(offset, heapIndex) // TODO clobber the ByteArray ?? 
Or just use the byte pos, which is unique - //val result = heapIndex - // heapIndex++ - // return result } internal fun getFromHeap(offset: Int): Any? { diff --git a/core/src/commonMain/kotlin/com/sunya/cdm/util/Math.kt b/core/src/commonMain/kotlin/com/sunya/cdm/util/Math.kt index 8404e39d..06cf3665 100644 --- a/core/src/commonMain/kotlin/com/sunya/cdm/util/Math.kt +++ b/core/src/commonMain/kotlin/com/sunya/cdm/util/Math.kt @@ -129,9 +129,6 @@ fun unsignedByteToShort(b: Byte): Short { */ //////////////////////////////////////////////////////////////////////// -// TODO -// doubleIsNearlyEqual() doublesAreNearlyEqual - const val defaultMaxRelativeDiffFloat = 1.0e-5f /** The default maximum relative difference for floats, when comparing as doubles. */ diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/NetchdfFileFormat.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/NetchdfFileFormat.kt index 76dddd33..70ab2276 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/NetchdfFileFormat.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/NetchdfFileFormat.kt @@ -107,7 +107,7 @@ enum class NetchdfFileFormat(private val version: Int, private val formatName: S NC_FORMAT_64BIT_OFFSET(2, "netcdf-3 64bit-offset"), NC_FORMAT_NETCDF4(3, "NetCDF-4"), // This is really just HDF-5, dont know yet if its written by netcdf4. 
NC_FORMAT_NETCDF4_CLASSIC(4, "netcdf-4 classic"), // psuedo format I think - NC_FORMAT_64BIT_DATA(5, "netcdf-5"), // TODO support this; need test files + NC_FORMAT_64BIT_DATA(5, "netcdf-5"), // we have one test file: ../core/src/commonTest/data/jays_DOMAIN000.nc HDF5(5, "hdf5"), // not written by netcdf C library HDF4(6, "hdf4"); // not written by netcdf C library diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/BTree1data.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/BTree1data.kt index 2cf40047..9a9ce0b4 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/BTree1data.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/BTree1data.kt @@ -24,7 +24,6 @@ internal class BTree1data( rootNode = BTreeNode(rootNodeAddress, null) } - // if other layouts like BTree2data had this interface we could use in chunkConcurrent override fun asSequence(): Sequence = sequence { repeat( tiling.nelems) { yield(findDataChunk(it) ?: missingDataChunk(it, tiling)) @@ -33,6 +32,8 @@ internal class BTree1data( fun chunkIterator(): Iterator = asSequence().iterator() + fun countChunks() = asSequence().count() + internal fun findDataChunk(order: Int): DataChunk? 
{ return rootNode.findDataChunk(order) } diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/FractalHeap.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/FractalHeap.kt index f6a9105e..01e595f8 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/FractalHeap.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/FractalHeap.kt @@ -186,7 +186,6 @@ internal class FractalHeap(private val h5: H5builder, forWho: String, address: L return record1.hugeObjectAddress } - // 3, 4 -> return offset.toLong() // TODO only a guess else -> throw RuntimeException("Unknown DHeapId subtype =$subtype") } } diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5TypeInfo.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5TypeInfo.kt index a9b94099..ae89d7af 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5TypeInfo.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5TypeInfo.kt @@ -57,6 +57,7 @@ internal data class H5TypeInfo(val isVlenString: Boolean, val isRefObject : Bool Datatype5.Floating -> when (this.elemSize) { + // 2 -> "half float" see jhdf 4 -> Datatype.FLOAT 8 -> Datatype.DOUBLE else -> throw RuntimeException("Bad hdf5 float type with size= ${this.elemSize}") diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5chunkConcurrent.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5readChunkedConcurrent.kt similarity index 98% rename from core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5chunkConcurrent.kt rename to core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5readChunkedConcurrent.kt index 26232fa2..a4cf9586 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5chunkConcurrent.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5readChunkedConcurrent.kt @@ -27,7 +27,7 @@ import kotlinx.coroutines.runBlocking import kotlinx.coroutines.yield @ExperimentalCoroutinesApi -class H5chunkConcurrent(val h5: H5builder, val v2: Variable, wantSection: SectionPartial?, ) { 
+class H5readChunkedConcurrent(val h5: H5builder, val v2: Variable, wantSection: SectionPartial?, ) { val rafext: OpenFileExtended = h5.makeFileExtended() val varShape = v2.shape diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5chunkReader.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5readerChunked.kt similarity index 67% rename from core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5chunkReader.kt rename to core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5readerChunked.kt index c912934e..f8e457a1 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5chunkReader.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5readerChunked.kt @@ -27,14 +27,10 @@ internal fun H5builder.readChunkedData(v2: Variable, wantSection: Section throw RuntimeException("Illegal nbytes to read = $sizeBytes") } val ba = ByteArray(sizeBytes.toInt()) - - // just reading into memory the entire index for now - // val index = BTree2j(h5, v2.name, vinfo.dataPos, vinfo.storageDims) - val filters = FilterPipeline(v2.name, vinfo.mfp, vinfo.h5type.isBE) val state = OpenFileState(0L, vinfo.h5type.isBE) - // just run through all the chunks, we wont read any that we dont need + // run through all the chunks, we wont read any that we dont need for (dataChunk: DataChunk in index) { val dataSection = IndexSpace(v2.rank, dataChunk.offsets.toLongArray(), vinfo.storageDims) val chunker = Chunker(dataSection, wantSpace) // each DataChunkEntry has its own Chunker iteration @@ -62,60 +58,6 @@ internal fun H5builder.readChunkedData(v2: Variable, wantSection: Section } } -/* DataLayoutBTreeVer1 -internal fun H5builder.readBtree1data(v2: Variable, wantSection: Section): ArrayTyped { - val vinfo = v2.spObject as DataContainerVariable - val h5type = vinfo.h5type - - val elemSize = vinfo.storageDims[vinfo.storageDims.size - 1].toInt() // last one is always the elements size - val datatype = vinfo.h5type.datatype() - - val wantSpace = IndexSpace(wantSection) - val 
sizeBytes = wantSpace.totalElements * elemSize - if (sizeBytes <= 0 || sizeBytes >= Int.MAX_VALUE) { - throw RuntimeException("Illegal nbytes to read = $sizeBytes") - } - val ba = ByteArray(sizeBytes.toInt()) - - val btree1 = if (vinfo.mdl is DataLayoutBTreeVer1) { - val rafext: OpenFileExtended = this.openNewFileExtended() - BTree1data(rafext, vinfo.dataPos, v2.shape, vinfo.storageDims) - } else { - throw RuntimeException("Unsupported mdl ${vinfo.mdl}") - } - - //val tiledData = H5TiledData1(btree1, v2.shape, vinfo.storageDims) - val filters = FilterPipeline(v2.name, vinfo.mfp, vinfo.h5type.isBE) - //if (debugChunking) println(" readChunkedData tiles=${tiledData.tiling}") - - var transferChunks = 0 - val state = OpenFileState(0L, vinfo.h5type.isBE) - btree1.asSequence().forEach { dataChunk -> - val dataSection = IndexSpace(v2.rank, dataChunk.offsets.toLongArray(), vinfo.storageDims) - val chunker = Chunker(dataSection, wantSpace) // each DataChunkEntry has its own Chunker iteration - if (dataChunk.isMissing()) { - if (debugChunking) println(" missing ${dataChunk.show()}") - chunker.transferMissing(vinfo.fillValue, elemSize, ba) - } else { - if (debugChunking) println(" chunk=${dataChunk.show()}") - state.pos = dataChunk.address - val chunkData = this.raf.readByteArray(state, dataChunk.size) - val filteredData = if (dataChunk.filterMask == null) chunkData - else filters.apply(chunkData, dataChunk.filterMask) - chunker.transferBA(filteredData, 0, elemSize, ba, 0) - transferChunks += chunker.transferChunks - } - } - - val shape = wantSpace.shape.toIntArray() - - return if (h5type.datatype5 == Datatype5.Vlen) { - this.processVlenIntoArray(h5type, shape, ba, wantSpace.totalElements.toInt(), elemSize) - } else { - this.processDataIntoArray(ba, vinfo.h5type.isBE, datatype, shape, h5type, elemSize) as ArrayTyped - } -} */ - internal fun readChunkedDataWithIterator(hdf5: Hdf5File, v2: Variable, wantSection: SectionPartial?): ArrayTyped { val vinfo = v2.spObject as 
DataContainerVariable val datatype = vinfo.h5type.datatype() @@ -149,7 +91,6 @@ internal fun readChunkedDataWithIterator(hdf5: Hdf5File, v2: Variable, wa val dataSection = IndexSpace(dataChunk.chunkSection) val chunker = Chunker(dataSection, wantSpace) // each DataChunkEntry has its own Chunker iteration chunker.forEach { - // println(it) dataChunk.array.transfer(values, it) } } diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5dataReader.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5readerNonChunked.kt similarity index 100% rename from core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5dataReader.kt rename to core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/H5readerNonChunked.kt diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/Hdf5File.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/Hdf5File.kt index 383925ac..59c168fa 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/Hdf5File.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/hdf5/Hdf5File.kt @@ -51,7 +51,7 @@ class Hdf5File(val filename : String, strict : Boolean = false) : Netchdf { return readArrayData(v2, wantSection, recurse = false) } - fun readArrayData(v2: Variable, wantSection: SectionPartial?, recurse: Boolean): ArrayTyped { + fun readArrayData(v2: Variable, wantSection: SectionPartial?, recurse: Boolean, countChunks: Boolean = false): ArrayTyped { if (v2.nelems == 0L) { return ArrayEmpty(v2.shape.toIntArray(), v2.datatype) } @@ -87,6 +87,7 @@ class Hdf5File(val filename : String, strict : Boolean = false) : Netchdf { v2.datatype == Datatype.STRING || v2.datatype == Datatype.VLEN)) { val btree1 = BTree1data(header.makeFileExtended(), vinfo.dataPos, v2.shape, vinfo.storageDims) + if (countChunks) println(" nchunks = ${btree1.countChunks()}") header.readChunkedData(v2, section, btree1.chunkIterator()) // header.readBtree1data(v2, section) } else { @@ -160,7 +161,7 @@ class Hdf5File(val filename : String, strict : Boolean = false) : Netchdf { 
} class H5chunkIterator2(hdfFile: Hdf5File, val v2: Variable, val wantSection: SectionPartial?): AbstractIterator>() { - val reader = H5chunkConcurrent(hdfFile.header, v2, wantSection) + val reader = H5readChunkedConcurrent(hdfFile.header, v2, wantSection) val nthreads = hdfFile.useNThreads() val deque = Deque>(10) @@ -184,7 +185,7 @@ class Hdf5File(val filename : String, strict : Boolean = false) : Netchdf { override fun readChunksConcurrent(v2: Variable, lamda : (ArraySection) -> Unit, done : () -> Unit, wantSection: SectionPartial?, nthreads: Int?) { - val reader = H5chunkConcurrent(header, v2, wantSection) + val reader = H5readChunkedConcurrent(header, v2, wantSection) val availableProcessors = this.useNThreads() // println("availableProcessors = $availableProcessors") reader.readChunks(nthreads ?: availableProcessors, lamda, done = { done() }) diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/netcdf3/N3builder.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/netcdf3/N3builder.kt index 879ee6c5..577c403d 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/netcdf3/N3builder.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/netcdf3/N3builder.kt @@ -264,7 +264,7 @@ internal class N3header(rafOrg: OpenFileIF, val root: Group.Builder) { nelems } Datatype.CHAR -> { - // a CHAR is made into a String with UTF8 assumed. TODO make this settable ?? + // a CHAR is made into a String with UTF8 assumed. 
attBuilder.setValue(raf.readString(filePos, nelems)) nelems } diff --git a/core/src/commonMain/kotlin/com/sunya/netchdf/netcdf4/Netcdf4.kt b/core/src/commonMain/kotlin/com/sunya/netchdf/netcdf4/Netcdf4.kt index b69ae53b..fe3672b1 100644 --- a/core/src/commonMain/kotlin/com/sunya/netchdf/netcdf4/Netcdf4.kt +++ b/core/src/commonMain/kotlin/com/sunya/netchdf/netcdf4/Netcdf4.kt @@ -48,7 +48,7 @@ object Netcdf4 { val NETCDF4_SPECIAL_ATTS = listOf(NCPROPERTIES, NETCDF4_COORDINATES, NETCDF4_STRICT, NETCDF4_DIMID) - // appended to variable when it conflicts with dimension scale + // prepended to variable when it conflicts with dimension scale const val NETCDF4_NON_COORD = "_nc4_non_coord_" const val NETCDF4_NOT_VARIABLE = "This is a netCDF dimension but not a netCDF variable" diff --git a/core/src/commonTest/kotlin/com/sunya/netchdf/hdf5/Btree1dataTest.kt b/core/src/commonTest/kotlin/com/sunya/netchdf/hdf5/Btree1dataTest.kt index d890cd63..e45fab9f 100644 --- a/core/src/commonTest/kotlin/com/sunya/netchdf/hdf5/Btree1dataTest.kt +++ b/core/src/commonTest/kotlin/com/sunya/netchdf/hdf5/Btree1dataTest.kt @@ -60,7 +60,7 @@ class Btree1dataTest { for (nthreads in listOf(1, 2, 4, 8, 10, 16, 20, 24, 32, 40, 48)) { val time = measureNanoTime { // fun readChunks(nthreads: Int, lamda: (ArraySection<*>) -> Unit, done: () -> Unit) { - val reader = H5chunkConcurrent(myfile.header, myvar, null) + val reader = H5readChunkedConcurrent(myfile.header, myvar, null) reader.readChunks(nthreads, lamda = { asect: ArraySection<*> -> // println(" section = ${asect.chunkSection}") }, { }, ) diff --git a/testfiles/src/test/kotlin/com/sunya/netchdf/CountVersions.kt b/testfiles/src/test/kotlin/com/sunya/netchdf/CountVersions.kt index 2e6e54d3..42ed6862 100644 --- a/testfiles/src/test/kotlin/com/sunya/netchdf/CountVersions.kt +++ b/testfiles/src/test/kotlin/com/sunya/netchdf/CountVersions.kt @@ -68,6 +68,9 @@ class CountVersions { } else { val paths = versions.getOrPut(ncfile.type()) { mutableListOf() } 
paths.add(filename) + if (ncfile.type() == "netcdf3.5") + println("ncfile.type() file=$filename ") + } } } catch (e: Throwable) { diff --git a/testfiles/src/test/kotlin/com/sunya/netchdf/hdf5/H5readConcurrentTest.kt b/testfiles/src/test/kotlin/com/sunya/netchdf/hdf5/H5readConcurrentTest.kt index c94b714f..9c93a95e 100644 --- a/testfiles/src/test/kotlin/com/sunya/netchdf/hdf5/H5readConcurrentTest.kt +++ b/testfiles/src/test/kotlin/com/sunya/netchdf/hdf5/H5readConcurrentTest.kt @@ -66,7 +66,7 @@ class H5readConcurrentTest { for (nthreads in listOf(1, 2, 4, 8, 10, 16, 20, 24, 32, 40, 48)) { myfile.useNThreads = nthreads val time = measureNanoTime { - myfile.readArrayData(myvar) + myfile.readArrayData(myvar) // , null, recurse = true, countChunks = (nthreads == 1)) } println("$nthreads, ${time * nano}") val map1 = timing.getOrPut(nthreads) { mutableMapOf() }