Skip to content

Commit bf82bf3

Browse files
authored
Fix a few type issues in multithreaded parsing (#944)
* Fix a few type issues in multithreaded parsing. Fixes #939.

  There ended up being a couple of issues here. In the originally posted issue, the "hang" was just due to a very slow code path where our `nonstandardtype` call was not correctly ignoring `NeedsTypeDetection`, which is obviously the most common type for columns when no column types are provided. The "hang" was due to the 60_000 columns all calling `nonstandardtype`, then `tupcat`, which is a pretty expensive dynamic tuple-type creation call. Fixing `nonstandardtype` to correctly ignore `NeedsTypeDetection` speeds up the case dramatically and avoids the `tupcat` call.

  The other issue, found once that "hang" was avoided, was that the new multithreaded detection code was ignoring the `typemap` keyword argument. This also fixes that case by piping `typemap` through to the `findrowstarts!` family of functions. Note that the new `downcast=true` keyword argument can be used instead of the original use of `typemap`.

* Fix 32-bit.
1 parent b747df8 commit bf82bf3

4 files changed

Lines changed: 15 additions & 6 deletions

File tree

src/context.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ end
614614
limit = Int(limit)
615615
limitposguess = ceil(Int, (limit / (origrowsguess * 0.8)) * len)
616616
newlen = [0, limitposguess, min(limitposguess * 2, len)]
617-
findrowstarts!(buf, options, newlen, ncols, columns, stringtype, downcast, 5)
617+
findrowstarts!(buf, options, newlen, ncols, columns, stringtype, typemap, downcast, 5)
618618
len = newlen[2] - 1
619619
origrowsguess = limit
620620
debug && println("limiting, adjusting len to $len")
@@ -625,7 +625,7 @@ end
625625
chunkpositions[i + 1] = i == 0 ? datapos : i == ntasks ? len : (datapos + chunksize * i)
626626
end
627627
debug && println("initial byte positions before adjusting for start of rows: $chunkpositions")
628-
avgbytesperrow, successfullychunked = findrowstarts!(buf, options, chunkpositions, ncols, columns, stringtype, downcast, rows_to_check)
628+
avgbytesperrow, successfullychunked = findrowstarts!(buf, options, chunkpositions, ncols, columns, stringtype, typemap, downcast, rows_to_check)
629629
if successfullychunked
630630
origbytesperrow = ((len - datapos) / origrowsguess)
631631
weightedavgbytesperrow = ceil(Int, avgbytesperrow * ((ntasks - 1) / ntasks) + origbytesperrow * (1 / ntasks))

src/detection.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ ColumnProperties(T) = ColumnProperties(T, 0x00)
336336
end
337337
end
338338

339-
function findchunkrowstart(ranges, i, buf, opts, downcast, ncols, rows_to_check, columns, origcoltypes, columnlock, @nospecialize(stringtype), totalbytes, totalrows, succeeded)
339+
function findchunkrowstart(ranges, i, buf, opts, typemap, downcast, ncols, rows_to_check, columns, origcoltypes, columnlock, @nospecialize(stringtype), totalbytes, totalrows, succeeded)
340340
pos = ranges[i]
341341
len = ranges[i + 1]
342342
while pos <= len
@@ -405,7 +405,7 @@ function findchunkrowstart(ranges, i, buf, opts, downcast, ncols, rows_to_check,
405405
if type === stringtype
406406
type = pickstringtype(stringtype, cp.maxstringsize)
407407
end
408-
col.type = type
408+
col.type = get(typemap, type, type)
409409
end
410410
end
411411
end
@@ -458,7 +458,7 @@ end
458458
# right # of expected columns then we move on to the next file chunk byte position. If we fail, we start over
459459
# at the byte position, assuming we were in a quoted field (and encountered a newline inside the quoted
460460
# field the first time through)
461-
function findrowstarts!(buf, opts, ranges, ncols, columns, @nospecialize(stringtype), downcast, rows_to_check=5)
461+
function findrowstarts!(buf, opts, ranges, ncols, columns, @nospecialize(stringtype), typemap, downcast, rows_to_check=5)
462462
totalbytes = Threads.Atomic{Int}(0)
463463
totalrows = Threads.Atomic{Int}(0)
464464
succeeded = Threads.Atomic{Bool}(true)
@@ -467,7 +467,7 @@ function findrowstarts!(buf, opts, ranges, ncols, columns, @nospecialize(stringt
467467
origcoltypes = Type[col.type for col in columns]
468468
@sync for i = 2:(length(ranges) - 1)
469469
Threads.@spawn begin
470-
findchunkrowstart(ranges, i, buf, opts, downcast, ncols, rows_to_check, columns, origcoltypes, lock, stringtype, totalbytes, totalrows, succeeded)
470+
findchunkrowstart(ranges, i, buf, opts, typemap, downcast, ncols, rows_to_check, columns, origcoltypes, lock, stringtype, totalbytes, totalrows, succeeded)
471471
end
472472
end
473473
return totalbytes[] / totalrows[], succeeded[]

src/utils.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ end
7272
@inline function nonstandardtype(T)
7373
T = nonmissingtype(T)
7474
if T === Union{} ||
75+
T === NeedsTypeDetection ||
7576
T isa StringTypes ||
7677
isinttype(T) ||
7778
T === Float16 ||

test/basics.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -735,4 +735,12 @@ f = CSV.File(IOBuffer(data); select=[2], type=Int32)
735735
@test length(f) == 2
736736
@test length(f.names) == 1
737737

738+
# 939
739+
row = join((i == 1 ? string(i + 10000000000) : i == 60_000 ? "0\n" : rand(("-1", "0", "1")) for i = 1:60_000), " ")
740+
data = repeat(row, 271);
741+
f = CSV.File(IOBuffer(data); header=false, types=Dict(1 => String), typemap=Dict(Int => Int8));
742+
@test f.types == [i == 1 ? String : Int8 for i = 1:60_000]
743+
f = CSV.File(IOBuffer(data); header=false, types=Dict(1 => String), downcast=true);
744+
@test f.types == [i == 1 ? String : Int8 for i = 1:60_000]
745+
738746
end

0 commit comments

Comments
 (0)