From 3a9c01e782a0c214ecc3a2d30d7f8addeea33902 Mon Sep 17 00:00:00 2001 From: Lee Iverson Date: Mon, 31 Jul 2023 10:30:21 -0700 Subject: [PATCH 1/2] Implement new dupcol keyword that indicates what to do with duplicate columns in joins and DataFrame constructors --- src/abstractdataframe/abstractdataframe.jl | 125 ++++++++++------- src/abstractdataframe/reshape.jl | 16 +-- src/dataframe/dataframe.jl | 97 ++++++++----- src/join/composer.jl | 155 +++++++++++++-------- src/join/inplace.jl | 52 +++++-- src/other/index.jl | 51 ++++--- src/other/metadata.jl | 18 +++ src/other/tables.jl | 4 +- src/other/utils.jl | 46 ++++-- test/cat.jl | 40 ++++-- test/dataframe.jl | 35 ++++- test/join.jl | 77 +++++++++- 12 files changed, 502 insertions(+), 214 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a40627c6a4..b6a6d9e6e5 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -117,9 +117,9 @@ Compat.hasproperty(df::AbstractDataFrame, s::AbstractString) = haskey(index(df), """ rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) + makeunique::Bool=false, dupcol::Symbol=:error) rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) + makeunique::Bool=false, dupcol::Symbol=:error) rename!(df::AbstractDataFrame, (from => to)::Pair...) rename!(df::AbstractDataFrame, d::AbstractDict) rename!(df::AbstractDataFrame, d::AbstractVector{<:Pair}) @@ -179,9 +179,9 @@ julia> rename!(df, [:a, :b, :c]) 1 │ 1 2 3 julia> rename!(df, [:a, :b, :a]) -ERROR: ArgumentError: Duplicate variable names: :a. Pass makeunique=true to make them unique using a suffix automatically. +ERROR: ArgumentError: Duplicate variable names: :a. Pass dupcol=:makeunique to make them unique using a suffix automatically. 
-julia> rename!(df, [:a, :b, :a], makeunique=true) +julia> rename!(df, [:a, :b, :a], dupcol=:makeunique) 1×3 DataFrame Row │ a b a_1 │ Int64 Int64 Int64 @@ -197,16 +197,16 @@ julia> rename!(uppercase, df) ``` """ function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) - rename!(index(df), vals, makeunique=makeunique) + makeunique::Bool=false, dupcol::Symbol=:error) + rename!(index(df), vals, makeunique=makeunique, dupcol=dupcol) # renaming columns of SubDataFrame has to clean non-note metadata in its parent _drop_all_nonnote_metadata!(parent(df)) return df end function rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) - rename!(index(df), Symbol.(vals), makeunique=makeunique) + makeunique::Bool=false, dupcol::Symbol=:error) + rename!(index(df), Symbol.(vals), makeunique=makeunique, dupcol=dupcol) # renaming columns of SubDataFrame has to clean non-note metadata in its parent _drop_all_nonnote_metadata!(parent(df)) return df @@ -353,9 +353,9 @@ julia> rename(uppercase, df) ``` """ rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) + makeunique::Bool=false, dupcol::Symbol=:error) = rename!(copy(df), vals, makeunique=makeunique, dupcol=dupcol) rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) + makeunique::Bool=false, dupcol::Symbol=:error) = rename!(copy(df), vals, makeunique=makeunique, dupcol=dupcol) rename(df::AbstractDataFrame, args...) = rename!(copy(df), args...) rename(f::Function, df::AbstractDataFrame) = rename!(f, copy(df)) @@ -1536,13 +1536,20 @@ end """ hcat(df::AbstractDataFrame...; - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) Horizontally concatenate data frames. 
If `makeunique=false` (the default) column names of passed objects must be unique. If `makeunique=true` then duplicate column names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +`makeunique` is deprecated in favor of `dupcol`. + +If `dupcol=:error` (the default) then column names of passed objects must be unique. +If `dupcol=:makeunique` then duplicate column names will be suffixed +with `_i` (`i` starting at 1 for the first duplicate). +If `dupcol=:update` then duplicate column names will be combined with the left-hand +column overwritten by non-missing values from the right-hand column(s). If `copycols=true` (the default) then the `DataFrame` returned by `hcat` will contain copied columns from the source data frames. @@ -1575,7 +1582,7 @@ julia> df2 = DataFrame(A=4:6, B=4:6) 2 │ 5 5 3 │ 6 6 -julia> df3 = hcat(df1, df2, makeunique=true) +julia> df3 = hcat(df1, df2, dupcol=:makeunique) 3×4 DataFrame Row │ A B A_1 B_1 │ Int64 Int64 Int64 Int64 @@ -1587,32 +1594,32 @@ julia> df3 = hcat(df1, df2, makeunique=true) julia> df3.A === df1.A false -julia> df3 = hcat(df1, df2, makeunique=true, copycols=false); +julia> df3 = hcat(df1, df2, dupcol=:makeunique, copycols=false); julia> df3.A === df1.A true ``` """ -function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) +function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) df = DataFrame(df, copycols=copycols) _drop_all_nonnote_metadata!(df) return df end # TODO: after deprecation remove AbstractVector methods -Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) = - hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, copycols=copycols) -Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) = - hcat!(x, df, makeunique=makeunique, copycols=copycols) +Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, 
dupcol::Symbol=:error, copycols::Bool=true) = + hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, dupcol=dupcol, copycols=copycols) +Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + hcat!(x, df, makeunique=makeunique, dupcol=dupcol, copycols=copycols) Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, copycols::Bool=true) = + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = hcat!(DataFrame(df1, copycols=copycols), df2, - makeunique=makeunique, copycols=copycols) + makeunique=makeunique, dupcol=dupcol, copycols=copycols) Base.hcat(df::AbstractDataFrame, x::Union{AbstractVector, AbstractDataFrame}, y::Union{AbstractVector, AbstractDataFrame}...; - makeunique::Bool=false, copycols::Bool=true) = - hcat!(hcat(df, x, makeunique=makeunique, copycols=copycols), y..., - makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + hcat!(hcat(df, x, makeunique=makeunique, dupcol=dupcol, copycols=copycols), y..., + makeunique=makeunique, dupcol=dupcol, copycols=copycols) """ vcat(dfs::AbstractDataFrame...; @@ -2870,6 +2877,10 @@ const INSERTCOLS_ARGUMENTS = - `makeunique` : defines what to do if `name` already exists in `df`; if it is `false` an error will be thrown; if it is `true` a new unique name will be generated by adding a suffix + - `dupcol` : defines what to do if `name` already exists in `df`; + if it is `:error` an error will be thrown; if it is `:makeunique` a new unique name will + be generated by adding a suffix; if it is `:update` then the existing column will be + updated with the non-missing values of the inserted column - `copycols` : whether vectors passed as columns should be copied If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. 
@@ -2891,7 +2902,7 @@ const INSERTCOLS_ARGUMENTS = """ insertcols(df::AbstractDataFrame[, col], (name=>val)::Pair...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) Insert a column into a copy of `df` data frame using the [`insertcols!`](@ref) function and return the newly created data frame. @@ -2922,7 +2933,7 @@ julia> insertcols(df, 1, :b => 'a':'c') 2 │ b 2 3 │ c 3 -julia> insertcols(df, :c => 2:4, :c => 3:5, makeunique=true) +julia> insertcols(df, :c => 2:4, :c => 3:5, dupcol=:makeunique) 3×3 DataFrame Row │ a c c_1 │ Int64 Int64 Int64 @@ -2942,13 +2953,13 @@ julia> insertcols(df, :a, :d => 7:9, after=true) ``` """ insertcols(df::AbstractDataFrame, args...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = insertcols!(copy(df), args...; - after=after, makeunique=makeunique, copycols=copycols) + after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols) """ insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) Insert a column into a data frame in place. Return the updated data frame. 
@@ -2979,7 +2990,7 @@ julia> insertcols!(df, 1, :b => 'a':'c') 2 │ b 2 3 │ c 3 -julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) +julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, dupcol=:makeunique) 3×4 DataFrame Row │ b c c_1 a │ Char Int64 Int64 Int64 @@ -2999,7 +3010,10 @@ julia> insertcols!(df, :b, :d => 7:9, after=true) ``` """ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + + dupcol = _dupcol(dupcol, makeunique) + if !is_column_insertion_allowed(df) throw(ArgumentError("insertcols! is only supported for DataFrame, or for " * "SubDataFrame created with `:` as column selector")) @@ -3025,15 +3039,15 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy "$(ncol(df)) columns at index $col_ind")) end - if !makeunique + if dupcol == :error if !allunique(first.(name_cols)) throw(ArgumentError("Names of columns to be inserted into a data frame " * - "must be unique when `makeunique=true`")) + "must be unique when `dupcol=:error`")) end for (n, _) in name_cols if hasproperty(df, n) throw(ArgumentError("Column $n is already present in the data frame " * - "which is not allowed when `makeunique=true`")) + "which is not allowed when `dupcol=:error`")) end end end @@ -3103,19 +3117,28 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy dfp[!, name] = item_new else if hasproperty(dfp, name) - @assert makeunique - k = 1 - while true - nn = Symbol("$(name)_$k") - if !hasproperty(dfp, nn) - name = nn - break + if dupcol == :makeunique + k = 1 + while true + nn = Symbol("$(name)_$k") + if !hasproperty(dfp, nn) + name = nn + break + end + k += 1 end - k += 1 + insert!(index(dfp), col_ind, name) + insert!(_columns(dfp), col_ind, item_new) + else + @assert dupcol == :update + # Just update without adding to index 
+ dfp[!, name] = _update_missing.(dfp[!, name], item_new) + col_ind -= 1 end + else + insert!(index(dfp), col_ind, name) + insert!(_columns(dfp), col_ind, item_new) end - insert!(index(dfp), col_ind, name) - insert!(_columns(dfp), col_ind, item_new) end col_ind += 1 end @@ -3134,22 +3157,22 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy end insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)..., - after=after, makeunique=makeunique, copycols=copycols) + after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols) insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = insertcols!(df, ncol(df)+1, name_cols..., after=after, - makeunique=makeunique, copycols=copycols) + makeunique=makeunique, dupcol=dupcol, copycols=copycols) insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)..., - after=after, makeunique=makeunique, copycols=copycols) + after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols) function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false, - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) if col isa SymbolOrString col_ind = Int(columnindex(df, col)) if col_ind == 0 @@ -3173,7 +3196,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex; 
after::Bool=false, end function insertcols!(df::AbstractDataFrame; after::Bool=false, - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) _drop_all_nonnote_metadata!(parent(df)) return df end diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 2effb6f2fd..3da3ce0912 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -823,7 +823,7 @@ julia> permutedims(df2, 1, "different_name") """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, dest_namescol::Union{Symbol, AbstractString}; - makeunique::Bool=false, strict::Bool=true) + makeunique::Bool=false, dupcol::Symbol=:error, strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) @@ -854,18 +854,18 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, if ncol(df_notsrc) == 0 df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], new_col_names, - makeunique=makeunique, copycols=false) + makeunique=makeunique, dupcol=dupcol, copycols=false) else m = permutedims(Matrix(df_notsrc)) - df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique) + df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique, dupcol=dupcol) end - out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false) + out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, dupcol=dupcol, copycols=false) _copy_table_note_metadata!(out_df, df) return out_df end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; - makeunique::Bool=false, strict::Bool=true) + makeunique::Bool=false, dupcol::Symbol=:error, strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) dest_namescol = _names(df)[src_namescol] @@ -873,7 +873,7 @@ function 
Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; dest_namescol = src_namescol end return permutedims(df, src_namescol, dest_namescol; - makeunique=makeunique, strict=strict) + makeunique=makeunique, dupcol=dupcol, strict=strict) end function Base.permutedims(df::AbstractDataFrame) @@ -883,8 +883,8 @@ function Base.permutedims(df::AbstractDataFrame) end function Base.permutedims(df::AbstractDataFrame, cnames::AbstractVector; - makeunique::Bool=false) - out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique) + makeunique::Bool=false, dupcol::Symbol=:error) + out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique, dupcol=dupcol) _copy_table_note_metadata!(out_df, df) return out_df end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 3f4afafecf..f76bcd4db7 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -8,16 +8,16 @@ particularly a `Vector`, `PooledVector` or `CategoricalVector`. # Constructors ```julia -DataFrame(pairs::Pair...; makeunique::Bool=false, copycols::Bool=true) -DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, copycols::Bool=true) +DataFrame(pairs::Pair...; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) +DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) DataFrame(ds::AbstractDict; copycols::Bool=true) DataFrame(; kwargs..., copycols::Bool=true) DataFrame(table; copycols::Union{Bool, Nothing}=nothing) DataFrame(table, names::AbstractVector; - makeunique::Bool=false, copycols::Union{Bool, Nothing}=nothing) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Union{Bool, Nothing}=nothing) DataFrame(columns::AbstractVecOrMat, names::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) DataFrame(::DataFrameRow; copycols::Bool=true) DataFrame(::GroupedDataFrame; 
copycols::Bool=true, keepkeys::Bool=true) @@ -35,6 +35,7 @@ DataFrame(::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true) To force a copy in such cases, or to get mutable columns from an immutable input table (like `Arrow.Table`), pass `copycols=true` explicitly. - `makeunique` : if `false` (the default), an error will be raised +- `dupcol` : one of `:error` (the default), `:makeunique` (same as `makeunique=true`), or `:update` (note that not all constructors support these keyword arguments) @@ -84,10 +85,14 @@ Pass the `copycols=false` keyword argument (where supported) to reuse vectors wi copying them. By default an error will be raised if duplicates in column names are found. Pass -`makeunique=true` keyword argument (where supported) to accept duplicate names, +`makeunique=true` keyword argument or `dupcol=:makeunique` (where supported) to accept duplicate names, in which case they will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +If duplicate column names are found and `dupcol=:update` then the left-hand column is updated +with values from the right-hand column (i.e. non-missing values in the right-hand column will +overwrite values in the left-hand column). + If an `AbstractRange` is passed to a `DataFrame` constructor as a column it is always collected to a `Vector` (even if `copycols=false`). 
As a general rule `AbstractRange` values are always materialized to a `Vector` by all functions in @@ -194,7 +199,7 @@ mutable struct DataFrame <: AbstractDataFrame colindex::Index; copycols::Bool=true) if length(columns) == length(colindex) == 0 return new(AbstractVector[], Index(), nothing, nothing, true) - elseif length(columns) != length(colindex) + elseif length(columns) != column_length(colindex) throw(DimensionMismatch("Number of columns ($(length(columns))) and number of " * "column names ($(length(colindex))) are not equal")) end @@ -232,6 +237,22 @@ mutable struct DataFrame <: AbstractDataFrame firstindex(col) != 1 && _onebased_check_error(i, col) end + # process updates if they exist + if !isempty(colindex.updates) + updated = Vector{Any}(nothing, length(colindex.names)) + for src in eachindex(colindex.updates) + name = colindex.updates[src] + dst = colindex.lookup[name] + if isnothing(updated[dst]) + updated[dst] = columns[src] + else + updated[dst] = _update_missing.(updated[dst], columns[src]) + end + end + columns = updated + colindex = Index(colindex.lookup, colindex.names) + end + return new(convert(Vector{AbstractVector}, columns), colindex, nothing, nothing, true) end end @@ -254,24 +275,27 @@ end DataFrame(df::DataFrame; copycols::Bool=true) = copy(df, copycols=copycols) -function DataFrame(pairs::Pair{Symbol, <:Any}...; makeunique::Bool=false, +function DataFrame(pairs::Pair{Symbol, <:Any}...; + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)::DataFrame colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, Index(colnames, makeunique=makeunique), + return DataFrame(columns, Index(colnames, dupcol=_dupcol(dupcol, makeunique)), copycols=copycols) end -function DataFrame(pairs::Pair{<:AbstractString, <:Any}...; makeunique::Bool=false, +function DataFrame(pairs::Pair{<:AbstractString, <:Any}...; + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)::DataFrame 
colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, Index(colnames, makeunique=makeunique), + return DataFrame(columns, Index(colnames, dupcol=_dupcol(dupcol, makeunique)), copycols=copycols) end # this is needed as a workaround for Tables.jl dispatch -function DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, +function DataFrame(pairs::AbstractVector{<:Pair}; + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) if isempty(pairs) return DataFrame() @@ -281,7 +305,7 @@ function DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, end colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, Index(colnames, makeunique=makeunique), + return DataFrame(columns, Index(colnames, dupcol=_dupcol(dupcol, makeunique)), copycols=copycols) end end @@ -334,12 +358,13 @@ function DataFrame(; kwargs...) end function DataFrame(columns::AbstractVector, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true)::DataFrame + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)::DataFrame + dupcol = _dupcol(dupcol, makeunique) if !(eltype(columns) <: AbstractVector) && !all(col -> isa(col, AbstractVector), columns) - return rename!(DataFrame(columns, copycols=copycols), cnames, makeunique=makeunique) + return rename!(DataFrame(columns, copycols=copycols), cnames, dupcol=dupcol) end return DataFrame(collect(AbstractVector, columns), - Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), + Index(convert(Vector{Symbol}, cnames), dupcol=dupcol), copycols=copycols) end @@ -351,18 +376,18 @@ function _name2symbol(str::AbstractVector) end DataFrame(columns::AbstractVector, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames), makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + 
DataFrame(columns, _name2symbol(cnames), dupcol=_dupcol(dupcol, makeunique), copycols=copycols) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true)::DataFrame = + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)::DataFrame = DataFrame(collect(AbstractVector, columns), - Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), + Index(convert(Vector{Symbol}, cnames), dupcol=_dupcol(dupcol, makeunique)), copycols=copycols) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames); dupcol=_dupcol(dupcol, makeunique), copycols=copycols) function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) if cnames !== :auto @@ -375,15 +400,15 @@ function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) end function DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) getter = copycols ? 
getindex : view return DataFrame(AbstractVector[getter(columns, :, i) for i in 1:size(columns, 2)], - cnames, makeunique=makeunique, copycols=false) + cnames, dupcol=_dupcol(dupcol, makeunique), copycols=false) end DataFrame(columns::AbstractMatrix, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames); dupcol=_dupcol(dupcol, makeunique), copycols=copycols) function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true) if cnames !== :auto @@ -392,7 +417,7 @@ function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true) "positional argument is passed then the second " * "argument must be a vector of column names or :auto")) end - return DataFrame(columns, gennames(size(columns, 2)), makeunique=false, copycols=copycols) + return DataFrame(columns, gennames(size(columns, 2)), dupcol=:error, copycols=copycols) end # Discontinued constructors @@ -1202,8 +1227,8 @@ end # hcat! for 2 arguments, only a vector or a data frame is allowed function hcat!(df1::DataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, copycols::Bool=true) - u = add_names(index(df1), index(df2), makeunique=makeunique) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + u = add_names(index(df1), index(df2), dupcol=_dupcol(dupcol, makeunique)) _drop_all_nonnote_metadata!(df1) _keep_matching_table_note_metadata!(df1, df2) @@ -1217,31 +1242,31 @@ end # TODO: after deprecation remove AbstractVector methods -function hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) +function hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) Base.depwarn("horizontal concatenation of data frame with a vector is deprecated. 
" * "Pass DataFrame(x1=x) instead.", :hcat!) return hcat!(df, DataFrame(AbstractVector[x], [:x1], copycols=false), - makeunique=makeunique, copycols=copycols) + dupcol=_dupcol(dupcol, makeunique), copycols=copycols) end -function hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=false, copycols::Bool=true) +function hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=false, dupcol::Symbol=:error,copycols::Bool=true) Base.depwarn("horizontal concatenation of data frame with a vector is deprecated. " * "Pass DataFrame(x1=x) instead.", :hcat!) return hcat!(DataFrame(AbstractVector[x], [:x1], copycols=copycols), df, - makeunique=makeunique, copycols=copycols) + dupcol=_dupcol(dupcol, makeunique), copycols=copycols) end # hcat! for 1-n arguments -function hcat!(df::DataFrame; makeunique::Bool=false, copycols::Bool=true) +function hcat!(df::DataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) _drop_all_nonnote_metadata!(df) return df end hcat!(a::DataFrame, b::Union{AbstractDataFrame, AbstractVector}, c::Union{AbstractDataFrame, AbstractVector}...; - makeunique::Bool=false, copycols::Bool=true) = - hcat!(hcat!(a, b, makeunique=makeunique, copycols=copycols), - c..., makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + hcat!(hcat!(a, b, dupcol=_dupcol(dupcol, makeunique), copycols=copycols), + c..., dupcol=_dupcol(dupcol, makeunique), copycols=copycols) ############################################################################## ## diff --git a/src/join/composer.jl b/src/join/composer.jl index 3cd4e90b3e..48fd39d820 100644 --- a/src/join/composer.jl +++ b/src/join/composer.jl @@ -118,7 +118,8 @@ _rename_cols(old_names::AbstractVector{Symbol}, for n in old_names] function _propagate_join_metadata!(joiner::DataFrameJoiner, dfr_noon::AbstractDataFrame, - res::DataFrame, kind::Symbol) + res::DataFrame, kind::Symbol; + dupcol::Symbol=:error, names=nothing) @assert kind == 
:left || kind == :right || kind == :outer || kind == :inner # The steps taken in this function are (all applies only to :note-style metadata): @@ -174,8 +175,17 @@ function _propagate_join_metadata!(joiner::DataFrameJoiner, dfr_noon::AbstractDa end end - for i in 1:ncol(dfr_noon) - _copy_col_note_metadata!(res, ncol(joiner.dfl) + i, dfr_noon, i) + if dupcol != :update + for i in 1:ncol(dfr_noon) + _copy_col_note_metadata!(res, ncol(joiner.dfl) + i, dfr_noon, i) + end + else + map = Index(names, dupcol=dupcol) + for i in 1:ncol(dfr_noon) + name = map.updates[ncol(joiner.dfl) + i] + dst = map.lookup[name] + _merge_col_note_metadata!(res, dst, dfr_noon, i) + end end if kind == :outer || kind == :inner @@ -235,7 +245,7 @@ function _count_sortperm!(input::Vector{Int}, count::Vector, end function compose_inner_table(joiner::DataFrameJoiner, - makeunique::Bool, + dupcol::Symbol, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, order::Symbol) @@ -278,9 +288,9 @@ function compose_inner_table(joiner::DataFrameJoiner, new_names = vcat(_rename_cols(_names(joiner.dfl), left_rename, joiner.left_on), _rename_cols(_names(dfr_noon), right_rename)) - res = DataFrame(cols, new_names, makeunique=makeunique, copycols=false) + res = DataFrame(cols, new_names, dupcol=dupcol, copycols=false) - _propagate_join_metadata!(joiner, dfr_noon, res, :inner) + _propagate_join_metadata!(joiner, dfr_noon, res, :inner, dupcol=dupcol, names=new_names) return res end @@ -292,7 +302,7 @@ function find_missing_idxs(present::Vector{Int}, target_len::Int) return _findall(not_seen) end -function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique::Bool, +function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, dupcol::Symbol, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, indicator::Union{Nothing, Symbol, AbstractString}, @@ -314,12 +324,12 @@ function 
compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique: else rightonly_ixs = 1:0 end - return _compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, + return _compose_joined_table(joiner, kind, dupcol, left_rename, right_rename, indicator, left_ixs, right_ixs, leftonly_ixs, rightonly_ixs, order) end -function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique::Bool, +function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, dupcol::Symbol, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, indicator::Union{Nothing, Symbol, AbstractString}, @@ -440,14 +450,14 @@ function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique new_names = vcat(_rename_cols(_names(joiner.dfl), left_rename, joiner.left_on), _rename_cols(_names(dfr_noon), right_rename)) - res = DataFrame(cols, new_names, makeunique=makeunique, copycols=false) + res = DataFrame(cols, new_names, dupcol=dupcol, copycols=false) if new_order !== nothing isnothing(src_indicator) || permute!(src_indicator, new_order) permute!(res, new_order) end - _propagate_join_metadata!(joiner, dfr_noon, res, kind) + _propagate_join_metadata!(joiner, dfr_noon, res, kind, dupcol=dupcol, names=new_names) return res, src_indicator end @@ -484,7 +494,7 @@ function _sort_compose_helper(fillval::Int, # value to use to fill unused indice end function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector}, kind::Symbol, makeunique::Bool, + on::Union{<:OnType, AbstractVector}, kind::Symbol, dupcol::Symbol, indicator::Union{Nothing, Symbol, AbstractString}, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}, left_rename::Union{Function, AbstractString, Symbol}, @@ -579,16 +589,16 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; src_indicator = nothing if kind == :inner - joined = compose_inner_table(joiner, makeunique, left_rename, 
right_rename, order) + joined = compose_inner_table(joiner, dupcol, left_rename, right_rename, order) elseif kind == :left joined, src_indicator = - compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, dupcol, left_rename, right_rename, indicator, order) elseif kind == :right joined, src_indicator = - compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, dupcol, left_rename, right_rename, indicator, order) elseif kind == :outer joined, src_indicator = - compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, dupcol, left_rename, right_rename, indicator, order) elseif kind == :semi joined = joiner.dfl[find_semi_rows(joiner), :] elseif kind == :anti @@ -606,7 +616,7 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; invpool, pool) unique_indicator = indicator - if makeunique + if dupcol == :makeunique try_idx = 0 while hasproperty(joined, unique_indicator) try_idx += 1 @@ -614,12 +624,16 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end end - if hasproperty(joined, unique_indicator) - throw(ArgumentError("joined data frame already has column " * - ":$unique_indicator. Pass makeunique=true to " * - "make it unique using a suffix automatically.")) + if unique_indicator == indicator && dupcol == :update + joined[!, indicator] = _update_missing.(joined[!, indicator], indicatorcol) + else + if hasproperty(joined, unique_indicator) + throw(ArgumentError("joined data frame already has column " * + ":$unique_indicator. 
Pass dupcol=:makeunique to " * + "make it unique using a suffix automatically.")) + end + joined[!, unique_indicator] = indicatorcol end - joined[!, unique_indicator] = indicatorcol else @assert isnothing(src_indicator) end @@ -628,10 +642,10 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end """ - innerjoin(df1, df2; on, makeunique=false, validate=(false, false), + innerjoin(df1, df2; on, makeunique=false, dupcol=:error, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) - innerjoin(df1, df2, dfs...; on, makeunique=false, + innerjoin(df1, df2, dfs...; on, makeunique=false, dupcol=:error, validate=(false, false), matchmissing=:error, order=:undefined) @@ -755,7 +769,7 @@ julia> innerjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = """ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, + makeunique::Bool=false, dupcol::Symbol=:error, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), renamecols::Pair=identity => identity, matchmissing::Symbol=:error, @@ -764,7 +778,7 @@ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; throw(ArgumentError("renamecols keyword argument must be a `Pair` " * "containing functions, strings, or `Symbol`s")) end - return _join(df1, df2, on=on, kind=:inner, makeunique=makeunique, + return _join(df1, df2, on=on, kind=:inner, dupcol=_dupcol(dupcol, makeunique), indicator=nothing, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) @@ -772,16 +786,17 @@ end function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, + makeunique::Bool=false, dupcol::Symbol=:error, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error, 
order::Symbol=:undefined) @assert !isempty(dfs) - res = innerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate, + dupcol = _dupcol(dupcol, makeunique) + res = innerjoin(df1, df2, on=on, dupcol=dupcol, validate=validate, matchmissing=matchmissing, order=order === :right ? :undefined : order) for (i, dfn) in enumerate(dfs) - res = innerjoin(res, dfn, on=on, makeunique=makeunique, validate=validate, + res = innerjoin(res, dfn, on=on, dupcol=dupcol, validate=validate, matchmissing=matchmissing, order= order === :right ? (i == length(dfs) ? :right : :undefined) : @@ -791,7 +806,7 @@ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::Abstract end """ - leftjoin(df1, df2; on, makeunique=false, source=nothing, validate=(false, false), + leftjoin(df1, df2; on, makeunique=false, dupcol=:error, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) Perform a left join of two data frame objects and return a `DataFrame` containing @@ -813,8 +828,13 @@ change in future releases. `isequal`. `on` is a required argument. - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found in columns not joined on; - if `true`, duplicate names will be suffixed with `_i` + if `true`, duplicate names will be suffixed with `_i` (deprecated) (`i` starting at 1 for the first duplicate). +- `dupcol` : if `dupcol=:error` (the default) then column names of passed objects must be unique. + If `dupcol=:makeunique` then duplicate column names will be suffixed + with `_i` (`i` starting at 1 for the first duplicate). + If `dupcol=:update` then duplicate column names will be combined, with the left-hand + column overwritten by non-missing values from the right-hand column(s). - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name, for whether a row appeared in only `df1` (`"left_only"`) or in both (`"both"`). 
If the name is already in use, @@ -915,12 +935,14 @@ julia> leftjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase => ``` """ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, dupcol::Symbol=:error, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), renamecols::Pair=identity => identity, matchmissing::Symbol=:error, order::Symbol=:undefined) + if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols) throw(ArgumentError("renamecols keyword argument must be a `Pair` " * "containing functions, strings, or `Symbol`s")) @@ -937,14 +959,14 @@ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:left, makeunique=makeunique, + return _join(df1, df2, on=on, kind=:left, dupcol=_dupcol(dupcol, makeunique), indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) end """ - rightjoin(df1, df2; on, makeunique=false, source=nothing, + rightjoin(df1, df2; on, makeunique=false, dupcol=:error, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) @@ -970,7 +992,12 @@ change in future releases. - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` - (`i` starting at 1 for the first duplicate). + (`i` starting at 1 for the first duplicate). 
(deprecated) +- `dupcol` : if `dupcol=:error` (the default) then column names of passed objects must be unique. + If `dupcol=:makeunique` then duplicate column names will be suffixed + with `_i` (`i` starting at 1 for the first duplicate). + If `dupcol=:update` then duplicate column names will be combined, with the left-hand + column overwritten by non-missing values from the right-hand column(s). - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name for whether a row appeared in only `df2` (`"right_only"`) or in both (`"both"`). If the name is already in use, @@ -1071,7 +1098,8 @@ julia> rightjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = ``` """ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, dupcol::Symbol=:error, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -1093,16 +1121,16 @@ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:right, makeunique=makeunique, + return _join(df1, df2, on=on, kind=:right, dupcol=_dupcol(dupcol, makeunique), indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) end """ - outerjoin(df1, df2; on, makeunique=false, source=nothing, validate=(false, false), + outerjoin(df1, df2; on, makeunique=false, dupcol=:error, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) - outerjoin(df1, df2, dfs...; on, makeunique = false, + outerjoin(df1, df2, dfs...; on, 
makeunique=false, dupcol=:error, validate = (false, false), matchmissing=:error, order=:undefined) Perform an outer join of two or more data frame objects and return a `DataFrame` @@ -1128,7 +1156,12 @@ This behavior may change in future releases. - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` - (`i` starting at 1 for the first duplicate). + (`i` starting at 1 for the first duplicate). (deprecated) +- `dupcol` : if `dupcol=:error` (the default) then column names of passed objects must be unique. + If `dupcol=:makeunique` then duplicate column names will be suffixed + with `_i` (`i` starting at 1 for the first duplicate). + If `dupcol=:update` then duplicate column names will be combined, with the left-hand + column overwritten by non-missing values from the right-hand column(s). - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name for whether a row appeared in only `df1` (`"left_only"`), only `df2` (`"right_only"`) or in both (`"both"`). 
If the name is already in use, @@ -1240,7 +1273,8 @@ julia> outerjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = ``` """ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, dupcol::Symbol=:error, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -1262,27 +1296,29 @@ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:outer, makeunique=makeunique, + return _join(df1, df2, on=on, kind=:outer, dupcol=_dupcol(dupcol, makeunique), indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) end function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, dupcol::Symbol=:error, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error, order::Symbol=:undefined) - res = outerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate, + dupcol = _dupcol(dupcol, makeunique) + res = outerjoin(df1, df2, on=on, dupcol=dupcol, validate=validate, matchmissing=matchmissing, order=order) for dfn in dfs - res = outerjoin(res, dfn, on=on, makeunique=makeunique, validate=validate, + res = outerjoin(res, dfn, on=on, dupcol=dupcol, validate=validate, matchmissing=matchmissing, order=order) end return res end """ - semijoin(df1, df2; on, makeunique=false, validate=(false, false), matchmissing=:error) + semijoin(df1, 
df2; on, makeunique=false, dupcol=:error, validate=(false, false), matchmissing=:error) Perform a semi join of two data frame objects and return a `DataFrame` containing the result. A semi join returns the subset of rows of `df1` that @@ -1384,16 +1420,16 @@ julia> semijoin(name, job2, on = [:ID => :identifier]) ``` """ semijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, dupcol::Symbol=:error, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error) = - _join(df1, df2, on=on, kind=:semi, makeunique=makeunique, + _join(df1, df2, on=on, kind=:semi, dupcol=_dupcol(dupcol, makeunique), indicator=nothing, validate=validate, left_rename=identity, right_rename=identity, matchmissing=matchmissing, order=:left) """ - antijoin(df1, df2; on, makeunique=false, validate=(false, false), matchmissing=:error) + antijoin(df1, df2; on, makeunique=false, dupcol=:error, validate=(false, false), matchmissing=:error) Perform an anti join of two data frame objects and return a `DataFrame` containing the result. 
An anti join returns the subset of rows of `df1` that do @@ -1488,10 +1524,10 @@ julia> antijoin(name, job2, on = [:ID => :identifier]) ``` """ antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, dupcol::Symbol=:error, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error) = - _join(df1, df2, on=on, kind=:anti, makeunique=makeunique, + _join(df1, df2, on=on, kind=:anti, dupcol=_dupcol(dupcol, makeunique), indicator=nothing, validate=validate, left_rename=identity, right_rename=identity, matchmissing=matchmissing, @@ -1499,7 +1535,7 @@ antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; """ crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, renamecols=identity => identity) + makeunique::Bool=false, dupcol::Symbol=:error, renamecols=identity => identity) crossjoin(df1, df2, dfs...; makeunique = false) Perform a cross join of two or more data frame objects and return a `DataFrame` @@ -1565,22 +1601,25 @@ julia> crossjoin(df1, df2) ``` """ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, renamecols::Pair=identity => identity) + makeunique::Bool=false, dupcol::Symbol=:error, renamecols::Pair=identity => identity) _check_consistency(df1) _check_consistency(df2) + dupcol = _dupcol(dupcol, makeunique) r1, r2 = size(df1, 1), size(df2, 1) new_names = vcat(_rename_cols(_names(df1), first(renamecols)), _rename_cols(_names(df2), last(renamecols))) cols = Any[[repeat(c, inner=r2) for c in eachcol(df1)]; [repeat(c, outer=r1) for c in eachcol(df2)]] - res = DataFrame(cols, new_names, copycols=false, makeunique=makeunique) + res = DataFrame(cols, new_names, copycols=false, dupcol=dupcol) for i in 1:ncol(df1) _copy_col_note_metadata!(res, i, df1, i) end - for i in 1:ncol(df2) - _copy_col_note_metadata!(res, 
ncol(df1) + i, df2, i) + if dupcol != :update + for i in 1:ncol(df2) + _copy_col_note_metadata!(res, ncol(df1) + i, df2, i) + end end _merge_matching_table_note_metadata!(res, (df1, df2)) @@ -1589,8 +1628,8 @@ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; end crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; - makeunique::Bool=false) = - crossjoin(crossjoin(df1, df2, makeunique=makeunique), dfs..., makeunique=makeunique) + makeunique::Bool=false, dupcol::Symbol=:error) = + crossjoin(crossjoin(df1, df2, dupcol=_dupcol(dupcol, makeunique)), dfs..., dupcol=_dupcol(dupcol, makeunique)) # an explicit error is thrown as join was supported in the past Base.join(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; diff --git a/src/join/inplace.jl b/src/join/inplace.jl index 9f1a9e0c6c..d7cda1112b 100644 --- a/src/join/inplace.jl +++ b/src/join/inplace.jl @@ -1,5 +1,5 @@ """ - leftjoin!(df1, df2; on, makeunique=false, source=nothing, + leftjoin!(df1, df2; on, makeunique=false, dupcol=:error, source=nothing, matchmissing=:error) @@ -25,10 +25,15 @@ added to `df1`. if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +- `dupcol` : one of :error (the default), :makeunique or :update. If :error, + an error will be raised if duplicate names are found in columns not joined on; + if :makeunique, duplicate names will be suffixed with `_i` + (`i` starting at 1 for the first duplicate); if :update, left-hand side columns + will be overwritten by non-missing values in the right-hand side column(s). - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name, for whether a row appeared in only `df1` (`"left_only"`) or in both (`"both"`). If the name is already in use, - the column name will be modified if `makeunique=true`. 
+ the column name will be modified if `makeunique=true` or `dupcol=:makeunique`. - `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns. @@ -95,12 +100,14 @@ julia> leftjoin!(name, job2, on = :ID => :identifier, makeunique=true, source=:s ``` """ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector}=Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector}=Symbol[], + makeunique::Bool=false, dupcol::Symbol=:error, source::Union{Nothing, Symbol, AbstractString}=nothing, matchmissing::Symbol=:error) _check_consistency(df1) _check_consistency(df2) + dupcol = _dupcol(dupcol, makeunique) if !is_column_insertion_allowed(df1) throw(ArgumentError("leftjoin! is only supported if `df1` is a `DataFrame`, " * @@ -114,11 +121,11 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; joiner = DataFrameJoiner(df1, df2, on, matchmissing, :left) right_noon_names = names(joiner.dfr, Not(joiner.right_on)) - if !(makeunique || isempty(intersect(right_noon_names, names(df1)))) + if dupcol == :error && !isempty(intersect(right_noon_names, names(df1))) throw(ArgumentError("the following columns are present in both " * "left and right data frames but not listed in `on`: " * join(intersect(right_noon_names, names(df1)), ", ") * - ". Pass makeunique=true to add a suffix automatically to " * + ". 
Pass makeunique=true or dupcol=:makeunique to add a suffix automatically to " * "columns names from the right data frame.")) end @@ -134,7 +141,7 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; rcol_joined = compose_joined_rcol!(rcol, similar_missing(rcol, nrow(df1)), right_ixs) # if df1 isa SubDataFrame we must copy columns - insertcols!(df1, colname => rcol_joined, makeunique=makeunique, + insertcols!(df1, colname => rcol_joined, dupcol=dupcol, copycols=!(df1 isa DataFrame)) # need to call parent as df1 can be a SubDataFrame _copy_col_note_metadata!(parent(df1), ncol(df1), joiner.dfr, colname) @@ -149,7 +156,7 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; invpool, pool) unique_indicator = source - if makeunique + if dupcol == :makeunique try_idx = 0 while hasproperty(df1, unique_indicator) try_idx += 1 @@ -158,11 +165,18 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; end if hasproperty(df1, unique_indicator) - throw(ArgumentError("joined data frame already has column " * - ":$unique_indicator. Pass makeunique=true to " * - "make it unique using a suffix automatically.")) + if dupcol == :update + df1[!, unique_indicator] = _update_missing.(df1[!, unique_indicator], indicatorcol) + else + throw(ArgumentError("joined data frame already has column " * + ":$unique_indicator. 
Pass dupcol=:makeunique to " * + "make it unique using a suffix automatically or dupcol=:update " * + "to update left-hand column from right-hand.")) + end + else + df1[!, unique_indicator] = indicatorcol end - df1[!, unique_indicator] = indicatorcol + end return df1 @@ -192,3 +206,19 @@ function compose_joined_rcol!(rcol::AbstractVector, end return rcol_joined end + +function outerjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; + on::Union{<:OnType, AbstractVector}=Symbol[], makeunique::Bool=false, + dupcol::Symbol=:error, + source::Union{Nothing, Symbol, AbstractString}=nothing, + matchmissing::Symbol=:error) + + dupcol = (makeunique ? :makeunique : dupcol) + + leftjoin!(df1, df2, on=on, dupcol=dupcol, source=source, matchmissing=matchmissing) + + aj = antijoin(df2, df1, on=on, dupcol=dupcol, matchmissing=matchmissing) + append!(df1, aj) + + return df1 +end diff --git a/src/other/index.jl b/src/other/index.jl index 51aa3a31cc..2246a1f9ff 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -21,18 +21,28 @@ const MULTICOLUMNINDEX_STR = "`:`, `Cols`, `All`, `Between`, `Not`, a regular ex struct Index <: AbstractIndex # an OrderedDict would be nice here... 
lookup::Dict{Symbol, Int} # name => names array position names::Vector{Symbol} + updates::Vector{Symbol} end -function Index(names::AbstractVector{Symbol}; makeunique::Bool=false) - u = make_unique(names, makeunique=makeunique) - lookup = Dict{Symbol, Int}(zip(u, 1:length(u))) - return Index(lookup, u) +Index(l,u) = Index(l,u,[]) + +function Index(names::AbstractVector{Symbol}; dupcol::Symbol=:error) + if dupcol == :update + lookup = Dict{Symbol, Int}(zip(reverse(names), length(names):-1:1)) + return Index(lookup, unique(names), names) + else + u = make_unique(names, dupcol=dupcol) + lookup = Dict{Symbol, Int}(zip(u, 1:length(u))) + return Index(lookup, u) + end end Index() = Index(Dict{Symbol, Int}(), Symbol[]) Base.length(x::Index) = length(x.names) Base.names(x::Index) = string.(x.names) +column_length(x::Index) = isempty(x.updates) ? length(x.names) : length(x.updates) + # _names returns Vector{Symbol} _names(x::Index) = x.names @@ -41,12 +51,13 @@ Base.isequal(x::AbstractIndex, y::AbstractIndex) = _names(x) == _names(y) # it i Base.:(==)(x::AbstractIndex, y::AbstractIndex) = isequal(x, y) -function rename!(x::Index, nms::AbstractVector{Symbol}; makeunique::Bool=false) - if !makeunique +function rename!(x::Index, nms::AbstractVector{Symbol}; makeunique::Bool=false, dupcol::Symbol=:error) + dupcol = _dupcol(dupcol, makeunique) + if dupcol == :error if length(unique(nms)) != length(nms) dup = unique(nms[nonunique(DataFrame(nms=nms))]) dupstr = join(string.(':', dup), ", ", " and ") - msg = "Duplicate variable names: $dupstr. Pass makeunique=true " * + msg = "Duplicate variable names: $dupstr. Pass dupcol=:makeunique " * "to make them unique using a suffix automatically." 
throw(ArgumentError(msg)) end @@ -54,7 +65,7 @@ function rename!(x::Index, nms::AbstractVector{Symbol}; makeunique::Bool=false) if length(nms) != length(x) throw(DimensionMismatch("Length of nms doesn't match length of x.")) end - make_unique!(x.names, nms, makeunique=makeunique) + make_unique!(x.names, nms, dupcol=dupcol) empty!(x.lookup) for (i, n) in enumerate(x.names) x.lookup[n] = i @@ -128,8 +139,8 @@ function Base.push!(x::Index, nm::Symbol) return x end -function Base.merge!(x::Index, y::AbstractIndex; makeunique::Bool=false) - adds = add_names(x, y, makeunique=makeunique) +function Base.merge!(x::Index, y::AbstractIndex; makeunique::Bool=false, dupcol::Symbol=:error) + adds = add_names(x, y, dupcol=_dupcol(dupcol, makeunique)) i = length(x) for add in adds i += 1 @@ -139,8 +150,8 @@ function Base.merge!(x::Index, y::AbstractIndex; makeunique::Bool=false) return x end -Base.merge(x::AbstractIndex, y::AbstractIndex; makeunique::Bool=false) = - merge!(copy(x), y, makeunique=makeunique) +Base.merge(x::AbstractIndex, y::AbstractIndex; makeunique::Bool=false, dupcol::Symbol=:error) = + merge!(copy(x), y, makeunique=makeunique, dupcol=dupcol) function Base.delete!(x::Index, idx::Integer) # reset the lookup's beyond the deleted item @@ -431,9 +442,10 @@ end # Helpers # return Vector{Symbol} of names from add_ind that do not clash with `ind`. -# if `makeunique=false` error on collision -# if `makeunique=false` generate new names that are deduplicated -function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) +# if `dupcol=:error` error on collision +# if `dupcol=:makeunique` generate new names that are deduplicated +# if `dupcol=:update` just return the names including duplicates +function add_names(ind::Index, add_ind::AbstractIndex; dupcol::Symbol=:error) u = copy(_names(add_ind)) seen = Set(_names(ind)) @@ -444,13 +456,18 @@ function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) in(name, seen) ? 
push!(dups, i) : push!(seen, name) end if length(dups) > 0 - if !makeunique + if dupcol == :error dupstr = join(string.(':', unique(u[dups])), ", ", " and ") msg = "Duplicate variable names: $dupstr. Pass makeunique=true " * "to make them unique using a suffix automatically." throw(ArgumentError(msg)) end end + + if dupcol == :update + return u + end + for i in dups nm = u[i] k = 1 @@ -578,7 +595,7 @@ function Base.getindex(x::SubIndex, idx::Union{AbstractVector{Symbol}, return [x[i] for i in idx] end -rename!(x::SubIndex, nms::AbstractVector{Symbol}; makeunique::Bool=false) = +rename!(x::SubIndex, nms::AbstractVector{Symbol}; makeunique::Bool=false, dupcol::Symbol=:error) = throw(ArgumentError("rename! is not supported for views other than created " * "with Colon as a column selector")) diff --git a/src/other/metadata.jl b/src/other/metadata.jl index 60a283d5a0..38ac848769 100644 --- a/src/other/metadata.jl +++ b/src/other/metadata.jl @@ -705,6 +705,24 @@ function _copy_col_note_metadata!(dst::DataFrame, dst_col, src, src_col) return nothing end +# merge column-level :note-style metadata from Tables.jl table src into dst +# from column src_col to dst_col +# keeping previous metadata contents of dst (keys already present in dst are not overwritten) +function _merge_col_note_metadata!(dst::DataFrame, dst_col, src, src_col) + #emptycolmetadata!(dst, dst_col) + metadata = colmetadata(dst, dst_col) + if DataAPI.colmetadatasupport(typeof(src)).read + for key in colmetadatakeys(src, src_col) + val, style = colmetadata(src, src_col, key, style=true) + # TODO write only if does not overwrite + if style === :note && !haskey(metadata, key) + colmetadata!(dst, dst_col, key, val, style=:note) + end + end + end + return nothing +end + # this is a function used to copy table-level and column-level :note-style metadata # from Tables.jl table src to dst, discarding previous metadata contents of dst function _copy_all_note_metadata!(dst::DataFrame, src) diff --git a/src/other/tables.jl b/src/other/tables.jl index 
4213c98881..070e53b438 100644 --- a/src/other/tables.jl +++ b/src/other/tables.jl @@ -64,11 +64,11 @@ end # the logic here relies on the fact that Tables.CopiedColumns # is the only exception for default copycols value -DataFrame(x, cnames::AbstractVector; makeunique::Bool=false, +DataFrame(x, cnames::AbstractVector; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Union{Nothing, Bool}=nothing) = rename!(DataFrame(x, copycols=something(copycols, !(x isa Tables.CopiedColumns))), _name2symbol(cnames), - makeunique=makeunique) + makeunique=makeunique, dupcol=dupcol) function Base.append!(df::DataFrame, table; cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset])) diff --git a/src/other/utils.jl b/src/other/utils.jl index 455c406f46..0422a18071 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -72,10 +72,26 @@ struct AsTable end end +_update_missing(v1, v2) = ismissing(v2) ? v1 : v2 + +function _dupcol(dupcol::Symbol, makeunique=false) + if dupcol ∉ [:error, :makeunique, :update] + throw(ArgumentError("dupcol must be one of :error, :makeunique, or :update")) + end + if makeunique && dupcol == :update + throw(ArgumentError("makeunique=true and dupcol==:update are incompatible")) + end + if makeunique + Base.depwarn("makeunique=true will be replaced by dupcol=:makeunique", :unstack) + end + makeunique ? :makeunique : dupcol +end + Base.broadcastable(x::AsTable) = Ref(x) function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; - makeunique::Bool=false) + makeunique::Bool=false, dupcol::Symbol=:error) + dupcol = _dupcol(dupcol, makeunique) if length(names) != length(src) throw(DimensionMismatch("Length of src doesn't match length of names.")) end @@ -92,9 +108,9 @@ function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; end if length(dups) > 0 - if !makeunique + if dupcol == :error dupstr = join(string.(':', unique(src[dups])), ", ", " and ") - msg = "Duplicate variable names: $dupstr. 
Pass makeunique=true " * + msg = "Duplicate variable names: $dupstr. Pass dupcol=:makeunique " * "to make them unique using a suffix automatically." throw(ArgumentError(msg)) end @@ -102,23 +118,27 @@ function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; for i in dups nm = src[i] - k = 1 - while true - newnm = Symbol("$(nm)_$k") - if !in(newnm, seen) - names[i] = newnm - push!(seen, newnm) - break + if dupcol == :makeunique + k = 1 + while true + newnm = Symbol("$(nm)_$k") + if !in(newnm, seen) + names[i] = newnm + push!(seen, newnm) + break + end + k += 1 end - k += 1 + else + names[i] = nm end end return names end -function make_unique(names::AbstractVector{Symbol}; makeunique::Bool=false) - make_unique!(similar(names), names, makeunique=makeunique) +function make_unique(names::AbstractVector{Symbol}; makeunique::Bool=false, dupcol::Symbol=:error) + make_unique!(similar(names), names, dupcol=_dupcol(dupcol, makeunique)) end """ diff --git a/test/cat.jl b/test/cat.jl index b5aa1cfd9b..4bf4e3cd55 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -13,29 +13,47 @@ const ≅ = isequal df5 = DataFrame([Union{Int, Missing}[1, 2, 3, 4], nvstr], :auto) ref_df = copy(df3) - dfh = hcat(df3, df4, makeunique=true) + dfh = hcat(df3, df4, dupcol=:makeunique) @test ref_df ≅ df3 # make sure that df3 is not mutated by hcat @test size(dfh, 2) == 3 @test names(dfh) ≅ ["x1", "x1_1", "x2"] @test dfh[!, :x1] ≅ df3[!, :x1] - @test dfh ≅ DataFrames.hcat!(DataFrame(), df3, df4, makeunique=true) + @test dfh ≅ DataFrames.hcat!(DataFrame(), df3, df4, dupcol=:makeunique) + + dfhu = hcat(df3, df4, dupcol=:update) + @test ref_df ≅ df3 # make sure that df3 is not mutated by hcat + @test size(dfhu, 2) == 2 + @test names(dfhu) ≅ ["x1", "x2"] + @test ! 
(dfhu[!, :x1] ≅ df3[!, :x1]) dfa = DataFrame(a=[1, 2]) dfb = DataFrame(b=[3, missing]) @test hcat(dfa, dfb) ≅ [dfa dfb] - dfh3 = hcat(df3, df4, df5, makeunique=true) + dfh3 = hcat(df3, df4, df5, dupcol=:makeunique) @test names(dfh3) == ["x1", "x1_1", "x2", "x1_2", "x2_1"] - @test dfh3 ≅ hcat(dfh, df5, makeunique=true) - @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, makeunique=true) + @test dfh3 ≅ hcat(dfh, df5, dupcol=:makeunique) + @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, dupcol=:makeunique) + + @test df2 ≅ DataFrames.hcat!(df2, dupcol=:makeunique) - @test df2 ≅ DataFrames.hcat!(df2, makeunique=true) + dfh3 = hcat(df3, df4, df5, dupcol=:update) + @test names(dfh3) == ["x1", "x2"] + @test dfh3 ≅ hcat(dfhu, df5, dupcol=:update) + @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, dupcol=:update) end @testset "hcat: copying" begin df = DataFrame(x=1:3) @test hcat(df)[!, 1] == df[!, 1] @test hcat(df)[!, 1] !== df[!, 1] + hdf = hcat(df, df, dupcol=:makeunique) + @test hdf[!, 1] == df[!, 1] + @test hdf[!, 1] !== df[!, 1] + @test hdf[!, 2] == df[!, 1] + @test hdf[!, 2] !== df[!, 1] + @test hdf[!, 1] == hdf[!, 2] + @test hdf[!, 1] !== hdf[!, 2] hdf = hcat(df, df, makeunique=true) @test hdf[!, 1] == df[!, 1] @test hdf[!, 1] !== df[!, 1] @@ -43,7 +61,7 @@ end @test hdf[!, 2] !== df[!, 1] @test hdf[!, 1] == hdf[!, 2] @test hdf[!, 1] !== hdf[!, 2] - hdf = hcat(df, df, df, makeunique=true) + hdf = hcat(df, df, df, dupcol=:makeunique) @test hdf[!, 1] == df[!, 1] @test hdf[!, 1] !== df[!, 1] @test hdf[!, 2] == df[!, 1] @@ -56,18 +74,22 @@ end @test hdf[!, 1] !== hdf[!, 3] @test hdf[!, 2] == hdf[!, 3] @test hdf[!, 2] !== hdf[!, 3] + hdf = hcat(df, df, dupcol=:update) + @test hdf ≅ df end @testset "hcat ::AbstractDataFrame" begin df = DataFrame(A=repeat('A':'C', inner=4), B=1:12) gd = groupby(df, :A) answer = DataFrame(A=fill('A', 4), B=1:4, A_1='B', B_1=5:8, A_2='C', B_2=9:12) - @test hcat(gd..., makeunique=true) == answer + @test hcat(gd..., 
dupcol=:makeunique) == answer answer = answer[:, 1:4] - @test hcat(gd[1], gd[2], makeunique=true) == answer + @test hcat(gd[1], gd[2], dupcol=:makeunique) == answer @test_throws MethodError hcat("a", df, makeunique=true) @test_throws MethodError hcat(df, "a", makeunique=true) + @test_throws MethodError hcat("a", df, dupcol=:makeunique) + @test_throws MethodError hcat(df, "a", dupcol=:makeunique) end @testset "hcat: copycols" begin diff --git a/test/dataframe.jl b/test/dataframe.jl index 971d7626dd..c36388ab5a 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -152,6 +152,8 @@ end @test names(rename(df, [:f, :g])) == ["f", "g"] @test names(rename(df, [:f, :f], makeunique=true)) == ["f", "f_1"] + @test names(rename(df, [:f, :f], dupcol=:makeunique)) == ["f", "f_1"] + @test names(rename(df, [:f, :f], dupcol=:update)) == ["f", "f"] @test names(df) == ["a", "b"] rename!(df, [:f, :g]) @@ -208,7 +210,7 @@ end @test df.newcol == ["a", "b"] @test_throws ArgumentError insertcols!(df, 1, :newcol => ["a1", "b1"]) - @test insertcols!(df, 1, :newcol => ["a1", "b1"], makeunique=true) == df + @test insertcols!(df, 1, :newcol => ["a1", "b1"], dupcol=:makeunique) == df @test propertynames(df) == [:newcol_1, :newcol, :a, :b] @test df.a == [1, 2] @test df.b == [3.0, 4.0] @@ -235,7 +237,7 @@ end @test df.newcol == ["a", "b"] @test_throws ArgumentError insertcols!(df, 1, "newcol" => ["a1", "b1"]) - @test insertcols!(df, 1, "newcol" => ["a1", "b1"], makeunique=true) == df + @test insertcols!(df, 1, "newcol" => ["a1", "b1"], dupcol=:makeunique) == df @test propertynames(df) == [:newcol_1, :newcol, :a, :b] @test df.a == [1, 2] @test df.b == [3.0, 4.0] @@ -254,10 +256,23 @@ end df = DataFrame(a=[1, 2], a_1=[3, 4]) @test_throws ArgumentError insertcols!(df, 1, :a => [11, 12]) @test df == DataFrame(a=[1, 2], a_1=[3, 4]) + insertcols!(df, 1, :a => [11, 12], dupcol=:makeunique) + @test propertynames(df) == [:a_2, :a, :a_1] + insertcols!(df, 4, :a => [11, 12], dupcol=:makeunique) + @test 
propertynames(df) == [:a_2, :a, :a_1, :a_3] + + df = DataFrame(a=[1, 2], a_1=[3, 4]) insertcols!(df, 1, :a => [11, 12], makeunique=true) @test propertynames(df) == [:a_2, :a, :a_1] insertcols!(df, 4, :a => [11, 12], makeunique=true) @test propertynames(df) == [:a_2, :a, :a_1, :a_3] + + df = DataFrame(a=[1, 2], a_1=[3, 4]) + insertcols!(df, 1, :a => [11, 12], dupcol=:update) + @test propertynames(df) == [:a, :a_1] + @test df == DataFrame(a=[11, 12], a_1=[3, 4]) + + @test_throws ArgumentError insertcols!(df, 10, :a => [11, 12], dupcol=:makeunique) @test_throws ArgumentError insertcols!(df, 10, :a => [11, 12], makeunique=true) dfc = copy(df) @@ -303,6 +318,11 @@ end @test df.a_1 === v2 @test df.a_2 === v3 + df = DataFrame() + @test insertcols!(df, 1, :a=>v1, :a=>v2, :a=>v3, dupcol=:update, copycols=false) == + DataFrame(a=v3) + @test df.a isa Vector{Int} + df = DataFrame(p='a':'b', q='r':'s') @test insertcols!(df, 2, :a=>v1, :b=>v2, :c=>v3) == DataFrame(p='a':'b', a=v1, b=v2, c=v3, q='r':'s') @@ -313,12 +333,21 @@ end df = DataFrame(p='a':'b', q='r':'s') @test_throws ArgumentError insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3) - @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, makeunique=true, copycols=true) == + @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, dupcol=:makeunique, copycols=true) == DataFrame(p='a':'b', p_1=v1, q_1=v2, p_2=v3, q='r':'s') @test df.p_1 isa Vector{Int} @test df.q_1 !== v2 @test df.p_2 !== v3 + df = DataFrame(p='a':'b', q='r':'s') + @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, makeunique=true, copycols=true) == + DataFrame(p='a':'b', p_1=v1, q_1=v2, p_2=v3, q='r':'s') + + df = DataFrame(p='a':'b', q='r':'s') + @test_throws ArgumentError insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3) + @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, dupcol=:update, copycols=true) == + DataFrame(p=v3, q=v2) + df = DataFrame(a=1:3, b=4:6) @test insertcols!(copy(df), :c=>7:9) == insertcols!(copy(df), 3, :c=>7:9) df = DataFrame() diff --git a/test/join.jl 
b/test/join.jl index 478cca98d3..3d19b2b115 100644 --- a/test/join.jl +++ b/test/join.jl @@ -182,12 +182,12 @@ end @test typeof.(eachcol(crossjoin(df1, df2, makeunique=true))) == [Vector{Int}, Vector{Float64}, Vector{Int}, Vector{Float64}] - i(on) = innerjoin(df1, df2, on=on, makeunique=true) - l(on) = leftjoin(df1, df2, on=on, makeunique=true) - r(on) = rightjoin(df1, df2, on=on, makeunique=true) - o(on) = outerjoin(df1, df2, on=on, makeunique=true) - s(on) = semijoin(df1, df2, on=on, makeunique=true) - a(on) = antijoin(df1, df2, on=on, makeunique=true) + i(on,makeunique=true) = innerjoin(df1, df2, on=on, makeunique=makeunique) + l(on,makeunique=true) = leftjoin(df1, df2, on=on, makeunique=makeunique) + r(on,makeunique=true) = rightjoin(df1, df2, on=on, makeunique=makeunique) + o(on,makeunique=true) = outerjoin(df1, df2, on=on, makeunique=makeunique) + s(on,makeunique=true) = semijoin(df1, df2, on=on, makeunique=makeunique) + a(on,makeunique=true) = antijoin(df1, df2, on=on, makeunique=makeunique) @test s(:id) == s(:fid) == @@ -251,6 +251,71 @@ end @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] end +@testset "update joins" begin + df1 = DataFrame(Any[[1, 3, 5], [1.0, 3.0, 5.0]], [:id, :fid]) + df2 = DataFrame(Any[[0, 1, 2, 3, 4], [0.0, 1.0, 2.0, 3.0, 4.0]], [:id, :fid]) + + @test crossjoin(df1, df2, dupcol=:update) == + DataFrame(Any[repeat([0, 1, 2, 3, 4], outer=3), + repeat([0.0, 1.0, 2.0, 3.0, 4.0], outer=3)], + [:id, :fid]) + + i(on,dupcol=:update) = innerjoin(df1, df2, on=on, dupcol=dupcol) + l(on,dupcol=:update) = leftjoin(df1, df2, on=on, dupcol=dupcol) + r(on,dupcol=:update) = rightjoin(df1, df2, on=on, dupcol=dupcol) + o(on,dupcol=:update) = outerjoin(df1, df2, on=on, dupcol=dupcol) + s(on,dupcol=:update) = semijoin(df1, df2, on=on, dupcol=dupcol) + a(on,dupcol=:update) = antijoin(df1, df2, on=on, dupcol=dupcol) + + @test s(:id) == + s(:fid) == + s([:id, :fid]) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(s(:id))) 
== + typeof.(eachcol(s(:fid))) == + typeof.(eachcol(s([:id, :fid]))) == [Vector{Int}, Vector{Float64}] + @test a(:id) == + a(:fid) == + a([:id, :fid]) == DataFrame([[5], [5]], [:id, :fid]) + @test typeof.(eachcol(a(:id))) == + typeof.(eachcol(a(:fid))) == + typeof.(eachcol(a([:id, :fid]))) == [Vector{Int}, Vector{Float64}] + + on = :id + @test i(on) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) ≅ DataFrame(id=[1, 3, 5], fid=[1, 3, 5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test r(on) ≅ DataFrame(id=[1, 3, 0, 2, 4], fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) ≅ DataFrame(id=[1, 3, 5, 0, 2, 4], + fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] + + on = :fid + df1.id = [1, missing, 5] + @test i(on) == DataFrame([[1, 3], [1.0, 3.0]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) ≅ DataFrame(id=[1, 3, 5], fid=[1, 3, 5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test r(on) ≅ DataFrame(id=[1, 3, 0, 2, 4], + fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) ≅ DataFrame(id=[1, 3, 5, 0, 2, 4], + fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] + + on = [:id, :fid] + df1.id = [1, 3, 5] + @test i(on) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) == DataFrame(id=[1, 3, 5], fid=[1, 3, 5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test r(on) == DataFrame(id=[1, 3, 0, 2, 4], fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) == DataFrame(id=[1, 3, 5, 0, 2, 4], fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] +end + 
@testset "all joins with CategoricalArrays" begin df1 = DataFrame(Any[CategoricalArray([1, 3, 5]), CategoricalArray([1.0, 3.0, 5.0])], [:id, :fid]) From cca04bb9f2b8f37e8731472ad12a3833ab241ebd Mon Sep 17 00:00:00 2001 From: Lee Iverson Date: Tue, 19 Sep 2023 10:52:56 -0700 Subject: [PATCH 2/2] Change dupcol keyword to mergeduplicates and adapt behaviour to match bkamins comment in https://github.com/JuliaData/DataFrames.jl/pull/3366 Is now used to pass a Function to handle cases where makeunique=false by combining those values (passed as parameters) into a returned result. --- src/abstractdataframe/abstractdataframe.jl | 117 +++++++++--------- src/abstractdataframe/reshape.jl | 16 +-- src/dataframe/dataframe.jl | 110 +++++++++-------- src/join/composer.jl | 135 ++++++++++----------- src/join/inplace.jl | 42 +++---- src/other/index.jl | 37 +++--- src/other/tables.jl | 4 +- src/other/utils.jl | 29 ++--- test/cat.jl | 33 +++-- test/dataframe.jl | 27 +++-- test/join.jl | 16 +-- 11 files changed, 284 insertions(+), 282 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index b6a6d9e6e5..5b1f0abf00 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -117,9 +117,9 @@ Compat.hasproperty(df::AbstractDataFrame, s::AbstractString) = haskey(index(df), """ rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false, dupcol::Symbol=:error) + makeunique::Bool=false) rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false, dupcol::Symbol=:error) + makeunique::Bool=false) rename!(df::AbstractDataFrame, (from => to)::Pair...) rename!(df::AbstractDataFrame, d::AbstractDict) rename!(df::AbstractDataFrame, d::AbstractVector{<:Pair}) @@ -179,9 +179,9 @@ julia> rename!(df, [:a, :b, :c]) 1 │ 1 2 3 julia> rename!(df, [:a, :b, :a]) -ERROR: ArgumentError: Duplicate variable names: :a. 
Pass dupcol=:makeunique to make them unique using a suffix automatically. +ERROR: ArgumentError: Duplicate variable names: :a. Pass makeunique=true to make them unique using a suffix automatically. -julia> rename!(df, [:a, :b, :a], dupcol=:makeunique) +julia> rename!(df, [:a, :b, :a], makeunique=true) 1×3 DataFrame Row │ a b a_1 │ Int64 Int64 Int64 @@ -197,16 +197,16 @@ julia> rename!(uppercase, df) ``` """ function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false, dupcol::Symbol=:error) - rename!(index(df), vals, makeunique=makeunique, dupcol=dupcol) + makeunique::Bool=false) + rename!(index(df), vals, makeunique=makeunique) # renaming columns of SubDataFrame has to clean non-note metadata in its parent _drop_all_nonnote_metadata!(parent(df)) return df end function rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false, dupcol::Symbol=:error) - rename!(index(df), Symbol.(vals), makeunique=makeunique, dupcol=dupcol) + makeunique::Bool=false) + rename!(index(df), Symbol.(vals), makeunique=makeunique) # renaming columns of SubDataFrame has to clean non-note metadata in its parent _drop_all_nonnote_metadata!(parent(df)) return df @@ -353,9 +353,9 @@ julia> rename(uppercase, df) ``` """ rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false, dupcol::Symbol=:error) = rename!(copy(df), vals, makeunique=makeunique, dupcol=dupcol) + makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false, dupcol::Symbol=:error) = rename!(copy(df), vals, makeunique=makeunique, dupcol=dupcol) + makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) rename(df::AbstractDataFrame, args...) = rename!(copy(df), args...) 
rename(f::Function, df::AbstractDataFrame) = rename!(f, copy(df)) @@ -1536,20 +1536,17 @@ end """ hcat(df::AbstractDataFrame...; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) Horizontally concatenate data frames. If `makeunique=false` (the default) column names of passed objects must be unique. If `makeunique=true` then duplicate column names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). -Deprecated in favor of `dupcol` -If `dupcol=:error` (the default) then columns names of passed objects must be unique. -If `dupcol=:makeunique` then duplicate column names will be suffixed -with `_i` (`i` starting at 1 for the first duplicate). -If `dupcol=:update` then duplicate columns names will be combined with the left-hand -column overwritten by non-missing values from the right hand column(s) +If `makeunique=false` and `mergeduplicates` is a Function then duplicate column names +will be combined by this function with the column named overwritten by the results of +the function on all values from the duplicated column(s). If `copycols=true` (the default) then the `DataFrame` returned by `hcat` will contain copied columns from the source data frames. 
@@ -1582,7 +1579,7 @@ julia> df2 = DataFrame(A=4:6, B=4:6) 2 │ 5 5 3 │ 6 6 -julia> df3 = hcat(df1, df2, dupcol=:makeunique) +julia> df3 = hcat(df1, df2, makeunique=true) 3×4 DataFrame Row │ A B A_1 B_1 │ Int64 Int64 Int64 Int64 @@ -1594,32 +1591,32 @@ julia> df3 = hcat(df1, df2, dupcol=:makeunique) julia> df3.A === df1.A false -julia> df3 = hcat(df1, df2, dupcol=:makeunique, copycols=false); +julia> df3 = hcat(df1, df2, makeunique=true, copycols=false); julia> df3.A === df1.A true ``` """ -function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) +function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) df = DataFrame(df, copycols=copycols) _drop_all_nonnote_metadata!(df) return df end # TODO: after deprecation remove AbstractVector methods -Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = - hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, dupcol=dupcol, copycols=copycols) -Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = - hcat!(x, df, makeunique=makeunique, dupcol=dupcol, copycols=copycols) +Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = + hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) +Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = + hcat!(x, df, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = hcat!(DataFrame(df1, copycols=copycols), df2, 
- makeunique=makeunique, dupcol=dupcol, copycols=copycols) + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) Base.hcat(df::AbstractDataFrame, x::Union{AbstractVector, AbstractDataFrame}, y::Union{AbstractVector, AbstractDataFrame}...; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = - hcat!(hcat(df, x, makeunique=makeunique, dupcol=dupcol, copycols=copycols), y..., - makeunique=makeunique, dupcol=dupcol, copycols=copycols) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = + hcat!(hcat(df, x, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols), y..., + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) """ vcat(dfs::AbstractDataFrame...; @@ -2875,12 +2872,11 @@ const INSERTCOLS_ARGUMENTS = are unwrapped and treated in the same way - `after` : if `true` columns are inserted after `col` - `makeunique` : defines what to do if `name` already exists in `df`; - if it is `false` an error will be thrown; if it is `true` a new unique name will - be generated by adding a suffix - - `dupcol` : defines what to do if `name` already exists in `df`; - if it is :error an error will be thrown; if is :makeunique a new unique name will - be generated by adding a suffix; if it is :update then the existing column will be - updated with the non-missing values + if it is `true` a new unique name will be generated by adding a suffix, + if it is `false` an error will be thrown unless a `mergeduplicates` function is provided. + - `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of all of the duplicated + columns which will be passed as a varargs. The return value is used. - `copycols` : whether vectors passed as columns should be copied If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. 
@@ -2902,7 +2898,7 @@ const INSERTCOLS_ARGUMENTS = """ insertcols(df::AbstractDataFrame[, col], (name=>val)::Pair...; - after::Bool=false, makeunique::Bool=false, dupcol=:error, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) Insert a column into a copy of `df` data frame using the [`insertcols!`](@ref) function and return the newly created data frame. @@ -2933,7 +2929,7 @@ julia> insertcols(df, 1, :b => 'a':'c') 2 │ b 2 3 │ c 3 -julia> insertcols(df, :c => 2:4, :c => 3:5, dupcol=:error) +julia> insertcols(df, :c => 2:4, :c => 3:5, makeunique=true) 3×3 DataFrame Row │ a c c_1 │ Int64 Int64 Int64 @@ -2953,13 +2949,13 @@ julia> insertcols(df, :a, :d => 7:9, after=true) ``` """ insertcols(df::AbstractDataFrame, args...; - after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = insertcols!(copy(df), args...; - after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols) + after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) """ insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...; - after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) Insert a column into a data frame in place. Return the updated data frame. 
@@ -2990,7 +2986,7 @@ julia> insertcols!(df, 1, :b => 'a':'c') 2 │ b 2 3 │ c 3 -julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, dupcol=:error) +julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) 3×4 DataFrame Row │ b c c_1 a │ Char Int64 Int64 Int64 @@ -3010,9 +3006,9 @@ julia> insertcols!(df, :b, :d => 7:9, after=true) ``` """ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...; - after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) - dupcol = _dupcol(dupcol, makeunique) + _check_makeunique_args(mergeduplicates, makeunique) if !is_column_insertion_allowed(df) throw(ArgumentError("insertcols! is only supported for DataFrame, or for " * @@ -3039,15 +3035,15 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy "$(ncol(df)) columns at index $col_ind")) end - if dupcol == :error + if !makeunique && isnothing(mergeduplicates) if !allunique(first.(name_cols)) throw(ArgumentError("Names of columns to be inserted into a data frame " * - "must be unique when `dupcol=:error`")) + "must be unique when `mergeduplicates=nothing`")) end for (n, _) in name_cols if hasproperty(df, n) throw(ArgumentError("Column $n is already present in the data frame " * - "which is not allowed when `dupcol=:error`")) + "which is not allowed when `mergeduplicates=nothing`")) end end end @@ -3081,6 +3077,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy target_row_count = 1 end + mergecolumns = Dict{Symbol, Any}() start_col_ind = col_ind for (name, item) in name_cols if !(item isa AbstractVector) @@ -3117,7 +3114,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy dfp[!, name] = item_new else if hasproperty(dfp, name) - if dupcol == :makeunique + if makeunique k = 1 while true nn = Symbol("$(name)_$k") @@ -3130,9 
+3127,10 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy insert!(index(dfp), col_ind, name) insert!(_columns(dfp), col_ind, item_new) else - @assert dupcol == :update # Just update without adding to index - dfp[!, name] = _update_missing.(dfp[!, name], item_new) + merge = get(mergecolumns, name, (dfp=dfp, cols=[])) + push!(merge.cols, item_new) + mergecolumns[name] = merge col_ind -= 1 end else @@ -3143,6 +3141,11 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy col_ind += 1 end + # Combine columns using mergeduplicates + for (name, merge) in mergecolumns + merge.dfp[!, name] = mergeduplicates.(merge.dfp[!, name], merge.cols...) + end + delta = col_ind - start_col_ind colmetadata_dict = getfield(parent(df), :colmetadata) if !isnothing(colmetadata_dict) && delta > 0 @@ -3157,22 +3160,22 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy end insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...; - after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)..., - after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols) + after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...; - after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = insertcols!(df, ncol(df)+1, name_cols..., after=after, - makeunique=makeunique, dupcol=dupcol, copycols=copycols) + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...; - after::Bool=false, 
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)..., - after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols) + after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false, - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) if col isa SymbolOrString col_ind = Int(columnindex(df, col)) if col_ind == 0 @@ -3196,7 +3199,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false, end function insertcols!(df::AbstractDataFrame; after::Bool=false, - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) _drop_all_nonnote_metadata!(parent(df)) return df end diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 3da3ce0912..1a4e9e9bb4 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -823,7 +823,7 @@ julia> permutedims(df2, 1, "different_name") """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, dest_namescol::Union{Symbol, AbstractString}; - makeunique::Bool=false, dupcol::Symbol=:error, strict::Bool=true) + makeunique::Bool=false, mergeduplicates=nothing, strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) @@ -854,18 +854,18 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, if ncol(df_notsrc) == 0 df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], new_col_names, - makeunique=makeunique, dupcol=dupcol, copycols=false) + makeunique=makeunique, mergeduplicates=mergeduplicates, 
copycols=false) else m = permutedims(Matrix(df_notsrc)) - df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique, dupcol=dupcol) + df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique) end - out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, dupcol=dupcol, copycols=false) + out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false) _copy_table_note_metadata!(out_df, df) return out_df end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; - makeunique::Bool=false, dupcol::Symbol=:error, strict::Bool=true) + makeunique::Bool=false, mergeduplicates=nothing, strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) dest_namescol = _names(df)[src_namescol] @@ -873,7 +873,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; dest_namescol = src_namescol end return permutedims(df, src_namescol, dest_namescol; - makeunique=makeunique, dupcol=dupcol, strict=strict) + makeunique=makeunique, mergeduplicates=mergeduplicates, strict=strict) end function Base.permutedims(df::AbstractDataFrame) @@ -883,8 +883,8 @@ function Base.permutedims(df::AbstractDataFrame) end function Base.permutedims(df::AbstractDataFrame, cnames::AbstractVector; - makeunique::Bool=false, dupcol::Symbol=:error) - out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique, dupcol=dupcol) + makeunique::Bool=false, mergeduplicates=nothing) + out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique, mergeduplicates=mergeduplicates) _copy_table_note_metadata!(out_df, df) return out_df end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index f76bcd4db7..37db743ac3 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -8,16 +8,16 @@ particularly a `Vector`, `PooledVector` or `CategoricalVector`. 
# Constructors ```julia -DataFrame(pairs::Pair...; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) -DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) +DataFrame(pairs::Pair...; makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) +DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) DataFrame(ds::AbstractDict; copycols::Bool=true) DataFrame(; kwargs..., copycols::Bool=true) DataFrame(table; copycols::Union{Bool, Nothing}=nothing) DataFrame(table, names::AbstractVector; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Union{Bool, Nothing}=nothing) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Union{Bool, Nothing}=nothing) DataFrame(columns::AbstractVecOrMat, names::AbstractVector; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) DataFrame(::DataFrameRow; copycols::Bool=true) DataFrame(::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true) @@ -35,7 +35,6 @@ DataFrame(::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true) To force a copy in such cases, or to get mutable columns from an immutable input table (like `Arrow.Table`), pass `copycols=true` explicitly. - `makeunique` : if `false` (the default), an error will be raised -- `Symbol` : One of :error (the default), :makeunique (same as makeunique=true), or :update (note that not all constructors support these keyword arguments) @@ -85,13 +84,13 @@ Pass the `copycols=false` keyword argument (where supported) to reuse vectors wi copying them. By default an error will be raised if duplicates in column names are found. 
Pass -`makeunique=true` keyword argument or `dupcol=:makeunique` (where supported) to accept duplicate names, +`makeunique=true` keyword argument to accept duplicate names, in which case they will be suffixed with `_i` (`i` starting at 1 for the first -duplicate). +duplicate), or provide a `mergeduplicates` function. -If duplicate column names are found and `dupcol=:update` then the left-hand column is `updated` -with values from the right-hand column (i.e. non-missing values in the right-hand column will -overwrite values in the left-hand column) +If `makeunique=false` and `mergeduplicates` is a Function then duplicate column names +will be combined by this function with the column named overwritten by the results of +the function on all values from the duplicated column(s). If an `AbstractRange` is passed to a `DataFrame` constructor as a column it is always collected to a `Vector` (even if `copycols=false`). As a general rule @@ -196,7 +195,8 @@ mutable struct DataFrame <: AbstractDataFrame # the inner constructor should not be used directly function DataFrame(columns::Union{Vector{Any}, Vector{AbstractVector}}, - colindex::Index; copycols::Bool=true) + colindex::Index; copycols::Bool=true, + mergeduplicates=nothing) if length(columns) == length(colindex) == 0 return new(AbstractVector[], Index(), nothing, nothing, true) elseif length(columns) != column_length(colindex) @@ -239,6 +239,7 @@ mutable struct DataFrame <: AbstractDataFrame # process updates if they exist if !isempty(colindex.updates) + merges = Dict{Symbol,Any}() updated = Vector{Any}(nothing, length(colindex.names)) for src in eachindex(colindex.updates) name = colindex.updates[src] @@ -246,9 +247,22 @@ mutable struct DataFrame <: AbstractDataFrame if isnothing(updated[dst]) updated[dst] = columns[src] else - updated[dst] = _update_missing.(updated[dst], columns[src]) + if isnothing(mergeduplicates) + msg = "Duplicate variable names: $name. 
Pass makeunique=true " * + "to make them unique using a suffix automatically." + throw(ArgumentError(msg)) + end + merge = get(merges, name, (dst=dst, columns=[])) + push!(merge.columns, columns[src]) + merges[name] = merge end end + + # Handle mergeduplicates updates + for (_, merge) in merges + updated[merge.dst] = mergeduplicates.(updated[merge.dst], merge.columns...) + end + columns = updated colindex = Index(colindex.lookup, colindex.names) end @@ -276,26 +290,26 @@ end DataFrame(df::DataFrame; copycols::Bool=true) = copy(df, copycols=copycols) function DataFrame(pairs::Pair{Symbol, <:Any}...; - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true)::DataFrame colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, Index(colnames, dupcol=_dupcol(dupcol, makeunique)), - copycols=copycols) + return DataFrame(columns, Index(colnames, makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) end function DataFrame(pairs::Pair{<:AbstractString, <:Any}...; - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true)::DataFrame colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, Index(colnames, dupcol=_dupcol(dupcol, makeunique)), - copycols=copycols) + return DataFrame(columns, Index(colnames, makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) end # this is needed as a workaround for Tables.jl dispatch function DataFrame(pairs::AbstractVector{<:Pair}; - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) if isempty(pairs) return DataFrame() @@ -305,8 +319,8 @@ function DataFrame(pairs::AbstractVector{<:Pair}; end colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, 
Index(colnames, dupcol=_dupcol(dupcol, makeunique)), - copycols=copycols) + return DataFrame(columns, Index(colnames, makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) end end @@ -358,14 +372,14 @@ function DataFrame(; kwargs...) end function DataFrame(columns::AbstractVector, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)::DataFrame - dupcol = _dupcol(dupcol, makeunique) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true)::DataFrame + _check_makeunique_args(mergeduplicates, makeunique) if !(eltype(columns) <: AbstractVector) && !all(col -> isa(col, AbstractVector), columns) - return rename!(DataFrame(columns, copycols=copycols), cnames, dupcol=dupcol) + return rename!(DataFrame(columns, copycols=copycols), cnames, makeunique=makeunique) end return DataFrame(collect(AbstractVector, columns), - Index(convert(Vector{Symbol}, cnames), dupcol=dupcol), - copycols=copycols) + Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) end function _name2symbol(str::AbstractVector) @@ -376,18 +390,18 @@ function _name2symbol(str::AbstractVector) end DataFrame(columns::AbstractVector, cnames::AbstractVector; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames), dupcol=_dupcol(dupcol, makeunique), copycols=copycols) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames), mergeduplicates=mergeduplicates, makeunique=makeunique, copycols=copycols) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)::DataFrame = + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true)::DataFrame = DataFrame(collect(AbstractVector, columns), - Index(convert(Vector{Symbol}, cnames), 
dupcol=_dupcol(dupcol, makeunique)), - copycols=copycols) + Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames); dupcol=_dupcol(dupcol, makeunique), copycols=copycols) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames); mergeduplicates=mergeduplicates, makeunique=makeunique, copycols=copycols) function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) if cnames !== :auto @@ -400,15 +414,15 @@ function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) end function DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) getter = copycols ? 
getindex : view return DataFrame(AbstractVector[getter(columns, :, i) for i in 1:size(columns, 2)], - cnames, dupcol=_dupcol(dupcol, makeunique), copycols=false) + cnames, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false) end DataFrame(columns::AbstractMatrix, cnames::AbstractVector; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames); dupcol=_dupcol(dupcol, makeunique), copycols=copycols) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true) if cnames !== :auto @@ -417,7 +431,7 @@ function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true) "positional argument is passed then the second " * "argument must be a vector of column names or :auto")) end - return DataFrame(columns, gennames(size(columns, 2)), dupcol=:error, copycols=copycols) + return DataFrame(columns, gennames(size(columns, 2)), mergeduplicates=nothing, copycols=copycols) end # Discontinued constructors @@ -1227,8 +1241,8 @@ end # hcat! 
for 2 arguments, only a vector or a data frame is allowed function hcat!(df1::DataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) - u = add_names(index(df1), index(df2), dupcol=_dupcol(dupcol, makeunique)) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) + u = add_names(index(df1), index(df2), makeunique=true, mergeduplicates=mergeduplicates) _drop_all_nonnote_metadata!(df1) _keep_matching_table_note_metadata!(df1, df2) @@ -1242,31 +1256,31 @@ end # TODO: after deprecation remove AbstractVector methods -function hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) +function hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) Base.depwarn("horizontal concatenation of data frame with a vector is deprecated. " * "Pass DataFrame(x1=x) instead.", :hcat!) return hcat!(df, DataFrame(AbstractVector[x], [:x1], copycols=false), - dupcol=_dupcol(dupcol, makeunique), copycols=copycols) + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) end -function hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=false, dupcol::Symbol=:error,copycols::Bool=true) +function hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=false, mergeduplicates=nothing,copycols::Bool=true) Base.depwarn("horizontal concatenation of data frame with a vector is deprecated. " * "Pass DataFrame(x1=x) instead.", :hcat!) return hcat!(DataFrame(AbstractVector[x], [:x1], copycols=copycols), df, - dupcol=_dupcol(dupcol, makeunique), copycols=copycols) + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) end # hcat! 
for 1-n arguments -function hcat!(df::DataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) +function hcat!(df::DataFrame; makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) _drop_all_nonnote_metadata!(df) return df end hcat!(a::DataFrame, b::Union{AbstractDataFrame, AbstractVector}, c::Union{AbstractDataFrame, AbstractVector}...; - makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) = - hcat!(hcat!(a, b, dupcol=_dupcol(dupcol, makeunique), copycols=copycols), - c..., dupcol=_dupcol(dupcol, makeunique), copycols=copycols) + makeunique::Bool=false, mergeduplicates=nothing, copycols::Bool=true) = + hcat!(hcat!(a, b, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols), + c..., makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) ############################################################################## ## diff --git a/src/join/composer.jl b/src/join/composer.jl index 48fd39d820..d1664c9747 100644 --- a/src/join/composer.jl +++ b/src/join/composer.jl @@ -119,7 +119,7 @@ _rename_cols(old_names::AbstractVector{Symbol}, function _propagate_join_metadata!(joiner::DataFrameJoiner, dfr_noon::AbstractDataFrame, res::DataFrame, kind::Symbol; - dupcol::Symbol=:error, names=nothing) + makeunique::Bool=false, mergeduplicates=nothing, names=nothing) @assert kind == :left || kind == :right || kind == :outer || kind == :inner # The steps taken in this function are (all applies only to :note-style metadata): @@ -175,12 +175,12 @@ function _propagate_join_metadata!(joiner::DataFrameJoiner, dfr_noon::AbstractDa end end - if dupcol != :update + if isnothing(mergeduplicates) for i in 1:ncol(dfr_noon) _copy_col_note_metadata!(res, ncol(joiner.dfl) + i, dfr_noon, i) end else - map = Index(names, dupcol=dupcol) + map = Index(names, makeunique=makeunique) for i in 1:ncol(dfr_noon) name = map.updates[ncol(joiner.dfl) + i] dst = map.lookup[name] @@ -245,7 +245,7 @@ function 
_count_sortperm!(input::Vector{Int}, count::Vector, end function compose_inner_table(joiner::DataFrameJoiner, - dupcol::Symbol, + makeunique, mergeduplicates, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, order::Symbol) @@ -288,9 +288,9 @@ function compose_inner_table(joiner::DataFrameJoiner, new_names = vcat(_rename_cols(_names(joiner.dfl), left_rename, joiner.left_on), _rename_cols(_names(dfr_noon), right_rename)) - res = DataFrame(cols, new_names, dupcol=dupcol, copycols=false) + res = DataFrame(cols, new_names, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false) - _propagate_join_metadata!(joiner, dfr_noon, res, :inner, dupcol=dupcol, names=new_names) + _propagate_join_metadata!(joiner, dfr_noon, res, :inner, makeunique=makeunique, mergeduplicates=mergeduplicates, names=new_names) return res end @@ -302,7 +302,7 @@ function find_missing_idxs(present::Vector{Int}, target_len::Int) return _findall(not_seen) end -function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, dupcol::Symbol, +function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique, mergeduplicates, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, indicator::Union{Nothing, Symbol, AbstractString}, @@ -324,12 +324,12 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, dupcol::Sym else rightonly_ixs = 1:0 end - return _compose_joined_table(joiner, kind, dupcol, left_rename, right_rename, + return _compose_joined_table(joiner, kind, makeunique, mergeduplicates, left_rename, right_rename, indicator, left_ixs, right_ixs, leftonly_ixs, rightonly_ixs, order) end -function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, dupcol::Symbol, +function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique, mergeduplicates, left_rename::Union{Function, AbstractString, Symbol}, 
right_rename::Union{Function, AbstractString, Symbol}, indicator::Union{Nothing, Symbol, AbstractString}, @@ -450,14 +450,14 @@ function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, dupcol::Sy new_names = vcat(_rename_cols(_names(joiner.dfl), left_rename, joiner.left_on), _rename_cols(_names(dfr_noon), right_rename)) - res = DataFrame(cols, new_names, dupcol=dupcol, copycols=false) + res = DataFrame(cols, new_names, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false) if new_order !== nothing isnothing(src_indicator) || permute!(src_indicator, new_order) permute!(res, new_order) end - _propagate_join_metadata!(joiner, dfr_noon, res, kind, dupcol=dupcol, names=new_names) + _propagate_join_metadata!(joiner, dfr_noon, res, kind, makeunique=makeunique, mergeduplicates=mergeduplicates, names=new_names) return res, src_indicator end @@ -494,7 +494,8 @@ function _sort_compose_helper(fillval::Int, # value to use to fill unused indice end function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector}, kind::Symbol, dupcol::Symbol, + on::Union{<:OnType, AbstractVector}, kind::Symbol, + makeunique::Bool, mergeduplicates, indicator::Union{Nothing, Symbol, AbstractString}, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}, left_rename::Union{Function, AbstractString, Symbol}, @@ -589,16 +590,16 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; src_indicator = nothing if kind == :inner - joined = compose_inner_table(joiner, dupcol, left_rename, right_rename, order) + joined = compose_inner_table(joiner, makeunique, mergeduplicates, left_rename, right_rename, order) elseif kind == :left joined, src_indicator = - compose_joined_table(joiner, kind, dupcol, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, makeunique, mergeduplicates, left_rename, right_rename, indicator, order) elseif kind == :right joined, src_indicator = - compose_joined_table(joiner, kind, 
dupcol, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, makeunique, mergeduplicates, left_rename, right_rename, indicator, order) elseif kind == :outer joined, src_indicator = - compose_joined_table(joiner, kind, dupcol, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, makeunique, mergeduplicates, left_rename, right_rename, indicator, order) elseif kind == :semi joined = joiner.dfl[find_semi_rows(joiner), :] elseif kind == :anti @@ -616,7 +617,7 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; invpool, pool) unique_indicator = indicator - if dupcol == :makeunique + if makeunique try_idx = 0 while hasproperty(joined, unique_indicator) try_idx += 1 @@ -624,12 +625,12 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end end - if unique_indicator == indicator && dupcol == :update - joined[!, indicator] = _update_missing.(joined[!, indicator], indicatorcol) + if unique_indicator == indicator && !isnothing(mergeduplicates) + joined[!, indicator] = mergeduplicates.(joined[!, indicator], indicatorcol) else if hasproperty(joined, unique_indicator) throw(ArgumentError("joined data frame already has column " * - ":$unique_indicator. Pass dupcol=:makeunique to " * + ":$unique_indicator. 
Pass makeunique=true to " * "make it unique using a suffix automatically.")) end joined[!, unique_indicator] = indicatorcol @@ -642,10 +643,10 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end """ - innerjoin(df1, df2; on, makeunique=false, dupcol=:error, validate=(false, false), + innerjoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) - innerjoin(df1, df2, dfs...; on, makeunique=false, dupcol=:error, + innerjoin(df1, df2, dfs...; on, makeunique=false, mergeduplicates=nothing, validate=(false, false), matchmissing=:error, order=:undefined) @@ -769,7 +770,7 @@ julia> innerjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = """ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), renamecols::Pair=identity => identity, matchmissing::Symbol=:error, @@ -778,7 +779,7 @@ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; throw(ArgumentError("renamecols keyword argument must be a `Pair` " * "containing functions, strings, or `Symbol`s")) end - return _join(df1, df2, on=on, kind=:inner, dupcol=_dupcol(dupcol, makeunique), + return _join(df1, df2, on=on, kind=:inner, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=nothing, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) @@ -786,17 +787,17 @@ end function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, 
false), matchmissing::Symbol=:error, order::Symbol=:undefined) @assert !isempty(dfs) - dupcol = _dupcol(dupcol, makeunique) - res = innerjoin(df1, df2, on=on, dupcol=dupcol, validate=validate, + _check_makeunique_args(mergeduplicates, makeunique) + res = innerjoin(df1, df2, on=on, mergeduplicates=mergeduplicates, validate=validate, matchmissing=matchmissing, order=order === :right ? :undefined : order) for (i, dfn) in enumerate(dfs) - res = innerjoin(res, dfn, on=on, dupcol=dupcol, validate=validate, + res = innerjoin(res, dfn, on=on, mergeduplicates=mergeduplicates, validate=validate, matchmissing=matchmissing, order= order === :right ? (i == length(dfs) ? :right : :undefined) : @@ -806,7 +807,7 @@ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::Abstract end """ - leftjoin(df1, df2; on, makeunique=false, dupcol=:error, source=nothing, validate=(false, false), + leftjoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) Perform a left join of two data frame objects and return a `DataFrame` containing @@ -830,11 +831,9 @@ change in future releases. if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (deprecated) (`i` starting at 1 for the first duplicate). -- `dupcol` : if `dupcol=:error` (the default) then columns names of passed objects must be unique. - If `dupcol=:makeunique` then duplicate column names will be suffixed - with `_i` (`i` starting at 1 for the first duplicate). - If `dupcol=:update` then duplicate columns names will be combined with the left-hand - column overwritten by non-missing values from the right hand column(s) +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of all of the duplicated + columns which will be passed as a varargs. 
The return value is used. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name, for whether a row appeared in only `df1` (`"left_only"`) or in both (`"both"`). If the name is already in use, @@ -936,7 +935,7 @@ julia> leftjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase => """ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -959,14 +958,14 @@ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:left, dupcol=_dupcol(dupcol, makeunique), + return _join(df1, df2, on=on, kind=:left, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) end """ - rightjoin(df1, df2; on, makeunique=false, dupcol=:error, source=nothing, + rightjoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) @@ -993,11 +992,9 @@ change in future releases. if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). (deprecated) -- `dupcol` : if `dupcol=:error` (the default) then columns names of passed objects must be unique. - If `dupcol=:makeunique` then duplicate column names will be suffixed - with `_i` (`i` starting at 1 for the first duplicate). 
- If `dupcol=:update` then duplicate columns names will be combined with the left-hand - column overwritten by non-missing values from the right hand column(s) +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of all of the duplicated + columns which will be passed as a varargs. The return value is used. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name for whether a row appeared in only `df2` (`"right_only"`) or in both (`"both"`). If the name is already in use, @@ -1099,7 +1096,7 @@ julia> rightjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = """ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -1121,16 +1118,16 @@ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:right, dupcol=_dupcol(dupcol, makeunique), + return _join(df1, df2, on=on, kind=:right, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) end """ - outerjoin(df1, df2; on, makeunique=false, dupcol::Symbol=:error, source=nothing, validate=(false, false), + outerjoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) - outerjoin(df1, df2, dfs...; on, 
makeunique=false, dupcol::Symbol=:error, + outerjoin(df1, df2, dfs...; on, makeunique=false, mergeduplicates=nothing, validate = (false, false), matchmissing=:error, order=:undefined) Perform an outer join of two or more data frame objects and return a `DataFrame` @@ -1157,11 +1154,9 @@ This behavior may change in future releases. if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). (deprecated) - - `dupcol` : if `dupcol=:error` (the default) then columns names of passed objects must be unique. - If `dupcol=:makeunique` then duplicate column names will be suffixed - with `_i` (`i` starting at 1 for the first duplicate). - If `dupcol=:update` then duplicate columns names will be combined with the left-hand - column overwritten by non-missing values from the right hand column(s) +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of all of the duplicated + columns which will be passed as a varargs. The return value is used. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name for whether a row appeared in only `df1` (`"left_only"`), only `df2` (`"right_only"`) or in both (`"both"`). 
If the name is already in use, @@ -1274,7 +1269,7 @@ julia> outerjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = """ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -1296,7 +1291,7 @@ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:outer, dupcol=_dupcol(dupcol, makeunique), + return _join(df1, df2, on=on, kind=:outer, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) @@ -1304,21 +1299,21 @@ end function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error, order::Symbol=:undefined) - dupcol = _dupcol(dupcol, makeunique) - res = outerjoin(df1, df2, on=on, dupcol=dupcol, validate=validate, + _check_makeunique_args(mergeduplicates, makeunique) + res = outerjoin(df1, df2, on=on, mergeduplicates=mergeduplicates, validate=validate, matchmissing=matchmissing, order=order) for dfn in dfs - res = outerjoin(res, dfn, on=on, dupcol=dupcol, validate=validate, + res = outerjoin(res, dfn, on=on, mergeduplicates=mergeduplicates, validate=validate, matchmissing=matchmissing, order=order) end return res end """ - semijoin(df1, df2; on, 
makeunique=false, dupcol=:error, validate=(false, false), matchmissing=:error) + semijoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, validate=(false, false), matchmissing=:error) Perform a semi join of two data frame objects and return a `DataFrame` containing the result. A semi join returns the subset of rows of `df1` that @@ -1420,16 +1415,16 @@ julia> semijoin(name, job2, on = [:ID => :identifier]) ``` """ semijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, dupcol::Symbol=:error, + on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, mergeduplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error) = - _join(df1, df2, on=on, kind=:semi, dupcol=_dupcol(dupcol, makeunique), + _join(df1, df2, on=on, kind=:semi, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=nothing, validate=validate, left_rename=identity, right_rename=identity, matchmissing=matchmissing, order=:left) """ - antijoin(df1, df2; on, makeunique=false, dupcol=:error, validate=(false, false), matchmissing=:error) + antijoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, validate=(false, false), matchmissing=:error) Perform an anti join of two data frame objects and return a `DataFrame` containing the result. 
An anti join returns the subset of rows of `df1` that do @@ -1524,10 +1519,10 @@ julia> antijoin(name, job2, on = [:ID => :identifier]) ``` """ antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, dupcol::Symbol=:error, + on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, mergeduplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error) = - _join(df1, df2, on=on, kind=:anti, dupcol=_dupcol(dupcol, makeunique), + _join(df1, df2, on=on, kind=:anti, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=nothing, validate=validate, left_rename=identity, right_rename=identity, matchmissing=matchmissing, @@ -1535,7 +1530,7 @@ antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; """ crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, dupcol::Symbol=:error, renamecols=identity => identity) + makeunique::Bool=false, mergeduplicates=nothing, renamecols=identity => identity) crossjoin(df1, df2, dfs...; makeunique = false) Perform a cross join of two or more data frame objects and return a `DataFrame` @@ -1601,22 +1596,22 @@ julia> crossjoin(df1, df2) ``` """ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, dupcol::Symbol=:error, renamecols::Pair=identity => identity) + makeunique::Bool=false, mergeduplicates=nothing, renamecols::Pair=identity => identity) _check_consistency(df1) _check_consistency(df2) - dupcol = _dupcol(dupcol, makeunique) + _check_makeunique_args(mergeduplicates, makeunique) r1, r2 = size(df1, 1), size(df2, 1) new_names = vcat(_rename_cols(_names(df1), first(renamecols)), _rename_cols(_names(df2), last(renamecols))) cols = Any[[repeat(c, inner=r2) for c in eachcol(df1)]; [repeat(c, outer=r1) for c in eachcol(df2)]] - res = DataFrame(cols, new_names, copycols=false, dupcol=dupcol) + res = DataFrame(cols, new_names, 
copycols=false, makeunique=makeunique, mergeduplicates=mergeduplicates) for i in 1:ncol(df1) _copy_col_note_metadata!(res, i, df1, i) end - if dupcol != :update + if isnothing(mergeduplicates) for i in 1:ncol(df2) _copy_col_note_metadata!(res, ncol(df1) + i, df2, i) end @@ -1628,8 +1623,8 @@ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; end crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; - makeunique::Bool=false, dupcol::Symbol=:error) = - crossjoin(crossjoin(df1, df2, dupcol=_dupcol(dupcol, makeunique)), dfs..., dupcol=_dupcol(dupcol, makeunique)) + makeunique::Bool=false, mergeduplicates=nothing) = + crossjoin(crossjoin(df1, df2, makeunique=makeunique, mergeduplicates=mergeduplicates), dfs..., makeunique=makeunique, mergeduplicates=mergeduplicates) # an explicit error is thrown as join was supported in the past Base.join(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; diff --git a/src/join/inplace.jl b/src/join/inplace.jl index d7cda1112b..becdab2562 100644 --- a/src/join/inplace.jl +++ b/src/join/inplace.jl @@ -1,5 +1,5 @@ """ - leftjoin!(df1, df2; on, makeunique=false, dupcol=:error, source=nothing, + leftjoin!(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, matchmissing=:error) @@ -25,15 +25,13 @@ added to `df1`. if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). -- `dupcol` : one of :error (the default), :makeunique or :update. If :error, - an error will be raised if duplicate names are found in columns not joined on; - if :makeunique, duplicate names will be suffixed with `_i` - (`i` starting at 1 for the first duplicate); if :update, left-hand side columns - will be overwritten by non-missing values in the right-hand side column(s). +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. 
It should be given a Function that combines the values of all of the duplicated + columns which will be passed as a varargs. The return value is used. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name, for whether a row appeared in only `df1` (`"left_only"`) or in both (`"both"`). If the name is already in use, - the column name will be modified if `makeunique=true` or `dupcol=:makeunique`. + the column name will be modified if `makeunique=true`. - `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns. @@ -101,13 +99,13 @@ julia> leftjoin!(name, job2, on = :ID => :identifier, makeunique=true, source=:s """ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector}=Symbol[], - makeunique::Bool=false, dupcol::Symbol=:error, + makeunique::Bool=false, mergeduplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, matchmissing::Symbol=:error) _check_consistency(df1) _check_consistency(df2) - dupcol = _dupcol(dupcol, makeunique) + _check_makeunique_args(mergeduplicates, makeunique) if !is_column_insertion_allowed(df1) throw(ArgumentError("leftjoin! is only supported if `df1` is a `DataFrame`, " * @@ -121,11 +119,11 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; joiner = DataFrameJoiner(df1, df2, on, matchmissing, :left) right_noon_names = names(joiner.dfr, Not(joiner.right_on)) - if dupcol == :error && !isempty(intersect(right_noon_names, names(df1))) + if mergeduplicates == :error && !isempty(intersect(right_noon_names, names(df1))) throw(ArgumentError("the following columns are present in both " * "left and right data frames but not listed in `on`: " * join(intersect(right_noon_names, names(df1)), ", ") * - ". 
Pass makeunique=true or dupcol=:makeunique to add a suffix automatically to " * + ". Pass makeunique=true to add a suffix automatically to " * "columns names from the right data frame.")) end @@ -141,7 +139,7 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; rcol_joined = compose_joined_rcol!(rcol, similar_missing(rcol, nrow(df1)), right_ixs) # if df1 isa SubDataFrame we must copy columns - insertcols!(df1, colname => rcol_joined, dupcol=dupcol, + insertcols!(df1, colname => rcol_joined, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=!(df1 isa DataFrame)) # need to call parent as df1 can be a SubDataFrame _copy_col_note_metadata!(parent(df1), ncol(df1), joiner.dfr, colname) @@ -156,7 +154,7 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; invpool, pool) unique_indicator = source - if dupcol == :makeunique + if makeunique try_idx = 0 while hasproperty(df1, unique_indicator) try_idx += 1 @@ -165,13 +163,13 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; end if hasproperty(df1, unique_indicator) - if dupcol == :update - df1[!, unique_indicator] = _update_missing.(df1[!, unique_indicator], indicatorcol) + if !isnothing(mergeduplicates) + df1[!, unique_indicator] = mergeduplicates.(df1[!, unique_indicator], indicatorcol) else throw(ArgumentError("joined data frame already has column " * - ":$unique_indicator. Pass dupcol=:makeunique to " * - "make it unique using a suffix automatically or dupcol=:update " * - "to update left-hand column from right-hand.")) + ":$unique_indicator. 
Pass makeunique=true to " * + "make it unique using a suffix automatically or set mergeduplicates " * + "to a function.")) end else df1[!, unique_indicator] = indicatorcol @@ -209,15 +207,15 @@ end function outerjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector}=Symbol[], makeunique::Bool=false, - dupcol::Symbol=:error, + mergeduplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, matchmissing::Symbol=:error) - dupcol = (makeunique ? :makeunique : dupcol) + mergeduplicates = (makeunique ? :makeunique : mergeduplicates) - leftjoin!(df1, df2, on=on, dupcol=dupcol, source=source, matchmissing=matchmissing) + leftjoin!(df1, df2, on=on, mergeduplicates=mergeduplicates, source=source, matchmissing=matchmissing) - aj = antijoin(df2, df1, on=on, dupcol=dupcol, matchmissing=matchmissing) + aj = antijoin(df2, df1, on=on, mergeduplicates=mergeduplicates, matchmissing=matchmissing) append!(df1, aj) return df1 diff --git a/src/other/index.jl b/src/other/index.jl index 2246a1f9ff..fad3d426bb 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -26,12 +26,12 @@ end Index(l,u) = Index(l,u,[]) -function Index(names::AbstractVector{Symbol}; dupcol::Symbol=:error) - if dupcol == :update +function Index(names::AbstractVector{Symbol}; makeunique::Bool=false) + if !makeunique lookup = Dict{Symbol, Int}(zip(reverse(names), length(names):-1:1)) return Index(lookup, unique(names), names) else - u = make_unique(names, dupcol=dupcol) + u = make_unique(names, makeunique=makeunique) lookup = Dict{Symbol, Int}(zip(u, 1:length(u))) return Index(lookup, u) end @@ -51,13 +51,12 @@ Base.isequal(x::AbstractIndex, y::AbstractIndex) = _names(x) == _names(y) # it i Base.:(==)(x::AbstractIndex, y::AbstractIndex) = isequal(x, y) -function rename!(x::Index, nms::AbstractVector{Symbol}; makeunique::Bool=false, dupcol::Symbol=:error) - dupcol = _dupcol(dupcol, makeunique) - if dupcol == :error +function rename!(x::Index, 
nms::AbstractVector{Symbol}; makeunique::Bool=false) + if !makeunique if length(unique(nms)) != length(nms) dup = unique(nms[nonunique(DataFrame(nms=nms))]) dupstr = join(string.(':', dup), ", ", " and ") - msg = "Duplicate variable names: $dupstr. Pass dupcol=:makeunique " * + msg = "Duplicate variable names: $dupstr. Pass makeunique=true " * "to make them unique using a suffix automatically." throw(ArgumentError(msg)) end @@ -65,7 +64,7 @@ function rename!(x::Index, nms::AbstractVector{Symbol}; makeunique::Bool=false, if length(nms) != length(x) throw(DimensionMismatch("Length of nms doesn't match length of x.")) end - make_unique!(x.names, nms, dupcol=dupcol) + make_unique!(x.names, nms, makeunique=makeunique) empty!(x.lookup) for (i, n) in enumerate(x.names) x.lookup[n] = i @@ -139,8 +138,8 @@ function Base.push!(x::Index, nm::Symbol) return x end -function Base.merge!(x::Index, y::AbstractIndex; makeunique::Bool=false, dupcol::Symbol=:error) - adds = add_names(x, y, dupcol=_dupcol(dupcol, makeunique)) +function Base.merge!(x::Index, y::AbstractIndex; makeunique::Bool=false, mergeduplicates=nothing) + adds = add_names(x, y, makeunique=makeunique, mergeduplicates=mergeduplicates) i = length(x) for add in adds i += 1 @@ -150,8 +149,8 @@ function Base.merge!(x::Index, y::AbstractIndex; makeunique::Bool=false, dupcol: return x end -Base.merge(x::AbstractIndex, y::AbstractIndex; makeunique::Bool=false, dupcol::Symbol=:error) = - merge!(copy(x), y, makeunique=makeunique, dupcol=dupcol) +Base.merge(x::AbstractIndex, y::AbstractIndex; makeunique::Bool=false, mergeduplicates=nothing) = + merge!(copy(x), y, makeunique=makeunique, mergeduplicates=mergeduplicates) function Base.delete!(x::Index, idx::Integer) # reset the lookup's beyond the deleted item @@ -442,10 +441,10 @@ end # Helpers # return Vector{Symbol} of names from add_ind that do not clash with `ind`. 
-# if `dupcol=:error` error on collision -# if `dupcol=:makeunique` generate new names that are deduplicated -# if `dupcol=:update` just return the names including duplicates -function add_names(ind::Index, add_ind::AbstractIndex; dupcol::Symbol=:error) +# if `makeunique=false` error on collision +# if `makeunique=true` generate new names that are deduplicated +# if `mergeduplicates` is a function just return the names including duplicates +function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false, mergeduplicates=nothing) u = copy(_names(add_ind)) seen = Set(_names(ind)) @@ -456,7 +455,7 @@ function add_names(ind::Index, add_ind::AbstractIndex; dupcol::Symbol=:error) in(name, seen) ? push!(dups, i) : push!(seen, name) end if length(dups) > 0 - if dupcol == :error + if !makeunique && isnothing(mergeduplicates) dupstr = join(string.(':', unique(u[dups])), ", ", " and ") msg = "Duplicate variable names: $dupstr. Pass makeunique=true " * "to make them unique using a suffix automatically." throw(ArgumentError(msg)) end end - if dupcol == :update + if !isnothing(mergeduplicates) return u end @@ -595,7 +594,7 @@ function Base.getindex(x::SubIndex, idx::Union{AbstractVector{Symbol}, return [x[i] for i in idx] end -rename!(x::SubIndex, nms::AbstractVector{Symbol}; makeunique::Bool=false, dupcol::Symbol=:error) = +rename!(x::SubIndex, nms::AbstractVector{Symbol}; makeunique::Bool=false) = throw(ArgumentError("rename! 
is not supported for views other than created " * "with Colon as a column selector")) diff --git a/src/other/tables.jl b/src/other/tables.jl index 070e53b438..8d28487832 100644 --- a/src/other/tables.jl +++ b/src/other/tables.jl @@ -64,11 +64,11 @@ end # the logic here relies on the fact that Tables.CopiedColumns # is the only exception for default copycols value -DataFrame(x, cnames::AbstractVector; makeunique::Bool=false, dupcol::Symbol=:error, +DataFrame(x, cnames::AbstractVector; makeunique::Bool=false, mergeduplicates=nothing, copycols::Union{Nothing, Bool}=nothing) = rename!(DataFrame(x, copycols=something(copycols, !(x isa Tables.CopiedColumns))), _name2symbol(cnames), - makeunique=makeunique, dupcol=dupcol) + makeunique=makeunique) function Base.append!(df::DataFrame, table; cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset])) diff --git a/src/other/utils.jl b/src/other/utils.jl index 0422a18071..8a2e7560b1 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -72,26 +72,17 @@ struct AsTable end end -_update_missing(v1, v2) = ismissing(v2) ? v1 : v2 - -function _dupcol(dupcol::Symbol, makeunique=false) - if dupcol ∉ [:error, :makeunique, :update] - throw(ArgumentError("dupcol must be one of :error, :makeunique, or :update")) - end - if makeunique && dupcol == :update - throw(ArgumentError("makeunique=true and dupcol==:update are incompatible")) - end - if makeunique - Base.depwarn("makeunique=true will be replaced by dupcol=:makeunique", :unstack) +function _check_makeunique_args(mergeduplicates, makeunique::Bool=false) + if makeunique && !isnothing(mergeduplicates) + throw(ArgumentError("mergeduplicates should not be set if makeunique=true")) end - makeunique ? 
:makeunique : dupcol + mergeduplicates end Base.broadcastable(x::AsTable) = Ref(x) function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; - makeunique::Bool=false, dupcol::Symbol=:error) - dupcol = _dupcol(dupcol, makeunique) + makeunique::Bool=false) if length(names) != length(src) throw(DimensionMismatch("Length of src doesn't match length of names.")) end @@ -108,9 +99,9 @@ function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; end if length(dups) > 0 - if dupcol == :error + if !makeunique dupstr = join(string.(':', unique(src[dups])), ", ", " and ") - msg = "Duplicate variable names: $dupstr. Pass dupcol=:makeunique " * + msg = "Duplicate variable names: $dupstr. Pass makeunique=true " * "to make them unique using a suffix automatically." throw(ArgumentError(msg)) end @@ -118,7 +109,7 @@ function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; for i in dups nm = src[i] - if dupcol == :makeunique + if makeunique k = 1 while true newnm = Symbol("$(nm)_$k") @@ -137,8 +128,8 @@ function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; return names end -function make_unique(names::AbstractVector{Symbol}; makeunique::Bool=false, dupcol::Symbol=:error) - make_unique!(similar(names), names, dupcol=_dupcol(dupcol, makeunique)) +function make_unique(names::AbstractVector{Symbol}; makeunique::Bool=false) + make_unique!(similar(names), names, makeunique=makeunique) end """ diff --git a/test/cat.jl b/test/cat.jl index 4bf4e3cd55..d8999311ea 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -3,6 +3,8 @@ module TestCat using Test, Random, DataFrames, CategoricalArrays const ≅ = isequal +update_missing = (x...) -> coalesce(reverse(x)...) 
+ @testset "hcat" begin nvint = [1, 2, missing, 4] nvstr = ["one", "two", missing, "four"] @@ -13,14 +15,14 @@ const ≅ = isequal df5 = DataFrame([Union{Int, Missing}[1, 2, 3, 4], nvstr], :auto) ref_df = copy(df3) - dfh = hcat(df3, df4, dupcol=:makeunique) + dfh = hcat(df3, df4, makeunique=true) @test ref_df ≅ df3 # make sure that df3 is not mutated by hcat @test size(dfh, 2) == 3 @test names(dfh) ≅ ["x1", "x1_1", "x2"] @test dfh[!, :x1] ≅ df3[!, :x1] - @test dfh ≅ DataFrames.hcat!(DataFrame(), df3, df4, dupcol=:makeunique) + @test dfh ≅ DataFrames.hcat!(DataFrame(), df3, df4, makeunique=true) - dfhu = hcat(df3, df4, dupcol=:update) + dfhu = hcat(df3, df4, mergeduplicates=update_missing) @test ref_df ≅ df3 # make sure that df3 is not mutated by hcat @test size(dfhu, 2) == 2 @test names(dfhu) ≅ ["x1", "x2"] @@ -30,24 +32,24 @@ const ≅ = isequal dfb = DataFrame(b=[3, missing]) @test hcat(dfa, dfb) ≅ [dfa dfb] - dfh3 = hcat(df3, df4, df5, dupcol=:makeunique) + dfh3 = hcat(df3, df4, df5, makeunique=true) @test names(dfh3) == ["x1", "x1_1", "x2", "x1_2", "x2_1"] - @test dfh3 ≅ hcat(dfh, df5, dupcol=:makeunique) - @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, dupcol=:makeunique) + @test dfh3 ≅ hcat(dfh, df5, makeunique=true) + @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, makeunique=true) - @test df2 ≅ DataFrames.hcat!(df2, dupcol=:makeunique) + @test df2 ≅ DataFrames.hcat!(df2, makeunique=true) - dfh3 = hcat(df3, df4, df5, dupcol=:update) + dfh3 = hcat(df3, df4, df5, mergeduplicates=update_missing) @test names(dfh3) == ["x1", "x2"] - @test dfh3 ≅ hcat(dfhu, df5, dupcol=:update) - @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, dupcol=:update) + @test dfh3 ≅ hcat(dfhu, df5, mergeduplicates=update_missing) + @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, mergeduplicates=update_missing) end @testset "hcat: copying" begin df = DataFrame(x=1:3) @test hcat(df)[!, 1] == df[!, 1] @test hcat(df)[!, 1] !== df[!, 1] - hdf = hcat(df, df, 
dupcol=:makeunique) + hdf = hcat(df, df, makeunique=true) @test hdf[!, 1] == df[!, 1] @test hdf[!, 1] !== df[!, 1] @test hdf[!, 2] == df[!, 1] @@ -61,7 +63,7 @@ end @test hdf[!, 2] !== df[!, 1] @test hdf[!, 1] == hdf[!, 2] @test hdf[!, 1] !== hdf[!, 2] - hdf = hcat(df, df, df, dupcol=:makeunique) + hdf = hcat(df, df, df, makeunique=true) @test hdf[!, 1] == df[!, 1] @test hdf[!, 1] !== df[!, 1] @test hdf[!, 2] == df[!, 1] @@ -74,7 +76,7 @@ end @test hdf[!, 1] !== hdf[!, 3] @test hdf[!, 2] == hdf[!, 3] @test hdf[!, 2] !== hdf[!, 3] - hdf = hcat(df, df, dupcol=:update) + hdf = hcat(df, df, mergeduplicates=update_missing) @test hdf ≅ df end @@ -82,14 +84,11 @@ end df = DataFrame(A=repeat('A':'C', inner=4), B=1:12) gd = groupby(df, :A) answer = DataFrame(A=fill('A', 4), B=1:4, A_1='B', B_1=5:8, A_2='C', B_2=9:12) - @test hcat(gd..., dupcol=:makeunique) == answer + @test hcat(gd..., makeunique=true) == answer answer = answer[:, 1:4] - @test hcat(gd[1], gd[2], dupcol=:makeunique) == answer @test_throws MethodError hcat("a", df, makeunique=true) @test_throws MethodError hcat(df, "a", makeunique=true) - @test_throws MethodError hcat("a", df, dupcol=:makeunique) - @test_throws MethodError hcat(df, "a", dupcol=:makeunique) end @testset "hcat: copycols" begin diff --git a/test/dataframe.jl b/test/dataframe.jl index c36388ab5a..1f4f0e69b2 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -124,6 +124,8 @@ end @test DataFrame(a=[1, 2, missing], b=[4, 5, 6]) ≇ DataFrame(a=[1, 2, 3], b=[4, 5, 6]) end +update_missing = (x...) -> coalesce(reverse(x)...) 
+ @testset "copying" begin df = DataFrame(a=Union{Int, Missing}[2, 3], b=Union{DataFrame, Missing}[DataFrame(c=1), DataFrame(d=2)]) @@ -152,8 +154,7 @@ end @test names(rename(df, [:f, :g])) == ["f", "g"] @test names(rename(df, [:f, :f], makeunique=true)) == ["f", "f_1"] - @test names(rename(df, [:f, :f], dupcol=:makeunique)) == ["f", "f_1"] - @test names(rename(df, [:f, :f], dupcol=:update)) == ["f", "f"] + #@test names(rename(df, [:f, :f], mergeduplicates=update_missing)) == ["f", "f"] @test names(df) == ["a", "b"] rename!(df, [:f, :g]) @@ -210,7 +211,7 @@ end @test df.newcol == ["a", "b"] @test_throws ArgumentError insertcols!(df, 1, :newcol => ["a1", "b1"]) - @test insertcols!(df, 1, :newcol => ["a1", "b1"], dupcol=:makeunique) == df + @test insertcols!(df, 1, :newcol => ["a1", "b1"], makeunique=true) == df @test propertynames(df) == [:newcol_1, :newcol, :a, :b] @test df.a == [1, 2] @test df.b == [3.0, 4.0] @@ -237,7 +238,7 @@ end @test df.newcol == ["a", "b"] @test_throws ArgumentError insertcols!(df, 1, "newcol" => ["a1", "b1"]) - @test insertcols!(df, 1, "newcol" => ["a1", "b1"], dupcol=:makeunique) == df + @test insertcols!(df, 1, "newcol" => ["a1", "b1"], makeunique=true) == df @test propertynames(df) == [:newcol_1, :newcol, :a, :b] @test df.a == [1, 2] @test df.b == [3.0, 4.0] @@ -256,9 +257,9 @@ end df = DataFrame(a=[1, 2], a_1=[3, 4]) @test_throws ArgumentError insertcols!(df, 1, :a => [11, 12]) @test df == DataFrame(a=[1, 2], a_1=[3, 4]) - insertcols!(df, 1, :a => [11, 12], dupcol=:makeunique) + insertcols!(df, 1, :a => [11, 12], makeunique=true) @test propertynames(df) == [:a_2, :a, :a_1] - insertcols!(df, 4, :a => [11, 12], dupcol=:makeunique) + insertcols!(df, 4, :a => [11, 12], makeunique=true) @test propertynames(df) == [:a_2, :a, :a_1, :a_3] df = DataFrame(a=[1, 2], a_1=[3, 4]) @@ -267,12 +268,12 @@ end insertcols!(df, 4, :a => [11, 12], makeunique=true) @test propertynames(df) == [:a_2, :a, :a_1, :a_3] - df = DataFrame(a=[1, 2], a_1=[3, 4]) - 
insertcols!(df, 1, :a => [11, 12], dupcol=:update) + df = DataFrame(a=[1, 2, 3], a_1=[3, 4, 5]) + insertcols!(df, 1, :a => [11, 12, missing], mergeduplicates=update_missing) @test propertynames(df) == [:a, :a_1] - @test df == DataFrame(a=[11, 12], a_1=[3, 4]) + @test df == DataFrame(a=[11, 12, 3], a_1=[3, 4, 5]) - @test_throws ArgumentError insertcols!(df, 10, :a => [11, 12], dupcol=:makeunique) + @test_throws ArgumentError insertcols!(df, 10, :a => [11, 12], makeunique=true) @test_throws ArgumentError insertcols!(df, 10, :a => [11, 12], makeunique=true) dfc = copy(df) @@ -319,7 +320,7 @@ end @test df.a_2 === v3 df = DataFrame() - @test insertcols!(df, 1, :a=>v1, :a=>v2, :a=>v3, dupcol=:update, copycols=false) == + @test insertcols!(df, 1, :a=>v1, :a=>v2, :a=>v3, mergeduplicates=update_missing, copycols=false) == DataFrame(a=v3) @test df.a isa Vector{Int} @@ -333,7 +334,7 @@ end df = DataFrame(p='a':'b', q='r':'s') @test_throws ArgumentError insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3) - @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, dupcol=:makeunique, copycols=true) == + @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, makeunique=true, copycols=true) == DataFrame(p='a':'b', p_1=v1, q_1=v2, p_2=v3, q='r':'s') @test df.p_1 isa Vector{Int} @test df.q_1 !== v2 @@ -345,7 +346,7 @@ end df = DataFrame(p='a':'b', q='r':'s') @test_throws ArgumentError insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3) - @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, dupcol=:update, copycols=true) == + @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, mergeduplicates=update_missing, copycols=true) == DataFrame(p=v3, q=v2) df = DataFrame(a=1:3, b=4:6) diff --git a/test/join.jl b/test/join.jl index 3d19b2b115..f0397fbd6b 100644 --- a/test/join.jl +++ b/test/join.jl @@ -251,21 +251,23 @@ end @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] end +update_missing = (x...) -> coalesce(reverse(x)...) 
+ @testset "update joins" begin df1 = DataFrame(Any[[1, 3, 5], [1.0, 3.0, 5.0]], [:id, :fid]) df2 = DataFrame(Any[[0, 1, 2, 3, 4], [0.0, 1.0, 2.0, 3.0, 4.0]], [:id, :fid]) - @test crossjoin(df1, df2, dupcol=:update) == + @test crossjoin(df1, df2, mergeduplicates=update_missing) == DataFrame(Any[repeat([0, 1, 2, 3, 4], outer=3), repeat([0.0, 1.0, 2.0, 3.0, 4.0], outer=3)], [:id, :fid]) - i(on,dupcol=:update) = innerjoin(df1, df2, on=on, dupcol=dupcol) - l(on,dupcol=:update) = leftjoin(df1, df2, on=on, dupcol=dupcol) - r(on,dupcol=:update) = rightjoin(df1, df2, on=on, dupcol=dupcol) - o(on,dupcol=:update) = outerjoin(df1, df2, on=on, dupcol=dupcol) - s(on,dupcol=:update) = semijoin(df1, df2, on=on, dupcol=dupcol) - a(on,dupcol=:update) = antijoin(df1, df2, on=on, dupcol=dupcol) + i(on,mergeduplicates=update_missing) = innerjoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + l(on,mergeduplicates=update_missing) = leftjoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + r(on,mergeduplicates=update_missing) = rightjoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + o(on,mergeduplicates=update_missing) = outerjoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + s(on,mergeduplicates=update_missing) = semijoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + a(on,mergeduplicates=update_missing) = antijoin(df1, df2, on=on, mergeduplicates=mergeduplicates) @test s(:id) == s(:fid) ==