Skip to content

Commit d25ef12

Browse files
authored
Add Melt method to DataFrame (#7578)
* Add Melt method to DataFrame
* Add test for empty string
* Remove unused import
* Add tests for Melt on empty dataframes
* Treat different column types as the same as long as they have the same underlying data type.
* Specify paramName in ArgumentException
* Use nameof() in ArgumentExceptions
* Moved validation out of separate method to allow for nameof() use.
* Guard against possible null idColumns parameter
* Do not allow missing variableName or valueName parameters
* Add validation for column names that match existing column names
* Add more tests for invalid data
* Use HashSet in case the DataFrame has many columns
* Add tests for null idColumns and valueColumns parameters
* Remove bad test
* Test default parameter values
* Cache idColumns to avoid repeated lookups
* Refactor Melt exceptions to use string resources
* Use ArgumentNullException for null idColumns parameter
* Optimize column name check
* Require variableName and valueName to be different
* Test for ID and value columns that do not exist
* Do not drop empty strings with dropNulls = true
* Rename mixed types to convertToString
* Add remark to XML doc for melt
* Add test case with multiple id columns
* Remove unused import
* Remove dead code in Melt
* Fix example output for Melt
1 parent 0cef6a2 commit d25ef12

4 files changed

Lines changed: 631 additions & 1 deletion

File tree

src/Microsoft.Data.Analysis/DataFrame.cs

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,275 @@ public DataFrame Append(IEnumerable<KeyValuePair<string, object>> row, bool inPl
667667
return ret;
668668
}
669669

670+
/// <summary>
/// Transforms the DataFrame from wide format to long format by unpivoting specified columns.
/// This operation takes multiple value columns and "melts" them into two columns: one containing
/// the original column names (variable) and one containing the values.
/// </summary>
/// <param name="idColumns">
/// Column names to use as identifier variables. These columns will be repeated in the output
/// for each value column. Must not be null and must contain at least one column name.
/// </param>
/// <param name="valueColumns">
/// Column names to unpivot into the variable and value columns. If null, all columns not
/// specified in <paramref name="idColumns"/> will be used as value columns.
/// </param>
/// <param name="variableName">
/// Name for the new column that will contain the original value column names. Defaults to "variable".
/// Must not be null, empty, or whitespace, and must not match an existing column name.
/// </param>
/// <param name="valueName">
/// Name for the new column that will contain the values from the unpivoted columns. Defaults to "value".
/// Must not be null, empty, or whitespace, must not match an existing column name, and must differ
/// from <paramref name="variableName"/>.
/// If value columns contain different types, this column will be of type string; otherwise, it will
/// match the type of the first value column.
/// </param>
/// <param name="dropNulls">
/// If true, rows where the value is null will be excluded from the result. Only null values are
/// dropped; empty strings are kept. Defaults to false.
/// </param>
/// <returns>
/// A new DataFrame in long format with columns for each ID column, plus the variable and value columns.
/// The number of rows is (number of original rows × number of value columns) when
/// <paramref name="dropNulls"/> is false, or fewer when it is true and null values are present.
/// </returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="idColumns"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="variableName"/> or <paramref name="valueName"/> is null, empty, or
/// whitespace; when <paramref name="idColumns"/> is empty; when <paramref name="valueColumns"/> is
/// specified but empty; when any column appears in both <paramref name="idColumns"/> and
/// <paramref name="valueColumns"/>; when <paramref name="variableName"/> or <paramref name="valueName"/>
/// matches an existing column name; when <paramref name="variableName"/> equals
/// <paramref name="valueName"/>; or when a name in <paramref name="idColumns"/> or
/// <paramref name="valueColumns"/> is not a column of this DataFrame.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when <paramref name="valueColumns"/> is null and there are no columns available to use as
/// value columns after excluding the ID columns.
/// </exception>
/// <example>
/// <code>
/// // Original DataFrame:
/// // | ID | Name  | 2020 | 2021 | 2022 |
/// // |----|-------|------|------|------|
/// // | 1  | Alice | 100  | 110  | 120  |
/// // | 2  | Bob   | 200  | 210  | 220  |
///
/// var melted = df.Melt(
///     idColumns: new[] { "ID", "Name" },
///     valueColumns: new[] { "2020", "2021", "2022" },
///     variableName: "Year",
///     valueName: "Sales"
/// );
///
/// // Result:
/// // | ID | Name  | Year | Sales |
/// // |----|-------|------|-------|
/// // | 1  | Alice | 2020 | 100   |
/// // | 2  | Bob   | 2020 | 200   |
/// // | 1  | Alice | 2021 | 110   |
/// // | 2  | Bob   | 2021 | 210   |
/// // | 1  | Alice | 2022 | 120   |
/// // | 2  | Bob   | 2022 | 220   |
/// </code>
/// </example>
/// <remarks>
/// Note: The output rows are ordered by value column (all rows for the first value column,
/// then all rows for the second, etc.), which differs from pandas.melt() which orders by
/// source row.
/// </remarks>
public DataFrame Melt(IEnumerable<string> idColumns, IEnumerable<string> valueColumns = null, string variableName = "variable", string valueName = "value", bool dropNulls = false)
{
    if (string.IsNullOrWhiteSpace(variableName))
    {
        throw new ArgumentException(Strings.ParameterMustNotBeNullOrWhitespace, nameof(variableName));
    }

    if (string.IsNullOrWhiteSpace(valueName))
    {
        throw new ArgumentException(Strings.ParameterMustNotBeNullOrWhitespace, nameof(valueName));
    }

    if (idColumns == null)
    {
        throw new ArgumentNullException(nameof(idColumns));
    }

    // Materialize once so the caller's enumerable is not enumerated repeatedly.
    var idColumnList = idColumns.ToList();

    // Reject an empty ID list before doing any work that depends on it.
    if (idColumnList.Count == 0)
    {
        throw new ArgumentException(Strings.MissingIdColumns, nameof(idColumns));
    }

    // A single set serves both the default value-column selection and the
    // ID/value overlap check, keeping each at one hash lookup per column.
    HashSet<string> idColumnSet = [.. idColumnList];

    List<string> valueColumnList;

    if (valueColumns is null)
    {
        // Default: every column that is not an ID column becomes a value column.
        valueColumnList = _columnCollection
            .Where(c => !idColumnSet.Contains(c.Name))
            .Select(c => c.Name)
            .ToList();

        if (valueColumnList.Count == 0)
        {
            throw new InvalidOperationException(Strings.NoValueColumnsRemaining);
        }
    }
    else
    {
        valueColumnList = valueColumns.ToList();

        if (valueColumnList.Count == 0)
        {
            throw new ArgumentException(Strings.MissingValueColumns, nameof(valueColumns));
        }

        if (valueColumnList.Any(v => idColumnSet.Contains(v)))
        {
            throw new ArgumentException(Strings.DuplicateColumnsInIdAndValueLists, nameof(valueColumns));
        }
    }

    // The two generated columns must not collide with existing columns or with each other.
    if (_columnCollection.IndexOf(variableName) >= 0)
    {
        throw new ArgumentException(string.Format(Strings.VariableNameAlreadyExists, variableName), nameof(variableName));
    }

    if (_columnCollection.IndexOf(valueName) >= 0)
    {
        throw new ArgumentException(string.Format(Strings.ValueNameAlreadyExists, valueName), nameof(valueName));
    }

    if (string.Equals(variableName, valueName))
    {
        throw new ArgumentException(string.Format(Strings.VariableNameAndValueNameMustBeDifferent, nameof(variableName), nameof(valueName)), nameof(valueName));
    }

    // Every requested column must exist in this DataFrame.
    foreach (var columnName in idColumnList)
    {
        if (_columnCollection.IndexOf(columnName) < 0)
        {
            throw new ArgumentException(string.Format(Strings.InvalidColumnName, columnName), nameof(idColumns));
        }
    }

    foreach (var columnName in valueColumnList)
    {
        if (_columnCollection.IndexOf(columnName) < 0)
        {
            throw new ArgumentException(string.Format(Strings.InvalidColumnName, columnName), nameof(valueColumns));
        }
    }

    // Output columns are sized up front, then filled row by row.
    long totalOutputRows = CalculateTotalOutputRows(valueColumnList, dropNulls);

    var outputCols = InitializeIdColumns(idColumnList, totalOutputRows);
    var variableColumn = new StringDataFrameColumn(variableName, totalOutputRows);
    var valueColumn = CreateValueColumn(valueColumnList, valueName, totalOutputRows);

    FillMeltedData(idColumnList, valueColumnList, outputCols, variableColumn, valueColumn, dropNulls);

    // ID columns come first, followed by the variable and value columns.
    outputCols.Add(variableColumn);
    outputCols.Add(valueColumn);

    return new DataFrame(outputCols);
}
835+
836+
/// <summary>
/// Computes the number of rows the melted output will contain.
/// </summary>
/// <param name="valueColumnList">Names of the columns being unpivoted.</param>
/// <param name="dropNulls">
/// When false, every source row is emitted once per value column. When true, only
/// non-null cells of the value columns are counted.
/// </param>
/// <returns>The total number of output rows.</returns>
private long CalculateTotalOutputRows(List<string> valueColumnList, bool dropNulls)
{
    if (dropNulls)
    {
        long nonNullCells = 0;

        foreach (string name in valueColumnList)
        {
            // Count the cells of this value column that are not null.
            nonNullCells += _columnCollection[name]
                .Cast<object>()
                .LongCount(cell => cell != null);
        }

        return nonNullCells;
    }

    return _rowCollection.Count * valueColumnList.Count;
}
860+
861+
/// <summary>
/// Creates one output column per ID column, each cloned from its source column and
/// resized to <paramref name="size"/>.
/// </summary>
/// <param name="idColumnList">Names of the ID columns, in output order.</param>
/// <param name="size">Length to which each new column is resized.</param>
/// <returns>The new columns, in the same order as <paramref name="idColumnList"/>.</returns>
private List<DataFrameColumn> InitializeIdColumns(List<string> idColumnList, long size)
{
    // NOTE(review): cloning with an empty index map presumably yields an empty column
    // of the same type as the source - confirm against DataFrameColumn.Clone.
    var noIndices = new PrimitiveDataFrameColumn<long>("Empty");
    var result = new List<DataFrameColumn>(idColumnList.Count);

    for (int i = 0; i < idColumnList.Count; i++)
    {
        DataFrameColumn target = _columnCollection[idColumnList[i]].Clone(noIndices);
        target.Resize(size);
        result.Add(target);
    }

    return result;
}
876+
877+
/// <summary>
/// Creates the output value column. If the value columns have more than one distinct
/// <c>DataType</c>, a string column is used; otherwise the column is cloned from the
/// first value column, renamed, and resized.
/// </summary>
/// <param name="valueColumnList">Names of the columns being unpivoted; must not be empty.</param>
/// <param name="valueName">Name of the output value column.</param>
/// <param name="size">Length of the output value column.</param>
/// <returns>The newly created value column.</returns>
private DataFrameColumn CreateValueColumn(List<string> valueColumnList, string valueName, long size)
{
    int distinctTypeCount = valueColumnList
        .Select(name => _columnCollection[name].DataType)
        .Distinct()
        .Count();

    // Mixed data types share no common column type, so values are stored as strings.
    if (distinctTypeCount > 1)
    {
        return new StringDataFrameColumn(valueName, size);
    }

    // NOTE(review): cloning with an empty index map presumably yields an empty column
    // of the same type as the first value column - confirm against DataFrameColumn.Clone.
    var noIndices = new PrimitiveDataFrameColumn<long>("Empty");
    DataFrameColumn result = _columnCollection[valueColumnList[0]].Clone(noIndices);
    result.SetName(valueName);
    result.Resize(size);

    return result;
}
900+
901+
/// <summary>
/// Copies data into the pre-sized output columns. Value columns are processed one at a
/// time, so the output holds all rows for the first value column, then all rows for the
/// second, and so on.
/// </summary>
/// <param name="idColumnList">Names of the ID columns.</param>
/// <param name="valueColumnList">Names of the columns being unpivoted.</param>
/// <param name="outputIdCols">Output ID columns, positionally matching <paramref name="idColumnList"/>.</param>
/// <param name="variableColumn">Output column receiving the source value column name.</param>
/// <param name="valueColumn">Output column receiving the cell value.</param>
/// <param name="dropNulls">When true, source cells that are null produce no output row.</param>
private void FillMeltedData(List<string> idColumnList, List<string> valueColumnList, List<DataFrameColumn> outputIdCols, StringDataFrameColumn variableColumn, DataFrameColumn valueColumn, bool dropNulls)
{
    // A string output column means each value is written via ToString().
    bool stringify = valueColumn is StringDataFrameColumn;
    long sourceRowCount = _rowCollection.Count;

    // Resolve the source ID columns once rather than once per cell.
    DataFrameColumn[] sourceIdColumns = idColumnList
        .Select(name => _columnCollection[name])
        .ToArray();

    long writeIndex = 0;

    foreach (string name in valueColumnList)
    {
        DataFrameColumn source = _columnCollection[name];

        for (long readIndex = 0; readIndex < sourceRowCount; readIndex++)
        {
            object cell = source[readIndex];
            bool keep = !dropNulls || cell != null;

            if (keep)
            {
                for (int k = 0; k < sourceIdColumns.Length; k++)
                {
                    outputIdCols[k][writeIndex] = sourceIdColumns[k][readIndex];
                }

                variableColumn[writeIndex] = name;
                valueColumn[writeIndex] = stringify ? cell?.ToString() : cell;
                writeIndex++;
            }
        }
    }
}
938+
670939
/// <summary>
671940
/// Invalidates any cached data after a column has changed.
672941
/// </summary>

src/Microsoft.Data.Analysis/Strings.Designer.cs

Lines changed: 73 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)