Skip to content

Commit 62cf117

Browse files
committed
Merge branch 'master' into fread-file-spaces
2 parents f2beb63 + 3e579ee commit 62cf117

8 files changed

Lines changed: 73 additions & 46 deletions

File tree

.Rbuildignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
^\.devcontainer$
1818
^\.graphics$
1919
^\.github$
20+
^\.jj$
2021
^\.vscode$
2122
^\.zed$
2223
^\.lintr$

.github/workflows/test-coverage.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,11 @@ jobs:
5151
covr::to_cobertura(cov)
5252
shell: Rscript {0}
5353

54-
- uses: codecov/codecov-action@v4
54+
- uses: codecov/codecov-action@v5
5555
with:
56-
fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }}
57-
file: ./cobertura.xml
58-
plugin: noop
56+
fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
57+
files: ./cobertura.xml
58+
plugins: noop
5959
disable_search: true
6060
token: ${{ secrets.CODECOV_TOKEN }}
6161

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Version: 1.18.99
33
Title: Extension of `data.frame`
44
Depends: R (>= 3.5.0)
55
Imports: methods
6-
Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), R.utils, xts, zoo (>= 1.8-1), yaml, litedown
6+
Suggests: bit64 (>= 4.0.0), R.utils, xts, zoo (>= 1.8-1), yaml, litedown
77
Enhances: knitr, xfun
88
Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development.
99
License: MPL-2.0 | file LICENSE

NEWS.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@
2828

2929
1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix.
3030

31-
2. `fread("file://...")` works for file URIs with spaces, [#7550](https://github.com/Rdatatable/data.table/issues/7550). Thanks @aitap for the report and @MichaelChirico for the PR.
31+
2. `set()` now automatically pre-allocates new column slots if needed, similar to what `:=` already does, [#1831](https://github.com/Rdatatable/data.table/issues/1831), [#4100](https://github.com/Rdatatable/data.table/issues/4100). Thanks @zachokeeffe and @tyner for the reports and @ben-schwen for the fix.
32+
33+
3. `fread("file://...")` works for file URIs with spaces, [#7550](https://github.com/Rdatatable/data.table/issues/7550). Thanks @aitap for the report and @MichaelChirico for the PR.
3234

3335
## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025
3436

R/data.table.R

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2854,10 +2854,20 @@ setcolorder = function(x, neworder=key(x), before=NULL, after=NULL, skip_absent=
28542854
invisible(x)
28552855
}
28562856

2857+
# Decide whether set() has to call setalloccol() on `x` before assigning `value`.
# Returns TRUE when `x` has no spare column slots (truelength(x) <= length(x)), or when
# selfrefok(x) < 1 and `value` is NULL or a list containing a NULL element; FALSE otherwise.
.set_needs_alloccol = function(x, value) {
2858+
# no spare column slots (tl <= ncol): the table is either full or was loaded from disk, so more space must be allocated
2859+
if (truelength(x) <= length(x)) return(TRUE)
2860+
if (selfrefok(x, verbose=FALSE) >= 1L) return(FALSE)
2861+
# from here on selfrefok(x) < 1: allocation is only needed when value is NULL or a list with NULLs inside
2862+
if (is.null(value)) return(TRUE)
2863+
if (!is.list(value)) return(FALSE)
2864+
any(vapply_1b(value, is.null))
2865+
}
2866+
28572867
set = function(x,i=NULL,j,value) # low overhead, loopable
28582868
{
28592869
# Need to call setalloccol first when x has no spare column slots (#4100), or when removing columns from a table that's not selfrefok (#7488)
2860-
if ((is.null(value) || (is.list(value) && any(vapply_1b(value, is.null)))) && selfrefok(x, verbose=FALSE) < 1L) {
2870+
if (.set_needs_alloccol(x, value)) {
28612871
name = substitute(x)
28622872
setalloccol(x, verbose=FALSE)
28632873
if (is.name(name)) {

inst/tests/froll.Rraw

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
1111

1212
exact_NaN = isTRUE(capabilities()["long.double"]) && identical(as.integer(.Machine$longdouble.digits), 64L)
1313
if (!exact_NaN) {
14-
cat("\n**** Skipping 7 NaN/NA algo='exact' tests because .Machine$longdouble.digits==", .Machine$longdouble.digits, " (!=64); e.g. under valgrind\n\n", sep="")
14+
cat("\n**** Skipping 8 NaN/NA algo='exact' tests because .Machine$longdouble.digits==", .Machine$longdouble.digits, " (!=64); e.g. under valgrind\n\n", sep="")
1515
# for Matt when he runs valgrind it is 53, but 64 when running regular R
1616
# froll.c uses long double and appears to require full long double accuracy in the algo='exact'
1717
}
@@ -1448,9 +1448,12 @@ test(6001.727, frollvar(adaptive=TRUE, c(1:2,NA), c(2,0,2), algo="exact"), c(NA_
14481448
test(6001.728, frollvar(adaptive=TRUE, c(1:2,NA), c(2,0,2), algo="exact", na.rm=TRUE), c(NA_real_,NA_real_,NA_real_))
14491449
test(6001.729, frollvar(adaptive=TRUE, c(1:2,NA), c(2,0,2), algo="exact", na.rm=TRUE, partial=TRUE), c(NA_real_,NA_real_,NA_real_))
14501450
test(6001.730, frollvar(adaptive=TRUE, c(1:2,NA), c(2,0,2), fill=99, algo="exact", na.rm=TRUE), c(99,NA,NA))
1451-
y = c(1e8+2.980232e-8, 1e8, 1e8, 1e8) # CLAMP0 test
1452-
test(6001.731, frollvar(y, 3)[4L], 0)
1453-
test(6001.732, frollsd(y, 3)[4L], 0)
1451+
# numerical stability: we need to guarantee frollvar(x, n) >= 0 for all x, n
1452+
# the exact epsilon here is a bit implementation-dependent (as in #7546), but what's
1453+
# crucial is the output is never negative (or NaN after sqrt() for frollsd).
1454+
y = c(1e8+2.980232e-8, 1e8, 1e8, 1e8)
1455+
test(6001.731, between(frollvar(y, 3)[4L], 0, 1e-7))
1456+
test(6001.732, between(frollsd(y, 3)[4L], 0, 1e-7))
14541457
test(6001.733, frollvar(y, c(3,3,3,3), adaptive=TRUE)[4L], 0)
14551458
test(6001.734, frollsd(y, c(3,3,3,3), adaptive=TRUE)[4L], 0)
14561459
test(6001.740, frollvar(c(1.5,2.5,2,NA), c(3,3)), list(c(NA,NA,0.25,NA), c(NA,NA,0.25,NA)), output="running sequentially, because outer parallelism has been used", options=c(datatable.verbose=TRUE)) # ensure no nested parallelism in rolling functions #7352
@@ -2084,7 +2087,8 @@ if (use.fork) {
20842087
test(6010.772, .selfref.ok(ans[[2L]]))
20852088
ans = frollapply(1:2, 2, function(x) list(data.table(x)), fill=list(data.table(NA)), simplify=FALSE)
20862089
test(6010.773, !.selfref.ok(ans[[2L]][[1L]]))
2087-
test(6010.7731, set(ans[[2L]][[1L]],, "newcol", 1L), error="data.table has either been loaded from disk")
2090+
# deactivated by #5443
2091+
# test(6010.7731, set(ans[[2L]][[1L]],, "newcol", 1L), error="data.table has either been loaded from disk")
20882092
ans = lapply(ans, lapply, setDT)
20892093
test(6010.774, .selfref.ok(ans[[2L]][[1L]])) ## fix after
20902094
ans = frollapply(1:2, 2, function(x) list(data.table(x)), fill=list(data.table(NA)), simplify=function(x) lapply(x, lapply, setDT))

inst/tests/tests.Rraw

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14797,7 +14797,7 @@ test(2016.1, name, "DT")
1479714797
test(2016.2, DT, data.table(a=1:3))
1479814798
test(2016.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column
1479914799
test(2016.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column
14800-
test(2016.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first") # just set()
14800+
test(2016.5, set(DT,2L,"newCol",5L), data.table(a=INT(1,4,5), newCol=INT(NA,5L,NA))) # works since set overallocates #4100
1480114801
test(2016.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller)
1480214802
unlink(tt)
1480314803

@@ -19478,7 +19478,7 @@ test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in
1947819478
df = data.frame(a=1:3)
1947919479
setDT(df)
1948019480
attr(df, "att") = 1
19481-
test(2291.1, set(df, NULL, "new", "new"), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first")
19481+
test(2291.1, set(df, NULL, "new", "new"), setattr(data.table(a=1:3, new="new"), "att", 1)) # fixed when calling setalloccol before set #4100
1948219482

1948319483
# ns-qualified bysub error, #6493
1948419484
DT = data.table(a = 1)
@@ -21960,12 +21960,21 @@ test(2355.2, fread(txt, skip=0, header=TRUE), data.table(V1 = c("b1", "c1"), a1
2196021960
test(2355.3, fread(txt, skip=0, header=FALSE), data.table(V1=character(), V2=character(), V3=character()), warning="Consider fill=TRUE")
2196121961
test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), V2 = c("a2", "b2", "c2"), V3 = c("", "b3", "c3")))
2196221962

21963+
# re-overallocate in set if quota is reached #496 #1831 #4100
21964+
DT = data.table()
21965+
test(2356.1, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i); ncol(DT)}, 10L)
21966+
DT = structure(list(a = 1, b = 2), class = c("data.table", "data.frame"))
21967+
test(2356.2, options=c(datatable.alloccol=1L), set(DT, j="c", value=3), data.table(a=1, b=2, c=3))
21968+
# ensure := and set are consistent if they need to overallocate
21969+
DT = data.table(); DT2 = data.table()
21970+
test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = sprintf("V%d",i), value = i); DT}, {for (i in seq(10)) DT2[, sprintf("V%d",i) := i]; DT2})
21971+
2196321972
# fread works on file:// URIs with spaces, #7550
2196421973
local({
2196521974
f = tempfile("with spaces"); on.exit(unlink(f))
2196621975
DT = data.table(a = 1L, b = 2L)
2196721976
fwrite(DT, f)
2196821977

21969-
test(2356.1, fread(f), DT)
21970-
test(2356.2, fread(paste0("file://", f)), DT)
21978+
test(2357.1, fread(f), DT)
21979+
test(2357.2, fread(paste0("file://", f)), DT)
2197121980
})

src/cj.c

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,22 @@
66
- The memory copying operations (blockwise replication of data using memcpy)
77
- The creation of all combinations of the input vectors over the cross-product space
88
*/
9-
SEXP cj(SEXP base_list) {
9+
SEXP cj(SEXP base_list)
10+
{
1011
int ncol = LENGTH(base_list);
1112
SEXP out = PROTECT(allocVector(VECSXP, ncol));
1213
int nrow = 1;
1314
// already confirmed to be less than .Machine$integer.max at R level
14-
for (int j=0; j<ncol; ++j) nrow *= length(VECTOR_ELT(base_list, j));
15+
for (int j = 0; j < ncol; j++) nrow *= length(VECTOR_ELT(base_list, j));
1516
int eachrep = 1;
16-
for (int j=ncol-1; j>=0; --j) {
17+
for (int j = ncol - 1; j >= 0; j--) {
1718
SEXP source = VECTOR_ELT(base_list, j), target;
18-
SET_VECTOR_ELT(out, j, target=allocVector(TYPEOF(source), nrow));
19+
SET_VECTOR_ELT(out, j, target = allocVector(TYPEOF(source), nrow));
1920
copyMostAttrib(source, target); // includes levels of factors, integer64, custom classes, etc
20-
if (nrow==0) continue; // one or more columns are empty so the result will be empty, #2511
21+
if (nrow == 0) continue; // one or more columns are empty so the result will be empty, #2511
2122
int thislen = LENGTH(source);
22-
int blocklen = thislen*eachrep;
23-
int ncopy = nrow/blocklen;
23+
int blocklen = thislen * eachrep;
24+
int ncopy = nrow / blocklen;
2425
switch(TYPEOF(source)) {
2526
case LGLSXP:
2627
case INTSXP: {
@@ -29,64 +30,64 @@ SEXP cj(SEXP base_list) {
2930
#pragma omp parallel for num_threads(getDTthreads(thislen*eachrep, true))
3031
// default static schedule so two threads won't write to same cache line in last column
3132
// if they did write to same cache line (and will when last column's thislen is small) there's no correctness issue
32-
for (int i=0; i<thislen; ++i) {
33+
for (int i = 0; i < thislen; i++) {
3334
const int item = sourceP[i];
34-
const int end = (i+1)*eachrep;
35-
for (int j=i*eachrep; j<end; ++j) targetP[j] = item; // no div, mod or read ops inside loop; just rep a const contiguous write
35+
const int end = (i + 1) * eachrep;
36+
for (int j = i * eachrep; j < end; j++) targetP[j] = item; // no div, mod or read ops inside loop; just rep a const contiguous write
3637
}
3738
#pragma omp parallel for num_threads(getDTthreads(ncopy*blocklen, true))
38-
for (int i=1; i<ncopy; ++i) {
39-
memcpy(targetP + i*blocklen, targetP, blocklen*sizeof(*targetP));
39+
for (int i = 1; i < ncopy; i++) {
40+
memcpy(targetP + i * blocklen, targetP, blocklen * sizeof(*targetP));
4041
}
4142
} break;
4243
case REALSXP: {
4344
const double *restrict sourceP = REAL(source);
4445
double *restrict targetP = REAL(target);
4546
#pragma omp parallel for num_threads(getDTthreads(thislen*eachrep, true))
46-
for (int i=0; i<thislen; ++i) {
47+
for (int i = 0; i < thislen; i++) {
4748
const double item = sourceP[i];
48-
const int end=(i+1)*eachrep;
49-
for (int j=i*eachrep; j<end; ++j) targetP[j] = item;
49+
const int end = (i + 1) * eachrep;
50+
for (int j = i * eachrep; j < end; j++) targetP[j] = item;
5051
}
5152
#pragma omp parallel for num_threads(getDTthreads(ncopy*blocklen, true))
52-
for (int i=1; i<ncopy; ++i) {
53-
memcpy(targetP + i*blocklen, targetP, blocklen*sizeof(double));
53+
for (int i = 1; i < ncopy; i++) {
54+
memcpy(targetP + i * blocklen, targetP, blocklen * sizeof(double));
5455
}
5556
} break;
5657
case CPLXSXP: {
5758
const Rcomplex *restrict sourceP = COMPLEX(source);
5859
Rcomplex *restrict targetP = COMPLEX(target);
5960
#pragma omp parallel for num_threads(getDTthreads(thislen*eachrep, true))
60-
for (int i=0; i<thislen; ++i) {
61+
for (int i = 0; i < thislen; i++) {
6162
const Rcomplex item = sourceP[i];
62-
const int end=(i+1)*eachrep;
63-
for (int j=i*eachrep; j<end; ++j) targetP[j] = item;
63+
const int end = (i + 1) * eachrep;
64+
for (int j = i * eachrep; j < end; j++) targetP[j] = item;
6465
}
6566
#pragma omp parallel for num_threads(getDTthreads(ncopy*blocklen, true))
66-
for (int i=1; i<ncopy; ++i) {
67-
memcpy(targetP + i*blocklen, targetP, blocklen*sizeof(Rcomplex));
67+
for (int i = 1; i < ncopy; i++) {
68+
memcpy(targetP + i * blocklen, targetP, blocklen * sizeof(Rcomplex));
6869
}
6970
} break;
7071
case STRSXP: {
7172
const SEXP *sourceP = STRING_PTR_RO(source);
7273
int start = 0;
73-
for (int i=0; i<ncopy; ++i) {
74-
for (int j=0; j<thislen; ++j) {
74+
for (int i = 0; i < ncopy; i++) {
75+
for (int j = 0; j < thislen; j++) {
7576
const SEXP item = sourceP[j];
76-
const int end = start+eachrep;
77-
for (int k=start; k<end; ++k) SET_STRING_ELT(target, k, item); // no div, mod, or read-API call to STRING_ELT
77+
const int end = start + eachrep;
78+
for (int k = start; k < end; k++) SET_STRING_ELT(target, k, item); // no div, mod, or read-API call to STRING_ELT
7879
start = end;
7980
}
8081
}
8182
} break;
8283
case VECSXP: {
8384
const SEXP *sourceP = SEXPPTR_RO(source);
8485
int start = 0;
85-
for (int i=0; i<ncopy; ++i) {
86-
for (int j=0; j<thislen; ++j) {
86+
for (int i = 0; i < ncopy; i++) {
87+
for (int j = 0; j < thislen; j++) {
8788
const SEXP item = sourceP[j];
88-
const int end = start+eachrep;
89-
for (int k=start; k<end; ++k) SET_VECTOR_ELT(target, k, item);
89+
const int end = start + eachrep;
90+
for (int k = start; k < end; k++) SET_VECTOR_ELT(target, k, item);
9091
start = end;
9192
}
9293
}

0 commit comments

Comments
 (0)