Skip to content

Commit 4c5fa60

Browse files
committed
Reuse existing global memory buffers and enable the optimised SetKey path
1 parent d55b929 commit 4c5fa60

4 files changed

Lines changed: 8 additions & 18 deletions

File tree

Device/CSR/BagCsrManager.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ uint *BagCsrManager::getMutableCsrLen(){
5454

5555
uint *BagCsrManager::getMutableCsrKey(){
5656
PROCESS_ERROR(curNumCsr > 0);
57-
printf("csrKey reservedSize %d; curNumCsr %d\n", csrKey.reservedSize, curNumCsr);
57+
//printf("csrKey reservedSize %d; curNumCsr %d\n", csrKey.reservedSize, curNumCsr);
5858
if(csrKey.reservedSize < curNumCsr){
5959
csrKey.reserveSpace(curNumCsr * 2, sizeof(uint));
6060
printf("reserving memory\n");

Device/FindSplit/FindFeaCsr.cu

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -165,18 +165,15 @@ void AfterCompression(GBDTGPUMemManager &manager, BagCsrManager &csrManager, Bag
165165
if(optimiseSetKey == false)
166166
SetKey<<<numSeg, blockSize, sizeof(uint) * 2, (*(cudaStream_t*)pStream)>>>
167167
(csrManager.pEachCsrFeaStartPos, csrManager.pEachCsrFeaLen, csrManager.getMutableCsrKey());
168-
// (csrManager.pEachCsrFeaStartPos, csrManager.pEachCsrFeaLen, pCSRMultableKey);
169168
else{
170169
if(numSeg < 1000000)
171170
SetKey<<<numSeg, blockSize, sizeof(uint) * 2, (*(cudaStream_t*)pStream)>>>
172171
(csrManager.pEachCsrFeaStartPos, csrManager.pEachCsrFeaLen, csrManager.getMutableCsrKey());
173-
// (csrManager.pEachCsrFeaStartPos, csrManager.pEachCsrFeaLen, pCSRMultableKey);
174172
else{
175173
int numSegEachBlk = numSeg/10000;
176174
int numofBlkSetKey = (numSeg + numSegEachBlk - 1) / numSegEachBlk;
177175
SetKey<<<numofBlkSetKey, blockSize, 0, (*(cudaStream_t*)pStream)>>>(csrManager.pEachCsrFeaStartPos, csrManager.pEachCsrFeaLen,
178176
numSegEachBlk, numSeg, csrManager.getMutableCsrKey());
179-
// numSegEachBlk, numSeg, pCSRMultableKey);
180177
}
181178
}
182179
cudaStreamSynchronize((*(cudaStream_t*)pStream));
@@ -304,15 +301,13 @@ void AllNode2CompGD(GBDTGPUMemManager &manager, BagCsrManager &csrManager, BagMa
304301
cudaDeviceSynchronize();
305302
thrust::exclusive_scan(thrust::device, csrManager.getCsrLen(), csrManager.getCsrLen() + csrManager.curNumCsr, csrManager.getMutableCsrStart());
306303
cudaDeviceSynchronize();
307-
uint *pCsrNewLen_d;// = (uint*)(indexComp.histogram_d.addr);
308-
//uint *pCsrNewLen_d = (uint*)(indexComp.histogram_d.addr);
309-
checkCudaErrors(cudaMallocHost((void**)&pCsrNewLen_d, sizeof(uint) * csrManager.curNumCsr * 2));
304+
uint *pCsrNewLen_d = (uint*)(indexComp.histogram_d.addr);
310305
checkCudaErrors(cudaMemset(pCsrNewLen_d, 0, sizeof(uint) * csrManager.curNumCsr * 2));
311306
checkCudaErrors(cudaMemset(csrManager.pEachCsrFeaLen, 0, sizeof(uint) * bagManager.m_numFea * numofSNode));
312307
dim3 dimNumofBlockToCsrLen;
313308
uint blockSizeCsrLen = 128;
314309

315-
cudaDeviceSynchronize();
310+
cudaDeviceSynchronize();
316311
dimNumofBlockToCsrLen.x = (numofDenseValue_previous + blockSizeCsrLen - 1) / blockSizeCsrLen;
317312
newCsrLenFvalue<<<dimNumofBlockToCsrLen, blockSizeCsrLen, blockSizeCsrLen * sizeof(uint)>>>(
318313
csrManager.preFvalueInsId, numofDenseValue_previous,
@@ -321,7 +316,6 @@ cudaDeviceSynchronize();
321316
csrManager.getCsrFvalue(), csrManager.curNumCsr,
322317
csrManager.pEachCsrFeaStartPos, bagManager.m_pPreNumSN_h[0],
323318
bagManager.m_numFea, csrManager.getCsrKey(), pCsrNewLen_d, pCsrId2Pid);
324-
// bagManager.m_numFea, pCSRKey, pCsrNewLen_d, pCsrId2Pid);
325319
cudaDeviceSynchronize();
326320

327321
GETERROR("after newCsrLenFvalue");
@@ -356,9 +350,7 @@ cudaDeviceSynchronize();
356350
int blockSizeLoadCsrLen;
357351
dim3 dimNumofBlockToLoadCsrLen;
358352
conf.ConfKernel(csrManager.curNumCsr * 2, blockSizeLoadCsrLen, dimNumofBlockToLoadCsrLen);
359-
//uint *pCsrMarker = (uint*)indexComp.partitionMarker.addr;
360-
uint *pCsrMarker;
361-
checkCudaErrors(cudaMalloc((void**)&pCsrMarker, sizeof(uint) * csrManager.curNumCsr * 2));
353+
uint *pCsrMarker = (uint*)indexComp.partitionMarker.addr;
362354
checkCudaErrors(cudaMemset(pCsrMarker, 0, sizeof(uint) * csrManager.curNumCsr * 2));
363355
map2One<<<dimNumofBlockToLoadCsrLen, blockSizeLoadCsrLen>>>(pCsrNewLen_d, csrManager.curNumCsr * 2, pCsrMarker);
364356
GETERROR("after map2One");
@@ -369,14 +361,14 @@ checkCudaErrors(cudaMalloc((void**)&pCsrMarker, sizeof(uint) * csrManager.curNum
369361
checkCudaErrors(cudaMemcpy(&csrManager.curNumCsr, pCsrMarker + csrManager.curNumCsr * 2 - 1, sizeof(uint), cudaMemcpyDefault));
370362

371363
checkCudaErrors(cudaMemset(csrManager.getMutableCsrLen(), 0, sizeof(uint) * csrManager.curNumCsr));
372-
cudaDeviceSynchronize();
364+
cudaDeviceSynchronize();
373365
loadDenseCsr<<<dimNumofBlockToLoadCsrLen, blockSizeLoadCsrLen>>>(pCsrFvalueSpare, pCsrNewLen_d,
374366
previousNumCsr * 2, csrManager.curNumCsr, pCsrMarker,
375367
csrManager.getMutableCsrFvalue(), csrManager.getMutableCsrLen());
376368
GETERROR("after loadDenseCsr");
377369
printf("done load dense csr: number of csr is %d\n", csrManager.curNumCsr);
378370
thrust::exclusive_scan(thrust::device, csrManager.pEachCsrFeaLen, csrManager.pEachCsrFeaLen + numofSNode * bagManager.m_numFea, csrManager.pEachCsrFeaStartPos);
379-
cudaDeviceSynchronize();
371+
cudaDeviceSynchronize();
380372

381373

382374
thrust::exclusive_scan(thrust::device, csrManager.pEachNodeSizeInCsr, csrManager.pEachNodeSizeInCsr + numofSNode, csrManager.pEachCsrNodeStartPos);
@@ -385,8 +377,6 @@ cudaDeviceSynchronize();
385377
thrust::exclusive_scan(thrust::device, csrManager.getCsrLen(), csrManager.getCsrLen() + csrManager.curNumCsr, pCsrStartCurRound);
386378
PROCESS_ERROR(csrManager.curNumCsr <= bagManager.m_numFeaValue);
387379
cudaDeviceSynchronize();
388-
checkCudaErrors(cudaFree(pCsrMarker));
389-
printf("exit if\n");
390380
}
391381
else
392382
{

Device/FindSplit/FindFeaOrg.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#include "../../SharedUtility/segmentedMax.h"
2323
#include "../../SharedUtility/setSegmentKey.h"
2424

25-
bool optimiseSetKey = false;
25+
bool optimiseSetKey = true;
2626
/**
2727
* @brief: efficient best feature finder
2828
*/

SharedUtility/CudaMacro.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#include <assert.h>
55
#include <stdio.h>
66

7-
//#define _DEBUG
7+
#define _DEBUG
88

99
#define BLOCK_SIZE 64
1010

0 commit comments

Comments
 (0)