Point Cloud Library (PCL)
1.15.1
Toggle main menu visibility
Loading...
Searching...
No Matches
utils
include
pcl
gpu
utils
device
block.hpp
1
/*
2
* Software License Agreement (BSD License)
3
*
4
* Copyright (c) 2011, Willow Garage, Inc.
5
* All rights reserved.
6
*
7
* Redistribution and use in source and binary forms, with or without
8
* modification, are permitted provided that the following conditions
9
* are met:
10
*
11
* * Redistributions of source code must retain the above copyright
12
* notice, this list of conditions and the following disclaimer.
13
* * Redistributions in binary form must reproduce the above
14
* copyright notice, this list of conditions and the following
15
* disclaimer in the documentation and/or other materials provided
16
* with the distribution.
17
* * Neither the name of Willow Garage, Inc. nor the names of its
18
* contributors may be used to endorse or promote products derived
19
* from this software without specific prior written permission.
20
*
21
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32
* POSSIBILITY OF SUCH DAMAGE.
33
*
34
* Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com)
35
*/
36
37
#ifndef PCL_DEVICE_UTILS_BLOCK_HPP_
38
#define PCL_DEVICE_UTILS_BLOCK_HPP_
39
40
namespace
pcl
41
{
42
namespace
device
43
{
44
struct
Block
45
{
46
static
__device__ __forceinline__
unsigned
int
id
()
47
{
48
return
blockIdx.x;
49
}
50
51
static
__device__ __forceinline__
unsigned
int
stride
()
52
{
53
return
blockDim.x * blockDim.y * blockDim.z;
54
}
55
56
static
__device__ __forceinline__
void
sync
()
57
{
58
__syncthreads();
59
}
60
61
static
__device__ __forceinline__
int
flattenedThreadId
()
62
{
63
return
threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
64
}
65
66
template
<
typename
It,
typename
T>
67
static
__device__ __forceinline__
void
fill
(It beg, It end,
const
T& value)
68
{
69
int
STRIDE =
stride
();
70
It t = beg +
flattenedThreadId
();
71
72
for
(; t < end; t += STRIDE)
73
*t = value;
74
}
75
76
template
<
typename
OutIt,
typename
T>
77
static
__device__ __forceinline__
void
yota
(OutIt beg, OutIt end, T value)
78
{
79
int
STRIDE =
stride
();
80
int
tid =
flattenedThreadId
();
81
value += tid;
82
83
for
(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
84
*t = value;
85
}
86
87
template
<
typename
InIt,
typename
OutIt>
88
static
__device__ __forceinline__
void
copy
(InIt beg, InIt end, OutIt out)
89
{
90
int
STRIDE =
stride
();
91
InIt t = beg +
flattenedThreadId
();
92
OutIt o = out + (t - beg);
93
94
for
(; t < end; t += STRIDE, o += STRIDE)
95
*o = *t;
96
}
97
98
template
<
typename
InIt,
typename
OutIt,
class
UnOp>
99
static
__device__ __forceinline__
void
transform
(InIt beg, InIt end, OutIt out, UnOp op)
100
{
101
int
STRIDE =
stride
();
102
InIt t = beg +
flattenedThreadId
();
103
OutIt o = out + (t - beg);
104
105
for
(; t < end; t += STRIDE, o += STRIDE)
106
*o = op(*t);
107
}
108
109
template
<
typename
InIt1,
typename
InIt2,
typename
OutIt,
class
BinOp>
110
static
__device__ __forceinline__
void
transform
(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
111
{
112
int
STRIDE =
stride
();
113
InIt1 t1 = beg1 +
flattenedThreadId
();
114
InIt2 t2 = beg2 +
flattenedThreadId
();
115
OutIt o = out + (t1 - beg1);
116
117
for
(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
118
*o = op(*t1, *t2);
119
}
120
121
template
<
int
CTA_SIZE,
typename
T,
class
BinOp>
122
static
__device__ __forceinline__
void
reduce
(
volatile
T* buffer, BinOp op)
123
{
124
int
tid =
flattenedThreadId
();
125
T val = buffer[tid];
126
127
if
(CTA_SIZE >= 1024) {
if
(tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
128
if
(CTA_SIZE >= 512) {
if
(tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
129
if
(CTA_SIZE >= 256) {
if
(tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
130
if
(CTA_SIZE >= 128) {
if
(tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
131
132
if
(tid < 32)
133
{
134
if
(CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
135
if
(CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
136
if
(CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
137
if
(CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
138
if
(CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
139
if
(CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
140
}
141
}
142
143
template
<
int
CTA_SIZE,
typename
T,
class
BinOp>
144
static
__device__ __forceinline__ T
reduce
(
volatile
T* buffer, T init, BinOp op)
145
{
146
int
tid =
flattenedThreadId
();
147
T val = buffer[tid] = init;
148
__syncthreads();
149
150
if
(CTA_SIZE >= 1024) {
if
(tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
151
if
(CTA_SIZE >= 512) {
if
(tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
152
if
(CTA_SIZE >= 256) {
if
(tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
153
if
(CTA_SIZE >= 128) {
if
(tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
154
155
if
(tid < 32)
156
{
157
if
(CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
158
if
(CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
159
if
(CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
160
if
(CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
161
if
(CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
162
if
(CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
163
}
164
__syncthreads();
165
return
buffer[0];
166
}
167
168
template
<
typename
T,
class
BinOp>
169
static
__device__ __forceinline__
void
reduce_n
(T* data,
unsigned
int
n, BinOp op)
170
{
171
int
ftid =
flattenedThreadId
();
172
int
sft =
stride
();
173
174
if
(sft < n)
175
{
176
for
(
unsigned
int
i = sft + ftid; i < n; i += sft)
177
data[ftid] = op(data[ftid], data[i]);
178
179
__syncthreads();
180
181
n = sft;
182
}
183
184
while
(n > 1)
185
{
186
unsigned
int
half = n/2;
187
188
if
(ftid < half)
189
data[ftid] = op(data[ftid], data[n - ftid - 1]);
190
191
__syncthreads();
192
193
n = n - half;
194
}
195
}
196
};
197
}
198
}
199
200
#endif
/* PCL_DEVICE_UTILS_BLOCK_HPP_ */
201
pcl::device
Definition
device_array.h:315
pcl
Definition
convolution.h:46
pcl::device::Block
Definition
utils.hpp:463
pcl::device::Block::yota
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
Definition
block.hpp:77
pcl::device::Block::fill
static __device__ __forceinline__ void fill(It beg, It end, const T &value)
Definition
block.hpp:67
pcl::device::Block::reduce
static __device__ __forceinline__ void reduce(volatile T *buffer, BinOp op)
Definition
block.hpp:122
pcl::device::Block::reduce
static __device__ __forceinline__ T reduce(volatile T *buffer, T init, BinOp op)
Definition
block.hpp:144
pcl::device::Block::sync
static __device__ __forceinline__ void sync()
Definition
block.hpp:56
pcl::device::Block::reduce_n
static __device__ __forceinline__ void reduce_n(T *data, unsigned int n, BinOp op)
Definition
block.hpp:169
pcl::device::Block::stride
static __device__ __forceinline__ unsigned int stride()
Definition
block.hpp:51
pcl::device::Block::transform
static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
Definition
block.hpp:110
pcl::device::Block::copy
static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
Definition
block.hpp:88
pcl::device::Block::id
static __device__ __forceinline__ unsigned int id()
Definition
block.hpp:46
pcl::device::Block::transform
static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
Definition
block.hpp:99
pcl::device::Block::flattenedThreadId
static __device__ __forceinline__ int flattenedThreadId()
Definition
block.hpp:61