import torch
from PIL import Image
from pytorch3d.datasets.shapenet_base import ShapeNetBase
-from pytorch3d.datasets.utils import compute_extrinsic_matrix
from pytorch3d.io import load_obj
from pytorch3d.renderer import HardPhongShader
-from pytorch3d.renderer.cameras import CamerasBase
-from pytorch3d.transforms import Transform3d
from tabulate import tabulate

+from .utils import (
+    BlenderCamera,
+    align_bbox,
+    compute_extrinsic_matrix,
+    read_binvox_coords,
+    voxelize,
+)


-SYNSET_DICT_DIR = Path(__file__).resolve().parent

-# Default values of rotation, translation and intrinsic matrices for BlenderCamera.
-r = np.expand_dims(np.eye(3), axis=0)  # (1, 3, 3)
-t = np.expand_dims(np.zeros(3), axis=0)  # (1, 3)
-k = np.expand_dims(np.eye(4), axis=0)  # (1, 4, 4)
+SYNSET_DICT_DIR = Path(__file__).resolve().parent
+MAX_CAMERA_DISTANCE = 1.75  # Constant from R2N2.
+VOXEL_SIZE = 128
+# Intrinsic matrix extracted from Blender. Taken from meshrcnn codebase:
+# https://github.com/facebookresearch/meshrcnn/blob/master/shapenet/utils/coords.py
+BLENDER_INTRINSIC = torch.tensor(
+    [
+        [2.1875, 0.0, 0.0, 0.0],
+        [0.0, 2.1875, 0.0, 0.0],
+        [0.0, 0.0, -1.002002, -0.2002002],
+        [0.0, 0.0, -1.0, 0.0],
+    ]
+)

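For orientation: the new BLENDER_INTRINSIC constant is the raw Blender intrinsic matrix, used further down only to build per-view projection matrices for voxelization (the image renderings use a slightly adjusted K defined inside __getitem__). A minimal sketch of that composition, with an identity extrinsic standing in for a real camera pose from compute_extrinsic_matrix:

import torch

BLENDER_INTRINSIC = torch.tensor(
    [
        [2.1875, 0.0, 0.0, 0.0],
        [0.0, 2.1875, 0.0, 0.0],
        [0.0, 0.0, -1.002002, -0.2002002],
        [0.0, 0.0, -1.0, 0.0],
    ]
)

# Placeholder extrinsic; in the dataset this comes from compute_extrinsic_matrix.
RT = torch.eye(4)

# Per-view projection matrix, the same product this diff forms before calling voxelize.
P = BLENDER_INTRINSIC.mm(RT)  # (4, 4)
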
class R2N2(ShapeNetBase):
@@ -42,6 +54,7 @@ def __init__(
        r2n2_dir,
        splits_file,
        return_all_views: bool = True,
+        return_voxels: bool = False,
    ):
        """
        Store each object's synset id and model id from the given directories.
@@ -54,6 +67,8 @@ def __init__(
            return_all_views (bool): Indicator of whether or not to load all the views in
                the split. If set to False, one of the views in the split will be randomly
                selected and loaded.
+            return_voxels(bool): Indicator of whether or not to return voxels as a tensor
+                of shape (D, D, D) where D is the number of voxels along each dimension.
        """
        super().__init__()
        self.shapenet_dir = shapenet_dir
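As a usage sketch of the new flag (the paths below are hypothetical placeholders, and the leading split argument assumes the constructor signature R2N2(split, shapenet_dir, r2n2_dir, splits_file, ...)):

from pytorch3d.datasets import R2N2

# Hypothetical local dataset locations; substitute your own.
SHAPENET_DIR = "/data/ShapeNetCore.v1"
R2N2_DIR = "/data/r2n2"
SPLITS_FILE = "/data/r2n2/splits.json"

# With return_voxels=True (and ShapeNetVox32 present under R2N2_DIR), each
# returned model dict additionally carries a "voxels" tensor: one (D, D, D)
# grid per returned view, with D = VOXEL_SIZE = 128.
dataset = R2N2("train", SHAPENET_DIR, R2N2_DIR, SPLITS_FILE, return_voxels=True)
model = dataset[0]
print(model["voxels"].shape)
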
@@ -83,6 +98,16 @@ def __init__(
            ) % (r2n2_dir)
            warnings.warn(msg)

+        self.return_voxels = return_voxels
+        # Check if the folder containing voxel coordinates is included in r2n2_dir.
+        if not path.isdir(path.join(r2n2_dir, "ShapeNetVox32")):
+            self.return_voxels = False
+            msg = (
+                "ShapeNetVox32 not found in %s. Voxel coordinates will "
+                "be skipped when returning models."
+            ) % (r2n2_dir)
+            warnings.warn(msg)
+
        synset_set = set()
        # Store lists of views of each model in a list.
        self.views_per_model_list = []
@@ -173,6 +198,8 @@ def __getitem__(self, model_idx, view_idxs: Optional[List[int]] = None) -> Dict:
            - R: Rotation matrix of shape (V, 3, 3), where V is number of views returned.
            - T: Translation matrix of shape (V, 3), where V is number of views returned.
            - K: Intrinsic matrix of shape (V, 4, 4), where V is number of views returned.
+            - voxels: Voxels of shape (D, D, D), where D is the number of voxels along each
+                dimension.
        """
        if isinstance(model_idx, tuple):
            model_idx, view_idxs = model_idx
@@ -208,6 +235,7 @@ def __getitem__(self, model_idx, view_idxs: Optional[List[int]] = None) -> Dict:
        model["label"] = self.synset_dict[model["synset_id"]]

        model["images"] = None
+        images, Rs, Ts, voxel_RTs = [], [], [], []
        # Retrieve R2N2's renderings if required.
        if self.return_images:
            rendering_path = path.join(
@@ -217,12 +245,9 @@ def __getitem__(self, model_idx, view_idxs: Optional[List[int]] = None) -> Dict:
                model["model_id"],
                "rendering",
            )
-
            # Read metadata file to obtain params for calibration matrices.
            with open(path.join(rendering_path, "rendering_metadata.txt"), "r") as f:
                metadata_lines = f.readlines()
-
-            images, Rs, Ts = [], [], []
            for i in model_views:
                # Read image.
                image_path = path.join(rendering_path, "%02d.png" % i)
@@ -234,9 +259,13 @@ def __getitem__(self, model_idx, view_idxs: Optional[List[int]] = None) -> Dict:
                azim, elev, yaw, dist_ratio, fov = [
                    float(v) for v in metadata_lines[i].strip().split(" ")
                ]
-                R, T = self._compute_camera_calibration(azim, elev, dist_ratio)
+                dist = dist_ratio * MAX_CAMERA_DISTANCE
+                # Extrinsic matrix before transformation to PyTorch3D world space.
+                RT = compute_extrinsic_matrix(azim, elev, dist)
+                R, T = self._compute_camera_calibration(RT)
                Rs.append(R)
                Ts.append(T)
+                voxel_RTs.append(RT)

            # Intrinsic matrix extracted from Blender with slight modification to work with
            # PyTorch3D world space. Taken from meshrcnn codebase:
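The refactored loop above can be exercised on its own; a sketch assuming the relative import at the top of the file resolves to pytorch3d.datasets.r2n2.utils, with a made-up metadata record:

from pytorch3d.datasets.r2n2.utils import compute_extrinsic_matrix

MAX_CAMERA_DISTANCE = 1.75  # Constant from R2N2.

# Made-up values in the order stored in rendering_metadata.txt; only azimuth,
# elevation and the absolute distance feed the extrinsic matrix (yaw and fov
# are parsed but unused here).
azim, elev, yaw, dist_ratio, fov = 135.0, 30.0, 0.0, 0.7, 25.0

dist = dist_ratio * MAX_CAMERA_DISTANCE
RT = compute_extrinsic_matrix(azim, elev, dist)  # (4, 4) world-to-camera
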
@@ -254,27 +283,48 @@ def __getitem__(self, model_idx, view_idxs: Optional[List[int]] = None) -> Dict:
        model["T"] = torch.stack(Ts)
        model["K"] = K.expand(len(model_views), 4, 4)

+        voxels_list = []
+        # Read voxels if required.
+        voxel_path = path.join(
+            self.r2n2_dir,
+            "ShapeNetVox32",
+            model["synset_id"],
+            model["model_id"],
+            "model.binvox",
+        )
+        if self.return_voxels:
+            if not path.isfile(voxel_path):
+                msg = "Voxel file not found for model %s from category %s."
+                raise FileNotFoundError(msg % (model["model_id"], model["synset_id"]))
+
+            with open(voxel_path, "rb") as f:
+                # Read voxel coordinates as a tensor of shape (N, 3).
+                voxel_coords = read_binvox_coords(f)
+            # Align voxels to the same coordinate system as mesh verts.
+            voxel_coords = align_bbox(voxel_coords, model["verts"])
+            for RT in voxel_RTs:
+                # Compute projection matrix.
+                P = BLENDER_INTRINSIC.mm(RT)
+                # Convert voxel coordinates of shape (N, 3) to voxels of shape (D, D, D).
+                voxels = voxelize(voxel_coords, P, VOXEL_SIZE)
+                voxels_list.append(voxels)
+            model["voxels"] = torch.stack(voxels_list)
+
        return model

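Downstream code can treat model["voxels"] as a plain dense tensor; a minimal sketch of recovering occupied-cell indices from one view's grid (the random tensor is only a stand-in for a real entry):

import torch

VOXEL_SIZE = 128

# Stand-in for model["voxels"][0]: a dense occupancy grid of shape (D, D, D).
voxels = (torch.rand(VOXEL_SIZE, VOXEL_SIZE, VOXEL_SIZE) > 0.99).float()

# Indices of occupied cells, shape (N, 3); roughly the inverse of voxelize,
# which turned the (N, 3) coordinates read from model.binvox into a dense grid.
occupied = voxels.nonzero()
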
-    def _compute_camera_calibration(self, azim: float, elev: float, dist_ratio: float):
+    def _compute_camera_calibration(self, RT):
        """
-        Helper function for calculating rotation and translation matrices from azimuth
-        angle, elevation and distance ratio.
+        Helper function for calculating rotation and translation matrices from ShapeNet
+        to camera transformation and ShapeNet to PyTorch3D transformation.

        Args:
-            azim: Rotation about the z-axis, in degrees.
-            elev: Rotation above the xy-plane, in degrees.
-            dist_ratio: Ratio of distance from the origin to the maximum camera distance.
+            RT: Extrinsic matrix that performs ShapeNet world view to camera view
+                transformation.

        Returns:
-            - R: Rotation matrix of shape (3, 3).
-            - T: Translation matrix of shape (3).
+            R: Rotation matrix of shape (3, 3).
+            T: Translation matrix of shape (3).
        """
-        # Retrive R,T,K of the selected view(s) by reading the metadata.
-        MAX_CAMERA_DISTANCE = 1.75  # Constant from R2N2.
-        dist = dist_ratio * MAX_CAMERA_DISTANCE
-        RT = compute_extrinsic_matrix(azim, elev, dist)
-
        # Transform the mesh vertices from shapenet world to pytorch3d world.
        shapenet_to_pytorch3d = torch.tensor(
            [
@@ -285,9 +335,7 @@ def _compute_camera_calibration(self, azim: float, elev: float, dist_ratio: floa
            ],
            dtype=torch.float32,
        )
-        RT = compute_extrinsic_matrix(azim, elev, dist)  # (4, 4)
        RT = torch.transpose(RT, 0, 1).mm(shapenet_to_pytorch3d)  # (4, 4)
-
        # Extract rotation and translation matrices from RT.
        R = RT[:3, :3]
        T = RT[3, :3]
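For intuition, the same two steps in isolation, with identity placeholders for both the elided shapenet_to_pytorch3d matrix and a real extrinsic (shape bookkeeping only; note that T is read from the last row, not a column, matching the row-vector convention above):

import torch

# Placeholders; the real values come from the surrounding method.
S = torch.eye(4)   # stands in for shapenet_to_pytorch3d
RT = torch.eye(4)  # stands in for a real world-to-camera extrinsic

RT_p3d = torch.transpose(RT, 0, 1).mm(S)  # (4, 4)
R = RT_p3d[:3, :3]  # (3, 3) rotation
T = RT_p3d[3, :3]   # (3,) translation
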
@@ -348,27 +396,3 @@ def render(
        return super().render(
            idxs=idxs, shader_type=shader_type, device=device, cameras=cameras, **kwargs
        )
-
-
-class BlenderCamera(CamerasBase):
-    """
-    Camera for rendering objects with calibration matrices from the R2N2 dataset
-    (which uses Blender for rendering the views for each model).
-    """
-
-    def __init__(self, R=r, T=t, K=k, device="cpu"):
-        """
-        Args:
-            R: Rotation matrix of shape (N, 3, 3).
-            T: Translation matrix of shape (N, 3).
-            K: Intrinsic matrix of shape (N, 4, 4).
-            device: torch.device or str.
-        """
-        # The initializer formats all inputs to torch tensors and broadcasts
-        # all the inputs to have the same batch dimension where necessary.
-        super().__init__(device=device, R=R, T=T, K=K)
-
-    def get_projection_transform(self, **kwargs) -> Transform3d:
-        transform = Transform3d(device=self.device)
-        transform._matrix = self.K.transpose(1, 2).contiguous()  # pyre-ignore[16]
-        return transform
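BlenderCamera is deleted here because it now lives in the dataset's utils module (see the new import block at the top of the file). A usage sketch, assuming pytorch3d.datasets.r2n2.utils as its new location, with dummy single-view calibration matrices in place of model["R"], model["T"], model["K"]:

import torch

from pytorch3d.datasets.r2n2.utils import BlenderCamera

# Dummy single-view calibration; in practice use the R, T, K returned by
# R2N2.__getitem__.
R = torch.eye(3).unsqueeze(0)  # (1, 3, 3)
T = torch.zeros(1, 3)          # (1, 3)
K = torch.eye(4).unsqueeze(0)  # (1, 4, 4)

cameras = BlenderCamera(R=R, T=T, K=K, device="cpu")
# As in the removed class, the projection transform is built directly from K.
proj = cameras.get_projection_transform()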