#!/usr/bin/env python
- from pathlib import Path
import argparse
import os
+ from pathlib import Path

import numpy as np
import pympi
import torch
from pose_format import Pose
from pose_format.utils.generic import pose_normalization_info, pose_hide_legs, normalize_hands_3d
+ from functools import lru_cache

from sign_language_segmentation.src.utils.probs_to_segments import probs_to_segments

+ DEFAULT_MODEL = "model_E1s-1.pth"
+

- def add_optical_flow(pose: Pose)-> None:
+ def add_optical_flow(pose: Pose) -> None:
    from pose_format.numpy.representation.distance import DistanceRepresentation
    from pose_format.utils.optical_flow import OpticalFlowCalculator
@@ -44,6 +47,7 @@ def process_pose(pose: Pose, optical_flow=False, hand_normalization=False) -> Po
    return pose


+ @lru_cache(maxsize=1)
def load_model(model_path: str):
    model = torch.jit.load(model_path)
    model.eval()
@@ -58,7 +62,7 @@ def predict(model, pose: Pose):
    return model(pose_data)


- def save_pose_segments(tiers:dict, tier_id:str, input_file_path:Path)-> None:
+ def save_pose_segments(tiers: dict, tier_id: str, input_file_path: Path) -> None:
    # reload it without any of the processing, so we get all the original points and such.
    with input_file_path.open("rb") as f:
        pose = Pose.read(f.read())
@@ -83,42 +87,64 @@ def get_args():
    )
    parser.add_argument("--video", default=None, required=False, type=str, help="path to video file")
    parser.add_argument("--subtitles", default=None, required=False, type=str, help="path to subtitle file")
-     parser.add_argument("--model", default="model_E1s-1.pth", required=False, type=str, help="path to model file")
+     parser.add_argument("--model", default=DEFAULT_MODEL, required=False, type=str, help="path to model file")
    parser.add_argument("--no-pose-link", action="store_true", help="whether to link the pose file")

    return parser.parse_args()


- def main():
-     args = get_args()
+ def segment_pose(pose: Pose, model: str = DEFAULT_MODEL, verbose=True):
+     if "E4" in model:
+         pose = process_pose(pose, optical_flow=True, hand_normalization=True)
+     else:
+         pose = process_pose(pose)

-     print("Loading pose ...")
-     with open(args.pose, "rb") as f:
-         pose = Pose.read(f.read())
-     if "E4" in args.model:
-         pose = process_pose(pose, optical_flow=True, hand_normalization=True)
-     else:
-         pose = process_pose(pose)
-
-     print("Loading model ...")
+     if verbose:
+         print("Loading model ...")
    install_dir = str(os.path.dirname(os.path.abspath(__file__)))
-     model = load_model(os.path.join(install_dir, "dist", args.model))
+     model = load_model(os.path.join(install_dir, "dist", model))

-     print("Estimating segments ...")
+     if verbose:
+         print("Estimating segments ...")
    probs = predict(model, pose)

    sign_segments = probs_to_segments(probs["sign"], 60, 50)
    sentence_segments = probs_to_segments(probs["sentence"], 90, 90)

-     print("Building ELAN file ...")
+     if verbose:
+         print("Building ELAN file ...")
+     eaf = pympi.Elan.Eaf(author="sign-language-processing/transcription")
+
+     fps = pose.body.fps
+
    tiers = {
        "SIGN": sign_segments,
        "SENTENCE": sentence_segments,
    }

-     fps = pose.body.fps
+     for tier_id, segments in tiers.items():
+         eaf.add_tier(tier_id)
+         for segment in segments:
+             if segment["end"] == segment["start"]:
+                 segment["end"] += 1
+
+             # convert frame numbers to millisecond timestamps, for Elan
+             start_time_ms = int(segment["start"] / fps * 1000)
+             end_time_ms = int(segment["end"] / fps * 1000)
+             eaf.add_annotation(tier_id, start_time_ms, end_time_ms)
+
+     return eaf, tiers
+
+
+ def main():
+     args = get_args()
+
+     print("Loading pose ...")
+     with open(args.pose, "rb") as f:
+         pose = Pose.read(f.read())
+
+     eaf, tiers = segment_pose(pose, model=args.model)

-     eaf = pympi.Elan.Eaf(author="sign-language-processing/transcription")
    if args.video is not None:
        mimetype = None  # pympi is not familiar with mp4 files
        if args.video.endswith(".mp4"):
@@ -128,18 +154,6 @@ def main():
    if not args.no_pose_link:
        eaf.add_linked_file(args.pose, mimetype="application/pose")

-     for tier_id, segments in tiers.items():
-         eaf.add_tier(tier_id)
-         for segment in segments:
-             # convert frame numbers to millisecond timestamps, for Elan
-             start_time_ms = int(segment["start"] / fps * 1000)
-             end_time_ms = int(segment["end"] / fps * 1000)
-             eaf.add_annotation(tier_id, start_time_ms, end_time_ms)
-
-     if args.save_segments:
-         print(f"Saving {args.save_segments} cropped .pose files")
-         save_pose_segments(tiers, tier_id=args.save_segments, input_file_path=args.pose)
-
    if args.subtitles and os.path.exists(args.subtitles):
        import srt
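
A minimal usage sketch of the segment_pose() entry point introduced in this diff, assuming the module is importable as sign_language_segmentation.bin and that the bundled dist/model_E1s-1.pth checkpoint sits next to it; the example.pose and example.eaf file names are placeholders.

# Usage sketch (not part of this diff); module path and file names are assumed.
from pose_format import Pose

from sign_language_segmentation.bin import segment_pose

with open("example.pose", "rb") as f:  # placeholder input pose file
    pose = Pose.read(f.read())

# Preprocesses the pose, runs the segmentation model (loaded once thanks to
# @lru_cache on load_model) and returns the built ELAN object plus the raw
# SIGN/SENTENCE segment dictionaries.
eaf, tiers = segment_pose(pose, verbose=False)

eaf.to_file("example.eaf")  # pympi writes the annotation file to disk

This mirrors what main() now does, minus argument parsing and the video/pose file linking.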