11#include "Python.h"
22#include "errcode.h"
3+ #include "internal/pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
34#include "../Parser/lexer/state.h"
45#include "../Parser/lexer/lexer.h"
56#include "../Parser/tokenizer/tokenizer.h"
6- #include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
7+ #include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
78
89static struct PyModuleDef _tokenizemodule ;
910
@@ -84,14 +85,16 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
8485}
8586
8687static int
87- _tokenizer_error (struct tok_state * tok )
88+ _tokenizer_error (tokenizeriterobject * it )
8889{
90+ _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED (it );
8991 if (PyErr_Occurred ()) {
9092 return -1 ;
9193 }
9294
9395 const char * msg = NULL ;
9496 PyObject * errtype = PyExc_SyntaxError ;
97+ struct tok_state * tok = it -> tok ;
9598 switch (tok -> done ) {
9699 case E_TOKEN :
97100 msg = "invalid token" ;
@@ -177,17 +180,78 @@ _tokenizer_error(struct tok_state *tok)
177180 return result ;
178181}
179182
183+ static PyObject *
184+ _get_current_line (tokenizeriterobject * it , const char * line_start , Py_ssize_t size ,
185+ int * line_changed )
186+ {
187+ _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED (it );
188+ PyObject * line ;
189+ if (it -> tok -> lineno != it -> last_lineno ) {
190+ // Line has changed since last token, so we fetch the new line and cache it
191+ // in the iter object.
192+ Py_XDECREF (it -> last_line );
193+ line = PyUnicode_DecodeUTF8 (line_start , size , "replace" );
194+ it -> last_line = line ;
195+ it -> byte_col_offset_diff = 0 ;
196+ }
197+ else {
198+ line = it -> last_line ;
199+ * line_changed = 0 ;
200+ }
201+ return line ;
202+ }
203+
204+ static void
205+ _get_col_offsets (tokenizeriterobject * it , struct token token , const char * line_start ,
206+ PyObject * line , int line_changed , Py_ssize_t lineno , Py_ssize_t end_lineno ,
207+ Py_ssize_t * col_offset , Py_ssize_t * end_col_offset )
208+ {
209+ _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED (it );
210+ Py_ssize_t byte_offset = -1 ;
211+ if (token .start != NULL && token .start >= line_start ) {
212+ byte_offset = token .start - line_start ;
213+ if (line_changed ) {
214+ * col_offset = _PyPegen_byte_offset_to_character_offset_line (line , 0 , byte_offset );
215+ it -> byte_col_offset_diff = byte_offset - * col_offset ;
216+ }
217+ else {
218+ * col_offset = byte_offset - it -> byte_col_offset_diff ;
219+ }
220+ }
221+
222+ if (token .end != NULL && token .end >= it -> tok -> line_start ) {
223+ Py_ssize_t end_byte_offset = token .end - it -> tok -> line_start ;
224+ if (lineno == end_lineno ) {
225+ // If the whole token is at the same line, we can just use the token.start
226+ // buffer for figuring out the new column offset, since using line is not
227+ // performant for very long lines.
228+ Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line (line , byte_offset , end_byte_offset );
229+ * end_col_offset = * col_offset + token_col_offset ;
230+ it -> byte_col_offset_diff += token .end - token .start - token_col_offset ;
231+ }
232+ else {
233+ * end_col_offset = _PyPegen_byte_offset_to_character_offset_raw (it -> tok -> line_start , end_byte_offset );
234+ it -> byte_col_offset_diff += end_byte_offset - * end_col_offset ;
235+ }
236+ }
237+ it -> last_lineno = lineno ;
238+ it -> last_end_lineno = end_lineno ;
239+ }
240+
180241static PyObject *
181242tokenizeriter_next (tokenizeriterobject * it )
182243{
183244 PyObject * result = NULL ;
245+
246+ Py_BEGIN_CRITICAL_SECTION (it );
247+
184248 struct token token ;
185249 _PyToken_Init (& token );
186250
187251 int type = _PyTokenizer_Get (it -> tok , & token );
188252 if (type == ERRORTOKEN ) {
189253 if (!PyErr_Occurred ()) {
190- _tokenizer_error (it -> tok );
254+ _tokenizer_error (it );
191255 assert (PyErr_Occurred ());
192256 }
193257 goto exit ;
@@ -224,18 +288,7 @@ tokenizeriter_next(tokenizeriterobject *it)
224288 size -= 1 ;
225289 }
226290
227- if (it -> tok -> lineno != it -> last_lineno ) {
228- // Line has changed since last token, so we fetch the new line and cache it
229- // in the iter object.
230- Py_XDECREF (it -> last_line );
231- line = PyUnicode_DecodeUTF8 (line_start , size , "replace" );
232- it -> last_line = line ;
233- it -> byte_col_offset_diff = 0 ;
234- } else {
235- // Line hasn't changed so we reuse the cached one.
236- line = it -> last_line ;
237- line_changed = 0 ;
238- }
291+ line = _get_current_line (it , line_start , size , & line_changed );
239292 }
240293 if (line == NULL ) {
241294 Py_DECREF (str );
@@ -244,36 +297,10 @@ tokenizeriter_next(tokenizeriterobject *it)
244297
245298 Py_ssize_t lineno = ISSTRINGLIT (type ) ? it -> tok -> first_lineno : it -> tok -> lineno ;
246299 Py_ssize_t end_lineno = it -> tok -> lineno ;
247- it -> last_lineno = lineno ;
248- it -> last_end_lineno = end_lineno ;
249-
250300 Py_ssize_t col_offset = -1 ;
251301 Py_ssize_t end_col_offset = -1 ;
252- Py_ssize_t byte_offset = -1 ;
253- if (token .start != NULL && token .start >= line_start ) {
254- byte_offset = token .start - line_start ;
255- if (line_changed ) {
256- col_offset = _PyPegen_byte_offset_to_character_offset_line (line , 0 , byte_offset );
257- it -> byte_col_offset_diff = byte_offset - col_offset ;
258- }
259- else {
260- col_offset = byte_offset - it -> byte_col_offset_diff ;
261- }
262- }
263- if (token .end != NULL && token .end >= it -> tok -> line_start ) {
264- Py_ssize_t end_byte_offset = token .end - it -> tok -> line_start ;
265- if (lineno == end_lineno ) {
266- // If the whole token is at the same line, we can just use the token.start
267- // buffer for figuring out the new column offset, since using line is not
268- // performant for very long lines.
269- Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line (line , byte_offset , end_byte_offset );
270- end_col_offset = col_offset + token_col_offset ;
271- it -> byte_col_offset_diff += token .end - token .start - token_col_offset ;
272- } else {
273- end_col_offset = _PyPegen_byte_offset_to_character_offset_raw (it -> tok -> line_start , end_byte_offset );
274- it -> byte_col_offset_diff += end_byte_offset - end_col_offset ;
275- }
276- }
302+ _get_col_offsets (it , token , line_start , line , line_changed ,
303+ lineno , end_lineno , & col_offset , & end_col_offset );
277304
278305 if (it -> tok -> tok_extra_tokens ) {
279306 if (is_trailing_token ) {
@@ -315,6 +342,8 @@ tokenizeriter_next(tokenizeriterobject *it)
315342 if (type == ENDMARKER ) {
316343 it -> done = 1 ;
317344 }
345+
346+ Py_END_CRITICAL_SECTION ();
318347 return result ;
319348}
320349
0 commit comments