1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """
5
+ Deep validator module for WRITEME to check for issues in the codebase.
6
+ This version performs a more thorough check for duplicate snippet tags by
7
+ directly scanning the files in the repository.
8
+ """
9
+
10
+ import logging
11
+ import os
12
+ import re
13
+ import concurrent .futures
14
+ from collections import defaultdict
15
+ from pathlib import Path
16
+ from typing import Dict , List , Set , Tuple , Optional , Any
17
+
18
+ from aws_doc_sdk_examples_tools .doc_gen import DocGen
19
+
20
+ logger = logging .getLogger (__name__ )
21
+
22
+
23
class ValidationError(Exception):
    """Signals that deep snippet validation found a problem.

    Carries no extra state; the message passed to the constructor is the
    only payload.
    """
26
+
27
+
28
def find_snippet_tags_in_file(file_path: Path) -> List[Tuple[str, int]]:
    """
    Find all snippet tags in a file by directly parsing the file content.

    Every line is tested against each known snippet-marker format. Because
    the formats overlap, raw matches are normalized before being reported:
    surrounding brackets/whitespace are removed, the marker keywords
    ``start`` and ``end`` are never treated as tags, and a tag is reported
    at most once per line even when several patterns catch it.

    Args:
        file_path: Path to the file to check

    Returns:
        List of tuples containing (tag, line_number), with 1-based line
        numbers in file order. Empty when the file does not exist or
        cannot be read (a warning is logged in the latter case).
    """
    if not file_path.exists():
        return []

    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            lines = f.readlines()
    except OSError as e:
        # Decoding cannot fail (errors='replace'), so only OS-level
        # failures are expected here.
        logger.warning(f"Error reading file {file_path} : {e} ")
        return []

    # Common snippet tag patterns (each has exactly one capture group)
    patterns = [
        # Standard snippet tag format
        r'snippet-start:\s*\[([^\]]+)\]',
        r'snippet-end:\s*\[([^\]]+)\]',
        # Alternative formats
        r'SNIPPET\s+START\s+\[([^\]]+)\]',
        r'SNIPPET\s+END\s+\[([^\]]+)\]',
        r'//\s*SNIPPET:\s*([^\s]+)',
        r'#\s*SNIPPET:\s*([^\s]+)',
        r'<!--\s*SNIPPET:\s*([^\s]+)\s*-->',
        # Look for any other potential tag formats
        r'snippet[:\-_]([a-zA-Z0-9_\-]+)',
        # Common AWS SDK snippet formats
        r'//\s*snippet-start:\s*([^\s]+)',
        r'#\s*snippet-start:\s*([^\s]+)',
        r'<!--\s*snippet-start:\s*([^\s]+)\s*-->',
        r'//\s*snippet-end:\s*([^\s]+)',
        r'#\s*snippet-end:\s*([^\s]+)',
        r'<!--\s*snippet-end:\s*([^\s]+)\s*-->',
    ]
    # Compile once per call instead of looking each pattern up per line.
    compiled = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]

    # The generic ``snippet[:\-_](...)`` pattern also matches the marker
    # itself ("snippet-start" -> "start"); those are keywords, not tags.
    marker_keywords = {"start", "end"}

    results = []
    for line_number, line in enumerate(lines, 1):
        seen_on_line = set()
        for regex in compiled:
            for raw in regex.findall(line):
                # The whitespace-delimited patterns capture "[tag]" with
                # its brackets; strip them so both spellings agree.
                tag = raw.strip().strip("[]").strip()
                if not tag or tag.lower() in marker_keywords:
                    continue
                if tag in seen_on_line:
                    continue
                seen_on_line.add(tag)
                results.append((tag, line_number))

    return results
78
+
79
+
80
def scan_directory_for_snippet_tags(
    root_dir: Path,
    extensions: Optional[List[str]] = None,
    max_workers: int = 10
) -> Dict[str, List[Tuple[str, int, str]]]:
    """
    Scan a directory recursively for files containing snippet tags.
    Uses parallel processing for faster scanning.

    Files are processed in sorted path order and results are merged in
    that same order, so the returned location lists are deterministic.

    Args:
        root_dir: Root directory to scan
        extensions: Optional list of file extensions to check
        max_workers: Maximum number of parallel workers

    Returns:
        Dictionary mapping snippet tags to lists of
        (file_path, line_number, context), where file_path is relative to
        root_dir and context is the tag line together with its neighbouring
        lines (or "<context unavailable>" if the file could not be re-read).
    """
    if extensions is None:
        # Default extensions to check
        extensions = [
            '.py', '.java', '.js', '.ts', '.cs', '.cpp', '.c', '.go', '.rb',
            '.php', '.swift', '.kt', '.rs', '.abap', '.md', '.html', '.xml'
        ]
    # str.endswith accepts a tuple, giving one C-level check per file name.
    suffixes = tuple(extensions)

    # Find all files with the specified extensions
    files_to_scan = sorted(
        Path(root) / name
        for root, _, names in os.walk(root_dir)
        for name in names
        if name.endswith(suffixes)
    )

    def process_file(file_path):
        """Return {tag: [(relative_path, line_number, context), ...]} for one file."""
        try:
            relative_path = str(file_path.relative_to(root_dir))
            tags = find_snippet_tags_in_file(file_path)
            if not tags:
                return {}

            # Read the file once for context, not once per tag.
            try:
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    lines = f.readlines()
            except OSError:
                lines = None

            found = defaultdict(list)
            for tag, line_number in tags:
                if lines is None:
                    context = "<context unavailable>"
                else:
                    # line_number is 1-based: take the previous line, the
                    # tag line itself and the following line.
                    start_line = max(0, line_number - 2)
                    end_line = min(len(lines), line_number + 1)
                    context = ''.join(lines[start_line:end_line]).strip()
                # Append rather than assign so a tag occurring several
                # times in one file keeps every occurrence.
                found[tag].append((relative_path, line_number, context))
            return found
        except Exception as e:
            # Worker boundary: one bad file must not abort the whole scan.
            logger.warning(f"Error processing file {file_path} : {e} ")
            return {}

    # Process files in parallel; Executor.map yields results in submission
    # order, which keeps the merged output stable across runs.
    tag_to_locations = defaultdict(list)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        for file_results in executor.map(process_file, files_to_scan):
            for tag, locations in file_results.items():
                tag_to_locations[tag].extend(locations)

    return tag_to_locations
148
+
149
+
150
def check_duplicate_snippet_tags_deep(doc_gen: DocGen) -> List[Tuple[str, List[Dict[str, Any]]]]:
    """
    Report snippet tags that occur in more than one file.

    The repository rooted at ``doc_gen.root`` is scanned directly; a tag
    counts as a duplicate only when it is seen in at least two distinct
    files (repeats inside a single file do not qualify on their own).

    Args:
        doc_gen: The DocGen instance; only its ``root`` attribute is read

    Returns:
        List of (tag, file_entries) tuples, one per duplicated tag. Each
        file entry is a dict with keys "file" (path as reported by the
        scan) and "occurrences" (list of {"line", "context"} dicts).
    """
    logger.info("Starting deep scan for duplicate snippet tags...")

    tag_locations = scan_directory_for_snippet_tags(doc_gen.root)

    duplicates = []
    for tag, locations in tag_locations.items():
        # Bucket every occurrence of this tag under the file it came from.
        occurrences_by_file = {}
        for file_path, line_number, context in locations:
            occurrences_by_file.setdefault(file_path, []).append(
                {"line": line_number, "context": context}
            )

        # A tag confined to a single file is not a duplicate.
        if len(occurrences_by_file) <= 1:
            continue

        file_entries = [
            {"file": path, "occurrences": hits}
            for path, hits in occurrences_by_file.items()
        ]
        duplicates.append((tag, file_entries))

    logger.info(f"Deep scan complete. Found {len(duplicates)} duplicate tags.")
    return duplicates
189
+
190
+
191
def format_duplicate_report(duplicates: List[Tuple[str, List[Dict[str, Any]]]]) -> str:
    """
    Format a detailed report of duplicate snippet tags.

    Each occurrence is rendered on a single line: any whitespace in its
    context (including newlines) is collapsed to single spaces, and the
    context is cut to 60 characters with an ellipsis appended only when
    something was actually cut off.

    Args:
        duplicates: List of (tag, file_entries) tuples, where each file
            entry is a dict with "file" and "occurrences" keys and each
            occurrence is a dict with optional "line" and "context" keys

    Returns:
        Formatted report as a string
    """
    if not duplicates:
        return "No duplicate snippet tags found."

    max_context_chars = 60
    report = [f"Found {len(duplicates)} duplicate snippet tags:"]

    for tag, locations in duplicates:
        report.append(f"\n Tag: '{tag} ' found in {len(locations)} files:")

        for location in locations:
            file_path = location["file"]
            occurrences = location["occurrences"]

            report.append(f" File: {file_path} ")
            for occurrence in occurrences:
                line = occurrence.get("line", "unknown")
                # split()/join collapses every whitespace run, so a
                # multi-line context cannot break the report layout.
                flattened = " ".join(occurrence.get("context", "").split())
                context = ""
                if flattened:
                    shown = flattened[:max_context_chars]
                    if len(flattened) > max_context_chars:
                        # Mark truncation only when text was dropped.
                        shown = f"{shown} ..."
                    context = f" - Context: {shown}"
                report.append(f" Line {line} {context} ")

    return "\n ".join(report)
222
+
223
+
224
def validate_snippets_deep(doc_gen: DocGen, strict: bool = False) -> bool:
    """
    Run the deep duplicate-tag check and print its outcome.

    Args:
        doc_gen: The DocGen instance handed to the duplicate check
        strict: If True, raise instead of returning False on failure

    Returns:
        True when no duplicate tags were found, False otherwise

    Raises:
        ValidationError: If duplicates were found and ``strict`` is True
            (raised after the report has been printed)
    """
    duplicates = check_duplicate_snippet_tags_deep(doc_gen)

    # Happy path first: nothing duplicated, nothing more to do.
    if not duplicates:
        print("No duplicate snippet tags found in deep scan.")
        return True

    # Build the report before printing anything so the banner is never
    # emitted without its body.
    report = format_duplicate_report(duplicates)
    print("\n === DUPLICATE SNIPPET TAGS (DEEP SCAN) ===")
    print(report)

    if strict:
        raise ValidationError("Validation failed: duplicate snippet tags found")

    return False
0 commit comments