Commit e8dc9dc

Adds a new writeme with improved validation and error messaging.
1 parent 9aaa63d

11 files changed: +1713 -2 lines

.tools/readmes/cache.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Cache implementation for WRITEME to speed up repeated runs.
"""

import logging
import os
import pickle
from pathlib import Path
from typing import Any, Optional

logger = logging.getLogger(__name__)

# Cache directory relative to the readmes directory
CACHE_DIR = Path(__file__).parent / ".cache"


def get_cache_enabled() -> bool:
    """Check if caching is enabled via environment variable."""
    return os.environ.get("USE_METADATA_CACHE", "0") == "1"


def ensure_cache_dir() -> None:
    """Ensure the cache directory exists."""
    if not CACHE_DIR.exists():
        CACHE_DIR.mkdir(exist_ok=True)
        logger.debug(f"Created cache directory: {CACHE_DIR}")


def get_cache_path(key: str) -> Path:
    """Get the cache file path for a given key."""
    # Create a filename-safe version of the key
    safe_key = key.replace("/", "_").replace(":", "_")
    return CACHE_DIR / f"{safe_key}.pickle"


def save_to_cache(key: str, data: Any) -> bool:
    """
    Save data to cache.

    Args:
        key: Cache key
        data: Data to cache (must be pickle-able)

    Returns:
        bool: True if successfully cached, False otherwise
    """
    if not get_cache_enabled():
        return False

    try:
        ensure_cache_dir()
        cache_path = get_cache_path(key)

        with open(cache_path, "wb") as f:
            pickle.dump(data, f)

        logger.debug(f"Cached data for key: {key}")
        return True
    except Exception as e:
        logger.warning(f"Failed to cache data for key {key}: {e}")
        return False


def load_from_cache(key: str) -> Optional[Any]:
    """
    Load data from cache.

    Args:
        key: Cache key

    Returns:
        The cached data or None if not found or caching disabled
    """
    if not get_cache_enabled():
        return None

    cache_path = get_cache_path(key)

    if not cache_path.exists():
        return None

    try:
        with open(cache_path, "rb") as f:
            data = pickle.load(f)

        logger.debug(f"Loaded data from cache for key: {key}")
        return data
    except Exception as e:
        logger.warning(f"Failed to load cache for key {key}: {e}")
        return None


def clear_cache() -> None:
    """Clear all cached data."""
    if CACHE_DIR.exists():
        for cache_file in CACHE_DIR.glob("*.pickle"):
            try:
                cache_file.unlink()
            except Exception as e:
                logger.warning(f"Failed to delete cache file {cache_file}: {e}")

        logger.info("Cache cleared")
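
For reference, a minimal usage sketch of the module above. The import name "cache" and the key/payload are illustrative assumptions, not part of this commit; the only real requirement is that .tools/readmes is on the Python path and USE_METADATA_CACHE is set to "1".

    import os

    os.environ["USE_METADATA_CACHE"] = "1"  # caching is a no-op unless this is "1"

    import cache  # hypothetical import name for .tools/readmes/cache.py

    # Round-trip an arbitrary pickle-able payload under a key. "/" and ":" in
    # the key are replaced with "_" to build the .cache/<key>.pickle file name.
    cache.save_to_cache("python/example_metadata", {"services": ["s3"]})
    data = cache.load_from_cache("python/example_metadata")
    print(data)  # {'services': ['s3']}

    cache.clear_cache()  # removes every *.pickle file under .cache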

.tools/readmes/deep_validator.py

Lines changed: 251 additions & 0 deletions
@@ -0,0 +1,251 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Deep validator module for WRITEME to check for issues in the codebase.
This version performs a more thorough check for duplicate snippet tags by
directly scanning the files in the repository.
"""

import concurrent.futures
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from aws_doc_sdk_examples_tools.doc_gen import DocGen

logger = logging.getLogger(__name__)


class ValidationError(Exception):
    """Exception raised for validation errors."""
    pass


def find_snippet_tags_in_file(file_path: Path) -> List[Tuple[str, int]]:
    """
    Find all snippet tags in a file by directly parsing the file content.

    Args:
        file_path: Path to the file to check

    Returns:
        List of tuples containing (tag, line_number)
    """
    if not file_path.exists():
        return []

    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            lines = f.readlines()
    except Exception as e:
        logger.warning(f"Error reading file {file_path}: {e}")
        return []

    # Common snippet tag patterns
    patterns = [
        # Standard snippet tag format
        r'snippet-start:\s*\[([^\]]+)\]',
        r'snippet-end:\s*\[([^\]]+)\]',
        # Alternative formats
        r'SNIPPET\s+START\s+\[([^\]]+)\]',
        r'SNIPPET\s+END\s+\[([^\]]+)\]',
        r'//\s*SNIPPET:\s*([^\s]+)',
        r'#\s*SNIPPET:\s*([^\s]+)',
        r'<!--\s*SNIPPET:\s*([^\s]+)\s*-->',
        # Look for any other potential tag formats
        r'snippet[:\-_]([a-zA-Z0-9_\-]+)',
        # Common AWS SDK snippet formats
        r'//\s*snippet-start:\s*([^\s]+)',
        r'#\s*snippet-start:\s*([^\s]+)',
        r'<!--\s*snippet-start:\s*([^\s]+)\s*-->',
        r'//\s*snippet-end:\s*([^\s]+)',
        r'#\s*snippet-end:\s*([^\s]+)',
        r'<!--\s*snippet-end:\s*([^\s]+)\s*-->',
    ]

    results = []
    for i, line in enumerate(lines, 1):
        for pattern in patterns:
            matches = re.findall(pattern, line, re.IGNORECASE)
            for match in matches:
                results.append((match, i))

    return results


def scan_directory_for_snippet_tags(
    root_dir: Path,
    extensions: Optional[List[str]] = None,
    max_workers: int = 10,
) -> Dict[str, List[Tuple[str, int, str]]]:
    """
    Scan a directory recursively for files containing snippet tags.
    Uses parallel processing for faster scanning.

    Args:
        root_dir: Root directory to scan
        extensions: Optional list of file extensions to check
        max_workers: Maximum number of parallel workers

    Returns:
        Dictionary mapping snippet tags to lists of (file_path, line_number, context)
    """
    if extensions is None:
        # Default extensions to check
        extensions = [
            '.py', '.java', '.js', '.ts', '.cs', '.cpp', '.c', '.go', '.rb',
            '.php', '.swift', '.kt', '.rs', '.abap', '.md', '.html', '.xml',
        ]

    # Find all files with the specified extensions
    files_to_scan = []
    for root, _, files in os.walk(root_dir):
        for file in files:
            if any(file.endswith(ext) for ext in extensions):
                files_to_scan.append(Path(root) / file)

    # Process files in parallel
    tag_to_locations = defaultdict(list)

    def process_file(file_path):
        try:
            relative_path = file_path.relative_to(root_dir)
            tags = find_snippet_tags_in_file(file_path)

            # Read the file once so each tag can carry a little context.
            try:
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    lines = f.readlines()
            except Exception:
                lines = []

            file_results = defaultdict(list)
            for tag, line_number in tags:
                if lines:
                    start_line = max(0, line_number - 2)
                    end_line = min(len(lines), line_number + 1)
                    context = ''.join(lines[start_line:end_line]).strip()
                else:
                    context = "<context unavailable>"
                file_results[tag].append((str(relative_path), line_number, context))

            return file_results
        except Exception as e:
            logger.warning(f"Error processing file {file_path}: {e}")
            return {}

    # Use ThreadPoolExecutor for parallel processing
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(process_file, file): file for file in files_to_scan}

        for future in concurrent.futures.as_completed(future_to_file):
            file_results = future.result()
            for tag, locations in file_results.items():
                tag_to_locations[tag].extend(locations)

    return tag_to_locations


def check_duplicate_snippet_tags_deep(doc_gen: DocGen) -> List[Tuple[str, List[Dict[str, Any]]]]:
    """
    Deep check for duplicate snippet tags in the codebase.
    This function scans all files directly to find snippet tags.

    Args:
        doc_gen: The DocGen instance containing snippets

    Returns:
        List of tuples containing (tag, [location_details]) for duplicate tags
    """
    logger.info("Starting deep scan for duplicate snippet tags...")

    # Scan the repository directly for snippet tags
    root_dir = doc_gen.root
    tag_locations = scan_directory_for_snippet_tags(root_dir)

    # Find tags that appear in multiple files
    duplicates = []
    for tag, locations in tag_locations.items():
        # Group locations by file path
        files = {}
        for file_path, line_number, context in locations:
            if file_path not in files:
                files[file_path] = []
            files[file_path].append({"line": line_number, "context": context})

        # If the tag appears in multiple files, it's a duplicate
        if len(files) > 1:
            duplicate_info = []
            for file_path, occurrences in files.items():
                duplicate_info.append({
                    "file": file_path,
                    "occurrences": occurrences,
                })
            duplicates.append((tag, duplicate_info))

    logger.info(f"Deep scan complete. Found {len(duplicates)} duplicate tags.")
    return duplicates


def format_duplicate_report(duplicates: List[Tuple[str, List[Dict[str, Any]]]]) -> str:
    """
    Format a detailed report of duplicate snippet tags.

    Args:
        duplicates: List of duplicate tag information

    Returns:
        Formatted report as a string
    """
    if not duplicates:
        return "No duplicate snippet tags found."

    report = [f"Found {len(duplicates)} duplicate snippet tags:"]

    for tag, locations in duplicates:
        report.append(f"\nTag: '{tag}' found in {len(locations)} files:")

        for location in locations:
            file_path = location["file"]
            occurrences = location["occurrences"]

            report.append(f"  File: {file_path}")
            for occurrence in occurrences:
                line = occurrence.get("line", "unknown")
                context = occurrence.get("context", "").replace("\n", " ").strip()
                if context:
                    context = f" - Context: {context[:60]}..."
                report.append(f"    Line {line}{context}")

    return "\n".join(report)


def validate_snippets_deep(doc_gen: DocGen, strict: bool = False) -> bool:
    """
    Deep validation of snippets in the codebase.

    Args:
        doc_gen: The DocGen instance containing snippets
        strict: If True, raise an exception for validation errors

    Returns:
        True if validation passed, False otherwise
    """
    validation_passed = True

    # Check for duplicate snippet tags using the deep method
    duplicates = check_duplicate_snippet_tags_deep(doc_gen)
    if duplicates:
        validation_passed = False
        report = format_duplicate_report(duplicates)
        print("\n=== DUPLICATE SNIPPET TAGS (DEEP SCAN) ===")
        print(report)

        # Exit with error if strict validation is enabled
        if strict:
            raise ValidationError("Validation failed: duplicate snippet tags found")
    else:
        print("No duplicate snippet tags found in deep scan.")

    return validation_passed
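
A minimal sketch of driving the deep scan directly against a checkout, without constructing a DocGen instance. The import name "deep_validator" is an assumption for illustration; everything else uses only the functions defined in the diff above, and the inline grouping mirrors check_duplicate_snippet_tags_deep.

    from pathlib import Path

    import deep_validator  # hypothetical import name for .tools/readmes/deep_validator.py

    # Scan the current checkout for snippet tags across the default extensions.
    tag_locations = deep_validator.scan_directory_for_snippet_tags(Path("."), max_workers=4)

    # Flag tags that appear in more than one file, matching the grouping used
    # by check_duplicate_snippet_tags_deep.
    duplicates = []
    for tag, locations in tag_locations.items():
        files = {}
        for file_path, line_number, context in locations:
            files.setdefault(file_path, []).append({"line": line_number, "context": context})
        if len(files) > 1:
            duplicates.append((tag, [{"file": f, "occurrences": o} for f, o in files.items()]))

    print(deep_validator.format_duplicate_report(duplicates))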
