Skip to content

collect MDN API information #130

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pubspec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@ dev_dependencies:
build_runner: ^2.4.0
build_web_compilers: ^4.0.7
code_builder: ^4.9.0
collection: ^1.18.0
dart_flutter_team_lints: ^2.0.0
dart_style: ^2.2.4
html: ^0.15.0
http: ^1.0.0
io: ^1.0.4
path: ^1.8.3
pool: ^1.5.0
test: ^1.22.2
239 changes: 239 additions & 0 deletions tool/scrape_mdn.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
// Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

// ignore_for_file: prefer_expression_function_bodies

import 'dart:convert';
import 'dart:io';

import 'package:html/dom.dart' as dom;
import 'package:html/dom_parsing.dart' show TreeVisitor;
import 'package:html/parser.dart' show parse;
import 'package:http/http.dart' as http;
import 'package:pool/pool.dart';

const apiUrl = 'https://developer.mozilla.org/en-US/docs/Web/API';

Future<void> main(List<String> args) async {
final client = http.Client();

// Get the API page with all the interface references.
final response = await client.get(Uri.parse(apiUrl));
final doc = parse(response.body);

final section = doc.querySelector('section[aria-labelledby=interfaces]')!;
final anchorItems = section.querySelectorAll('li a');

final interfaceNames = <String>[];

for (final item in anchorItems) {
final href = item.attributes['href']!;
final interfaceName = href.split('/').last;

interfaceNames.add(interfaceName);
}

interfaceNames.sort();

print('${interfaceNames.length} items read from $apiUrl.');

final pool = Pool(6);

final interfaces = await pool.forEach(interfaceNames, (item) async {
return populateInterfaceInfo(item, client: client);
}).toList();

client.close();

const encoder = JsonEncoder.withIndent(' ');

final file = File('tool/mdn.json');
final json = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any benefit to writing out the json and then consuming it later versus just having the node information in memory? If this is to just visualize the contents for now, then ignore this comment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Collecting the info from MDN is ~slower (around 20s) and could fail depending on networking conditions. My idea was to separate collecting the info (something you might run ~monthly) from using that info to generate docs + generated code (something you do on every commit).

It's probably not critical to separate these things out, but is what I was going for w/ this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see, I didn't realize it was that slow. Then yes, let's avoid this slowing down iteration of the generation script if it's that significant.

I'm a little concerned on how to keep this updated (both in terms of the script you added and the docs in the output files) if we aren't running the script to run the docs when we generate code regularly. This may not be a big issue after the original generation, but we're likely to have stale documentation. For this CL and the following ones, it's okay to not worry about this for now.

'__meta__': {
'source': '[MDN Web Docs](https://developer.mozilla.org/en-US/docs/Web)',
'license':
'[CC-BY-SA 2.5](https://creativecommons.org/licenses/by-sa/2.5/)',
},
for (var i in interfaces) i.name: i.asJson,
};
file.writeAsStringSync('${encoder.convert(json)}\n');
}

/// Scrapes the MDN reference page for [interfaceName] into an [InterfaceInfo].
///
/// Fetches `$apiUrl/$interfaceName` with [client], converts the children of
/// the first `div[class=section-content]` inside `main article` to markdown
/// for the interface docs, and collects one [Property] for every `dt` whose
/// `id` starts with the lower-cased interface name followed by a dot.
///
/// The returned properties are sorted by name.
///
/// Throws an [HttpException] if the page is not served with status 200.
Future<InterfaceInfo> populateInterfaceInfo(
  String interfaceName, {
  required http.Client client,
}) async {
  print(' $interfaceName');

  final info = InterfaceInfo(name: interfaceName);

  final url = '$apiUrl/$interfaceName';

  // Retrieve the interface docs page.
  final response = await client.get(Uri.parse(url));
  if (response.statusCode != 200) {
    // Fail with a descriptive error instead of a null-check failure while
    // digging through the markup of an error page.
    throw HttpException(
      'Unexpected status ${response.statusCode} fetching interface docs',
      uri: Uri.parse(url),
    );
  }
  final doc = parse(response.body);

  final article = doc.querySelector('main article')!;
  final content = article.querySelector('div[class=section-content]')!;

  info.docs = '''
${_nodesToMarkdown(content.children)}

See also $url.''';

  // Gather property info.
  final idPrefix = '${interfaceName.toLowerCase()}.';
  for (final dt in article.querySelectorAll('dt[id]')) {
    final id = dt.attributes['id']!;
    if (!id.startsWith(idPrefix)) continue;

    final property = Property(name: id.substring(idPrefix.length));

    // The description lives in the `dd` right after the `dt`. A `dt` can be
    // the last child or be followed by something else; `docs` is `late final`
    // and is read unconditionally when serializing, so always assign it.
    final dd = dt.nextElementSibling;
    property.docs = dd != null && dd.localName == 'dd'
        ? _nodesToMarkdown(dd.children)
        : '';

    info.properties.add(property);
  }

  info.properties.sort((a, b) => a.name.compareTo(b.name));

  return info;
}

/// Documentation collected for a single interface.
class InterfaceInfo {
  InterfaceInfo({required this.name});

  /// The interface name; used as this entry's key in the generated JSON.
  final String name;

  /// The interface-level documentation text.
  ///
  /// Must be assigned exactly once before [asJson] is read.
  late final String docs;

  /// The documented members of this interface.
  final List<Property> properties = [];

  /// The JSON representation of this interface.
  ///
  /// Always contains `docs`; contains `properties` (a map from property name
  /// to property docs) only when at least one property was collected.
  Map<String, dynamic> get asJson {
    final json = <String, dynamic>{'docs': docs};
    if (properties.isNotEmpty) {
      json['properties'] = {
        for (final property in properties) property.name: property.docs,
      };
    }
    return json;
  }
}

/// Documentation collected for a single named member of an interface.
class Property {
  Property({required this.name});

  /// The member name.
  final String name;

  /// The documentation text for this member.
  ///
  /// Assignable exactly once; reading it before assignment throws.
  late final String docs;
}

/// Converts each element in [nodes] to markdown and joins the results with a
/// blank line between them.
///
/// Elements for which [_nodeToMarkdown] returns `null` are left out.
String _nodesToMarkdown(List<dom.Element> nodes) {
  final blocks = <String>[];
  for (final node in nodes) {
    final markdown = _nodeToMarkdown(node);
    if (markdown != null) {
      blocks.add(markdown);
    }
  }
  return blocks.join('\n\n');
}

/// Converts a single block-level [node] to trimmed markdown text.
///
/// Handles `p`, `blockquote`, `ul`/`ol`, `dl`, and `div` elements carrying
/// the `notecard` or `code-example` class. Returns `null` for `figure` and
/// `svg` elements, which are skipped.
///
/// Throws an [Exception] for any other element, or for a `div` with neither
/// of the recognized classes, so that unexpected markup is noticed rather
/// than silently dropped.
String? _nodeToMarkdown(dom.Element node) {
  String value;

  switch (node.localName) {
    case 'p':
      value = getTextForNote(node);
      break;
    case 'blockquote':
      value = '> ${getTextForNote(node)}';
      break;
    case 'ul':
    case 'ol':
      final buf = StringBuffer();
      // Walk only the list's own items. A descendant query for `li` would
      // also match the items of nested lists, whose text is already included
      // in the text of their parent item, so they would be emitted twice.
      for (final child in node.children) {
        if (child.localName == 'li') {
          buf.writeln('- ${getTextForNote(child)}');
        }
      }
      value = buf.toString();
      break;
    case 'div':
      if (node.classes.contains('notecard')) {
        // A note card is a container: convert its children one per line.
        value =
            node.children.map(_nodeToMarkdown).whereType<String>().join('\n');
      } else if (node.classes.contains('code-example')) {
        // Emit the example's `pre` text as a fenced code block.
        final buf = StringBuffer();
        final pre = node.querySelector('pre')!;
        buf.writeln('```');
        buf.writeln(pre.text.trimRight());
        buf.writeln('```');
        value = buf.toString();
      } else {
        throw Exception('unhandled div type: ${node.classes}');
      }
      break;
    case 'dl':
      // Render term/description pairs as rows of a two-column table: each
      // `dt` opens a row and the following `dd` closes it.
      final buf = StringBuffer();
      buf.writeln('| --- | --- |');
      for (final child in node.children) {
        if (child.localName == 'dt') {
          buf.write('| ${getTextForNote(child).trim()} ');
        } else if (child.localName == 'dd') {
          buf.writeln('| ${getTextForNote(child).trim()} |');
        }
      }
      value = buf.toString();
      break;
    case 'figure':
    case 'svg':
      return null;
    default:
      throw Exception('unhandled node type: ${node.localName}');
  }

  return value.trim();
}

/// Returns the text of [node] and its descendants as inline markdown, as
/// accumulated by a fresh [MarkdownTextVisitor].
String getTextForNote(dom.Element node) {
  return (MarkdownTextVisitor()..visit(node)).toString();
}

/// A [TreeVisitor] that accumulates the text of the visited nodes into [buf],
/// adding inline markdown for `strong`, `br`, `a`, and `code` elements.
class MarkdownTextVisitor extends TreeVisitor {
  /// The markdown text accumulated so far.
  final StringBuffer buf = StringBuffer();

  @override
  void visitText(dom.Text node) {
    buf.write(node.data);
  }

  @override
  void visitElement(dom.Element node) {
    final tag = node.localName;

    if (tag == 'strong') {
      _visitWrapped(node, before: '**', after: '**');
    } else if (tag == 'br') {
      // A line break becomes two newlines.
      buf
        ..writeln()
        ..writeln();
    } else if (tag == 'a') {
      // TODO(devoncarew): Fixup relative urls? Convert to symbol references?
      final href = node.attributes['href'];
      final isAbsoluteLink = href != null && href.startsWith('https://');
      if (isAbsoluteLink) {
        _visitWrapped(node, before: '[', after: ']($href)');
      } else {
        // Any other anchor keeps only its text.
        visitChildren(node);
      }
    } else if (tag == 'code') {
      _visitWrapped(node, before: '`', after: '`');
    } else {
      visitChildren(node);
    }
  }

  /// Writes [before], then the children of [node], then [after].
  void _visitWrapped(
    dom.Element node, {
    required String before,
    required String after,
  }) {
    buf.write(before);
    visitChildren(node);
    buf.write(after);
  }

  @override
  String toString() => buf.toString();
}