Skip to content

Commit

Permalink
Major rewrite to make things more consistent and extensible
Browse files Browse the repository at this point in the history
**IMPORTANT: Work in progress, not yet functional!**

The code in Strudy did the same thing in multiple ways:
- The main CLI was linked to the first type of analysis reports, that still get
published to `w3c/webref-analysis`.
- A few CLIs were there, used mainly for exploration purpose.
- The study modules could sometimes be used as CLI, for no apparent reason.
- The study modules used different conventions for inputs/outputs
- The reporting code was somewhat separated from the rest, whereas it could be
useful to extend it.
- Preparing flexible reports and amending the "file an issue" workflow was a
bit tedious.

This rewrite attempts to solve the above by:
- Rewriting study modules to expose one similar study function
- Adding a generic study module that knows about all anomaly types, dispatches
the study to the different study modules depending on the request, and can
structure and format the result as needed in markdown to file one or more
issues.
- Adding a few tests to be more confident that the analyses are doing the right
thing!
- **(Not done yet)** Updating the entry points to use the new study module
- **(Not done yet)** Getting rid of all side CLIs
  • Loading branch information
tidoust committed Aug 2, 2024
1 parent ba21462 commit 7a89322
Show file tree
Hide file tree
Showing 12 changed files with 1,470 additions and 161 deletions.
27 changes: 12 additions & 15 deletions src/lib/study-algorithms.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@
import { JSDOM } from 'jsdom';
import { recordCategorizedAnomaly } from './util.js';

const possibleAnomalies = [
'missingTaskForPromise',
'missingTaskForEvent'
];


/**
* Normalize whitespaces in string to make analysis easier
Expand Down Expand Up @@ -57,9 +50,8 @@ function nestParallelSteps(algo) {
/**
* Main function, study all algorithms
*/
function studyAlgorithms(edResults) {
function studyAlgorithms(specs) {
const report = [];
const recordAnomaly = recordCategorizedAnomaly(report, 'algorithms', possibleAnomalies);

// Return human-friendly markdown that identifies the given algorithm
function getAlgoName(algo) {
Expand Down Expand Up @@ -95,11 +87,19 @@ function studyAlgorithms(edResults) {
// https://w3c.github.io/clipboard-apis/#dom-clipboard-read
!html.includes('systemClipboardRepresentation')
) {
recordAnomaly(spec, 'missingTaskForPromise', `${getAlgoName(algo)} has a parallel step that resolves/rejects a promise directly`);
report.push({
name: 'missingTaskForPromise',
message: `${getAlgoName(algo)} has a parallel step that resolves/rejects a promise directly`,
spec
});
return true;
}
else if (html.match(/fire an?( \w+)? event/i)) {
recordAnomaly(spec, 'missingTaskForEvent', `${getAlgoName(algo)} has a parallel step that fires an event directly`);
report.push({
name: 'missingTaskForEvent',
message: `${getAlgoName(algo)} has a parallel step that fires an event directly`,
spec
});
return true;
}
}
Expand Down Expand Up @@ -133,13 +133,10 @@ function studyAlgorithms(edResults) {
return anomalyFound;
}

// We're only interested in specs that define algorithms
const specs = edResults.filter(spec => !!spec.algorithms);

// Study algorithms in turn.
// Note: the root level of each algorithm is its first step. It may say
// something like "run these steps in parallel" in particular.
for (const spec of specs) {
for (const spec of specs.filter(spec => !!spec.algorithms)) {
for (const algo of spec.algorithms) {
nestParallelSteps(algo);
studyAlgorithmStep(spec, algo, algo);
Expand Down
124 changes: 65 additions & 59 deletions src/lib/study-backrefs.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,3 @@
import { loadCrawlResults, recordCategorizedAnomaly } from './util.js';
import { fileURLToPath } from 'node:url';

const possibleAnomalies = [
'brokenLinks',
'datedUrls',
'evolvingLinks',
'frailLinks',
'nonCanonicalRefs',
'notDfn',
'notExported',
'outdatedSpecs',
'unknownSpecs'
];

/**
* The backrefs analyzer only checks links to other specs. This function returns
* true when a link does target a spec, and false if it targets something else
Expand All @@ -39,65 +24,99 @@ const matchSpecUrl = url =>
TODO: DRY
Copied from browser-specs/src/compute-shortname.js
*/
function computeShortname (url) {
function parseUrl (url) {
function computeShortname(url) {
function parseUrl(url) {
// Handle /TR/ URLs
const w3cTr = url.match(/^https?:\/\/(?:www\.)?w3\.org\/TR\/([^/]+)\/$/);
const w3cTr = url.match(/^https?:\/\/(?:www\.)?w3\.org\/TR\/([^\/]+)\/$/);
if (w3cTr) {
return w3cTr[1];
}

// Handle WHATWG specs
const whatwg = url.match(/\/\/(.+)\.spec\.whatwg\.org\/?/);
const whatwg = url.match(/\/\/(.+)\.spec\.whatwg\.org\//);
if (whatwg) {
return whatwg[1];
return whatwg[1];
}

// Handle TC39 Proposals
const tc39 = url.match(/\/\/tc39\.es\/proposal-([^/]+)\/$/);
const tc39 = url.match(/\/\/tc39\.es\/proposal-([^\/]+)\/$/);
if (tc39) {
return 'tc39-' + tc39[1];
return "tc39-" + tc39[1];
}


// Handle Khronos extensions
const khronos = url.match(/https:\/\/registry\.khronos\.org\/webgl\/extensions\/([^/]+)\/$/);
const khronos = url.match(/https:\/\/registry\.khronos\.org\/webgl\/extensions\/([^\/]+)\/$/);
if (khronos) {
return khronos[1];
return khronos[1];
}

// Handle extension specs defined in the same repo as the main spec
// (e.g. generate a "gamepad-extensions" name for
// https://w3c.github.io/gamepad/extensions.html")
const ext = url.match(/\/.*\.github\.io\/([^/]+)\/(extensions?)\.html$/);
const ext = url.match(/\/.*\.github\.io\/([^\/]+)\/(extensions?)\.html$/);
if (ext) {
return ext[1] + '-' + ext[2];
}

// Handle draft specs on GitHub, excluding the "webappsec-" prefix for
// specifications developed by the Web Application Security Working Group
const github = url.match(/\/.*\.github\.io\/(?:webappsec-)?([^/]+)\//);
const github = url.match(/\/.*\.github\.io\/(?:webappsec-)?([^\/]+)\//);
if (github) {
return github[1];
return github[1];
}

// Handle CSS WG specs
const css = url.match(/\/drafts\.(?:csswg|fxtf|css-houdini)\.org\/([^/]+)\//);
const css = url.match(/\/drafts\.(?:csswg|fxtf|css-houdini)\.org\/([^\/]+)\//);
if (css) {
return css[1];
}

// Handle SVG drafts
const svg = url.match(/\/svgwg\.org\/specs\/(?:svg-)?([^/]+)\//);
const svg = url.match(/\/svgwg\.org\/specs\/(?:svg-)?([^\/]+)\//);
if (svg) {
return 'svg-' + svg[1];
return "svg-" + svg[1];
}

// Handle IETF RFCs
const rfcs = url.match(/\/www.rfc-editor\.org\/rfc\/(rfc[0-9]+)/);
if (rfcs) {
return rfcs[1];
}

// Handle IETF group drafts
const ietfDraft = url.match(/\/datatracker\.ietf\.org\/doc\/html\/draft-ietf-[^\-]+-([^\/]+)/);
if (ietfDraft) {
return ietfDraft[1];
}

// Handle IETF individual drafts, stripping group name
// TODO: retrieve the list of IETF groups to make sure that the group name
// is an actual group name and not the beginning of the shortname:
// https://datatracker.ietf.org/api/v1/group/group/
// (multiple requests needed due to pagination, "?limit=1000" is the max)
const ietfIndDraft = url.match(/\/datatracker\.ietf\.org\/doc\/html\/draft-[^\-]+-([^\/]+)/);
if (ietfIndDraft) {
if (ietfIndDraft[1].indexOf('-') !== -1) {
return ietfIndDraft[1].slice(ietfIndDraft[1].indexOf('-') + 1);
}
else {
return ietfIndDraft[1];
}
}

// Handle TAG findings
const tag = url.match(/^https?:\/\/(?:www\.)?w3\.org\/2001\/tag\/doc\/([^\/]+)\/?$/);
if (tag) {
return tag[1];
}

// Return name when one was given
if (!url.match(/\//)) {
return url;
}

throw new Error(`Cannot extract meaningful name from ${url}`);
throw `Cannot extract meaningful name from ${url}`;
}

// Parse the URL to extract the name
Expand All @@ -107,8 +126,8 @@ function computeShortname (url) {
// Latin characters (a-z letters, digits, underscore and "-"), and that it
// only contains a dot for fractional levels at the end of the name
// (e.g. "blah-1.2" is good but "blah.blah" and "blah-3.1-blah" are not)
if (!name.match(/^[\w-]+((?<=-\d+)\.\d+)?$/)) {
throw new Error(`Specification name contains unexpected characters: ${name} (extracted from ${url})`);
if (!name.match(/^[\w\-]+((?<=\-v?\d+)\.\d+)?$/)) {
throw `Specification name contains unexpected characters: ${name} (extracted from ${url})`;
}

return name;
Expand Down Expand Up @@ -234,18 +253,21 @@ const matchAnchor = (url, anchor) => link => {
return link === (url + '#' + anchor) || link === (url + '#' + encodeURIComponent(anchor));
};

function studyBackrefs (edResults, trResults = [], htmlFragments = {}, shortnameFilter) {
trResults = trResults || [];
async function studyBackrefs(specs, { crawlResults = null, trResults = [], htmlFragments = null } = {}) {
crawlResults = crawlResults ?? specs;
const report = [];

edResults.forEach(spec => {
if (shortnameFilter && spec.shortname !== shortnameFilter) return;
studyLinks(spec, spec.links?.rawlinks, report, edResults, trResults, htmlFragments);
// Donwload automatic map of multipages anchors in HTML spec
const fragmentsUrl = 'https://html.spec.whatwg.org/multipage/fragment-links.json';
htmlFragments = htmlFragments ?? await fetch(fragmentsUrl).then(r => r.json());

specs.forEach(spec => {
studyLinks(spec, spec.links?.rawlinks, report, crawlResults, trResults, htmlFragments);
// given the current limitation of classification of links for bikeshed
// https://github.com/w3c/reffy/issues/1584
// we also check autolinks for bikeshed specs
if (spec.generator === "bikeshed") {
studyLinks(spec, spec.links?.autolinks, report, edResults, trResults, htmlFragments);
studyLinks(spec, spec.links?.autolinks, report, crawlResults, trResults, htmlFragments);
}
});
return report;
Expand All @@ -254,7 +276,9 @@ function studyBackrefs (edResults, trResults = [], htmlFragments = {}, shortname
function studyLinks(spec, links, report, edResults, trResults, htmlFragments) {
if (!links) return;

const recordAnomaly = recordCategorizedAnomaly(report, 'links', possibleAnomalies);
function recordAnomaly(spec, name, message) {
report.push({ name, message, spec });
}

Object.keys(links)
.filter(matchSpecUrl)
Expand Down Expand Up @@ -421,22 +445,4 @@ function studyLinks(spec, links, report, edResults, trResults, htmlFragments) {
});
}

/**************************************************
Export methods for use as module
**************************************************/
export default studyBackrefs;

if (process.argv[1] === fileURLToPath(import.meta.url)) {
const crawl = await loadCrawlResults(process.argv[2], process.argv[3]);
let htmlFragments = {};
try {
console.info('Downloading HTML spec fragments data…');
htmlFragments = await fetch('https://html.spec.whatwg.org/multipage/fragment-links.json').then(r => r.json());
console.info('- done');
} catch (err) {
console.error('- failed: could not fetch HTML fragments data, may report false positive broken links on HTML spec');
}

const results = studyBackrefs(crawl.ed, crawl.tr, htmlFragments, process.argv[4] ?? undefined);
console.log(results);
}
export default studyBackrefs;
Loading

0 comments on commit 7a89322

Please # to comment.