From 24bd7a8d2c7ce005c24b41a712679f8882d90c41 Mon Sep 17 00:00:00 2001
From: Michal Piechowiak
Date: Wed, 28 Mar 2018 00:20:22 +0200
Subject: [PATCH] [gatsby-transformer-remark] Don't generate AST for same node multiple times in parallel. (#4734)

If we are already generating AST for given node - wait for result of that.
---
Review note (this section is ignored when the patch is applied):
This copy differs from the commit named in the From line in one added line:
`ASTPromiseMap.delete(astCacheKey)` is now `ASTPromiseMap.delete(cacheKey)`.
The original passed the key-building function, which is never a key of the
map, so finished entries were never released from ASTPromiseMap.
Known gap, left unchanged: the `new Promise(async resolve => ...)` executor
has no rejection path, so if a plugin fails the stored promise stays pending
and every caller waiting on it waits with it.
Indentation was reconstructed from the code's 2-space nesting; verify context
lines against the target file before applying.

 .../src/extend-node-type.js                   | 178 ++++++++++--------
 1 file changed, 98 insertions(+), 80 deletions(-)

diff --git a/packages/gatsby-transformer-remark/src/extend-node-type.js b/packages/gatsby-transformer-remark/src/extend-node-type.js
index c0f4dbe622ea7..1b6c03e3ae4d5 100644
--- a/packages/gatsby-transformer-remark/src/extend-node-type.js
+++ b/packages/gatsby-transformer-remark/src/extend-node-type.js
@@ -51,6 +51,14 @@ const tableOfContentsCacheKey = node =>
 const withPathPrefix = (url, pathPrefix) =>
   (pathPrefix + url).replace(/\/\//, `/`)
 
+/**
+ * Map that keeps track of generation of AST to not generate it multiple
+ * times in parallel.
+ *
+ * @type {Map}
+ */
+const ASTPromiseMap = new Map()
+
 module.exports = (
   { type, store, pathPrefix, getNode, cache, reporter },
   pluginOptions
@@ -87,90 +95,28 @@ module.exports = (
     }
 
     async function getAST(markdownNode) {
-      const cachedAST = await cache.get(astCacheKey(markdownNode))
+      const cacheKey = astCacheKey(markdownNode)
+      const cachedAST = await cache.get(cacheKey)
       if (cachedAST) {
         return cachedAST
+      } else if (ASTPromiseMap.has(cacheKey)) {
+        // We are already generating AST, so let's wait for it
+        return await ASTPromiseMap.get(cacheKey)
       } else {
-        const files = _.values(store.getState().nodes).filter(
-          n => n.internal.type === `File`
-        )
-        const ast = await new Promise((resolve, reject) => {
-          // Use Bluebird's Promise function "each" to run remark plugins serially.
-          Promise.each(pluginOptions.plugins, plugin => {
-            const requiredPlugin = require(plugin.resolve)
-            if (_.isFunction(requiredPlugin.mutateSource)) {
-              return requiredPlugin.mutateSource(
-                {
-                  markdownNode,
-                  files,
-                  getNode,
-                  reporter,
-                },
-                plugin.pluginOptions
-              )
-            } else {
-              return Promise.resolve()
-            }
-          }).then(() => {
-            const markdownAST = remark.parse(markdownNode.internal.content)
-
-            if (pathPrefix) {
-              // Ensure relative links include `pathPrefix`
-              visit(markdownAST, `link`, node => {
-                if (
-                  node.url &&
-                  node.url.startsWith(`/`) &&
-                  !node.url.startsWith(`//`)
-                ) {
-                  node.url = withPathPrefix(node.url, pathPrefix)
-                }
-              })
-            }
-
-            // source => parse (can order parsing for dependencies) => typegen
-            //
-            // source plugins identify nodes, provide id, initial parse, know
-            // when nodes are created/removed/deleted
-            // get passed cached DataTree and return list of clean and dirty nodes.
-            // Also get passed `dirtyNodes` function which they can call with an array
-            // of node ids which will then get re-parsed and the inferred schema
-            // recreated (if inferring schema gets too expensive, can also
-            // cache the schema until a query fails at which point recreate the
-            // schema).
-            //
-            // parse plugins take data from source nodes and extend it, never mutate
-            // it. Freeze all nodes once done so typegen plugins can't change it
-            // this lets us save off the DataTree at that point as well as create
-            // indexes.
-            //
-            // typegen plugins identify further types of data that should be lazily
-            // computed due to their expense, or are hard to infer graphql type
-            // (markdown ast), or are need user input in order to derive e.g.
-            // markdown headers or date fields.
-            //
-            // wrap all resolve functions to (a) auto-memoize and (b) cache to disk any
-            // resolve function that takes longer than ~10ms (do research on this
-            // e.g. how long reading/writing to cache takes), and (c) track which
-            // queries are based on which source nodes. Also if connection of what
-            // which are always rerun if their underlying nodes change..
-            //
-            // every node type in DataTree gets a schema type automatically.
-            // typegen plugins just modify the auto-generated types to add derived fields
-            // as well as computationally expensive fields.
-            const files = _.values(store.getState().nodes).filter(
-              n => n.internal.type === `File`
-            )
+        const ASTGenerationPromise = new Promise(async resolve => {
+          const files = _.values(store.getState().nodes).filter(
+            n => n.internal.type === `File`
+          )
+          const ast = await new Promise((resolve, reject) => {
             // Use Bluebird's Promise function "each" to run remark plugins serially.
             Promise.each(pluginOptions.plugins, plugin => {
               const requiredPlugin = require(plugin.resolve)
-              if (_.isFunction(requiredPlugin)) {
-                return requiredPlugin(
+              if (_.isFunction(requiredPlugin.mutateSource)) {
+                return requiredPlugin.mutateSource(
                   {
-                    markdownAST,
                     markdownNode,
-                    getNode,
                     files,
-                    pathPrefix,
+                    getNode,
                     reporter,
                   },
                   plugin.pluginOptions
@@ -179,14 +125,86 @@ module.exports = (
                 return Promise.resolve()
               }
             }).then(() => {
-              resolve(markdownAST)
+              const markdownAST = remark.parse(markdownNode.internal.content)
+
+              if (pathPrefix) {
+                // Ensure relative links include `pathPrefix`
+                visit(markdownAST, `link`, node => {
+                  if (
+                    node.url &&
+                    node.url.startsWith(`/`) &&
+                    !node.url.startsWith(`//`)
+                  ) {
+                    node.url = withPathPrefix(node.url, pathPrefix)
+                  }
+                })
+              }
+
+              // source => parse (can order parsing for dependencies) => typegen
+              //
+              // source plugins identify nodes, provide id, initial parse, know
+              // when nodes are created/removed/deleted
+              // get passed cached DataTree and return list of clean and dirty nodes.
+              // Also get passed `dirtyNodes` function which they can call with an array
+              // of node ids which will then get re-parsed and the inferred schema
+              // recreated (if inferring schema gets too expensive, can also
+              // cache the schema until a query fails at which point recreate the
+              // schema).
+              //
+              // parse plugins take data from source nodes and extend it, never mutate
+              // it. Freeze all nodes once done so typegen plugins can't change it
+              // this lets us save off the DataTree at that point as well as create
+              // indexes.
+              //
+              // typegen plugins identify further types of data that should be lazily
+              // computed due to their expense, or are hard to infer graphql type
+              // (markdown ast), or are need user input in order to derive e.g.
+              // markdown headers or date fields.
+              //
+              // wrap all resolve functions to (a) auto-memoize and (b) cache to disk any
+              // resolve function that takes longer than ~10ms (do research on this
+              // e.g. how long reading/writing to cache takes), and (c) track which
+              // queries are based on which source nodes. Also if connection of what
+              // which are always rerun if their underlying nodes change..
+              //
+              // every node type in DataTree gets a schema type automatically.
+              // typegen plugins just modify the auto-generated types to add derived fields
+              // as well as computationally expensive fields.
+              const files = _.values(store.getState().nodes).filter(
+                n => n.internal.type === `File`
+              )
+              // Use Bluebird's Promise function "each" to run remark plugins serially.
+              Promise.each(pluginOptions.plugins, plugin => {
+                const requiredPlugin = require(plugin.resolve)
+                if (_.isFunction(requiredPlugin)) {
+                  return requiredPlugin(
+                    {
+                      markdownAST,
+                      markdownNode,
+                      getNode,
+                      files,
+                      pathPrefix,
+                      reporter,
+                    },
+                    plugin.pluginOptions
+                  )
+                } else {
+                  return Promise.resolve()
+                }
+              }).then(() => {
+                resolve(markdownAST)
+              })
             })
           })
-        })
 
-        // Save new AST to cache and return
-        cache.set(astCacheKey(markdownNode), ast)
-        return ast
+          // Save new AST to cache and return
+          cache.set(astCacheKey(markdownNode), ast)
+          // We can now release promise, as we cached result
+          ASTPromiseMap.delete(cacheKey)
+          return resolve(ast)
+        })
+        ASTPromiseMap.set(cacheKey, ASTGenerationPromise)
+        return await ASTGenerationPromise
       }
     }
 