From 77f8446a17b696ddf95b1bb0621e64e2a987832d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Fri, 10 Jan 2025 14:44:33 +0100 Subject: [PATCH] [Data Liberation] Filesystem entity reader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A part of https://github.com/WordPress/wordpress-playground/issues/1894 Adds a new API for loading content from a WP_Filesystem instance: * `WP_Filesystem_To_Post_Tree` for traversing a directory tree and mapping the structure to a hierarchical WordPress post/meta entity stream * `WP_Filesystem_Entity_Reader` for converting the raw file content into WordPress blocks To convert a set of zipped files into WordPress entities: ```php // Any Filesystem instance works here. Could be WP_Local_Filesystem, // WP_Git_Filesystem, or anything else. Let's read from a zip file here: $fs = new WP_Zip_Filesystem( WP_File_Reader::create('./docs.zip') ); $reader = new WP_Filesystem_Entity_Reader( $fs ); foreach($reader as $entity) { var_dump($entity); } ``` ## Testing The code isn't used anywhere yet – just rely on the CI checks. 
--- .../src/WP_Markdown_Importer.php | 6 +- .../data-liberation/blueprints-library | 2 +- .../playground/data-liberation/bootstrap.php | 5 +- .../playground/data-liberation/phpunit.xml | 1 + .../WP_Directory_Tree_Entity_Reader.php | 342 --------------- .../WP_Filesystem_Entity_Reader.php | 167 +++++++ .../WP_Filesystem_To_Post_Tree.php | 413 ++++++++++++++++++ .../tests/WPFilesystemToPostTreeTests.php | 40 ++ .../nested/page1.html | 2 + .../filesystem-entity-reader/root.html | 2 + 10 files changed, 633 insertions(+), 347 deletions(-) delete mode 100644 packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php create mode 100644 packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php create mode 100644 packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php create mode 100644 packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php create mode 100644 packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/nested/page1.html create mode 100644 packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/root.html diff --git a/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php b/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php index a164dc2025..d120e6e6fa 100644 --- a/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php +++ b/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php @@ -1,6 +1,6 @@ $markdown_directory, 'first_post_id' => 1, 'allowed_extensions' => array( 'md' ), 'index_file_patterns' => array( '#^index\.md$#' ), - 'markup_converter_factory' => function ( $content ) { + 'data_consumer_factory' => function ( $content ) { return new WP_Markdown_Consumer( $content ); }, ) diff --git a/packages/playground/data-liberation/blueprints-library b/packages/playground/data-liberation/blueprints-library index 2558e0ecc3..63fab6b9c4 160000 
--- a/packages/playground/data-liberation/blueprints-library +++ b/packages/playground/data-liberation/blueprints-library @@ -1 +1 @@ -Subproject commit 2558e0ecc39aaf58b55e848f7a966c2d1b3f7470 +Subproject commit 63fab6b9c447c799aa971bf2f6b7cf7856b4e0bc diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index 1ac5ceac69..d1e2a61356 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -12,9 +12,10 @@ require_once __DIR__ . '/blueprints-library/src/WordPress/AsyncHttp/Client.php'; require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Abstract_Filesystem.php'; -require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Filesystem.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Local_Filesystem.php'; require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_File_Visitor_Event.php'; require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Filesystem_Visitor.php'; +require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/functions.php'; require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_Byte_Reader.php'; require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_File_Reader.php'; @@ -87,6 +88,8 @@ require_once __DIR__ . '/src/import/WP_Retry_Frontloading_Iterator.php'; require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php'; require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_Filesystem_To_Post_Tree.php'; +require_once __DIR__ . '/src/entity-readers/WP_Filesystem_Entity_Reader.php'; require_once __DIR__ . 
'/src/utf8_decoder.php'; diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml index 9566f9c750..9c3e0dc036 100644 --- a/packages/playground/data-liberation/phpunit.xml +++ b/packages/playground/data-liberation/phpunit.xml @@ -8,6 +8,7 @@ tests/WPMarkupProcessorConsumerTests.php tests/WPHTMLEntityReaderTests.php tests/WPURLInTextProcessorTests.php + tests/WPFilesystemToPostTreeTests.php tests/WPBlockMarkupProcessorTests.php tests/WPBlockMarkupUrlProcessorTests.php tests/URLParserWHATWGComplianceTests.php diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php deleted file mode 100644 index fcbcd70133..0000000000 --- a/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php +++ /dev/null @@ -1,342 +0,0 @@ -file_visitor = new \WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem, $options['root_dir'] ); - $this->filesystem = $filesystem; - $this->next_post_id = $options['first_post_id']; - $this->allowed_extensions = $options['allowed_extensions']; - $this->index_file_patterns = $options['index_file_patterns']; - $this->markup_converter_factory = $options['markup_converter_factory']; - } - - public function next_entity() { - while ( true ) { - if ( null !== $this->pending_directory_index ) { - $dir = $this->file_visitor->get_event()->dir; - $depth = $this->file_visitor->get_current_depth(); - $parent_id = $this->parent_ids[ $depth - 1 ] ?? null; - - if ( null === $parent_id && $depth > 1 ) { - // There's no parent ID even though we're a few levels deep. - // This is a scenario where `next_file()` skipped a few levels - // of directories with no relevant content in them: - // - // - /docs/ - // - /foo/ - // - /bar/ - // - /baz.md - // - // In this case, we need to backtrack and create the missing - // parent pages for /bar/ and /foo/. 
- - // Find the topmost missing parent ID - $missing_parent_id_depth = 1; - while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) { - ++$missing_parent_id_depth; - } - - // Move up to the corresponding directory - $missing_parent_path = $dir; - for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) { - $missing_parent_path = dirname( $missing_parent_path ); - } - - $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_post_entity( - array( - 'content' => '', - 'source_path' => $missing_parent_path, - 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ], - 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $missing_parent_path ) ), - ) - ); - } elseif ( false === $this->pending_directory_index ) { - // No directory index candidate – let's create a fake page - // just to have something in the page tree. - $this->parent_ids[ $depth ] = $this->emit_post_entity( - array( - 'content' => '', - 'source_path' => $dir, - 'parent_id' => $parent_id, - 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $dir ) ), - ) - ); - // We're no longer looking for a directory index. - $this->pending_directory_index = null; - } else { - $file_path = $this->pending_directory_index; - $this->parent_ids[ $depth ] = $this->emit_post_entity( - array( - 'content' => $this->filesystem->read_file( $file_path ), - 'source_path' => $file_path, - 'parent_id' => $parent_id, - 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $file_path ) ), - ) - ); - // We're no longer looking for a directory index. - $this->pending_directory_index = null; - } - return true; - } - - while ( count( $this->pending_files ) ) { - $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? 
null; - $file_path = array_shift( $this->pending_files ); - $this->emit_post_entity( - array( - 'content' => $this->filesystem->read_file( $file_path ), - 'source_path' => $file_path, - 'parent_id' => $parent_id, - 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $file_path ) ), - ) - ); - return true; - } - - if ( false === $this->next_file() ) { - break; - } - } - $this->is_finished = true; - return false; - } - - public function get_entity(): ?\WP_Imported_Entity { - return $this->entity; - } - - protected function emit_post_entity( $options ) { - $factory = $this->markup_converter_factory; - $converter = $factory( $options['content'] ); - $converter->convert(); - $block_markup = $converter->get_block_markup(); - - $post_title = null; - if ( ! $post_title ) { - $removed_title = WP_Import_Utils::remove_first_h1_block_from_block_markup( $block_markup ); - if ( false !== $removed_title ) { - $post_title = $removed_title['title']; - $block_markup = $removed_title['remaining_html']; - } - } - if ( ! $post_title ) { - // In Markdown, the frontmatter title can be a worse title candidate than - // the first H1 block. In block markup exports, it will be the opposite. - // - // @TODO: Enable the API consumer to customize the title resolution. - $post_title = $converter->get_meta_value( 'post_title' ); - } - if ( ! $post_title ) { - $post_title = $options['title_fallback']; - } - - $entity_data = array( - 'post_id' => $this->next_post_id, - 'post_type' => 'page', - 'guid' => $options['source_path'], - 'post_title' => $post_title, - 'post_content' => $block_markup, - 'post_excerpt' => $converter->get_meta_value( 'post_excerpt' ) ?? '', - 'post_status' => 'publish', - ); - - /** - * Technically `source_path` isn't a part of the WordPress post object, - * but we need it to resolve relative URLs in the imported content. - * - * This path is relative to the root directory traversed by this class. - */ - if ( ! 
empty( $options['source_path'] ) ) { - $source_path = $options['source_path']; - $root_dir = $this->file_visitor->get_root_dir(); - if ( str_starts_with( $source_path, $root_dir ) ) { - $source_path = substr( $source_path, strlen( $root_dir ) ); - } - $source_path = ltrim( $source_path, '/' ); - $entity_data['source_path'] = $source_path; - } - - if ( $converter->get_meta_value( 'slug' ) ) { - $slug = $converter->get_meta_value( 'slug' ); - $last_segment = substr( $slug, strrpos( $slug, '/' ) + 1 ); - $entity_data['post_name'] = $last_segment; - } - - if ( $converter->get_meta_value( 'post_order' ) ) { - $entity_data['post_order'] = $converter->get_meta_value( 'post_order' ); - } - - if ( $options['parent_id'] ) { - $entity_data['post_parent'] = $options['parent_id']; - } - - $this->entity = new \WP_Imported_Entity( 'post', $entity_data ); - ++$this->next_post_id; - ++$this->entities_read_so_far; - return $entity_data['post_id']; - } - - private function next_file() { - $this->pending_files = array(); - $this->entity = null; - while ( $this->file_visitor->next() ) { - $event = $this->file_visitor->get_event(); - - if ( $event->is_exiting() ) { - // Clean up stale IDs to save some memory when processing - // large directory trees. - unset( $this->parent_ids[ $event->dir ] ); - continue; - } - - if ( $event->is_entering() ) { - $abs_paths = array(); - foreach ( $event->files as $filename ) { - $abs_paths[] = $event->dir . '/' . $filename; - } - $this->pending_files = $this->choose_relevant_files( $abs_paths ); - if ( ! count( $this->pending_files ) ) { - // Only consider directories with relevant files in them. - // Otherwise we'll create fake pages for media directories - // and other directories that don't contain any content. 
- // - // One corner case is when there's a few levels of directories - // with a single relevant file at the bottom: - // - // - /docs/ - // - /foo/ - // - /bar/ - // - /baz.md - // - // In this case, `next_entity()` will backtrack at baz.md and - // create the missing parent pages. - continue; - } - $directory_index_idx = $this->choose_directory_index( $this->pending_files ); - if ( -1 === $directory_index_idx ) { - $this->pending_directory_index = false; - } else { - $this->pending_directory_index = $this->pending_files[ $directory_index_idx ]; - unset( $this->pending_files[ $directory_index_idx ] ); - } - return true; - } - - return false; - } - return false; - } - - protected function choose_directory_index( $files ) { - foreach ( $files as $idx => $file ) { - if ( $this->looks_like_directory_index( $file ) ) { - return $idx; - } - } - return -1; - } - - protected function looks_like_directory_index( $path ) { - $filename = basename( $path ); - foreach ( $this->index_file_patterns as $pattern ) { - if ( preg_match( $pattern, $filename ) ) { - return true; - } - } - return false; - } - - protected function choose_relevant_files( $paths ) { - return array_filter( $paths, array( $this, 'is_valid_file' ) ); - } - - protected function is_valid_file( $path ) { - $extension = pathinfo( $path, PATHINFO_EXTENSION ); - return in_array( $extension, $this->allowed_extensions, true ); - } - - /** - * @TODO: Either implement this method, or introduce a concept of - * reentrant and non-reentrant entity readers. - */ - public function get_reentrancy_cursor() { - return ''; - } - - public function current(): mixed { - if ( null === $this->entity && ! $this->is_finished ) { - $this->next(); - } - return $this->get_entity(); - } - - public function next(): void { - $this->next_entity(); - } - - public function key(): int { - return $this->entities_read_so_far - 1; - } - - public function valid(): bool { - return ! 
$this->is_finished; - } - - public function rewind(): void { - // @TODO: Either implement this method, or formalize the fact that - // entity readers are not rewindable. - } -} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php new file mode 100644 index 0000000000..2ac427e726 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php @@ -0,0 +1,167 @@ +filesystem = $filesystem; + $this->post_type = $options['post_type'] ?? 'page'; + $this->post_tree = WP_Filesystem_To_Post_Tree::create( + $this->filesystem, + array_merge( + array( + 'root_parent_id' => null, + 'filter_pattern' => '#\.(?:md|html|xhtml|png|jpg|jpeg|gif|svg|webp|mp4)$#', + 'index_file_pattern' => '#^index\.[a-z]+$#', + ), + $options['post_tree_options'] ?? array() + ) + ); + if ( false === $this->post_tree ) { + return false; + } + } + + public function get_last_error(): ?string { + // @TODO: Implement this. + return null; + } + + public function get_entity() { + return $this->current_entity; + } + + public function is_finished(): bool { + return $this->finished; + } + + public function next_entity(): bool { + while ( true ) { + while ( count( $this->entities ) > 0 ) { + $this->current_entity = array_shift( $this->entities ); + return true; + } + + if ( ! 
$this->post_tree->next_node() ) { + $this->finished = true; + return false; + } + + $source_content_converter = null; + $post_tree_node = $this->post_tree->get_current_node(); + if ( $post_tree_node['type'] === 'file' ) { + $extension = pathinfo( $post_tree_node['local_file_path'], PATHINFO_EXTENSION ); + switch ( $extension ) { + case 'md': + $content = $this->filesystem->get_contents( $post_tree_node['local_file_path'] ); + $converter = new WP_Markdown_To_Blocks( $content ); + $source_content_converter = 'md'; + break; + case 'xhtml': + $content = $this->filesystem->get_contents( $post_tree_node['local_file_path'] ); + $converter = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $content ) ); + $source_content_converter = 'xhtml'; + break; + case 'html': + $content = $this->filesystem->get_contents( $post_tree_node['local_file_path'] ); + $converter = new WP_HTML_To_Blocks( WP_HTML_Processor::create_fragment( $content ) ); + $source_content_converter = 'html'; + break; + default: + $filetype = 'application/octet-stream'; + if ( function_exists( 'wp_check_filetype' ) ) { + $filetype = wp_check_filetype( basename( $post_tree_node['local_file_path'] ), null ); + if ( isset( $filetype['type'] ) ) { + $filetype = $filetype['type']; + } + } + $this->entities[] = new WP_Imported_Entity( + 'post', + array( + 'post_id' => $post_tree_node['post_id'], + 'post_title' => sanitize_file_name( basename( $post_tree_node['local_file_path'] ) ), + 'post_status' => 'inherit', + 'post_content' => '', + 'post_mime_type' => $filetype, + 'post_type' => 'attachment', + 'post_parent' => $post_tree_node['parent_id'], + 'guid' => $post_tree_node['local_file_path'], + // The importer will use the same Filesystem instance to + // source the attachment. + 'attachment_url' => 'file://' . 
$post_tree_node['local_file_path'], + ) + ); + $this->entities[] = new WP_Imported_Entity( + 'post_meta', + array( + 'post_id' => $post_tree_node['post_id'], + 'key' => 'local_file_path', + 'value' => $post_tree_node['local_file_path'], + ) + ); + // We're done emiting the entity. + // wp_generate_attachment_metadata() et al. will be called by the + // importer at the database insertion step. + continue 2; + } + + if ( false === $converter->convert() ) { + throw new Exception( 'Failed to convert Markdown to blocks' ); + } + $markup = $converter->get_block_markup(); + $metadata = $converter->get_all_metadata(); + } else { + $markup = ''; + $metadata = array(); + // @TODO: Accept an option to set what should we default to. + $source_content_converter = 'html'; + } + + $reader = new WP_Block_Markup_Entity_Reader( + $markup, + $metadata, + $post_tree_node['post_id'] + ); + while ( $reader->next_entity() ) { + $entity = $reader->get_entity(); + $data = $entity->get_data(); + if ( $entity->get_type() === 'post' ) { + $data['id'] = $post_tree_node['post_id']; + $data['guid'] = $post_tree_node['local_file_path']; + $data['post_parent'] = $post_tree_node['parent_id']; + $data['post_title'] = $data['post_title'] ?? null; + $data['post_status'] = 'publish'; + $data['post_type'] = $this->post_type; + if ( ! 
$data['post_title'] ) { + $data['post_title'] = WP_Import_Utils::slug_to_title( basename( $post_tree_node['local_file_path'] ) ); + } + $entity = new WP_Imported_Entity( $entity->get_type(), $data ); + } + $this->entities[] = $entity; + } + + // Also emit: + $additional_meta = array( + 'local_file_path' => $post_tree_node['local_file_path'], + 'source_type' => $post_tree_node['type'], + 'source_content_converter' => $source_content_converter, + ); + foreach ( $additional_meta as $key => $value ) { + $this->entities[] = new WP_Imported_Entity( + 'post_meta', + array( + 'post_id' => $post_tree_node['post_id'], + 'key' => $key, + 'value' => $value, + ) + ); + } + } + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php new file mode 100644 index 0000000000..0980d228c1 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php @@ -0,0 +1,413 @@ + 100, + * 'filter_pattern' => '/\.md$/', + * 'index_file_pattern' => 'index\.md', + * ]; + * $reader = WP_Filesystem_To_Post_Tree::create($filesystem, $options); + * while ($reader->next_node()) { + * $current_node = $reader->get_current_node(); + * // Process the node, e.g., create a post from it. + * } + * ``` + */ +class WP_Filesystem_To_Post_Tree { + /** + * The filesystem instance to read from. + * + * @var WP_Abstract_Filesystem + */ + private $fs; + + /** + * Visitor for traversing the filesystem. + * + * @var WP_Filesystem_Visitor + */ + private $file_visitor; + + /** + * The current node being processed. + * + * @var array|null + */ + private $current_node; + + /** + * Files pending processing. + * + * @var array + */ + private $pending_files = array(); + + /** + * A filename to emit as the next directory index. If null, there's no matching + * directory index file and a placeholder file will be created. 
If false, + * we're not emitting directory indexes at all. + * + * @var string|false|null + */ + private $pending_directory_index; + + /** + * A stack of post IDs emitted at each directory depth up to the currently processed + * directory. + * + * @var array + */ + private $parent_ids = array(); + + /** + * The next post ID to assign. + * + * @var int + */ + private $next_post_id; + + /** + * Flag to determine if an index page should be created when no index file is found + * in a directory. + * + * @var bool + */ + private $create_index_pages; + + /** + * Counter for entities read so far. + * + * @var int + */ + private $entities_read_so_far = 0; + + /** + * Pattern to filter files. + * + * @var string + */ + private $filter_pattern = '##'; + + /** + * Pattern to identify index files. + * + * @var string + */ + private $index_file_pattern = '##'; + + /** + * Flag to indicate if processing is finished. + * + * @var bool + */ + private $is_finished = false; + + /** + * Creates a new instance of WP_Filesystem_To_Post_Tree. + * + * @param WP_Abstract_Filesystem $filesystem The filesystem to traverse. + * @param array $options Configuration options. { + * $first_post_id => int The ID of the first post to emit. + * $filter_pattern => string A pattern to filter files by. + * $index_file_pattern => string A pattern to identify index files. + * $root_parent_id => int|null The ID of the root parent post. + * $create_index_pages => bool Whether to create index pages when no index file is found. + * } + * @return WP_Filesystem_To_Post_Tree|false The created instance or false on failure. + */ + public static function create( + \WordPress\Filesystem\WP_Abstract_Filesystem $filesystem, + $options + ) { + if ( ! 
isset( $options['first_post_id'] ) ) { + $options['first_post_id'] = 2; + if ( function_exists( 'get_posts' ) ) { + $max_id = get_posts( + array( + 'post_type' => 'any', + 'posts_per_page' => 1, + 'fields' => 'ids', + 'orderby' => 'ID', + 'order' => 'DESC', + ) + ); + if ( ! empty( $max_id ) ) { + $options['first_post_id'] = $max_id[0] + 1; + } + } + } + if ( 1 === $options['first_post_id'] ) { + _doing_it_wrong( __FUNCTION__, 'First node ID must be greater than 1', '1.0.0' ); + return false; + } + if ( ! isset( $options['filter_pattern'] ) ) { + _doing_it_wrong( __FUNCTION__, 'Missing required options: filter_pattern', '1.0.0' ); + return false; + } + if ( ! isset( $options['index_file_pattern'] ) ) { + _doing_it_wrong( __FUNCTION__, 'Missing required options: index_file_pattern', '1.0.0' ); + return false; + } + return new self( $filesystem, $options ); + } + + /** + * Initializes the reader with filesystem and options. + * + * @param WP_Abstract_Filesystem $filesystem The filesystem to traverse. + * @param array $options Configuration options. + */ + private function __construct( + WP_Abstract_Filesystem $filesystem, + $options + ) { + $this->fs = $filesystem; + $this->file_visitor = new WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem ); + $this->create_index_pages = $options['create_index_pages'] ?? true; + $this->next_post_id = $options['first_post_id']; + $this->filter_pattern = $options['filter_pattern']; + $this->index_file_pattern = $options['index_file_pattern']; + if ( isset( $options['root_parent_id'] ) ) { + $this->parent_ids[-1] = $options['root_parent_id']; + } + } + + /** + * Retrieves the current node being processed. + * + * @return array|null The current node or null if none. + */ + public function get_current_node() { + return $this->current_node; + } + + /** + * Advances to the next node in the filesystem. + * + * @return bool True if a node is found, false if processing is complete. 
+ */ + public function next_node() { + $this->current_node = null; + if ( $this->is_finished ) { + return false; + } + while ( true ) { + if ( null !== $this->pending_directory_index ) { + $dir = $this->file_visitor->get_event()->dir; + $depth = $this->file_visitor->get_current_depth(); + $parent_id = $this->parent_ids[ $depth - 1 ] ?? null; + if ( null === $parent_id && $depth > 1 ) { + // There's no parent ID even though we're a few levels deep. + // This is a scenario where `next_file()` skipped a few levels + // of directories with no relevant content in them: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, we need to backtrack and create the missing + // parent pages for /bar/ and /foo/. + + // Find the topmost missing parent ID + $missing_parent_id_depth = 1; + while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) { + ++$missing_parent_id_depth; + } + + // Move up to the corresponding directory + $missing_parent_path = $dir; + for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) { + $missing_parent_path = dirname( $missing_parent_path ); + } + + $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_object( + array( + 'type' => 'directory', + 'local_file_path' => $missing_parent_path, + 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ] ?? null, + ) + ); + } elseif ( false === $this->pending_directory_index ) { + // No directory index candidate – let's create a fake page + // just to have something in the page tree. + $this->parent_ids[ $depth ] = $this->emit_object( + array( + 'type' => 'file_placeholder', + 'local_file_path' => $dir, + 'parent_id' => $parent_id, + ) + ); + // We're no longer looking for a directory index. 
+ $this->pending_directory_index = null; + } else { + $file_path = $this->pending_directory_index; + $this->parent_ids[ $depth ] = $this->emit_object( + array( + 'type' => 'file', + 'local_file_path' => $file_path, + 'parent_id' => $parent_id, + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } + return true; + } + while ( count( $this->pending_files ) ) { + $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null; + $file_path = array_shift( $this->pending_files ); + $this->emit_object( + array( + 'type' => 'file', + 'local_file_path' => $file_path, + 'parent_id' => $parent_id, + ) + ); + return true; + } + + if ( false === $this->next_file() ) { + break; + } + } + $this->is_finished = true; + return false; + } + + /** + * Emits a WordPress post entity based on the provided options. + * + * @param array $options Configuration for the post entity. + * @return int The ID of the created post. + */ + protected function emit_object( $options ) { + $post_id = $this->next_post_id; + ++$this->next_post_id; + $this->current_node = array_merge( + $options, + array( + 'post_id' => $post_id, + ) + ); + ++$this->entities_read_so_far; + return $post_id; + } + + /** + * Processes the next file in the traversal. + * + * @return bool True if a file is processed, false otherwise. + */ + private function next_file() { + $this->pending_files = array(); + while ( $this->file_visitor->next() ) { + $event = $this->file_visitor->get_event(); + + if ( $event->is_exiting() ) { + // Clean up stale IDs to save some memory when processing + // large directory trees. 
+ unset( $this->parent_ids[ $event->dir ] ); + continue; + } + + if ( $event->is_entering() ) { + $abs_paths = array(); + foreach ( $event->files as $filename ) { + $abs_paths[] = wp_join_paths( $event->dir, $filename ); + } + $this->pending_files = array(); + foreach ( $abs_paths as $path ) { + // Add all the subdirectory into the pending files list – there's + // a chance the directory wouldn't match the filter pattern, but + // a descendant file might. + if ( $this->fs->is_dir( $path ) ) { + $this->pending_files[] = $path; + } + + // Only add the files that match the filter pattern. + if ( $this->fs->is_file( $path ) && preg_match( $this->filter_pattern, $path ) ) { + $this->pending_files[] = $path; + } + } + if ( ! count( $this->pending_files ) ) { + // Only consider directories with relevant files in them. + // Otherwise we'll create fake pages for media directories + // and other directories that don't contain any content. + // + // One corner case is when there's a few levels of directories + // with a single relevant file at the bottom: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, `next_entity()` will backtrack at baz.md and + // create the missing parent pages. + continue; + } + $directory_index_idx = $this->choose_directory_index( $this->pending_files ); + if ( -1 === $directory_index_idx ) { + $this->pending_directory_index = false; + } else { + $this->pending_directory_index = $this->pending_files[ $directory_index_idx ]; + unset( $this->pending_files[ $directory_index_idx ] ); + } + return true; + } + + return false; + } + return false; + } + + /** + * Chooses an index file from the list of pending files. + * + * @param array $files List of files to choose from. + * @return int The index of the chosen file or -1 if none. + */ + protected function choose_directory_index( $files ) { + foreach ( $files as $idx => $file ) { + if ( $this->looks_like_directory_index( $file ) ) { + return $idx; + } + } + if ( ! 
$this->create_index_pages && count( $files ) > 0 ) { + return 0; + } + return -1; + } + + /** + * Determines if a file path matches the index file pattern. + * + * @param string $path The file path to check. + * @return bool True if it matches, false otherwise. + */ + protected function looks_like_directory_index( $path ) { + return preg_match( $this->index_file_pattern, basename( $path ) ); + } + + /** + * Finds a node in the filesystem tree by its path. + * + * @param string $path The path to search for. + * @return array|null The found node or null if not found. + */ + private function find_node($path) { + // existing code... + } +} diff --git a/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php b/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php new file mode 100644 index 0000000000..6201cb70cb --- /dev/null +++ b/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php @@ -0,0 +1,40 @@ + 2, + 'create_index_pages' => true, + 'filter_pattern' => '#\.html$#', + 'index_file_pattern' => '#root.html#', + ] + ); + $posts = []; + while ( $reader->next_node() ) { + $posts[] = $reader->get_current_node(); + } + $this->assertCount(3, $posts); + + // The root index page + // Root index page + $this->assertEquals(2, $posts[0]['post_id']); + $this->assertNull($posts[0]['parent_id']); + $this->assertEquals('file', $posts[0]['type']); + + // Nested directory page + $this->assertEquals(3, $posts[1]['post_id']); + $this->assertEquals(2, $posts[1]['parent_id']); + $this->assertEquals('file_placeholder', $posts[1]['type']); + + // Leaf page + $this->assertEquals(4, $posts[2]['post_id']); + $this->assertEquals(3, $posts[2]['parent_id']); + $this->assertEquals('file', $posts[2]['type']); + } + +} diff --git a/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/nested/page1.html b/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/nested/page1.html new file mode 100644 
index 0000000000..a76ff59751 --- /dev/null +++ b/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/nested/page1.html @@ -0,0 +1,2 @@ +

Page 1

+

This is page 1.

diff --git a/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/root.html b/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/root.html new file mode 100644 index 0000000000..5666bc9ad6 --- /dev/null +++ b/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/root.html @@ -0,0 +1,2 @@ +

Root

+

This is the root page.