diff --git a/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php b/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php
index a164dc2025..d120e6e6fa 100644
--- a/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php
+++ b/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php
@@ -1,6 +1,6 @@
$markdown_directory,
'first_post_id' => 1,
'allowed_extensions' => array( 'md' ),
'index_file_patterns' => array( '#^index\.md$#' ),
- 'markup_converter_factory' => function ( $content ) {
+ 'data_consumer_factory' => function ( $content ) {
return new WP_Markdown_Consumer( $content );
},
)
diff --git a/packages/playground/data-liberation/blueprints-library b/packages/playground/data-liberation/blueprints-library
index 2558e0ecc3..63fab6b9c4 160000
--- a/packages/playground/data-liberation/blueprints-library
+++ b/packages/playground/data-liberation/blueprints-library
@@ -1 +1 @@
-Subproject commit 2558e0ecc39aaf58b55e848f7a966c2d1b3f7470
+Subproject commit 63fab6b9c447c799aa971bf2f6b7cf7856b4e0bc
diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php
index 1ac5ceac69..d1e2a61356 100644
--- a/packages/playground/data-liberation/bootstrap.php
+++ b/packages/playground/data-liberation/bootstrap.php
@@ -12,9 +12,10 @@
require_once __DIR__ . '/blueprints-library/src/WordPress/AsyncHttp/Client.php';
require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Abstract_Filesystem.php';
-require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Filesystem.php';
+require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Local_Filesystem.php';
require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_File_Visitor_Event.php';
require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/WP_Filesystem_Visitor.php';
+require_once __DIR__ . '/blueprints-library/src/WordPress/Filesystem/functions.php';
require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_Byte_Reader.php';
require_once __DIR__ . '/blueprints-library/src/WordPress/ByteReader/WP_File_Reader.php';
@@ -87,6 +88,8 @@
require_once __DIR__ . '/src/import/WP_Retry_Frontloading_Iterator.php';
require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php';
+require_once __DIR__ . '/src/entity-readers/WP_Filesystem_To_Post_Tree.php';
+require_once __DIR__ . '/src/entity-readers/WP_Filesystem_Entity_Reader.php';
require_once __DIR__ . '/src/utf8_decoder.php';
diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml
index 9566f9c750..9c3e0dc036 100644
--- a/packages/playground/data-liberation/phpunit.xml
+++ b/packages/playground/data-liberation/phpunit.xml
@@ -8,6 +8,7 @@
tests/WPMarkupProcessorConsumerTests.php
tests/WPHTMLEntityReaderTests.php
tests/WPURLInTextProcessorTests.php
+ tests/WPFilesystemToPostTreeTests.php
tests/WPBlockMarkupProcessorTests.php
tests/WPBlockMarkupUrlProcessorTests.php
tests/URLParserWHATWGComplianceTests.php
diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php
deleted file mode 100644
index fcbcd70133..0000000000
--- a/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php
+++ /dev/null
@@ -1,342 +0,0 @@
-file_visitor = new \WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem, $options['root_dir'] );
- $this->filesystem = $filesystem;
- $this->next_post_id = $options['first_post_id'];
- $this->allowed_extensions = $options['allowed_extensions'];
- $this->index_file_patterns = $options['index_file_patterns'];
- $this->markup_converter_factory = $options['markup_converter_factory'];
- }
-
- public function next_entity() {
- while ( true ) {
- if ( null !== $this->pending_directory_index ) {
- $dir = $this->file_visitor->get_event()->dir;
- $depth = $this->file_visitor->get_current_depth();
- $parent_id = $this->parent_ids[ $depth - 1 ] ?? null;
-
- if ( null === $parent_id && $depth > 1 ) {
- // There's no parent ID even though we're a few levels deep.
- // This is a scenario where `next_file()` skipped a few levels
- // of directories with no relevant content in them:
- //
- // - /docs/
- // - /foo/
- // - /bar/
- // - /baz.md
- //
- // In this case, we need to backtrack and create the missing
- // parent pages for /bar/ and /foo/.
-
- // Find the topmost missing parent ID
- $missing_parent_id_depth = 1;
- while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) {
- ++$missing_parent_id_depth;
- }
-
- // Move up to the corresponding directory
- $missing_parent_path = $dir;
- for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) {
- $missing_parent_path = dirname( $missing_parent_path );
- }
-
- $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_post_entity(
- array(
- 'content' => '',
- 'source_path' => $missing_parent_path,
- 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ],
- 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $missing_parent_path ) ),
- )
- );
- } elseif ( false === $this->pending_directory_index ) {
- // No directory index candidate – let's create a fake page
- // just to have something in the page tree.
- $this->parent_ids[ $depth ] = $this->emit_post_entity(
- array(
- 'content' => '',
- 'source_path' => $dir,
- 'parent_id' => $parent_id,
- 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $dir ) ),
- )
- );
- // We're no longer looking for a directory index.
- $this->pending_directory_index = null;
- } else {
- $file_path = $this->pending_directory_index;
- $this->parent_ids[ $depth ] = $this->emit_post_entity(
- array(
- 'content' => $this->filesystem->read_file( $file_path ),
- 'source_path' => $file_path,
- 'parent_id' => $parent_id,
- 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $file_path ) ),
- )
- );
- // We're no longer looking for a directory index.
- $this->pending_directory_index = null;
- }
- return true;
- }
-
- while ( count( $this->pending_files ) ) {
- $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null;
- $file_path = array_shift( $this->pending_files );
- $this->emit_post_entity(
- array(
- 'content' => $this->filesystem->read_file( $file_path ),
- 'source_path' => $file_path,
- 'parent_id' => $parent_id,
- 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $file_path ) ),
- )
- );
- return true;
- }
-
- if ( false === $this->next_file() ) {
- break;
- }
- }
- $this->is_finished = true;
- return false;
- }
-
- public function get_entity(): ?\WP_Imported_Entity {
- return $this->entity;
- }
-
- protected function emit_post_entity( $options ) {
- $factory = $this->markup_converter_factory;
- $converter = $factory( $options['content'] );
- $converter->convert();
- $block_markup = $converter->get_block_markup();
-
- $post_title = null;
- if ( ! $post_title ) {
- $removed_title = WP_Import_Utils::remove_first_h1_block_from_block_markup( $block_markup );
- if ( false !== $removed_title ) {
- $post_title = $removed_title['title'];
- $block_markup = $removed_title['remaining_html'];
- }
- }
- if ( ! $post_title ) {
- // In Markdown, the frontmatter title can be a worse title candidate than
- // the first H1 block. In block markup exports, it will be the opposite.
- //
- // @TODO: Enable the API consumer to customize the title resolution.
- $post_title = $converter->get_meta_value( 'post_title' );
- }
- if ( ! $post_title ) {
- $post_title = $options['title_fallback'];
- }
-
- $entity_data = array(
- 'post_id' => $this->next_post_id,
- 'post_type' => 'page',
- 'guid' => $options['source_path'],
- 'post_title' => $post_title,
- 'post_content' => $block_markup,
- 'post_excerpt' => $converter->get_meta_value( 'post_excerpt' ) ?? '',
- 'post_status' => 'publish',
- );
-
- /**
- * Technically `source_path` isn't a part of the WordPress post object,
- * but we need it to resolve relative URLs in the imported content.
- *
- * This path is relative to the root directory traversed by this class.
- */
- if ( ! empty( $options['source_path'] ) ) {
- $source_path = $options['source_path'];
- $root_dir = $this->file_visitor->get_root_dir();
- if ( str_starts_with( $source_path, $root_dir ) ) {
- $source_path = substr( $source_path, strlen( $root_dir ) );
- }
- $source_path = ltrim( $source_path, '/' );
- $entity_data['source_path'] = $source_path;
- }
-
- if ( $converter->get_meta_value( 'slug' ) ) {
- $slug = $converter->get_meta_value( 'slug' );
- $last_segment = substr( $slug, strrpos( $slug, '/' ) + 1 );
- $entity_data['post_name'] = $last_segment;
- }
-
- if ( $converter->get_meta_value( 'post_order' ) ) {
- $entity_data['post_order'] = $converter->get_meta_value( 'post_order' );
- }
-
- if ( $options['parent_id'] ) {
- $entity_data['post_parent'] = $options['parent_id'];
- }
-
- $this->entity = new \WP_Imported_Entity( 'post', $entity_data );
- ++$this->next_post_id;
- ++$this->entities_read_so_far;
- return $entity_data['post_id'];
- }
-
- private function next_file() {
- $this->pending_files = array();
- $this->entity = null;
- while ( $this->file_visitor->next() ) {
- $event = $this->file_visitor->get_event();
-
- if ( $event->is_exiting() ) {
- // Clean up stale IDs to save some memory when processing
- // large directory trees.
- unset( $this->parent_ids[ $event->dir ] );
- continue;
- }
-
- if ( $event->is_entering() ) {
- $abs_paths = array();
- foreach ( $event->files as $filename ) {
- $abs_paths[] = $event->dir . '/' . $filename;
- }
- $this->pending_files = $this->choose_relevant_files( $abs_paths );
- if ( ! count( $this->pending_files ) ) {
- // Only consider directories with relevant files in them.
- // Otherwise we'll create fake pages for media directories
- // and other directories that don't contain any content.
- //
- // One corner case is when there's a few levels of directories
- // with a single relevant file at the bottom:
- //
- // - /docs/
- // - /foo/
- // - /bar/
- // - /baz.md
- //
- // In this case, `next_entity()` will backtrack at baz.md and
- // create the missing parent pages.
- continue;
- }
- $directory_index_idx = $this->choose_directory_index( $this->pending_files );
- if ( -1 === $directory_index_idx ) {
- $this->pending_directory_index = false;
- } else {
- $this->pending_directory_index = $this->pending_files[ $directory_index_idx ];
- unset( $this->pending_files[ $directory_index_idx ] );
- }
- return true;
- }
-
- return false;
- }
- return false;
- }
-
- protected function choose_directory_index( $files ) {
- foreach ( $files as $idx => $file ) {
- if ( $this->looks_like_directory_index( $file ) ) {
- return $idx;
- }
- }
- return -1;
- }
-
- protected function looks_like_directory_index( $path ) {
- $filename = basename( $path );
- foreach ( $this->index_file_patterns as $pattern ) {
- if ( preg_match( $pattern, $filename ) ) {
- return true;
- }
- }
- return false;
- }
-
- protected function choose_relevant_files( $paths ) {
- return array_filter( $paths, array( $this, 'is_valid_file' ) );
- }
-
- protected function is_valid_file( $path ) {
- $extension = pathinfo( $path, PATHINFO_EXTENSION );
- return in_array( $extension, $this->allowed_extensions, true );
- }
-
- /**
- * @TODO: Either implement this method, or introduce a concept of
- * reentrant and non-reentrant entity readers.
- */
- public function get_reentrancy_cursor() {
- return '';
- }
-
- public function current(): mixed {
- if ( null === $this->entity && ! $this->is_finished ) {
- $this->next();
- }
- return $this->get_entity();
- }
-
- public function next(): void {
- $this->next_entity();
- }
-
- public function key(): int {
- return $this->entities_read_so_far - 1;
- }
-
- public function valid(): bool {
- return ! $this->is_finished;
- }
-
- public function rewind(): void {
- // @TODO: Either implement this method, or formalize the fact that
- // entity readers are not rewindable.
- }
-}
diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php
new file mode 100644
index 0000000000..2ac427e726
--- /dev/null
+++ b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php
@@ -0,0 +1,167 @@
+filesystem = $filesystem;
+ $this->post_type = $options['post_type'] ?? 'page';
+ $this->post_tree = WP_Filesystem_To_Post_Tree::create(
+ $this->filesystem,
+ array_merge(
+ array(
+ 'root_parent_id' => null,
+ 'filter_pattern' => '#\.(?:md|html|xhtml|png|jpg|jpeg|gif|svg|webp|mp4)$#',
+ 'index_file_pattern' => '#^index\.[a-z]+$#',
+ ),
+ $options['post_tree_options'] ?? array()
+ )
+ );
+ if ( false === $this->post_tree ) {
+ return false;
+ }
+ }
+
+ /**
+ * Returns the last error encountered by this reader.
+ *
+ * Error reporting is not implemented yet, so this always returns null.
+ *
+ * @return string|null Always null for now.
+ */
+ public function get_last_error(): ?string {
+ // @TODO: Implement this.
+ return null;
+ }
+
+ /**
+ * Returns the entity most recently selected by next_entity().
+ *
+ * @return mixed The value last stored in $this->current_entity.
+ */
+ public function get_entity() {
+ return $this->current_entity;
+ }
+
+ /**
+ * Tells whether the reader has run out of entities.
+ *
+ * The flag is raised by next_entity() once the post tree yields no more nodes.
+ *
+ * @return bool The current value of $this->finished.
+ */
+ public function is_finished(): bool {
+ return $this->finished;
+ }
+
+	/**
+	 * Advances to the next entity derived from the filesystem.
+	 *
+	 * Entities are produced in batches, one batch per post tree node, and
+	 * buffered in $this->entities:
+	 *
+	 * - `.md`, `.xhtml`, and `.html` files are converted to block markup and
+	 *   passed through WP_Block_Markup_Entity_Reader. The resulting post entity
+	 *   is amended with the node's ID, parent, GUID, status, and post type.
+	 * - Any other file becomes an attachment post plus a `local_file_path`
+	 *   post meta entity.
+	 * - Nodes that are not files become posts with empty content.
+	 *
+	 * Every converted or non-file node also gets `local_file_path`,
+	 * `source_type`, and `source_content_converter` post meta entities.
+	 *
+	 * @return bool True when an entity is available via get_entity(), false
+	 *              once the post tree is exhausted.
+	 * @throws Exception When a source file cannot be converted to block markup.
+	 */
+	public function next_entity(): bool {
+		while ( true ) {
+			// Drain the buffered entities before asking for another node.
+			if ( count( $this->entities ) > 0 ) {
+				$this->current_entity = array_shift( $this->entities );
+				return true;
+			}
+
+			if ( ! $this->post_tree->next_node() ) {
+				$this->finished = true;
+				return false;
+			}
+
+			$source_content_converter = null;
+			$post_tree_node           = $this->post_tree->get_current_node();
+			$local_file_path          = $post_tree_node['local_file_path'];
+			if ( $post_tree_node['type'] === 'file' ) {
+				$extension = pathinfo( $local_file_path, PATHINFO_EXTENSION );
+				switch ( $extension ) {
+					case 'md':
+						$content                  = $this->filesystem->get_contents( $local_file_path );
+						$converter                = new WP_Markdown_To_Blocks( $content );
+						$source_content_converter = 'md';
+						$source_format_name       = 'Markdown';
+						break;
+					case 'xhtml':
+						$content                  = $this->filesystem->get_contents( $local_file_path );
+						$converter                = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $content ) );
+						$source_content_converter = 'xhtml';
+						$source_format_name       = 'XHTML';
+						break;
+					case 'html':
+						$content                  = $this->filesystem->get_contents( $local_file_path );
+						$converter                = new WP_HTML_To_Blocks( WP_HTML_Processor::create_fragment( $content ) );
+						$source_content_converter = 'html';
+						$source_format_name       = 'HTML';
+						break;
+					default:
+						$file_name = basename( $local_file_path );
+
+						// Keep the generic MIME type unless WordPress recognizes
+						// the file. An unrecognized file comes back with a falsy
+						// `type`, which must not replace the fallback.
+						$mime_type = 'application/octet-stream';
+						if ( function_exists( 'wp_check_filetype' ) ) {
+							$checked_type = wp_check_filetype( $file_name, null );
+							if ( ! empty( $checked_type['type'] ) ) {
+								$mime_type = $checked_type['type'];
+							}
+						}
+
+						// Guarded like wp_check_filetype() above so this branch
+						// does not fatal when WordPress functions are unavailable.
+						$attachment_title = function_exists( 'sanitize_file_name' )
+							? sanitize_file_name( $file_name )
+							: $file_name;
+
+						$this->entities[] = new WP_Imported_Entity(
+							'post',
+							array(
+								'post_id'        => $post_tree_node['post_id'],
+								'post_title'     => $attachment_title,
+								'post_status'    => 'inherit',
+								'post_content'   => '',
+								'post_mime_type' => $mime_type,
+								'post_type'      => 'attachment',
+								'post_parent'    => $post_tree_node['parent_id'],
+								'guid'           => $local_file_path,
+								// The importer will use the same Filesystem instance to
+								// source the attachment.
+								'attachment_url' => 'file://' . $local_file_path,
+							)
+						);
+						$this->entities[] = new WP_Imported_Entity(
+							'post_meta',
+							array(
+								'post_id' => $post_tree_node['post_id'],
+								'key'     => 'local_file_path',
+								'value'   => $local_file_path,
+							)
+						);
+						// We're done emitting the entity.
+						// wp_generate_attachment_metadata() et al. will be called by the
+						// importer at the database insertion step.
+						//
+						// `continue 2` skips the switch and restarts the outer loop,
+						// which then drains the entities buffered above.
+						continue 2;
+				}
+
+				if ( false === $converter->convert() ) {
+					// Name the actual source format; the Markdown message is unchanged.
+					throw new Exception( 'Failed to convert ' . $source_format_name . ' to blocks' );
+				}
+				$markup   = $converter->get_block_markup();
+				$metadata = $converter->get_all_metadata();
+			} else {
+				$markup   = '';
+				$metadata = array();
+				// @TODO: Accept an option to set what should we default to.
+				$source_content_converter = 'html';
+			}
+
+			$reader = new WP_Block_Markup_Entity_Reader(
+				$markup,
+				$metadata,
+				$post_tree_node['post_id']
+			);
+			while ( $reader->next_entity() ) {
+				$entity = $reader->get_entity();
+				$data   = $entity->get_data();
+				if ( $entity->get_type() === 'post' ) {
+					$data['id']          = $post_tree_node['post_id'];
+					$data['guid']        = $local_file_path;
+					$data['post_parent'] = $post_tree_node['parent_id'];
+					$data['post_status'] = 'publish';
+					$data['post_type']   = $this->post_type;
+					// Fall back to a title derived from the file name when the
+					// converted content did not provide one.
+					if ( empty( $data['post_title'] ) ) {
+						$data['post_title'] = WP_Import_Utils::slug_to_title( basename( $local_file_path ) );
+					}
+					$entity = new WP_Imported_Entity( $entity->get_type(), $data );
+				}
+				$this->entities[] = $entity;
+			}
+
+			// Also emit post meta describing where the post came from.
+			$additional_meta = array(
+				'local_file_path'          => $local_file_path,
+				'source_type'              => $post_tree_node['type'],
+				'source_content_converter' => $source_content_converter,
+			);
+			foreach ( $additional_meta as $key => $value ) {
+				$this->entities[] = new WP_Imported_Entity(
+					'post_meta',
+					array(
+						'post_id' => $post_tree_node['post_id'],
+						'key'     => $key,
+						'value'   => $value,
+					)
+				);
+			}
+		}
+	}
+}
diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php
new file mode 100644
index 0000000000..0980d228c1
--- /dev/null
+++ b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php
@@ -0,0 +1,413 @@
+ 100,
+ * 'filter_pattern' => '/\.md$/',
+ * 'index_file_pattern' => '#^index\.md$#',
+ * ];
+ * $reader = WP_Filesystem_To_Post_Tree::create($filesystem, $options);
+ * while ($reader->next_node()) {
+ * $current_node = $reader->get_current_node();
+ * // Process the node, e.g., create a post from it.
+ * }
+ * ```
+ */
+class WP_Filesystem_To_Post_Tree {
+ /**
+ * The filesystem instance to read from.
+ *
+ * @var WP_Abstract_Filesystem
+ */
+ private $fs;
+
+ /**
+ * Visitor for traversing the filesystem.
+ *
+ * @var WP_Filesystem_Visitor
+ */
+ private $file_visitor;
+
+ /**
+ * The current node being processed.
+ *
+ * @var array|null
+ */
+ private $current_node;
+
+ /**
+ * Files pending processing.
+ *
+ * @var array
+ */
+ private $pending_files = array();
+
+ /**
+ * The directory index to emit next. A string is the path of the index file
+ * to emit. False means the current directory has no matching index file and
+ * a placeholder node is emitted instead. Null means no directory index is
+ * pending.
+ *
+ * @var string|false|null
+ */
+ private $pending_directory_index;
+
+ /**
+ * A stack of post IDs emitted at each directory depth up to the currently processed
+ * directory.
+ *
+ * @var array
+ */
+ private $parent_ids = array();
+
+ /**
+ * The next post ID to assign.
+ *
+ * @var int
+ */
+ private $next_post_id;
+
+ /**
+ * Flag to determine if an index page should be created when no index file is found
+ * in a directory.
+ *
+ * @var bool
+ */
+ private $create_index_pages;
+
+ /**
+ * Counter for entities read so far.
+ *
+ * @var int
+ */
+ private $entities_read_so_far = 0;
+
+ /**
+ * Pattern to filter files.
+ *
+ * @var string
+ */
+ private $filter_pattern = '##';
+
+ /**
+ * Pattern to identify index files.
+ *
+ * @var string
+ */
+ private $index_file_pattern = '##';
+
+ /**
+ * Flag to indicate if processing is finished.
+ *
+ * @var bool
+ */
+ private $is_finished = false;
+
+	/**
+	 * Creates a new instance of WP_Filesystem_To_Post_Tree.
+	 *
+	 * When `first_post_id` is omitted it defaults to 2, or, when get_posts()
+	 * is available and returns a post, to that post's ID plus one.
+	 *
+	 * @param WP_Abstract_Filesystem $filesystem The filesystem to traverse.
+	 * @param array                  $options    Configuration options. {
+	 *     $first_post_id      => int      The ID of the first post to emit. Must be greater than 1.
+	 *     $filter_pattern     => string   Required. A regular expression file paths must match.
+	 *     $index_file_pattern => string   Required. A regular expression matched against file
+	 *                                     names to identify index files.
+	 *     $root_parent_id     => int|null The ID of the root parent post.
+	 *     $create_index_pages => bool     Whether to create index pages when no index file is found.
+	 * }
+	 * @return WP_Filesystem_To_Post_Tree|false The created instance or false on failure.
+	 */
+	public static function create(
+		\WordPress\Filesystem\WP_Abstract_Filesystem $filesystem,
+		$options
+	) {
+		if ( ! isset( $options['first_post_id'] ) ) {
+			$options['first_post_id'] = 2;
+			if ( function_exists( 'get_posts' ) ) {
+				// NOTE(review): get_posts() applies its default post_status
+				// filter here – confirm that posts it leaves out (drafts,
+				// attachments, ...) cannot hold a higher ID and collide.
+				$max_id = get_posts(
+					array(
+						'post_type'      => 'any',
+						'posts_per_page' => 1,
+						'fields'         => 'ids',
+						'orderby'        => 'ID',
+						'order'          => 'DESC',
+					)
+				);
+				if ( ! empty( $max_id ) ) {
+					$options['first_post_id'] = $max_id[0] + 1;
+				}
+			}
+		}
+
+		// Enforce what the message states: anything up to and including 1 is
+		// rejected, not only the literal integer 1.
+		if ( $options['first_post_id'] <= 1 ) {
+			// __METHOD__ reports "WP_Filesystem_To_Post_Tree::create" rather than just "create".
+			_doing_it_wrong( __METHOD__, 'First node ID must be greater than 1', '1.0.0' );
+			return false;
+		}
+
+		foreach ( array( 'filter_pattern', 'index_file_pattern' ) as $required_option ) {
+			if ( ! isset( $options[ $required_option ] ) ) {
+				_doing_it_wrong( __METHOD__, 'Missing required options: ' . $required_option, '1.0.0' );
+				return false;
+			}
+		}
+
+		return new self( $filesystem, $options );
+	}
+
+	/**
+	 * Stores the filesystem and the traversal settings.
+	 *
+	 * Private: instances are obtained through create(), which validates the
+	 * options before calling this constructor.
+	 *
+	 * @param WP_Abstract_Filesystem $filesystem The filesystem to traverse.
+	 * @param array                  $options    Configuration options.
+	 */
+	private function __construct(
+		WP_Abstract_Filesystem $filesystem,
+		$options
+	) {
+		$this->fs           = $filesystem;
+		$this->file_visitor = new WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem );
+
+		$this->next_post_id       = $options['first_post_id'];
+		$this->filter_pattern     = $options['filter_pattern'];
+		$this->index_file_pattern = $options['index_file_pattern'];
+		$this->create_index_pages = isset( $options['create_index_pages'] )
+			? $options['create_index_pages']
+			: true;
+
+		// The root parent lives at index -1 of the parent ID stack.
+		if ( isset( $options['root_parent_id'] ) ) {
+			$this->parent_ids[-1] = $options['root_parent_id'];
+		}
+	}
+
+ /**
+ * Retrieves the current node being processed.
+ *
+ * A node is the array passed to emit_object() – in this class that is
+ * `type`, `local_file_path`, and `parent_id` – with `post_id` added.
+ * next_node() resets it to null at the start of every call.
+ *
+ * @return array|null The current node or null if none.
+ */
+ public function get_current_node() {
+ return $this->current_node;
+ }
+
+ /**
+ * Advances to the next node in the filesystem.
+ *
+ * Each call emits at most one node, in this order of precedence:
+ *
+ * 1. While a directory index is pending: a `directory` node for a missing
+ * ancestor, a `file_placeholder` node when the directory has no index
+ * file, or a `file` node for the index file itself.
+ * 2. Otherwise, the next entry of $this->pending_files as a `file` node.
+ * 3. Otherwise, next_file() is asked to move on to another directory.
+ *
+ * @return bool True if a node is found, false if processing is complete.
+ */
+ public function next_node() {
+ $this->current_node = null;
+ if ( $this->is_finished ) {
+ return false;
+ }
+ while ( true ) {
+ if ( null !== $this->pending_directory_index ) {
+ $dir = $this->file_visitor->get_event()->dir;
+ $depth = $this->file_visitor->get_current_depth();
+ $parent_id = $this->parent_ids[ $depth - 1 ] ?? null;
+ if ( null === $parent_id && $depth > 1 ) {
+ // There's no parent ID even though we're a few levels deep.
+ // This is a scenario where `next_file()` skipped a few levels
+ // of directories with no relevant content in them:
+ //
+ // - /docs/
+ // - /foo/
+ // - /bar/
+ // - /baz.md
+ //
+ // In this case, we need to backtrack and create the missing
+ // parent pages for /bar/ and /foo/.
+
+ // Find the topmost missing parent ID
+ $missing_parent_id_depth = 1;
+ while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) {
+ ++$missing_parent_id_depth;
+ }
+
+ // Move up to the corresponding directory
+ $missing_parent_path = $dir;
+ for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) {
+ $missing_parent_path = dirname( $missing_parent_path );
+ }
+
+ $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_object(
+ array(
+ 'type' => 'directory',
+ 'local_file_path' => $missing_parent_path,
+ 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ] ?? null,
+ )
+ );
+ // $this->pending_directory_index is left untouched here, so the
+ // next call re-enters this block for the following missing level
+ // or for the directory itself.
+ } elseif ( false === $this->pending_directory_index ) {
+ // No directory index candidate – let's create a fake page
+ // just to have something in the page tree.
+ $this->parent_ids[ $depth ] = $this->emit_object(
+ array(
+ 'type' => 'file_placeholder',
+ 'local_file_path' => $dir,
+ 'parent_id' => $parent_id,
+ )
+ );
+ // We're no longer looking for a directory index.
+ $this->pending_directory_index = null;
+ } else {
+ $file_path = $this->pending_directory_index;
+ $this->parent_ids[ $depth ] = $this->emit_object(
+ array(
+ 'type' => 'file',
+ 'local_file_path' => $file_path,
+ 'parent_id' => $parent_id,
+ )
+ );
+ // We're no longer looking for a directory index.
+ $this->pending_directory_index = null;
+ }
+ return true;
+ }
+ // Emits a single pending file per call – the loop body always returns.
+ while ( count( $this->pending_files ) ) {
+ $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null;
+ $file_path = array_shift( $this->pending_files );
+ $this->emit_object(
+ array(
+ 'type' => 'file',
+ 'local_file_path' => $file_path,
+ 'parent_id' => $parent_id,
+ )
+ );
+ return true;
+ }
+
+ // Nothing is buffered – move on to the next directory, or finish.
+ if ( false === $this->next_file() ) {
+ break;
+ }
+ }
+ $this->is_finished = true;
+ return false;
+ }
+
+	/**
+	 * Makes the given node the current node and assigns it the next post ID.
+	 *
+	 * The node is stored in $this->current_node with a `post_id` entry added;
+	 * nothing is written to WordPress here.
+	 *
+	 * @param array $options The node data, e.g. `type`, `local_file_path`, `parent_id`.
+	 * @return int The post ID assigned to the node.
+	 */
+	protected function emit_object( $options ) {
+		$assigned_id = $this->next_post_id++;
+
+		$this->current_node = array_merge( $options, array( 'post_id' => $assigned_id ) );
+		++$this->entities_read_so_far;
+
+		return $assigned_id;
+	}
+
+	/**
+	 * Moves the visitor to the next directory that has relevant entries.
+	 *
+	 * On success, $this->pending_files holds the directory's subdirectories
+	 * and the files matching the filter pattern, and
+	 * $this->pending_directory_index holds either the chosen index file path
+	 * or false when there is none.
+	 *
+	 * @return bool True when a directory with relevant entries was entered,
+	 *              false when the traversal is over.
+	 */
+	private function next_file() {
+		$this->pending_files = array();
+		while ( $this->file_visitor->next() ) {
+			$event = $this->file_visitor->get_event();
+
+			if ( $event->is_exiting() ) {
+				// Clean up stale IDs to save some memory when processing
+				// large directory trees.
+				//
+				// NOTE(review): $this->parent_ids is keyed by depth everywhere
+				// else in this class, so unsetting by directory path looks like
+				// a no-op – confirm the intended key.
+				unset( $this->parent_ids[ $event->dir ] );
+				continue;
+			}
+
+			// Any event that is neither "exiting" nor "entering" ends the traversal.
+			if ( ! $event->is_entering() ) {
+				return false;
+			}
+
+			$abs_paths = array();
+			foreach ( $event->files as $filename ) {
+				$abs_paths[] = wp_join_paths( $event->dir, $filename );
+			}
+
+			$relevant_paths = array();
+			foreach ( $abs_paths as $path ) {
+				// Keep every subdirectory – the directory itself may not match
+				// the filter pattern, but a descendant file might.
+				if ( $this->fs->is_dir( $path ) ) {
+					$relevant_paths[] = $path;
+				}
+
+				// Keep only the files that match the filter pattern.
+				if ( $this->fs->is_file( $path ) && preg_match( $this->filter_pattern, $path ) ) {
+					$relevant_paths[] = $path;
+				}
+			}
+			$this->pending_files = $relevant_paths;
+
+			if ( 0 === count( $this->pending_files ) ) {
+				// Only consider directories with relevant files in them.
+				// Otherwise we'll create fake pages for media directories
+				// and other directories that don't contain any content.
+				//
+				// One corner case is when there's a few levels of directories
+				// with a single relevant file at the bottom:
+				//
+				// - /docs/
+				//   - /foo/
+				//     - /bar/
+				//       - /baz.md
+				//
+				// In this case, `next_node()` will backtrack at baz.md and
+				// create the missing parent pages.
+				continue;
+			}
+
+			$index_key = $this->choose_directory_index( $this->pending_files );
+			if ( -1 !== $index_key ) {
+				// Pull the index file out of the regular queue so that it is
+				// emitted once, as the directory index.
+				$this->pending_directory_index = $this->pending_files[ $index_key ];
+				unset( $this->pending_files[ $index_key ] );
+			} else {
+				$this->pending_directory_index = false;
+			}
+			return true;
+		}
+		return false;
+	}
+
+	/**
+	 * Picks the entry that should act as the directory index.
+	 *
+	 * The first entry whose name matches the index file pattern wins. When
+	 * nothing matches and index page creation is disabled, key 0 is used
+	 * as the index, provided the list is not empty.
+	 *
+	 * @param array $files List of paths to choose from.
+	 * @return int The array key of the chosen entry or -1 if none.
+	 */
+	protected function choose_directory_index( $files ) {
+		foreach ( $files as $position => $candidate ) {
+			if ( $this->looks_like_directory_index( $candidate ) ) {
+				return $position;
+			}
+		}
+
+		$promote_first_entry = ! $this->create_index_pages && count( $files ) > 0;
+
+		return $promote_first_entry ? 0 : -1;
+	}
+
+	/**
+	 * Determines if a file path matches the index file pattern.
+	 *
+	 * Only the last path segment is tested against the pattern.
+	 *
+	 * @param string $path The file path to check.
+	 * @return bool True if it matches, false otherwise (including when
+	 *              preg_match() fails on an invalid pattern).
+	 */
+	protected function looks_like_directory_index( $path ) {
+		// preg_match() returns 1, 0, or false – normalize to the documented bool.
+		return 1 === preg_match( $this->index_file_pattern, basename( $path ) );
+	}
+
+ /**
+ * Finds a node in the filesystem tree by its path.
+ *
+ * NOTE(review): this is an empty stub – the body holds only a placeholder
+ * comment, so the method currently returns null for every path, and nothing
+ * in this class calls it. Implement it or remove it.
+ *
+ * @param string $path The path to search for.
+ * @return array|null Intended: the found node or null if not found. Currently always null.
+ */
+ private function find_node($path) {
+ // existing code...
+ }
+}
diff --git a/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php b/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php
new file mode 100644
index 0000000000..6201cb70cb
--- /dev/null
+++ b/packages/playground/data-liberation/tests/WPFilesystemToPostTreeTests.php
@@ -0,0 +1,40 @@
+ 2,
+ 'create_index_pages' => true,
+ 'filter_pattern' => '#\.html$#',
+ 'index_file_pattern' => '#root.html#',
+ ]
+ );
+ $posts = [];
+ while ( $reader->next_node() ) {
+ $posts[] = $reader->get_current_node();
+ }
+ $this->assertCount(3, $posts);
+
+ // Root index page
+ $this->assertEquals(2, $posts[0]['post_id']);
+ $this->assertNull($posts[0]['parent_id']);
+ $this->assertEquals('file', $posts[0]['type']);
+
+ // Nested directory page
+ $this->assertEquals(3, $posts[1]['post_id']);
+ $this->assertEquals(2, $posts[1]['parent_id']);
+ $this->assertEquals('file_placeholder', $posts[1]['type']);
+
+ // Leaf page
+ $this->assertEquals(4, $posts[2]['post_id']);
+ $this->assertEquals(3, $posts[2]['parent_id']);
+ $this->assertEquals('file', $posts[2]['type']);
+ }
+
+}
diff --git a/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/nested/page1.html b/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/nested/page1.html
new file mode 100644
index 0000000000..a76ff59751
--- /dev/null
+++ b/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/nested/page1.html
@@ -0,0 +1,2 @@
+
Page 1
+This is page 1.
diff --git a/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/root.html b/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/root.html
new file mode 100644
index 0000000000..5666bc9ad6
--- /dev/null
+++ b/packages/playground/data-liberation/tests/fixtures/filesystem-entity-reader/root.html
@@ -0,0 +1,2 @@
+Root
+This is the root page.