<?php

namespace Drupal\tampers_extra\Plugin\Tamper;

use Drupal\Core\Form\FormStateInterface;
use Drupal\tamper\TamperableItemInterface;
use Drupal\tamper\TamperBase;

/**
 * Plugin implementation of the Text Between Regex plugin.
 *
 * @Tamper(
 *   id = "text_between_regex",
 *   label = @Translation("Text between regex"),
 *   description = @Translation("Extracts text between upper and lower regex patterns."),
 *   category = "Text"
 * )
 */
class TextBetweenRegex extends TamperBase
{

    const SETTING_START_REGEX = 'start_regex';
    const SETTING_END_REGEX = 'end_regex';
    const SETTING_INCLUDE_DELIMITERS = 'include_delimiters';
    const SETTING_MATCH_INDEX = 'match_index';
    const SETTING_DEFAULT_VALUE = 'default_value';
    const SETTING_FIX_HTML = 'fix_html';

    /**
     * {@inheritdoc}
     */
    public function defaultConfiguration(): array
    {
        return [
            self::SETTING_START_REGEX => '',
            self::SETTING_END_REGEX => '',
            self::SETTING_INCLUDE_DELIMITERS => FALSE,
            self::SETTING_MATCH_INDEX => 0,
            self::SETTING_DEFAULT_VALUE => '',
            self::SETTING_FIX_HTML => FALSE,
        ];
    }

    /**
     * {@inheritdoc}
     */
    public function buildConfigurationForm(array $form, FormStateInterface $form_state): array
    {
        $form[self::SETTING_START_REGEX] = [
            '#type' => 'textfield',
            '#title' => $this->t('Start pattern (lower limit)'),
            '#description' => $this->t('A regular expression pattern to match the START of the text to extract. Example: /\<description\>/ or /START:/'),
            '#default_value' => $this->configuration[self::SETTING_START_REGEX],
            '#maxlength' => 1024,
            '#required' => TRUE,
        ];

        $form[self::SETTING_END_REGEX] = [
            '#type' => 'textfield',
            '#title' => $this->t('End pattern (upper limit)'),
            '#description' => $this->t('A regular expression pattern to match the END of the text to extract. Example: /\<\/description\>/ or /:END/'),
            '#default_value' => $this->configuration[self::SETTING_END_REGEX],
            '#maxlength' => 1024,
            '#required' => TRUE,
        ];

        $form[self::SETTING_INCLUDE_DELIMITERS] = [
            '#type' => 'checkbox',
            '#title' => $this->t('Include delimiters in result'),
            '#description' => $this->t('If checked, the matched start and end patterns will be included in the result.'),
            '#default_value' => $this->configuration[self::SETTING_INCLUDE_DELIMITERS],
        ];

        $form[self::SETTING_MATCH_INDEX] = [
            '#type' => 'number',
            '#title' => $this->t('Match index'),
            '#description' => $this->t('If multiple matches are found, which match to return (0-based index). Use -1 to return all matches as an array.'),
            '#default_value' => $this->configuration[self::SETTING_MATCH_INDEX],
            '#min' => -1,
        ];

        $form[self::SETTING_DEFAULT_VALUE] = [
            '#type' => 'textfield',
            '#title' => $this->t('Default value'),
            '#description' => $this->t('The value to return if no match is found.'),
            '#default_value' => $this->configuration[self::SETTING_DEFAULT_VALUE],
        ];

        $form[self::SETTING_FIX_HTML] = [
            '#type' => 'checkbox',
            '#title' => $this->t('Fix broken HTML'),
            '#description' => $this->t('If checked, attempt to fix broken/malformed HTML in the extracted text using DOMDocument.'),
            '#default_value' => $this->configuration[self::SETTING_FIX_HTML],
        ];

        return $form;
    }

    /**
     * {@inheritdoc}
     *
     * Rejects a start or end pattern that PCRE cannot compile, so a broken
     * expression is reported on the form rather than at processing time.
     */
    public function validateConfigurationForm(array &$form, FormStateInterface $form_state): void
    {
        // Cast so a missing (NULL) value is checked as an empty pattern.
        $start_regex = (string) $form_state->getValue(self::SETTING_START_REGEX);
        $end_regex = (string) $form_state->getValue(self::SETTING_END_REGEX);

        if (!$this->isValidRegex($start_regex)) {
            $form_state->setErrorByName(self::SETTING_START_REGEX, $this->t('Invalid start regular expression.'));
        }

        if (!$this->isValidRegex($end_regex)) {
            $form_state->setErrorByName(self::SETTING_END_REGEX, $this->t('Invalid end regular expression.'));
        }
    }

    /**
     * {@inheritdoc}
     *
     * Submitted values are cast to the types used by defaultConfiguration()
     * (strings, booleans and an integer index) before being stored.
     */
    public function submitConfigurationForm(array &$form, FormStateInterface $form_state): void
    {
        parent::submitConfigurationForm($form, $form_state);
        $this->setConfiguration([
            self::SETTING_START_REGEX => (string) $form_state->getValue(self::SETTING_START_REGEX),
            self::SETTING_END_REGEX => (string) $form_state->getValue(self::SETTING_END_REGEX),
            self::SETTING_INCLUDE_DELIMITERS => (bool) $form_state->getValue(self::SETTING_INCLUDE_DELIMITERS),
            self::SETTING_MATCH_INDEX => (int) $form_state->getValue(self::SETTING_MATCH_INDEX),
            self::SETTING_DEFAULT_VALUE => (string) $form_state->getValue(self::SETTING_DEFAULT_VALUE),
            self::SETTING_FIX_HTML => (bool) $form_state->getValue(self::SETTING_FIX_HTML),
        ]);
    }

    /**
     * {@inheritdoc}
     *
     * Returns the configured default value when the input is NULL, an empty
     * string, or neither a string nor numeric; when either pattern is empty;
     * when nothing matches; or when the requested match index does not exist.
     * With a match index of -1 the full list of matches is returned as an
     * array; otherwise the single match at that 0-based index is returned.
     */
    public function tamper($data, ?TamperableItemInterface $item = NULL): mixed
    {
        $default_value = $this->configuration[self::SETTING_DEFAULT_VALUE] ?? '';

        // Only non-empty strings and numeric values can be searched.
        if ($data === NULL || $data === '' || (!is_string($data) && !is_numeric($data))) {
            return $default_value;
        }

        // Normalise the settings up front: extractTextBetween() declares
        // string/bool parameters and would throw a TypeError on NULL.
        $start_regex = (string) ($this->configuration[self::SETTING_START_REGEX] ?? '');
        $end_regex = (string) ($this->configuration[self::SETTING_END_REGEX] ?? '');
        $include_delimiters = (bool) ($this->configuration[self::SETTING_INCLUDE_DELIMITERS] ?? FALSE);
        $match_index = (int) ($this->configuration[self::SETTING_MATCH_INDEX] ?? 0);
        $fix_html = (bool) ($this->configuration[self::SETTING_FIX_HTML] ?? FALSE);

        // The default configuration leaves both patterns empty, and an empty
        // pattern makes the preg_* functions raise a warning on every item.
        // Treat an unconfigured plugin as "nothing matched" instead.
        if ($start_regex === '' || $end_regex === '') {
            return $default_value;
        }

        $matches = $this->extractTextBetween((string) $data, $start_regex, $end_regex, $include_delimiters);

        if (empty($matches)) {
            return $default_value;
        }

        // Apply HTML fix if enabled.
        if ($fix_html) {
            $matches = array_map([$this, 'fixHtml'], $matches);
        }

        // Return all matches if index is -1.
        if ($match_index === -1) {
            return $matches;
        }

        // Return the requested match, or the default when the index is out of
        // range.
        return $matches[$match_index] ?? $default_value;
    }

    /**
     * Checks whether a pattern can be compiled by PCRE.
     *
     * @param string $pattern
     *   The regular expression, including its delimiters.
     *
     * @return bool
     *   TRUE when the pattern is non-empty and preg_match() accepts it.
     */
    protected function isValidRegex(string $pattern): bool
    {
        if ($pattern === '') {
            return FALSE;
        }

        // preg_match() returns FALSE and raises a warning for a pattern that
        // fails to compile. The warning is suppressed because the caller
        // reports the problem itself.
        return @preg_match($pattern, '') !== FALSE;
    }

    /**
     * Extract text between two regex patterns.
     *
     * Every match of the start pattern is paired with the first match of the
     * end pattern found after it. A start match with no end match after it is
     * skipped. Because each start match is handled independently, start
     * matches that share the same end match produce overlapping segments.
     *
     * The end pattern is evaluated against the part of the input that follows
     * the start match, so anchors in the end pattern are relative to that
     * point rather than to the beginning of the input.
     *
     * @param string $data
     *   The input string.
     * @param string $start_regex
     *   The start pattern regex.
     * @param string $end_regex
     *   The end pattern regex.
     * @param bool $include_delimiters
     *   Whether to include the matched delimiters.
     *
     * @return array
     *   Array of matched text segments, in order of their start match. Empty
     *   when the start pattern does not match or fails to execute.
     */
    protected function extractTextBetween(string $data, string $start_regex, string $end_regex, bool $include_delimiters): array
    {
        $results = [];

        // Find all start pattern matches with their positions. Both "no match"
        // (0) and "error" (FALSE) yield an empty result.
        if (!preg_match_all($start_regex, $data, $start_matches, PREG_OFFSET_CAPTURE)) {
            return $results;
        }

        foreach ($start_matches[0] as [$start_text, $start_pos]) {
            // PREG_OFFSET_CAPTURE reports byte offsets, so the byte-based
            // strlen()/substr() are the correct functions here (not mb_*).
            $remaining = substr($data, $start_pos + strlen($start_text));

            // Find the first end pattern match after the start delimiter.
            if (!preg_match($end_regex, $remaining, $end_match, PREG_OFFSET_CAPTURE)) {
                continue;
            }

            [$end_text, $end_pos_in_remaining] = $end_match[0];

            // Everything before the end match is the text in between.
            $between_text = substr($remaining, 0, $end_pos_in_remaining);

            $results[] = $include_delimiters
                ? $start_text . $between_text . $end_text
                : $between_text;
        }

        return $results;
    }

    /**
     * Fix broken/malformed HTML using DOMDocument.
     *
     * The fragment is wrapped in a div, parsed as UTF-8 HTML, and the children
     * of the first div in the parsed document are serialised back to a string.
     *
     * @param string $html
     *   The potentially broken HTML string.
     *
     * @return string
     *   The re-serialised HTML with surrounding whitespace trimmed, or the
     *   input unchanged when it is blank or no div is found after parsing.
     */
    protected function fixHtml(string $html): string
    {
        if (trim($html) === '') {
            return $html;
        }

        // Collect libxml parse errors internally instead of emitting PHP
        // warnings, and always restore the previous setting afterwards.
        $previous_use_errors = libxml_use_internal_errors(TRUE);

        $doc = new \DOMDocument();
        try {
            // The XML encoding declaration makes the parser read the input as
            // UTF-8; the flags stop it adding html/body elements or a doctype.
            $doc->loadHTML(
                '<?xml encoding="UTF-8"><div>' . $html . '</div>',
                LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
            );
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous_use_errors);
        }

        // Get the inner HTML of the wrapper div.
        $xpath = new \DOMXPath($doc);
        $wrapper = $xpath->query('//div')->item(0);

        if (!$wrapper) {
            return $html;
        }

        $result = '';
        foreach ($wrapper->childNodes as $child) {
            $result .= $doc->saveHTML($child);
        }

        return trim($result);
    }
}
