"""
This module provides :class:`GitIgnoreSpecPattern` which implements Git's
`gitignore`_ patterns, and handles edge-cases where Git's behavior differs from
what's documented. Git allows including files from excluded directories which
appears to contradict the documentation. Git discards patterns with invalid
range notation. This is used by :class:`~pathspec.gitignore.GitIgnoreSpec` to
fully replicate Git's handling.

.. _`gitignore`: https://git-scm.com/docs/gitignore
"""

from typing import (
	Optional)  # Replaced by `X | None` in 3.10.

from pathspec._typing import (
	AnyStr,  # Removed in 3.18.
	assert_unreachable,
	override)  # Added in 3.12.

from .base import (
	GitIgnorePatternError,
	_BYTES_ENCODING,
	_GitIgnoreBasePattern,
	_RangeError)

_DIR_MARK = 'ps_d'
"""
The name of the regex capture group that marks a directory match. This is
only used by :class:`GitIgnoreSpec`.
"""

_DIR_MARK_CG = '(?P<' + _DIR_MARK + '>/)'
"""
This regular expression captures a single slash ('/') in the directory marker
group.
"""

_DIR_MARK_OPT = '(?:' + _DIR_MARK_CG + '|$)'
"""
This regular expression matches either the directory marker (a slash captured
in the directory marker group) or the end of the string.
"""


class GitIgnoreSpecPattern(_GitIgnoreBasePattern):
	"""
	The :class:`GitIgnoreSpecPattern` class represents a compiled gitignore
	pattern with special handling for edge-cases to replicate Git's behavior.

	This is registered under the deprecated name "gitwildmatch" for backward
	compatibility with v0.12. The registered name will be removed in a future
	version.
	"""

	# Keep the dict-less class hierarchy.
	__slots__ = ()

	@staticmethod
	def __normalize_segments(
		is_dir_pattern: bool,
		pattern_segs: list[str],
	) -> tuple[Optional[list[str]], Optional[str]]:
		"""
		Normalize the pattern segments to make processing easier.

		*is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
		pattern (i.e., ends with a slash '/').

		*pattern_segs* (:class:`list` of :class:`str`) contains the pattern
		segments. This may be modified in place.

		Raises :class:`ValueError` if no segments remain after normalization.

		Returns a :class:`tuple` containing either:

		- The normalized segments (:class:`list` of :class:`str`; or :data:`None`).

		- The regular expression override (:class:`str` or :data:`None`).

		Exactly one of the two is set: when a regular expression override is
		returned, the segments are :data:`None`, and vice versa.
		"""
		if not pattern_segs[0]:
			# A pattern beginning with a slash ('/') should match relative to the root
			# directory. Remove the empty first segment to make the pattern relative
			# to root.
			del pattern_segs[0]

		elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
			# A single segment pattern with or without a trailing slash ('/') will
			# match any descendant path. This is equivalent to "**/{pattern}". Prepend
			# a double-asterisk segment to make the pattern relative to root.
			if pattern_segs[0] != '**':
				pattern_segs.insert(0, '**')

		else:
			# A pattern without a beginning slash ('/') but contains at least one
			# prepended directory (e.g., "dir/{pattern}") should match relative to the
			# root directory. No segment modification is needed.
			pass

		if not pattern_segs:
			# After normalization, we end up with no pattern at all. This must be
			# because the pattern is invalid.
			raise ValueError("Pattern normalized to nothing.")

		if not pattern_segs[-1]:
			# A pattern ending with a slash ('/') will match all descendant paths if
			# it is a directory but not if it is a regular file. This is equivalent to
			# "{pattern}/**". Set the empty last segment to a double-asterisk to
			# include all descendants.
			pattern_segs[-1] = '**'

		# EDGE CASE: Collapse duplicate double-asterisk sequences (i.e., '**/**').
		# Iterate over the segments in reverse order and remove the duplicate double
		# asterisks as we go. Walking backwards keeps the not-yet-visited indices
		# valid while segments are being deleted.
		for i in range(len(pattern_segs) - 1, 0, -1):
			prev = pattern_segs[i-1]
			seg = pattern_segs[i]
			if prev == '**' and seg == '**':
				del pattern_segs[i]

		seg_count = len(pattern_segs)
		if seg_count == 1 and pattern_segs[0] == '**':
			if is_dir_pattern:
				# The pattern "**/" will be normalized to "**", but it should match
				# everything except for files in the root. Special case this pattern.
				return (None, _DIR_MARK_CG)
			else:
				# The pattern "**" will match every path. Special case this pattern.
				return (None, '.')

		elif (
			seg_count == 2
			and pattern_segs[0] == '**'
			and pattern_segs[1] == '*'
		):
			# The pattern "*" will be normalized to "**/*" and will match every
			# path. Special case this pattern for efficiency.
			return (None, '.')

		elif (
			seg_count == 3
			and pattern_segs[0] == '**'
			and pattern_segs[1] == '*'
			and pattern_segs[2] == '**'
		):
			# The pattern "*/" will be normalized to "**/*/**" which will match every
			# file not in the root directory. Special case this pattern for
			# efficiency.
			if is_dir_pattern:
				return (None, _DIR_MARK_CG)
			else:
				return (None, '/')

		# No regular expression override, return modified pattern segments.
		return (pattern_segs, None)

	@override
	@classmethod
	def pattern_to_regex(
		cls,
		pattern: AnyStr,
	) -> tuple[Optional[AnyStr], Optional[bool]]:
		"""
		Convert the pattern into a regular expression.

		*pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
		regular expression.

		Raises :class:`TypeError` if *pattern* is neither a :class:`str` nor
		:class:`bytes`.

		Raises :class:`GitIgnorePatternError` if the pattern is invalid (e.g., it
		normalizes to nothing).

		Returns a :class:`tuple` containing:

			-	*pattern* (:class:`str`, :class:`bytes` or :data:`None`) is the
				uncompiled regular expression.

			-	*include* (:class:`bool` or :data:`None`) is whether matched files
				should be included (:data:`True`), excluded (:data:`False`), or is a
				null-operation (:data:`None`).
		"""
		if isinstance(pattern, str):
			pattern_str = pattern
			return_type = str
		elif isinstance(pattern, bytes):
			pattern_str = pattern.decode(_BYTES_ENCODING)
			return_type = bytes
		else:
			raise TypeError(f"{pattern=!r} is not a unicode or byte string.")

		# Remember the pattern as given for error messages, and drop the original
		# argument so only the decoded string is used from here on.
		original_pattern = pattern_str
		del pattern

		if pattern_str.endswith('\\ '):
			# EDGE CASE: Spaces can be escaped with backslash. If the pattern ends
			# with an escaped space (a backslash followed by a space), the trailing
			# whitespace is significant and must not be stripped from the right.
			pass
		else:
			# EDGE CASE: Only trailing whitespace is removed. Leading spaces are kept
			# because Git does not remove leading spaces.
			pattern_str = pattern_str.rstrip()

		regex: Optional[str]
		include: Optional[bool]

		if not pattern_str:
			# A blank pattern is a null-operation (neither includes nor excludes
			# files).
			return (None, None)

		elif pattern_str.startswith('#'):
			# A pattern starting with a hash ('#') serves as a comment (neither
			# includes nor excludes files). Escape the hash with a backslash to match
			# a literal hash (i.e., '\#').
			return (None, None)

		if pattern_str.startswith('!'):
			# A pattern starting with an exclamation mark ('!') negates the pattern
			# (exclude instead of include). Escape the exclamation mark with a
			# backslash to match a literal exclamation mark (i.e., '\!').
			include = False
			# Remove leading exclamation mark.
			pattern_str = pattern_str[1:]
		else:
			include = True

		if pattern_str == '/':
			# EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does
			# not match any file. This is checked after the negation is removed so
			# that the negated form "!/" is also a null-operation. Otherwise "!/"
			# would be normalized to "**/" and negate every path within a directory.
			return (None, None)

		# Split pattern into segments.
		orig_segs = pattern_str.split('/')

		# Check whether the pattern is specifically a directory pattern before
		# normalization.
		is_dir_pattern = not orig_segs[-1]

		# Normalize pattern to make processing easier.
		try:
			pattern_segs, override_regex = cls.__normalize_segments(
				is_dir_pattern, orig_segs,
			)
		except ValueError as e:
			raise GitIgnorePatternError((
				f"Invalid git pattern: {original_pattern!r}"
			)) from e  # GitIgnorePatternError

		if override_regex is not None:
			# Use regex override.
			regex = override_regex

		elif pattern_segs is not None:
			# Build regular expression from pattern.
			try:
				regex_parts = cls.__translate_segments(is_dir_pattern, pattern_segs)
			except _RangeError:
				# EDGE CASE: Git discards patterns with invalid range notation. This
				# must be handled before the generic ValueError handler below.
				return (None, None)
			except ValueError as e:
				raise GitIgnorePatternError((
					f"Invalid git pattern: {original_pattern!r}"
				)) from e  # GitIgnorePatternError

			regex = ''.join(regex_parts)

		else:
			assert_unreachable((
				f"{override_regex=} and {pattern_segs=} cannot both be null."
			))  # assert_unreachable

		# Encode regex if needed so the result has the same string type as the
		# input pattern.
		out_regex: AnyStr
		if regex is not None and return_type is bytes:
			regex_bytes = regex.encode(_BYTES_ENCODING)
			out_regex = regex_bytes  # type: ignore[assignment]
		else:
			out_regex = regex  # type: ignore[assignment]

		return (out_regex, include)

	@classmethod
	def __translate_segments(
		cls,
		is_dir_pattern: bool,
		pattern_segs: list[str],
	) -> list[str]:
		"""
		Translate the pattern segments to regular expressions.

		*is_dir_pattern* (:class:`bool`) is whether the pattern is a directory
		pattern (i.e., ends with a slash '/').

		*pattern_segs* (:class:`list` of :class:`str`) contains the pattern
		segments.

		Raises :class:`_RangeError` if invalid range notation is found.

		Returns the regular expression parts (:class:`list` of :class:`str`).
		"""
		# Build regular expression from pattern.
		out_parts = []
		# Whether a slash must be emitted before the next regular segment.
		need_slash = False
		end = len(pattern_segs) - 1
		for i, seg in enumerate(pattern_segs):
			if seg == '**':
				if i == 0:
					# A normalized pattern beginning with double-asterisks ('**') will
					# match any leading path segments. The expression consumes its own
					# trailing slash, so no separator is needed before the next segment.
					out_parts.append('^(?:.+/)?')

				elif i < end:
					# A pattern with inner double-asterisks ('**') will match multiple (or
					# zero) inner path segments.
					out_parts.append('(?:/.+)?')
					need_slash = True

				else:
					assert i == end, (i, end)
					# A normalized pattern ending with double-asterisks ('**') will match
					# any trailing path segments.
					if is_dir_pattern:
						out_parts.append(_DIR_MARK_CG)
					else:
						out_parts.append('/')

			else:
				# Match path segment.
				if i == 0:
					# Anchor to root directory.
					out_parts.append('^')

				if need_slash:
					out_parts.append('/')

				if seg == '*':
					# Match whole path segment.
					out_parts.append('[^/]+')

				else:
					# Match segment glob pattern.
					# - EDGE CASE: Git discards patterns with invalid range notation.
					out_parts.append(cls._translate_segment_glob(seg, 'raise'))

				if i == end:
					# A pattern ending without a slash ('/') will match a file or a
					# directory (with paths underneath it). E.g., "foo" matches "foo",
					# "foo/bar", "foo/bar/baz", etc.
					out_parts.append(_DIR_MARK_OPT)

				need_slash = True

		return out_parts