pandas-dev · MarcoGorelli · Feb 15, 2023 · Jan 27, 2023 · Jan 27, 2023 · Jan 27, 2023
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -290,6 +290,16 @@ date_parser : function, default ``None``
   values from the columns defined by parse_dates into a single array and pass
   that; and 3) call date_parser once for each row using one or more strings
   (corresponding to the columns defined by parse_dates) as arguments.
+
+  .. deprecated:: 2.0.0
+   Use ``date_format`` instead, or read in as ``object`` and then apply
+   :func:`to_datetime` as-needed.
+date_format : str, default ``None``
+   If used in conjunction with ``parse_dates``, will parse dates according to this
+   format. For anything more complex (e.g. different formats for different columns),
+   please read in as ``object`` and then apply :func:`to_datetime` as-needed.
+
+   .. versionadded:: 2.0.0
 dayfirst : boolean, default ``False``
   DD/MM format dates, international and European format.
 cache_dates : boolean, default True
@@ -800,7 +810,7 @@ Specifying date columns
 +++++++++++++++++++++++
 
 To better facilitate working with datetime data, :func:`read_csv`
-uses the keyword arguments ``parse_dates`` and ``date_parser``
+uses the keyword arguments ``parse_dates`` and ``date_format``
 to allow users to specify a variety of columns and date/time formats to turn the
 input text data into ``datetime`` objects.
 
@@ -898,33 +908,15 @@ data columns:
 Date parsing functions
 ++++++++++++++++++++++
 
-Finally, the parser allows you to specify a custom ``date_parser`` function to
-take full advantage of the flexibility of the date parsing API:
-
-.. ipython:: python
-
-   df = pd.read_csv(
-       "tmp.csv", header=None, parse_dates=date_spec, date_parser=pd.to_datetime
-   )
-   df
-
-pandas will try to call the ``date_parser`` function in three different ways. If
-an exception is raised, the next one is tried:
-
-1. ``date_parser`` is first called with one or more arrays as arguments,
-   as defined using ``parse_dates`` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``).
-
-2. If #1 fails, ``date_parser`` is called with all the columns
-   concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``).
+Finally, the parser allows you to specify a custom ``date_format``.
+Performance-wise, you should try these methods of parsing dates in order:
 
-Note that performance-wise, you should try these methods of parsing dates in order:
+1. If you know the format, use ``date_format``, e.g.:
+   ``date_format="%d/%m/%Y"``.
 
-1. If you know the format, use ``pd.to_datetime()``:
-   ``date_parser=lambda x: pd.to_datetime(x, format=...)``.
-
-2. If you have a really non-standard format, use a custom ``date_parser`` function.
-   For optimal performance, this should be vectorized, i.e., it should accept arrays
-   as arguments.
+2. If you have different formats for different columns, or want to pass any extra options (such
+   as ``utc``) to ``to_datetime``, then you should read in your data as ``object`` dtype, and
+   then use ``to_datetime``.
 
 
 .. ipython:: python
@@ -952,16 +944,13 @@ an object-dtype column with strings, even with ``parse_dates``.
    df = pd.read_csv(StringIO(content), parse_dates=["a"])
    df["a"]
 
-To parse the mixed-timezone values as a datetime column, pass a partially-applied
-:func:`to_datetime` with ``utc=True`` as the ``date_parser``.
+To parse the mixed-timezone values as a datetime column, read in as ``object`` dtype and
+then call :func:`to_datetime` with ``utc=True``.
 
 .. ipython:: python
 
-   df = pd.read_csv(
-       StringIO(content),
-       parse_dates=["a"],
-       date_parser=lambda col: pd.to_datetime(col, utc=True),
-   )
+   df = pd.read_csv(StringIO(content))
+   df["a"] = pd.to_datetime(df["a"], utc=True)
    df["a"]
 
 

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -686,11 +686,19 @@ Parsing mixed-timezones with :func:`read_csv`
 As can be seen, the ``dtype`` is object; each value in the column is a string.
 To convert the strings to an array of datetimes, the ``date_parser`` argument
 
-.. ipython:: python
+.. code-block:: ipython
 
-   df = pd.read_csv(io.StringIO(content), parse_dates=['a'],
-                    date_parser=lambda col: pd.to_datetime(col, utc=True))
-   df.a
+   In [3]: df = pd.read_csv(
+      ...:     io.StringIO(content),
+      ...:     parse_dates=['a'],
+      ...:     date_parser=lambda col: pd.to_datetime(col, utc=True),
+      ...: )
+
+   In [4]: df.a
+   Out[4]:
+   0   1999-12-31 19:00:00+00:00
+   1   1999-12-31 18:00:00+00:00
+   Name: a, dtype: datetime64[ns, UTC]
 
 See :ref:`whatsnew_0240.api.timezone_offset_parsing` for more.
 

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -291,6 +291,7 @@ Other enhancements
 - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`)
 - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
 - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
+- :func:`read_csv` and :func:`read_table` now accept ``date_format`` (:issue:`50601`)
 -
 
 .. ---------------------------------------------------------------------------
@@ -780,6 +781,7 @@ Deprecations
 - :meth:`Index.is_categorical` has been deprecated. Use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`50042`)
 - :meth:`Index.is_object` has been deprecated. Use :func:`pandas.api.types.is_object_dtype` instead (:issue:`50042`)
 - :meth:`Index.is_interval` has been deprecated. Use :func:`pandas.api.types.is_intterval_dtype` instead (:issue:`50042`)
+- Deprecated argument ``date_parser`` in :func:`read_csv` and :func:`read_table` in favour of ``date_format`` (:issue:`50601`)
 - Deprecated ``all`` and ``any`` reductions with ``datetime64`` and :class:`DatetimeTZDtype` dtypes, use e.g. ``(obj != pd.Timestamp(0), tz=obj.tz).all()`` instead (:issue:`34479`)
 - Deprecated unused arguments ``*args`` and ``**kwargs`` in :class:`Resampler` (:issue:`50977`)
 - Deprecated calling ``float`` or ``int`` on a single element :class:`Series` to return a ``float`` or ``int`` respectively. Extract the element before calling ``float`` or ``int`` instead (:issue:`51101`)

@@ -250,6 +250,16 @@
     and pass that; and 3) call `date_parser` once for each row using one or
     more strings (corresponding to the columns defined by `parse_dates`) as
     arguments.
+
+    .. deprecated:: 2.0.0
+       Use ``date_format`` instead, or read in as ``object`` and then apply
+       :func:`to_datetime` as-needed.
+date_format : str, default ``None``
+    If used in conjunction with ``parse_dates``, will parse dates according to this
+    format. For anything more complex (e.g. different formats for different columns),
+    please read in as ``object`` and then apply :func:`to_datetime` as-needed.
+
+    .. versionadded:: 2.0.0
 thousands : str, default None
     Thousands separator for parsing string columns to numeric.  Note that
     this parameter is only necessary for columns stored as TEXT in Excel,
@@ -387,6 +397,7 @@ def read_excel(
     verbose: bool = ...,
     parse_dates: list | dict | bool = ...,
     date_parser: Callable | None = ...,
+    date_format: str | None = ...,
     thousands: str | None = ...,
     decimal: str = ...,
     comment: str | None = ...,
@@ -426,6 +437,7 @@ def read_excel(
     verbose: bool = ...,
     parse_dates: list | dict | bool = ...,
     date_parser: Callable | None = ...,
+    date_format: str | None = ...,
     thousands: str | None = ...,
     decimal: str = ...,
     comment: str | None = ...,
@@ -465,6 +477,7 @@ def read_excel(
     verbose: bool = False,
     parse_dates: list | dict | bool = False,
     date_parser: Callable | None = None,
+    date_format: str | None = None,
     thousands: str | None = None,
     decimal: str = ".",
     comment: str | None = None,
@@ -509,6 +522,7 @@ def read_excel(
             verbose=verbose,
             parse_dates=parse_dates,
             date_parser=date_parser,
+            date_format=date_format,
             thousands=thousands,
             decimal=decimal,
             comment=comment,
@@ -713,6 +727,7 @@ def parse(
         verbose: bool = False,
         parse_dates: list | dict | bool = False,
         date_parser: Callable | None = None,
+        date_format: str | None = None,
         thousands: str | None = None,
         decimal: str = ".",
         comment: str | None = None,
@@ -873,6 +888,7 @@ def parse(
                     skip_blank_lines=False,  # GH 39808
                     parse_dates=parse_dates,
                     date_parser=date_parser,
+                    date_format=date_format,
                     thousands=thousands,
                     decimal=decimal,
                     comment=comment,
@@ -1540,6 +1556,7 @@ def parse(
         na_values=None,
         parse_dates: list | dict | bool = False,
         date_parser: Callable | None = None,
+        date_format: str | None = None,
         thousands: str | None = None,
         comment: str | None = None,
         skipfooter: int = 0,
@@ -1572,6 +1589,7 @@ def parse(
             na_values=na_values,
             parse_dates=parse_dates,
             date_parser=date_parser,
+            date_format=date_format,
             thousands=thousands,
             comment=comment,
             skipfooter=skipfooter,

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -116,6 +116,7 @@ def __init__(self, kwds) -> None:
         self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
         self._parse_date_cols: Iterable = []
         self.date_parser = kwds.pop("date_parser", None)
+        self.date_format = kwds.pop("date_format", None)
         self.dayfirst = kwds.pop("dayfirst", False)
         self.keep_date_col = kwds.pop("keep_date_col", False)
 
@@ -134,6 +135,7 @@ def __init__(self, kwds) -> None:
 
         self._date_conv = _make_date_converter(
             date_parser=self.date_parser,
+            date_format=self.date_format,
             dayfirst=self.dayfirst,
             cache_dates=self.cache_dates,
         )
@@ -1094,13 +1096,27 @@ def _make_date_converter(
     date_parser=None,
     dayfirst: bool = False,
     cache_dates: bool = True,
+    date_format: str | None = None,
 ):
+    if date_parser is not None:
+        warnings.warn(
+            "The argument 'date_parser' is deprecated and will "
+            "be removed in a future version. "
+            "Please use 'date_format' instead, or read your data in as 'object' dtype "
+            "and then call 'to_datetime'.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+    if date_parser is not None and date_format is not None:
+        raise TypeError("Cannot use both 'date_parser' and 'date_format'")
+
     def converter(*date_cols):
         if date_parser is None:
             strs = parsing.concat_date_cols(date_cols)
 
             return tools.to_datetime(
                 ensure_object(strs),
+                format=date_format,
                 utc=False,
                 dayfirst=dayfirst,
                 errors="ignore",
@@ -1155,6 +1171,7 @@ def converter(*date_cols):
     "keep_date_col": False,
     "dayfirst": False,
     "date_parser": None,
+    "date_format": None,
     "usecols": None,
     # 'iterator': False,
     "chunksize": None,

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -239,10 +239,7 @@
     say because of an unparsable value or a mixture of timezones, the column
     or index will be returned unaltered as an object data type. For
     non-standard datetime parsing, use ``pd.to_datetime`` after
-    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
-    specify ``date_parser`` to be a partially-applied
-    :func:`pandas.to_datetime` with ``utc=True``. See
-    :ref:`io.csv.mixed_timezones` for more.
+    ``pd.read_csv``.
 
     Note: A fast-path exists for iso8601-formatted dates.
 infer_datetime_format : bool, default False
@@ -267,6 +264,16 @@
     and pass that; and 3) call `date_parser` once for each row using one or
     more strings (corresponding to the columns defined by `parse_dates`) as
     arguments.
+
+    .. deprecated:: 2.0.0
+       Use ``date_format`` instead, or read in as ``object`` and then apply
+       :func:`to_datetime` as-needed.
+date_format : str, default ``None``
+    If used in conjunction with ``parse_dates``, will parse dates according to this
+    format. For anything more complex (e.g. different formats for different columns),
+    please read in as ``object`` and then apply :func:`to_datetime` as-needed.
+
+    .. versionadded:: 2.0.0
 dayfirst : bool, default False
     DD/MM format dates, international and European format.
 cache_dates : bool, default True
@@ -548,7 +555,7 @@ def _read(
     # if we pass a date_parser and parse_dates=False, we should not parse the
     # dates GH#44366
     if kwds.get("parse_dates", None) is None:
-        if kwds.get("date_parser", None) is None:
+        if kwds.get("date_parser", None) is None and kwds.get("date_format") is None:
             kwds["parse_dates"] = False
         else:
             kwds["parse_dates"] = True
@@ -622,6 +629,7 @@ def read_csv(
     infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
+    date_format: str | None = ...,
     dayfirst: bool = ...,
     cache_dates: bool = ...,
     iterator: Literal[True],
@@ -678,6 +686,7 @@ def read_csv(
     infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
+    date_format: str | None = ...,
     dayfirst: bool = ...,
     cache_dates: bool = ...,
     iterator: bool = ...,
@@ -734,6 +743,7 @@ def read_csv(
     infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
+    date_format: str | None = ...,
     dayfirst: bool = ...,
     cache_dates: bool = ...,
     iterator: Literal[False] = ...,
@@ -790,6 +800,7 @@ def read_csv(
     infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
+    date_format: str | None = ...,
     dayfirst: bool = ...,
     cache_dates: bool = ...,
     iterator: bool = ...,
@@ -858,6 +869,7 @@ def read_csv(
     infer_datetime_format: bool | lib.NoDefault = lib.no_default,
     keep_date_col: bool = False,
     date_parser=None,
+    date_format: str | None = None,
     dayfirst: bool = False,
     cache_dates: bool = True,
     # Iteration
@@ -946,6 +958,7 @@ def read_table(
     infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
+    date_format: str | None = ...,
     dayfirst: bool = ...,
     cache_dates: bool = ...,
     iterator: Literal[True],
@@ -1002,6 +1015,7 @@ def read_table(
     infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
+    date_format: str | None = ...,
     dayfirst: bool = ...,
     cache_dates: bool = ...,
     iterator: bool = ...,
@@ -1058,6 +1072,7 @@ def read_table(
     infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
+    date_format: str | None = ...,
     dayfirst: bool = ...,
     cache_dates: bool = ...,
     iterator: Literal[False] = ...,
@@ -1114,6 +1129,7 @@ def read_table(
     infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
+    date_format: str | None = ...,
     dayfirst: bool = ...,
     cache_dates: bool = ...,
     iterator: bool = ...,
@@ -1182,6 +1198,7 @@ def read_table(
     infer_datetime_format: bool | lib.NoDefault = lib.no_default,
     keep_date_col: bool = False,
     date_parser=None,
+    date_format: str | None = None,
     dayfirst: bool = False,
     cache_dates: bool = True,
     # Iteration
@@ -1786,6 +1803,11 @@ def TextParser(*args, **kwds) -> TextFileReader:
     parse_dates : bool, default False
     keep_date_col : bool, default False
     date_parser : function, optional
+
+        .. deprecated:: 2.0.0
+    date_format : str, default ``None``
+
+        .. versionadded:: 2.0.0
     skiprows : list of integers
         Row numbers to skip
     skipfooter : int