diff --git a/.github/workflows/lint-and-test.yml b/.github/workflows/lint-and-test.yml index 3a913be..d6c6187 100644 --- a/.github/workflows/lint-and-test.yml +++ b/.github/workflows/lint-and-test.yml @@ -1,40 +1,63 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python -name: Python package +name: Lint and Test on: push: branches: [ "main" ] pull_request: - branches: [ "main" ] + branches: [ "main", "v*" ] + types: [ opened, synchronize, reopened, ready_for_review ] jobs: build: - - runs-on: ubuntu-latest + if: github.event.pull_request.draft == false strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os: [macos-latest, windows-latest, ubuntu-latest] + include: + - python-version: "3.6" + os: macos-12 + - python-version: "3.6" + os: windows-latest + - python-version: "3.6" + os: ubuntu-20.04 + - python-version: "3.7" + os: macos-12 + - python-version: "3.7" + os: windows-latest + - python-version: "3.7" + os: ubuntu-latest + + runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pytest + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + shell: bash + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Lint with flake8 + shell: bash + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Test with pytest + shell: bash + run: | + pytest diff --git a/.gitignore b/.gitignore index a15111e..b8e9c1a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # Byte-compiled / optimized / DLL files -__pycache__/ +__pycache__ *.py[cod] *$py.class @@ -50,6 +50,7 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ +*/**/tmpcache-* # Translations *.mo @@ -139,3 +140,9 @@ dmypy.json # Cython debug symbols cython_debug/ + +# Mac local files +.DS_Store + +# Dev +scratch.ipynb diff --git a/nfl_data_py/__init__.py b/nfl_data_py/__init__.py index 604ba7c..01ddaa1 100644 --- a/nfl_data_py/__init__.py +++ b/nfl_data_py/__init__.py @@ -1,14 +1,16 @@ name = 'nfl_data_py' -import datetime import os import logging +import datetime +from warnings import warn +from typing import Iterable from concurrent.futures import ThreadPoolExecutor, as_completed -import appdirs import numpy import pandas -from typing import Iterable +import appdirs +from urllib.error import HTTPError # module level doc string __doc__ = """ @@ -142,20 +144,32 @@ def import_pbp_data( raw = pandas.DataFrame(data) raw['season'] = year - if all([include_participation, year >= 2016, not cache]): + + if include_participation and not cache: path = r'https://github.com/nflverse/nflverse-data/releases/download/pbp_participation/pbp_participation_{}.parquet'.format(year) - partic = pandas.read_parquet(path) - raw = raw.merge(partic, how='left', on=['play_id','old_game_id']) + + try: + partic = pandas.read_parquet(path) + raw = raw.merge( + partic, + how='left', + left_on=['play_id','game_id'], + right_on=['play_id','nflverse_game_id'] + ) + except HTTPError: + pass pbp_data.append(raw) print(str(year) + ' done.') - except Error as e: + except Exception as e: print(e) print('Data not available for ' + str(year)) - if pbp_data: - plays = pandas.concat(pbp_data).reset_index(drop=True) + if not pbp_data: + return pandas.DataFrame() + + plays = pandas.concat(pbp_data, ignore_index=True) # converts float64 to float32, saves ~30% memory if downcast: @@ -183,12 +197,10 @@ def cache_pbp(years, downcast=True, alt_path=None): if min(years) < 1999: raise ValueError('Data not available before 1999.') - plays = pandas.DataFrame() - url1 = r'https://github.com/nflverse/nflverse-data/releases/download/pbp/play_by_play_' url2 = r'.parquet' appname = 'nfl_data_py' - appauthor = 'cooper_dff' + appauthor = 'nflverse' # define path for caching if alt_path is not None: @@ -230,7 +242,15 @@ def cache_pbp(years, downcast=True, alt_path=None): print(str(year) + ' done.') - except: + except Exception as e: + warn( + f"Caching failed for {year}, skipping.\n" + "In nfl_data_py 1.0, this will raise an exception.\n" + f"Failure: {e}", + DeprecationWarning, + stacklevel=2 + ) + next @@ -432,7 +452,7 @@ def __import_rosters(release, years, columns=None): rosters = pandas.concat([ pandas.read_parquet(uri.format(y)) for y in years - ]) + ], ignore_index=True) # Post-import processing rosters['birth_date'] = pandas.to_datetime(rosters.birth_date) @@ -728,52 +748,32 @@ def import_ids(columns=None, ids=None): """Import mapping table of ids for most major data providers Args: - columns (List[str]): list of columns to return - ids (List[str]): list of specific ids to return + columns (Iterable[str]): list of columns to return + ids (Iterable[str]): list of specific ids to return Returns: DataFrame """ - - # create list of id options - avail_ids = ['mfl_id', 'sportradar_id', 'fantasypros_id', 'gsis_id', 'pff_id', - 'sleeper_id', 'nfl_id', 'espn_id', 'yahoo_id', 'fleaflicker_id', - 'cbs_id', 'rotowire_id', 'rotoworld_id', 'ktc_id', 'pfr_id', - 'cfbref_id', 'stats_id', 'stats_global_id', 'fantasy_data_id'] - avail_sites = [x[:-3] for x in avail_ids] - - # check variable types - if columns is None: - columns = [] - - if ids is None: - ids = [] - if not isinstance(columns, list): - raise ValueError('columns variable must be list.') - - if not isinstance(ids, list): - raise ValueError('ids variable must be list.') - - # confirm id is in table - if False in [x in avail_sites for x in ids]: - raise ValueError('ids variable can only contain ' + ', '.join(avail_sites)) + columns = columns or [] + if not isinstance(columns, Iterable): + raise ValueError('columns argument must be a list.') + + ids = ids or [] + if not isinstance(ids, Iterable): + raise ValueError('ids argument must be a list.') - # import data - df = pandas.read_csv(r'https://raw.githubusercontent.com/dynastyprocess/data/master/files/db_playerids.csv') + df = pandas.read_csv("https://raw.githubusercontent.com/dynastyprocess/data/master/files/db_playerids.csv") - rem_cols = [x for x in df.columns if x not in avail_ids] - tgt_ids = [x + '_id' for x in ids] - - # filter df to just specified columns - if len(columns) > 0 and len(ids) > 0: - df = df[set(tgt_ids + columns)] - elif len(columns) > 0 and len(ids) == 0: - df = df[set(avail_ids + columns)] - elif len(columns) == 0 and len(ids) > 0: - df = df[set(tgt_ids + rem_cols)] + id_cols = [c for c in df.columns if c.endswith('_id')] + non_id_cols = [c for c in df.columns if not c.endswith('_id')] - return df + # filter df to just specified ids + columns + ret_ids = [x + '_id' for x in ids] or id_cols + ret_cols = columns or non_id_cols + ret_columns = list(set([*ret_ids, *ret_cols])) + + return df[ret_columns] def import_contracts(): @@ -916,8 +916,8 @@ def import_qbr(years=None, level='nfl', frequency='season'): def __validate_pfr_inputs(s_type, years=None): - if s_type not in ('pass', 'rec', 'rush'): - raise ValueError('s_type variable must be one of "pass", "rec", or "rush".') + if s_type not in ('pass', 'rec', 'rush', 'def'): + raise ValueError('s_type variable must be one of "pass", "rec","rush", or "def".') if years is None: return [] @@ -939,7 +939,7 @@ def import_seasonal_pfr(s_type, years=None): """Import PFR advanced season-level statistics Args: - s_type (str): must be one of pass, rec, rush + s_type (str): must be one of pass, rec, rush, def years (List[int]): years to return data for, optional Returns: DataFrame @@ -957,7 +957,7 @@ def import_weekly_pfr(s_type, years=None): """Import PFR advanced week-level statistics Args: - s_type (str): must be one of pass, rec, rush + s_type (str): must be one of pass, rec, rush, def years (List[int]): years to return data for, optional Returns: DataFrame @@ -1139,24 +1139,13 @@ def clean_nfl_data(df): 'Louisiana State': 'LSU' } - pro_tm_repl = { - 'GNB': 'GB', - 'KAN': 'KC', - 'LA': 'LAR', - 'LVR': 'LV', - 'NWE': 'NE', - 'NOR': 'NO', - 'SDG': 'SD', - 'SFO': 'SF', - 'TAM': 'TB' - } - na_replace = { 'NA':numpy.nan } for col in df.columns: - df.replace({col:na_replace}, inplace=True) + if df[col].dtype == 'object': + df.replace({col:na_replace}, inplace=True) if 'name' in df.columns: df.replace({'name': name_repl}, inplace=True) @@ -1164,8 +1153,4 @@ def clean_nfl_data(df): if 'col_team' in df.columns: df.replace({'col_team': col_tm_repl}, inplace=True) - if 'name' in df.columns: - for z in player_col_tm_repl: - df[df['name'] == z[0]] = df[df['name'] == z[0]].replace({z[1]: z[2]}) - return df diff --git a/nfl_data_py/tests/__pycache__/nfl_test.cpython-37-pytest-6.2.4.pyc b/nfl_data_py/tests/__pycache__/nfl_test.cpython-37-pytest-6.2.4.pyc deleted file mode 100644 index d1d8cde..0000000 Binary files a/nfl_data_py/tests/__pycache__/nfl_test.cpython-37-pytest-6.2.4.pyc and /dev/null differ diff --git a/nfl_data_py/tests/__pycache__/nfl_test.cpython-39-pytest-6.2.4.pyc b/nfl_data_py/tests/__pycache__/nfl_test.cpython-39-pytest-6.2.4.pyc deleted file mode 100644 index 0c113eb..0000000 Binary files a/nfl_data_py/tests/__pycache__/nfl_test.cpython-39-pytest-6.2.4.pyc and /dev/null differ diff --git a/nfl_data_py/tests/nfl_test.py b/nfl_data_py/tests/nfl_test.py index 290b968..779a4c2 100644 --- a/nfl_data_py/tests/nfl_test.py +++ b/nfl_data_py/tests/nfl_test.py @@ -1,6 +1,7 @@ from unittest import TestCase from pathlib import Path import shutil +import random import pandas as pd @@ -8,19 +9,19 @@ class test_pbp(TestCase): + pbp = nfl.import_pbp_data([2020]) + def test_is_df_with_data(self): - s = nfl.import_pbp_data([2020]) - self.assertEqual(True, isinstance(s, pd.DataFrame)) - self.assertTrue(len(s) > 0) + self.assertIsInstance(self.pbp, pd.DataFrame) + self.assertTrue(len(self.pbp) > 0) def test_is_df_with_data_thread_requests(self): s = nfl.import_pbp_data([2020, 2021], thread_requests=True) - self.assertEqual(True, isinstance(s, pd.DataFrame)) + self.assertIsInstance(s, pd.DataFrame) self.assertTrue(len(s) > 0) - def test_uses_cache_when_cache_is_true(self): - cache = Path(__file__).parent/"tmpcache" + cache = Path(__file__).parent/f"tmpcache-{random.randint(0, 10000)}" self.assertRaises( ValueError, nfl.import_pbp_data, [2020], cache=True, alt_path=cache @@ -32,43 +33,62 @@ def test_uses_cache_when_cache_is_true(self): self.assertIsInstance(data, pd.DataFrame) shutil.rmtree(cache) + + def test_includes_participation_by_default(self): + self.assertIn("offense_players", self.pbp.columns) + + def test_excludes_participation_when_requested(self): + data = nfl.import_pbp_data([2020], include_participation=False) + self.assertIsInstance(self.pbp, pd.DataFrame) + self.assertTrue(len(self.pbp) > 0) + self.assertNotIn("offense_players", data.columns) + + def test_excludes_participation_if_not_available(self): + data = nfl.import_pbp_data([2024]) + self.assertIsInstance(self.pbp, pd.DataFrame) + self.assertTrue(len(self.pbp) > 0) + self.assertNotIn("offense_players", data.columns) class test_weekly(TestCase): def test_is_df_with_data(self): s = nfl.import_weekly_data([2020]) - self.assertEqual(True, isinstance(s, pd.DataFrame)) + self.assertIsInstance(s, pd.DataFrame) self.assertTrue(len(s) > 0) def test_is_df_with_data_thread_requests(self): s = nfl.import_weekly_data([2020, 2021], thread_requests=True) - self.assertEqual(True, isinstance(s, pd.DataFrame)) + self.assertIsInstance(s, pd.DataFrame) self.assertTrue(len(s) > 0) class test_seasonal(TestCase): def test_is_df_with_data(self): s = nfl.import_seasonal_data([2020]) - self.assertEqual(True, isinstance(s, pd.DataFrame)) + self.assertIsInstance(s, pd.DataFrame) self.assertTrue(len(s) > 0) class test_pbp_cols(TestCase): def test_is_list_with_data(self): s = nfl.see_pbp_cols() - self.assertEqual(True, isinstance(set(nfl.see_pbp_cols()), set)) self.assertTrue(len(s) > 0) class test_weekly_cols(TestCase): def test_is_list_with_data(self): s = nfl.see_weekly_cols() - self.assertEqual(True, isinstance(set(nfl.see_pbp_cols()), set)) self.assertTrue(len(s) > 0) class test_seasonal_rosters(TestCase): data = nfl.import_seasonal_rosters([2020]) def test_is_df_with_data(self): - self.assertEqual(True, isinstance(self.data, pd.DataFrame)) + self.assertIsInstance(self.data, pd.DataFrame) self.assertTrue(len(self.data) > 0) + + def test_import_multiple_years(self): + s = nfl.import_weekly_rosters([2022, 2023]) + self.assertIsInstance(s, pd.DataFrame) + self.assertGreater(len(s), len(self.data)) + self.assertListEqual(s.season.unique().tolist(), [2022, 2023]) def test_computes_age_as_of_season_start(self): mahomes_ages = get_pat(self.data).age @@ -82,6 +102,12 @@ class test_weekly_rosters(TestCase): def test_is_df_with_data(self): assert isinstance(self.data, pd.DataFrame) self.assertGreater(len(self.data), 0) + + def test_import_multiple_years(self): + s = nfl.import_weekly_rosters([2022, 2023]) + self.assertIsInstance(s, pd.DataFrame) + self.assertGreater(len(s), len(self.data)) + self.assertListEqual(s.season.unique().tolist(), [2022, 2023]) def test_gets_weekly_updates(self): assert isinstance(self.data, pd.DataFrame) @@ -166,6 +192,29 @@ def test_is_df_with_data(self): s = nfl.import_ids() self.assertEqual(True, isinstance(s, pd.DataFrame)) self.assertTrue(len(s) > 0) + + def test_import_using_ids(self): + ids = ["espn", "yahoo", "gsis"] + s = nfl.import_ids(ids=ids) + self.assertTrue(all([f"{id}_id" in s.columns for id in ids])) + + def test_import_using_columns(self): + ret_columns = ["name", "birthdate", "college"] + not_ret_columns = ["draft_year", "db_season", "team"] + s = nfl.import_ids(columns=ret_columns) + self.assertTrue(all([column in s.columns for column in ret_columns])) + self.assertTrue(all([column not in s.columns for column in not_ret_columns])) + + def test_import_using_ids_and_columns(self): + ret_ids = ["espn", "yahoo", "gsis"] + ret_columns = ["name", "birthdate", "college"] + not_ret_ids = ["cfbref_id", "pff_id", "prf_id"] + not_ret_columns = ["draft_year", "db_season", "team"] + s = nfl.import_ids(columns=ret_columns, ids=ret_ids) + self.assertTrue(all([column in s.columns for column in ret_columns])) + self.assertTrue(all([column not in s.columns for column in not_ret_columns])) + self.assertTrue(all([f"{id}_id" in s.columns for id in ret_ids])) + self.assertTrue(all([f"{id}_id" not in s.columns for id in not_ret_ids])) class test_ngs(TestCase): def test_is_df_with_data(self): @@ -268,17 +317,14 @@ def test_is_df_with_data_thread_requests(self): class test_cache(TestCase): def test_cache(self): - cache = Path(__file__).parent/"tmpcache" + cache = Path(__file__).parent/f"tmpcache-{random.randint(0, 10000)}" self.assertFalse(cache.is_dir()) nfl.cache_pbp([2020], alt_path=cache) - new_paths = list(cache.glob("**/*")) - self.assertEqual(len(new_paths), 2) - self.assertTrue(new_paths[0].is_dir()) - self.assertTrue(new_paths[1].is_file()) - - pbp2020 = pd.read_parquet(new_paths[1]) + self.assertTrue(cache.is_dir()) + + pbp2020 = pd.read_parquet(cache/"season=2020"/"part.0.parquet") self.assertIsInstance(pbp2020, pd.DataFrame) self.assertFalse(pbp2020.empty) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..964f8c2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +appdirs +fastparquet; python_version >= '3.7' +fastparquet==0.7.2; python_version <= '3.6' +numpy>=1.0,<2.0 +pandas>=1.0,<2.0 \ No newline at end of file diff --git a/setup.py b/setup.py index d94e14f..ed0b913 100644 --- a/setup.py +++ b/setup.py @@ -8,15 +8,16 @@ NAME = 'nfl_data_py' DESCRIPTION = 'python library for interacting with NFL data sourced from nflfastR' URL = 'https://github.com/nflverse/nfl_data_py' -EMAIL = 'cooper.dff11@gmail.com' -AUTHOR = 'cooperdff' +EMAIL = 'alec.ostrander@gmail.com' +AUTHOR = 'Alec Ostrander' REQUIRES_PYTHON = '>=3.6.0' -VERSION = '0.3.2' +VERSION = '0.3.3' # What packages are required for this module to be executed? REQUIRED = [ - 'pandas>1', + 'numpy>=1.0, <2.0', + 'pandas>=1.0, <2.0', 'appdirs>1', 'fastparquet>0.5', ]