Skip to content

YfinancePipeline

Handles OHLCV and metadata scraping from yfinance. Each method processes one ticker - Airflow does the parallelization.

Component Separation

The pipeline focuses on data scraping only. For ticker lists, see YfinanceTickers; for validation, see YfinanceValidation.

Quick Start

from sec_data_pipeline.yfinance.yfinance_pipeline import YfinancePipeline
from datetime import date

pipeline = YfinancePipeline()

# Download OHLCV data
df = pipeline.scrape_date_range('AAPL', date(2024, 1, 1), date.today())

# Get metadata (50+ fundamental fields)
meta = pipeline.scrape_metadata('AAPL')

Data Scraping

| Type | Configuration | Returns |
| --- | --- | --- |
| OHLCV | `auto_adjust=True`, flattened MultiIndex | DataFrame with Date, Open, High, Low, Close, Volume |
| Metadata | 50+ fields via yfinance `.info` | Dict with company info, valuation, profitability, analyst coverage |

Adjusted Prices

auto_adjust=True handles stock splits and dividends automatically - no manual adjustments needed

API Reference

Main data pipeline for OHLCV and metadata scraping:

- Atomic methods (should be bare-bones and simple)
- No retry logic; debug logging only (maybe an error log) - NO INFO LOGS!
- Each method should download data for one stock (Airflow handles parallel execution)

Source code in data_pipeline/sec_data_pipeline/yfinance/yfinance_pipeline.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
class YfinancePipeline:
    """
    Main data pipeline for OHLCV and metadata scraping
    - Atomic methods (should be bare-bones & simple)
    - No retry logic & debug logging only (maybe error log) NO INFO LOGS!
    - Each method should download data for one stock (airflow handles parallel execution)
    """

    def scrape_date_range(
        self,
        ticker: str,
        start_date: date,
        end_date: date,
        interval: str = '1d'
    ) -> pd.DataFrame | None:
        """
        Scrape historical data for a specific date range for a single ticker

        Args:
            ticker: A single ticker symbol
            start_date: Start date, passed to yfinance as ``start``
            end_date: End date, passed to yfinance as ``end``
            interval: Data interval (1d only); forwarded to yfinance unchanged

        Returns:
            A dataframe containing the ohlcv for a single stock, with any
            MultiIndex columns flattened to their first level. Although the
            annotation allows None, this method raises instead of returning it.

        Raises:
            ValueError: If yfinance returns None instead of a dataframe
        """

        # Download the data for a ticker
        ticker_data = yf.download(
            ticker,
            start=start_date,
            end=end_date,
            interval=interval,
            progress=False, # Disable individual progress bars
            auto_adjust=True, # Adjusted close prices (dividends & stock splits)
        )

        if ticker_data is None:
            raise ValueError("OHLCV data is None")

        # Flatten MultiIndex columns (yfinance returns MultiIndex for single ticker);
        # only the first level is kept
        if isinstance(ticker_data.columns, pd.MultiIndex):
            ticker_data.columns = ticker_data.columns.get_level_values(0)

        return ticker_data

    def scrape_metadata(self, ticker: str) -> Dict[str, Any]:
        """
        Scrape fundamental metadata for a single ticker

        Every field is read with ``info.get``, so a field that is absent from
        the yfinance ``.info`` payload is returned as None.

        Args:
            ticker: A string for the ticker

        Returns:
            A dictionary with metadata about the ticker, including the derived
            ``target_price_upside`` and ``free_cash_flow_yield`` values (None
            when their inputs are unavailable)
        """

        # Get the metadata from yfinance
        stock = yf.Ticker(ticker)
        info = stock.info

        # Prefer currentPrice; fall back to regularMarketPrice when it is missing or zero
        current_price = info.get('currentPrice') or info.get('regularMarketPrice')
        target_price = info.get('targetMeanPrice')

        # Target upside needs both prices present and non-zero
        # (a zero current price would divide by zero)
        target_upside = None
        if target_price and current_price:
            target_upside = (target_price - current_price) / current_price

        # Free cash flow yield: a reported free cash flow of exactly 0 is a real
        # value and yields 0.0, so only a missing value is skipped. The market cap
        # must still be present and non-zero to avoid dividing by zero.
        free_cash_flow = info.get('freeCashflow')
        market_cap = info.get('marketCap')
        fcf_yield = None
        if free_cash_flow is not None and market_cap:
            fcf_yield = free_cash_flow / market_cap

        # Extract only high and medium availability fields (>50%)
        metadata = {
            'ticker': ticker,
            'date_scraped': date.today(),

            # Company Basic Info (80%+ availability)
            'company_name': info.get('longName'),
            'exchange': info.get('exchange'),
            'country': info.get('country'),
            'sector': info.get('sector'),
            'industry': info.get('industry'),
            'market_cap': market_cap,
            'enterprise_value': info.get('enterpriseValue'),
            'shares_outstanding': info.get('sharesOutstanding'),
            'float_shares': info.get('floatShares'),

            # Valuation Metrics (50%+ availability)
            'price_to_book': info.get('priceToBook'),
            'forward_pe': info.get('forwardPE'),
            'ev_to_ebitda': info.get('enterpriseToEbitda'),
            'ev_to_revenue': info.get('enterpriseToRevenue'),
            'price_to_sales': info.get('priceToSalesTrailing12Months'),

            # Profitability & Quality (75%+ availability)
            'gross_margin': info.get('grossMargins'),
            'operating_margin': info.get('operatingMargins'),
            'profit_margin': info.get('profitMargins'),
            'return_on_equity': info.get('returnOnEquity'),
            'return_on_assets': info.get('returnOnAssets'),
            'free_cash_flow_yield': fcf_yield,

            # Growth Metrics (60%+ availability)
            'revenue_growth_yoy': info.get('revenueGrowth'),
            'revenue_per_share': info.get('revenuePerShare'),

            # Financial Health (67%+ availability)
            'debt_to_equity': info.get('debtToEquity'),
            'current_ratio': info.get('currentRatio'),
            'quick_ratio': info.get('quickRatio'),
            'total_cash': info.get('totalCash'),
            'total_debt': info.get('totalDebt'),
            'total_cash_per_share': info.get('totalCashPerShare'),
            'book_value': info.get('bookValue'),

            # Cash Flow (77%+ availability)
            'operating_cash_flow': info.get('operatingCashflow'),
            'free_cash_flow': free_cash_flow,

            # Dividends (81%+ availability)
            'payout_ratio': info.get('payoutRatio'),

            # Short Interest & Ownership (80%+ availability)
            'short_percent_of_float': info.get('shortPercentOfFloat'),
            'short_ratio': info.get('shortRatio'),
            'shares_short': info.get('sharesShort'),
            'shares_percent_shares_out': info.get('sharesPercentSharesOut'),
            'held_percent_institutions': info.get('heldPercentInstitutions'),
            'held_percent_insiders': info.get('heldPercentInsiders'),

            # Analyst Coverage (61%+ availability)
            'target_mean_price': target_price,
            'target_price_upside': target_upside,
            'number_of_analysts': info.get('numberOfAnalystOpinions'),
            'recommendation_key': info.get('recommendationKey'),

            # Market Performance (80%+ availability)
            'beta': info.get('beta'),
            '52_week_high': info.get('fiftyTwoWeekHigh'),
            '52_week_low': info.get('fiftyTwoWeekLow'),
            '52_week_change': info.get('52WeekChange'),
            'sp500_52_week_change': info.get('SandP52WeekChange'),
            '50_day_average': info.get('fiftyDayAverage'),
            '200_day_average': info.get('twoHundredDayAverage'),

            # Trading Volume (100% availability)
            'average_volume': info.get('averageVolume'),
            'average_volume_10days': info.get('averageDailyVolume10Day'),
            'regular_market_volume': info.get('regularMarketVolume'),

            # Metadata
            'last_updated': datetime.now(),
            'data_source': 'yfinance'
        }

        # Return metadata
        return metadata

scrape_date_range(ticker, start_date, end_date, interval='1d')

Scrape historical data for a specific date range for a single ticker

Parameters:

Name Type Description Default
ticker str

A single ticker symbol

required
start_date date

Start date

required
end_date date

End date

required
interval str

Data interval (1d only)

'1d'

Returns:

Type Description
DataFrame | None

A DataFrame containing the OHLCV data for a single ticker

Source code in data_pipeline/sec_data_pipeline/yfinance/yfinance_pipeline.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def scrape_date_range(
    self,
    ticker: str,
    start_date: date,
    end_date: date,
    interval: str = '1d'
) -> pd.DataFrame | None:
    """
    Download the OHLCV history of one ticker over a date range

    Args:
        ticker: A single ticker symbol
        start_date: First date of the range, forwarded to yfinance as ``start``
        end_date: Last date of the range, forwarded to yfinance as ``end``
        interval: Data interval (1d only)

    Returns:
        The downloaded dataframe for the ticker, with MultiIndex columns
        collapsed to their first level

    Raises:
        ValueError: If the download yields None
    """

    # Options shared by every download performed by this method
    download_options = dict(
        start=start_date,
        end=end_date,
        interval=interval,
        progress=False,  # Disable individual progress bars
        auto_adjust=True,  # Adjusted close prices (dividends & stock splits)
    )
    ohlcv = yf.download(ticker, **download_options)

    if ohlcv is None:
        raise ValueError("OHLCV data is None")

    # yfinance returns MultiIndex columns even for a single ticker;
    # keep only the first level
    columns = ohlcv.columns
    if isinstance(columns, pd.MultiIndex):
        ohlcv.columns = columns.get_level_values(0)

    return ohlcv

scrape_metadata(ticker)

Scrape fundamental metadata for a single ticker

Parameters:

Name Type Description Default
ticker str

A string for the ticker

required

Returns:

Type Description
Dict[str, Any]

A dictionary with metadata about the ticker

Source code in data_pipeline/sec_data_pipeline/yfinance/yfinance_pipeline.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def scrape_metadata(self, ticker: str) -> Dict[str, Any]:
    """
    Scrape fundamental metadata for a single ticker

    Every field is read with ``info.get``, so a field that is absent from
    the yfinance ``.info`` payload is returned as None.

    Args:
        ticker: A string for the ticker

    Returns:
        A dictionary with metadata about the ticker, including the derived
        ``target_price_upside`` and ``free_cash_flow_yield`` values (None
        when their inputs are unavailable)
    """

    # Get the metadata from yfinance
    stock = yf.Ticker(ticker)
    info = stock.info

    # Prefer currentPrice; fall back to regularMarketPrice when it is missing or zero
    current_price = info.get('currentPrice') or info.get('regularMarketPrice')
    target_price = info.get('targetMeanPrice')

    # Target upside needs both prices present and non-zero
    # (a zero current price would divide by zero)
    target_upside = None
    if target_price and current_price:
        target_upside = (target_price - current_price) / current_price

    # Free cash flow yield: a reported free cash flow of exactly 0 is a real
    # value and yields 0.0, so only a missing value is skipped. The market cap
    # must still be present and non-zero to avoid dividing by zero.
    free_cash_flow = info.get('freeCashflow')
    market_cap = info.get('marketCap')
    fcf_yield = None
    if free_cash_flow is not None and market_cap:
        fcf_yield = free_cash_flow / market_cap

    # Extract only high and medium availability fields (>50%)
    metadata = {
        'ticker': ticker,
        'date_scraped': date.today(),

        # Company Basic Info (80%+ availability)
        'company_name': info.get('longName'),
        'exchange': info.get('exchange'),
        'country': info.get('country'),
        'sector': info.get('sector'),
        'industry': info.get('industry'),
        'market_cap': market_cap,
        'enterprise_value': info.get('enterpriseValue'),
        'shares_outstanding': info.get('sharesOutstanding'),
        'float_shares': info.get('floatShares'),

        # Valuation Metrics (50%+ availability)
        'price_to_book': info.get('priceToBook'),
        'forward_pe': info.get('forwardPE'),
        'ev_to_ebitda': info.get('enterpriseToEbitda'),
        'ev_to_revenue': info.get('enterpriseToRevenue'),
        'price_to_sales': info.get('priceToSalesTrailing12Months'),

        # Profitability & Quality (75%+ availability)
        'gross_margin': info.get('grossMargins'),
        'operating_margin': info.get('operatingMargins'),
        'profit_margin': info.get('profitMargins'),
        'return_on_equity': info.get('returnOnEquity'),
        'return_on_assets': info.get('returnOnAssets'),
        'free_cash_flow_yield': fcf_yield,

        # Growth Metrics (60%+ availability)
        'revenue_growth_yoy': info.get('revenueGrowth'),
        'revenue_per_share': info.get('revenuePerShare'),

        # Financial Health (67%+ availability)
        'debt_to_equity': info.get('debtToEquity'),
        'current_ratio': info.get('currentRatio'),
        'quick_ratio': info.get('quickRatio'),
        'total_cash': info.get('totalCash'),
        'total_debt': info.get('totalDebt'),
        'total_cash_per_share': info.get('totalCashPerShare'),
        'book_value': info.get('bookValue'),

        # Cash Flow (77%+ availability)
        'operating_cash_flow': info.get('operatingCashflow'),
        'free_cash_flow': free_cash_flow,

        # Dividends (81%+ availability)
        'payout_ratio': info.get('payoutRatio'),

        # Short Interest & Ownership (80%+ availability)
        'short_percent_of_float': info.get('shortPercentOfFloat'),
        'short_ratio': info.get('shortRatio'),
        'shares_short': info.get('sharesShort'),
        'shares_percent_shares_out': info.get('sharesPercentSharesOut'),
        'held_percent_institutions': info.get('heldPercentInstitutions'),
        'held_percent_insiders': info.get('heldPercentInsiders'),

        # Analyst Coverage (61%+ availability)
        'target_mean_price': target_price,
        'target_price_upside': target_upside,
        'number_of_analysts': info.get('numberOfAnalystOpinions'),
        'recommendation_key': info.get('recommendationKey'),

        # Market Performance (80%+ availability)
        'beta': info.get('beta'),
        '52_week_high': info.get('fiftyTwoWeekHigh'),
        '52_week_low': info.get('fiftyTwoWeekLow'),
        '52_week_change': info.get('52WeekChange'),
        'sp500_52_week_change': info.get('SandP52WeekChange'),
        '50_day_average': info.get('fiftyDayAverage'),
        '200_day_average': info.get('twoHundredDayAverage'),

        # Trading Volume (100% availability)
        'average_volume': info.get('averageVolume'),
        'average_volume_10days': info.get('averageDailyVolume10Day'),
        'regular_market_volume': info.get('regularMarketVolume'),

        # Metadata
        'last_updated': datetime.now(),
        'data_source': 'yfinance'
    }

    # Return metadata
    return metadata