Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tricks to make saving images quicker? #636

Open
bigmit2011 opened this issue Jun 21, 2023 · 6 comments
Open

Tricks to make saving images quicker? #636

bigmit2011 opened this issue Jun 21, 2023 · 6 comments
Labels
question Further information is requested

Comments

@bigmit2011
Copy link

bigmit2011 commented Jun 21, 2023

I am using a friend's script (so I don't know all the details of this script), but I wonder if there are simple tricks I can use here to make it save more quickly.
I want to be able to save around 10k images.

I plan to incorporate multiprocessing to make it even quicker.

Thank you.

def pre_outcome_chart(month, save_dir, sample_df, test=True, dpi=300):
    """Save one annotated candlestick chart (PNG) per row of ``sample_df``.

    For every row, the ticker's CSV is read from ``DATA_DIR``, restricted to
    the window from ``month`` months before the entry date to one day after
    it, plotted with EMA/SMA overlays and volume, annotated with an arrow on
    the entry-date candle, and written to ``save_dir``.

    Args:
        month: Number of months of history to show before the entry date.
        save_dir: Directory prefix the PNG files are written under.
        sample_df: DataFrame with a ``'ticker'`` and an ``'entry_date'``
            column; one chart is produced per row.
        test: When true, only a random sample of at most 100 rows is charted.
        dpi: Resolution passed to ``savefig``. Lower values save noticeably
            faster and produce smaller files; the default keeps the original
            output.

    Raises:
        KeyError: If the entry date is not present in the filtered price data
            (raised by ``Index.get_loc``).

    Note:
        Relies on module-level names ``DATA_DIR``, ``fplt``, ``plt``, ``pd``,
        ``os`` and ``relativedelta``. The CSV is expected to have dates in its
        first column plus ``Close``, ``High`` and ``Low`` columns; presumably
        ``Open`` and ``Volume`` as well, since a candle chart with
        ``volume=True`` is requested -- confirm against the data files.
    """
    if test:
        # Cap the sample size so frames with fewer than 100 rows don't raise.
        sample_df = sample_df.sample(min(100, len(sample_df)))

    # Market colors and style are identical for every chart, so build them
    # once instead of once per row.
    mc = fplt.make_marketcolors(
        up='black',
        down='#f76757',
        edge={'up': '#13eda4', 'down': '#f76757'},
        wick={'up': '#13eda4', 'down': '#f76757'},
        volume={'up': '#13eda4', 'down': '#f76757'},
    )
    s = fplt.make_mpf_style(
        marketcolors=mc,
        facecolor='black',
        figcolor='black',
        gridcolor='gray',
        gridstyle='dotted',
        rc={
            'xtick.color': 'white',
            'ytick.color': 'white',
            'axes.labelcolor': 'white',
            'text.color': 'white',
            'axes.edgecolor': 'gray',
            'grid.alpha': 0.7,
            'grid.linewidth': 0.5,
        },
    )
    legend_labels = ['EMA10', 'EMA20', 'SMA50', 'SMA100', 'SMA200']
    arrow_length_fraction = 0.1  # Adjust this value to control the arrow length

    for _, row in sample_df.iterrows():
        ticker = row['ticker']
        input_date = row['entry_date']

        # Parse the entry date once; it is needed for the window and the label.
        entry_day = pd.to_datetime(input_date).date()
        startDate = entry_day - relativedelta(months=month)
        endDate = entry_day + relativedelta(days=1)

        filename = os.path.join(DATA_DIR, ticker + '.csv')
        df = pd.read_csv(filename, index_col=0, parse_dates=True)
        dt_range = pd.date_range(start=startDate, end=endDate)
        # Drop rows with missing data. dropna() returns a new frame, so the
        # column assignments below do not write into a filtered slice.
        df = df[df.index.isin(dt_range)].dropna()

        # Compute moving averages
        close = df['Close']
        df['SMA50'] = close.rolling(window=50, min_periods=1).mean()
        df['SMA100'] = close.rolling(window=100, min_periods=1).mean()
        df['SMA200'] = close.rolling(window=200, min_periods=1).mean()

        # Define addplots
        ema10 = fplt.make_addplot(close.ewm(span=10, min_periods=1).mean(), color='#CBC3E3')
        ema20 = fplt.make_addplot(close.ewm(span=20, min_periods=1).mean(), color='#87CEEB')
        sma50 = fplt.make_addplot(df['SMA50'], color='red')
        sma100 = fplt.make_addplot(df['SMA100'], color='yellow')
        sma200 = fplt.make_addplot(df['SMA200'], color='white')

        # Plot the data
        fig, axlist = fplt.plot(
            df,
            type='candle',
            addplot=[ema10, ema20, sma50, sma100, sma200],
            style=s,
            figsize=(12, 6),
            update_width_config={'candle_linewidth': 1.0, 'candle_width': 0.525, 'volume_width': 0.525},
            tight_layout=True,
            volume=True,
            ylabel='Price',
            xrotation=0,
            returnfig=True,
        )

        # Always release the figure, even if annotating or saving fails;
        # otherwise a long batch accumulates open figures.
        try:
            # Add legend
            ax = axlist[0]
            ax.legend(list(ax.lines), legend_labels)

            # Position of the entry-date candle on the x axis
            input_date_index = df.index.get_loc(
                pd.to_datetime(input_date, format='%Y-%m-%d').floor('D')
            )
            x_coord = input_date_index
            y_coord = df['High'].iloc[input_date_index]

            high_max = df['High'].max()
            low_min = df['Low'].min()

            # Highest value drawn on the chart (candles or SMA overlays)
            chart_max = max(high_max, df[['SMA50', 'SMA100', 'SMA200']].max().max())

            # Arrow length (and head-room gap) as a fraction of the price range
            arrow_length = arrow_length_fraction * (high_max - low_min)
            gap_size = arrow_length

            # Adjust the ylim to create the gap
            ax.set_ylim(low_min, 1.2 * high_max + gap_size)

            # Arrow pointing at the entry-date candle
            ax.annotate('',
                        xy=(x_coord, y_coord),
                        xytext=(x_coord, y_coord + arrow_length),
                        arrowprops=dict(arrowstyle='->', color='yellow', linewidth=3),
                        color='yellow', ha='center', va='bottom')

            # Ticker/date tag at the top of the chart
            ax.annotate(ticker + '  ' + entry_day.strftime('%Y-%m-%d'),
                        xy=(x_coord, y_coord),
                        xytext=(x_coord, chart_max + arrow_length),
                        color='yellow', ha='center', va='bottom')

            # Save through the figure object rather than pyplot's implicit
            # "current figure".
            # NOTE(review): the 'M_' suffix does not include `month`; kept
            # unchanged so existing file names stay the same -- confirm intent.
            fig.savefig(f"{save_dir}/{ticker}_{input_date}M_.png",
                        dpi=dpi, bbox_inches='tight')
        finally:
            plt.close(fig)
@bigmit2011 bigmit2011 added the question Further information is requested label Jun 21, 2023
@DrChandrakant
Copy link
Contributor

@bigmit2011 Just try concurrent processing for the large data set. Read more: https://docs.python.org/3/library/concurrency.html

@DanielGoldfarb
Copy link
Collaborator

DanielGoldfarb commented Jun 23, 2023

For concurrent processing, I highly recommend using the multiprocessing module of pathos (see also here).

It is essentially the same as the python standard library multiprocessing module, except that pathos.multiprocessing uses dill under the hood instead of pickle in order to avoid many of the limitations that pickle imposes.

That said, there are potentially several sources of inefficiency here:

  1. It would be helpful if you could provide the sample_df data file. I strongly suspect, looking at the code, that the code may be opening and reading the same files over and over, incurring both the cost of reading from disk and of allocating memory to do so. The code appears to take only a portion of the data from each file that it opens, and if a given ticker is in sample_df more than once, then for sure the code is opening and reading the same file multiple times while using only a portion of the file with each iteration.
  2. In addition to the sample_df file, if you can provide some of the other data files then it may be possible to identify other inefficiencies in the code.
  3. Certain parts of the code definitely do not need to be in the loop, for example both mc = fplt.make_marketcolors(...) and s = fplt.make_mpf_style(...) can and should be done outside of the loop. Although these should be relatively quick calls, there is no reason to do them thousands of times, resulting in thousands of memory allocations and the ensuing garbage collection.
  4. It is also possible to allow mplfinance to calculate your SMAs and EMAs instead of calculating them externally and then using addplot to plot. This will certainly make the code simpler, however one would have to experiment to see if letting mplfinance do the SMA and EMA calculations is faster, slower, or the same. Based on my understanding of the code, it would be truly difficult to predict, and thus one would have to actually run the test to find out.
  5. I think it will also be more efficient to re-use the Figure and Axes returned from mplfinance, rather than closing the Figure and reallocating it again and again each time through the loop (thus again avoiding a tremendous amount of object and memory allocation, and garbage collection). In order to do this, one would have to save the Figure and Axes from the very first call, and then use mplfinance in external axis mode after that, minimally clearing the Axes each time through the loop.
  6. Finally, I would question intensely what your ultimate goal is here, what you are trying to accomplish in the big picture and why you think the means to accomplishing that is to save some 10,000 chart image files? Perhaps there is a more efficient way to achieve the same result. Please let me know what the ultimate problem is that you are trying to solve (for which you think that the best solution is to make thousands of chart images).

Another point, implied above, regarding items 1 and 4 above. If indeed it is the case that a ticker may be in sample_df multiple times, then the logical restructuring of the code would be to create a loop within a loop, where the outer loop reads the data file filename = os.path.join(DATA_DIR, ticker + '.csv') and calculates all of the SMAs and EMAs for that data set, and then the inner loop is over the various date ranges (startDate = pd.to_datetime(input_date).date() - relativedelta(months=month) and endDate = pd.to_datetime(input_date).date() + relativedelta(days =1)) for that ticker. In this way each ticker's file is read only once and the SMAs and EMAs for each ticker would also be calculated only once.

@bigmit2011
Copy link
Author

Hi,

Thank you so much for the detailed reply.

Regarding 1 and 2:
So sample_df is only opened once.
It's usually a dataframe with a list of tickers and dates.
Most of the time the tickers are different, so I don't think that's the bottleneck.
However, I will look into reusing the data if the ticker is the same.
I will try to copy and paste some of the sample_df, when I get back to my home pc.

The historical data files depend on the ticker.
But it's basically the entire historical data of the ticker:

https://finance.yahoo.com/quote/AAPL/history/

  1. This makes sense. I will give this a go.

  2. Yeah the calculations are in there due to simplicity.

  3. This one is a little confusing for me, but I will try to see if I can find some examples.

  4. So I'm still a long way from what I want to accomplish,
    but if there are 12k tickers, I want to be able to run pattern recognition
    on them daily. I haven't yet tested how accurate the latest computer vision models can be for pattern recognition with something like transfer learning (if that's still the best way), but that's
    the goal.

`

@DrChandrakant
Copy link
Contributor

@bigmit2011 To achieve this, try storing the historical data in a local database or a cached file format. Keep updating only the last candle's open, high, low and close data, which significantly reduces the request time from Yahoo. For better performance, I would suggest using an API.

@bigmit2011
Copy link
Author

@bigmit2011 too achieved try store historical in a local database or cached file format. keep updating the last candle open, high, low close data which significantly reduces request time from Yahoo data. for Better performance, I like to suggest please use API

Hi,

I actually have data saved locally as csv files and am not scraping during the time of creating charts.

@BennyThadikaran
Copy link

I recently wrote some code for saving mplfinance chart images to disk using concurrent processing.

https://github.com/BennyThadikaran/stock-pattern/blob/main/src/init.py#L107

Below is the main outline of the code using concurrent.futures module. It assumes your data is already on disk. If using network to download the data see the second part.

import concurrent.futures

import matplotlib.pyplot as plt
import mplfinance as mpf
import pandas as pd


def process(sym):
    """Chart one symbol and save the image; runs in a child process.

    Args:
        sym: Symbol name. Used to locate the input CSV and to name the
            output PNG (``{sym}.png``).

    Returns:
        A short status string, ``"success {sym}"``.
    """
    # Load this symbol's file into a DataFrame, do some processing.
    # mplfinance needs a DatetimeIndex, hence index_col/parse_dates.
    # NOTE(review): the original outline read a fixed placeholder file
    # ("symfile.csv") for every symbol; this assumes one CSV per symbol named
    # "<sym>.csv" with dates in the first column -- adjust to your layout.
    df = pd.read_csv(f"{sym}.csv", index_col=0, parse_dates=True)

    # switch to non interactive backend when working inside child process
    plt.switch_backend("AGG")

    plt.ioff()

    mpf.plot(df, type="candle", style="tradingview", savefig=f"{sym}.png")

    # return something useful
    return f"success {sym}"


def main():
    """Main entry point of script.

    Submits one ``process`` job per symbol to a process pool and prints each
    job's result as soon as it finishes.
    """
    sym_list = ["tcs", "infosys"]  # your fairly long list of symbols

    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Submit everything first. Pass the process function and any
        # additional positional/keyword arguments to executor.submit.
        futures = [executor.submit(process, sym) for sym in sym_list]

        # Collect results only after all jobs are queued. Waiting inside the
        # submit loop would block on each job before queuing the next one,
        # which serialises the work and re-reports earlier results.
        for future in concurrent.futures.as_completed(futures):
            # do something with the result
            print(future.result())


if __name__ == "__main__":
    # Run the script only when executed directly, not when imported. Worker
    # processes may import this module, so the guard also keeps them from
    # calling main() again.
    main()

If you're making network requests for stock data, you can get a big performance boost using asyncio (stdlib) and aiohttp (external package).

The benefit is not having to wait for each stock's data to be downloaded. With asyncio.as_completed, you can begin processing responses and saving the images in a child process as soon as each download finishes. Below is a very simplified script, demonstrating the crucial bits.

Make sure to use a throttler or you will exceed the server api limits.

async def main():
    """Download data for every symbol and chart each one as soon as it arrives.

    Downloads run concurrently on the event loop; each finished download is
    handed to a process pool so chart rendering happens in child processes.

    Returns:
        The list of values returned by ``process``, in the order the
        downloads completed.

    Note:
        ``data_fetch(sym, session)`` and ``process`` are expected to be
        defined elsewhere. Here ``process`` receives the downloaded data
        rather than a symbol name -- presumably a variant of the disk-based
        version; confirm its signature.
    """
    sym_list = []  # your symbol list

    async with aiohttp.ClientSession() as session:
        # call your data fetch function with create_task
        # data_fetch takes the sym and session argument and calls session.get(url)
        tasks = [asyncio.create_task(data_fetch(sym, session)) for sym in sym_list]

        loop = asyncio.get_running_loop()
        futures_list = []

        # run_in_executor needs an executor *instance*; the context manager
        # also shuts the pool down once all work is finished.
        with concurrent.futures.ProcessPoolExecutor() as executor:
            # Plain `for` + `await` works on every supported Python version;
            # `async for` over as_completed requires Python 3.13+.
            for next_done in asyncio.as_completed(tasks):
                stock_data = await next_done

                futures_list.append(
                    loop.run_in_executor(executor, process, stock_data)
                )

            results = await asyncio.gather(*futures_list)

    return results


if __name__ == "__main__":
    # Run the script: asyncio.run() starts an event loop, runs the main()
    # coroutine to completion, then closes the loop.
    asyncio.run(main())

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
question Further information is requested
Projects
None yet
Development

No branches or pull requests

4 participants