Skip to content

BKM Scrap and Update Kaggle Dataset #716

BKM Scrap and Update Kaggle Dataset

BKM Scrap and Update Kaggle Dataset #716

Workflow file for this run

name: BKM Scrap and Update Kaggle Dataset
on:
workflow_dispatch:
schedule:
- cron: '00 06,09,12,18 * * *'
env:
DATASET_NAME: 'furkanzeki/bkm-book-dataset'
FILE_NAME: 'BKM_Datasets.csv'
LOG_FILE: 'BKM.log'
ID: 'BKM'
jobs:
scrape_categories:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: 3.10.6
- name: Install Python Dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Categories Scrape
run: |
python Scripts/bkm_scrape_categories.py
- name: Download Kaggle Dataset
env:
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
DATASET_NAME: ${{ env.DATASET_NAME }}
run: python Scripts/dataset_download.py
- name: Upload dataset as artifact
uses: actions/upload-artifact@v3
with:
name: bkm_dataset
path: data/
retention-days: 1
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: LogFiles
path: logs/${{ env.LOG_FILE }}
- name: Upload categories artifact
uses: actions/upload-artifact@v3
with:
name: bkm_categories
path: |
categories_1.json
categories_2.json
categories_3.json
categories_4.json
categories_5.json
retention-days: 1
scrape_job:
needs: scrape_categories
runs-on: ubuntu-latest
strategy:
matrix:
job_id: [1, 2, 3, 4, 5] # Job sayısını ihtiyacınıza göre artırabilirsiniz
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: 3.10.6
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download bkm_dataset artifact
uses: actions/download-artifact@v3
with:
name: bkm_dataset #Dataset ana dizine ekleniyor
path: Data/
- name: Download categories artifact
uses: actions/download-artifact@v3
with:
name: bkm_categories
- name: Create Dataset File
run: mkdir -p Dataset
- name: Scraping Start
env:
categories_file: categories_${{ matrix.job_id }}.json
matrix_id: ${{ matrix.job_id }}
CLUSTER_LOG_FILE: BKM_${{ matrix.job_id }}.log
WORKERS_COUNT: 5
run: |
python Scripts/bkm_scrape.py
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: scraping data
path: Dataset/BKM_${{ matrix.job_id }}.csv
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: LogFiles
path: logs/BKM_${{ matrix.job_id }}.log
combine:
runs-on: ubuntu-latest
needs: scrape_job
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: 3.10.6
- name: Install dependencies
run: |
pip install pandas kaggle
- name: Download artifacts
uses: actions/download-artifact@v3
with:
name: bkm_dataset
path: Data/
- name: Download artifacts
uses: actions/download-artifact@v3
with:
name: LogFiles
path: logs/
- name: Download artifacts
uses: actions/download-artifact@v3
with:
name: scraping data
path: Datasets/
- name: Combine datasets
run: |
python Scripts/bkm_combine.py
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: Finished BKM Dataset
path: Data/BKM_Datasets.csv
- name: Dataset Upload
run: |
python Scripts/dataset_upload.py
env:
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
- name: Cleanup artifacts
run: |
rm Datasets/BKM_*.csv
rm Data/BKM_*.csv
rm logs/BKM_*.log
- name: Upload artifact
uses: actions/upload-artifact@v3
with:
name: LogFiles
path: logs/BKM.log
- name: Rename log file with today's date
run: |
python Scripts/rename_log.py
- name: Commit and push changes
run: |
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
git pull origin main
git add logs/
git commit -m "Update BKM logs"
git push
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}