Discord Bot - Data Scraper


Python SQL Discord.py



About the project

In the early days of creating Discord bots, data storage needs were minimal, and text files sufficed for occasional use. As the complexity of my projects grew, so did the necessity for a more robust data storage solution. This prompted me to explore alternatives, leading me to adopt PostgreSQL. I now manage a database that contains over 25 tables and my scripts perform hundreds of queries per day. Some of those tables are used to collect staff message data to better review their performance on a month to month basis, that is the topic of this project.

 

In short, this Discord bot utilizes Discords api to scrape data from staff members and stores it in a PostgreSQL database. The data is then compiled on a month-to-month basis into csv and graphic formats to show monthly performance metrics. This data is then used to help review the staff members and their performance.

 

Recent bot updates:

Since I started the bot I have done a lot to make it more efficient and reliable. I have recently rewrote the entire code for the bot, you can find the improvements from this below. You can also find the code after the snippets below.

  • Modularity
    • Made the code more reusable by making new libraries that all my bots access.
  • Useless Code/Efficiency
    • Removed useless code such as conn.close(), as AsyncConnection will close automatically when exited.
  • Better Queries
    • As some of my old code resorted to sorting data outside of a query, I made my queries more complex to extract what I wanted in one simple line of code.
  • Cleanliness
    • Overall reduced the size of the bots code by over 60%.

Another update was a recent addition of Streamlit to easily isolate and graph all the data collected month-to-month since late 2022. An example snippet of the Streamlit App can be found below. It can also be found at https://www.kianjolley.com/notify/staff_data/ but it is blocked by Discord OAuth to only allow staff of Notify to access. 

 

 

Below you can find examples of the month-to-month data that is collected by the bot (Matplotlib for the visualization).

 

Example of month-to-month visualized data
Example of month-to-month visualized data

 

Small snippet of collected data, cells formatted as prev month/current month/% diff

 

Main function containing the Discord bot.

import os
import discord
from discord import app_commands
from discord.utils import get
import asyncio
import datetime
from datetime import date
import matplotlib.pyplot as plt
import time
from pytz import timezone
from decimal import Decimal
from dotenv import load_dotenv

load_dotenv()

from database import dbaccess
from utils import role_puller_utils

TOKEN = os.environ.get('ROLE_PULLER_TOKEN')
intents = discord.Intents().all()
intents.members = True
intents.guilds = True

GUILD_ID = int(os.environ.get('NOTIFYGUILDID'))

class Client(discord.Client):
    def __init__(self):
        super().__init__(intents = intents)
        self.synced = False
        asyncio.set_event_loop_policy(
            asyncio.WindowsSelectorEventLoopPolicy()
        )

    async def on_ready(self):
        await self.wait_until_ready()
        if not self.synced:
            await client.sync()
            self.synced = True
        print(f'{self.user} has connected to Discord!')

        #sets some variables used in welcome.py
        await role_puller_utils.set_vars(Client)

        await asyncio.gather(monthly_delete())

Client = Client()
client = app_commands.CommandTree(Client)

# deletes and graphs data monthly
async def monthly_delete():
    while True:
        await asyncio.sleep(3600)  # Wait 60 minutes to check

        # Get current month
        curr_month = await role_puller_utils.get_current_month()
        month = datetime.datetime.now().month

        try:
            if month != curr_month:
                print("Gathering monthly message data then deleting the data.")
                start = time.time()

                # Updating the month since it is a new month in this else
                query = "UPDATE datetracker set value = %s WHERE scriptid = %s"
                data = (month, 1)
                await dbaccess.write_data(query, data)

                # Create directory
                final_directory = os.path.join(os.getcwd(), f'staff_data/data_save_{date.today()}')
                os.makedirs(final_directory, exist_ok=True)

                # Fetch and export data for each category
                categories = [
                    "supporttickets", "staffcategory", "generalcategory",
                    "generalcategory2", "importantcategory", "sneakerinfocategory",
                    "sneakerreleasescategory", "canadacategory"
                ]
                for category_name in categories:
                    final_final_directory = os.path.join(final_directory, r'' + category_name)
                    if not os.path.exists(final_final_directory):
                        os.makedirs(final_final_directory)
                    ids, msgs, words, chars = await role_puller_utils.fetch_data(category_name)
                    names = await role_puller_utils.fetch_user_names(ids, Client)
                    await role_puller_utils.export_data_to_csv(ids, msgs, words, chars, names, category_name, final_directory)

                # Erase all rows in db
                await dbaccess.delete_all_rows()

                end = time.time()
                print(end - start)
        except Exception as e: 
            print(f"Error executing the monthly_delete function.\nError: {e}")

@Client.event
async def on_message(msg):

    if not msg.webhook_id:
        staffRole = discord.utils.get(msg.channel.guild.roles, name="Staff")
        # Support Tickets
        if msg.channel.category_id == 570248275374899200 and staffRole in msg.author.roles:
            words, chars = await role_puller_utils.text_data(msg)
            await role_puller_utils.readwrite("supporttickets", msg.author.id, msg.author, 1, words, chars, 0)

        # Staff
        if msg.channel.category_id == 570142944921911296 and staffRole in msg.author.roles:
            words, chars = await role_puller_utils.text_data(msg)
            await role_puller_utils.readwrite("staffcategory", msg.author.id, msg.author, 1, words, chars, 0)

        # General (Chat, Lifetime Chat)
        if msg.channel.category_id == 570142948155588608 and staffRole in msg.author.roles and msg.channel.id != 992842655346196490 and msg.channel.id != 570143030145974302 and msg.channel.id != 570143033337577482:
            words, chars = await role_puller_utils.text_data(msg)
            await role_puller_utils.readwrite("generalcategory", msg.author.id, msg.author, 1, words, chars, 0)

        # General (New Member Chat, Questions)
        if (msg.channel.id == 992842655346196490 or msg.channel.id == 570143030145974302 or msg.channel.category_id == 1054648731154251796) and staffRole in msg.author.roles:
            words, chars = await role_puller_utils.text_data(msg)
            await role_puller_utils.readwrite("generalcategory2", msg.author.id, msg.author, 1, words, chars, 0)

        # Important
        if msg.channel.category_id == 570142946272477184 and staffRole in msg.author.roles:
            words, chars = await role_puller_utils.text_data(msg)
            await role_puller_utils.readwrite("importantcategory", msg.author.id, msg.author, 1, words, chars, 0)

        # Sneaker Info
        if msg.channel.category_id == 1016090827073798164 and staffRole in msg.author.roles:
            words, chars = await role_puller_utils.text_data(msg)
            await role_puller_utils.readwrite("sneakerinfocategory", msg.author.id, msg.author, 1, words, chars, 0)

        # Sneaker Releases
        if msg.channel.category_id == 622110241882112011 and staffRole in msg.author.roles:
            words, chars = await role_puller_utils.text_data(msg)
            await role_puller_utils.readwrite("sneakerreleasescategory", msg.author.id, msg.author, 1, words, chars, 0)

        # Canada
        if msg.channel.category_id == 839185199673507901 and staffRole in msg.author.roles:
            words, chars = await role_puller_utils.text_data(msg)
            await role_puller_utils.readwrite("canadacategory", msg.author.id, msg.author, 1, words, chars, 0)

Client.run(TOKEN)

 

The next file contains some functions used by the main file to handle data visualization, data management, etc.

import discord
import os
import asyncio
import datetime
from datetime import date
import matplotlib.pyplot as plt
import numpy as np

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

from database import dbaccess
from embeds import embeds

GUILD = int(os.environ.get('NOTIFYGUILDID'))
NEWMEMBERROLE = int(os.environ.get('NEWMEMBERROLE'))
STAFFMODCHANNEL = int(os.environ.get('STAFFMODCHANNEL'))

Client = None

# Sets client var on bot start
async def set_vars(c):
    global Client

    Client = c

# Generates the graphs and saves it
async def graph(N, height1, height2, height3, tick_label, final_directory, end_directory):
    ind = np.arange(N)
    width = 0.25

    fig = plt.figure(facecolor = "white", figsize = (21,9), dpi = 300)
    #subplot to allow multiple plots
    ax = fig.add_subplot(111)
    ax2 = ax.twinx()
    #all 3 sets of data into one plot (msg,word,char)
    first = ax2.bar(ind, height1, width, color = ['aqua'])
    second = ax.bar(ind+width, height2, width, color = ['lightgreen'])
    third = ax.bar(ind+width+width, height3, width, color = ['fuchsia'])

    #formatting stuff for the graph
    ax.set_ylabel('Word/Char Totals')
    ax2.set_ylabel('Msg Totals')
    ax.set_title('Category Interaction Graph')
    ax.set_xticks(ind + width / 3)
    ax.set_xticklabels(tick_label)
    ax.legend( (first[0], second[0], third[0]),("Tot Msgs", "Tot Words", "Tot Chars"),loc='upper center', bbox_to_anchor=(1,-0.1) )
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.set_facecolor("black")
    #rotates the names 45 and changes font 
    fig.autofmt_xdate(rotation = 45)
    #fits everything to the borders
    fig.tight_layout()

    plt.savefig(final_directory + end_directory)

async def fetch_data(category_name):
    query = f"SELECT * FROM {category_name}"
    records = await dbaccess.get_data(query, None)
    ids, msgs, words, chars = [], [], [], []
    for row in records:
        ids.append(row[0])
        msgs.append(row[2])
        words.append(row[3])
        chars.append(row[4])
    return ids, msgs, words, chars

async def fetch_user_names(ids, client):
    names = []
    for i in ids:
        user = await client.fetch_user(i)
        names.append(user.name)
    return names

async def export_data_to_csv(ids, msgs, words, chars, names, category_name, final_directory):
    await graph(len(ids), msgs, words, chars, names, final_directory, f'/{category_name}/graph.png')
    csv_filename = f'{final_directory}/{category_name}/csv.csv'
    query = f"SELECT * FROM {category_name}"
    records = await dbaccess.get_data(query, None)
    column_names = ['nameid', 'name', 'msgcount', 'wordcount', 'charcount', 'engage']

    if records:
        # Write column headers to the CSV file
        with open(csv_filename, 'w') as csv_file:
            csv_file.write(",".join(column_names) + "\n")
        # Write data rows to the CSV file
        with open(csv_filename, 'a') as csv_file:
            for row in records:
                csv_file.write(",".join(map(str, row)) + "\n")
    else:
        print("Error with exporting to csv.")

async def get_current_month():
    query = "SELECT * FROM datetracker WHERE scriptid = 1"
    records = await dbaccess.get_data(query, None)
    for row in records:
        if row[0] == 1:
            return row[1]
    return 0  # Return 0 if no record found

async def text_data(msg):
    msgstring = msg.content
    words = len(msgstring.split())
    msgstring = msgstring.replace(" ", "")
    msgstring = msgstring.replace("\n", "")
    chars = len(msgstring)

    return  words, chars

async def readwrite(stringtable, auth, a, count, words, chars, engage):
    query = f"SELECT * FROM {stringtable} WHERE nameid = %s"
    records = await dbaccess.get_data(query, (auth,))

    for row in records:
        if row[0] == auth:
            count += row[2]
            words += row[3]
            chars += row[4]

    query = f"INSERT INTO {stringtable} (nameid, name, msgcount, wordcount, charcount, engage) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (nameid) DO UPDATE set (name, msgcount, wordcount, charcount, engage) = (EXCLUDED.name, EXCLUDED.msgcount, EXCLUDED.wordcount, EXCLUDED.charcount, EXCLUDED.engage)"
    data = (auth, str(a), count, words, chars, engage)
    await dbaccess.write_data(query, data)

 

The next part contains code for the database access, any query from the main file or the utils file will go through this.

import psycopg
from psycopg import sql
import pandas as pd
import logging
import os
from dotenv import load_dotenv, find_dotenv

from global_vars.global_vars import GlobalVariables

load_dotenv(find_dotenv())

log_file_path = os.path.join('logs', 'dblog.log')
logging.basicConfig(level=os.environ.get("LOGLEVEL", "DEBUG"), filename=log_file_path, filemode='a', format='%(asctime)s - [Line:%(lineno)d - Function:%(funcName)s In:%(filename)s From:%(name)s] \n[%(levelname)s] - %(message)s \n', datefmt='%d-%b-%y %H:%M:%S')

DBTable = os.environ.get('DBTable')
DBHost = os.environ.get('DBHost')
DBUsr = os.environ.get('DBUsr')
DBPass = os.environ.get('DBPass')
DBPort = os.environ.get('DBPort')

# Returns wanted information as a df from the database given a query.
async def get_data_as_dataframe(query):
    records = None
    async with await psycopg.AsyncConnection.connect(
        dbname=DBTable,
        user=DBUsr,
        password=DBPass,
        host=DBHost,
        port=DBPort
    ) as conn:
        try:
            logging.info("Opened database successfully")
            async with conn.cursor() as curr:
                # Execute the query to fetch the specific data
                await curr.execute(query)
                records = await curr.fetchall()
                logging.info("Pulled values")

                # Execute a separate query to get the total number of rows in the table
                total_query = "SELECT COUNT(*) FROM ticketsurvey"
                await curr.execute(total_query)
                total_rows = await curr.fetchone()
                total_rows = total_rows[0] if total_rows else 0
                logging.info("Fetched total rows")

        except Exception as e:
            logging.error(e)
            return pd.DataFrame(), 0  # Return an empty DataFrame and total rows 0 in case of error

    # Convert fetched records into a DataFrame
    df = pd.DataFrame(records)  # Specify column names
    return df, total_rows

# Returns wanted information from the database given a query. Set value to None if you aren't passing in data.
async def get_data(query, val):
    records = None
    async with await psycopg.AsyncConnection.connect(dbname=DBTable, user=DBUsr, password=DBPass, host=DBHost, port=DBPort) as conn:
        try:
            logging.info("Opened database successfully")
            async with conn.cursor() as curr:
                if val != None:
                    await curr.execute(query, val)
                else:
                    await curr.execute(query)
                records = await curr.fetchall()
                logging.info("Pulled values")

                logging.info("Fetched total rows")

        except Exception as e:
            logging.error(e)
            
    return records

async def write_data(query, data):
    async with await psycopg.AsyncConnection.connect(dbname = DBTable, user = DBUsr, password = DBPass, host = DBHost, port = DBPort) as conn:
        try:
            logging.info("Opened database successfully")
            async with conn.cursor() as curr:
                await curr.execute(query, data)
                await conn.commit()
                logging.info("Written values and closed")

        except Exception as e:
            logging.error(e)

async def write_to_db(query1, query2, payload, user):
    async with await psycopg.AsyncConnection.connect(dbname = DBTable, user = DBUsr, password = DBPass, host = DBHost, port = DBPort) as conn:
        try:
            logging.info("Opened database successfully")
            async with conn.cursor() as curr:

                await curr.execute(query1, user)
                ticketsurvey = await curr.fetchall()

                for row in ticketsurvey:
                    chid = row[0]

                #data for the %s value
                data = (chid, user, payload)
                await curr.execute(query2, data)
                await conn.commit()
                logging.info("Written values and closed")

        except Exception as e:
            logging.error(e)

async def delete_all_rows():
    async with await psycopg.AsyncConnection.connect(dbname = DBTable, user = DBUsr, password = DBPass, host = DBHost, port = DBPort) as conn:
        try:
            logging.info("Opened database successfully")
            async with conn.cursor() as curr:
                sql = "DELETE FROM supporttickets"
                await curr.execute(sql)
                sql = "DELETE FROM staffcategory"
                await curr.execute(sql)
                sql = "DELETE FROM generalcategory"
                await curr.execute(sql)
                sql = "DELETE FROM generalcategory2"
                await curr.execute(sql)
                sql = "DELETE FROM importantcategory"
                await curr.execute(sql)
                sql = "DELETE FROM sneakerinfocategory"
                await curr.execute(sql)
                sql = "DELETE FROM sneakerreleasescategory"
                await curr.execute(sql)
                sql = "DELETE FROM canadacategory"
                await curr.execute(sql)
                await conn.commit()
                logging.info("Updated and closed")

        except Exception as e:
            logging.error(e)