Pandas AI

Introduction

  • Pandas is the most popular open-source Python library used for data manipulation and analysis.

    If you are familiar with Excel or SQL, you can think of Pandas as a "programmable Excel" that is much faster, more powerful, and capable of handling millions of rows of data.

  • By importing the excel or SQL to table format , it is powerfully for analysis the data , such as calculating sum, plotting graph, cleaning data, grouping data, ... .

  • By Integrating with AI, the prompt with natural language can convert into pandas function to obtain the calculated result, suitable for creating a chatbot of data dashboard

Example

def chat_with_sql():
    llm = LiteLLM(
        model="gpt-4.1", 
        api_key="sk-wx6xklvYEtIrd-CQuQcqNA", 
        base_url="https://ai.hld.com/lite-llm/"
    )
    pai.config.set({
        "llm": llm
    })

    # Connect to MSSQL
    conn_str = (
        "DRIVER={ODBC Driver 17 for SQL Server};"
        "SERVER=CLUSQL01;"
        "DATABASE=HLDCCSCPRDB_report;"
        "UID=HLDCCSCPRDBViewer;"
        "PWD=EFbG5BVVNx!hgxox6#5EMkzd4ks*EN;"
    )
    conn = pyodbc.connect(conn_str)

    query = """
    SELECT
        PtyCode AS ptycode,
        ChnName,
        PtyCode + ' - ' + ChnName as PtycodeCn,
        SUM(TotalUnit) AS TotalUnit,
        SUM(AvailableUnits) AS AvailableUnits,
        SUM(AvailableLongTermUnits) AS AvailableLongTermUnits,
        SUM(RentedUnit) AS RentedUnit,     
        CASE 
            WHEN SUM(AvailableLongTermUnits) = 0 THEN 0 
            ELSE SUM(RentedUnit) * 1.0 / SUM(AvailableLongTermUnits) 
        END AS RentRatio,
        TakenDayTime
    FROM 
        vwDTOverallRentalDetail
    WHERE 
        CarType = 'ALL'
    GROUP BY 
        PtyCode,
        ChnName,
        TakenDayTime,
        PtyCode + ' - ' + ChnName
    """

    # Load data into pandas DataFrame
    rental_df = pd.read_sql(
        query, 
        conn, 
    )
    print(rental_df)
    conn.close()

    # Define column descriptions for better AI understanding
    columns_dict = {
        "ptycode": {"type": "string", "description": "Property code identifier (unique per property)"},
        "ChnName": {"type": "string", "description": "Chinese name of the property"},
        "PtycodeCn": {"type": "string", "description": "Combined property code and Chinese name"},
        "TotalUnit": {"type": "integer", "description": "Total number of units for this property"},
        "AvailableUnits": {"type": "integer", "description": "Number of available units for this property"},
        "AvailableLongTermUnits": {"type": "integer", "description": "Number of available long-term rental units. Used as denominator when calculating rent ratio."},
        "RentedUnit": {"type": "integer", "description": "Number of rented units. Used as numerator when calculating rent ratio."},
        "RentRatio": {"type": "float", "description": "Per-record rental ratio for this single property = RentedUnit / AvailableLongTermUnits. WARNING: Do NOT use SUM(RentRatio) for aggregate calculations. To calculate total/overall/aggregate rent ratio across multiple records, always use: SUM(RentedUnit) / SUM(AvailableLongTermUnits)"},
        "TakenDayTime": {"type": "datetime", "description": "The date when the data was recorded. Format is YYYY-MM-DD. Use this column for date filtering. Each property has one record per date."}
    }
    columns_list = [{"name": name, **props} for name, props in columns_dict.items()]

    # Wrap pandas DataFrame with pai.DataFrame for PandasAI compatibility
    pai_df = pai.DataFrame(rental_df)

    # Load existing dataset or create new one
    dataset_path = "company/rental-detail"
    try:
        df = pai.load(dataset_path)
    except Exception:
        df = pai.create(
            path=dataset_path,
            df=pai_df,
            description="""Rental detail data from vwDTOverallRentalDetail view showing property rental statistics. 
Each row represents one property on a specific date (TakenDayTime). 
Contains: total units, available units, rented units and rental ratio per property per date.
""",
            columns=columns_list
        )

    # Chat with your data

    # Data Set 1: Overall
    response1 = df.chat("What is the total rent ratio in November 2025?")
    print(response1)
    response2 = df.chat("What is the total number of rented units in November 2025?")
    print(response2)
    response3 = df.chat("What is the total number of available long-term rental units in November 2025?")
    print(response3)
    response4 = df.chat("What is the total number of total units in November 2025?")
    print(response4)
    response5 = df.chat("Comparing October 2025 and November 2025, what is the change in the total rent ratio?")
    print(response5)
    response6 = df.chat("Comparing October 2025 and November 2025, what is the change in the total number of rented units?")
    print(response6)
    response7 = df.chat("Comparing October 2025 and November 2025, what is the change in the total number of available long-term rental units?")
    print(response7)
    response8 = df.chat("Comparing October 2025 and November 2025, what is the change in the total number of total units?")
    print(response8)

    # Data Set 2: Property Level
    response9 = df.chat("What is the rent ratio for 友邦廣場 in November 2025?")
    response10 = df.chat("What is the top 20 highest rent ratio properties in November 2025?")
    print(response9)
    print(response10)
  • Result

Reference

Last updated

Was this helpful?