diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 51fcbfd..acd54a9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,5 @@ # Task 6: Build a CI workflow that runs on pull requests and pushes to main. +# Task 6: Build a CI workflow (Triggering pipeline run) # # See the assignment chapter for the required steps and commands. # Fill in the TODO values below. @@ -15,7 +16,7 @@ name: CI on: push: - branches: ["TODO-replace-with-main"] + branches: ["main"] pull_request: jobs: @@ -29,10 +30,27 @@ jobs: - name: Install dependencies run: pip install -r requirements.txt - name: Lint - run: echo "TODO implement this step" + run: ruff check src - name: Format - run: echo "TODO implement this step" + run: ruff format --check src - name: Test - run: echo "TODO implement this step" + run: pytest -q - name: Build image - run: echo "TODO implement this step" + run: docker build -t mareh-aboghanem-pipeline:${{ github.sha }} . + # The next three steps need the AZURE_CREDENTIALS secret, which GitHub + # withholds on pull_request runs from forks for security. The `if:` guard + # skips them on fork PRs so CI stays green; the central `Grade ACR push` + # workflow on main handles the actual push from base-repo context. 
+ - name: Azure login + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false + uses: azure/login@v2 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: ACR login + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false + run: az acr login --name hyfregistry + - name: Push image + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false + run: | + docker tag mareh-aboghanem-pipeline:${{ github.sha }} hyfregistry.azurecr.io/mareh-aboghanem-pipeline:${{ github.sha }} + docker push hyfregistry.azurecr.io/mareh-aboghanem-pipeline:${{ github.sha }} diff --git a/.gitignore b/.gitignore index ec8f344..e320bfa 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ Thumbs.db # hyf .hyf/score.json +#My files +.my_side/ + # Editor and IDE settings .vscode/ .idea/ diff --git a/Dockerfile b/Dockerfile index d665b11..3a30da9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,15 +10,15 @@ # Replace each TODO comment with the correct Dockerfile instruction. # TODO: set the base image -FROM TODO +FROM python:3.11-slim WORKDIR /app # TODO: copy requirements.txt (before source — this keeps the install layer cached) - +COPY requirements.txt . 
# TODO: install dependencies - +RUN pip install -r requirements.txt # TODO: copy source code - +COPY src/ src # TODO: set the command that runs when the container starts -CMD ["TODO"] +CMD ["python", "-m", "src.pipeline"] diff --git a/assets/acr_push_week5.png b/assets/acr_push_week5.png new file mode 100644 index 0000000..dd04e4f Binary files /dev/null and b/assets/acr_push_week5.png differ diff --git a/data/messy_customers.csv b/data/messy_customers.csv new file mode 100644 index 0000000..31b2e1f --- /dev/null +++ b/data/messy_customers.csv @@ -0,0 +1,38 @@ +customer_email,customer_name,region,signup_date,loyalty_tier +alice@example.com,Alice van den Berg,NL,2023-11-15,Gold +Bob@Company.COM,Bob De Smet,BE,2023-12-01,Silver +charlie@work.org,Charlie Müller,DE,2024-01-10,Bronze +dave@email.com,,NL,2024-01-15,Silver +eve@startup.io,Eve Jansen,NL,2024-02-01,Gold +frank@corp.com,Frank Dubois,FR,2024-02-14,Bronze +Grace@University.EDU,Grace van Dijk,NL,2024-03-01,Silver +henry@business.com,Henry Peeters,BE,2024-03-10,Bronze +ivan@email.com,Ivan Schneider,DE,2024-03-15,Silver +jenny@work.org,Jenny Laurent,FR,2024-01-20,Gold +karl@startup.io,Karl Bakker,NL,2024-02-28,Bronze +lena@mail.nl,Lena de Vries,NL,2024-04-01,Silver +Marco@Business.DE,Marco Weber,DE,2024-04-10,Gold +nina@university.edu,,NL,2024-04-15,Bronze +oliver@corp.com,Oliver Martin,FR,2024-05-01,Silver +Paula@Startup.IO,Paula Visser,NL,2024-05-10,Bronze +quinn@work.org,Quinn Claes,BE,2024-05-15,Silver +rachel@email.com,Rachel Schmitt,DE,not_a_date,Gold +simon@company.com,Simon Leroy,FR,2024-06-01,Bronze +tina@mail.nl,Tina Meijer,NL,2024-06-05,Silver +uwe@business.de,Uwe Fischer,DE,2024-06-10,Gold +vera@mail.nl,Vera Willems,BE,2024-06-15,Bronze +wendy@startup.io,Wendy van Leeuwen,NL,2024-01-05,Silver +xander@corp.com,Xander Moreau,FR,2024-02-20,Gold +Yara@University.EDU,Yara Hendriks,NL,2024-03-25,Bronze +zach@email.com,Zach Bauer,DE,2024-04-05,Silver +orphan_customer1@mail.nl,Dirk Janssen,NL,2024-01-01,Gold 
+orphan_customer2@business.de,Petra Hofmann,DE,2024-02-15,Bronze +orphan_customer3@work.fr,Louis Petit,FR,2024-03-20,Silver +orphan_customer4@mail.be,,BE,invalid_date,Gold +sam@company.com,Sam de Groot,NL,2024-05-20,Silver +lisa@email.com,Lisa Maes,BE,2024-06-01,Bronze +tom@corp.com,Tom Bernard,FR,2024-04-18,Gold +emma@startup.io,Emma van Houten,NL,2024-03-12,Silver +jan@mail.nl,Jan de Boer,NL,2024-02-08,Bronze +sophie@business.de,Sophie Klein,DE,2024-05-25,Gold +orphan_customer5@university.edu,Maria Garcia,FR,2024-06-20,Silver diff --git a/data/messy_sales.csv b/data/messy_sales.csv new file mode 100644 index 0000000..77ce766 --- /dev/null +++ b/data/messy_sales.csv @@ -0,0 +1,123 @@ +transaction_id,product_name,category,price,quantity,customer_email,date +1, laptop PRO ,Electronics,999.99,2,alice@example.com,2024-03-15 +2,WIRELESS MOUSE,Electronics,29.99,5, BOB@Company.COM ,2024-03-15 +3, usb cable,Electronics,4.99,10,,2024-03-16 +4, Office Chair ,Furniture,349.50,1,charlie@work.org,2024-03-16 +5,standing DESK,Furniture,599.00,1,charlie@work.org,not_a_date +6,,Electronics,19.99,3,dave@email.com,2024-03-17 +7, Mechanical Keyboard ,Electronics,-89.99,2,eve@startup.io,2024-03-17 +8,monitor ARM,Furniture,79.99,0,frank@corp.com,2024-03-18 +9, WEBCAM hd ,Electronics,54.99,1, ,2024-03-18 +10,desk lamp,Furniture,34.99,4,grace@university.edu,2024-03-19 +11, NOISE CANCELLING headphones,Electronics,199.99,1,alice@example.com,2024-03-19 +12,cable management KIT,Furniture,15.99,6,henry@business.com,2024-03-20 +13, ergonomic MOUSE PAD ,Furniture,24.99,3,ivan@email.com,2024-03-20 +14,laptop STAND,Furniture,45.99,2,jenny@work.org,2024-03-21 +15, BLUETOOTH speaker,,39.99,1,karl@startup.io,2024-03-21 +16,Docking Station,Electronics,129.99,2,alice@example.com,2024-03-22 +17, USB HUB ,Electronics,24.99,3,bob@company.com,2024-03-23 +18,WIRELESS CHARGER,Electronics,35.99,1,lena@mail.nl,2024-03-24 +19, desk organizer,Furniture,22.50,2,marco@business.de,2024-03-25 +20,Monitor 27 
Inch,Electronics,449.00,1,nina@university.edu,2024-03-26 +21, STANDING desk MAT ,Furniture,67.99,1,oliver@corp.com,2024-03-27 +22,webcam RING LIGHT,Electronics,19.99,4,paula@startup.io,2024-03-28 +23,Laptop Bag,Accessories,49.99,2,quinn@work.org,2024-03-29 +24, mouse pad XL,Furniture,18.99,3,rachel@email.com,2024-03-30 +25,power STRIP,Electronics,14.99,5,simon@company.com,2024-03-31 +26, Ergonomic Footrest ,Furniture,45.00,1,tina@mail.nl,2024-04-01 +27,HDMI Cable,Electronics,9.99,8,alice@example.com,2024-04-02 +28, wireless KEYBOARD ,Electronics,69.99,2,bob@company.com,2024-04-03 +29,Desk Shelf,Furniture,89.99,1,charlie@work.org,2024-04-04 +30, SCREEN PROTECTOR ,Accessories,12.99,6,dave@email.com,2024-04-05 +31,,Furniture,199.99,1,eve@startup.io,2024-04-06 +32,Adjustable Monitor Stand,Furniture,159.00,1,frank@corp.com,2024-04-07 +33, usb c ADAPTER ,Electronics,15.99,3,grace@university.edu,2024-04-08 +34,Cable Clips,Accessories,7.99,10,unknown_buyer@gmail.com,2024-04-09 +35,LAPTOP COOLING pad,Electronics,32.99,2,ivan@email.com,2024-04-10 +36, desk PLANT pot ,Furniture,11.99,4,jenny@work.org,2024-04-11 +37,Webcam Cover,Accessories,4.99,15,karl@startup.io,2024-04-12 +38, MECHANICAL keyboard ,Electronics,-149.99,1,lena@mail.nl,2024-04-13 +39,Office Lamp LED,Furniture,54.99,2,marco@business.de,2024-04-14 +40, ethernet cable ,Electronics,8.99,7,nina@university.edu,2024-04-15 +41,Standing Desk Frame,Furniture,399.00,1,oliver@corp.com,2024-04-16 +42, wireless PRESENTER ,Electronics,29.99,2,paula@startup.io,2024-04-17 +43,Drawer Organizer,Furniture,16.99,3,quinn@work.org,2024-04-18 +44,,Electronics,44.99,2,rachel@email.com,2024-04-19 +45, PORTABLE monitor ,Electronics,279.99,1,simon@company.com,2024-04-20 +46,Desk Cable Tray,Furniture,24.99,2,tina@mail.nl,2024-04-21 +47, noise machine ,,39.99,1,uwe@business.de,2024-04-22 +48,LAPTOP RISER,Furniture,55.99,1,vera@mail.nl,2024-04-23 +49, usb MICROPHONE ,Electronics,79.99,1,wendy@startup.io,2024-04-24 +50,Whiteboard 
Small,Furniture,29.99,2,xander@corp.com,2024-04-25 +51, POWER bank ,Electronics,49.99,3,yara@university.edu,2024-04-26 +52,Desk Mat Large,Furniture,34.99,1,zach@email.com,2024-04-27 +53,Monitor Light Bar,Electronics,59.99,1,alice@example.com,2024-04-28 +54, FILE organizer ,Furniture,13.99,4,bob@company.com,2024-04-29 +55,Webcam Tripod,Accessories,21.99,2,charlie@work.org,2024-04-30 +56, SURGE protector ,Electronics,-19.99,3,dave@email.com,2024-05-01 +57,Keyboard Wrist Rest,Furniture,17.99,2,eve@startup.io,2024-05-02 +58, hdmi SPLITTER ,Electronics,22.99,2,frank@corp.com,2024-05-03 +59,Desk Fan USB,Electronics,15.99,3,temp_worker@messycorp.com,2024-05-04 +60, bookend SET ,Furniture,19.99,2,henry@business.com,2024-05-05 +61,Wireless Mouse Pad,Electronics,42.99,1,ivan@email.com,2024-05-06 +62,,,29.99,2,jenny@work.org,2024-05-07 +63, LAPTOP sleeve ,Accessories,27.99,2,karl@startup.io,2024-05-08 +64,Smart Power Strip,Electronics,39.99,1,lena@mail.nl,2024-05-09 +65, monitor CLEANING kit ,Accessories,11.99,5,marco@business.de,2024-05-10 +66,Desk Drawer Unit,Furniture,129.99,1,nina@university.edu,2024-05-11 +67, PHONE stand ,Accessories,14.99,3,oliver@corp.com,2024-05-12 +68,USB Docking Hub,Electronics,89.99,1,paula@startup.io,2024-05-13 +69, cable MANAGEMENT box ,Furniture,21.99,2,quinn@work.org,2024-05-14 +70,LED Desk Lamp,Furniture,44.99,2,rachel@email.com,2024-05-15 +71,Portable Charger,Electronics,34.99,0,simon@company.com,2024-05-16 +72, OFFICE whiteboard ,Furniture,79.99,1,tina@mail.nl,2024-05-17 +73,Mouse Bungee,,16.99,2,uwe@business.de,2024-05-18 +74, laptop DOCKING station ,Electronics,189.99,1,vera@mail.nl,2024-05-19 +75,Paper Tray,Furniture,9.99,4,,2024-05-20 +76, BLUETOOTH adapter ,Electronics,12.99,3,wendy@startup.io,2024-05-21 +77,Monitor Privacy Screen,Accessories,49.99,1,xander@corp.com,2024-05-22 +78, document HOLDER ,Furniture,23.99,2,yara@university.edu,2024-05-23 +79,Portable SSD,Electronics,119.99,1,zach@email.com,2024-05-24 +80, DESK calendar 
,Furniture,8.99,5,alice@example.com,2024-05-25 +81,Webcam HD Pro,Electronics,89.99,1,bob@company.com,2024-05-26 +82, pen HOLDER ,Furniture,6.99,6,charlie@work.org,2024-05-27 +83,Gaming Mouse,Electronics,59.99,2,dave@email.com,2024-05-28 +16,Docking Station,Electronics,129.99,2,alice@example.com,2024-03-22 +84, WIRELESS earbuds ,Electronics,79.99,1,eve@startup.io,2024-05-29 +85,Desk Hutch,Furniture,199.99,1,frank@corp.com,2024-05-30 +86, usb FAN ,Electronics,11.99,3,grace@university.edu,2024-05-31 +87,Keyboard Cover,Accessories,9.99,4,freelancer@outlook.com,2024-06-01 +88, MINI projector ,Electronics,299.99,1,ivan@email.com,2024-06-02 +89,Foot Hammock,Furniture,18.99,2,jenny@work.org,2024-06-03 +90, screen CLEANING wipes ,Accessories,5.99,10,karl@startup.io,2024-06-04 +91,Portable Speaker,Electronics,44.99,1,lena@mail.nl,2024-06-05 +92,,Accessories,14.99,3,marco@business.de,2024-06-06 +93, LAPTOP lock ,Accessories,24.99,2,nina@university.edu,2024-06-07 +94,Desk Pad Leather,Furniture,64.99,1,oliver@corp.com,2024-06-08 +95, wireless HEADSET ,Electronics,149.99,1,paula@startup.io,2024-06-09 +96,Clip-on Desk Light,Furniture,27.99,3,quinn@work.org,2024-06-10 +97,USB Wall Charger,Electronics,18.99,4,intern2024@messycorp.com,2024-06-11 +98, DESK storage BOX ,Furniture,32.99,2,simon@company.com,2024-06-12 +99,Portable Monitor Stand,Furniture,44.99,0,tina@mail.nl,2024-06-13 +100, webcam LIGHT ,Electronics,22.99,2,uwe@business.de,2024-06-14 +101,Desk Shelf Riser,Furniture,37.99,1,vera@mail.nl,2024-06-15 +102, CABLE tester ,Electronics,29.99,1,wendy@startup.io,2024-06-16 +103,Under Desk Drawer,Furniture,41.99,1,xander@corp.com,2024-06-17 +53,Monitor Light Bar,Electronics,59.99,1,alice@example.com,2024-04-28 +104, SD card READER ,Electronics,14.99,3,yara@university.edu,2024-06-18 +105,Desktop Organizer Set,Furniture,54.99,1,zach@email.com,2024-06-19 +106, GAMING keyboard ,Electronics,129.99,1,alice@example.com,2024-06-20 +107,Laptop Stand 
Pro,Furniture,4999.99,1,bob@company.com,2024-06-21 +108, usb LIGHT strip ,,16.99,3,charlie@work.org,2024-06-22 +109,Desk Clock Digital,Furniture,19.99,2,dave@email.com,2024-06-23 +110, WIRELESS trackball ,Electronics,64.99,1,eve@startup.io,2024-06-24 +111,Office Partition,Furniture,2499.00,1,frank@corp.com,2024-06-25 +112, laptop SCREEN cleaner ,Accessories,7.99,5,grace@university.edu,2024-06-26 +113,Smart Plug,Electronics,19.99,4,henry@business.com,2024-06-27 +114, DESK lamp WIRELESS ,Furniture,69.99,1,ivan@email.com,2024-06-28 +115,Monitor Arm Dual,Furniture,179.99,1,jenny@work.org,2024-06-29 +116, PHONE charger ,Electronics,12.99,5, ,2024-06-30 +117,Keyboard Cleaner,Accessories,8.99,3,lena@mail.nl,2024-04-22 +118, desk TIDY caddy ,Furniture,15.99,2,marco@business.de,2024-13-01 +119,USB Hub Powered,Electronics,34.99,2,nina@university.edu, +120, WEBCAM stand ,Accessories,18.99,1,oliver@corp.com,2024-05-15 diff --git a/requirements.txt b/requirements.txt index 42299cf..0a141b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,8 @@ -# Task 2: Pin every dependency your pipeline uses. 
-# -# Format: package==version -# Example: requests==2.31.0 -# -# Find the current version of any package: -# pip show -# -# Always include pytest and ruff: -# pytest== -# ruff== -# -# Add your pinned dependencies below: +azure-storage-blob +matplotlib==3.10.9 +pandas==3.0.3 +pytest==9.0.3 +python-dotenv==1.2.2 +pyarrow +azure-identity +ruff diff --git a/src/clean.py b/src/clean.py new file mode 100644 index 0000000..f785e3a --- /dev/null +++ b/src/clean.py @@ -0,0 +1,42 @@ +import logging +from pathlib import Path +import pandas as pd + + +def load_and_explore(data_dir: Path) -> tuple[pd.DataFrame, pd.DataFrame]: + data_sales = pd.read_csv(data_dir / "messy_sales.csv") + data_customers = pd.read_csv(data_dir / "messy_customers.csv") + + logging.info("--- Exploring Data ---") + logging.info("\n== Sales Data ==") + data_sales.info() + logging.info(f"\nDescribe:\n{data_sales.describe()}") + logging.info(f"\nRows:\n{data_sales.head(20)}") + logging.info(f"\nMissing Values:\n{data_sales.isna().sum()}") + logging.info("Exploration of Sales Data is complete.") + logging.info("\n== Customers Data ==") + data_customers.info() + logging.info(f"\nDescribe:\n{data_customers.describe()}") + logging.info(f"\nFirst Rows:\n{data_customers.head(20)}") + logging.info(f"\nMissing Values:\n{data_customers.isna().sum()}") + logging.info("Exploration of Customers Data is complete.") + return data_sales, data_customers + + +def clean_sales(sales: pd.DataFrame) -> pd.DataFrame: + product_name = sales["product_name"].str.strip().str.title() + sales["product_name"] = product_name + customer_email = sales["customer_email"].str.lower().str.strip() + sales["customer_email"] = customer_email + price = pd.to_numeric(sales["price"], errors="coerce") + sales["price"] = price + date = pd.to_datetime(sales["date"], errors="coerce") + sales["date"] = date + sales = sales.dropna(subset=["product_name"]) + sales = sales[sales["price"] >= 0] + sales = sales[sales["quantity"] > 0] + sales = 
sales.dropna(subset=["date"]) + sales = sales.drop_duplicates(subset="transaction_id", keep="first") + logging.info(f"cleaning complete. Rows remaining: {len(sales)}") + # Decision: Leave outlier prices as they are. Why? Because I think they could be valid values and I need to understand the detalis of the data. + return sales diff --git a/src/data/messy_customers.csv b/src/data/messy_customers.csv new file mode 100644 index 0000000..31b2e1f --- /dev/null +++ b/src/data/messy_customers.csv @@ -0,0 +1,38 @@ +customer_email,customer_name,region,signup_date,loyalty_tier +alice@example.com,Alice van den Berg,NL,2023-11-15,Gold +Bob@Company.COM,Bob De Smet,BE,2023-12-01,Silver +charlie@work.org,Charlie Müller,DE,2024-01-10,Bronze +dave@email.com,,NL,2024-01-15,Silver +eve@startup.io,Eve Jansen,NL,2024-02-01,Gold +frank@corp.com,Frank Dubois,FR,2024-02-14,Bronze +Grace@University.EDU,Grace van Dijk,NL,2024-03-01,Silver +henry@business.com,Henry Peeters,BE,2024-03-10,Bronze +ivan@email.com,Ivan Schneider,DE,2024-03-15,Silver +jenny@work.org,Jenny Laurent,FR,2024-01-20,Gold +karl@startup.io,Karl Bakker,NL,2024-02-28,Bronze +lena@mail.nl,Lena de Vries,NL,2024-04-01,Silver +Marco@Business.DE,Marco Weber,DE,2024-04-10,Gold +nina@university.edu,,NL,2024-04-15,Bronze +oliver@corp.com,Oliver Martin,FR,2024-05-01,Silver +Paula@Startup.IO,Paula Visser,NL,2024-05-10,Bronze +quinn@work.org,Quinn Claes,BE,2024-05-15,Silver +rachel@email.com,Rachel Schmitt,DE,not_a_date,Gold +simon@company.com,Simon Leroy,FR,2024-06-01,Bronze +tina@mail.nl,Tina Meijer,NL,2024-06-05,Silver +uwe@business.de,Uwe Fischer,DE,2024-06-10,Gold +vera@mail.nl,Vera Willems,BE,2024-06-15,Bronze +wendy@startup.io,Wendy van Leeuwen,NL,2024-01-05,Silver +xander@corp.com,Xander Moreau,FR,2024-02-20,Gold +Yara@University.EDU,Yara Hendriks,NL,2024-03-25,Bronze +zach@email.com,Zach Bauer,DE,2024-04-05,Silver +orphan_customer1@mail.nl,Dirk Janssen,NL,2024-01-01,Gold +orphan_customer2@business.de,Petra 
Hofmann,DE,2024-02-15,Bronze +orphan_customer3@work.fr,Louis Petit,FR,2024-03-20,Silver +orphan_customer4@mail.be,,BE,invalid_date,Gold +sam@company.com,Sam de Groot,NL,2024-05-20,Silver +lisa@email.com,Lisa Maes,BE,2024-06-01,Bronze +tom@corp.com,Tom Bernard,FR,2024-04-18,Gold +emma@startup.io,Emma van Houten,NL,2024-03-12,Silver +jan@mail.nl,Jan de Boer,NL,2024-02-08,Bronze +sophie@business.de,Sophie Klein,DE,2024-05-25,Gold +orphan_customer5@university.edu,Maria Garcia,FR,2024-06-20,Silver diff --git a/src/data/messy_sales.csv b/src/data/messy_sales.csv new file mode 100644 index 0000000..77ce766 --- /dev/null +++ b/src/data/messy_sales.csv @@ -0,0 +1,123 @@ +transaction_id,product_name,category,price,quantity,customer_email,date +1, laptop PRO ,Electronics,999.99,2,alice@example.com,2024-03-15 +2,WIRELESS MOUSE,Electronics,29.99,5, BOB@Company.COM ,2024-03-15 +3, usb cable,Electronics,4.99,10,,2024-03-16 +4, Office Chair ,Furniture,349.50,1,charlie@work.org,2024-03-16 +5,standing DESK,Furniture,599.00,1,charlie@work.org,not_a_date +6,,Electronics,19.99,3,dave@email.com,2024-03-17 +7, Mechanical Keyboard ,Electronics,-89.99,2,eve@startup.io,2024-03-17 +8,monitor ARM,Furniture,79.99,0,frank@corp.com,2024-03-18 +9, WEBCAM hd ,Electronics,54.99,1, ,2024-03-18 +10,desk lamp,Furniture,34.99,4,grace@university.edu,2024-03-19 +11, NOISE CANCELLING headphones,Electronics,199.99,1,alice@example.com,2024-03-19 +12,cable management KIT,Furniture,15.99,6,henry@business.com,2024-03-20 +13, ergonomic MOUSE PAD ,Furniture,24.99,3,ivan@email.com,2024-03-20 +14,laptop STAND,Furniture,45.99,2,jenny@work.org,2024-03-21 +15, BLUETOOTH speaker,,39.99,1,karl@startup.io,2024-03-21 +16,Docking Station,Electronics,129.99,2,alice@example.com,2024-03-22 +17, USB HUB ,Electronics,24.99,3,bob@company.com,2024-03-23 +18,WIRELESS CHARGER,Electronics,35.99,1,lena@mail.nl,2024-03-24 +19, desk organizer,Furniture,22.50,2,marco@business.de,2024-03-25 +20,Monitor 27 
Inch,Electronics,449.00,1,nina@university.edu,2024-03-26 +21, STANDING desk MAT ,Furniture,67.99,1,oliver@corp.com,2024-03-27 +22,webcam RING LIGHT,Electronics,19.99,4,paula@startup.io,2024-03-28 +23,Laptop Bag,Accessories,49.99,2,quinn@work.org,2024-03-29 +24, mouse pad XL,Furniture,18.99,3,rachel@email.com,2024-03-30 +25,power STRIP,Electronics,14.99,5,simon@company.com,2024-03-31 +26, Ergonomic Footrest ,Furniture,45.00,1,tina@mail.nl,2024-04-01 +27,HDMI Cable,Electronics,9.99,8,alice@example.com,2024-04-02 +28, wireless KEYBOARD ,Electronics,69.99,2,bob@company.com,2024-04-03 +29,Desk Shelf,Furniture,89.99,1,charlie@work.org,2024-04-04 +30, SCREEN PROTECTOR ,Accessories,12.99,6,dave@email.com,2024-04-05 +31,,Furniture,199.99,1,eve@startup.io,2024-04-06 +32,Adjustable Monitor Stand,Furniture,159.00,1,frank@corp.com,2024-04-07 +33, usb c ADAPTER ,Electronics,15.99,3,grace@university.edu,2024-04-08 +34,Cable Clips,Accessories,7.99,10,unknown_buyer@gmail.com,2024-04-09 +35,LAPTOP COOLING pad,Electronics,32.99,2,ivan@email.com,2024-04-10 +36, desk PLANT pot ,Furniture,11.99,4,jenny@work.org,2024-04-11 +37,Webcam Cover,Accessories,4.99,15,karl@startup.io,2024-04-12 +38, MECHANICAL keyboard ,Electronics,-149.99,1,lena@mail.nl,2024-04-13 +39,Office Lamp LED,Furniture,54.99,2,marco@business.de,2024-04-14 +40, ethernet cable ,Electronics,8.99,7,nina@university.edu,2024-04-15 +41,Standing Desk Frame,Furniture,399.00,1,oliver@corp.com,2024-04-16 +42, wireless PRESENTER ,Electronics,29.99,2,paula@startup.io,2024-04-17 +43,Drawer Organizer,Furniture,16.99,3,quinn@work.org,2024-04-18 +44,,Electronics,44.99,2,rachel@email.com,2024-04-19 +45, PORTABLE monitor ,Electronics,279.99,1,simon@company.com,2024-04-20 +46,Desk Cable Tray,Furniture,24.99,2,tina@mail.nl,2024-04-21 +47, noise machine ,,39.99,1,uwe@business.de,2024-04-22 +48,LAPTOP RISER,Furniture,55.99,1,vera@mail.nl,2024-04-23 +49, usb MICROPHONE ,Electronics,79.99,1,wendy@startup.io,2024-04-24 +50,Whiteboard 
Small,Furniture,29.99,2,xander@corp.com,2024-04-25 +51, POWER bank ,Electronics,49.99,3,yara@university.edu,2024-04-26 +52,Desk Mat Large,Furniture,34.99,1,zach@email.com,2024-04-27 +53,Monitor Light Bar,Electronics,59.99,1,alice@example.com,2024-04-28 +54, FILE organizer ,Furniture,13.99,4,bob@company.com,2024-04-29 +55,Webcam Tripod,Accessories,21.99,2,charlie@work.org,2024-04-30 +56, SURGE protector ,Electronics,-19.99,3,dave@email.com,2024-05-01 +57,Keyboard Wrist Rest,Furniture,17.99,2,eve@startup.io,2024-05-02 +58, hdmi SPLITTER ,Electronics,22.99,2,frank@corp.com,2024-05-03 +59,Desk Fan USB,Electronics,15.99,3,temp_worker@messycorp.com,2024-05-04 +60, bookend SET ,Furniture,19.99,2,henry@business.com,2024-05-05 +61,Wireless Mouse Pad,Electronics,42.99,1,ivan@email.com,2024-05-06 +62,,,29.99,2,jenny@work.org,2024-05-07 +63, LAPTOP sleeve ,Accessories,27.99,2,karl@startup.io,2024-05-08 +64,Smart Power Strip,Electronics,39.99,1,lena@mail.nl,2024-05-09 +65, monitor CLEANING kit ,Accessories,11.99,5,marco@business.de,2024-05-10 +66,Desk Drawer Unit,Furniture,129.99,1,nina@university.edu,2024-05-11 +67, PHONE stand ,Accessories,14.99,3,oliver@corp.com,2024-05-12 +68,USB Docking Hub,Electronics,89.99,1,paula@startup.io,2024-05-13 +69, cable MANAGEMENT box ,Furniture,21.99,2,quinn@work.org,2024-05-14 +70,LED Desk Lamp,Furniture,44.99,2,rachel@email.com,2024-05-15 +71,Portable Charger,Electronics,34.99,0,simon@company.com,2024-05-16 +72, OFFICE whiteboard ,Furniture,79.99,1,tina@mail.nl,2024-05-17 +73,Mouse Bungee,,16.99,2,uwe@business.de,2024-05-18 +74, laptop DOCKING station ,Electronics,189.99,1,vera@mail.nl,2024-05-19 +75,Paper Tray,Furniture,9.99,4,,2024-05-20 +76, BLUETOOTH adapter ,Electronics,12.99,3,wendy@startup.io,2024-05-21 +77,Monitor Privacy Screen,Accessories,49.99,1,xander@corp.com,2024-05-22 +78, document HOLDER ,Furniture,23.99,2,yara@university.edu,2024-05-23 +79,Portable SSD,Electronics,119.99,1,zach@email.com,2024-05-24 +80, DESK calendar 
,Furniture,8.99,5,alice@example.com,2024-05-25 +81,Webcam HD Pro,Electronics,89.99,1,bob@company.com,2024-05-26 +82, pen HOLDER ,Furniture,6.99,6,charlie@work.org,2024-05-27 +83,Gaming Mouse,Electronics,59.99,2,dave@email.com,2024-05-28 +16,Docking Station,Electronics,129.99,2,alice@example.com,2024-03-22 +84, WIRELESS earbuds ,Electronics,79.99,1,eve@startup.io,2024-05-29 +85,Desk Hutch,Furniture,199.99,1,frank@corp.com,2024-05-30 +86, usb FAN ,Electronics,11.99,3,grace@university.edu,2024-05-31 +87,Keyboard Cover,Accessories,9.99,4,freelancer@outlook.com,2024-06-01 +88, MINI projector ,Electronics,299.99,1,ivan@email.com,2024-06-02 +89,Foot Hammock,Furniture,18.99,2,jenny@work.org,2024-06-03 +90, screen CLEANING wipes ,Accessories,5.99,10,karl@startup.io,2024-06-04 +91,Portable Speaker,Electronics,44.99,1,lena@mail.nl,2024-06-05 +92,,Accessories,14.99,3,marco@business.de,2024-06-06 +93, LAPTOP lock ,Accessories,24.99,2,nina@university.edu,2024-06-07 +94,Desk Pad Leather,Furniture,64.99,1,oliver@corp.com,2024-06-08 +95, wireless HEADSET ,Electronics,149.99,1,paula@startup.io,2024-06-09 +96,Clip-on Desk Light,Furniture,27.99,3,quinn@work.org,2024-06-10 +97,USB Wall Charger,Electronics,18.99,4,intern2024@messycorp.com,2024-06-11 +98, DESK storage BOX ,Furniture,32.99,2,simon@company.com,2024-06-12 +99,Portable Monitor Stand,Furniture,44.99,0,tina@mail.nl,2024-06-13 +100, webcam LIGHT ,Electronics,22.99,2,uwe@business.de,2024-06-14 +101,Desk Shelf Riser,Furniture,37.99,1,vera@mail.nl,2024-06-15 +102, CABLE tester ,Electronics,29.99,1,wendy@startup.io,2024-06-16 +103,Under Desk Drawer,Furniture,41.99,1,xander@corp.com,2024-06-17 +53,Monitor Light Bar,Electronics,59.99,1,alice@example.com,2024-04-28 +104, SD card READER ,Electronics,14.99,3,yara@university.edu,2024-06-18 +105,Desktop Organizer Set,Furniture,54.99,1,zach@email.com,2024-06-19 +106, GAMING keyboard ,Electronics,129.99,1,alice@example.com,2024-06-20 +107,Laptop Stand 
Pro,Furniture,4999.99,1,bob@company.com,2024-06-21 +108, usb LIGHT strip ,,16.99,3,charlie@work.org,2024-06-22 +109,Desk Clock Digital,Furniture,19.99,2,dave@email.com,2024-06-23 +110, WIRELESS trackball ,Electronics,64.99,1,eve@startup.io,2024-06-24 +111,Office Partition,Furniture,2499.00,1,frank@corp.com,2024-06-25 +112, laptop SCREEN cleaner ,Accessories,7.99,5,grace@university.edu,2024-06-26 +113,Smart Plug,Electronics,19.99,4,henry@business.com,2024-06-27 +114, DESK lamp WIRELESS ,Furniture,69.99,1,ivan@email.com,2024-06-28 +115,Monitor Arm Dual,Furniture,179.99,1,jenny@work.org,2024-06-29 +116, PHONE charger ,Electronics,12.99,5, ,2024-06-30 +117,Keyboard Cleaner,Accessories,8.99,3,lena@mail.nl,2024-04-22 +118, desk TIDY caddy ,Furniture,15.99,2,marco@business.de,2024-13-01 +119,USB Hub Powered,Electronics,34.99,2,nina@university.edu, +120, WEBCAM stand ,Accessories,18.99,1,oliver@corp.com,2024-05-15 diff --git a/src/ingest.py b/src/ingest.py new file mode 100644 index 0000000..2b250b5 --- /dev/null +++ b/src/ingest.py @@ -0,0 +1,68 @@ +import logging +from pathlib import Path +import os +from dotenv import load_dotenv +from azure.identity import DefaultAzureCredential +from azure.storage.blob import BlobServiceClient + + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +load_dotenv(PROJECT_ROOT / ".env") +load_dotenv() + +FILES = ["messy_sales.csv", "messy_customers.csv"] + + +def download_inputs(data_dir: Path) -> None: + account_url = os.getenv("ACCOUNT_URL") + source_container = os.getenv("SOURCE_CONTAINER") + if not account_url or not source_container: + raise RuntimeError( + "Please set ACCOUNT_URL and SOURCE_CONTAINER environment variables before running." 
+ ) + credential = DefaultAzureCredential() + service = BlobServiceClient(account_url=account_url, credential=credential) + container = service.get_container_client(source_container) + if not container.exists(): + logging.info(f"Container '{source_container}' not found.") + return + data_dir.mkdir(parents=True, exist_ok=True) + for name in FILES: + blob = container.get_blob_client(name) + with open(data_dir / name, "wb") as f: + f.write(blob.download_blob().readall()) + logging.info("Downloaded %s", name) + + +"""def load_inputs_local(data_dir: Path) -> None: + #Because of the Fallback i implented this function to load data locally + data_dir.mkdir(exist_ok=True) + sample_data_dir = Path(__file__).resolve().parent.parent / "sample_data" + + for name in FILES: + source_file = sample_data_dir / name + destination_file = data_dir / name + if source_file.exists(): + shutil.copy(source_file, destination_file) + logging.info("Successfully loaded %s from local sample_data", name) + else: + logging.error("File %s not found in sample_data!", name)""" + + +def upload_outputs(output_dir: Path, github_username: str) -> None: + """Task 7 (extra credit): Upload Parquet outputs to Azure and verify the round-trip.""" + # container_name = f"week4-{github_username}" + + # EXTRA CREDIT — implement this after Tasks 2–6 are working. + # TODO: Create a BlobServiceClient using DefaultAzureCredential and ACCOUNT_URL. + # TODO: Get (or create) the container named container_name. + # TODO: Upload every .parquet file in output_dir to the container. + # TODO: Download customer_summary.parquet back and assert its row count matches the local file. + # TODO: Log the container name and number of files uploaded. 
+ pass + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + LOCAL_DATA_DIR = Path(__file__).resolve().parent / "data" + download_inputs(LOCAL_DATA_DIR) diff --git a/src/pipeline.py b/src/pipeline.py index 1cd17a4..2d62cff 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -8,23 +8,31 @@ Replace every `raise NotImplementedError` below with a real implementation. """ +# python -m src.pipeline +import os import logging from pathlib import Path +from src.ingest import download_inputs +from src.clean import load_and_explore, clean_sales +from src.transform import join_customers +from src.report import build_reports, write_outputs + +GITHUB_USERNAME = "mareh-aboghanem" +DATA_DIR = Path("data") +# OUTPUT_DIR = Path("output") logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") logger = logging.getLogger(__name__) def get_config() -> dict: - """ - Return configuration read from environment variables. + api_key = os.getenv("API_KEY") + if not api_key: + raise RuntimeError("API_KEY environment is required but its value is missing.") - Required variable: API_KEY - Optional variable: OUTPUT_DIR (default "output") + output_dirctroy = os.getenv("OUTPUT_DIR", "output") - Raise RuntimeError with a clear message if a required variable is missing. - """ - raise NotImplementedError("Task 5: read API_KEY and OUTPUT_DIR from the environment") + return {"api_key": api_key, "output_dir": output_dirctroy} def fetch_data(api_key: str) -> list[dict]: @@ -34,7 +42,16 @@ def fetch_data(api_key: str) -> list[dict]: Return a list of at least one dict representing a record. In a real pipeline you would call requests.get(...) here. 
""" - raise NotImplementedError("Task 1: return at least one sample record") + mock_record = { + "transaction_id": 999, + "product_name": "Mock Test Item", + "category": "Testing", + "price": 99.99, + "quantity": 1, + "customer_email": "test_user@example.com", + "date": "2026-06-03", + } + return [mock_record] def save_results(records: list[dict], output_dir: Path) -> None: @@ -44,16 +61,28 @@ def save_results(records: list[dict], output_dir: Path) -> None: Create output_dir if it does not exist. Log the number of records written. """ - raise NotImplementedError("Task 1: write records to output_dir/results.txt") + output_dir.mkdir(parents=True, exist_ok=True) + with open(output_dir / "results.txt", "w") as f: + for record in records: + f.write(f"{record}\n") + logging.info("Saved %d records to %s", len(records), output_dir / "results.txt") +# 5 def run() -> None: config = get_config() logger.info("starting pipeline") records = fetch_data(config["api_key"]) output_dir = Path(config["output_dir"]) save_results(records, output_dir) - logger.info("pipeline complete") + download_inputs(DATA_DIR) + sales_raw, customers_raw = load_and_explore(DATA_DIR) + sales_clean = clean_sales(sales_raw) + enriched = join_customers(sales_clean, customers_raw) + reports = build_reports(enriched) + write_outputs(reports, output_dir) + # upload_outputs(OUTPUT_DIR, GITHUB_USERNAME) + logging.info("Pipeline complete.") if __name__ == "__main__": diff --git a/src/report.py b/src/report.py new file mode 100644 index 0000000..646260c --- /dev/null +++ b/src/report.py @@ -0,0 +1,57 @@ +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from pathlib import Path +import pandas as pd + + +def build_reports(enriched: pd.DataFrame) -> dict[str, pd.DataFrame]: + week = enriched["date"].dt.isocalendar().week + enriched["week"] = week + weekly_revenue = enriched.groupby(["week", "region"], as_index=False).agg( + total_revenue=("revenue", "sum"), order_count=("transaction_id", 
def join_customers(sales: pd.DataFrame, customers: pd.DataFrame) -> pd.DataFrame:
    """Task 4: Normalize join keys, merge, and add derived revenue columns.

    The ``customer_email`` key of both frames is lower-cased and stripped of
    surrounding whitespace before an inner join, so sales rows without a
    matching customer are dropped. Normalization is done on copies; the
    caller's ``sales`` and ``customers`` frames are left unmodified.

    Args:
        sales: Sales rows; must contain ``customer_email``, ``price`` and
            ``quantity`` columns.
        customers: Customer rows; must contain a ``customer_email`` column.

    Returns:
        The merged frame with two extra columns: ``revenue``
        (``price * quantity``) and ``is_high_value`` (``revenue >= 150``).
    """

    def _with_normalized_email(frame: pd.DataFrame) -> pd.DataFrame:
        # assign() returns a new frame, so the input is never written to.
        return frame.assign(
            customer_email=frame["customer_email"].str.lower().str.strip()
        )

    merged = _with_normalized_email(sales).merge(
        _with_normalized_email(customers), on="customer_email", how="inner"
    )
    merged["revenue"] = merged["price"] * merged["quantity"]
    merged["is_high_value"] = merged["revenue"] >= 150
    # TODO: (Optional hands-on) Try a left join instead and inspect rows where customer_name is NaN.
    logging.info("Joining complete. Rows in merged DataFrame: %d", len(merged))
    return merged