-
Notifications
You must be signed in to change notification settings - Fork 8
Hannah #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Hannah #3
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| { | ||
| "score": 0, | ||
| "pass": false, | ||
| "passingScore": 60, | ||
| "ai_assist_present": false | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,21 +8,46 @@ | |
| def load_and_explore(data_dir: Path) -> tuple[pd.DataFrame, pd.DataFrame]: | ||
| """Task 2: Load both CSV files and explore their contents before cleaning.""" | ||
| # TODO: Read messy_sales.csv and messy_customers.csv with pd.read_csv(). | ||
| sales_df = pd.read_csv(data_dir / "messy_sales.csv") | ||
| customers_df = pd.read_csv(data_dir / "messy_customers.csv") | ||
| # TODO: For each DataFrame call .info(), .describe(), .head(20), and .isna().sum(). | ||
| logging.info("Sales DataFrame info:\n%s", sales_df.info()) | ||
| logging.info("Sales DataFrame description:\n%s", sales_df.describe()) | ||
| logging.info("Sales DataFrame head:\n%s", sales_df.head(20)) | ||
| logging.info("Sales DataFrame null counts:\n%s", sales_df.isna().sum()) | ||
| logging.info("Customers DataFrame info:\n%s", customers_df.info()) | ||
| logging.info("Customers DataFrame description:\n%s", customers_df.describe()) | ||
| logging.info("Customers DataFrame head:\n%s", customers_df.head(20)) | ||
| logging.info("Customers DataFrame null counts:\n%s", customers_df.isna().sum()) | ||
| # TODO: Log what you discover (e.g. which columns have nulls, any suspicious values). | ||
| raise NotImplementedError("Task 2: implement load_and_explore") | ||
| logging.info("Sales DataFrame has %d rows and %d columns", sales_df.shape[0], sales_df.shape[1]) | ||
| logging.info("Customers DataFrame has %d rows and %d columns", customers_df.shape[0], customers_df.shape[1]) | ||
| return sales_df, customers_df | ||
|
|
||
|
|
||
| def clean_sales(sales: pd.DataFrame) -> pd.DataFrame: | ||
| """Task 3: Clean the sales DataFrame using vectorized Pandas operations.""" | ||
| # TODO: Normalize product_name with .str.strip().str.title(). | ||
| sales["product_name"] = sales["product_name"].str.strip().str.title() | ||
| # TODO: Normalize customer_email with .str.lower().str.strip(). | ||
| sales["customer_email"] = sales["customer_email"].str.lower().str.strip() | ||
| # TODO: Convert price to numeric with pd.to_numeric(errors="coerce"). | ||
| sales["price"] = pd.to_numeric(sales["price"], errors="coerce") | ||
| # TODO: Parse date with pd.to_datetime(errors="coerce"). | ||
| sales["date"] = pd.to_datetime(sales["date"], errors="coerce") | ||
| # TODO: Drop rows where product_name is missing. | ||
| sales = sales.dropna(subset=["product_name"]) | ||
| # TODO: Drop rows where price is negative. | ||
| sales = sales.dropna(subset=["price"]) | ||
|
|
||
| # TODO: Drop rows where quantity is zero. | ||
| sales = sales.dropna(subset=["quantity"]) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same thing here — "drop rows where quantity is zero" needs a boolean filter, not dropna(): dropna() only removes missing values, so the cleaned data still has quantity == 0 rows, which then inflate your order counts. Try:
sales = sales[sales["quantity"] != 0]
|
||
|
|
||
| # TODO: Drop rows where date is NaT (invalid after parsing). | ||
| sales = sales.dropna(subset=["date"]) | ||
| # TODO: Remove duplicate transactions: .drop_duplicates(subset="transaction_id", keep="first"). | ||
| sales = sales.drop_duplicates(subset="transaction_id", keep="first") | ||
| # TODO: Decide what to do with outlier prices (clip, flag, or leave) and add a comment explaining why. | ||
| raise NotImplementedError("Task 3: implement clean_sales") | ||
| # For simplicity, we'll leave outlier prices as they are, but in a real scenario, we might want to investigate them further or apply business rules to handle them. | ||
| return sales | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,10 +15,21 @@ | |
| def download_inputs(data_dir: Path) -> None: | ||
| """Task 1: Download input CSV files from Azure Blob Storage.""" | ||
| # TODO: Create a BlobServiceClient using DefaultAzureCredential and ACCOUNT_URL. | ||
| credential = DefaultAzureCredential() | ||
| blob_service_client = BlobServiceClient(account_url=ACCOUNT_URL, credential=credential) | ||
| # TODO: Get a container client for SOURCE_CONTAINER. | ||
| container_client = blob_service_client.get_container_client(SOURCE_CONTAINER) | ||
| # TODO: For each filename in FILES, download the blob and write it to data_dir/<filename>. | ||
| for filename in FILES: | ||
| Path("data").mkdir(exist_ok=True) | ||
| for name in FILES: | ||
| blob = container_client.get_blob_client(name) | ||
| with open(f"data/{name}", "wb") as f: | ||
| f.write(blob.download_blob().readall()) | ||
|
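There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's an indentation bug in the download loop. The with open(...) block sits outside the for loop, so only the last file (messy_customers.csv) gets written, while messy_sales.csv would be skipped. It also hardcodes "data/" instead of using the data_dir argument. Restructure into one loop that downloads, writes, and logs each file:
data_dir.mkdir(parents=True, exist_ok=True)
for name in FILES:
    blob = container_client.get_blob_client(name)
    with open(data_dir / name, "wb") as f:
        f.write(blob.download_blob().readall())
    logging.info("Downloaded %s", name)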
||
|
|
||
| # TODO: Log a message for each downloaded file. | ||
| raise NotImplementedError("Task 1: implement download_inputs") | ||
| for filename in FILES: | ||
| logging.info("Downloaded %s", name) | ||
|
|
||
|
|
||
| def upload_outputs(output_dir: Path, github_username: str) -> None: | ||
|
|
@@ -31,4 +42,4 @@ def upload_outputs(output_dir: Path, github_username: str) -> None: | |
| # TODO: Upload every .parquet file in output_dir to the container. | ||
| # TODO: Download customer_summary.parquet back and assert its row count matches the local file. | ||
| # TODO: Log the container name and number of files uploaded. | ||
| raise NotImplementedError("Task 7: implement upload_outputs") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Negative prices aren't being removed here. The task is "drop rows where price is negative," but dropna() only drops blank values. I ran your pipeline and the cleaned data still contains a price of -149.99. You want a boolean filter here:
sales = sales[sales["price"] >= 0]