Skip to content

Commit 3641098

Browse files
committed
refactor: live AI analysis on every poll cycle
- AI is now called on every poll (every 2 min by default) with fresh CF events + recent application logs, behaving like a sysadmin watching the deploy in real time
- Verdicts changed from WAIT/CANCEL to CONTINUE/CANCEL; AI unreachable defaults to CONTINUE (safe) instead of WAIT
- hang_threshold_minutes is now a hint to the AI rather than a hard trigger
- auto_cancel=false posts an advisory commit comment but keeps monitoring
- Three-step structure (Monitor → Analyse → Cancel) collapsed into one loop
- Log group name resolved once at startup rather than every AI retry
1 parent 5b29f26 commit 3641098

1 file changed

Lines changed: 94 additions & 175 deletions

File tree

.github/workflows/reusable-cdk-deploy-monitor.yml

Lines changed: 94 additions & 175 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@ on:
1717
required: false
1818
default: ""
1919
hang_threshold_minutes:
20-
description: "Minutes with no new CloudFormation events before asking AI (keep under 40 with defaults: up to 3x this value is spent in AI retries within the 120-minute job timeout)"
20+
description: "Minutes with no new CloudFormation events considered suspicious; passed to the AI as context"
2121
type: number
2222
required: false
2323
default: 10
2424
poll_interval_seconds:
25-
description: "How often to poll CloudFormation events (seconds)"
25+
description: "How often to poll and run AI analysis (seconds)"
2626
type: number
2727
required: false
2828
default: 120
@@ -58,7 +58,7 @@ jobs:
5858
role-to-assume: ${{ secrets.aws_role_arn }}
5959
aws-region: ${{ inputs.aws_region }}
6060

61-
- name: Monitor stack
61+
- name: Monitor and analyse
6262
env:
6363
STACK_NAME: ${{ inputs.stack_name }}
6464
AWS_REGION: ${{ inputs.aws_region }}
@@ -72,9 +72,6 @@ jobs:
7272
run: |
7373
set -euo pipefail
7474
75-
HANG_THRESHOLD_SECONDS=$(( HANG_THRESHOLD_MINUTES * 60 ))
76-
LAST_EVENT_TIME=$(date -u +%s)
77-
7875
TERMINAL_STATUSES="CREATE_COMPLETE UPDATE_COMPLETE DELETE_COMPLETE \
7976
CREATE_FAILED UPDATE_FAILED DELETE_FAILED \
8077
ROLLBACK_COMPLETE UPDATE_ROLLBACK_COMPLETE \
@@ -87,12 +84,28 @@ jobs:
8784
}
8885
8986
echo "Monitoring stack: $STACK_NAME"
90-
echo "Hang threshold: ${HANG_THRESHOLD_MINUTES}m (${HANG_THRESHOLD_SECONDS}s)"
9187
echo "Poll interval: ${POLL_INTERVAL_SECONDS}s"
88+
echo "Hang hint for AI: ${HANG_THRESHOLD_MINUTES}m with no CF events"
89+
90+
# Resolve log group name once (prefix → actual name)
91+
RESOLVED_LOG_GROUP=""
92+
if [[ -n "$LOG_GROUP_NAME" ]]; then
93+
RESOLVED_LOG_GROUP=$(aws logs describe-log-groups \
94+
--log-group-name-prefix "$LOG_GROUP_NAME" \
95+
--region "$AWS_REGION" \
96+
--query 'logGroups[0].logGroupName' \
97+
--output text 2>/dev/null || echo "")
98+
if [[ -z "$RESOLVED_LOG_GROUP" || "$RESOLVED_LOG_GROUP" == "None" ]]; then
99+
echo "Log group '$LOG_GROUP_NAME' not found -- log fetching disabled."
100+
RESOLVED_LOG_GROUP=""
101+
else
102+
echo "Log group: $RESOLVED_LOG_GROUP"
103+
fi
104+
fi
92105
93106
# If the stack is already in a terminal state when the monitor starts,
94107
# wait up to 5 minutes for the CDK deploy to begin its update before
95-
# entering the hang-detection loop. This avoids premature exit when the
108+
# entering the monitoring loop. This avoids premature exit when the
96109
# monitor job starts before CDK has called CloudFormation.
97110
INITIAL_STATUS=$(aws cloudformation describe-stacks \
98111
--stack-name "$STACK_NAME" \
@@ -114,7 +127,6 @@ jobs:
114127
--output text 2>/dev/null || echo "STACK_NOT_FOUND")
115128
if ! is_terminal "$INITIAL_STATUS" && [[ "$INITIAL_STATUS" != "STACK_NOT_FOUND" ]]; then
116129
echo "Stack entered: $INITIAL_STATUS -- starting monitoring."
117-
LAST_EVENT_TIME=$(date -u +%s)
118130
break
119131
fi
120132
if [[ $(date -u +%s) -ge "$STARTUP_DEADLINE" ]]; then
@@ -134,204 +146,111 @@ jobs:
134146
135147
echo "[$(date -u +%H:%M:%S)] Stack status: $STACK_STATUS"
136148
137-
# Exit if stack reached a terminal state
138-
for STATUS in $TERMINAL_STATUSES; do
139-
if [[ "$STACK_STATUS" == "$STATUS" ]]; then
140-
echo "Stack reached terminal state: $STACK_STATUS -- monitor exiting."
141-
exit 0
142-
fi
143-
done
144-
145149
if [[ "$STACK_STATUS" == "STACK_NOT_FOUND" ]]; then
146-
echo "Stack not found -- may have been deleted or never existed. Exiting."
150+
echo "Stack not found -- exiting."
147151
exit 0
148152
fi
149153
150-
# Fetch latest CloudFormation event timestamp
151-
LATEST_EVENT_TS=$(aws cloudformation describe-stack-events \
152-
--stack-name "$STACK_NAME" \
153-
--region "$AWS_REGION" \
154-
--query 'StackEvents[0].Timestamp' \
155-
--output text 2>/dev/null || echo "")
156-
157-
if [[ -n "$LATEST_EVENT_TS" && "$LATEST_EVENT_TS" != "None" ]]; then
158-
# Convert ISO8601 timestamp to epoch (supports both Linux and macOS date)
159-
LATEST_EPOCH=$(date -u -d "$LATEST_EVENT_TS" +%s 2>/dev/null \
160-
|| date -u -j -f "%Y-%m-%dT%H:%M:%S" "${LATEST_EVENT_TS%%.*}" +%s 2>/dev/null \
161-
|| echo "$LAST_EVENT_TIME")
162-
163-
if [[ "$LATEST_EPOCH" -gt "$LAST_EVENT_TIME" ]]; then
164-
echo "New event at $LATEST_EVENT_TS -- resetting hang timer."
165-
LAST_EVENT_TIME=$LATEST_EPOCH
166-
fi
167-
fi
168-
169-
# Check if hang threshold exceeded
170-
NOW=$(date -u +%s)
171-
SECONDS_SINCE_LAST_EVENT=$(( NOW - LAST_EVENT_TIME ))
172-
173-
if [[ "$SECONDS_SINCE_LAST_EVENT" -ge "$HANG_THRESHOLD_SECONDS" ]]; then
174-
echo "No new events for ${SECONDS_SINCE_LAST_EVENT}s (threshold: ${HANG_THRESHOLD_SECONDS}s). Triggering AI analysis."
175-
echo "Last event timestamp: ${LATEST_EVENT_TS:-unknown}"
176-
echo "AI_TRIGGER" > /tmp/ai_trigger
154+
if is_terminal "$STACK_STATUS"; then
155+
echo "Stack reached terminal state: $STACK_STATUS -- monitor exiting."
177156
exit 0
178157
fi
179158
180-
sleep "$POLL_INTERVAL_SECONDS"
181-
done
182-
183-
- name: Analyse hang with AI and act
184-
if: always() && !cancelled()
185-
env:
186-
STACK_NAME: ${{ inputs.stack_name }}
187-
AWS_REGION: ${{ inputs.aws_region }}
188-
LOG_GROUP_NAME: ${{ inputs.log_group_name }}
189-
HANG_THRESHOLD_MINUTES: ${{ inputs.hang_threshold_minutes }}
190-
GH_TOKEN: ${{ github.token }}
191-
REPO: ${{ github.repository }}
192-
SHA: ${{ github.sha }}
193-
run: |
194-
set -euo pipefail
195-
196-
if [[ ! -f /tmp/ai_trigger ]]; then
197-
echo "No hang detected -- skipping AI analysis."
198-
exit 0
199-
fi
200-
201-
MAX_AI_RETRIES=3
202-
AI_RETRIES=0
203-
204-
HANG_DETECTED_AT=$(date -u +%s)
205-
while [[ "$AI_RETRIES" -lt "$MAX_AI_RETRIES" ]]; do
206-
echo "AI analysis attempt $((AI_RETRIES + 1)) of $MAX_AI_RETRIES"
207-
208-
# Fetch last 50 CloudFormation events
159+
# Fetch recent CloudFormation events
209160
CF_EVENTS=$(aws cloudformation describe-stack-events \
210161
--stack-name "$STACK_NAME" \
211162
--region "$AWS_REGION" \
212-
--max-items 50 \
163+
--max-items 20 \
213164
--query 'StackEvents[*].{Time:Timestamp,Status:ResourceStatus,Resource:LogicalResourceId,Reason:ResourceStatusReason}' \
214165
--output json 2>/dev/null || echo "[]")
215166
216-
# Fetch CloudWatch logs if log_group_name is set
167+
# Calculate time since last CF event
168+
LATEST_EVENT_TS=$(echo "$CF_EVENTS" | jq -r '.[0].Time // empty' 2>/dev/null || echo "")
169+
MINUTES_SINCE_EVENT=0
170+
if [[ -n "$LATEST_EVENT_TS" ]]; then
171+
LATEST_EPOCH=$(date -u -d "$LATEST_EVENT_TS" +%s 2>/dev/null \
172+
|| date -u -j -f "%Y-%m-%dT%H:%M:%S" "${LATEST_EVENT_TS%%.*}" +%s 2>/dev/null \
173+
|| echo "0")
174+
MINUTES_SINCE_EVENT=$(( ( $(date -u +%s) - LATEST_EPOCH ) / 60 ))
175+
fi
176+
177+
# Fetch recent application logs (last 2x poll interval)
217178
CW_LOGS=""
218-
if [[ -n "$LOG_GROUP_NAME" ]]; then
219-
RESOLVED_LOG_GROUP=$(aws logs describe-log-groups \
220-
--log-group-name-prefix "$LOG_GROUP_NAME" \
179+
if [[ -n "$RESOLVED_LOG_GROUP" ]]; then
180+
CW_LOGS=$(aws logs filter-log-events \
181+
--log-group-name "$RESOLVED_LOG_GROUP" \
221182
--region "$AWS_REGION" \
222-
--query 'logGroups[0].logGroupName' \
223-
--output text 2>/dev/null || echo "")
224-
225-
if [[ -n "$RESOLVED_LOG_GROUP" && "$RESOLVED_LOG_GROUP" != "None" ]]; then
226-
echo "Fetching logs from: $RESOLVED_LOG_GROUP"
227-
CW_LOGS=$(aws logs filter-log-events \
228-
--log-group-name "$RESOLVED_LOG_GROUP" \
229-
--region "$AWS_REGION" \
230-
--start-time "$(( ($(date -u +%s) - 1800) * 1000 ))" \
231-
--query 'events[*].message' \
232-
--output text 2>/dev/null \
233-
| tail -200 || echo "")
234-
else
235-
echo "Log group '$LOG_GROUP_NAME' not found -- skipping log fetch."
236-
fi
183+
--start-time "$(( ($(date -u +%s) - POLL_INTERVAL_SECONDS * 2) * 1000 ))" \
184+
--query 'events[*].message' \
185+
--output text 2>/dev/null | tail -50 || echo "")
237186
fi
238187
239-
# Build prompt
240-
MINUTES_ELAPSED=$(( ($(date -u +%s) - HANG_DETECTED_AT) / 60 ))
241-
# Ensure at least 1 minute is reported (first call)
242-
[[ "$MINUTES_ELAPSED" -lt 1 ]] && MINUTES_ELAPSED=1
243-
CF_EVENTS_TRIMMED=$(echo "$CF_EVENTS" | head -c 8000)
244-
PROMPT="Stack name: $STACK_NAME"$'\n'"Region: $AWS_REGION"$'\n'"Minutes since last CloudFormation event: $MINUTES_ELAPSED (attempt $((AI_RETRIES + 1)) of $MAX_AI_RETRIES)"$'\n\n'"Recent CloudFormation events (newest first, JSON):"$'\n'"$CF_EVENTS_TRIMMED"
188+
# Build AI prompt
189+
CF_EVENTS_TRIMMED=$(echo "$CF_EVENTS" | head -c 4000)
190+
PROMPT="Stack: $STACK_NAME ($STACK_STATUS)"$'\n'
191+
PROMPT+="Minutes since last CloudFormation event: $MINUTES_SINCE_EVENT"$'\n'
192+
PROMPT+="Operator hang threshold: ${HANG_THRESHOLD_MINUTES} minutes with no CF events"$'\n\n'
193+
PROMPT+="Recent CloudFormation events (newest first):"$'\n'"$CF_EVENTS_TRIMMED"
245194
if [[ -n "$CW_LOGS" ]]; then
246-
CW_LOGS_TRIMMED=$(echo "$CW_LOGS" | head -c 4000)
247-
PROMPT="$PROMPT"$'\n\n'"Recent application logs (last 200 lines):"$'\n'"$CW_LOGS_TRIMMED"
195+
CW_LOGS_TRIMMED=$(echo "$CW_LOGS" | head -c 3000)
196+
PROMPT+=$'\n\n'"Recent application logs (last ~${POLL_INTERVAL_SECONDS}s):"$'\n'"$CW_LOGS_TRIMMED"
248197
fi
249198
250-
# Call GitHub Models API
199+
# Call AI
251200
REQUEST_BODY=$(jq -n \
252201
--arg prompt "$PROMPT" \
253-
'{model:"gpt-4o",messages:[{role:"system",content:"You are a senior DevOps engineer expert in AWS CloudFormation and ECS deployments. Be concise and decisive."},{role:"user",content:("Is this CloudFormation deployment stuck (will not progress without intervention) or still making progress?\n\n" + $prompt + "\n\nReply with exactly CANCEL or WAIT on the first line, followed by a 2-sentence explanation.")}],max_tokens:300}')
202+
'{model:"gpt-4o",messages:[
203+
{role:"system",content:"You are a senior DevOps engineer monitoring an AWS CloudFormation/ECS deployment in real time. You see the stack status, recent CloudFormation events, and recent application logs on every check. Be concise and decisive."},
204+
{role:"user",content:("Is this deployment progressing normally, or is it stuck/failing and needs to be cancelled?\n\n" + $prompt + "\n\nReply with CONTINUE or CANCEL on the first line, followed by one sentence explaining why.")}
205+
],max_tokens:150}')
254206
255207
RESPONSE=$(curl -sf \
256208
-H "Authorization: Bearer $GH_TOKEN" \
257209
-H "Content-Type: application/json" \
258210
"https://models.inference.ai.azure.com/chat/completions" \
259211
-d "$REQUEST_BODY" 2>/dev/null || echo '{}')
260212
261-
AI_TEXT=$(echo "$RESPONSE" | jq -r '.choices[0].message.content // "WAIT Could not reach AI model."' 2>/dev/null || echo "WAIT Could not parse AI response.")
262-
VERDICT=$(echo "$AI_TEXT" | head -1 | tr '[:lower:]' '[:upper:]' | grep -oE '^(CANCEL|WAIT)' || echo "WAIT")
263-
264-
echo "AI verdict: $VERDICT"
265-
echo "AI explanation: $AI_TEXT"
266-
267-
# Post commit comment
268-
COMMENT_BODY="### CDK Deploy Monitor"$'\n\n'"**Stack:** \`$STACK_NAME\`"$'\n'"**No new CloudFormation events for:** ${MINUTES_ELAPSED} minutes (check $((AI_RETRIES + 1)) of $MAX_AI_RETRIES)"$'\n\n'"**AI Verdict:** \`$VERDICT\`"$'\n\n'"${AI_TEXT}"$'\n\n'"---"$'\n'"*Posted by [reusable-cdk-deploy-monitor](https://github.com/geolonia/.github/blob/main/.github/workflows/reusable-cdk-deploy-monitor.yml)*"
213+
AI_TEXT=$(echo "$RESPONSE" | jq -r '.choices[0].message.content // ""' 2>/dev/null || echo "")
214+
if [[ -z "$AI_TEXT" ]]; then
215+
echo "AI unreachable -- defaulting to CONTINUE."
216+
sleep "$POLL_INTERVAL_SECONDS"
217+
continue
218+
fi
269219
270-
gh api \
271-
"repos/$REPO/commits/$SHA/comments" \
272-
-f body="$COMMENT_BODY" \
273-
2>/dev/null || echo "Warning: failed to post commit comment"
220+
VERDICT=$(echo "$AI_TEXT" | head -1 | tr '[:lower:]' '[:upper:]' | grep -oE '^(CONTINUE|CANCEL)' || echo "CONTINUE")
221+
echo "AI verdict: $VERDICT -- $AI_TEXT"
274222
275223
if [[ "$VERDICT" == "CANCEL" ]]; then
276-
echo "CANCEL" > /tmp/ai_verdict
277-
exit 0
224+
COMMENT_BODY="### CDK Deploy Monitor"$'\n\n'
225+
COMMENT_BODY+="**Stack:** \`$STACK_NAME\` | **Status:** \`$STACK_STATUS\`"$'\n'
226+
COMMENT_BODY+="**No new CF events for:** ${MINUTES_SINCE_EVENT} minutes"$'\n\n'
227+
COMMENT_BODY+="**AI Verdict:** \`CANCEL\`"$'\n\n'"${AI_TEXT}"$'\n\n'
228+
COMMENT_BODY+="---"$'\n'"*Posted by [reusable-cdk-deploy-monitor](https://github.com/geolonia/.github/blob/main/.github/workflows/reusable-cdk-deploy-monitor.yml)*"
229+
230+
gh api "repos/$REPO/commits/$SHA/comments" \
231+
-f body="$COMMENT_BODY" 2>/dev/null || echo "Warning: failed to post commit comment"
232+
233+
if [[ "$AUTO_CANCEL" == "true" ]]; then
234+
echo "Cancelling stack update: $STACK_NAME"
235+
if aws cloudformation cancel-update-stack \
236+
--stack-name "$STACK_NAME" \
237+
--region "$AWS_REGION" 2>/tmp/cancel_error; then
238+
echo "Stack update cancelled. CloudFormation is rolling back."
239+
gh api "repos/$REPO/commits/$SHA/comments" \
240+
-f body="### CDK Deploy Monitor -- Update Cancelled"$'\n\n'"Stack \`$STACK_NAME\` update was cancelled (AI verdict: CANCEL, auto_cancel: true)."$'\n'"CloudFormation is rolling back. The deploy job will fail with a rollback error -- this is expected." \
241+
2>/dev/null || true
242+
else
243+
CANCEL_ERR=$(cat /tmp/cancel_error 2>/dev/null || echo "unknown error")
244+
echo "cancel-update-stack failed: $CANCEL_ERR"
245+
gh api "repos/$REPO/commits/$SHA/comments" \
246+
-f body="### CDK Deploy Monitor -- Cancel Attempt Failed"$'\n\n'"Tried to cancel \`$STACK_NAME\` but got: $CANCEL_ERR"$'\n\n'"The stack may have already completed or been cancelled manually." \
247+
2>/dev/null || true
248+
fi
249+
exit 0
250+
else
251+
echo "auto_cancel=false -- advisory comment posted, continuing to monitor."
252+
fi
278253
fi
279254
280-
AI_RETRIES=$(( AI_RETRIES + 1 ))
281-
if [[ "$AI_RETRIES" -lt "$MAX_AI_RETRIES" ]]; then
282-
echo "AI said WAIT -- sleeping ${HANG_THRESHOLD_MINUTES}m before retry..."
283-
sleep $(( HANG_THRESHOLD_MINUTES * 60 ))
284-
else
285-
echo "Max AI retries reached -- treating as CANCEL."
286-
echo "CANCEL" > /tmp/ai_verdict
287-
fi
255+
sleep "$POLL_INTERVAL_SECONDS"
288256
done
289-
290-
- name: Cancel stack update (if verdict is CANCEL and auto_cancel is true)
291-
if: always() && !cancelled()
292-
env:
293-
STACK_NAME: ${{ inputs.stack_name }}
294-
AWS_REGION: ${{ inputs.aws_region }}
295-
AUTO_CANCEL: ${{ inputs.auto_cancel }}
296-
GH_TOKEN: ${{ github.token }}
297-
REPO: ${{ github.repository }}
298-
SHA: ${{ github.sha }}
299-
run: |
300-
set -euo pipefail
301-
302-
if [[ ! -f /tmp/ai_verdict ]]; then
303-
echo "No AI verdict -- nothing to cancel."
304-
exit 0
305-
fi
306-
307-
VERDICT=$(cat /tmp/ai_verdict)
308-
309-
if [[ "$VERDICT" != "CANCEL" ]]; then
310-
echo "AI verdict is $VERDICT -- skipping cancellation."
311-
exit 0
312-
fi
313-
314-
if [[ "$AUTO_CANCEL" != "true" ]]; then
315-
echo "auto_cancel=false -- advisory comment posted, not cancelling stack."
316-
exit 0
317-
fi
318-
319-
echo "Cancelling stack update: $STACK_NAME"
320-
if aws cloudformation cancel-update-stack \
321-
--stack-name "$STACK_NAME" \
322-
--region "$AWS_REGION" 2>/tmp/cancel_error; then
323-
echo "Stack update cancelled. CloudFormation is rolling back."
324-
CANCEL_COMMENT="### CDK Deploy Monitor -- Update Cancelled"$'\n\n'"Stack \`$STACK_NAME\` update was cancelled (AI verdict: CANCEL, auto_cancel: true)."$'\n'"CloudFormation is rolling back. The deploy job will fail with a rollback error -- this is expected."
325-
gh api \
326-
"repos/$REPO/commits/$SHA/comments" \
327-
-f body="$CANCEL_COMMENT" \
328-
2>/dev/null || echo "Warning: failed to post cancel comment"
329-
else
330-
CANCEL_ERR=$(cat /tmp/cancel_error 2>/dev/null || echo "unknown error")
331-
echo "cancel-update-stack failed: $CANCEL_ERR"
332-
CANCEL_COMMENT="### CDK Deploy Monitor -- Cancel Attempt Failed"$'\n\n'"Tried to cancel \`$STACK_NAME\` but got an error:"$'\n'"$CANCEL_ERR"$'\n\n'"The stack may have already completed or been cancelled manually."
333-
gh api \
334-
"repos/$REPO/commits/$SHA/comments" \
335-
-f body="$CANCEL_COMMENT" \
336-
2>/dev/null || echo "Warning: failed to post error comment"
337-
fi

0 commit comments

Comments
 (0)