Debian
如何調試 PostgreSQL 分段錯誤?
我有一個不斷崩潰的 PostgreSQL 13 實例:
LOG: server process (PID 10722) was terminated by signal 11: Segmentation fault DETAIL: Failed process was running: COMMIT LOG: terminating any other active server processes WARNING: terminating connection because of crash of another server process DETAIL: The postmaster has commanded this server process to roll back the current transaction and exit, because another server process exited abnormally and possibly corrupted shared memory.
我已更新
/etc/postgresql/13/main/pg_ctl.conf
以包含核心轉儲pg_ctl_options = '--core-files'
並重新啟動
postgresql
服務。現在它似乎允許核心轉儲:$ for f in `pgrep postgres`; do cat /proc/$f/limits | grep core; done Max core file size unlimited unlimited bytes
gdb
回溯給出以下輸出$ gdb /usr/lib/postgresql/13/bin/postgres 13/main/core.postgres.12264 Program terminated with signal SIGSEGV, Segmentation fault. #0 slot_deform_heap_tuple (natts=5, offp=0x557cc2e60720, tuple=<optimized out>, slot=0x557cc2e606d8) at ./build/../src/backend/executor/execTuples.c:930 930 ./build/../src/backend/executor/execTuples.c: No such file or directory. (gdb) bt #0 slot_deform_heap_tuple (natts=5, offp=0x557cc2e60720, tuple=<optimized out>, slot=0x557cc2e606d8) at ./build/../src/backend/executor/execTuples.c:930 #1 tts_buffer_heap_getsomeattrs (slot=0x557cc2e606d8, natts=5) at ./build/../src/backend/executor/execTuples.c:695 #2 0x0000557cc1d3998c in slot_getsomeattrs_int (slot=slot@entry=0x557cc2e606d8, attnum=5) at ./build/../src/backend/executor/execTuples.c:1912 #3 0x0000557cc1d28fba in slot_getsomeattrs (attnum=<optimized out>, slot=0x557cc2e606d8) at ./build/../src/include/executor/tuptable.h:344 #4 ExecInterpExpr (state=0x557cc2e620a8, econtext=0x557cc2ea1768, isnull=<optimized out>) at ./build/../src/backend/executor/execExprInterp.c:482 #5 0x0000557cc1d5548d in ExecEvalExprSwitchContext (isNull=0x7ffdd2599507, econtext=0x557cc2ea1768, state=0x557cc2e620a8) at ./build/../src/include/executor/executor.h:322 #6 ExecQual (econtext=0x557cc2ea1768, state=0x557cc2e620a8) at ./build/../src/include/executor/executor.h:391 #7 MJFillInner (node=0x557cc2ea1558) at ./build/../src/backend/executor/nodeMergejoin.c:494 #8 0x0000557cc1d55ce8 in ExecMergeJoin (pstate=0x557cc2ea1558) at ./build/../src/backend/executor/nodeMergejoin.c:1353 #9 0x0000557cc1d2cc83 in ExecProcNode (node=0x557cc2ea1558) at ./build/../src/include/executor/executor.h:248 #10 ExecutePlan (execute_once=<optimized out>, dest=0x557cc2e1a630, direction=<optimized out>, numberTuples=0, sendTuples=<optimized out>, operation=CMD_SELECT, use_parallel_mode=<optimized out>, planstate=0x557cc2ea1558, estate=0x557cc2ea12f8) at ./build/../src/backend/executor/execMain.c:1632 #11 standard_ExecutorRun (queryDesc=0x557cc2e1a5a0, direction=<optimized out>, count=0, execute_once=<optimized out>) at ./build/../src/backend/executor/execMain.c:350 #12 0x00007f0ec05ae09d in pgss_ExecutorRun (queryDesc=0x557cc2e1a5a0, direction=ForwardScanDirection, count=0, execute_once=<optimized out>) at ./build/../contrib/pg_stat_statements/pg_stat_statements.c:1045 #13 0x0000557cc1cdbcd4 in PersistHoldablePortal (portal=portal@entry=0x557cc2d44b78) at ./build/../src/backend/commands/portalcmds.c:407 #14 0x0000557cc1ff95f9 in HoldPortal (portal=portal@entry=0x557cc2d44b78) at ./build/../src/backend/utils/mmgr/portalmem.c:642 #15 0x0000557cc1ff9e7d in PreCommit_Portals (isPrepare=isPrepare@entry=false) at ./build/../src/backend/utils/mmgr/portalmem.c:738 #16 0x0000557cc1c001c4 in CommitTransaction () at ./build/../src/backend/access/transam/xact.c:2087 #17 0x0000557cc1c015d5 in CommitTransactionCommand () at ./build/../src/backend/access/transam/xact.c:3085 #18 0x0000557cc1ea211d in finish_xact_command () at ./build/../src/backend/tcop/postgres.c:2662 #19 0x0000557cc1ea4703 in exec_simple_query (query_string=0x557cc2c9cd28 "COMMIT") at ./build/../src/backend/tcop/postgres.c:1264 #20 0x0000557cc1ea6143 in PostgresMain (argc=<optimized out>, argv=argv@entry=0x557cc2cf6c68, dbname=<optimized out>, username=<optimized out>) at ./build/../src/backend/tcop/postgres.c:4339 #21 0x0000557cc1e25bcd in BackendRun (port=0x557cc2ce94d0, port=0x557cc2ce94d0) at ./build/../src/backend/postmaster/postmaster.c:4526 #22 BackendStartup (port=0x557cc2ce94d0) at ./build/../src/backend/postmaster/postmaster.c:4210 #23 ServerLoop () at ./build/../src/backend/postmaster/postmaster.c:1739 #24 0x0000557cc1e26b41 in PostmasterMain (argc=5, argv=<optimized out>) at ./build/../src/backend/postmaster/postmaster.c:1412 #25 0x0000557cc1b70f4f in main (argc=5, argv=0x557cc2c96c30) at ./build/../src/backend/main/main.c:210
添加
log_statement = 'all'
到/etc/postgresql/13/main/postgresql.conf
並沒有真正的幫助,因為postmaster
會立即終止所有程序並且查詢不會寫入日誌。這是
strace
執行後的輸出COMMIT
[pid 20006] pwrite64(29, "COMMIT", 6, 15936) = 6 [pid 20006] pwrite64(29, "\0", 1, 15942) = 1 [pid 20006] close(29) = 0 [pid 20006] --- SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0x10} --- [pid 20006] +++ killed by SIGSEGV (core dumped) +++ <... select resumed> ) = ? ERESTARTNOHAND (To be restarted if no handler) --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=20006, si_uid=108, si_status=SIGSEGV, si_utime=0, si_stime=0} --- wait4(-1, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV && WCOREDUMP(s)}], WNOHANG, NULL) = 20006 write(2, "2021-09-08 13:38:51.853 UTC [299"..., 198) = 198 write(2, "2021-09-08 13:38:51.853 UTC [299"..., 88) = 88 kill(19324, SIGQUIT) = 0 kill(-19324, SIGQUIT) = 0 kill(19331, SIGQUIT) = 0 kill(-19331, SIGQUIT) = 0 kill(19320, SIGQUIT) = 0 kill(-19320, SIGQUIT) = 0 kill(19319, SIGQUIT) = 0 kill(-19319, SIGQUIT) = 0 kill(19321, SIGQUIT) = 0 kill(-19321, SIGQUIT) = 0 kill(19322, SIGQUIT) = 0 kill(-19322, SIGQUIT) = 0 kill(19323, SIGQUIT) = 0 kill(-19323, SIGQUIT) = 0 wait4(-1, 0x7ffe90814374, WNOHANG, NULL) = 0 rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call) rt_sigprocmask(SIG_SETMASK, ~[ILL TRAP ABRT BUS FPE SEGV CONT SYS RTMIN RT_1], NULL, 8) = 0 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 select(7, [5 6], NULL, NULL, {tv_sec=5, tv_usec=0}) = ? ERESTARTNOHAND (To be restarted if no handler) --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=19320, si_uid=108, si_status=2, si_utime=14, si_stime=3} ---
有沒有辦法追溯執行的確切 SQL 查詢?
首先為您的發行版安裝調試符號,對於 Debian 發行版:
apt install gdb postgresql-13-dbgsym
跳轉到包含一些
queryDesc
變數的幀,例如12
:(gdb) frame 12 #12 0x00007f0ec05ae09d in pgss_ExecutorRun (queryDesc=0x557cc302b7d0, direction=ForwardScanDirection, count=0, execute_once=<optimized out>) at ./build/../contrib/pg_stat_statements/pg_stat_statements.c:1045 1045 in ./build/../contrib/pg_stat_statements/pg_stat_statements.c
列印該變數:
(gdb) p queryDesc $1 = (QueryDesc *) 0x557cc302b7d0
現在在等號之後複製上面的行並使用取消引用它
*
(gdb) p *(QueryDesc *) 0x557cc302b7d0 $6 = {operation = CMD_SELECT, plannedstmt = 0x557cc300e218, sourceText = 0x557cc302b370 "\n", ' ' <repeats 12 times>, "DECLARE \"categoryPagePhotoUrl_image_urls\" CURSOR WITH HOLD FOR\n", ' ' <repeats 12 times>, "SELECT di.itemId, image_number, filename FROM (SELECT *\n", ' ' <repeats 12 times>, "FROM downl"..., snapshot = 0x557cc2e9b188, crosscheck_snapshot = 0x0, dest = 0x557cc302b860, params = 0x0, queryEnv = 0x0, instrument_options = 0, tupDesc = 0x557cc2f7bff8, estate = 0x557cc2cf8d08, planstate = 0x557cc2cf8f68, already_executed = true, totaltime = 0x0}
它不會為您提供整個查詢,但至少可以了解執行查詢的表。
根據
gdb
輸出,我設法隔離了正在執行此類查詢的客戶端。我嘗試
VACUUM FULL
在受影響的表上執行,重建表和索引,切換到副本,使用pg_dump
. 儘管如此,該問題仍然存在於數據庫副本上。最後,我設法隔離了一個最小的 SQL 程式碼來複製問題。
$ pg_createcluster 13 main $ createdb testdb $ psql -d testdb -f postgresql-segfault.sql CREATE SCHEMA CREATE TABLE COPY 1 ALTER TABLE BEGIN CREATE TABLE DECLARE CURSOR itemid --------- 1190300 (1 row) psql:postgresql-segfault:34: server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. psql:postgresql-segfault:34: fatal: connection to server was lost
使用複制程式碼就足以向pgsql -bugs郵件列表報告錯誤(還有一個網路表單)。結果是重新執行計劃的錯誤,該計劃已經在 PostgreSQL 13.4、12.8(以及可能的其他版本)中包含的不穩定游標上完成。