记录日常工作关于系统运维,虚拟化云计算,数据库,网络安全等各方面问题。

MySQL8表中有大量重复数据去重后保存到新表方案


一,找deepseek来解决

MySQL8 数据库中有两张表: 
 
 表 mat_transport_license,创建表SQL如下:


CREATE TABLE `mat_transport_license` (
  `ID` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '主键',
  `PERM_ID` int DEFAULT NULL COMMENT '微机编号',
  `PRINT_ID` varchar(8) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '纸张号',
  `ORDER_ID` varchar(35) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '凭证编号',
  UNIQUE KEY `PERM_ID` (`PERM_ID`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='准运证';




 表  mat_transport_license_detail ,创建表SQL如下:


CREATE TABLE `mat_transport_license_detail` (
  `ZYB_ID` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '主键',
  `PERM_ID` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  `ORDER_ID` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  `SPEC` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC ;




现需要根据表 mat_transport_license 中的 PERM_ID字段中的数据匹配 mat_transport_license_detail中 PERM_ID字段中的数据,
但 mat_transport_license_detail中有大量重复数据,需要根据 两表 PERM_ID字段关联数据,提取  mat_transport_license_detail中 MAX(ZYB_ID)行的数据,插入到新表 mat_transport_license_detail_new 。


因为表有大量数据约5千万条, 需要使用存储过程,分批执行,请给出优化方案:


二,DeeSeek 多次回复后提到的最终结果


错误原因很明确:字符集排序规则(collation)冲突

  • 明细表 mat_transport_license_detail 的字段使用 utf8mb4_general_ci

  • CAST(lic.PERM_ID AS CHAR) 没有指定排序规则,会继承数据库或会话的默认排序规则(MySQL 8.0 默认是 utf8mb4_0900_ai_ci

  • 两者在 = 比较时混用,导致 1267 错误。

解决方法:在转换时显式指定与明细表相同的排序规则,使用 COLLATE utf8mb4_general_ci


版本一(窗口函数版,已修正排序规则)

sql
DELIMITER $$

DROP PROCEDURE IF EXISTS proc_batch_max_zyb$$

CREATE PROCEDURE proc_batch_max_zyb(IN p_batch_count INT)
BEGIN
    DECLARE v_min_id INT;
    DECLARE v_max_id INT;
    DECLARE v_start INT DEFAULT 0;
    DECLARE v_end INT;
    DECLARE v_batch_rows INT;
    DECLARE v_total_rows INT DEFAULT 0;
    DECLARE v_done INT DEFAULT 0;

    SELECT MIN(PERM_ID), MAX(PERM_ID) INTO v_min_id, v_max_id
    FROM mat_transport_license WHERE PERM_ID IS NOT NULL;

    IF v_min_id IS NOT NULL THEN

        IF p_batch_count IS NULL OR p_batch_count <= 0 THEN
            SET p_batch_count = 5000;
        END IF;

        WHILE v_done = 0 DO
            SET v_end = v_start + p_batch_count;
            IF v_end > v_max_id THEN
                SET v_end = v_max_id;
                SET v_done = 1;
            END IF;

            START TRANSACTION;

            INSERT INTO mat_transport_license_detail_new
            SELECT
                ZYB_ID, PERM_ID, ORDER_ID, SERIAL, PRODUCT_ID,
                PRODUCT_NAME, MADE_IN, SPEC, UNIT, PRICE, QUANTITY, ZYB_TID
            FROM (
                SELECT
                    det.*,
                    ROW_NUMBER() OVER (
                        PARTITION BY det.PERM_ID ORDER BY det.ZYB_ID DESC
                    ) AS rn
                FROM mat_transport_license_detail det
                WHERE det.PERM_ID IN (
                    SELECT CAST(lic.PERM_ID AS CHAR CHARACTER SET utf8mb4) 
                           COLLATE utf8mb4_general_ci
                    FROM mat_transport_license lic
                    WHERE lic.PERM_ID > v_start AND lic.PERM_ID <= v_end
                )
            ) t
            WHERE t.rn = 1;

            SET v_batch_rows = ROW_COUNT();
            SET v_total_rows = v_total_rows + v_batch_rows;
            COMMIT;

            SELECT CONCAT('Batch [', v_start+1, ' ~ ', v_end, '] inserted ', 
                           v_batch_rows, ' rows, total: ', v_total_rows) AS progress;

            SET v_start = v_end;
        END WHILE;

        SELECT CONCAT('All done. Total rows: ', v_total_rows) AS final_message;

    ELSE
        SELECT 'No data in mat_transport_license' AS msg;
    END IF;
END$$

DELIMITER ;

版本二(无窗口函数版,同样修正)

sql
DELIMITER $$

DROP PROCEDURE IF EXISTS proc_batch_max_zyb_no_window$$

CREATE PROCEDURE proc_batch_max_zyb_no_window(IN p_batch_count INT)
BEGIN
    DECLARE v_min_id INT;
    DECLARE v_max_id INT;
    DECLARE v_start INT DEFAULT 0;
    DECLARE v_end INT;
    DECLARE v_batch_rows INT;
    DECLARE v_total_rows INT DEFAULT 0;
    DECLARE v_done INT DEFAULT 0;

    SELECT MIN(PERM_ID), MAX(PERM_ID) INTO v_min_id, v_max_id
    FROM mat_transport_license WHERE PERM_ID IS NOT NULL;

    IF v_min_id IS NOT NULL THEN

        IF p_batch_count IS NULL OR p_batch_count <= 0 THEN
            SET p_batch_count = 5000;
        END IF;

        WHILE v_done = 0 DO
            SET v_end = v_start + p_batch_count;
            IF v_end > v_max_id THEN
                SET v_end = v_max_id;
                SET v_done = 1;
            END IF;

            START TRANSACTION;

            INSERT INTO mat_transport_license_detail_new
            SELECT d.*
            FROM mat_transport_license_detail d
            INNER JOIN (
                SELECT PERM_ID, MAX(ZYB_ID) AS max_zyb
                FROM mat_transport_license_detail
                WHERE PERM_ID IN (
                    SELECT CAST(lic.PERM_ID AS CHAR CHARACTER SET utf8mb4) 
                           COLLATE utf8mb4_general_ci
                    FROM mat_transport_license lic
                    WHERE lic.PERM_ID > v_start AND lic.PERM_ID <= v_end
                )
                GROUP BY PERM_ID
            ) t ON d.PERM_ID = t.PERM_ID AND d.ZYB_ID = t.max_zyb;

            SET v_batch_rows = ROW_COUNT();
            SET v_total_rows = v_total_rows + v_batch_rows;
            COMMIT;

            SELECT CONCAT('Batch [', v_start+1, ' ~ ', v_end, '] inserted ', 
                           v_batch_rows, ' rows, total: ', v_total_rows) AS progress;

            SET v_start = v_end;
        END WHILE;

        SELECT CONCAT('All done. Total rows: ', v_total_rows) AS final_message;

    ELSE
        SELECT 'No data in mat_transport_license' AS msg;
    END IF;
END$$

DELIMITER ;

执行步骤

  1. 确认索引和目标表已存在(如果还未执行):

    sql
    ALTER TABLE mat_transport_license_detail ADD INDEX idx_perm_zyb (PERM_ID, ZYB_ID);
    CREATE TABLE mat_transport_license_detail_new LIKE mat_transport_license_detail;
  2. 调用存储过程

    sql
    CALL proc_batch_max_zyb(5000);
  3. 校验、补索引(同前)。

这两个版本已将子查询中的字符串转换强制设为 utf8mb4_general_ci,与明细表字段完全一致,不会再报 1267 错误。




转载请标明出处【MySQL8表中有大量重复数据去重后保存到新表方案】。

《www.micoder.cc》 虚拟化云计算,系统运维,安全技术服务.

网站已经关闭评论