Optimize cluster-related code

Yun
2025-09-18 09:56:24 +08:00
parent 0cff7af265
commit 44eeb8468d
5 changed files with 260 additions and 223 deletions
+235 -212
@@ -2,7 +2,6 @@ package timerx
import (
"context"
"encoding/json"
"errors"
"fmt"
"runtime/debug"
@@ -10,7 +9,7 @@ import (
"time"
"github.com/go-redis/redis/v8"
uuid "github.com/satori/go.uuid"
"github.com/google/uuid"
"github.com/yuninks/cachex"
"github.com/yuninks/lockx"
"github.com/yuninks/timerx/logger"
@@ -24,18 +23,12 @@ import (
// Deleting timers is not supported yet: this timer is designed to be global, so if one instance deleted a timer, the other services would not know about it
// Singleton pattern
// var clusterOnceLimit sync.Once
// Registered task list
var clusterWorkerList sync.Map
type Cluster struct {
ctx context.Context
redis redis.UniversalClient
cache *cachex.Cache
timeout time.Duration
logger logger.Logger
ctx context.Context // context
redis redis.UniversalClient // redis
cache *cachex.Cache // local cache
timeout time.Duration // job execution timeout
logger logger.Logger // logger
keyPrefix string // key prefix
location *time.Location // time zone used for time calculations
@@ -46,10 +39,15 @@ type Cluster struct {
priority *priority.Priority // global priority
priorityKey string // key for the global priority
usePriority bool
}
usePriority bool // whether priority is enabled
// var clu *Cluster = nil
wg sync.WaitGroup // wait group
isLeader bool // whether this instance is currently the leader
leaderLock sync.RWMutex // guards isLeader
leaderKey string // leader election key
workerList sync.Map // registered task list
stopChan chan struct{} // closed to stop the daemon goroutines
}
// Initialize the timer
// Only needs to be initialized once globally
@@ -58,6 +56,8 @@ func InitCluster(ctx context.Context, red redis.UniversalClient, keyPrefix strin
// clusterOnceLimit.Do(func() {
op := newOptions(opts...)
// u, _ := uuid.NewV7()
clu := &Cluster{
ctx: ctx,
redis: red,
@@ -71,7 +71,9 @@ func InitCluster(ctx context.Context, red redis.UniversalClient, keyPrefix strin
listKey: "timer:cluster_listKey" + keyPrefix, // ready list
setKey: "timer:cluster_setKey" + keyPrefix, // re-entry set
priorityKey: "timer:cluster_priorityKey" + keyPrefix, // key for the global priority
leaderKey: "timer:cluster_leaderKey" + keyPrefix, // leader election
usePriority: op.usePriority,
stopChan: make(chan struct{}),
}
// Initialize priority
@@ -80,33 +82,117 @@ func InitCluster(ctx context.Context, red redis.UniversalClient, keyPrefix strin
clu.priority = priority.InitPriority(ctx, red, clu.priorityKey, op.priorityVal, priority.SetLogger(clu.logger))
}
// Watch tasks
go clu.watch()
timer := time.NewTicker(time.Millisecond * 200)
go func(ctx context.Context) {
Loop:
for {
select {
case <-timer.C:
if clu.usePriority {
if !clu.priority.IsLatest(ctx) {
continue
}
}
clu.getTask()
clu.getNextTime()
case <-ctx.Done():
break Loop
}
}
}(ctx)
// Start the daemon goroutines
clu.startDaemon()
return clu
}
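// A minimal usage sketch (illustrative only, not part of this commit; the
// Redis address, key prefix, and callback are assumptions):
//
//	red := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
//	clu := InitCluster(context.Background(), red, "myapp")
//	err := clu.EverySpace(context.Background(), "demo_task", time.Minute,
//		func(ctx context.Context, extendData interface{}) error {
//			return nil // each due task is meant to run on one instance at a time
//		}, nil)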
// Daemon tasks
func (l *Cluster) startDaemon() {
// Leader election
l.wg.Add(1)
go l.leaderElection()
// Task scheduling
l.wg.Add(1)
go l.scheduleTasks()
// Task execution
l.wg.Add(1)
go l.executeTasks()
}
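// A possible shutdown counterpart built on wg and stopChan (a sketch; this
// commit does not add a Stop method):
//
//	func (l *Cluster) Stop() {
//		close(l.stopChan)
//		l.wg.Wait()
//	}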
// Leader election
// The leader's role: elect one instance globally to compute execution times and move tasks into the queue, so that every instance does not waste resources doing the same computation
func (l *Cluster) leaderElection() {
defer l.wg.Done()
// Run once immediately
l.getLeaderLock()
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
l.getLeaderLock()
case <-l.stopChan:
return
case <-l.ctx.Done():
return
}
}
}
// Try to become the leader
func (l *Cluster) getLeaderLock() error {
// Try to acquire the lock
lock, err := lockx.NewGlobalLock(l.ctx, l.redis, l.leaderKey)
if err != nil {
l.logger.Errorf(l.ctx, "getLeaderLock err:%+v", err)
return err
}
if b, _ := lock.Lock(); !b {
// Failed to acquire the lock: not the leader
l.leaderLock.Lock()
l.isLeader = false
l.leaderLock.Unlock()
return nil
}
defer lock.Unlock()
// Lock acquired
l.leaderLock.Lock()
l.isLeader = true
l.leaderLock.Unlock()
l.logger.Infof(l.ctx, "getLeaderLock Instance %s became leader", lock.GetValue())
// Block until the lock expires, then relinquish leadership immediately
// (rather than waiting for the next failed election tick)
<-lock.GetCtx().Done()
l.leaderLock.Lock()
l.isLeader = false
l.leaderLock.Unlock()
return nil
}
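// Note: while this instance holds the leader lock, getLeaderLock blocks until
// the lock's context expires, so the 5-second election ticker is effectively
// paused for the leader; followers retry on every tick.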
// isCurrentLeader reports whether this instance is currently the leader
func (c *Cluster) isCurrentLeader() bool {
c.leaderLock.RLock()
defer c.leaderLock.RUnlock()
return c.isLeader
}
// scheduleTasks schedules tasks (only the leader does the work)
func (c *Cluster) scheduleTasks() {
defer c.wg.Done()
ticker := time.NewTicker(200 * time.Millisecond)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if !c.isCurrentLeader() {
continue
}
if c.usePriority && !c.priority.IsLatest(c.ctx) {
continue
}
c.calculateNextTimes()
c.moveReadyTasks()
case <-c.stopChan:
return
case <-c.ctx.Done():
return
}
}
}
// Runs once per month
// @param ctx context
// @param taskId task ID
@@ -225,30 +311,21 @@ func (c *Cluster) EverySpace(ctx context.Context, taskId string, spaceTime time.
// @param callback callback function
// @param extendData interface{} extended data
// @return error
func (c *Cluster) addJob(ctx context.Context, taskId string, jobData JobData, callback func(ctx context.Context, extendData interface{}) error, extendData interface{}) error {
_, ok := clusterWorkerList.Load(taskId)
func (l *Cluster) addJob(ctx context.Context, taskId string, jobData JobData, callback func(ctx context.Context, extendData interface{}) error, extendData interface{}) error {
// Check for duplicate registration
_, ok := l.workerList.Load(taskId)
if ok {
c.logger.Errorf(ctx, "key已存在:%s", taskId)
return errors.New("key已存在")
l.logger.Errorf(ctx, "Cluster addJob taskId exits:%s", taskId)
return ErrTaskIdExists
}
_, err := GetNextTime(time.Now().In(c.location), jobData)
// Validate that the schedule is legal
_, err := GetNextTime(time.Now().In(l.location), jobData)
if err != nil {
c.logger.Errorf(ctx, "获取下次执行时间失败:%s", err.Error())
l.logger.Errorf(ctx, "Cluster addJob GetNextTime err:%s", err.Error())
return err
}
// ctx, cancel := context.WithCancel(ctx)
// defer cancel()
// lock := lockx.NewGlobalLock(ctx, c.redis, taskId)
// tB := lock.Try(2)
// if !tB {
// c.logger.Errorf(ctx, "添加失败:%s", taskId)
// return errors.New("添加失败")
// }
// defer lock.Unlock()
t := timerStr{
Callback: callback,
ExtendData: extendData,
@@ -256,196 +333,129 @@ func (c *Cluster) addJob(ctx context.Context, taskId string, jobData JobData, ca
JobData: &jobData,
}
clusterWorkerList.Store(taskId, t)
l.workerList.Store(taskId, t)
return err
l.logger.Infof(ctx, "Cluster addJob taskId:%s", taskId)
return nil
}
// Compute the next execution time
// TODO: registered tasks should be stored centrally in Redis; if they are only kept locally, multiple services can become inconsistent. Note that services also need a way to go offline: since registration is actively reported, a mechanism is needed to remove expired tasks (register on add & periodic re-registration by the timer)
// TODO: different instances may have different system clocks, so the computed next time can differ between them and a task may be executed more than once
func (c *Cluster) getNextTime() {
func (l *Cluster) calculateNextTimes() {
lock := lockx.NewGlobalLock(c.ctx, c.redis, c.lockKey)
// Acquire the lock
lockBool := lock.Lock()
if !lockBool {
// log.Println("timer: failed to acquire lock")
return
}
defer lock.Unlock()
// Compute the next time
// p := c.redis.Pipeline()
pipe := l.redis.Pipeline()
// Compute the next execution time for each locally registered task
clusterWorkerList.Range(func(key, value interface{}) bool {
l.workerList.Range(func(key, value interface{}) bool {
val := value.(timerStr)
nextTime, err := GetNextTime(time.Now().In(c.location), *val.JobData)
nextTime, err := GetNextTime(time.Now().In(l.location), *val.JobData)
if err != nil {
c.logger.Errorf(c.ctx, "获取下次执行时间失败:%s %s", val.TaskId, err.Error())
l.logger.Errorf(l.ctx, "Cluster calculateNextTimes GetNextTime err:%s %s", val.TaskId, err.Error())
return true
}
// Local duplicate check
cacheKey := fmt.Sprintf("%s_%s_%d", c.keyPrefix, val.TaskId, nextTime.UnixMilli())
cacheVal, err := c.cache.Get(cacheKey)
if err == nil {
// Cache already has a value
return true
}
valueNum := int(0)
if cacheVal != nil {
valueNum = cacheVal.(int)
}
if valueNum > 2 {
// Give up after 2 failed retries
return true
}
// l.logger.Infof(l.ctx, "Cluster calculateNextTimes GetNextTime nextTime:%s %s", val.TaskId, nextTime.Format(time.RFC3339))
// Redis Lua script: try to set an NX lock with a one-minute TTL; if it can be set, add the task to the sorted set zsetKey
// Add the task atomically with a Lua script
script := `
local zsetKey = KEYS[1]
local lockKey = KEYS[2]
local score = ARGV[1]
local taskID = ARGV[2]
local expireTime = ARGV[3]
local cacheKey = ARGV[1]
local expireTime = ARGV[2]
local score = ARGV[3]
local member = ARGV[4]
local res = redis.call('set', cacheKey, '', 'nx', 'ex', expireTime)
if res then
redis.call('zadd', zsetKey, score, member)
return "SUCCESS"
-- Skip if this task is already scheduled at an earlier or equal time
local existing = redis.call('zscore', zsetKey, taskID)
if existing and tonumber(existing) <= tonumber(score) then
return 0
end
return "ERROR"
-- Set an NX lock to avoid duplicate scheduling
local lockAcquired = redis.call('set', lockKey, 1, 'NX', 'EX', expireTime)
if not lockAcquired then
return 0
end
redis.call('zadd', zsetKey, score, taskID)
return 1
`
// TODO:
expireTime := time.Minute
res, err := c.redis.Eval(c.ctx, script, []string{c.zsetKey}, cacheKey, expireTime.Seconds(), nextTime.UnixMilli(), val.TaskId).Result()
valueNum++
if err == nil && res.(string) == "SUCCESS" {
// Set successfully
valueNum = 10
// fmt.Println("calc time 2", val.ExtendData, time.UnixMilli(nextTime.UnixMilli()).Format("2006-01-02 15:04:05"))
lockKey := fmt.Sprintf("%s:lock:calc:%s:%d", l.keyPrefix, val.TaskId, nextTime.UnixNano())
// Queue the script on the pipeline; with go-redis, queued commands only
// receive their results and errors after pipe.Exec, so Result() is not
// read here — errors surface from the Exec call below.
pipe.Eval(l.ctx, script, []string{l.zsetKey, lockKey},
nextTime.UnixMilli(), val.TaskId, 60)
c.cache.Set(cacheKey, valueNum, expireTime)
return true
})
// _, err := p.Exec(ctx)
// _ = err
_, err := pipe.Exec(l.ctx)
if err != nil {
l.logger.Errorf(l.ctx, "Cluster Failed to schedule task: %v", err)
}
}
// Fetch tasks
func (c *Cluster) getTask() {
// Periodically fetch tasks from Redis
// moveReadyTasks moves due tasks to the execution list
func (c *Cluster) moveReadyTasks() {
script := `
local token = redis.call('zrangebyscore',KEYS[1],ARGV[1],ARGV[2])
for i,v in ipairs(token) do
redis.call('zrem',KEYS[1],v)
redis.call('lpush',KEYS[2],v)
end
return "OK"
`
c.redis.Eval(c.ctx, script, []string{c.zsetKey, c.listKey}, 0, time.Now().UnixMilli()).Result()
local zsetKey = KEYS[1]
local listKey = KEYS[2]
local maxTime = ARGV[1]
local limit = ARGV[2]
local tasks = redis.call('zrangebyscore', zsetKey, 0, maxTime, 'LIMIT', 0, limit)
for i, taskID in ipairs(tasks) do
redis.call('zrem', zsetKey, taskID)
redis.call('lpush', listKey, taskID)
end
return #tasks
`
result, err := c.redis.Eval(c.ctx, script, []string{c.zsetKey, c.listKey},
time.Now().UnixMilli(), 100).Result()
if err != nil && err != redis.Nil {
c.logger.Errorf(c.ctx, "Failed to move ready tasks: %v", err)
return
}
if count, ok := result.(int64); ok && count > 0 {
c.logger.Infof(c.ctx, "Cluster moveReadyTasks Moved %d tasks to ready list", count)
}
}
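// The LIMIT of 100 caps how many due tasks are moved per call; since the
// leader runs moveReadyTasks every 200ms, any remaining backlog drains on
// subsequent ticks.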
// Watch tasks
func (c *Cluster) watch() {
// Execute tasks
go func() {
// executeTasks consumes and executes due tasks
func (c *Cluster) executeTasks() {
defer c.wg.Done()
for {
if c.usePriority {
if !c.priority.IsLatest(c.ctx) {
// Skip execution while the global priority is not satisfied
time.Sleep(time.Second * 5)
select {
case <-c.stopChan:
return
case <-c.ctx.Done():
return
default:
if c.usePriority && !c.priority.IsLatest(c.ctx) {
time.Sleep(5 * time.Second)
continue
}
}
keys, err := c.redis.BLPop(c.ctx, time.Second*10, c.listKey).Result()
taskID, err := c.redis.BLPop(c.ctx, 10*time.Second, c.listKey).Result()
if err != nil {
if err != redis.Nil {
c.logger.Errorf(c.ctx, "BLPop watch err:%+v", err)
}
continue
}
_, ok := clusterWorkerList.Load(keys[1])
if !ok {
c.logger.Errorf(c.ctx, "watch timer:任务不存在%+v", keys[1])
rd := ReJobData{
TaskId: keys[1],
Times: 1,
}
rdb, _ := json.Marshal(rd)
c.redis.SAdd(c.ctx, c.setKey, string(rdb))
continue
}
go c.doTask(c.ctx, keys[1])
}
}()
// Handle re-queued tasks
go func() {
for {
if c.usePriority {
if !c.priority.IsLatest(c.ctx) {
// Skip execution while the global priority is not satisfied
time.Sleep(time.Second * 5)
continue
}
}
res, err := c.redis.SPop(c.ctx, c.setKey).Result()
if err != nil {
if err == redis.Nil {
// The set is already empty, don't waste resources
time.Sleep(time.Second)
} else {
c.logger.Errorf(c.ctx, "SPop watch err:%+v", err)
c.logger.Errorf(c.ctx, "Failed to pop task: %v", err)
}
continue
}
var rd ReJobData
err = json.Unmarshal([]byte(res), &rd)
if err != nil {
c.logger.Errorf(c.ctx, "json.Unmarshal err:%+v", err)
if len(taskID) < 2 {
continue
}
_, ok := clusterWorkerList.Load(rd.TaskId)
if !ok {
c.logger.Errorf(c.ctx, "watch timer:任务不存在%+v", rd.TaskId)
if rd.Times >= 3 {
// Give up after 3 failed retries
continue
go c.processTask(taskID[1])
}
rd.Times++
rdb, _ := json.Marshal(rd)
c.redis.SAdd(c.ctx, c.setKey, string(rdb))
continue
}
go c.doTask(c.ctx, rd.TaskId)
}
}()
}
@@ -455,39 +465,52 @@ type ReJobData struct {
}
// Execute a task
func (c *Cluster) doTask(ctx context.Context, taskId string) {
func (l *Cluster) processTask(taskId string) {
ctx, cancel := context.WithTimeout(ctx, c.timeout)
ctx, cancel := context.WithTimeout(l.ctx, l.timeout)
defer cancel()
val, ok := clusterWorkerList.Load(taskId)
val, ok := l.workerList.Load(taskId)
if !ok {
c.logger.Errorf(ctx, "doTask timer:任务不存在:%s", taskId)
l.logger.Errorf(ctx, "doTask timer:任务不存在:%s", taskId)
return
}
t, ok := val.(timerStr)
if !ok {
c.logger.Errorf(ctx, "doTask timer:任务不存在:%s", taskId)
l.logger.Errorf(ctx, "doTask timer:任务不存在:%s", taskId)
return
}
// Acquire a global lock here
lock := lockx.NewGlobalLock(ctx, c.redis, taskId)
tB := lock.Lock()
if !tB {
c.logger.Errorf(ctx, "doTask timer:获取锁失败:%s", taskId)
lock, err := lockx.NewGlobalLock(ctx, l.redis, taskId)
if err != nil {
l.logger.Errorf(ctx, "doTask timer:获取锁失败:%s", taskId)
return
}
if b, err := lock.Lock(); !b {
l.logger.Errorf(ctx, "doTask timer:获取锁失败:%s %+v", taskId, err)
return
}
defer lock.Unlock()
begin := time.Now()
defer func() {
if err := recover(); err != nil {
c.logger.Errorf(ctx, "timer:回调任务panic err:%+v stack:%s", err, string(debug.Stack()))
l.logger.Errorf(ctx, "timer:回调任务panic err:%+v stack:%s", err, string(debug.Stack()))
}
l.logger.Infof(ctx, "doTask timer:执行任务耗时:%s %dms", taskId, time.Since(begin).Milliseconds())
}()
ctx = context.WithValue(ctx, "trace_id", uuid.NewV4().String())
u, _ := uuid.NewV7()
ctx = context.WithValue(ctx, "trace_id", u.String())
// Run the task
t.Callback(ctx, t.ExtendData)
if err := t.Callback(ctx, t.ExtendData); err != nil {
l.logger.Errorf(ctx, "doTask timer:执行任务失败:%s %+v", taskId, err)
return
}
l.logger.Infof(ctx, "doTask timer:执行任务成功:%s", taskId)
}
+8 -1
@@ -14,6 +14,8 @@ func TestCluster_AddEveryMonth(t *testing.T) {
ctx := context.Background()
redis := redis.NewClient(&redis.Options{
Addr: "localhost:6379",
Password: "123456",
DB: 0,
})
defer redis.Close()
@@ -35,6 +37,8 @@ func TestCluster_AddEveryMonth(t *testing.T) {
t.Errorf("AddEveryMonth failed, err: %v", err)
}
time.Sleep(time.Second * 10)
// TODO: verify the job is added to the cluster and can be executed at the specified time
}
@@ -154,6 +158,8 @@ func TestCluster_Add(t *testing.T) {
fmt.Println("66666")
redis := redis.NewClient(&redis.Options{
Addr: "localhost:6379",
Password: "123456",
DB: 0,
})
defer redis.Close()
@@ -172,9 +178,10 @@ func TestCluster_Add(t *testing.T) {
err := cluster.EverySpace(ctx, taskId, dur, callback, extendData)
if err != nil {
t.Errorf("Add failed, err: %v", err)
t.Errorf("Add failed,1 err: %v", err)
}
time.Sleep(time.Second * 20)
+2
@@ -22,4 +22,6 @@ var (
ErrCreateTime = errors.New("create time can not be empty")
// Interval time must be greater than 0
ErrIntervalTime = errors.New("interval time must be greater than 0")
// Task ID already exists
ErrTaskIdExists = errors.New("taskId already exists")
)
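// Callers can distinguish duplicate registration with errors.Is (a usage
// sketch; the task ID and callback are illustrative):
//
//	if err := clu.EverySpace(ctx, "demo_task", time.Minute, cb, nil); errors.Is(err, ErrTaskIdExists) {
//		// already registered; safe to ignore
//	}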
+2 -1
@@ -4,10 +4,11 @@ go 1.21
require (
github.com/go-redis/redis/v8 v8.11.5
github.com/google/uuid v1.6.0
github.com/satori/go.uuid v1.2.0
github.com/stretchr/testify v1.11.1
github.com/yuninks/cachex v1.0.5
github.com/yuninks/lockx v1.0.2
github.com/yuninks/lockx v1.1.0
)
require (
+6 -2
@@ -8,6 +8,8 @@ github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWo
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
github.com/go-redis/redis/v8 v8.11.5 h1:AcZZR7igkdvfVmQTPnu9WE37LRrO/YrBH5zWyjDC0oI=
github.com/go-redis/redis/v8 v8.11.5/go.mod h1:gREzHqY1hg6oD9ngVRbLStwAWKhA0FEgq8Jd4h5lpwo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@@ -23,12 +25,14 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww=
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/yuninks/cachex v1.0.5 h1:Y2NmTsuEgwEVYb7FVFh5tUN67kmrUioeksQqLbOAwsM=
github.com/yuninks/cachex v1.0.5/go.mod h1:5357qz18UvHTJSgZzkMamUzZoFzGeKG9+4tIUBXRSVM=
github.com/yuninks/lockx v1.0.2 h1:p0n791WmsU8D7YF2tQaNLwPE75jdd774unlJZRTNfaw=
github.com/yuninks/lockx v1.0.2/go.mod h1:J6wvuUELLcMn6FCmiZFt7K5w1QQAh1myL7h3JrZaQiQ=
github.com/yuninks/lockx v1.1.0 h1:Qkf+RqYQ6Vr242h3dWxWJqIvIp9/6j5ijTkbuWvrgqU=
github.com/yuninks/lockx v1.1.0/go.mod h1:Y/MtD+4Zc79/0Qsdd9c99jxW5WWxXVZlxcJyYJFJy5A=
golang.org/x/net v0.0.0-20210428140749-89ef3d95e781 h1:DzZ89McO9/gWPsQXS/FVKAlG02ZjaQ6AlZRBimEYOd0=
golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk=
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e h1:fLOSk5Q00efkSvAm+4xcoXD+RRmLmmulPn5I3Y9F2EM=