YOLOv4-tiny
batch = 64
subdivisions = 64
width = 416
height = 416
channels = 3
momentum = 0.9
decay = 0.0005
angle = 0
saturation = 1.5
exposure = 1.5
hue = .1
learning_rate = 0.00261
burn_in = 1000
max_batches = 2000200
policy = steps
steps = 1600000, 1800000
scales = .1, .1
no layer batch_normalize activation pad filters size/stride input output
0 conv 1 leaky 1 32 3x3/2 416x416x3 208x208x32
1 conv 1 leaky 1 64 3x3/2 208x208x32 104x104x64
2 conv 1 leaky 1 64 3x3/1 104x104x64 104x104x64
3 route -1 /2 104x104x32
4 conv 1 leaky 1 32 3x3/1 104x104x32 104x104x32
5 conv 1 leaky 1 32 3x3/1 104x104x32 104x104x32
6 route -1, -2 104x104x64
7 conv 1 leaky 1 64 1x1/1 104x104x64 104x104x64
8 route -6, -1 104x104x128
9 max 2x2/2 104x104x128 52x52x128
10 conv 1 leaky 1 128 3x3/1 52x52x128 52x52x128
11 route -1 /2 52x52x64
12 conv 1 leaky 1 64 3x3/1 52x52x64 52x52x64
13 conv 1 leaky 1 64 3x3/1 52x52x64 52x52x64
14 route -1, -2 52x52x128
15 conv 1 leaky 1 128 1x1/1 52x52x128 52x52x128
16 route -6, -1 52x52x256
17 max 2x2/2 52x52x256 26x26x256
18 conv 1 leaky 1 256 3x3/1 26x26x256 26x26x256
19 route -1 /2 26x26x128
20 conv 1 leaky 1 128 3x3/1 26x26x128 26x26x128
21 conv 1 leaky 1 128 3x3/1 26x26x128 26x26x128
22 route -1, -2 26x26x256
23 conv 1 leaky 1 256 1x1/1 26x26x256 26x26x256
24 route -6, -1 26x26x512
25 max 2x2/2 26x26x512 13x13x512
26 conv 1 leaky 1 512 3x3/1 13x13x512 13x13x512
27 conv 1 leaky 1 256 1x1/1 13x13x512 13x13x256
28 conv 1 leaky 1 512 3x3/1 13x13x256 13x13x512
29 conv linear 1 255 1x1/1 13x13x512 13x13x255
30 yolo
31 route -4 13x13x256
32 conv 1 leaky 1 128 1x1/1 13x13x256 13x13x128
33 up x2 13x13x128 26x26x128
34 route -1, 23 26x26x384
35 conv 1 leaky 1 256 3x3/1 26x26x384 26x26x256
36 conv 1 leaky 1 255 1x1/1 26x26x256 26x26x255
37 yolo
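In the table above, "route -1 /2" denotes darknet's grouped route: the layer takes the output of the referenced layer but keeps only one of its channel groups, which is how the tiny CSP blocks split a feature map in half (layer 2's 64 channels become layer 3's 32). Assuming the standard yolov4-tiny.cfg form, the layer-3 entry looks like this sketch:

[route]
layers=-1
groups=2
group_id=1

groups=2 splits the incoming channels into two halves and group_id=1 selects the second half.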
YOLOv4
batch = 64
subdivisions = 8
width = 608
height = 608
channels = 3
momentum = 0.949
decay = 0.0005
angle = 0
saturation = 1.5
exposure = 1.5
hue = .1
learning_rate = 0.0013
burn_in = 1000
max_batches = 500500
policy = steps
steps = 400000, 450000
scales = .1, .1
mosaic = 1
no layer batch_normalize activation pad filters size/stride input output
0 conv 1 mish 1 32 3x3/1 608x608x3 608x608x32
1 conv 1 mish 1 64 3x3/2 608x608x32 304x304x64
2 conv 1 mish 1 64 1x1/1 304x304x64 304x304x64
3 route -2 304x304x64
4 conv 1 mish 1 64 1x1/1 304x304x64 304x304x64
5 conv 1 mish 1 32 1x1/1 304x304x64 304x304x32
6 conv 1 mish 1 64 3x3/1 304x304x32 304x304x64
7 short -3 linear 304x304x64
8 conv 1 mish 1 64 1x1/1 304x304x64 304x304x64
9 route -1, -7 304x304x128
10 conv 1 mish 1 64 1x1/1 304x304x128 304x304x64
11 conv 1 mish 1 128 3x3/2 304x304x64 152x152x128
12 conv 1 mish 1 64 1x1/1 152x152x128 152x152x64
13 route -2 152x152x128
14 conv 1 mish 1 64 1x1/1 152x152x128 152x152x64
15 conv 1 mish 1 64 1x1/1 152x152x64 152x152x64
16 conv 1 mish 1 64 3x3/1 152x152x64 152x152x64
17 short -3 linear 152x152x64
18 conv 1 mish 1 64 1x1/1 152x152x64 152x152x64
19 conv 1 mish 1 64 3x3/1 152x152x64 152x152x64
20 short -3 linear 152x152x64
21 conv 1 mish 1 64 1x1/1 152x152x64 152x152x64
22 route -1, -10 152x152x128
23 conv 1 mish 1 128 1x1/1 152x152x128 152x152x128
24 conv 1 mish 1 256 3x3/2 152x152x128 76x76x256
25 conv 1 mish 1 128 1x1/1 76x76x256 76x76x128
26 route -2 76x76x256
27 conv 1 mish 1 128 1x1/1 76x76x256 76x76x128
28 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
29 conv 1 mish 1 128 3x3/1 76x76x128 76x76x128
30 short -3 linear 76x76x128
31 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
32 conv 1 mish 1 128 3x3/1 76x76x128 76x76x128
33 short -3 linear 76x76x128
34 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
35 conv 1 mish 1 128 3x3/1 76x76x128 76x76x128
36 short -3 linear 76x76x128
37 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
38 conv 1 mish 1 128 3x3/1 76x76x128 76x76x128
39 short -3 linear 76x76x128
40 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
41 conv 1 mish 1 128 3x3/1 76x76x128 76x76x128
42 short -3 linear 76x76x128
43 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
44 conv 1 mish 1 128 3x3/1 76x76x128 76x76x128
45 short -3 linear 76x76x128
46 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
47 conv 1 mish 1 128 3x3/1 76x76x128 76x76x128
48 short -3 linear 76x76x128
49 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
50 conv 1 mish 1 128 3x3/1 76x76x128 76x76x128
51 short -3 linear 76x76x128
52 conv 1 mish 1 128 1x1/1 76x76x128 76x76x128
53 route -1, -28 76x76x256
54 conv 1 mish 1 256 1x1/1 76x76x256 76x76x256
55 conv 1 mish 1 512 3x3/2 76x76x256 38x38x512
56 conv 1 mish 1 256 1x1/1 38x38x512 38x38x256
57 route -2 38x38x512
58 conv 1 mish 1 256 1x1/1 38x38x512 38x38x256
59 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
60 conv 1 mish 1 256 3x3/1 38x38x256 38x38x256
61 short -3 linear 38x38x256
62 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
63 conv 1 mish 1 256 3x3/1 38x38x256 38x38x256
64 short -3 linear 38x38x256
65 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
66 conv 1 mish 1 256 3x3/1 38x38x256 38x38x256
67 short -3 linear 38x38x256
68 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
69 conv 1 mish 1 256 3x3/1 38x38x256 38x38x256
70 short -3 linear 38x38x256
71 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
72 conv 1 mish 1 256 3x3/1 38x38x256 38x38x256
73 short -3 linear 38x38x256
74 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
75 conv 1 mish 1 256 3x3/1 38x38x256 38x38x256
76 short -3 linear 38x38x256
77 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
78 conv 1 mish 1 256 3x3/1 38x38x256 38x38x256
79 short -3 linear 38x38x256
80 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
81 conv 1 mish 1 256 3x3/1 38x38x256 38x38x256
82 short -3 linear 38x38x256
83 conv 1 mish 1 256 1x1/1 38x38x256 38x38x256
84 route -1, -28 38x38x512
85 conv 1 mish 1 512 1x1/1 38x38x512 38x38x512
86 conv 1 mish 1 1024 3x3/2 38x38x512 19x19x1024
87 conv 1 mish 1 512 1x1/1 19x19x1024 19x19x512
88 route -2 19x19x1024
89 conv 1 mish 1 512 1x1/1 19x19x1024 19x19x512
90 conv 1 mish 1 512 1x1/1 19x19x512 19x19x512
91 conv 1 mish 1 512 3x3/1 19x19x512 19x19x512
92 short -3 linear 19x19x512
93 conv 1 mish 1 512 1x1/1 19x19x512 19x19x512
94 conv 1 mish 1 512 3x3/1 19x19x512 19x19x512
95 short -3 linear 19x19x512
96 conv 1 mish 1 512 1x1/1 19x19x512 19x19x512
97 conv 1 mish 1 512 3x3/1 19x19x512 19x19x512
98 short -3 linear 19x19x512
99 conv 1 mish 1 512 1x1/1 19x19x512 19x19x512
100 conv 1 mish 1 512 3x3/1 19x19x512 19x19x512
101 short -3 linear 19x19x512
102 conv 1 mish 1 512 1x1/1 19x19x512 19x19x512
103 route -1, -16 19x19x1024
104 conv 1 mish 1 1024 1x1/1 19x19x1024 19x19x1024
105 conv 1 leaky 1 512 1x1/1 19x19x1024 19x19x512
106 conv 1 leaky 1 1024 3x3/1 19x19x512 19x19x1024
107 conv 1 leaky 1 512 1x1/1 19x19x1024 19x19x512
108 max 5x5/1 19x19x512 19x19x512
109 route -2 19x19x512
110 max 9x9/1 19x19x512 19x19x512
111 route -4 19x19x512
112 max 13x13/1 19x19x512 19x19x512
113 route -1, -3, -5, -6 19x19x2048
114 conv 1 leaky 1 512 1x1/1 19x19x2048 19x19x512
115 conv 1 leaky 1 1024 3x3/1 19x19x512 19x19x1024
116 conv 1 leaky 1 512 1x1/1 19x19x1024 19x19x512
117 conv 1 leaky 1 256 1x1/1 19x19x512 19x19x256
118 up x2 19x19x256 38x38x256
119 route 85 38x38x512
120 conv 1 leaky 1 256 1x1/1 38x38x512 38x38x256
121 route -1, -3 38x38x512
122 conv 1 leaky 1 256 1x1/1 38x38x512 38x38x256
123 conv 1 leaky 1 512 3x3/1 38x38x256 38x38x512
124 conv 1 leaky 1 256 1x1/1 38x38x512 38x38x256
125 conv 1 leaky 1 512 3x3/1 38x38x256 38x38x512
126 conv 1 leaky 1 256 1x1/1 38x38x512 38x38x256
127 conv 1 leaky 1 128 1x1/1 38x38x256 38x38x128
128 up x2 38x38x128 76x76x128
129 route 54 76x76x256
130 conv 1 leaky 1 128 1x1/1 76x76x256 76x76x128
131 route -1, -3 76x76x256
132 conv 1 leaky 1 128 1x1/1 76x76x256 76x76x128
133 conv 1 leaky 1 256 3x3/1 76x76x128 76x76x256
134 conv 1 leaky 1 128 1x1/1 76x76x256 76x76x128
135 conv 1 leaky 1 256 3x3/1 76x76x128 76x76x256
136 conv 1 leaky 1 128 1x1/1 76x76x256 76x76x128
137 conv 1 leaky 1 256 3x3/1 76x76x128 76x76x256
138 conv linear 1 255 1x1/1 76x76x256 76x76x255
139 yolo
140 route -4 76x76x128
141 conv 1 leaky 1 256 3x3/2 76x76x128 38x38x256
142 route -1, -16 38x38x512
143 conv 1 leaky 1 256 1x1/1 38x38x512 38x38x256
144 conv 1 leaky 1 512 3x3/1 38x38x256 38x38x512
145 conv 1 leaky 1 256 1x1/1 38x38x512 38x38x256
146 conv 1 leaky 1 512 3x3/1 38x38x256 38x38x512
147 conv 1 leaky 1 256 1x1/1 38x38x512 38x38x256
148 conv 1 leaky 1 512 3x3/1 38x38x256 38x38x512
149 conv linear 1 255 1x1/1 38x38x512 38x38x255
150 yolo
151 route -4 38x38x256
152 conv 1 leaky 1 512 3x3/2 38x38x256 19x19x512
153 route -1, -37 19x19x1024
154 conv 1 leaky 1 512 1x1/1 19x19x1024 19x19x512
155 conv 1 leaky 1 1024 3x3/1 19x19x512 19x19x1024
156 conv 1 leaky 1 512 1x1/1 19x19x1024 19x19x512
157 conv 1 leaky 1 1024 3x3/1 19x19x512 19x19x1024
158 conv 1 leaky 1 512 1x1/1 19x19x1024 19x19x512
159 conv 1 leaky 1 1024 3x3/1 19x19x512 19x19x1024
160 conv linear 1 255 1x1/1 19x19x1024 19x19x255
161 yolo
YOLO layers in YOLOv4-tiny
[yolo]
mask = 3,4,5
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
scale_x_y = 1.05
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
ignore_thresh = .7
truth_thresh = 1
random=0
resize=1.5
nms_kind=greedynms
beta_nms=0.6
[yolo]
mask = 1,2,3
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
scale_x_y = 1.05
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
ignore_thresh = .7
truth_thresh = 1
random=0
resize=1.5
nms_kind=greedynms
beta_nms=0.6
YOLO layers in YOLOv4
[yolo]
mask = 0,1,2
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
scale_x_y = 1.2
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
max_delta=5
[yolo]
mask = 3,4,5
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
scale_x_y = 1.1
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
max_delta=5
[yolo]
mask = 6,7,8
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
scale_x_y = 1.05
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
max_delta=5
net layer parameters
batch : the number of images processed together in one batch (higher values increase VRAM usage)
Reportedly an excessively high value can even hurt training; opinions differ, but most agree that values up to 64 are fine
subdivisions : how many sub-batches each batch is split into during training (higher values reduce VRAM usage) (note: subdivisions <= batch)
The lower the subdivisions value, the better the accuracy and speed
Whether setting subdivisions equal to batch differs from setting batch to 1 still needs verification
width : horizontal size of the images used for training (larger values can give better results, but VRAM usage and training time increase)
height : vertical size of the images used for training (larger values can give better results, but VRAM usage and training time increase)
channels : number of channels in the training images (usually 3, for RGB)
momentum : accumulation of movement; how much previous updates affect further weight changes
decay : shrinks the weights so they do not grow too large, which also removes imbalance in the dataset
Larger weights receive a larger penalty, which prevents overfitting
angle : angle by which images are randomly rotated (classification only)
saturation : randomly changes the saturation of images during training
exposure : randomly changes the exposure of images during training
hue : randomly changes the hue of images during training
learning_rate : the initial learning rate
burn_in : slowly increases the learning rate up to the set number of iterations; used to monitor until the loss starts decreasing and settle on a learning rate
max_batches : the maximum number of training iterations
(classes * 2000) + 200 or (classes * 4000) + 200 is commonly used
policy : how the learning rate is changed during training
steps : at the specified iterations, the learning rate is multiplied by the corresponding scales value (must have as many entries as scales); see the sketch after this list
80% and 90% of max_batches (before adding the +200) are the usual defaults
scales : the values by which the learning rate is multiplied at the iterations given in steps (must have as many entries as steps)
mosaic : combines 4 images into one image for training
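To make the learning_rate, burn_in, policy=steps, steps, and scales interaction concrete, here is a minimal Python sketch of the resulting schedule, assuming darknet's default burn-in exponent power=4 (quoted in the wiki section later in this post); the function name is ours:

def current_lr(iteration, lr=0.0013, burn_in=1000, power=4,
               steps=(400000, 450000), scales=(0.1, 0.1)):
    # During burn-in the rate ramps up as lr * (iteration / burn_in)^power
    if iteration < burn_in:
        return lr * (iteration / burn_in) ** power
    # After burn-in, multiply by each scale whose step has been passed
    for step, scale in zip(steps, scales):
        if iteration >= step:
            lr *= scale
    return lr

# YOLOv4 values from this post:
print(current_lr(500))     # still ramping up during burn-in
print(current_lr(100000))  # 0.0013
print(current_lr(420000))  # 0.0013 * 0.1
print(current_lr(460000))  # 0.0013 * 0.1 * 0.1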
yolo layer parameters
mask : indexes of the anchors used by this yolo layer
anchors : initial sizes of the bounding boxes that will be adjusted
classes : the number of classes
num : the total number of anchors
jitter : randomly changes the size of the image
The size is changed randomly in the range (1 - 2 * jitter) ~ (1 + 2 * jitter)
ignore_thresh : if IoU(detect, truth) > ignore_thresh, duplicate detections are kept and fused during NMS (used only during training)
truth_thresh : if IoU(detect, truth) > truth_thresh, duplicate detections are adjusted and fused during NMS (used only during training)
random : every 10 batches, randomly resizes the network in the range 1/1.4 ~ 1.4 while keeping the initial aspect ratio of the network size
scale_x_y : eliminates grid sensitivity (see the decoding sketch after this list)
iou_thresh : if IoU(Obj, Anchor) > iou_thresh, multiple anchors are used per object
cls_normalizer : normalizer for delta-objectness
iou_normalizer : normalizer for delta-IoU
iou_loss : the IoU loss; mse, giou, diou, and ciou are available
nms_kind : selects the NMS variant; greedynms and diounms are available
beta_nms : used only with diounms; increasing the value yields fewer bounding boxes per object
max_delta : limits the delta for each entry
resize : randomly changes the image size in the range 1/{value} ~ 1*{value}
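To make "eliminates grid sensitivity" concrete, here is a hedged Python sketch of how scale_x_y enters the box-center decoding; this follows the commonly cited darknet formula sigmoid(tx) * scale_x_y - 0.5 * (scale_x_y - 1), and the variable names are ours:

import math

def decode_center_x(tx, cx, grid_w, scale_x_y=1.05):
    # Plain YOLOv3 decoding is sigmoid(tx) + cx, which can only reach the
    # cell borders (offset 0 or 1) when tx goes to +/- infinity, because
    # the sigmoid saturates. scale_x_y stretches the sigmoid so the borders
    # are reachable with finite tx.
    s = 1.0 / (1.0 + math.exp(-tx))
    offset = s * scale_x_y - 0.5 * (scale_x_y - 1.0)
    return (cx + offset) / grid_w   # normalized to [0, 1]

# With scale_x_y = 1.05 the offset range widens from (0, 1) to (-0.025, 1.025)
print(decode_center_x(tx=10.0, cx=5, grid_w=13))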
In YOLOv4, about 2/3 of the conv layers use the mish activation and the remaining 1/3 use leaky,
while in YOLOv4-tiny every conv layer uses leaky.
In both YOLOv4 and YOLOv4-tiny, the conv layer right before each yolo layer uses a linear activation.
The maximum channel count is 2048 in YOLOv4 and 512 in YOLOv4-tiny.
YOLOv4 uses shortcut layers, but YOLOv4-tiny does not.
The shortcut layers use a linear activation.
YOLOv4 contains groups of layers that are repeated many times using shortcut and route.
In YOLOv4, max_batches does not follow the classes * 2000 + 200 rule.
Since the yolo layers have classes = 80 it would have to be 160200, but it is set to 500500 instead;
because max_batches corresponds to what other frameworks call epochs, I do not think the rule has to be followed strictly.
YOLOv4 has mosaic enabled, but YOLOv4-tiny does not.
YOLOv4 has 3 yolo layers, while YOLOv4-tiny has 2.
In general, a 3x3 kernel size is used when increasing the number of channels and 1x1 when decreasing it.
YOLOv4 has a section that alternates maxpool and route layers,
running maxpool with kernel sizes 5, 9, and 13 at stride 1 (the SPP block; a cfg sketch follows below).
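This is the SPP (spatial pyramid pooling) pattern: three stride-1 maxpools over the same 19x19x512 map, concatenated with the input to give 512 * 4 = 2048 channels. Reconstructed from layers 107-113 of the table above (a sketch in cfg form, not a verbatim quote of yolov4.cfg):

[maxpool]
stride=1
size=5

[route]
layers=-2

[maxpool]
stride=1
size=9

[route]
layers=-4

[maxpool]
stride=1
size=13

[route]
layers=-1,-3,-5,-6    # concat the three pooled maps with the original input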
YOLOv4-tiny adds yolo layers at the 13 and 26 resolutions, and YOLOv4 at the 76, 38, and 19 resolutions, to perform detection.
leaky presumably refers to leaky ReLU,
an improved version of ReLU that avoids the dying-ReLU problem and returns more balanced values than ReLU.
ReLU is very fast to compute and non-linear, but because of the dying-ReLU problem the gradient becomes 0 for inputs at or below 0, so those units stop learning.
mish is unbounded above, which avoids the saturation caused by capping, and it allows slightly negative values, so gradients flow better than across ReLU's zero bound.
Even deep network architectures train more stably with it.
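For reference, both activations are one-liners. A quick Python sketch (numpy assumed; darknet's leaky slope is 0.1):

import numpy as np

def leaky_relu(x, alpha=0.1):
    # negative inputs keep a small slope instead of being zeroed
    return np.where(x > 0, x, alpha * x)

def mish(x):
    # mish(x) = x * tanh(softplus(x)): smooth, unbounded above,
    # slightly negative for small negative x
    return x * np.tanh(np.log1p(np.exp(x)))

x = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(leaky_relu(x))
print(mish(x))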
There is a 0.049 difference in momentum between YOLOv4 and YOLOv4-tiny, with YOLOv4 having the higher value.
YOLOv4's learning_rate is 0.0013 and YOLOv4-tiny's is 0.00261,
so YOLOv4's learning rate is about half as large.
For the mask of the yolo layers,
YOLOv4 uses 0,1,2 -> 3,4,5 -> 6,7,8,
while YOLOv4-tiny uses 3,4,5 -> 1,2,3.
For the anchors of the yolo layers,
YOLOv4 uses 12,16, 19,36, 40,28, 36,75, 76,55, 72,146, 142,110, 192,243, 459,401,
and YOLOv4-tiny uses 10,14, 23,27, 37,58, 81,82, 135,169, 344,319.
The anchor values differ depending on the input image resolution and the number of anchors.
Differences in detection performance are expected depending on the anchors and mask,
and more information is needed on how anchors and mask should be chosen (a sketch of how mask indexes into anchors follows below).
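What is unambiguous from the cfg is the indexing itself: anchors is a flat list of num (w, h) pairs, and each yolo layer keeps only the pairs whose index appears in its mask. A minimal Python sketch using the YOLOv4 values above:

# 9 (w, h) anchor pairs, smallest to largest
ANCHORS = [(12, 16), (19, 36), (40, 28), (36, 75), (76, 55),
           (72, 146), (142, 110), (192, 243), (459, 401)]

def anchors_for_layer(mask, anchors=ANCHORS):
    # Select the anchor pairs a given [yolo] layer uses via its mask
    return [anchors[i] for i in mask]

print(anchors_for_layer([0, 1, 2]))  # 76x76 head: smallest anchors
print(anchors_for_layer([3, 4, 5]))  # 38x38 head: medium anchors
print(anchors_for_layer([6, 7, 8]))  # 19x19 head: largest anchors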
The scale_x_y value in YOLOv4 approaches 1 in the order 1.2 -> 1.1 -> 1.05,
while YOLOv4-tiny uses only 1.05.
More information is needed on this point.
YOLOv4's yolo layers have no resize parameter,
but YOLOv4-tiny's yolo layers do.
Presumably the images were kept small to save memory, so the yolo layer enlarges them slightly to compensate.
CFG-Parameters in the [net] section:
[net] section
batch=1 - number of samples (images, letters, ...) which will be processed in one batch
subdivisions=1 - number of mini_batches in one batch, size mini_batch = batch/subdivisions, so GPU processes mini_batch samples at once, and the weights will be updated for batch samples (1 iteration processes batch images)
width=416 - network size (width), so every image will be resized to the network size during Training and Detection
height=416 - network size (height), so every image will be resized to the network size during Training and Detection
channels=3 - network size (channels), so every image will be converted to this number of channels during Training and Detection
inputs=256 - network size (inputs) is used for non-image data: letters, prices, any custom data
max_chart_loss=20 - max value of Loss in the image chart.png
For training only
Contrastive loss:
contrastive=1 - use Supervised contrastive loss for training Classifier (should be used with [contrastive] layer)
unsupervised=1 - use Unsupervised contrastive loss for training Classifier on images without labels (should be used with contrastive=1 parameter and with [contrastive] layer)
Data augmentation:
angle=0 - randomly rotates images during training (classification only)
saturation = 1.5 - randomly changes saturation of images during training
exposure = 1.5 - randomly changes exposure (brightness) during training
hue=.1 - randomly changes hue (color) during training https://en.wikipedia.org/wiki/HSL_and_HSV
blur=1 - blur will be applied randomly 50% of the time: if 1 - the background will be blurred except for objects, with blur_kernel=31; if >1 - the whole image will be blurred with blur_kernel=blur (only for detection and only if OpenCV is used)
min_crop=224 - minimum size of randomly cropped image (classification only)
max_crop=448 - maximum size of randomly cropped image (classification only)
aspect=.75 - the aspect ratio can be changed during cropping from 0.75 to 1/0.75 (classification only)
letter_box=1 - keeps aspect ratio of loaded images during training (detection training only, but to use it during detection-inference - use flag -letter_box at the end of detection command)
cutmix=1 - use CutMix data augmentation (for Classifier only, not for Detector)
mosaic=1 - use Mosaic data augmentation (4 images in one; see the sketch after this list)
mosaic_bound=1 - limits the size of objects when mosaic=1 is used (does not allow bounding boxes to leave the borders of their images when Mosaic-data-augmentation is used)
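A simplified Python sketch of the Mosaic idea referenced above (4 images tiled around a random center; real darknet additionally random-crops each source and remaps the bounding boxes, which this sketch omits):

import numpy as np

def mosaic4(imgs, out_size=608, rng=None):
    # Tile 4 images into one canvas split at a random center point
    if rng is None:
        rng = np.random.default_rng()
    cx = int(rng.uniform(0.3, 0.7) * out_size)
    cy = int(rng.uniform(0.3, 0.7) * out_size)
    canvas = np.zeros((out_size, out_size, 3), dtype=np.uint8)
    regions = [(0, cy, 0, cx), (0, cy, cx, out_size),
               (cy, out_size, 0, cx), (cy, out_size, cx, out_size)]
    for img, (y0, y1, x0, x1) in zip(imgs, regions):
        h, w = y1 - y0, x1 - x0
        canvas[y0:y1, x0:x1] = img[:h, :w]   # top-left crop of each source
    return canvas

imgs = [np.full((608, 608, 3), v, np.uint8) for v in (40, 90, 150, 220)]
print(mosaic4(imgs).shape)   # (608, 608, 3)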
data augmentation in the last [yolo]-layer
jitter=0.3 - randomly changes size of image and its aspect ratio from x(1 - 2*jitter) to x(1 + 2*jitter)
random=1 - randomly resizes network size after each 10 batches (iterations) from /1.4 to x1.4 with keeping initial aspect ratio of network size
adversarial_lr=1.0 - changes all detected objects to make them unlike themselves from the neural network's point of view; the neural network performs an adversarial attack on itself
attention=1 - shows points of attention during training
gaussian_noise=1 - add gaussian noise
Optimizator:
momentum=0.9 - accumulation of movement, how much the history affects the further change of weights (optimizer)
decay=0.0005 - a weaker updating of the weights for typical features, it eliminates imbalance in the dataset (optimizer) http://cs231n.github.io/neural-networks-3/
learning_rate=0.001 - initial learning rate for training
burn_in=1000 - initial burn_in will be processed for the first 1000 iterations, current_learning_rate = learning_rate * pow(iterations / burn_in, power) = 0.001 * pow(iterations/1000, 4), where power=4 by default
max_batches = 500200 - the training will be processed for this number of iterations (batches)
policy=steps - policy for changing learning rate: constant (by default), sgdr, steps, step, sig, exp, poly, random (f.e., if policy=random - then current learning rate will be changed in this way = learning_rate * pow(rand_uniform(0,1), power))
power=4 - if policy=poly - the learning rate will be = learning_rate * pow(1 - current_iteration / max_batches, power)
sgdr_cycle=1000 - if policy=sgdr - the initial number of iterations in cosine-cycle
sgdr_mult=2 - if policy=sgdr - multiplier for cosine-cycle https://towardsdatascience.com/https-medium-com-reina-wang-tw-stochastic-gradient-descent-with-restarts-5f511975163
steps=8000,9000,12000 - if policy=steps - at these numbers of iterations the learning rate will be multiplied by scales factor
scales=.1,.1,.1 - if policy=steps - f.e. if steps=8000,9000,12000, scales=.1,.1,.1 and the current iteration number is 10000 then current_learning_rate = learning_rate * scales[0] * scales[1] = 0.001 * 0.1 * 0.1 = 0.00001
label_smooth_eps=0.1 - use label smoothing for training Classifier
For training Recurrent networks:
Object Detection/Tracking on Video - if [conv-lstm] or [crnn] layers are used in addition to [connected] and [convolutional] layers
Text generation - if [lstm] or [rnn] layers are used in addition to [connected] layers
track=1 - if set to 1 then training will be performed in recurrent style for image sequences
time_steps=16 - training will be performed for a random image sequence that contains 16 images from the train.txt file
for [convolutional]-layers: mini_batch = time_steps*batch/subdivisions
for [conv_lstm]-recurrent-layers: mini_batch = batch/subdivisions and sequence=16
augment_speed=3 - if set to 3 then each 1st, 2nd or 3rd image can be used randomly, i.e. 16 images with indexes 0, 1, 2, ... 15 or 110, 113, 116, ... 155 can be used from the train.txt file
sequential_subdivisions=8 - lower value increases the sequence of images, so if time_steps=16 batch=16 sequential_subdivisions=8, then will be loaded time_steps*batch/sequential_subdivisions = 16*16/8 = 32 sequential images with the same data-augmentation, so the model will be trained for sequence of 32 video-frames
seq_scales=0.5, 0.5 - increases the sequence of images at certain steps, i.e. the coefficients by which the original sequential_subdivisions value will be multiplied (and batch will be divided, so the weights will be updated more rarely) at the corresponding steps if policy=steps or policy=sgdr is used
CFG-Parameters in the different layers
Image processing [N x C x H x W]:
[convolutional] - convolutional layer
batch_normalize=1 - if 1, batch-normalization will be used; if 0, it will not (0 by default)
filters=64 - number of kernel-filters (1 by default)
size=3 - kernel_size of filter (1 by default)
groups = 32 - number of groups for grouped-convolutional (depth-wise) (1 by default)
stride=1 - stride (offset step) of kernel filter (1 by default)
padding=1 - size of padding (0 by default)
pad=1 - if 1, padding = size/2 will be used; if 0, the padding= parameter will be used (0 by default)
dilation=1 - size of dilation (1 by default)
activation=leaky - activation function after convolution: logistic (by default), loggy, relu, elu, selu, relie, plse, hardtan, lhtan, linear, ramp, leaky, tanh, stair, relu6, swish, mish
[activation] - separate activation layer
activation=leaky - activation function: linear (by default), loggy, relu, elu, selu, relie, plse, hardtan, lhtan, ramp, leaky, tanh, stair
[batchnorm] - separate Batch-normalization layer
[maxpool] - max-pooling layer (the maximum value)
size=2 - size of max-pooling kernel
stride=2 - stride (offset step) of max-pooling kernel
[avgpool] - average pooling layer input W x H x C -> output 1 x 1 x C
[shortcut] - residual connection (ResNet)
from=-3,-5 - relative layer numbers; performs element-wise addition of several layers: the previous layer and the layers specified in the from= parameter
weights_type=per_feature - weights will be used for the shortcut: y[i] = w1*layer1[i] + w2*layer2[i] ...
per_feature - 1 weight per layer/feature
per_channel - 1 weight per channel
none - weights will not be used (by default)
weights_normalization=softmax - weight normalization will be used
softmax - softmax normalization
relu - relu normalization
none - without weights normalization - unbound weights (by default)
activation=linear - activation function after shortcut/residual connection (linear by default)
[upsample] - upsample layer (increase W x H resolution of input by duplicating elements)
stride=2 - factor for increasing both Width and Height (new_w = w*stride, new_h = h*stride)
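The duplication is plain nearest-neighbor upsampling; a one-call numpy sketch:

import numpy as np

def upsample(x, stride=2):
    # [upsample]: repeat every element stride times along H and W
    return x.repeat(stride, axis=0).repeat(stride, axis=1)

x = np.arange(4).reshape(2, 2)
print(upsample(x))   # 2x2 -> 4x4, each value duplicated into a 2x2 patch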
[scale_channels] - scales channels (SE: squeeze-and-excitation blocks) or (ASFF: adaptively spatial feature fusion) - it multiplies elements of one layer by elements of another layer
from=-3 - relative layer number, performs multiplication of all elements of channel N from layer -3, by one element of channel N from the previous layer -1 (i.e. for(int i=0; i < b*c*h*w; ++i) output[i] = from_layer[i] * previous_layer[i/(w*h)]; )
scale_wh=0 - SE-layer (previous layer 1x1xC), scale_wh=1 - ASFF-layer (previous layer WxHx1)
activation=linear - activation function after scale_channels-layer (linear by default)
[sam] - Spatial Attention Module (SAM) - it multiplies elements of one layer by elements of another layer
from=-3 - relative layer number (this and previous layers should be the same size WxHxC)
[reorg3d] - reorg layer (resize W x H x C)
stride=2 - if reverse=0 the input will be resized to W/2 x H/2 x C*4, if reverse=1 then to W*2 x H*2 x C/4 (1 by default)
reverse=1 - if 0 (by default) then decrease WxH, if 1 then increase WxH
[reorg] - OLD reorg layer from Yolo v2 - has incorrect logic (resize W x H x C) - deprecated
stride=2 - if reverse=0 the input will be resized to W/2 x H/2 x C*4, if reverse=1 then to W*2 x H*2 x C/4 (1 by default)
reverse=1 - if 0 (by default) then decrease WxH, if 1 then increase WxH
[route] - concatenation layer, Concat for several input-layers, or Identity for one input-layer
layers = -1, 61 - layers that will be concatenated, output: W x H x C_layer_1 + C_layer_2
if index < 0, then it is relative layer number (-1 means previous layer)
if index >= 0, then it is absolute layer number
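A tiny sketch of that index-resolution rule (hypothetical helper, not darknet code):

def resolve_route(layers, current_index):
    # Negative entries are relative to the current layer,
    # non-negative entries are already absolute layer numbers
    return [current_index + i if i < 0 else i for i in layers]

# Layer 34 of the YOLOv4-tiny table above: "route -1, 23"
print(resolve_route([-1, 23], 34))   # [33, 23]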
[yolo] - detection layer for Yolo v3 / v4
mask = 3,4,5 - indexes of anchors which are used in this [yolo]-layer
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 - initial sizes of bounding boxes that will be adjusted
num=9 - total number of anchors
classes=80 - number of classes of objects which can be detected
ignore_thresh = .7 - keeps duplicated detections if IoU(detect, truth) > ignore_thresh, which will be fused during NMS (is used for training only)
truth_thresh = 1 - adjusts duplicated detections if IoU(detect, truth) > truth_thresh, which will be fused during NMS (is used for training only)
jitter=.3 - randomly crops and resizes images with changing aspect ratio from x(1 - 2*jitter) to x(1 + 2*jitter) (data augmentation parameter is used only from the last layer)
random=1 - randomly resizes the network every 10 iterations from 1/1.4 to 1.4 (data augmentation parameter, used only from the last layer)
resize=1.5 - randomly resizes image in range: 1/1.5 - 1.5x
max=200 - maximum number of objects per image during training
counters_per_class=100,10,1000 - number of objects per class in Training dataset to eliminate the imbalance
label_smooth_eps=0.1 - label smoothing
scale_x_y=1.05 - eliminate grid sensitivity
iou_thresh=0.2 - use many anchors per object if IoU(Obj, Anchor) > 0.2
iou_loss=mse - IoU-loss: mse, giou, diou, ciou
iou_normalizer=0.07 - normalizer for delta-IoU
cls_normalizer=1.0 - normalizer for delta-Objectness
max_delta=5 - limits delta for each entry
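Since both cfgs in this post set iou_loss=ciou, a compact Python sketch of the CIoU loss may be useful; it follows the published CIoU formula (our own illustration, not darknet's source code):

import math

def ciou_loss(box1, box2):
    # Boxes are (cx, cy, w, h). CIoU = IoU - rho^2/c^2 - alpha*v, where
    # rho^2 is the squared center distance, c^2 the squared diagonal of the
    # smallest enclosing box, and v an aspect-ratio mismatch term.
    x1, y1, w1, h1 = box1
    x2, y2, w2, h2 = box2
    l1, r1, t1, b1 = x1 - w1/2, x1 + w1/2, y1 - h1/2, y1 + h1/2
    l2, r2, t2, b2 = x2 - w2/2, x2 + w2/2, y2 - h2/2, y2 + h2/2
    iw = max(0.0, min(r1, r2) - max(l1, l2))
    ih = max(0.0, min(b1, b2) - max(t1, t2))
    inter = iw * ih
    union = w1 * h1 + w2 * h2 - inter
    iou = inter / union
    rho2 = (x1 - x2) ** 2 + (y1 - y2) ** 2
    c2 = (max(r1, r2) - min(l1, l2)) ** 2 + (max(b1, b2) - min(t1, t2)) ** 2
    v = (4 / math.pi ** 2) * (math.atan(w2 / h2) - math.atan(w1 / h1)) ** 2
    alpha = v / (1 - iou + v + 1e-9)
    return 1 - (iou - rho2 / c2 - alpha * v)

print(ciou_loss((50, 50, 20, 20), (55, 55, 20, 20)))   # ~0.65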
[crnn] - convolutional RNN-layer (recurrent)
batch_normalize=1 - if 1, batch-normalization will be used; if 0, it will not (0 by default)
size=1 - convolutional kernel_size of filter (1 by default)
pad=0 - if 1, padding = size/2 will be used; if 0, the padding= parameter will be used (0 by default)
output = 1024 - number of kernel-filters in one output convolutional layer (1 by default)
hidden=1024 - number of kernel-filters in two (input and hidden) convolutional layers (1 by default)
activation=leaky - activation function for each of 3 convolutional-layers in the [crnn]-layer (logistic by default)
[conv_lstm] - convolutional LSTM-layer (recurrent)
batch_normalize=1 - if 1, batch-normalization will be used; if 0, it will not (0 by default)
size=3 - convolutional kernel_size of filter (1 by default)
padding=1 - convolutional size of padding (0 by default)
pad=1 - if 1, padding = size/2 will be used; if 0, the padding= parameter will be used (0 by default)
stride=1 - convolutional stride (offset step) of kernel filter (1 by default)
dilation=1 - convolutional size of dilation (1 by default)
output=256 - number of kernel-filters in each of 8 or 11 convolutional layers (1 by default)
groups=4 - number of groups for grouped-convolutional (depth-wise) (1 by default)
state_constrain=512 - constrains LSTM-state values [-512; +512] after each inference (time_steps*32 by default)
peephole=0 - if 1 then Peephole will be used (3 additional conv-layers), if 0 it will not (1 by default)
bottleneck=0 - if 1 then a reduced optimal version of the conv-lstm layer will be used
activation=leaky - activation function for each of 8 or 11 convolutional-layers in the [conv_lstm]-layer (linear by default)
lstm_activation=tanh - activation for G (gate: g = tanh(wg + ug)) and C (memory cell: h = o * tanh(c))
Detailed-architecture-of-the-peephole-LSTM
Free-form data processing [Inputs]:
[connected] - fully connected layer
output=256 - number of outputs (1 by default), so number of connections is equal to inputs*outputs
activation=leaky - activation after layer (logistic by default)
[dropout] - dropout layer
probability=0.5 - dropout probability - what part of inputs will be zeroed (0.5 = 50% by default)
dropblock=1 - use as DropBlock
dropblock_size_abs=7 - size of DropBlock in pixels 7x7
[softmax] - SoftMax CE (cross entropy) layer - Categorical cross-entropy for multi-class classification
[contrastive] - Contrastive loss layer for Supervised and Unsupervised learning (should be set [net] contrastive=1 and optionally [net] unsupervised=1)
classes=1000 - number of classes
temperature=1.0 - temperature
[cost] - cost layer, calculates (linear) Delta and (squared) Loss
type=sse - cost type: sse (L2), masked, smooth (smooth-L1) (SSE by default)
[rnn] - fully connected RNN-layer (recurrent)
batch_normalize=1 - if 1, batch-normalization will be used; if 0, it will not (0 by default)
output = 1024 - number of outputs in one connected layer (1 by default)
hidden=1024 - number of outputs in two (input and hidden) connected layers (1 by default)
activation=leaky - activation after layer (logistic by default)
[lstm] - fully connected LSTM-layer (recurrent)
batch_normalize=1 - if 1, batch-normalization will be used; if 0, it will not (0 by default)
output = 1024 - number of outputs in all connected layers (1 by default)
[gru] - fully connected GRU-layer (recurrent)
batch_normalize=1 - if 1, batch-normalization will be used; if 0, it will not (0 by default)
output = 1024 - number of outputs in all connected layers (1 by default)
Reference links
https://stackoverflow.com/questions/50390836/understanding-darknets-yolo-cfg-config-files
https://github.com/AlexeyAB/darknet/wiki/CFG-Parameters-in-the-%5Bnet%5D-section
https://github.com/AlexeyAB/darknet/wiki/CFG-Parameters-in-the-different-layers
https://eehoeskrap.tistory.com/370
https://blog-st.tistory.com/entry/Darknet-Layer-파라미터-정리
https://daewonyoon.tistory.com/299
https://mickael-k.tistory.com/27