共享內存可以使多個進程共享某段內存,由於不需要進程間數據複製,所以是速度最快的IPC。
多個進程訪問共享內存時需要同步機制,如進程A往共享內存中寫數據時,進程B不能使用共享內存;通常採用信號量同步多進程訪問共享內存。
共享內存實現主要有以下幾點:
1.分配物理內存
2.將物理內存映射到進程的地址空間;通過修改進程的頁表,使進程可以用虛擬地址直接訪問物理內存
3.進程不再使用共享內存時,取消物理內存在進程地址空間的映射
tmpfs文件系統將所有文件存儲在內存(而非硬盤等介質)中;tmpfs將所有的東西存放在內核緩存中,可以根據文件系統中所容納的文件自動增長和收縮,也可以將不使用的頁swap出去。
linux共享內存的實現基於tmpfs文件系統及mmap文件映射;通過在tmpfs中創建文件來獲取物理內存,將文件映射到進程地址空間後可以使用虛擬地址訪問共享內存。
I.數據結構
include/linux/shm.h
86 struct shmid_kernel /* private to the kernel: bookkeeping for one shared memory segment */
87 {
88 struct kern_ipc_perm shm_perm; /* operation perms; also holds key, id and mode (SHM_DEST is OR'ed into mode) */
89 struct file * shm_file; /* backing file from shmem_file_setup() or hugetlb_file_setup() */
90 unsigned long shm_nattch; /* no. of current attaches */
91 unsigned long shm_segsz; /* size of segment (bytes), as requested at creation */
92 time_t shm_atim; /* last attach time (0 at creation) */
93 time_t shm_dtim; /* last detach time (0 at creation; set in shm_close) */
94 time_t shm_ctim; /* last change time (creation time initially) */
95 pid_t shm_cprid; /* pid of creator (task_tgid_vnr(current) in newseg) */
96 pid_t shm_lprid; /* pid of last operator (0 at creation; set in shm_close) */
97 struct user_struct *mlock_user; /* NULL at creation; passed to shmem_lock()/user_shm_unlock() on destroy */
98 };
shmid_kernel用於存放共享內存信息
注:
shm_file存放tmpfs中創建的內存文件,用於分配物理內存;用tmpfs的文件映射功能直接將共享內存映射到進程地址空間
ipc/shm.c
/* Private data of the per-attach shm file; do_shmat() stores it in file->private_data. */
48 struct shm_file_data {
49 int id; /* IPC id of the segment (shp->shm_perm.id) */
50 struct ipc_namespace *ns; /* IPC namespace of the attaching task, obtained via get_ipc_ns() */
51 struct file *file; /* the segment's backing file (shp->shm_file) */
52 const struct vm_operations_struct *vm_ops; /* vm_ops set by the backing file's mmap; saved by shm_mmap(), NULL until then */
53 };
shm_file_data主要用於保存tmpfs文件內存映射的虛擬內存操作集vm_ops,以便在其基礎上擴展vm_ops:當某進程已經調用IPC_RMID時,其它所有進程detach之後也能正常釋放共享內存IPC資源
注:
共享內存主要涉及兩種文件,tmpfs文件與shm文件;一個共享內存對應一個tmpfs文件,而每次attach(do_shmat)都會新分配一個shm文件,即有多少次attach就有多少個shm文件。
爲什麼要在tmpfs文件上層再加shm文件呢?直接將tmpfs文件映射到多個進程地址空間不就能實現內存共享了嗎?
的確,可以將tmpfs文件映射到多個進程地址空間,並能實現內存共享。但是有一種特殊情況:當多個進程attach到共享內存,此時某個進程刪除共享內存,爲了保證其他進程能繼續正常使用共享內存,暫時不能刪除共享內存的IPC資源;而所有進程都detach之後,僅靠tmpfs文件自身的munmap處理又無法刪除IPC資源。
所以在tmpfs文件上層添加shm文件,用於擴展tmpfs文件映射的vm_ops,來實現所有進程detach後刪除共享內存的IPC資源。
詳細代碼參見:do_shm_rmid、shm_close
II.共享內存創建
326 /**
327 * newseg - Create a new shared memory segment
328 * @ns: namespace
329 * @params: ptr to the structure that contains key, size and shmflg
330 *
331 * Called with shm_ids.rw_mutex held as a writer. Returns the new segment's IPC id on success; otherwise -EINVAL, -ENOSPC, -ENOMEM or the error reported by a failing helper.
332 */
333
334 static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
335 {
336 key_t key = params->key;
337 int shmflg = params->flg;
338 size_t size = params->u.size;
339 int error;
340 struct shmid_kernel *shp;
341 int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; /* segment size rounded up to whole pages */
342 struct file * file;
343 char name[13]; /* room for "SYSV" + 8 hex digits + NUL */
344 int id;
345 int acctflag = 0;
346
347 if (size < SHMMIN || size > ns->shm_ctlmax) /* per-segment size limits */
348 return -EINVAL;
349
350 if (ns->shm_tot + numpages > ns->shm_ctlall) /* namespace-wide limit on total shm pages */
351 return -ENOSPC;
352
353 shp = ipc_rcu_alloc(sizeof(*shp));
354 if (!shp)
355 return -ENOMEM;
356
357 shp->shm_perm.key = key;
358 shp->shm_perm.mode = (shmflg & S_IRWXUGO); /* keep only the permission bits of shmflg */
359 shp->mlock_user = NULL;
360
361 shp->shm_perm.security = NULL;
362 error = security_shm_alloc(shp);
363 if (error) {
364 ipc_rcu_putref(shp); /* nothing else acquired yet: just drop shp */
365 return error;
366 }
367
368 sprintf (name, "SYSV%08x", key); /* backing file is named after the key */
369 if (shmflg & SHM_HUGETLB) {
370 /* hugetlb_file_setup applies strict accounting */
371 if (shmflg & SHM_NORESERVE)
372 acctflag = VM_NORESERVE;
373 file = hugetlb_file_setup(name, size, acctflag,
374 &shp->mlock_user, HUGETLB_SHMFS_INODE);
375 } else {
376 /*
377 * Do not allow no accounting for OVERCOMMIT_NEVER, even
378 * if it's asked for.
379 */
380 if ((shmflg & SHM_NORESERVE) &&
381 sysctl_overcommit_memory != OVERCOMMIT_NEVER)
382 acctflag = VM_NORESERVE;
383 file = shmem_file_setup(name, size, acctflag);
384 }
385 error = PTR_ERR(file); /* only meaningful if IS_ERR(file); overwritten on the success path */
386 if (IS_ERR(file))
387 goto no_file;
388
389 id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); /* register the segment; a negative result is an error code */
390 if (id < 0) {
391 error = id;
392 goto no_id;
393 }
394
395 shp->shm_cprid = task_tgid_vnr(current);
396 shp->shm_lprid = 0;
397 shp->shm_atim = shp->shm_dtim = 0;
398 shp->shm_ctim = get_seconds();
399 shp->shm_segsz = size;
400 shp->shm_nattch = 0;
401 shp->shm_file = file;
402 /*
403 * shmid gets reported as "inode#" in /proc/pid/maps.
404 * proc-ps tools use this. Changing this will break them.
405 */
406 file->f_dentry->d_inode->i_ino = shp->shm_perm.id;
407
408 ns->shm_tot += numpages; /* account the new segment against the namespace total */
409 error = shp->shm_perm.id; /* success: the return value is the IPC id */
410 shm_unlock(shp); /* presumably ipc_addid() returned with the entry locked - TODO confirm */
411 return error;
412
413 no_id: /* unwind: file was created but the id could not be allocated */
414 if (is_file_hugepages(file) && shp->mlock_user)
415 user_shm_unlock(size, shp->mlock_user);
416 fput(file);
417 no_file: /* unwind: release the security blob and shp itself */
418 security_shm_free(shp);
419 ipc_rcu_putref(shp);
420 return error;
421 }
1.參數及共享內存系統限制檢查
2.分配共享內存管理結構shmid_kernel
3.在tmpfs中創建共享內存文件,以獲取物理內存
4.將shmid_kernel添加到共享內存基數樹中,並獲得基數樹id
5.初始化shmid_kernel結構
6.返回共享內存IPC id
III.共享內存映射到進程地址空間
i.do_shmat
/*
 * do_shmat - map shared memory segment @shmid into current->mm.
 * @shmid:   IPC id of the segment; negative values fail with -EINVAL.
 * @shmaddr: requested attach address; NULL (0) maps without MAP_FIXED.
 * @shmflg:  SHM_RND, SHM_REMAP, SHM_RDONLY and SHM_EXEC are examined here.
 * @raddr:   kernel pointer that receives do_mmap()'s result; it is written
 *           even when do_mmap() returned an error value.
 * Returns 0 on success or an error code.
 */
806 /*
807 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
808 *
809 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
810 * "raddr" thing points to kernel space, and there has to be a wrapper around
811 * this.
812 */
813 long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
814 {
815 struct shmid_kernel *shp;
816 unsigned long addr;
817 unsigned long size;
818 struct file * file;
819 int err;
820 unsigned long flags;
821 unsigned long prot;
822 int acc_mode;
823 unsigned long user_addr;
824 struct ipc_namespace *ns;
825 struct shm_file_data *sfd;
826 struct path path;
827 fmode_t f_mode;
828
829 err = -EINVAL;
830 if (shmid < 0)
831 goto out;
832 else if ((addr = (ulong)shmaddr)) {
833 if (addr & (SHMLBA-1)) { /* address not SHMLBA-aligned */
834 if (shmflg & SHM_RND)
835 addr &= ~(SHMLBA-1); /* round down */
836 else /* without SHM_RND: reject, unless page alignment is acceptable on this arch */
837 #ifndef __ARCH_FORCE_SHMLBA
838 if (addr & ~PAGE_MASK)
839 #endif
840 goto out;
841 }
842 flags = MAP_SHARED | MAP_FIXED; /* caller supplied an address: map exactly there */
843 } else {
844 if ((shmflg & SHM_REMAP)) /* SHM_REMAP is rejected when no address is given */
845 goto out;
846
847 flags = MAP_SHARED;
848 }
849
850 if (shmflg & SHM_RDONLY) {
851 prot = PROT_READ;
852 acc_mode = S_IRUGO;
853 f_mode = FMODE_READ;
854 } else {
855 prot = PROT_READ | PROT_WRITE;
856 acc_mode = S_IRUGO | S_IWUGO;
857 f_mode = FMODE_READ | FMODE_WRITE;
858 }
859 if (shmflg & SHM_EXEC) {
860 prot |= PROT_EXEC;
861 acc_mode |= S_IXUGO;
862 }
863
864 /*
865 * We cannot rely on the fs check since SYSV IPC does have an
866 * additional creator id...
867 */
868 ns = current->nsproxy->ipc_ns;
869 shp = shm_lock_check(ns, shmid); /* on success shp is locked; released via shm_unlock() below */
870 if (IS_ERR(shp)) {
871 err = PTR_ERR(shp);
872 goto out;
873 }
874
875 err = -EACCES;
876 if (ipcperms(&shp->shm_perm, acc_mode))
877 goto out_unlock;
878
879 err = security_shm_shmat(shp, shmaddr, shmflg);
880 if (err)
881 goto out_unlock;
882
883 path.dentry = dget(shp->shm_file->f_path.dentry); /* reference dropped at out_put_dentry on failure */
884 path.mnt = shp->shm_file->f_path.mnt;
885 shp->shm_nattch++; /* temporary attach count; dropped again at out_nattch */
886 size = i_size_read(path.dentry->d_inode); /* mapping length is the backing file's size */
887 shm_unlock(shp);
888
889 err = -ENOMEM;
890 sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
891 if (!sfd)
892 goto out_put_dentry;
893
894 file = alloc_file(path.mnt, path.dentry, f_mode,
895 is_file_hugepages(shp->shm_file) ?
896 &shm_file_operations_huge :
897 &shm_file_operations); /* new per-attach shm file on top of the backing file's dentry */
898 if (!file)
899 goto out_free;
900 ima_counts_get(file);
901
902 file->private_data = sfd;
903 file->f_mapping = shp->shm_file->f_mapping; /* share the backing file's address_space */
904 sfd->id = shp->shm_perm.id;
905 sfd->ns = get_ipc_ns(ns);
906 sfd->file = shp->shm_file;
907 sfd->vm_ops = NULL; /* filled in by shm_mmap() */
908
909 down_write(&current->mm->mmap_sem);
910 if (addr && !(shmflg & SHM_REMAP)) {
911 err = -EINVAL;
912 if (find_vma_intersection(current->mm, addr, addr + size)) /* range already mapped */
913 goto invalid;
914 /*
915 * If shm segment goes below stack, make sure there is some
916 * space left for the stack to grow (at least 4 pages).
917 */
918 if (addr < current->mm->start_stack &&
919 addr > current->mm->start_stack - size - PAGE_SIZE * 5)
920 goto invalid;
921 }
922
923 user_addr = do_mmap (file, addr, size, prot, flags, 0);
924 *raddr = user_addr; /* stored unconditionally, even if it is an error value */
925 err = 0;
926 if (IS_ERR_VALUE(user_addr))
927 err = (long)user_addr;
928 invalid:
929 up_write(&current->mm->mmap_sem);
930
931 fput(file); /* drop this function's reference to the shm file */
932
933 out_nattch: /* undo the temporary shm_nattch++ taken before shm_unlock() above */
934 down_write(&shm_ids(ns).rw_mutex);
935 shp = shm_lock(ns, shmid);
936 BUG_ON(IS_ERR(shp));
937 shp->shm_nattch--;
938 if(shp->shm_nattch == 0 &&
939 shp->shm_perm.mode & SHM_DEST) /* no attaches left and segment marked for destruction */
940 shm_destroy(ns, shp);
941 else
942 shm_unlock(shp);
943 up_write(&shm_ids(ns).rw_mutex);
944
945 out:
946 return err;
947
948 out_unlock:
949 shm_unlock(shp);
950 goto out;
951
952 out_free:
953 kfree(sfd);
954 out_put_dentry:
955 dput(path.dentry); /* pairs with the dget() above */
956 goto out_nattch;
957 }
1.對參數進行合法性檢查,並根據參數計算內存映射標識和保護方式
2.attach權限檢驗
3.attach計數器shm_nattch加1
4.分配shm文件,並初始化私有數據shm_file_data
5.將shm文件映射到進程地址空間(do_mmap實現是將tmpfs文件映射到進程地址空間)
6.attach計數器shm_nattch減1,抵消第3步的加1;因爲在shm文件映射時shm_mmap->shm_open已經將shm_nattch加1
ii.shm_mmap
do_mmap會回調shm文件的shm_mmap函數:
do_mmap->do_mmap_pgoff->mmap_region->file->f_op->mmap(即shm_mmap)
/*
 * mmap handler of the per-attach shm file: delegate the mapping to the
 * backing file, then interpose shm_vm_ops on the resulting vma.
 * Returns the backing file's mmap result (0 on success).
 */
249 static int shm_mmap(struct file * file, struct vm_area_struct * vma)
250 {
251 struct shm_file_data *sfd = shm_file_data(file);
252 int ret;
253
254 ret = sfd->file->f_op->mmap(sfd->file, vma); /* let the backing file set up the vma */
255 if (ret != 0)
256 return ret;
257 sfd->vm_ops = vma->vm_ops; /* remember the backing file's vm_ops so shm_fault() can forward to them */
258 #ifdef CONFIG_MMU
259 BUG_ON(!sfd->vm_ops->fault); /* shm_fault() calls this handler unconditionally */
260 #endif
261 vma->vm_ops = &shm_vm_ops; /* from now on the vma is handled by the shm wrappers */
262 shm_open(vma);
263
264 return ret;
265 }
1.將tmpfs文件的vma操作切換成shm文件的vma操作shm_vm_ops,用於進程munmap時調用shm_close,來實現所有進程detach且之前有IPC_RMID時刪除共享內存IPC資源
2.可以看出do_mmap最後會調用tmpfs文件的mmap方法,將tmpfs文件映射到進程地址空間
iii.shm_fault
將tmpfs文件映射到進程地址空間後,如果是第一次訪問會產生缺頁異常;缺頁異常處理中會將文件裝入內存中並添加相應的頁表項,以便使用虛擬地址訪問。
do_page_fault->handle_mm_fault->handle_pte_fault->do_linear_fault->__do_fault->shm_fault
/*
 * Fault handler for shm vmas: forwards to the fault handler that shm_mmap()
 * saved in sfd->vm_ops and returns its result unchanged.
 */
214 static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
215 {
216 struct file *file = vma->vm_file; /* the per-attach shm file backing this vma */
217 struct shm_file_data *sfd = shm_file_data(file);
218
219 return sfd->vm_ops->fault(vma, vmf);
220 }
shm文件的異常處理shm_fault實際調用的是tmpfs文件的異常處理,來裝入tmpfs文件的內容。
IV.共享內存從進程地址空間中刪除
i.shmdt
當進程不想再訪問共享內存時,會將其從地址空間中移除。
971 /*
972 * detach and kill segment if marked destroyed.
973 * The work is done in shm_close.
974 */
975 SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
976 {
977 struct mm_struct *mm = current->mm;
978 struct vm_area_struct *vma;
979 unsigned long addr = (unsigned long)shmaddr;
980 int retval = -EINVAL; /* stays -EINVAL unless a matching shm vma is found and unmapped */
981 #ifdef CONFIG_MMU
982 loff_t size = 0;
983 struct vm_area_struct *next;
984 #endif
985
986 if (addr & ~PAGE_MASK) /* address must be page aligned */
987 return retval;
988
989 down_write(&mm->mmap_sem);
990
991 /*
992 * This function tries to be smart and unmap shm segments that
993 * were modified by partial mlock or munmap calls:
994 * - It first determines the size of the shm segment that should be
995 * unmapped: It searches for a vma that is backed by shm and that
996 * started at address shmaddr. It records its size and then unmaps
997 * it.
998 * - Then it unmaps all shm vmas that started at shmaddr and that
999 * are within the initially determined size.
1000 * Errors from do_munmap are ignored: the function only fails if
1001 * it's called with invalid parameters or if it's called to unmap
1002 * a part of a vma. Both calls in this function are for full vmas,
1003 * the parameters are directly copied from the vma itself and always
1004 * valid - therefore do_munmap cannot fail. (famous last words?)
1005 */
1006 /*
1007 * If it had been mremap()'d, the starting address would not
1008 * match the usual checks anyway. So assume all vma's are
1009 * above the starting address given.
1010 */
1011 vma = find_vma(mm, addr);
1012
1013 #ifdef CONFIG_MMU
1014 while (vma) {
1015 next = vma->vm_next; /* fetched before do_munmap(), which may remove vma */
1016
1017 /*
1018 * Check if the starting address would match, i.e. it's
1019 * a fragment created by mprotect() and/or munmap(), or
1020 * otherwise it starts at this address with no hassles.
1021 */
1022 if ((vma->vm_ops == &shm_vm_ops) &&
1023 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { /* vma's page distance from addr equals its file page offset */
1024
1025
1026 size = vma->vm_file->f_path.dentry->d_inode->i_size;
1027 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1028 /*
1029 * We discovered the size of the shm segment, so
1030 * break out of here and fall through to the next
1031 * loop that uses the size information to stop
1032 * searching for matching vma's.
1033 */
1034 retval = 0;
1035 vma = next;
1036 break;
1037 }
1038 vma = next;
1039 }
1040
1041 /*
1042 * We need look no further than the maximum address a fragment
1043 * could possibly have landed at. Also cast things to loff_t to
1044 * prevent overflows and make comparisons vs. equal-width types.
1045 */
1046 size = PAGE_ALIGN(size);
1047 while (vma && (loff_t)(vma->vm_end - addr) <= size) {
1048 next = vma->vm_next;
1049
1050 /* finding a matching vma now does not alter retval */
1051 if ((vma->vm_ops == &shm_vm_ops) &&
1052 (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)
1053
1054 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); /* body of the if above */
1055 vma = next;
1056 }
1057
1058 #else /* CONFIG_MMU */
1059 /* under NOMMU conditions, the exact address to be destroyed must be
1060 * given */
1061 retval = -EINVAL;
1062 if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { /* NOTE(review): vma is dereferenced without a NULL check here, unlike the while (vma) loops in the MMU path - confirm find_vma() cannot return NULL */
1063 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1064 retval = 0;
1065 }
1066
1067 #endif
1068
1069 up_write(&mm->mmap_sem);
1070 return retval;
1071 }
1.查找共享內存映射到進程地址空間的虛擬地址內存段vma
2.移除共享內存在進程地址空間的映射
ii.shm_close
186 /*
187 * remove the attach descriptor vma.
188 * free memory for segment if it is marked destroyed.
189 * The descriptor has already been removed from the current->mm->mmap list
190 * and will later be kfree()d.
191 */
192 static void shm_close(struct vm_area_struct *vma)
193 {
194 struct file * file = vma->vm_file;
195 struct shm_file_data *sfd = shm_file_data(file);
196 struct shmid_kernel *shp;
197 struct ipc_namespace *ns = sfd->ns; /* namespace recorded at attach time */
198
199 down_write(&shm_ids(ns).rw_mutex); /* held as writer because shm_destroy() may be called below */
200 /* remove from the list of attaches of the shm segment */
201 shp = shm_lock(ns, sfd->id);
202 BUG_ON(IS_ERR(shp)); /* the segment must still exist while it has attaches */
203 shp->shm_lprid = task_tgid_vnr(current); /* record the detaching task as last operator */
204 shp->shm_dtim = get_seconds(); /* record detach time */
205 shp->shm_nattch--;
206 if(shp->shm_nattch == 0 &&
207 shp->shm_perm.mode & SHM_DEST) /* last detach of a segment already marked by do_shm_rmid() */
208 shm_destroy(ns, shp);
209 else
210 shm_unlock(shp);
211 up_write(&shm_ids(ns).rw_mutex);
212 }
1.attach計數器shm_nattch減1
2.如果共享內存沒有attach的進程,且已經有進程調用過shmctl(...,IPC_RMID,...),則銷燬共享內存IPC資源
V.共享內存移除
i.do_shm_rmid
當不再使用共享內存時,會移除共享內存;通過IPC_RMID命令調用shmctl來實現共享內存的移除
82 /*
83 * Called with shm_ids.rw_mutex (writer) and the shp structure locked.
84 * Only shm_ids.rw_mutex remains locked on exit.
85 */
86 static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
87 {
88 struct shmid_kernel *shp;
89 shp = container_of(ipcp, struct shmid_kernel, shm_perm); /* ipcp is embedded in shmid_kernel as shm_perm */
90
91 if (shp->shm_nattch){ /* still attached somewhere: defer destruction to the last shm_close() */
92 shp->shm_perm.mode |= SHM_DEST;
93 /* Do not find it any more */
94 shp->shm_perm.key = IPC_PRIVATE;
95 shm_unlock(shp);
96 } else /* no attaches: destroy right away; shm_destroy() unlocks shp */
97 shm_destroy(ns, shp);
98 }
1.如果還有進程attach到共享內存,則設置共享內存的銷燬標識SHM_DEST,用於在所有進程detach時銷燬共享內存IPC資源,見shm_close;並將key置爲IPC_PRIVATE,使其不能再通過key獲取
2.如果沒有進程attach到共享內存,銷燬共享內存IPC資源
ii.shm_destroy
162 /*
163 * shm_destroy - free the struct shmid_kernel
164 *
165 * @ns: namespace
166 * @shp: struct to free
167 *
168 * It has to be called with shp and shm_ids.rw_mutex (writer) locked,
169 * but returns with shp unlocked and freed.
170 */
171 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
172 {
173 ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; /* give back the page count that newseg() added */
174 shm_rmid(ns, shp); /* presumably removes the id from shm_ids(ns) - helper not shown here */
175 shm_unlock(shp);
176 if (!is_file_hugepages(shp->shm_file))
177 shmem_lock(shp->shm_file, 0, shp->mlock_user); /* second argument 0: presumably "unlock" - TODO confirm */
178 else if (shp->mlock_user)
179 user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size,
180 shp->mlock_user);
181 fput (shp->shm_file); /* drop the segment's reference to its backing file */
182 security_shm_free(shp);
183 ipc_rcu_putref(shp); /* release the shmid_kernel obtained from ipc_rcu_alloc() */
184 }
1.將共享內存IPC從共享內存基數樹中移除
2.釋放tmpfs文件使用的file結構內存
3.釋放shmid_kernel內存