linux SysV IPC shm共享內存實現

共享內存可以使多個進程共享某段內存,由於不需要進程間數據複製,所以是速度最快的IPC。
多個進程訪問共享內存時需要同步機制,如進程A往共享內存中寫數據時,進程B不能使用共享內存;通常採用信號量同步多進程訪問共享內存。

共享內存實現主要有以下幾點:
1.分配物理內存
2.將物理內存映射到進程的地址空間;通過修改進程的頁表,使進程可以用虛擬地址直接訪問物理內存
3.進程不再使用共享內存時,取消物理內存在進程地址空間的映射

tmpfs文件系統將所有文件存儲在內存(而非硬盤等介質)中;tmpfs將所有的東西存放在內核緩存中,可以根據文件系統中所容納的文件自動增長和收縮,也可以將不使用的頁swap出去。
linux共享內存的實現基於tmpfs文件系統及mmap文件映射;通過在tmpfs中創建文件來獲取物理內存,將文件映射到進程地址空間後可以使用虛擬地址訪問共享內存。


I.數據結構
include/linux/shm.h

 86 struct shmid_kernel /* private to the kernel; bookkeeping for one SysV shm segment */
 87 {
 88         struct kern_ipc_perm    shm_perm;       /* operation perms */
 89         struct file *           shm_file;       /* tmpfs file */
 90         unsigned long           shm_nattch;     /* no. of current attaches */
 91         unsigned long           shm_segsz;      /* size of segment (bytes) */
 92         time_t                  shm_atim;       /* last attach time */
 93         time_t                  shm_dtim;       /* last detach time */
 94         time_t                  shm_ctim;       /* last change time */
 95         pid_t                   shm_cprid;      /* pid of creator */
 96         pid_t                   shm_lprid;      /* pid of last operator */
 97         struct user_struct      *mlock_user;    /* NULL at creation; handed to shmem_lock()/user_shm_unlock() on teardown */
 98 };

shmid_kernel用於存放共享內存信息
注:
  shm_file存放tmpfs中創建的內存文件,用於分配物理內存;用tmpfs的文件映射功能直接將共享內存映射到進程地址空間

 

ipc/shm.c

  48 struct shm_file_data { /* private_data of the per-attach shm file created in do_shmat() */
  49         int id;                                    /* IPC id of the segment (shp->shm_perm.id); used by shm_close() to look it up */
  50         struct ipc_namespace *ns;                  /* namespace obtained via get_ipc_ns() at attach time */
  51         struct file *file;                         /* the segment's backing file (shp->shm_file) whose mmap is delegated to */
  52         const struct vm_operations_struct *vm_ops; /* vm_ops installed by the backing file's mmap; saved by shm_mmap() */
  53 };

shm_file_data主要用於保存tmpfs文件內存映射的虛擬內存操作集vm_ops,並在其基礎上擴展出shm自己的vm_ops;這樣當某進程已經調用IPC_RMID後,其它所有進程detach時仍能正常釋放共享內存的IPC資源
注:
  共享內存主要涉及兩種文件,tmpfs文件與shm文件;一個共享內存對應一個tmpfs文件,有多少個進程attach到共享內存就有多少個shm文件。
  爲什麼要在tmpfs文件上層再加shm文件呢?直接將tmpfs文件映射到多個進程地址空間不就能實現內存共享了嗎?
  的確,可以將tmpfs文件映射到多個進程地址空間,並能實現內存共享。但是有一種特殊情況:當多個進程attach到共享內存,此時某個進程刪除共享內存,爲了保證其他進程能繼續正常使用共享內存,暫時不能刪除共享內存的IPC資源;而等所有進程都detach之後,tmpfs文件自身的munmap處理又不會去刪除IPC資源。
  所以在tmpfs文件上層添加shm文件,用於擴展tmpfs文件映射的vm_ops,來實現所有進程detach後刪除共享內存的IPC資源。
  詳細代碼參見:do_shm_rmid、shm_close
 


II.共享內存創建

 326 /**
 327  * newseg - Create a new shared memory segment
 328  * @ns: namespace
 329  * @params: ptr to the structure that contains key, size and shmflg
 330  *
 331  * Called with shm_ids.rw_mutex held as a writer.
      *
      * Allocates the shmid_kernel, creates the backing file (hugetlb or shmem,
      * depending on SHM_HUGETLB), registers the segment through ipc_addid()
      * and fills in the bookkeeping fields.
      *
      * Returns the new segment's IPC id (shp->shm_perm.id) on success, or a
      * negative errno: -EINVAL when size is outside [SHMMIN, ns->shm_ctlmax],
      * -ENOSPC when the namespace page total would exceed ns->shm_ctlall,
      * -ENOMEM when the descriptor cannot be allocated, or the error reported
      * by security_shm_alloc(), the file setup call or ipc_addid().
 332  */
 333 
 334 static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 335 {
 336         key_t key = params->key;
 337         int shmflg = params->flg;
 338         size_t size = params->u.size;
 339         int error;
 340         struct shmid_kernel *shp;
             /* segment size rounded up to whole pages */
 341         int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
 342         struct file * file;
             /* "SYSV" + 8 hex digits + NUL = 13 bytes; assumes key prints as at most 8 hex digits - TODO confirm key_t width */
 343         char name[13];
 344         int id;
 345         int acctflag = 0;
 346 
             /* per-segment size limits */
 347         if (size < SHMMIN || size > ns->shm_ctlmax)
 348                 return -EINVAL;
 349 
             /* namespace-wide page budget */
 350         if (ns->shm_tot + numpages > ns->shm_ctlall)
 351                 return -ENOSPC;
 352 
 353         shp = ipc_rcu_alloc(sizeof(*shp));
 354         if (!shp)
 355                 return -ENOMEM;
 356 
 357         shp->shm_perm.key = key;
             /* only the permission bits of shmflg are stored as the mode */
 358         shp->shm_perm.mode = (shmflg & S_IRWXUGO);
 359         shp->mlock_user = NULL;
 360 
 361         shp->shm_perm.security = NULL;
 362         error = security_shm_alloc(shp);
 363         if (error) {
 364                 ipc_rcu_putref(shp);
 365                 return error;
 366         }
 367 
 368         sprintf (name, "SYSV%08x", key);
 369         if (shmflg & SHM_HUGETLB) {
 370                 /* hugetlb_file_setup applies strict accounting */
 371                 if (shmflg & SHM_NORESERVE)
 372                         acctflag = VM_NORESERVE;
 373                 file = hugetlb_file_setup(name, size, acctflag,
 374                                         &shp->mlock_user, HUGETLB_SHMFS_INODE);
 375         } else {
 376                 /*
 377                  * Do not allow no accounting for OVERCOMMIT_NEVER, even
 378                  * if it's asked for.
 379                  */
 380                 if  ((shmflg & SHM_NORESERVE) &&
 381                                 sysctl_overcommit_memory != OVERCOMMIT_NEVER)
 382                         acctflag = VM_NORESERVE;
 383                 file = shmem_file_setup(name, size, acctflag);
 384         }
             /* error is pre-loaded; it is only meaningful when IS_ERR(file) is true */
 385         error = PTR_ERR(file);
 386         if (IS_ERR(file))
 387                 goto no_file;
 388 
 389         id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
 390         if (id < 0) {
 391                 error = id;
 392                 goto no_id;
 393         }
 394 
 395         shp->shm_cprid = task_tgid_vnr(current);
 396         shp->shm_lprid = 0;
 397         shp->shm_atim = shp->shm_dtim = 0;
 398         shp->shm_ctim = get_seconds();
 399         shp->shm_segsz = size;
 400         shp->shm_nattch = 0;
 401         shp->shm_file = file;
 402         /*
 403          * shmid gets reported as "inode#" in /proc/pid/maps.
 404          * proc-ps tools use this. Changing this will break them.
 405          */
 406         file->f_dentry->d_inode->i_ino = shp->shm_perm.id;
 407 
 408         ns->shm_tot += numpages;
 409         error = shp->shm_perm.id;
             /* NOTE(review): presumably ipc_addid() returned with shp locked - confirm */
 410         shm_unlock(shp);
 411         return error;
 412 
         /* error unwinding, in reverse order of setup */
 413 no_id:
 414         if (is_file_hugepages(file) && shp->mlock_user)
 415                 user_shm_unlock(size, shp->mlock_user);
 416         fput(file);
 417 no_file:
 418         security_shm_free(shp);
 419         ipc_rcu_putref(shp);
 420         return error;
 421 }

1.參數及共享內存系統限制檢查
2.分配共享內存管理結構shmid_kernel
3.在tmpfs中創建共享內存文件,以獲取物理內存
4.將shmid_kernel添加到共享內存基數樹中,並獲得基數樹id
5.初始化shmid_kernel結構
6.返回共享內存IPC id

 

III.共享內存映射到進程地址空間
i.do_shmat

 806 /*
 807  * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 808  *
 809  * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 810  * "raddr" thing points to kernel space, and there has to be a wrapper around
 811  * this.
      *
      * @shmid:   IPC id of the segment to attach; negative values give -EINVAL.
      * @shmaddr: requested address, or NULL to let do_mmap() choose. A non-NULL
      *           address that is not SHMLBA-aligned is rounded down when SHM_RND
      *           is set, otherwise rejected unless it is at least page aligned
      *           (and __ARCH_FORCE_SHMLBA is not defined).
      * @shmflg:  SHM_RDONLY / SHM_EXEC / SHM_RND / SHM_REMAP flags.
      * @raddr:   out: receives the value returned by do_mmap(); it is written
      *           even when that value is an error, so callers must check the
      *           return code first.
      *
      * Returns 0 on success or a negative errno.
 812  */
 813 long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
 814 {
 815         struct shmid_kernel *shp;
 816         unsigned long addr;
 817         unsigned long size;
 818         struct file * file;
 819         int    err;
 820         unsigned long flags;
 821         unsigned long prot;
 822         int acc_mode;
 823         unsigned long user_addr;
 824         struct ipc_namespace *ns;
 825         struct shm_file_data *sfd;
 826         struct path path;
 827         fmode_t f_mode;
 828 
 829         err = -EINVAL;
 830         if (shmid < 0)
 831                 goto out;
 832         else if ((addr = (ulong)shmaddr)) {
 833                 if (addr & (SHMLBA-1)) {
 834                         if (shmflg & SHM_RND)
 835                                 addr &= ~(SHMLBA-1);       /* round down */
 836                         else
 837 #ifndef __ARCH_FORCE_SHMLBA
 838                                 if (addr & ~PAGE_MASK)
 839 #endif
 840                                         goto out;
 841                 }
                     /* caller chose the address: map exactly there */
 842                 flags = MAP_SHARED | MAP_FIXED;
 843         } else {
                     /* SHM_REMAP makes no sense without an explicit address */
 844                 if ((shmflg & SHM_REMAP))
 845                         goto out;
 846 
 847                 flags = MAP_SHARED;
 848         }
 849 
             /* derive mmap protection, IPC permission mask and file mode from shmflg */
 850         if (shmflg & SHM_RDONLY) {
 851                 prot = PROT_READ;
 852                 acc_mode = S_IRUGO;
 853                 f_mode = FMODE_READ;
 854         } else {
 855                 prot = PROT_READ | PROT_WRITE;
 856                 acc_mode = S_IRUGO | S_IWUGO;
 857                 f_mode = FMODE_READ | FMODE_WRITE;
 858         }
 859         if (shmflg & SHM_EXEC) {
 860                 prot |= PROT_EXEC;
 861                 acc_mode |= S_IXUGO;
 862         }
 863 
 864         /*
 865          * We cannot rely on the fs check since SYSV IPC does have an
 866          * additional creator id...
 867          */
 868         ns = current->nsproxy->ipc_ns;
 869         shp = shm_lock_check(ns, shmid);
 870         if (IS_ERR(shp)) {
 871                 err = PTR_ERR(shp);
 872                 goto out;
 873         }
 874 
 875         err = -EACCES;
 876         if (ipcperms(&shp->shm_perm, acc_mode))
 877                 goto out_unlock;
 878 
 879         err = security_shm_shmat(shp, shmaddr, shmflg);
 880         if (err)
 881                 goto out_unlock;
 882 
 883         path.dentry = dget(shp->shm_file->f_path.dentry);
 884         path.mnt    = shp->shm_file->f_path.mnt;
             /* temporary attach count held across the unlocked section; dropped again at out_nattch */
 885         shp->shm_nattch++;
 886         size = i_size_read(path.dentry->d_inode);
 887         shm_unlock(shp);
 888 
 889         err = -ENOMEM;
 890         sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
 891         if (!sfd)
 892                 goto out_put_dentry;
 893 
             /* per-attach shm file layered on the segment's dentry; its mmap handler is shm_mmap() */
 894         file = alloc_file(path.mnt, path.dentry, f_mode,
 895                         is_file_hugepages(shp->shm_file) ?
 896                                 &shm_file_operations_huge :
 897                                 &shm_file_operations);
 898         if (!file)
 899                 goto out_free;
 900         ima_counts_get(file);
 901 
 902         file->private_data = sfd;
 903         file->f_mapping = shp->shm_file->f_mapping;
 904         sfd->id = shp->shm_perm.id;
 905         sfd->ns = get_ipc_ns(ns);
 906         sfd->file = shp->shm_file;
             /* filled in by shm_mmap() once the backing file has been mapped */
 907         sfd->vm_ops = NULL;
 908 
 909         down_write(&current->mm->mmap_sem);
 910         if (addr && !(shmflg & SHM_REMAP)) {
 911                 err = -EINVAL;
                     /* without SHM_REMAP the target range must not overlap an existing mapping */
 912                 if (find_vma_intersection(current->mm, addr, addr + size))
 913                         goto invalid;
 914                 /*
 915                  * If shm segment goes below stack, make sure there is some
 916                  * space left for the stack to grow (at least 4 pages).
 917                  */
 918                 if (addr < current->mm->start_stack &&
 919                     addr > current->mm->start_stack - size - PAGE_SIZE * 5)
 920                         goto invalid;
 921         }
 922 
 923         user_addr = do_mmap (file, addr, size, prot, flags, 0);
             /* stored unconditionally: on failure *raddr holds the error value */
 924         *raddr = user_addr;
 925         err = 0;
 926         if (IS_ERR_VALUE(user_addr))
 927                 err = (long)user_addr;
 928 invalid:
 929         up_write(&current->mm->mmap_sem);
 930 
 931         fput(file);
 932 
         /* drop the temporary attach count taken above; destroy if this was the last one and SHM_DEST is set */
 933 out_nattch:
 934         down_write(&shm_ids(ns).rw_mutex);
 935         shp = shm_lock(ns, shmid);
 936         BUG_ON(IS_ERR(shp));
 937         shp->shm_nattch--;
 938         if(shp->shm_nattch == 0 &&
 939            shp->shm_perm.mode & SHM_DEST)
 940                 shm_destroy(ns, shp);
 941         else
 942                 shm_unlock(shp);
 943         up_write(&shm_ids(ns).rw_mutex);
 944 
 945 out:
 946         return err;
 947 
 948 out_unlock:
 949         shm_unlock(shp);
 950         goto out;
 951 
 952 out_free:
 953         kfree(sfd);
 954 out_put_dentry:
 955         dput(path.dentry);
 956         goto out_nattch;
 957 }

1.對參數進行合法性檢查,並根據參數計算內存映射標識和保護方式
2.attach權限檢驗
3.attach計數器shm_nattch加1
4.分配shm文件,並初始化私有數據shm_file_data
5.將shm文件映射到進程地址空間(do_mmap實現是將tmpfs文件映射到進程地址空間)
6.attach計數器shm_nattch減1:第3步的加1只是臨時引用,而shm文件映射時shm_mmap->shm_open已經將shm_nattch加1,故此處需把臨時引用減掉

 

ii.shm_mmap
do_mmap會回調shm文件的shm_mmap函數:
do_mmap->do_mmap_pgoff->mmap_region->mmap

         /*
          * mmap handler of the per-attach shm file.
          * Delegates the mapping to the backing file stored in shm_file_data,
          * remembers the vm_ops that mapping installed, then substitutes
          * shm_vm_ops and calls shm_open() on the vma.
          * Returns the backing mmap's result (non-zero means failure; in that
          * case the vma's vm_ops are left as the backing mmap set them).
          */
 249 static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 250 {
 251         struct shm_file_data *sfd = shm_file_data(file);
 252         int ret;
 253 
 254         ret = sfd->file->f_op->mmap(sfd->file, vma);
 255         if (ret != 0)
 256                 return ret;
             /* keep the backing file's vm_ops so shm_fault() can forward to them */
 257         sfd->vm_ops = vma->vm_ops;
 258 #ifdef CONFIG_MMU
 259         BUG_ON(!sfd->vm_ops->fault);
 260 #endif
 261         vma->vm_ops = &shm_vm_ops;
 262         shm_open(vma);
 263 
 264         return ret;
 265 }

1.將tmpfs文件的vma操作切換成shm文件的vma操作shm_vm_ops,用於進程munmap時調用shm_close,來實現所有進程detach且之前有IPC_RMID時刪除共享內存IPC資源
2.可以看出do_mmap最後會調用tmpfs文件的mmap方法,將tmpfs文件映射到進程地址空間

 

iii.shm_fault
將tmpfs文件映射到進程地址空間後,如果是第一次訪問會產生缺頁異常;缺頁異常處理中會將文件裝入內存中並添加相應的頁表項,以便使用虛擬地址訪問。
do_page_fault->handle_mm_fault->handle_pte_fault->do_linear_fault->__do_fault->shm_fault

         /*
          * Fault handler for shm vmas: forwards the fault to the fault handler
          * of the vm_ops saved in shm_file_data and returns its result unchanged.
          */
 214 static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 215 {
 216         struct file *file = vma->vm_file;
 217         struct shm_file_data *sfd = shm_file_data(file);
 218 
 219         return sfd->vm_ops->fault(vma, vmf);
 220 }

shm文件的異常處理shm_fault實際調用的是tmpfs文件的異常處理,來裝入tmpfs文件的內容。

 


IV.共享內存從進程地址空間中刪除
i.shmdt
當進程不想再訪問共享內存時,會將其從地址空間中移除。

 971 /*
 972  * detach and kill segment if marked destroyed.
 973  * The work is done in shm_close.
      *
      * @shmaddr: address the segment was attached at; must be page aligned.
      *
      * Returns 0 when a matching shm vma was found and unmapped, -EINVAL when
      * shmaddr is not page aligned or no matching shm vma exists.
 974  */
 975 SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 976 {
 977         struct mm_struct *mm = current->mm;
 978         struct vm_area_struct *vma;
 979         unsigned long addr = (unsigned long)shmaddr;
 980         int retval = -EINVAL;
 981 #ifdef CONFIG_MMU
 982         loff_t size = 0;
 983         struct vm_area_struct *next;
 984 #endif
 985 
 986         if (addr & ~PAGE_MASK)
 987                 return retval;
 988 
 989         down_write(&mm->mmap_sem);
 990 
 991         /*
 992          * This function tries to be smart and unmap shm segments that
 993          * were modified by partial mlock or munmap calls:
 994          * - It first determines the size of the shm segment that should be
 995          *   unmapped: It searches for a vma that is backed by shm and that
 996          *   started at address shmaddr. It records it's size and then unmaps
 997          *   it.
 998          * - Then it unmaps all shm vmas that started at shmaddr and that
 999          *   are within the initially determined size.
1000          * Errors from do_munmap are ignored: the function only fails if
1001          * it's called with invalid parameters or if it's called to unmap
1002          * a part of a vma. Both calls in this function are for full vmas,
1003          * the parameters are directly copied from the vma itself and always
1004          * valid - therefore do_munmap cannot fail. (famous last words?)
1005          */
1006         /*
1007          * If it had been mremap()'d, the starting address would not
1008          * match the usual checks anyway. So assume all vma's are
1009          * above the starting address given.
1010          */
1011         vma = find_vma(mm, addr);
1012 
1013 #ifdef CONFIG_MMU
             /* pass 1: find the first shm vma belonging to this attach and learn the segment size */
1014         while (vma) {
                     /* fetch next first: do_munmap() below may invalidate vma */
1015                 next = vma->vm_next;
1016 
1017                 /*
1018                  * Check if the starting address would match, i.e. it's
1019                  * a fragment created by mprotect() and/or munmap(), or it
1020                  * otherwise it starts at this address with no hassles.
1021                  */
1022                 if ((vma->vm_ops == &shm_vm_ops) &&
1023                         (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
1024 
1025 
1026                         size = vma->vm_file->f_path.dentry->d_inode->i_size;
1027                         do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1028                         /*
1029                          * We discovered the size of the shm segment, so
1030                          * break out of here and fall through to the next
1031                          * loop that uses the size information to stop
1032                          * searching for matching vma's.
1033                          */
1034                         retval = 0;
1035                         vma = next;
1036                         break;
1037                 }
1038                 vma = next;
1039         }
1040 
1041         /*
1042          * We need look no further than the maximum address a fragment
1043          * could possibly have landed at. Also cast things to loff_t to
1044          * prevent overflows and make comparisions vs. equal-width types.
1045          */
             /* pass 2: unmap remaining fragments of the same attach within [addr, addr + size] */
1046         size = PAGE_ALIGN(size);
1047         while (vma && (loff_t)(vma->vm_end - addr) <= size) {
1048                 next = vma->vm_next;
1049 
1050                 /* finding a matching vma now does not alter retval */
1051                 if ((vma->vm_ops == &shm_vm_ops) &&
1052                         (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)
1053 
1054                         do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1055                 vma = next;
1056         }
1057 
1058 #else /* CONFIG_MMU */
1059         /* under NOMMU conditions, the exact address to be destroyed must be
1060          * given */
1061         retval = -EINVAL;
             /*
              * NOTE(review): unlike the CONFIG_MMU loops above, vma is dereferenced
              * here without a NULL check - confirm find_vma() cannot return NULL
              * on this path, otherwise a "vma &&" guard is needed.
              */
1062         if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1063                 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1064                 retval = 0;
1065         }
1066 
1067 #endif
1068 
1069         up_write(&mm->mmap_sem);
1070         return retval;
1071 }

1.查找共享內存映射到進程地址空間的虛擬地址內存段vma
2.移除共享內存在進程地址空間的映射

ii.shm_close

186 /*
 187  * remove the attach descriptor vma.
 188  * free memory for segment if it is marked destroyed.
 189  * The descriptor has already been removed from the current->mm->mmap list
 190  * and will later be kfree()d.
      *
      * Looks the segment up through the id and namespace saved in the vma
      * file's shm_file_data, records the detaching pid and detach time, drops
      * one attach count, and destroys the segment when the count reaches zero
      * and SHM_DEST is set in its mode.
 191  */
 192 static void shm_close(struct vm_area_struct *vma)
 193 {
 194         struct file * file = vma->vm_file;
 195         struct shm_file_data *sfd = shm_file_data(file);
 196         struct shmid_kernel *shp;
 197         struct ipc_namespace *ns = sfd->ns;
 198 
             /* writer lock is taken because shm_destroy() may be called below */
 199         down_write(&shm_ids(ns).rw_mutex);
 200         /* remove from the list of attaches of the shm segment */
 201         shp = shm_lock(ns, sfd->id);
 202         BUG_ON(IS_ERR(shp));
 203         shp->shm_lprid = task_tgid_vnr(current);
 204         shp->shm_dtim = get_seconds();
 205         shp->shm_nattch--;
 206         if(shp->shm_nattch == 0 &&
 207            shp->shm_perm.mode & SHM_DEST)
                     /* shm_destroy() unlocks shp itself, hence no shm_unlock() on this branch */
 208                 shm_destroy(ns, shp);
 209         else
 210                 shm_unlock(shp);
 211         up_write(&shm_ids(ns).rw_mutex);
 212 }

1.attach計數器shm_nattch減1
2.如果共享內存沒有attach的進程,且已經有進程調用過shmctl(...,IPC_RMID,...),則銷燬共享內存IPC資源

 


V.共享內存移除
i.do_shm_rmid
當不再使用共享內存時,會移除共享內存;通過IPC_RMID命令調用shmctl來實現共享內存的移除

  82 /*
  83  * Called with shm_ids.rw_mutex (writer) and the shp structure locked.
  84  * Only shm_ids.rw_mutex remains locked on exit.
       *
       * Removes the segment immediately when nobody is attached; otherwise only
       * marks it SHM_DEST and hides its key, leaving the actual destruction to
       * the path that drops the last attach count.
  85  */
  86 static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
  87 {
  88         struct shmid_kernel *shp;
  89         shp = container_of(ipcp, struct shmid_kernel, shm_perm);
  90 
  91         if (shp->shm_nattch){
                     /* still attached somewhere: defer destruction */
  92                 shp->shm_perm.mode |= SHM_DEST;
  93                 /* Do not find it any more */
  94                 shp->shm_perm.key = IPC_PRIVATE;
  95                 shm_unlock(shp);
  96         } else
  97                 shm_destroy(ns, shp);
  98 }

1.還有進程attach到共享內存,置共享內存銷燬SHM_DEST標識,用於在所有進程detach時銷燬共享內存IPC資源,見shm_close;並將key置爲IPC_PRIVATE,不能通過key再獲取共享內存
2.如果沒有進程attach到共享內存,銷燬共享內存IPC資源

 

ii.shm_destroy

 162 /*
 163  * shm_destroy - free the struct shmid_kernel
 164  *
 165  * @ns: namespace
 166  * @shp: struct to free
 167  *
 168  * It has to be called with shp and shm_ids.rw_mutex (writer) locked,
 169  * but returns with shp unlocked and freed.
      *
      * Order of work: subtract the segment's page count from ns->shm_tot,
      * remove the id via shm_rmid(), unlock shp, undo memory locking on the
      * backing file, release the file, then free the security blob and drop
      * the RCU reference on shp.
 170  */
 171 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
 172 {
             /* same round-up-to-pages computation that newseg() added to shm_tot */
 173         ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
 174         shm_rmid(ns, shp);
 175         shm_unlock(shp);
             /* shp is still dereferenced below, after the unlock */
 176         if (!is_file_hugepages(shp->shm_file))
 177                 shmem_lock(shp->shm_file, 0, shp->mlock_user);
 178         else if (shp->mlock_user)
 179                 user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size,
 180                                                 shp->mlock_user);
 181         fput (shp->shm_file);
 182         security_shm_free(shp);
 183         ipc_rcu_putref(shp);
 184 }

1.將共享內存IPC從共享內存基數樹中移除
2.釋放tmpfs文件使用的file結構內存
3.釋放shmid_kernel內存

發佈了50 篇原創文章 · 獲贊 12 · 訪問量 15萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章