From 69919ae2f137942155d11fb4113b7ded0d28d800 Mon Sep 17 00:00:00 2001
From: Neil Brown
Date: Mon, 22 May 2006 09:13:49 +1000
Subject: [PATCH] Wiggle 0.6 - first release

---
 ANNOUNCE | 95 +
 COPYING | 340 +
 DOC/diff.ps | 11852 ++++++++++++++++++++++++++
 INSTALL | 11 +
 Makefile | 50 +
 ReadMe.c | 143 +
 TODO | 29 +
 bestmatch.c | 513 ++
 diff.c | 428 +
 dotest | 88 +
 extract.c | 260 +
 get-p-options | 8 +
 hash.h | 92 +
 load.c | 142 +
 merge.c | 882 ++
 notes | 101 +
 p | 727 ++
 p.help | 327 +
 split.c | 124 +
 tests/linux/inode-fullpatch/diff | 1330 +++
 tests/linux/inode-fullpatch/merge | 1358 +++
 tests/linux/inode-fullpatch/orig | 1323 +++
 tests/linux/inode-fullpatch/patch | 77 +
 tests/linux/inode-fullpatch/rediff | 73 +
 tests/linux/inode-fullpatch/wmerge | 1352 +++
 tests/linux/inode-justrej/lmerge | 1358 +++
 tests/linux/inode-justrej/merge | 1358 +++
 tests/linux/inode-justrej/orig | 1353 +++
 tests/linux/inode-justrej/patch | 16 +
 tests/linux/inode-justrej/wmerge | 1352 +++
 tests/linux/md-autostart/merge | 4025 +++++++++
 tests/linux/md-autostart/orig | 4025 +++++++++
 tests/linux/md-autostart/patch | 27 +
 tests/linux/md-loop/1 | 3949 +++++++++
 tests/linux/md-loop/2 | 3949 +++++++++
 tests/linux/md-loop/merge | 3960 +++++++++
 tests/linux/md-loop/orig | 3960 +++++++++
 tests/linux/md-messy/diff | 93 +
 tests/linux/md-messy/new | 90 +
 tests/linux/md-messy/orig | 91 +
 tests/linux/md-resync/merge | 1911 +++++
 tests/linux/md-resync/orig | 1848 ++++
 tests/linux/md-resync/patch | 312 +
 tests/linux/md/diff | 3680 ++++++++
 tests/linux/md/lmerge | 3595 ++++++++
 tests/linux/md/merge | 3595 ++++++++
 tests/linux/md/orig | 3674 ++++++++
 tests/linux/md/patch | 117 +
 tests/linux/md/rediff | 101 +
 tests/linux/md/replace | 0
 tests/linux/md/wmerge | 3591 ++++++++
 tests/linux/nfsd-defines/merge | 270 +
 tests/linux/nfsd-defines/orig | 270 +
 tests/linux/nfsd-defines/patch | 24 +
 tests/linux/raid5/orig | 2079 +++++
 tests/linux/raid5/patch | 962 +++
 tests/linux/raid5build/merge | 30 +
 tests/linux/raid5build/orig | 15 +
 tests/linux/raid5build/patch | 31 +
 tests/linux/raid5line/lmerge | 7 +
 tests/linux/raid5line/merge | 7 +
 tests/linux/raid5line/orig | 1 +
 tests/linux/raid5line/patch | 3 +
 tests/linux/raid5line/wmerge | 1 +
 tests/linux/rpc_tcp_nonagle/merge | 1518 ++++
 tests/linux/rpc_tcp_nonagle/orig | 1511 ++++
 tests/linux/rpc_tcp_nonagle/patch | 33 +
 tests/simple/all-different-2/lmerge | 34 +
 tests/simple/all-different-2/merge | 34 +
 tests/simple/all-different-2/new | 10 +
 tests/simple/all-different-2/new2 | 10 +
 tests/simple/all-different-2/orig | 10 +
 tests/simple/all-different-2/wmerge | 10 +
 tests/simple/all-different/lmerge | 35 +
 tests/simple/all-different/merge | 35 +
 tests/simple/all-different/new | 11 +
 tests/simple/all-different/new2 | 11 +
 tests/simple/all-different/orig | 11 +
 tests/simple/all-different/wmerge | 11 +
 tests/simple/already-applied/merge | 3 +
 tests/simple/already-applied/new | 2 +
 tests/simple/already-applied/new2 | 2 +
 tests/simple/already-applied/orig | 3 +
 tests/simple/base/diff | 23 +
 tests/simple/base/ldiff | 25 +
 tests/simple/base/merge | 20 +
 tests/simple/base/new | 21 +
 tests/simple/base/new2 | 21 +
 tests/simple/base/orig | 20 +
 tests/simple/brokenlines/diff | 7 +
 tests/simple/brokenlines/merge | 5 +
 tests/simple/brokenlines/new | 3 +
 tests/simple/brokenlines/new2 | 3 +
 tests/simple/brokenlines/orig | 5 +
 tests/simple/changeafteradd/merge | 5 +
 tests/simple/changeafteradd/new | 6 +
 tests/simple/changeafteradd/new2 | 6 +
 tests/simple/changeafteradd/orig | 5 +
 tests/simple/conflict/diff | 5 +
 tests/simple/conflict/ldiff | 6 +
 tests/simple/conflict/merge | 10 +
 tests/simple/conflict/new | 4 +
 tests/simple/conflict/new2 | 4 +
 tests/simple/conflict/orig | 4 +
 tests/simple/conflict/wmerge | 4 +
 tests/simple/conflictmixed/diff | 5 +
 tests/simple/conflictmixed/ldiff | 6 +
 tests/simple/conflictmixed/lmerge | 10 +
 tests/simple/conflictmixed/merge | 10 +
 tests/simple/conflictmixed/new | 4 +
 tests/simple/conflictmixed/new2 | 4 +
 tests/simple/conflictmixed/orig | 4 +
 tests/simple/conflictmixed/wmerge | 4 +
 tests/simple/multideletes/lmerge | 2 +
 tests/simple/multideletes/merge | 2 +
 tests/simple/multideletes/new | 8 +
 tests/simple/multideletes/new2 | 5 +
 tests/simple/multideletes/orig | 5 +
 tests/simple/multiple-add/lmerge | 15 +
 tests/simple/multiple-add/merge | 15 +
 tests/simple/multiple-add/new | 9 +
 tests/simple/multiple-add/new2 | 9 +
 tests/simple/multiple-add/orig | 9 +
 tests/simple/multiple-add/wmerge | 9 +
 version | 1 +
 wiggle.1 | 439 +
 wiggle.c | 643 ++
 wiggle.h | 100 +
 128 files changed, 82279 insertions(+)
 create mode 100644 ANNOUNCE
 create mode 100644 COPYING
 create mode 100644 DOC/diff.ps
 create mode 100644 INSTALL
 create mode 100644 Makefile
 create mode 100644 ReadMe.c
 create mode 100644 TODO
 create mode 100644 bestmatch.c
 create mode 100644 diff.c
 create mode 100755 dotest
 create mode 100644 extract.c
 create mode 100644 get-p-options
 create mode 100644 hash.h
 create mode 100644 load.c
 create mode 100644 merge.c
 create mode 100644 notes
 create mode 100755 p
 create mode 100644 p.help
 create mode 100644 split.c
 create mode 100644 tests/linux/inode-fullpatch/diff
 create mode 100644 tests/linux/inode-fullpatch/merge
 create mode 100644 tests/linux/inode-fullpatch/orig
 create mode 100644 tests/linux/inode-fullpatch/patch
 create mode 100644 tests/linux/inode-fullpatch/rediff
 create mode 100644 tests/linux/inode-fullpatch/wmerge
 create mode 100644 tests/linux/inode-justrej/lmerge
 create mode 100644 tests/linux/inode-justrej/merge
 create mode 100644 tests/linux/inode-justrej/orig
 create mode 100644 tests/linux/inode-justrej/patch
 create mode 100644 tests/linux/inode-justrej/wmerge
 create mode 100644 tests/linux/md-autostart/merge
 create mode 100644 tests/linux/md-autostart/orig
 create mode 100644 tests/linux/md-autostart/patch
 create mode 100644 tests/linux/md-loop/1
 create mode 100644 tests/linux/md-loop/2
 create mode 100644 tests/linux/md-loop/merge
 create mode 100644 tests/linux/md-loop/orig
 create mode 100644 tests/linux/md-messy/diff
 create mode 100644 tests/linux/md-messy/new
 create mode 100644 tests/linux/md-messy/orig
 create mode 100644 tests/linux/md-resync/merge
 create mode 100644 tests/linux/md-resync/orig
 create mode 100644 tests/linux/md-resync/patch
 create mode 100644 tests/linux/md/diff
 create mode 100644 tests/linux/md/lmerge
 create mode 100644 tests/linux/md/merge
 create mode 100644 tests/linux/md/orig
 create mode 100644 tests/linux/md/patch
 create mode 100644 tests/linux/md/rediff
 create mode 100644 tests/linux/md/replace
 create mode 100644 tests/linux/md/wmerge
 create mode 100644 tests/linux/nfsd-defines/merge
 create mode 100644 tests/linux/nfsd-defines/orig
 create mode 100644 tests/linux/nfsd-defines/patch
 create mode 100644 tests/linux/raid5/orig
 create mode 100644 tests/linux/raid5/patch
 create mode 100644 tests/linux/raid5build/merge
 create mode 100644 tests/linux/raid5build/orig
 create mode 100644 tests/linux/raid5build/patch
 create mode 100644 tests/linux/raid5line/lmerge
 create mode 100644 tests/linux/raid5line/merge
 create mode 100644 tests/linux/raid5line/orig
 create mode 100644 tests/linux/raid5line/patch
 create mode 100644 tests/linux/raid5line/wmerge
 create mode 100644 tests/linux/rpc_tcp_nonagle/merge
 create mode 100644 tests/linux/rpc_tcp_nonagle/orig
 create mode 100644 tests/linux/rpc_tcp_nonagle/patch
 create mode 100644 tests/simple/all-different-2/lmerge
 create mode 100644 tests/simple/all-different-2/merge
 create mode 100644 tests/simple/all-different-2/new
 create mode 100644 tests/simple/all-different-2/new2
 create mode 100644 tests/simple/all-different-2/orig
 create mode 100644 tests/simple/all-different-2/wmerge
 create mode 100644 tests/simple/all-different/lmerge
 create mode 100644 tests/simple/all-different/merge
 create mode 100644 tests/simple/all-different/new
 create mode 100644 tests/simple/all-different/new2
 create mode 100644 tests/simple/all-different/orig
 create mode 100644 tests/simple/all-different/wmerge
 create mode 100644 tests/simple/already-applied/merge
 create mode 100644 tests/simple/already-applied/new
 create mode 100644 tests/simple/already-applied/new2
 create mode 100644 tests/simple/already-applied/orig
 create mode 100644 tests/simple/base/diff
 create mode 100644 tests/simple/base/ldiff
 create mode 100644 tests/simple/base/merge
 create mode 100644 tests/simple/base/new
 create mode 100644 tests/simple/base/new2
 create mode 100644 tests/simple/base/orig
 create mode 100644 tests/simple/brokenlines/diff
 create mode 100644 tests/simple/brokenlines/merge
 create mode 100644 tests/simple/brokenlines/new
 create mode 100644 tests/simple/brokenlines/new2
 create mode 100644 tests/simple/brokenlines/orig
 create mode 100644 tests/simple/changeafteradd/merge
 create mode 100644 tests/simple/changeafteradd/new
 create mode 100644 tests/simple/changeafteradd/new2
 create mode 100644 tests/simple/changeafteradd/orig
 create mode 100644 tests/simple/conflict/diff
 create mode 100644 tests/simple/conflict/ldiff
 create mode 100644 tests/simple/conflict/merge
 create mode 100644 tests/simple/conflict/new
 create mode 100644 tests/simple/conflict/new2
 create mode 100644 tests/simple/conflict/orig
 create mode 100644 tests/simple/conflict/wmerge
 create mode 100644 tests/simple/conflictmixed/diff
 create mode 100644 tests/simple/conflictmixed/ldiff
 create mode 100644 tests/simple/conflictmixed/lmerge
 create mode 100644 tests/simple/conflictmixed/merge
 create mode 100644 tests/simple/conflictmixed/new
 create mode 100644 tests/simple/conflictmixed/new2
 create mode 100644 tests/simple/conflictmixed/orig
 create mode 100644 tests/simple/conflictmixed/wmerge
 create mode 100644 tests/simple/multideletes/lmerge
 create mode 100644 tests/simple/multideletes/merge
 create mode 100644 tests/simple/multideletes/new
 create mode 100644 tests/simple/multideletes/new2
 create mode 100644 tests/simple/multideletes/orig
 create mode 100644 tests/simple/multiple-add/lmerge
 create mode 100644 tests/simple/multiple-add/merge
 create mode 100644 tests/simple/multiple-add/new
 create mode 100644 tests/simple/multiple-add/new2
 create mode 100644 tests/simple/multiple-add/orig
 create mode 100644 tests/simple/multiple-add/wmerge
 create mode 100644 version
 create mode 100644 wiggle.1
 create mode 100644 wiggle.c
 create mode 100644 wiggle.h

diff --git a/ANNOUNCE b/ANNOUNCE
new file mode 100644
index 0000000..5ed35c8
--- /dev/null
+++ b/ANNOUNCE
@@ -0,0 +1,95 @@
+ANNOUNCE: wiggle - a tool for applying patches with conflicts
+
+I am pleased to announce the first public release of 'wiggle'.
+
+Wiggle is a program for applying patches that 'patch' cannot
+apply due to conflicting changes in the original.
+
+Wiggle will always apply all changes in the patch to the original.
+If it cannot find a way to cleanly apply a patch, it inserts it
+in the original in a manner similar to 'merge', and reports an
+unresolvable conflict. Such a conflict will look like:
+
+<<<<<<<
+Some text from
+the original file
+|||||||
+Some text that the patch changes
+=======
+Some text that is the result of the patch
+>>>>>>>
+
+with the meaning that the "text that the patch
+changes" was expected somewhere in the "text from the original
+file" and should be replaced with "the result of the patch".
+
+wiggle analyses the file and the patch in terms of words rather than
+whole lines and so is able to find matches that patch is
+unable to find. If a patch changes a word at the end of a line, and
+a word at the start of that line has been modified since the patch
+was made, then wiggle will have no trouble applying the patch.
+
+wiggle has proved very useful for back-porting patches that were
+generated for the development kernel onto the stable kernel.
+Sometimes it does exactly the right thing with the patch. When it
+doesn't, it reports a conflict which is easy to resolve with an
+understanding of what the code and the patch were trying to achieve.
+
+Wiggle is available under the GPL and can be fetched from:
+
+  http://www.cse.unsw.edu.au/~neilb/source/wiggle/
+
+The name 'wiggle' was inspired by Andrew Morton's comment:
+
+  The problem I find is that I often want to take
+     (file1+patch) -> file2,
+  when I don't have file1. But merge tools want to take
+     (file1|file2) -> file3.
+  I haven't seen a graphical tool which helps you to wiggle a patch
+  into a file.
+
+which google can find for you:
+  http://www.google.com/search?q=graphical+tool+which+helps+you+to+wiggle+a+patch
+
+It isn't a graphical tool, but it is a good first step.
+
+NOTES:
+
+This release contains a 'tests' directory with a number of test cases
+that have proved invaluable in developing the program and my
+understanding of the subtleties of some of the issues involved. If you
+find a case where wiggle behaves sub-optimally (e.g. dumps core),
+please consider sending me a test case to add to the tests directory.
+
+This release also contains a script 'p' and an accompanying 'p.help'.
+This is a script that I use for managing my kernel patches,
+and it makes use of wiggle to allow me to apply patches that
+'patch' cannot manage. It is included both as an example of
+how wiggle can be used, and as a tool that some might find useful.
+
+One shortcoming I find with wiggle is that I would like to be able
+to 'see' what it has done. I would love it if someone were to write
+a program that allowed the results of wiggle to be visualised.
+The closest that I have come to imagining a workable UI is to
+have two side-by-side windows, one of which shows the original patch,
+and the other shows a "diff -u" of before and after wiggle has done its
+thing, and to have these windows automatically aligned so that when
+a change is shown in one, the corresponding change appears in the other.
+Maybe something like tkdiff, but one that knows about patches and knows
+about word-based diffs....
+
+Wiggle is also able to perform a function similar to 'diff' and show the
+differences and similarities between two files. It can show these differences
+and similarities at a word-by-word level.
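+
+To give a feel for what a "word" means here, the following sketch is
+purely illustrative (the real code, presumably in split.c, differs in
+detail): a buffer is broken into tokens where a run of alphanumerics,
+a run of whitespace, or a single punctuation character each count as
+one word.
+
+  #include <ctype.h>
+  #include <stdio.h>
+  #include <string.h>
+
+  /* Print each token of 'buf': a run of alphanumerics, a run of
+   * whitespace, or a single punctuation character. */
+  static void split_words(const char *buf)
+  {
+      size_t i = 0, n = strlen(buf);
+
+      while (i < n) {
+          size_t start = i;
+          if (isalnum((unsigned char)buf[i]))
+              while (i < n && isalnum((unsigned char)buf[i]))
+                  i++;
+          else if (isspace((unsigned char)buf[i]))
+              while (i < n && isspace((unsigned char)buf[i]))
+                  i++;
+          else
+              i++;              /* punctuation stands alone */
+          printf("<%.*s>", (int)(i - start), buf + start);
+      }
+      putchar('\n');
+  }
+
+  int main(void)
+  {
+      /* prints <wiggle>< ><applies><,>< ><patches> */
+      split_words("wiggle applies, patches");
+      return 0;
+  }
+
+Matching the two files as sequences of such words is what lets a
+change at one end of a line be applied even when the other end of
+that line has changed.
+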
The output format is not machine +readable as the character sequences used to delimit inserted and deleted +words are not quoted in the output. Hence this format will probably change +at some stage and should not be depended upon. + +If you read the source, beware of comments: they were probably written +while I was still trying to understand the issues myself, and so are +probably wrong and out-of-date. I would like to review all the code and +comments, but if I wait until I do that before releasing it, it'll never +get released! + +NeilBrown +The University of New South Wales diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..60549be --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. 
These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) 19yy + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. 
+ +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/DOC/diff.ps b/DOC/diff.ps new file mode 100644 index 0000000..e40c95e --- /dev/null +++ b/DOC/diff.ps @@ -0,0 +1,11852 @@ +%!PS-Adobe-3.0 +%%Creator: psdit +%%For: bobcat.CS.Arizona.EDU:gene (Gene Myers) +%%Title: stdin (ditroff) +%%CreationDate: Mon Sep 22 11:11:50 1997 +%%DocumentNeededResources: (atend) +%%DocumentSuppliedResources: DIThacks +%%Pages: (atend) +%%EndComments +% Start of psdit.pro -- prolog for ditroff translator +% Copyright (c) 1985,1987 Adobe Systems Incorporated. All Rights Reserved. 
+% GOVERNMENT END USERS: See Notice file in TranScript library directory +% -- probably /usr/lib/ps/Notice +% RCS: $Header: /disks/hobo/vp6/snichols/rel3.0/transcript/lib/RCS/psdit.pro,v 3.0 1991/06/17 17:08:31 snichols Exp $ +% Psfig RCSID $Header: psdit.pro,v 1.5 88/01/04 17:48:22 trevor Exp $ + +/$DITroff 180 dict def $DITroff begin + +/DocumentInitState [ matrix currentmatrix currentlinewidth currentlinecap +currentlinejoin currentdash currentgray currentmiterlimit ] cvx def + +%% Psfig additions +/startFig { + /SavedState save def + userdict maxlength dict begin + currentpoint transform + + DocumentInitState setmiterlimit setgray setdash setlinejoin setlinecap + setlinewidth setmatrix + + itransform moveto + + /ury exch def + /urx exch def + /lly exch def + /llx exch def + /y exch 72 mul resolution div def + /x exch 72 mul resolution div def + + currentpoint /cy exch def /cx exch def + + /sx x urx llx sub div def % scaling for x + /sy y ury lly sub div def % scaling for y + + sx sy scale % scale by (sx,sy) + + cx sx div llx sub + cy sy div ury sub translate + + /DefFigCTM matrix currentmatrix def + + /initmatrix { + DefFigCTM setmatrix + } def + /defaultmatrix { + DefFigCTM exch copy + } def + + /initgraphics { + DocumentInitState setmiterlimit setgray setdash + setlinejoin setlinecap setlinewidth setmatrix + DefFigCTM setmatrix + } def + + /showpage { + initgraphics + } def + +} def +% Args are llx lly urx ury (in figure coordinates) +/clipFig { + currentpoint 6 2 roll + newpath 4 copy + 4 2 roll moveto + 6 -1 roll exch lineto + exch lineto + exch lineto + closepath clip + newpath + moveto +} def +% doclip, if called, will always be just after a `startfig' +/doclip { llx lly urx ury clipFig } def +/endFig { + end SavedState restore +} def +/globalstart { + % Push details about the enviornment on the stack. 
+ fontnum fontsize fontslant fontheight + % firstpage + mh my resolution slotno currentpoint + pagesave restore gsave +} def +/globalend { + grestore moveto + /slotno exch def /resolution exch def /my exch def + /mh exch def + % /firstpage exch def + /fontheight exch def + /fontslant exch def /fontsize exch def /fontnum exch def + F + /pagesave save def +} def + +%% end XMOD additions + +/fontnum 1 def /fontsize 10 def /fontheight 10 def /fontslant 0 def +/xi {72 mul 0 exch translate 72 resolution div dup neg scale 0 0 moveto + /fontnum 1 def /fontsize 10 def /fontheight 10 def /fontslant 0 def F + /pagesave save def}def +/PB{save /psv exch def currentpoint translate + resolution 72 div dup neg scale 0 0 moveto}def +/PE{psv restore}def +/m1 matrix def /m2 matrix def /m3 matrix def /oldmat matrix def +/tan{dup sin exch cos div}bind def +/point{resolution 72 div mul}bind def +/dround {transform round exch round exch itransform}bind def +/xT{/devname exch def}def +/xr{/mh exch def /my exch def /resolution exch def}def +/xp{}def +/xs{docsave restore end}def +/xt{}def +/xf{/fontname exch def /slotno exch def fontnames slotno get fontname eq not + {fonts slotno fontname findfont put fontnames slotno fontname put}if}def +/xH{/fontheight exch def F}bind def +/xS{/fontslant exch def F}bind def +/s{/fontsize exch def /fontheight fontsize def F}bind def +/f{/fontnum exch def F}bind def +/F{fontheight 0 le {/fontheight fontsize def}if + fonts fontnum get fontsize point 0 0 fontheight point neg 0 0 m1 astore + fontslant 0 ne{1 0 fontslant neg tan 1 0 0 m2 astore m3 concatmatrix}if + makefont setfont .04 fontsize point mul 0 dround pop setlinewidth}bind def +/X{exch currentpoint exch pop moveto show}bind def +/N{3 1 roll moveto show}bind def +/Y{exch currentpoint pop exch moveto show}bind def +/S /show load def +/ditpush{}def/ditpop{}def +/AX{3 -1 roll currentpoint exch pop moveto 0 exch ashow}bind def +/AN{4 2 roll moveto 0 exch ashow}bind def +/AY{3 -1 roll currentpoint pop exch moveto 0 exch ashow}bind def +/AS{0 exch ashow}bind def +/MX{currentpoint exch pop moveto}bind def +/MY{currentpoint pop exch moveto}bind def +/MXY /moveto load def +/cb{pop}def % action on unknown char -- nothing for now +/n{}def/w{}def +/p{pop showpage pagesave restore /pagesave save def}def +/abspoint{currentpoint exch pop add exch currentpoint pop add exch}def +/dstroke{currentpoint stroke moveto}bind def +/Dl{2 copy gsave rlineto stroke grestore rmoveto}bind def +/arcellipse{oldmat currentmatrix pop + currentpoint translate 1 diamv diamh div scale /rad diamh 2 div def + rad 0 rad -180 180 arc oldmat setmatrix}def +/Dc{gsave dup /diamv exch def /diamh exch def arcellipse dstroke + grestore diamh 0 rmoveto}def +/De{gsave /diamv exch def /diamh exch def arcellipse dstroke + grestore diamh 0 rmoveto}def +/Da{currentpoint /by exch def /bx exch def /fy exch def /fx exch def + /cy exch def /cx exch def /rad cx cx mul cy cy mul add sqrt def + /ang1 cy neg cx neg atan def /ang2 fy fx atan def cx bx add cy by add + 2 copy rad ang1 ang2 arcn stroke exch fx add exch fy add moveto}def +/Barray 200 array def % 200 values in a wiggle +/D~{mark}def +/D~~{counttomark Barray exch 0 exch getinterval astore /Bcontrol exch def pop + /Blen Bcontrol length def Blen 4 ge Blen 2 mod 0 eq and + {Bcontrol 0 get Bcontrol 1 get abspoint /Ycont exch def /Xcont exch def + Bcontrol 0 2 copy get 2 mul put Bcontrol 1 2 copy get 2 mul put + Bcontrol Blen 2 sub 2 copy get 2 mul put + Bcontrol Blen 1 sub 2 copy get 2 mul put + /Ybi /Xbi currentpoint 3 1 roll 
def def 0 2 Blen 4 sub + {/i exch def + Bcontrol i get 3 div Bcontrol i 1 add get 3 div + Bcontrol i get 3 mul Bcontrol i 2 add get add 6 div + Bcontrol i 1 add get 3 mul Bcontrol i 3 add get add 6 div + /Xbi Xcont Bcontrol i 2 add get 2 div add def + /Ybi Ycont Bcontrol i 3 add get 2 div add def + /Xcont Xcont Bcontrol i 2 add get add def + /Ycont Ycont Bcontrol i 3 add get add def + Xbi currentpoint pop sub Ybi currentpoint exch pop sub rcurveto + }for dstroke}if}def +end +/ditstart{$DITroff begin + /nfonts 60 def % NFONTS makedev/ditroff dependent! + /fonts[nfonts{0}repeat]def + /fontnames[nfonts{()}repeat]def +/docsave save def +}def + +% character outcalls +/oc {/pswid exch def /cc exch def /name exch def + /ditwid pswid fontsize mul resolution mul 72000 div def + /ditsiz fontsize resolution mul 72 div def + ocprocs name known{ocprocs name get exec}{name cb} + ifelse}def +/fractm [.65 0 0 .6 0 0] def +/fraction + {/fden exch def /fnum exch def gsave /cf currentfont def + cf fractm makefont setfont 0 .3 dm 2 copy neg rmoveto + fnum show rmoveto currentfont cf setfont(\244)show setfont fden show + grestore ditwid 0 rmoveto} def +/oce {grestore ditwid 0 rmoveto}def +/dm {ditsiz mul}def +/ocprocs 50 dict def ocprocs begin +(14){(1)(4)fraction}def +(12){(1)(2)fraction}def +(34){(3)(4)fraction}def +(13){(1)(3)fraction}def +(23){(2)(3)fraction}def +(18){(1)(8)fraction}def +(38){(3)(8)fraction}def +(58){(5)(8)fraction}def +(78){(7)(8)fraction}def +(sr){gsave .05 dm .16 dm rmoveto(\326)show oce}def +(is){gsave 0 .15 dm rmoveto(\362)show oce}def +(->){gsave 0 .02 dm rmoveto(\256)show oce}def +(<-){gsave 0 .02 dm rmoveto(\254)show oce}def +(==){gsave 0 .05 dm rmoveto(\272)show oce}def +end +%%BeginResource: font DIThacks +% DIThacks fonts for some special chars +50 dict dup begin +/FontType 3 def +/FontName /DIThacks def +/FontMatrix [.001 0.0 0.0 .001 0.0 0.0] def +/FontBBox [-220 -280 900 900] def% a lie but ... 
+/Encoding 256 array def +0 1 255{Encoding exch /.notdef put}for +Encoding + dup 8#040/space put %space + dup 8#110/rc put %right ceil + dup 8#111/lt put %left top curl + dup 8#112/bv put %bold vert + dup 8#113/lk put %left mid curl + dup 8#114/lb put %left bot curl + dup 8#115/rt put %right top curl + dup 8#116/rk put %right mid curl + dup 8#117/rb put %right bot curl + dup 8#120/rf put %right floor + dup 8#121/lf put %left floor + dup 8#122/lc put %left ceil + dup 8#140/sq put %square + dup 8#141/bx put %box + dup 8#142/ci put %circle + dup 8#143/br put %box rule + dup 8#144/rn put %root extender + dup 8#145/vr put %vertical rule + dup 8#146/ob put %outline bullet + dup 8#147/bu put %bullet + dup 8#150/ru put %rule + dup 8#151/ul put %underline + pop +/DITfd 100 dict def +/BuildChar{0 begin + /cc exch def /fd exch def + /charname fd /Encoding get cc get def + /charwid fd /Metrics get charname get def + /charproc fd /CharProcs get charname get def + charwid 0 fd /FontBBox get aload pop setcachedevice + 40 setlinewidth + newpath 0 0 moveto gsave charproc grestore + end}def +/BuildChar load 0 DITfd put +%/UniqueID 5 def +/CharProcs 50 dict def +CharProcs begin +/space{}def +/.notdef{}def +/ru{500 0 rls}def +/rn{0 750 moveto 500 0 rls}def +/vr{20 800 moveto 0 -770 rls}def +/bv{20 800 moveto 0 -1000 rls}def +/br{20 770 moveto 0 -1040 rls}def +/ul{0 -250 moveto 500 0 rls}def +/ob{200 250 rmoveto currentpoint newpath 200 0 360 arc closepath stroke}def +/bu{200 250 rmoveto currentpoint newpath 200 0 360 arc closepath fill}def +/sq{80 0 rmoveto currentpoint dround newpath moveto + 640 0 rlineto 0 640 rlineto -640 0 rlineto closepath stroke}def +/bx{80 0 rmoveto currentpoint dround newpath moveto + 640 0 rlineto 0 640 rlineto -640 0 rlineto closepath fill}def +/ci{355 333 rmoveto currentpoint newpath 333 0 360 arc + 50 setlinewidth stroke}def + +/lt{20 -200 moveto 0 550 rlineto currx 800 2cx s4 add exch s4 a4p stroke}def +/lb{20 800 moveto 0 -550 rlineto currx -200 2cx s4 add exch s4 a4p stroke}def +/rt{20 -200 moveto 0 550 rlineto currx 800 2cx s4 sub exch s4 a4p stroke}def +/rb{20 800 moveto 0 -500 rlineto currx -200 2cx s4 sub exch s4 a4p stroke}def +/lk{20 800 moveto 20 300 -280 300 s4 arcto pop pop 1000 sub + currentpoint stroke moveto + 20 300 4 2 roll s4 a4p 20 -200 lineto stroke}def +/rk{20 800 moveto 20 300 320 300 s4 arcto pop pop 1000 sub + currentpoint stroke moveto + 20 300 4 2 roll s4 a4p 20 -200 lineto stroke}def +/lf{20 800 moveto 0 -1000 rlineto s4 0 rls}def +/rf{20 800 moveto 0 -1000 rlineto s4 neg 0 rls}def +/lc{20 -200 moveto 0 1000 rlineto s4 0 rls}def +/rc{20 -200 moveto 0 1000 rlineto s4 neg 0 rls}def +end + +/Metrics 50 dict def Metrics begin +/.notdef 0 def +/space 500 def +/ru 500 def +/br 0 def +/lt 250 def +/lb 250 def +/rt 250 def +/rb 250 def +/lk 250 def +/rk 250 def +/rc 250 def +/lc 250 def +/rf 250 def +/lf 250 def +/bv 250 def +/ob 350 def +/bu 350 def +/ci 750 def +/bx 750 def +/sq 750 def +/rn 500 def +/ul 500 def +/vr 0 def +end + +DITfd begin +/s2 500 def /s4 250 def /s3 333 def +/a4p{arcto pop pop pop pop}def +/2cx{2 copy exch}def +/rls{rlineto stroke}def +/currx{currentpoint pop}def +/dround{transform round exch round exch itransform} def +end +end +/DIThacks exch definefont pop +%%EndResource +%%EndProlog +%%BeginSetup +ditstart +(psc)xT +576 1 1 xr +%%IncludeResource: font Times-Roman +1(Times-Roman)xf 1 f +%%IncludeResource: font Times-Italic +2(Times-Italic)xf 2 f +%%IncludeResource: font Times-Bold +3(Times-Bold)xf 3 f +%%IncludeResource: font 
Times-BoldItalic +4(Times-BoldItalic)xf 4 f +%%IncludeResource: font Helvetica +5(Helvetica)xf 5 f +%%IncludeResource: font Helvetica-Bold +6(Helvetica-Bold)xf 6 f +%%IncludeResource: font Courier +7(Courier)xf 7 f +%%IncludeResource: font Courier-Bold +8(Courier-Bold)xf 8 f +%%IncludeResource: font Symbol +9(Symbol)xf 9 f +10(DIThacks)xf 10 f +10 s +1 f +11.00 xi +%%EndSetup + +%%Page: 1 1 +10 s 10 xH 0 xS 1 f +3 f +14 s +1197 1088(An)N +1368(O\(ND\))X +1721(Difference)X +2249(Algorithm)X +2776(and)X +2984(Its)X +3138(Variations)X +9 f +3643 1032(*)N +1 f +10 s +2082 1472(EUGENE)N +2423(W.)X +2539(MYERS)X +2 f +1112 1728(Department)N +1515(of)X +1597(Computer)X +1937(Science,)X +2223(University)X +2577(of)X +2659(Arizona,)X +2952(Tucson,)X +3223(AZ)X +3336(85721,)X +3576(U.S.A.)X +3 f +2230 2112(ABSTRACT)N +576 2460(The)N +734(problems)X +1079(of)X +1171(\256nding)X +1434(a)X +1499(longest)X +1764(common)X +2083(subsequence)X +2534(of)X +2626(two)X +2776(sequences)X +3139(A)X +3222(and)X +3375(B)X +3453(and)X +3606(a)X +3671(shortest)X +3969(edit)X +4124(script)X +576 2588(for)N +707(transforming)X +1189(A)X +1275(into)X +1436(B)X +1517(have)X +1705(long)X +1879(been)X +2067(known)X +2324(to)X +2418(be)X +2525(dual)X +2702(problems.)X +3089(In)X +3191(this)X +3342(paper,)X +3589(they)X +3763(are)X +3902(shown)X +4146(to)X +4240(be)X +576 2716(equivalent)N +959(to)X +1054(\256nding)X +1320(a)X +1388(shortest/longest)X +1950(path)X +2134(in)X +2229(an)X +2342(edit)X +2500(graph.)X +2773(Using)X +2997(this)X +3150(perspective,)X +3587(a)X +3656(simple)X +3907(O\(ND\))X +4168(time)X +576 2844(and)N +737(space)X +957(algorithm)X +1328(is)X +1414(developed)X +1789(where)X +2032(N)X +2123(is)X +2209(the)X +2349(sum)X +2524(of)X +2624(the)X +2764(lengths)X +3041(of)X +3141(A)X +3232(and)X +3393(B)X +3479(and)X +3640(D)X +3731(is)X +3817(the)X +3957(size)X +4114(of)X +4213(the)X +576 2972(minimum)N +937(edit)X +1094(script)X +1318(for)X +1449(A)X +1535(and)X +1691(B.)X +1812(The)X +1973(algorithm)X +2339(performs)X +2684(well)X +2850(when)X +3060(differences)X +3463(are)X +3603(small)X +3814(\(sequences)X +4208(are)X +576 3100(similar\))N +890(and)X +1065(is)X +1165(consequently)X +1656(fast)X +1828(in)X +1941(typical)X +2219(applications.)X +2718(The)X +2897(algorithm)X +3281(is)X +3380(shown)X +3643(to)X +3756(have)X +3962(O)X +4030(\()X +4063(N)X +9 f +4134(+)X +3 f +4191(D)X +7 s +4253 3068(2)N +10 s +4293 3100(\))N +576 3228(expected-time)N +1088(performance)X +1564(under)X +1803(a)X +1878(basic)X +2086(stochastic)X +2455(model.)X +2739(A)X +2832(re\256nement)X +3237(of)X +3339(the)X +3481(algorithm)X +3854(requires)X +4174(only)X +576 3356(O\(N\))N +770(space,)X +997(and)X +1145(the)X +1272(use)X +1403(of)X +1490(suf\256x)X +1696(trees)X +1882(leads)X +2075(to)X +2162(an)X +2266(O)X +2334(\()X +2367(NlgN)X +9 f +2558(+)X +3 f +2615(D)X +7 s +2677 3324(2)N +10 s +2717 3356(\))N +2764(time)X +2936(variation.)X +8 s +576 3612(KEY)N +731(WORDS)X +1088(longest)X +1296(common)X +1546(subsequence)X +2003(shortest)X +2235(edit)X +2354(script)X +2625(edit)X +2744(graph)X +3023(\256le)X +3121(comparison)X +10 s +576 3868(1.)N +656(Introduction)X +1 f +696 4024(The)N +851(problem)X +1148(of)X +1245(determining)X +1662(the)X +1790(differences)X +2178(between)X +2476(two)X +2626(sequences)X +2982(of)X +3079(symbols)X +3375(has)X +3512(been)X +3694(studied)X +3955(extensively)X +576 4152([1,8,11,13,16,19,20].)N +1302(Algorithms)X +1697(for)X +1822(the)X +1951(problem)X +2249(have)X +2432(numerous)X 
+2779(applications,)X +3217(including)X +3550(spelling)X +3834(correction)X +4191(sys-)X +576 4280(tems,)N +774(\256le)X +903(comparison)X +1304(tools,)X +1506(and)X +1649(the)X +1774(study)X +1974(of)X +2068(genetic)X +2328(evolution)X +2658([4,5,17,18].)X +3080(Formally,)X +3421(the)X +3547(problem)X +3842(statement)X +4177(is)X +4258(to)X +576 4408(\256nd)N +725(a)X +786(longest)X +1042(common)X +1347(subsequence)X +1778(or,)X +1890(equivalently,)X +2331(to)X +2418(\256nd)X +2567(the)X +2690(minimum)X +3024(``script'')X +3334(of)X +3425(symbol)X +3684(deletions)X +3997(and)X +4137(inser-)X +576 4536(tions)N +757(that)X +903(transform)X +1241(one)X +1384(sequence)X +1706(into)X +1857(the)X +1982(other.)X +2214(One)X +2375(of)X +2469(the)X +2594(earliest)X +2853(algorithms)X +3222(is)X +3302(by)X +3409(Wagner)X +3691(&)X +3780(Fischer)X +4043([20])X +4204(and)X +576 4664(takes)N +774(O)X +838(\()X +871(N)X +7 s +933 4632(2)N +10 s +973 4664(\))N +1033(time)X +1208(and)X +1357(space)X +1569(to)X +1664(solve)X +1866(a)X +1935(generalization)X +2423(they)X +2594(call)X +2743(the)X +2874(string-to-string)X +3387(correction)X +3746(problem.)X +4086(A)X +4177(later)X +576 4792(re\256nement)N +939(by)X +1039(Hirschberg)X +1417([7])X +1532(delivers)X +1807(a)X +1864(longest)X +2116(common)X +2417(subsequence)X +2844(using)X +3038(only)X +3201(linear)X +3405(space.)X +3645(When)X +3858(algorithms)X +4221(are)X +576 4920(over)N +750(arbitrary)X +1058(alphabets,)X +1412(use)X +1550(``equal\320unequal'')X +2197(comparisons,)X +2653(and)X +2800(are)X +2930 0.3125(characterized)AX +3391(in)X +3484(terms)X +3693(of)X +3791(the)X +3920(size)X +4076(of)X +4173(their)X +576 5048(input,)N +788(it)X +860(has)X +995(been)X +1175(shown)X +1412(that)X +9 f +1560(W)X +1 f +1628(\()X +1661(N)X +7 s +1723 5016(2)N +10 s +1763 5048(\))N +1818(time)X +1988(is)X +2069(necessary)X +2410([1].)X +2572(A)X +2659(``Four)X +2893(Russians'')X +3260(approach)X +3584(leads)X +3778(to)X +3869(slightly)X +4137(better)X +576 5176(O)N +640(\()X +673(N)X +7 s +735 5144(2)N +10 s +775 5176(lglgN)N +963(/)X +991(lgN)X +1117(\))X +1191(and)X +1354(O)X +1418(\()X +1451(N)X +7 s +1513 5144(2)N +10 s +1553 5176(/)N +1581(lgN)X +1707(\))X +1781(time)X +1969(algorithms)X +2357(for)X +2497(arbitrary)X +2820(and)X +2982(\256nite)X +3192(alphabets)X +3541(respectively)X +3975([13].)X +4195(The)X +576 5304(existence)N +903(of)X +998(faster)X +1205(algorithms)X +1575(using)X +1776(other)X +1969(comparison)X +2371(formats)X +2644(is)X +2726(still)X +2874(open.)X +3099(Indeed,)X +3367(for)X +3490(algorithms)X +3861(that)X +4010(use)X +4146(``less)X +576 5432(than\320equal\320greater)N +1292(than'')X +1504(comparisons,)X +9 f +1949(W)X +1 f +2011(\(NlgN\))X +2263(time)X +2425(is)X +2498(the)X +2616(best)X +2765(lower)X +2968(bound)X +3188(known)X +3426([9].)X +8 s +10 f +576 5512(hhhhhhhhhhhhhhhhhh)N +9 f +576 5600(*)N +1 f +624(This)X +754(work)X +901(was)X +1016(supported)X +1284(in)X +1350(part)X +1465(by)X +1545(the)X +1639(National)X +1875(Science)X +2089(Foundation)X +2397(under)X +2558(Grant)X +2719(MCS82-10096.)X +10 s +2381 6176(-)N +2428(1)X +2488(-)X + +2 p +%%Page: 2 2 +10 s 10 xH 0 xS 1 f +696 704(Recent)N +940(work)X +1126(improves)X +1445(upon)X +1626(the)X +1746(basic)X +1933(O)X +1997(\()X +2030(N)X +7 s +2092 672(2)N +10 s +2132 704(\))N +2181(time)X +2345(arbitrary)X +2644(alphabet)X +2938(algorithm)X +3271(by)X +3373(being)X +3573(sensitive)X +3875(to)X +3959(other)X +4146(prob-)X +576 832(lem)N +723(size)X +875(parameters.)X 
Let the output parameter L be the length of a longest common subsequence and let the dual parameter D = 2(N-L) be the length of a shortest edit script. (It is assumed throughout this introduction that both strings have the same length N.) The two best output-sensitive algorithms are by Hirschberg [8] and take O(NL + N lg N) and O(DL lg N) time. An algorithm by Hunt & Szymanski [11] takes O((R+N) lg N) time where the parameter R is the total number of ordered pairs of positions at which the two input strings match. Note that all these algorithms are Ω(N^2) or worse in terms of N alone.

In practical situations, it is usually the parameter D that is small. Programmers wish to know how they have altered a text file. Biologists wish to know how one DNA strand has mutated into another. For these situations, an O(ND) time algorithm is superior to Hirschberg's algorithms because L is O(N) when D is small. Furthermore, the approach of Hunt and Szymanski [11] is predicated on the hypothesis that R is small in practice. While this is frequently true, it must be noted that R has no correlation with either the size of the input or the size of the output and can be O(N^2) in many situations. For example, if 10% of all lines in a file are blank and the file is compared against itself, R is greater than .01 N^2. For DNA molecules, the alphabet size is four, implying that R is at least .25 N^2 when an arbitrary molecule is compared against itself or a very similar molecule.

In this paper an O(ND) time algorithm is presented. Our algorithm is simple and based on an intuitive edit graph formalism. Unlike others it employs the "greedy" design paradigm and exposes the relationship of the longest common subsequence problem to the single-source shortest path problem. Another O(ND) algorithm has been presented elsewhere [16]. However, it uses a different design paradigm and does not share the following features. The algorithm can be refined to use only linear space, and its expected-case time behavior is shown to be O(N + D^2). Moreover, the method admits an O(N lg N + D^2) time worst-case variation. This is asymptotically superior to previous algorithms [8,16,20] when D is o(N).

With the exception of the O(N lg N + D^2) worst-case variation, the algorithms presented in this paper are practical. The basic O(ND) algorithm served as the basis for a new implementation of the UNIX diff program [15]. This version usually runs two to four times faster than the System 5 implementation based on the Hunt and Szymanski algorithm [10]. However, there are cases when D is large where their algorithm is superior (e.g. for files that are completely different, R = 0 and D = 2N). The linear space refinement is roughly twice as slow as the basic O(ND) algorithm but still competitive because it can perform extremely large compares that are out of the range of other algorithms. For instance, two 1.5 million byte sequences were compared in less than two minutes (on a VAX 785 running 4.2BSD UNIX) even though the difference was greater than 500.
2. Edit Graphs

Let A = a_1 a_2 ... a_N and B = b_1 b_2 ... b_M be sequences of length N and M respectively. The edit graph for A and B has a vertex at each point in the grid (x,y), x ∈ [0,N] and y ∈ [0,M]. The vertices of the edit graph are connected by horizontal, vertical, and diagonal directed edges to form a directed acyclic graph. Horizontal edges connect each vertex to its right neighbor, i.e. (x-1,y) -> (x,y) for x ∈ [1,N] and y ∈ [0,M]. Vertical edges connect each vertex to the neighbor below it, i.e. (x,y-1) -> (x,y) for x ∈ [0,N] and y ∈ [1,M]. If a_x = b_y then there is a diagonal edge connecting vertex (x-1,y-1) to vertex (x,y). The points (x,y) for which a_x = b_y are called match points. The total number of match points between A and B is the parameter R characterizing the Hunt & Szymanski algorithm [11]. It is also the number of diagonal edges in the edit graph, as diagonal edges are in one-to-one correspondence with match points. Figure 1 depicts the edit graph for the sequences A = abcabba and B = cbabac.

[Figure 1 (embedded PostScript omitted): the edit graph grid for A = abcabba (x = 0..7 across the top) and B = cbabac (y = 0..6 down the side), running from vertex (0,0) to vertex (7,6), with one path highlighted. Annotations: Path; Trace = (3,1) (4,3) (5,4) (7,5); Edit Script = 1D, 2D, 3IB, 6D, 7IC; Common Subsequence = CABA = A_3 A_4 A_5 A_7 = B_1 B_3 B_4 B_5.]

Fig. 1. An edit graph
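For concreteness, R can be made explicit by brute force. The following minimal C sketch (our own illustration, not part of the paper's algorithm) enumerates the match points of the Figure 1 sequences and counts them; each reported pair is one diagonal edge of Figure 1:

    #include <stdio.h>
    #include <string.h>

    /* Enumerate the match points (x,y) of A and B, 1-based as in the
     * text: (x,y) is a match point iff a_x == b_y.  Their total is R. */
    int main(void)
    {
        const char *A = "abcabba", *B = "cbabac";
        int N = (int)strlen(A), M = (int)strlen(B), R = 0;

        for (int x = 1; x <= N; x++)
            for (int y = 1; y <= M; y++)
                if (A[x - 1] == B[y - 1]) {
                    printf("(%d,%d) ", x, y);
                    R++;
                }
        printf("\nR = %d\n", R);   /* 14 for these two sequences */
        return 0;
    }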
A trace of length L is a sequence of L match points, (x_1,y_1) (x_2,y_2) ... (x_L,y_L), such that x_i < x_{i+1} and y_i < y_{i+1} for successive points (x_i,y_i) and (x_{i+1},y_{i+1}), i ∈ [1,L-1]. Every trace is in exact correspondence with the diagonal edges of a path in the edit graph from (0,0) to (N,M). The sequence of match points visited in traversing a path from start to finish is easily verified to be a trace. Note that L is the number of diagonal edges in the corresponding path. To construct a path from a trace, take the sequence of diagonal edges corresponding to the match points of the trace and connect successive diagonals with a series of horizontal and vertical edges. This can always be done as x_i < x_{i+1} and y_i < y_{i+1} for successive match points. Note that several paths differing only in their non-diagonal edges can correspond to a given trace. Figure 1 illustrates this relation between paths and traces.

A subsequence of a string is any string obtained by deleting zero or more symbols from the given string. A common subsequence of two strings, A and B, is a subsequence of both. Each trace gives rise to a common subsequence of A and B and vice versa. Specifically, a_{x_1} a_{x_2} ... a_{x_L} = b_{y_1} b_{y_2} ... b_{y_L} is a common subsequence of A and B if and only if (x_1,y_1) (x_2,y_2) ... (x_L,y_L) is a trace of A and B.
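In code the trace-to-subsequence direction is immediate; a small C sketch (the function name and 0-based C arrays are our conventions; x[] holds the first coordinates of the trace, which are 1-based as in the text):

    /* Read the common subsequence off a trace: out receives
     * a_{x[0]} a_{x[1]} ... a_{x[L-1]} and must hold L+1 bytes. */
    void trace_to_subsequence(const char *A, const int *x, int L, char *out)
    {
        for (int i = 0; i < L; i++)
            out[i] = A[x[i] - 1];   /* x[] is 1-based into A */
        out[L] = '\0';
    }

For the trace (3,1) (4,3) (5,4) (7,5) of Figure 1 this yields caba, matching the figure's common subsequence.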
An edit script for A and B is a set of insertion and deletion commands that transform A into B. The delete command "x D" deletes the symbol a_x from A. The insert command "x I b_1, b_2, ..., b_t" inserts the sequence of symbols b_1 ... b_t immediately after a_x. Script commands refer to symbol positions within A before any commands have been performed. One must think of the set of commands in a script as being executed simultaneously. The length of a script is the number of symbols inserted and deleted.

Every trace corresponds uniquely to an edit script. Let (x_1,y_1) (x_2,y_2) ... (x_L,y_L) be a trace. Let y_0 = 0 and y_{L+1} = M+1. The associated script consists of the commands: "x D" for x ∉ {x_1, x_2, ..., x_L}, and "x_k I b_{y_k+1}, ..., b_{y_{k+1}-1}" for k such that y_k + 1 < y_{k+1}. The script deletes N-L symbols and inserts M-L symbols. So for every trace of length L there is a corresponding script of length D = N+M-2L. To map an edit script to a trace, simply perform all delete commands on A, observe that the result is a common subsequence of A and B, and then map the subsequence to its unique trace. Note that inverting the action of the insert commands gives a set of delete commands that map B to the same common subsequence.
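This construction mechanizes directly. A C sketch (our own; the trace arrays are 1-based with x[0] = y[0] = 0 as sentinels, matching y_0 = 0 in the text, and a leading command "0 I ..." means insert before the first symbol of A):

    #include <stdio.h>

    /* Emit the edit script of a trace (x[1],y[1]) ... (x[L],y[L]):
     * "v D" for every position v of A not in {x_1,...,x_L}, and
     * "x_k I b_{y_k+1} ... b_{y_{k+1}-1}" for each gap in the y's,
     * taking y_{L+1} = M+1 as in the text. */
    void trace_to_script(const char *B, const int *x, const int *y,
                         int L, int N, int M)
    {
        for (int v = 1, i = 1; v <= N; v++) {
            if (i <= L && x[i] == v)
                i++;                      /* kept by the trace */
            else
                printf("%d D\n", v);      /* deleted */
        }
        for (int k = 0; k <= L; k++) {
            int next = (k == L) ? M + 1 : y[k + 1];
            if (y[k] + 1 < next) {
                printf("%d I ", x[k]);
                for (int j = y[k] + 1; j < next; j++)
                    putchar(B[j - 1]);    /* B is 0-based in C */
                putchar('\n');
            }
        }
    }

On the Figure 1 trace this prints 1 D, 2 D, 6 D, then 3 I b and 7 I c: the figure's script 1D, 2D, 3IB, 6D, 7IC up to ordering, which is immaterial since the commands execute simultaneously.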
Common subsequences, edit scripts, traces, and paths from (0,0) to (N,M) in the edit graph are all isomorphic formalisms. The edges of every path have the following direct interpretations in terms of the corresponding common subsequence and edit script. Each diagonal edge ending at (x,y) gives a symbol, a_x (= b_y), in the common subsequence; each horizontal edge to point (x,y) corresponds to the delete command "x D"; and a sequence of vertical edges from (x,y) to (x,z) corresponds to the insert command "x I b_{y+1}, ..., b_z". Thus the number of vertical and horizontal edges in the path is the length of its corresponding script, the number of diagonal edges is the length of its corresponding subsequence, and the total number of edges is N+M-L. Figure 1 illustrates these observations.
The problem of finding a longest common subsequence (LCS) is equivalent to finding a path from (0,0) to (N,M) with the maximum number of diagonal edges. The problem of finding a shortest edit script (SES) is equivalent to finding a path from (0,0) to (N,M) with the minimum number of non-diagonal edges. These are dual problems as a path with the maximum number of diagonal edges has the minimal number of non-diagonal edges (D+2L = M+N). Consider adding a weight or cost to every edge. Give diagonal edges weight 0 and non-diagonal edges weight 1. The LCS/SES problem is equivalent to finding a minimum-cost path from (0,0) to (N,M) in the weighted edit graph and is thus a special instance of the single-source shortest path problem.

3. An O((M+N)D) Greedy Algorithm

The problem of finding a shortest edit script reduces to finding a path from (0,0) to (N,M) with the fewest horizontal and vertical edges. Let a D-path be a path starting at (0,0) that has exactly D non-diagonal edges. A 0-path must consist solely of diagonal edges. By a simple induction, it follows that a D-path must consist of a (D-1)-path followed by a non-diagonal edge and then a possibly empty sequence of diagonal edges called a snake.

Number the diagonals in the grid of edit graph vertices so that diagonal k consists of the points (x,y) for which x - y = k. With this definition the diagonals are numbered from -M to N. Note that a vertical (horizontal) edge with start point on diagonal k has end point on diagonal k-1 (k+1) and a snake remains on the diagonal in which it starts.
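Since a snake stays on its diagonal, following one is a single forward scan; a minimal C sketch (0-based C strings, so a[x] plays the role of a_{x+1}; the helper name is ours):

    /* Slide from (x, y) along diagonal k = x - y while symbols match.
     * Returns the new x; the snake's end point is (x, x - k). */
    int follow_snake(const char *a, const char *b, int N, int M, int x, int y)
    {
        while (x < N && y < M && a[x] == b[y]) {
            x++;
            y++;
        }
        return x;
    }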
Note that a vertical (horizontal) edge with start point on diagonal k has end point on diagonal k−1 (k+1) and a snake remains on the diagonal in which it starts.

Lemma 1: A D-path must end on diagonal k ∈ { −D, −D+2, ..., D−2, D }.

Proof:

A 0-path consists solely of diagonal edges and starts on diagonal 0. Hence it must end on diagonal 0. Assume inductively that a D-path must end on diagonal k in { −D, −D+2, ..., D−2, D }. Every (D+1)-path consists of a prefix D-path, ending on say diagonal k, a non-diagonal edge ending on diagonal k+1 or k−1, and a snake that must also end on diagonal k+1 or k−1. It then follows that every (D+1)-path must end on a diagonal in { (−D)±1, (−D+2)±1, ..., (D−2)±1, (D)±1 } = { −D−1, −D+1, ..., D−1, D+1 }. Thus the result holds by induction. ∎

The lemma implies that D-paths end solely on odd diagonals when D is odd and even diagonals when D is even.
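For example, a 2-path can only end on one of the even diagonals −2, 0, or 2, and a 3-path only on one of the odd diagonals −3, −1, 1, or 3.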
A D-path is furthest reaching in diagonal k if and only if it is one of the D-paths ending on diagonal k whose end point has the greatest possible row (column) number of all such paths. Informally, of all D-paths ending in diagonal k, it ends furthest from the origin, (0,0). The following lemma gives an inductive characterization of furthest reaching D-paths and embodies a greedy principle: furthest reaching D-paths are obtained by greedily extending furthest reaching (D−1)-paths.

Lemma 2: A furthest reaching 0-path ends at (x,x), where x is min { z−1 | a_z ≠ b_z or z > M or z > N }. A furthest reaching D-path on diagonal k can without loss of generality be decomposed into a furthest reaching (D−1)-path on diagonal k−1, followed by a horizontal edge, followed by the longest possible snake, or it may be decomposed into a furthest reaching (D−1)-path on diagonal k+1, followed by a vertical edge, followed by the longest possible snake.
Proof:

The basis for 0-paths is straightforward. As noted before, a D-path consists of a (D−1)-path, a non-diagonal edge, and a snake. If the D-path ends on diagonal k, it follows that the (D−1)-path must end on diagonal k±1 depending on whether a vertical or horizontal edge precedes the final snake. The final snake must be maximal, as the D-path would not be furthest reaching if the snake could be extended. Suppose that the (D−1)-path is not furthest reaching in its diagonal. But then a further reaching (D−1)-path can be connected to the final snake of the D-path with an appropriate non-diagonal move. Thus the D-path can always be decomposed as desired. ∎

Given the endpoints of the furthest reaching (D−1)-paths in diagonal k+1 and k−1, say (x′,y′) and (x″,y″) respectively, Lemma 2 gives a procedure for computing the endpoint of the furthest reaching D-path in diagonal k. Namely, take the further reaching of (x′,y′+1) and (x″+1,y″) in diagonal k and then follow diagonal edges until it is no longer possible to do so or until the boundary of the edit graph is reached. Furthermore, by Lemma 1 there are only D+1 diagonals in which a D-path can end. This suggests computing the endpoints of D-paths in the relevant D+1 diagonals for successively increasing values of D until the furthest reaching path in diagonal N−M reaches (N,M):

    For D ← 0 to M+N Do
        For k ← −D to D in steps of 2 Do
            Find the endpoint of the furthest reaching D-path in diagonal k.
            If (N,M) is the endpoint Then
                The D-path is an optimal solution.
                Stop

The outline above stops when the smallest D is encountered for which there is a furthest reaching D-path to (N,M). This must happen before the outer loop terminates because D must be less than or equal to M+N.
By construction this path must be minimal with respect to the number of non-diagonal edges within it. Hence it is a solution to the LCS/SES problem.

In presenting the detailed algorithm in Figure 2 below, a number of simple optimizations are employed. An array, V, contains the endpoints of the furthest reaching D-paths in elements V[−D], V[−D+2], ..., V[D−2], V[D]. By Lemma 1 this set of elements is disjoint from those where the endpoints of the (D+1)-paths will be stored in the next iteration of the outer loop. Thus the array V can simultaneously hold the endpoints of the D-paths while the (D+1)-path endpoints are being computed from them. Furthermore, to record an endpoint (x,y) in diagonal k it suffices to retain just x because y is known to be x−k. Consequently, V is an array of integers where V[k] contains the row index of the endpoint of a furthest reaching path in diagonal k.
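One practical note for C implementations (an illustrative fragment with names of our own choosing, not part of the paper): the negative diagonal subscripts are conveniently handled by offsetting a pointer into the middle of the allocation, after which V[1] = 0 records the fictitious endpoint (0,−1) discussed below.

    enum { N = 7, M = 6, MAX = N + M };  /* example sizes; MAX as in Figure 2 */
    static int buf[2 * MAX + 3];         /* one slot per diagonal -MAX-1 .. MAX+1 */
    static int *V = buf + MAX + 1;       /* V[k] is now legal for negative k */
    /* statics are zero-initialized, so V[1] = 0 already holds: the
     * fictitious endpoint (0,-1) that Line 1 of Figure 2 sets up */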
    Constant MAX ∈ [0,M+N]
    Var V: Array [−MAX .. MAX] of Integer

    1.   V[1] ← 0
    2.   For D ← 0 to MAX Do
    3.       For k ← −D to D in steps of 2 Do
    4.           If k = −D or k ≠ D and V[k−1] < V[k+1] Then
    5.               x ← V[k+1]
    6.           Else
    7.               x ← V[k−1]+1
    8.           y ← x − k
    9.           While x < N and y < M and a_{x+1} = b_{y+1} Do (x,y) ← (x+1,y+1)
    10.          V[k] ← x
    11.          If x ≥ N and y ≥ M Then
    12.              Length of an SES is D
    13.              Stop
    14.  Length of an SES is greater than MAX

    FIGURE 2: The Greedy LCS/SES Algorithm

As a practical matter the algorithm searches D-paths where D ≤ MAX, and if no such path reaches (N,M) then it reports in Line 14 that any edit script for A and B must be longer than MAX. By setting the constant MAX to M+N as in the outline above, the algorithm is guaranteed to find the length of the LCS/SES. Figure 3 illustrates the D-paths searched when the algorithm is applied to the example of Figure 1. Note that a fictitious endpoint, (0,−1), set up in Line 1 of the algorithm is used to find the endpoint of the furthest reaching 0-path. Also note that D-paths extend off the left and lower boundaries of the edit graph proper as the algorithm progresses. This boundary situation is correctly handled by assuming that there are no diagonal edges in this region.
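Figure 2 translates almost line for line into C. The following is a minimal sketch under stated assumptions: 0-based C arrays a and b of lengths n and m (so the paper's a_{x+1} = b_{y+1} becomes a[x] == b[y]), a caller-chosen threshold max ≥ 0, and a function name, ses_length, that is ours rather than the paper's; error handling for calloc is elided.

    #include <stdlib.h>

    /* Return the length D of a shortest edit script turning a into b,
     * or -1 if every script is longer than max (Line 14). */
    int ses_length(const char *a, int n, const char *b, int m, int max)
    {
        int *buf = calloc(2 * max + 3, sizeof *buf); /* error handling elided */
        int *v = buf + max + 1;     /* diagonals -max-1 .. max+1; see the
                                     * pointer-offset note above */
        int d, k, x, y, ses = -1;

        /* calloc zeroes buf, so v[1] = 0 already records the fictitious
         * endpoint (0,-1) that Line 1 sets up. */
        for (d = 0; d <= max && ses < 0; d++)           /* Line 2 */
            for (k = -d; k <= d; k += 2) {              /* Line 3 */
                if (k == -d || (k != d && v[k - 1] < v[k + 1]))
                    x = v[k + 1];                       /* Lines 4-5: step down  */
                else
                    x = v[k - 1] + 1;                   /* Lines 6-7: step right */
                y = x - k;                              /* Line 8 */
                while (x < n && y < m && a[x] == b[y])  /* Line 9: follow snake */
                    x++, y++;
                v[k] = x;                               /* Line 10 */
                if (x >= n && y >= m) {                 /* Lines 11-13 */
                    ses = d;
                    break;
                }
            }
        free(buf);
        return ses;
    }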
[Figure 3 appears here in the original: an edit graph for the example of Figure 1, titled ``Envelope of D-Path Endpoints''. It plots the furthest reaching path extensions from (0,0) out to (7,6), with contour labels D=0 through D=5, the diagonals numbered from −5 to 4, and a legend distinguishing even extensions, odd extensions, diagonals, and the edit graph boundary.]

Fig. 3. Furthest reaching paths.
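As a sanity check of the C sketch given after Figure 2 (function name ours): on the sequences traced in Figure 3, A = abcabba and B = cbabac, the call ses_length("abcabba", 7, "cbabac", 6, 13) should return 5, matching the D = 5 contour that reaches (7,6).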
The greedy algorithm takes at most O((M+N)D) time. Lines 1 and 14 consume O(1) time. The inner For loop (Line 3) is repeated at most (D+1)(D+2)/2 times because the outer For loop (Line 2) is repeated D+1 times and during its kth iteration the inner loop is repeated at most k times. All the lines within this inner loop take constant time except for the While loop (Line 9). Thus O(D²) time is spent executing Lines 2-8 and 10-13. The While loop is iterated once for each diagonal traversed in the extension of furthest reaching paths. But at most O((M+N)D) diagonals are traversed since all D-paths lie between diagonals −D and D and there are at most (2D+1)min(N,M) points within this band. Thus the algorithm requires a total of O((M+N)D) time. Note that just Line 9, the traversal of snakes, is the limiting step. The rest of the algorithm is O(D²). Furthermore, the algorithm never takes more than O((M+N)MAX) time in the practical case where the threshold MAX is set to a value much less than M+N.

The search of the greedy algorithm traces the optimal D-paths among others. But only the current set of furthest reaching endpoints is retained in V. Consequently, only the length of an SES/LCS can be reported in Line 12. To explicitly generate a solution path, O(D²) space* is used to store a copy of V after each iteration of the outer loop.
Let V_d be the copy of V kept after the dth iteration. To list an optimal path from (0,0) to the point V_d[k], first determine whether it is at the end of a maximal snake following a vertical edge from V_{d−1}[k+1] or a horizontal edge from V_{d−1}[k−1]. To be concrete, suppose it is V_{d−1}[k−1]. Recursively list an optimal path from (0,0) to this point and then list the horizontal edge and maximal snake to V_d[k]. The recursion stops when d = 0, in which case the snake from (0,0) to (V_0[0],V_0[0]) is listed. So with O(M+N) additional time and O(D²) space an optimal path can be listed by replacing Line 12 with a call to this recursive procedure with V_D[N−M] as the initial point. A refinement requiring only O(M+N) space is shown in the next section.
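The recursive listing procedure just described renders directly in C. The sketch below uses our own naming, assumes vd[d] points at the (pointer-offset) copy of V saved after the dth iteration, and prints only the snake end points along the path, from which the edit commands can be read off.

    #include <stdio.h>

    /* Print the end point (x,y) of each snake along an optimal path
     * finishing at the point recorded in vd[d][k]. */
    void list_path(int *const *vd, int d, int k)
    {
        int x = vd[d][k], y = x - k;

        if (d > 0) {
            /* The test of Line 4 tells whether diagonal k was entered by
             * a vertical edge from diagonal k+1 or a horizontal edge from
             * diagonal k-1 during the dth iteration. */
            if (k == -d || (k != d && vd[d - 1][k - 1] < vd[d - 1][k + 1]))
                list_path(vd, d - 1, k + 1);
            else
                list_path(vd, d - 1, k - 1);
        }
        printf("(%d,%d)\n", x, y);
    }

Replacing Line 12 with the call list_path(vd, D, N-M) then prints the snake end points of one optimal path in order.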
As noted in Section 2, the LCS/SES problem can be viewed as an instance of the single-source shortest paths problem on a weighted edit graph. This suggests that an efficient algorithm can be obtained by specializing Dijkstra's algorithm [3]. A basic exercise [2: 207-208] shows that the algorithm takes O(E lg V) time where E is the number of edges and V is the number of vertices in the subject graph. For an edit graph E < 3V since each point has outdegree at most three. Moreover, the lg V term comes from the cost of managing a priority queue. In the case at hand the priorities will be integers in [0,M+N] as edge costs are 0 or 1 and the longest possible path to any point is M+N. Under these conditions, the priority queue operations can be implemented in constant time using ``bucketing'' and linked-list techniques. Thus Dijkstra's algorithm can be specialized to perform in time linear in the number of vertices in the edit graph, i.e. O(MN). The final refinement stems from noting that all that is needed is the shortest path from the source (0,0) to the point (M,N). Dijkstra's algorithm determines the minimum distances of vertices from the source in increasing order, one vertex per iteration. By Lemma 1 there are at most O((M+N)D) points less distant from (0,0) than (M,N), and the previous refinements reduce the cost of each iteration to O(1). Thus the algorithm can stop as soon as the minimum distance to (M,N) is ascertained, and it only spends O((M+N)D) time in so doing.
It has been shown that a specialization of Dijkstra's algorithm also gives an O(ND) time algorithm for the LCS/SES problem. However, the resulting algorithm involves a relatively complex discrete priority queue, and this queue may contain as many as O(ND) entries even in the case where just the length of the LCS/SES is being computed. While one could argue that further refinement leads to the simple algorithm of this paper, the connection becomes so tenuous that the direct and easily motivated derivation used in this section is preferable. The aim of the discussion is to expose the close relationship between the shortest paths and LCS/SES problems and their algorithms.

* If only O(D²) space is to be allocated, the algorithm is first run to determine D in O(N) space, then the space is allocated, and finally, the algorithm is run again to determine a solution path.

4. Refinements

The basic algorithm can be embellished in a number of ways. First, the algorithm's expected performance is O(M+N+D²), which is much superior to the worst case prediction of O((M+N)D). While not shown here, experiments reveal that the variance about the mean is small, especially as the alphabet size becomes large. Thus while there are pathological cases that require O((M+N)D) time they are extremely rare (e.g. like O(N²) problems for quicksort). Second, the algorithm can be refined to use only linear space when reporting an edit script. The only other algorithm that has been shown to admit such a refinement is the basic O(MN) dynamic programming algorithm [7].
A linear space algorithm is of practical import since many large problems can reasonably be solved in O(D²) time but not in O(D²) space. Finally, an O((M+N)lg(M+N)+D²) worst-case time variation is obtained by speeding up the traversal of snakes with some previously developed techniques [6,14]. The variation is impractical due to the sophistication of these underlying methods, but its superior asymptotic worst-case complexity is of theoretical interest.

4a. A Probabilistic Analysis

Consider the following stochastic model for the sequences A and B in a shortest edit script problem. A and B are sequences over an alphabet Σ where each symbol occurs with probability p_σ for σ ∈ Σ. The N symbols of A are randomly and independently chosen according to the probability densities p_σ. The M = N−δ+ι symbol sequence B is obtained by randomly deleting δ symbols from A and randomly inserting ι symbols. The deletion and insertion positions are chosen with uniform probability. An equivalent model is to generate a random sequence of length L = N−δ and then randomly insert δ and ι randomly generated symbols into this sequence to produce A and B, respectively. Note that the LCS of A and B must consist of at least L symbols but may be longer.
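For a concrete instance of this model: deleting δ = 3 of the N = 7 symbols of A and inserting ι = 2 fresh ones gives M = N−δ+ι = 7−3+2 = 6 and a planted common subsequence of length L = N−δ = 4, which matches the dimensions of the running example of Figures 1 and 3.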
An alternate model is to consider A and B as randomly generated sequences of length N and M which are constrained to have an LCS of length L. This model is not equivalent to the one above except in the limit when the size of Σ becomes arbitrarily large and every probability p_σ goes to zero. Nonetheless, the ensuing treatment can also be applied to this model with the same asymptotic results. The first model is chosen as it reflects the edit scripts for mapping A into B that are assumed by the SES problem. While other edit script commands such as ``transfers'', ``moves'', and ``exchanges'' are more reflective of actual editing sessions, their inclusion results in distinct optimization problems from the SES problem discussed here. Hence stochastic models based on such edit processes are not considered.

In the edit graph of A and B there are L diagonal edges corresponding to the randomly generated LCS of A and B. Any other diagonal edge, ending at say (x,y), occurs with the same probability that a_x = b_y, as these symbols were obtained by independent random trials. Thus the probability of an off-LCS diagonal is ρ = Σ_{σ∈Σ} p_σ². The SES algorithm searches by extending furthest reaching paths until the point (N,M) is reached. Each extension consists of a horizontal or vertical edge followed by the longest possible snake. The maximal snakes consist of a number of LCS and off-LCS diagonals. The probability that there are exactly t off-LCS diagonals in a given extension's snake is ρ^t(1−ρ).
Thus the expected number of off-LCS diagonals in an extension is Σ_{t=0}^{∞} t ρ^t (1−ρ) = ρ/(1−ρ). At most d+1 extensions are made in the dth iteration of the outer For loop of the SES algorithm. Therefore at most (D+1)(D+2)ρ / 2(1−ρ) off-LCS diagonals are traversed in the expected case. Moreover, at most L LCS diagonals are ever traversed. Consequently, the critical While loop of the algorithm is executed an average of O(L+D²) times when ρ is bounded away from 1. The remainder of the algorithm has already been observed to take at worst O(D²) time. When ρ = 1, there is only one letter of nonzero probability in the alphabet Σ, so A and B consist of repetitions of that letter with probability one. In this case, the algorithm runs in O(M+N) time. Thus the SES algorithm takes O(M+N+D²) time in the expected case.
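The closed form used above is the mean of a geometric distribution: for 0 ≤ ρ < 1, Σ_{t≥0} ρ^t = 1/(1−ρ); differentiating with respect to ρ gives Σ_{t≥0} t ρ^{t−1} = 1/(1−ρ)², and multiplying by ρ(1−ρ) yields Σ_{t≥0} t ρ^t (1−ρ) = ρ/(1−ρ).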
as paths in this reverse edit graph, but now the paths start at (N,M) and end at (0,0). Also, the interpretation of paths alters just slightly to reflect the reversal of direction. Each diagonal edge beginning at (x,y) gives a symbol, a_x (= b_y), in the common subsequence; each horizontal edge from point (x,y) corresponds to the delete command ``x D''; etc. So the LCS/SES problem can be solved by starting at (N,M) and progressively extending furthest reaching paths in the reverse edit graph until one reaches (0,0). Hereafter, forward paths will refer to those in the edit graph and reverse paths will refer to those in the reverse edit graph. Since paths in opposing directions are in exact correspondence, the direction of a path is distinguished only when it is of operational importance.

As in the linear space algorithm of Hirschberg [7], a divide-and-conquer strategy is employed. A D-path has D+1 snakes, some of which may be empty. The divide step requires finding the (⌈D/2⌉+1)-st or middle snake of an optimal D-path. The idea for doing so is to simultaneously run the basic algorithm in both the forward and reverse directions until furthest reaching forward and reverse paths starting at opposing corners ``overlap''. Lemma 3 provides the formal observation underlying this approach.
Lemma 3: There is a D-path from (0,0) to (N,M) if and only if there is a ⌈D/2⌉-path from (0,0) to some point (x,y) and a ⌊D/2⌋-path from some point (u,v) to (N,M) such that:

    (feasibility)  u+v ≥ ⌈D/2⌉  and  x+y ≤ N+M − ⌊D/2⌋,  and
    (overlap)      x−y = u−v  and  x ≥ u.

Moreover, both D/2-paths are contained within D-paths from (0,0) to (N,M).

Proof:

Suppose there is a D-path from (0,0) to (N,M). It can be partitioned at the start, (x,y), of its middle snake into a ⌈D/2⌉-path from (0,0) to (x,y) and a ⌊D/2⌋-path from (u,v) to (N,M) where (u,v) = (x,y). A path from (0,0) to (u,v) can have at most u+v non-diagonal edges, and there is a ⌈D/2⌉-path to (u,v), implying that u+v ≥ ⌈D/2⌉. A path from (x,y) to (N,M) can have at most (N+M) − (x+y) non-diagonal edges, and there is a ⌊D/2⌋-path to (x,y), implying that x+y ≤ N+M − ⌊D/2⌋. Finally, u−v = x−y and u ≤ x as (x,y) = (u,v).

Conversely, suppose the ⌈D/2⌉- and ⌊D/2⌋-paths exist. But u ≤ x implies there is a k-path from (0,0) to (u,v) where k ≤ ⌈D/2⌉.
By Lemma 1, Δ = ⌈D/2⌉ − k is a multiple of 2, as both the k-path and the ⌈D/2⌉-path end in the same diagonal. Moreover, the k-path has (u+v−k)/2 ≥ Δ/2 diagonals, since u+v ≥ ⌈D/2⌉ (feasibility). By replacing each of Δ/2 of the diagonals in the k-path with a pair of horizontal and vertical edges, a ⌈D/2⌉-path from (0,0) to (u,v) is obtained. But then there is a D-path from (0,0) to (N,M) consisting of this ⌈D/2⌉-path to (u,v) and the given ⌊D/2⌋-path from (u,v) to (N,M). Note that the ⌊D/2⌋-path is part of this D-path. By a symmetric argument the ⌈D/2⌉-path is also part of a D-path from (0,0) to (N,M). ∎

The outline below gives the procedure for finding the middle snake of an optimal path. For successive values of D, compute the endpoints of the furthest reaching forward D-paths from (0,0) and then compute the furthest reaching reverse D-paths from (N,M). Do so in V vectors, one for each direction, as in the basic algorithm. As each endpoint is computed, check to see if it overlaps with the path in the same diagonal but opposite direction.
A check is needed to ensure that there is an opposing path in the given diagonal, because forward paths are in diagonals centered about 0 and reverse paths are in diagonals centered around Δ = N−M. Moreover, by Lemma 1, the optimal edit script length is odd or even as Δ is odd or even. Thus when Δ is odd, check for overlap only while extending forward paths, and when Δ is even, check for overlap only while extending reverse paths. As soon as a pair of opposing and furthest reaching paths overlap, stop and report the overlapping snake as the middle snake of an optimal path. Note that the endpoints of this snake can be readily delivered, as the snake was just computed in the previous step.

    Δ ← N−M
    For D ← 0 to ⌈(M+N)/2⌉ Do
        For k ← −D to D in steps of 2 Do
            Find the end of the furthest reaching forward D-path in diagonal k.
            If Δ is odd and k ∈ [Δ−(D−1), Δ+(D−1)] Then
                If the path overlaps the furthest reaching reverse (D−1)-path in diagonal k Then
                    Length of an SES is 2D−1.
                    The last snake of the forward path is the middle snake.
        For k ← −D to D in steps of 2 Do
            Find the end of the furthest reaching reverse D-path in diagonal k+Δ.
            If Δ is even and k+Δ ∈ [−D, D] Then
                If the path overlaps the furthest reaching forward D-path in diagonal k+Δ Then
                    Length of an SES is 2D.
                    The last snake of the reverse path is the middle snake.
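The outline translates almost line for line into C. The sketch below is an editorial illustration only: it appears in neither the paper nor in wiggle, and the names, the char-array view of the sequences, and the buffer conventions are assumptions of the sketch. The two V vectors fd and bd must point into the middle of scratch buffers of at least 2(N+M)+3 ints each (offset by N+M+1) so that negative diagonal indices are valid.

    /* Find the middle snake of an optimal path from (0,0) to (N,M)
     * for 0-indexed sequences A and B.  Returns the SES length and
     * stores the snake endpoints: (*mx,*my) -> (*mu,*mv).
     */
    static int middle_snake(const char *A, int N, const char *B, int M,
                            int *fd, int *bd,
                            int *mx, int *my, int *mu, int *mv)
    {
        int delta = N - M;
        int odd = (delta % 2) != 0;
        int dmax = (N + M + 1) / 2;     /* ceiling((M+N)/2) */
        int D, k;

        fd[1] = 0;                      /* seed for the forward 0-path */
        bd[delta - 1] = N;              /* seed for the reverse 0-path */
        for (D = 0; D <= dmax; D++) {
            /* furthest reaching forward D-paths, diagonals -D..D */
            for (k = -D; k <= D; k += 2) {
                int x, y, x0, y0;
                if (k == -D || (k != D && fd[k-1] < fd[k+1]))
                    x = fd[k+1];        /* step down from diagonal k+1 */
                else
                    x = fd[k-1] + 1;    /* step right from diagonal k-1 */
                y = x - k;
                x0 = x; y0 = y;
                while (x < N && y < M && A[x] == B[y])
                    x++, y++;           /* follow the snake */
                fd[k] = x;
                if (odd && k >= delta-(D-1) && k <= delta+(D-1) &&
                    fd[k] >= bd[k]) {   /* overlaps a reverse (D-1)-path */
                    *mx = x0; *my = y0; *mu = x; *mv = y;
                    return 2*D - 1;
                }
            }
            /* furthest reaching reverse D-paths, diagonals delta-D..delta+D */
            for (k = delta-D; k <= delta+D; k += 2) {
                int x, y, x0, y0;
                if (k == delta+D || (k != delta-D && bd[k-1] < bd[k+1]))
                    x = bd[k-1];        /* step up from diagonal k-1 */
                else
                    x = bd[k+1] - 1;    /* step left from diagonal k+1 */
                y = x - k;
                x0 = x; y0 = y;
                while (x > 0 && y > 0 && A[x-1] == B[y-1])
                    x--, y--;           /* follow the snake backwards */
                bd[k] = x;
                if (!odd && k >= -D && k <= D &&
                    bd[k] <= fd[k]) {   /* overlaps a forward D-path */
                    *mx = x; *my = y; *mu = x0; *mv = y0;
                    return 2*D;
                }
            }
        }
        return -1;                      /* unreachable for valid input */
    }

Exactly as the text stipulates, only the two V vectors persist between iterations; everything else is per-iteration state, which is what makes the linear space bound possible.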
The correctness of this procedure relies heavily on Lemma 3. Without loss of generality suppose Δ is even. The algorithm stops as soon as the smallest D is encountered for which furthest reaching D-paths in opposite directions overlap. First, the overlapping paths must be shown to satisfy the feasibility condition of Lemma 3. Suppose the reverse furthest reaching path ends at (u,v) where u+v = k. There is always a k-path of non-diagonal edges from (0,0) to (u,v) which, when combined with the reverse D-path, forms a (k+D)-path from (0,0) to (N,M). This path and Lemma 3 imply there are overlapping h-paths where h = (k+D)/2 (k+D is divisible by 2 as Δ is even). So certainly there are overlapping furthest reaching h-or-less paths. If k < D, then h < D, contradicting the fact that furthest reaching D-paths are the first to overlap. So u+v ≥ D as desired. A similar argument shows that the furthest reaching forward D-path also satisfies the feasibility constraint of Lemma 3. Now the feasible, overlapping D-paths and Lemma 3 imply that there is a solution path of length 2D. This must be optimal, for if there were a 2k-path with k < D then, by Lemma 3, furthest reaching k-paths in opposite directions would already have overlapped, contradicting the minimality of D. [...]
Consider the case where N > 0 and M > 0 and D ≤ 1. If D ≤ 1 then B is obtained from A by either deleting or inserting at most one symbol. But then it follows that the shorter of A and B is the LCS and should be listed.

    LCS(A,N,B,M)
        If N > 0 and M > 0 Then
            Find the middle snake and length of an optimal path for A and B.
            Suppose it is from (x,y) to (u,v).
            If D > 1 Then
                LCS(A[1..x],x,B[1..y],y)
                Output A[x+1..u].
                LCS(A[u+1..N],N−u,B[v+1..M],M−v)
            Else If M > N Then
                Output A[1..N].
            Else
                Output B[1..M].

Let T(P,D) be the time taken by the algorithm, where P is N+M. It follows that T satisfies the recurrence inequality:

    T(P,D) ≤ { βP                                     if D ≤ 1
             { αPD + T(P₁,⌈D/2⌉) + T(P₂,⌊D/2⌋)        if D > 1

where P₁ + P₂ ≤ P and α and β are suitably large constants. Noting that ⌈D/2⌉ ≤ 2D/3 for D ≥ 2, a straightforward induction argument shows that T(P,D) ≤ 3αPD + βP.
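(The induction step, filled in here for the reader; the paper leaves it implicit. Assuming the bound for both subproblems and using ⌊D/2⌋ ≤ ⌈D/2⌉ ≤ 2D/3 for D ≥ 2 and P₁ + P₂ ≤ P:)

$$T(P,D) \;\le\; \alpha PD + 3\alpha P_1\lceil D/2\rceil + 3\alpha P_2\lfloor D/2\rfloor + \beta(P_1+P_2) \;\le\; \alpha PD + 3\alpha P\cdot\tfrac{2D}{3} + \beta P \;=\; 3\alpha PD + \beta P.$$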
Thus the divide-and-conquer algorithm still takes just O((M+N)D) time despite the ⌈lg D⌉ levels of recursion through which it descends. Furthermore, the algorithm only requires O(D) working storage. The middle snake procedure requires two O(D)-space V vectors. But this step is completed before engaging in the recursion. Thus only one pair of global V vectors is shared by all invocations of the procedure. Moreover, only O(lg D) levels of recursion are traversed, implying that only O(lg D) storage is needed on the recursion stack. Unfortunately, the input sequences A and B must be kept in memory, implying that a total of O(M+N) space is needed.

4c. An O((M+N)lg(M+N) + D²) Worst-Case Variation

The final topic involves two previous results, each of which is just sketched here. First, suffix trees [12,14] are used to efficiently record the common sublists of the sequences being compared. The term sublist is used as opposed to subsequence to emphasize that the symbols must be contiguous. Second, a recent RAM-based algorithm for answering Q on-line queries for the lowest common ancestors of vertices in a fixed V-vertex tree takes O(V+Q) time [6]. The efficient variation centers on quickly finding the length or endpoint of a maximal snake starting at point (x,y). This is shown to reduce to finding the lowest common ancestor of two leaves in a suffix tree. This can be done in O((M+N)lg(M+N)) pre-processing time and O(1) time per query using the two techniques above. The ensuing paragraphs embellish these ideas.
A suffix or Patricia tree [12,14] for a sequence S of length L has edges labelled with sublists of S, has L leaves labelled with the positions of S, and satisfies the following three properties.

1. Concatenating the edge labels traversed on the path from the root to the leaf for position j gives the suffix, S[j..L], of S starting at j. Thus every path within the tree denotes a sublist of S.

2. Every interior vertex has out-degree greater than one.

3. The labels of the out-edges of every vertex begin with distinct symbols.
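As a small illustration (added here; the paper itself gives no example): for S = abab$ the suffixes are abab$, bab$, ab$, b$ and $. The root has out-edges beginning with a, b and $; the a-edge spells ab and leads to an interior vertex whose out-edges ab$ and $ reach the leaves for positions 1 and 3; the b-edge leads to a vertex for b whose out-edges ab$ and $ reach the leaves for positions 2 and 4; and the $-edge reaches the leaf for position 5.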
These properties can only be satisfied if the last symbol of S is distinct from every other symbol in S. This condition is usually met by appending a special symbol to the target sequence, and once satisfied, the suffix tree is unique. Property 2 guarantees that there are fewer than L interior vertices. Moreover, the substrings labelling edges can be represented by just storing indices to their first and last characters in S. Thus suffix trees can be stored in O(L) space. The efficient construction of suffix trees is beyond the scope of this paper. The reader is referred to a paper by McCreight [14] giving an algorithm that constructs a suffix tree in O(L) steps. Most of the steps are easily done in O(1) time, but some require selecting an out-edge based on its first symbol. When the alphabet is finite, the out-degree of vertices is finite and the selection takes O(1) time. When the alphabet is unrestricted, height-balanced trees or some other worst-case efficient search structure permits selection in O(lg L) time. Thus suffix tree construction takes O(L) time for finite alphabets and O(L lg L) time otherwise.

Consider the two paths from the root of S's suffix tree to leaves i and j. Each path from the root to a common ancestor of i and j denotes a common prefix of the suffixes S[i..L] and S[j..L]. From Property 3 it follows that the path to the lowest common ancestor of i and j denotes the longest prefix of their respective suffixes.
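Continuing the abab$ illustration above (again an added example): the lowest common ancestor of the leaves for positions 1 and 3 is the interior vertex spelling ab, and ab is indeed the longest common prefix of the suffixes abab$ and ab$.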
This observation motivates the following suffix tree characterization of the maximal snake starting at point (x,y) in the edit graph of A and B, of lengths N and M respectively. Form the position tree for the sequence S = A.$₁.B.$₂ where the symbols $₁ and $₂ are not equal to each other or any symbol in A or B. The maximal snake starting at (x,y) is denoted by the path from the root of S's suffix tree to the lowest common ancestor of positions x and y+N+1. This follows because neither $₁ nor $₂ can be a part of this longest common prefix for the suffixes A[x..N].$₁.B.$₂ and B[y..M].$₂. So to find the endpoint of a snake starting at (x,y), find the lowest common ancestor of leaves x and y+N+1 in the suffix tree and return (x+m, y+m) where m is the length of the sublist denoted by the path to this ancestor. In a linear preprocessing pass the sublist lengths to every vertex are computed and the auxiliary structures needed for the O(V+Q) lowest common ancestor algorithm of Harel and Tarjan [6] are constructed. This RAM-based algorithm requires O(V) preprocessing time but can then answer each on-line query in O(1) time. Thus with O((M+N)lg(M+N)) preprocessing time (building the suffix tree is the dominant cost), a collection of on-line queries for the endpoints of maximal snakes can be answered in O(1) time per query.

Modify the basic algorithm of Section 3 by (a) prefacing it with the preprocessing needed for the maximal snake queries and (b) replacing Line 9 with the O(1) query primitives. Recall that every line in the innermost loop other than Line 9 is O(1) and that the loop is repeated O(D²) times. Now that Line 9 takes O(1) time, it follows that this modification results in an algorithm that runs in O((M+N)lg(M+N) + D²) time. Note that this variation is primarily of theoretical interest. The coefficients of proportionality are much larger for the algorithm fragments employed, implying that problems will have to be very large before the variation becomes faster. But suffix trees are particularly space-inefficient, and two auxiliary trees of equal size are needed for the fast lowest common ancestor algorithm. Thus for problems large enough to make
the time savings worthwhile, it is likely that there will not be enough memory to accommodate these additional structures.

Acknowledgements

Webb Miller originally proposed the problem of finding an O(ND) algorithm. The author would like to thank him for nurturing this work and for his many helpful suggestions. The referees' comments and corrections improved the paper greatly.

References

1. Aho, A.V., Hirschberg, D.S., and Ullman, J.D. ``Bounds on the Complexity of the Longest Common Subsequence Problem.'' Journal of ACM 23, 1 (1976), 1-12.
2. Aho, A.V., Hopcroft, J.E., and Ullman, J.D. Data Structures and Algorithms. Addison-Wesley, Reading, Mass. (1983), 203-208.
3. Dijkstra, E.W. ``A Note on Two Problems in Connexion with Graphs.'' Numerische Mathematik 1 (1959), 269-271.
4. Gosling, J. ``A Redisplay Algorithm.'' Proceedings ACM SIGPLAN/SIGOA Symposium on Text Manipulation (1981), 123-129.
5. Hall, P.A.V. and Dowling, G.R. ``Approximate String Matching.'' Computing Surveys 12, 4 (1980), 381-402.
6. Harel, D. and Tarjan, R.E. ``Fast Algorithms for Finding Nearest Common Ancestors.'' SIAM Journal on Computing 13, 2 (1984), 338-355.
7. Hirschberg, D.S. ``A Linear Space Algorithm for Computing Maximal Common Subsequences.'' Communications of ACM 18, 6 (1975), 341-343.
8. Hirschberg, D.S. ``Algorithms for the Longest Common Subsequence Problem.'' Journal of ACM 24, 4 (1977), 664-675.
9. Hirschberg, D.S. ``An Information-Theoretic Lower Bound for the Longest Common Subsequence Problem.'' Information Processing Letters 7, 1 (1978), 40-41.
10. Hunt, J.W. and McIlroy, M.D. ``An Algorithm for Differential File Comparison.'' Computing Science Technical Report 41, Bell Laboratories (1975).
11. Hunt, J.W. and Szymanski, T.G. ``A Fast Algorithm for Computing Longest Common Subsequences.'' Communications of ACM 20, 5 (1977), 350-353.
12. Knuth, D.E. The Art of Computer Programming, Vol. 3: Sorting and Searching. Addison-Wesley, Reading, Mass. (1983), 490-493.
13. Masek, W.J. and Paterson, M.S. ``A Faster Algorithm for Computing String Edit Distances.'' J. of Computer and Systems Sciences 20, 1 (1980), 18-31.
14. McCreight, E.M. ``A Space-Economical Suffix Tree Construction Algorithm.'' Journal of ACM 23, 2 (1976), 262-272.
15. Miller, W. and Myers, E.W. ``A File Comparison Program.'' Software -- Practice & Experience 15, 11 (1985), 1025-1040.
16. Nakatsu, N., Kambayashi, Y., and Yajima, S. ``A Longest Common Subsequence Algorithm Suitable for Similar Text Strings.'' Acta Informatica 18 (1982), 171-179.
17. Rochkind, M.J. ``The Source Code Control System.'' IEEE Transactions on Software Engineering 1, 4 (1975), 364-370.
18. Sankoff, D. and Kruskal, J.B. Time Warps, String Edits and Macromolecules: The Theory and Practice of Sequence Comparison. Addison-Wesley, Reading, Mass. (1983).
19. Tichy, W. ``The String-to-String Correction Problem with Block Moves.'' ACM Transactions on Computer Systems 2, 4 (1984), 309-321.
20. Wagner, R.A. and Fischer, M.J. ``The String-to-String Correction Problem.'' Journal of ACM 21, 1 (1974), 168-173.
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..e106c89
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,11 @@
+
+To build and install wiggle, simply type:
+
+   make install
+
+This will install /usr/bin/wiggle and /usr/share/man/man1/wiggle.1
+
+You might like to inspect the Makefile and change
+   OptDbg=-ggdb
+to something that will compile faster code on your computer, such as
+   OptDbg=-O3 -march=pentium2
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..388bf09
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,50 @@
+
+# Note: on my Mobile Pentium II, -march=pentium2 delivers twice the performance of i386
+#OptDbg=-O3
+#OptDbg=-O3 -march=pentium2
+OptDbg=-ggdb
+CFLAGS=$(OptDbg) -Wall -Werror
+
+# STRIP = -s
+INSTALL = /usr/bin/install
+DESTDIR =
+BINDIR = /usr/bin
+MANDIR = /usr/share/man
+MAN1DIR = $(MANDIR)/man1
+MAN5DIR = $(MANDIR)/man5
+
+all: wiggle wiggle.man test
+
+
+wiggle : wiggle.o load.o split.o extract.o diff.o bestmatch.o ReadMe.o merge.o
+wiggle.o load.o split.o extract.o diff.o bestmatch.o ReadMe.o merge.o : wiggle.h
+
+test: wiggle dotest
+	sh dotest
+
+wiggle.man : wiggle.1
+	nroff -man wiggle.1 > wiggle.man
+
+clean:
+	rm -f *.o *.man wiggle .version* version
+	find . -name core -o -name '*.tmp*' -o -name .tmp | xargs rm -f
+
+install : wiggle wiggle.1
+	$(INSTALL) -D $(STRIP) -m 755 wiggle $(DESTDIR)$(BINDIR)/wiggle
+	$(INSTALL) -D -m 644 wiggle.1 $(DESTDIR)$(MAN1DIR)/wiggle.1
+
+version : ReadMe.c wiggle.1
+	@rm -f version
+	@sed -n -e 's/.*wiggle - v\([0-9.]*\) - .*/\1/p' ReadMe.c > .version-readme
+	@sed -n -e 's/.*WIGGLE 1 "" v\([0-9.]*\)$$/\1/p' wiggle.1 > .version-man
+	@cmp -s .version-readme .version-man && cat .version-man > version || { echo Inconsistent versions.; exit 1;}
+
+dist : test clean version
+	mkdir -p DIST
+	rm -f DIST/wiggle-`cat version`
+	ln -s .. DIST/wiggle-`cat version`
+	tar czvf DIST/wiggle-`cat version`.tar.gz -h -C DIST --exclude RCS --exclude DIST wiggle-`cat version`
+	rm -f DIST/wiggle-`cat version`
+
+v : version
+	cat version
diff --git a/ReadMe.c b/ReadMe.c
new file mode 100644
index 0000000..6da1ab5
--- /dev/null
+++ b/ReadMe.c
@@ -0,0 +1,143 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@cse.unsw.edu.au>
+ * Paper: Neil Brown
+ *        School of Computer Science and Engineering
+ *        The University of New South Wales
+ *        Sydney, 2052
+ *        Australia
+ */
+
+/*
+ * Options and help text for wiggle
+ */
+
+#include "wiggle.h"
+
+char Version[] = "wiggle - v0.6 - 20 May 2003\n";
+
+char short_options[]="xdmwlrh123pVRvq";
+struct option long_options[] = {
+	{"extract", 0, 0, 'x'},
+	{"diff",    0, 0, 'd'},
+	{"merge",   0, 0, 'm'},
+	{"words",   0, 0, 'w'},
+	{"lines",   0, 0, 'l'},
+	{"patch",   0, 0, 'p'},
+	{"replace", 0, 0, 'r'},
+	{"help",    0, 0, 'h'},
+	{"version", 0, 0, 'V'},
+	{"reverse", 0, 0, 'R'},
+	{"verbose", 0, 0, 'v'},
+	{"quiet",   0, 0, 'q'},
+	{0, 0, 0, 0}
+};
+
+char Usage[] =
+"Usage: wiggle --diff|--extract|--merge --lines|--words [--replace] files...\n";
+
+char Help[] = "\n"
+"Wiggle - apply patches that 'patch' rejects.\n"
+"\n"
+"Wiggle provides three distinct but related functions:\n"
+"merge, diff, and extract.\n"
+"To get more detailed help on a function, select the function\n"
+"before requesting help, e.g.\n"
+"   wiggle --diff --help\n"
+"\n"
+"Options:\n"
+"   --extract -x : select 'extract' function.\n"
+"   --diff    -d : select 'diff' function.\n"
+"   --merge   -m : select 'merge' function (default).\n"
+"\n"
+"   --words   -w : word-wise diff and merge.\n"
+"   --lines   -l : line-wise diff and merge.\n"
+"\n"
+"   --patch   -p : treat last file as a patch file.\n"
+"   -1 -2 -3     : select which component of patch or merge to use.\n"
+"   --reverse -R : swap 'before' and 'after' for diff function.\n"
+"\n"
+"   --help    -h : get help.\n"
+"   --version -V : get version of wiggle.\n"
+"   --verbose -v : (potentially) be more verbose.\n"
+"   --quiet   -q : don't print unnecessary messages.\n"
+"\n"
+"   --replace -r : replace first file with result of merger.\n"
+"\n"
+"Wiggle needs to be given 1, 2, or 3 files.  Any one of these can\n"
+"be given as '-' to signify standard input.\n"
+"\n";
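+/* Example invocations (an editorial illustration; this comment is not in
+ * the original source, and the file names are placeholders):
+ *
+ *   wiggle --replace foo.c foo.c.rej   # wiggle rejected hunks into foo.c
+ *   wiggle --diff --words old.c new.c  # word-wise diff of two files
+ *
+ * Both follow the Usage string above; the per-function help texts below
+ * spell out exactly which file arguments each function accepts.
+ */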
+
+char HelpExtract[] = "\n"
+"wiggle --extract -[123] [--patch] merge-or-patch\n"
+"\n"
+"The extract function allows one branch of a patch or merge file\n"
+"to be extracted.  A 'patch' is the output of 'diff -c' or 'diff -u'.\n"
+"Either the before (-1) or after (-2) branch can be extracted.\n"
+"\n"
+"A 'merge' is the output of 'diff3 -m' or 'merge -A'.  Either the\n"
+"first, second, or third branch can be extracted.\n"
+"\n"
+"A 'merge' file is assumed unless --patch is given.\n"
+"\n";
+
+char HelpDiff[] = "\n"
+"wiggle --diff [-wl] [-p12] [-R] file-or-patch [file-or-patch]\n"
+"\n"
+"The diff function will report the differences and similarities between\n"
+"two files in a format similar to 'diff -u'.  With --words mode\n"
+"(the default) word-wise differences are displayed on lines starting\n"
+"with a '|'.  With --lines mode, only whole lines are considered,\n"
+"much like normal diff.\n"
+"\n"
+"If one file is given, it is assumed to be a patch, and the two\n"
+"branches of the patch are extracted and compared.  If two files\n"
+"are given they are normally assumed to be whole files and are compared.\n"
+"However if the --patch option is given with two files, then the\n"
+"second is treated as a patch and the first or (with -2) second branch\n"
+"is extracted and compared against the first file.\n"
+"\n"
+"--reverse (-R) will cause diff to swap the two files before comparing\n"
+"them.\n"
+"\n";
+
+char HelpMerge[] = "\n"
+"wiggle --merge [-wl] [--replace] file-or-merge [file-or-patch [file]]\n"
+"\n"
+"The merge function is the primary function of wiggle and is assumed\n"
+"if no function is explicitly chosen.\n"
+"\n"
+"Normally wiggle will compare three files on a word-by-word basis and\n"
+"output unresolvable conflicts in the resulting merge by showing\n"
+"whole-line differences.\n"
+"With the --lines option, the files are compared line-wise much\n"
+"like 'merge'.  With the --words option, files are compared\n"
+"word-wise and unresolvable conflicts are reported word-wise.\n"
+"\n"
+"If --merge is given one file, it is treated as a merge (merge -A\n"
+"output) and the three needed streams are extracted from it.\n"
+"If --merge is given two files, the second is treated as a patch\n"
+"file and the first is the original file.\n"
+"If --merge is given three files, they are each treated as whole files\n"
+"and differences between the second and third are merged into the first.\n"
+"This usage is much like 'merge'.\n"
+"\n";
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..d0f8bd2
--- /dev/null
+++ b/TODO
@@ -0,0 +1,29 @@
+
+- extract.c should be able to extract half of a word-diff
+- extract.c should work on word-merges
+- review all test output to make sure it looks right
+- document 'p'   DOING
+- can find_best be optimised more?
+- --verbose flag ?? what should it do?
+- review commented code and discard some of it
+- test on raid code
+- possibly encourage "###...####" onto line by itself in diff output
+- possibly remember match information while reading patch/merge
+  to help matching.
+- is there anything useful to be done with line number information?
+- document diff algorithm
+- document best-match algorithm
+- document merge algorithm
+- enhance 'p'
+   - editmail? reviewmail
+   - review wiggle failures
+
+- Application of patch-03-MdRaid5Works caused some odd matches
+
+- possible verbosity:
+    report lines at which each patch was applied??
+- add examples to man page
+
+- Design viewer.
+  Maybe:
+   3 windows: before, patch, after
diff --git a/bestmatch.c b/bestmatch.c
new file mode 100644
index 0000000..5bb31ef
--- /dev/null
+++ b/bestmatch.c
@@ -0,0 +1,513 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@cse.unsw.edu.au>
+ * Paper: Neil Brown
+ *        School of Computer Science and Engineering
+ *        The University of New South Wales
+ *        Sydney, 2052
+ *        Australia
+ */
+
+/*
+ * Find the best match for a patch against a file.
+ * The quality of a match is the length of the match minus the
+ * differential between the endpoints.
+ * We progress through the matrix recording the best match as we
+ * find it.
+ *
+ * We perform a full diagonal breadth-first traversal assessing
+ * the quality of matches at each point.
+ * At each point there are two or three previous points:
+ * up, back, or diagonal if there is a match.
+ * We assess the value of the match at each point and choose the
+ * best.  No match at all is given a score of -3.
+ *
+ * For any point, the best possible score using that point
+ * is a complete diagonal to the nearest edge.  We ignore points
+ * which cannot contribute to a better overall score.
+ */
+
+/* This structure keeps track of the current match at each point.
+ * It holds the start of the match as x,k where k is the
+ * diagonal, so y = x-k.
+ * Also the length of the match so far.
+ * If l == 0, there is no match.
+ */
+
+#include <stdio.h>	/* these three system includes were lost in this copy */
+#include <stdlib.h>	/* of the patch; restored from what the code uses:   */
+#include <ctype.h>	/* printf/sscanf, malloc/free/atoi/abort, isalnum     */
+#include "wiggle.h"
+
+
+struct v {
+	int x,y;	/* location of start of match */
+	int val;	/* value of match from x,y to here */
+	int k;		/* diagonal of last match */
+	int inmatch;	/* 1 if last point was a match */
+	int c;		/* chunk number */
+};
+
+/*
+ * Here we must determine the 'value' of a partial match.
+ * The input parameters are:
+ *   length - the total number of symbols matched
+ *   errs   - the total number of insertions or deletions
+ *   dif    - the absolute difference between number of insertions and deletions
+ *
+ * In general we want length to be high, errs to be low, and dif to be low.
+ * Particular questions that must be answered include:
+ *  - When does adding an extra symbol after a small gap improve the match?
+ *  - When does a match become so bad that we would rather start again?
+ *
+ * We would like symmetry in our answers so that a good sequence with an
+ * out-rider on one end is evaluated the same as a good sequence with an
+ * out-rider on the other end.  However to do this we cannot really use the
+ * value of the good sequence to weigh in the out-rider's favour as, in the
+ * case of a leading out-rider, we do not yet know the value of the good
+ * sequence.
+ * First, we need an arbitrary number, X, to say "Given a single symbol,
+ * after X errors, we forget that symbol".  5 seems a good number.
+ * Next we need to understand how replacements compare to insertions or
+ * deletions.  Probably a replacement is the same cost as an insertion or
+ * deletion.
+ * Finally, a few large stretches are better than lots of little ones, so
+ * the number of disjoint stretches should be kept low.
+ * So:
+ *   Each match after the first adds 5 to value.
+ *   The first match in a string adds 6.
+ *   Each non-match subtracts one unless it is the other half of a replacement.
+ *   A value of 0 causes us to forget where we are and start again.
+ *
+ * We need to not only assess the value at a particular location, but also
+ * assess the maximum value we could get if all remaining symbols matched,
+ * to help exclude parts of the matrix.
+ * The value of that possibility is 6 times the number of remaining symbols,
+ * -1 if we just had a match.
+ */
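+/* Worked example (editorial note, not in the original source), tracing
+ * update_value() below on match, match, miss, match with a fresh struct v:
+ *   1st match: val <= 0, so restart: val = 4, then val += 2+0 -> 6
+ *   2nd match: val += 2+1 -> 9                  (inmatch bonus)
+ *   miss:      val -= 1  -> 8, inmatch cleared  (not a replacement half)
+ *   3rd match: val += 2+0 -> 10
+ * Note that the constants actually used by the code differ from the
+ * 5/6 sketched in the design note above.
+ */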
+/* dir == 0 for match, 1 for k increase, -1 for k decrease */
+static inline void update_value(struct v *v, int dir, int k, int x)
+{
+	if (dir == 0) {
+		if (v->val <= 0) {
+			v->x = x-1;
+			v->y = x-k-1;
+			v->inmatch = 0;
+			v->val = 4;
+		}
+		v->val += 2+v->inmatch;
+		v->inmatch = 1;
+		v->k = k;
+	} else {
+		v->inmatch = 0;
+		if (dir * (v->k - k) > 0) {
+			/* other half of replacement */
+		} else {
+			v->val -= 1;
+		}
+	}
+}
+
+/* best value this match could reach if all remaining symbols matched */
+static inline int best_val(struct v *v, int max)
+{
+	if (v->val <= 0)
+		return 4+max*3-1;
+	else
+		return max*3-1+v->inmatch+v->val;
+}
+
+#ifdef OLDSTUFF
+#if 0
+#define value(v,kk,xx) (v.l ? (v.l - abs(kk-v.k)): -3)
+#else
+# if 0
+#  define value(v,kk,xx) (v.l ? (v.l - (xx-v.x)/2): -3)
+# else
+#  define value(v,kk,xx) (v.l ? (v.l - (xx-v.x)*2/v.l): -3)
+# endif
+#endif
+#endif
+
+struct best {
+	int xlo,ylo,xhi,yhi,val;
+};
+
+static inline int min(int a, int b) {
+	return a < b ? a : b;
+}
+
+void find_best(struct file *a, struct file *b,
+	       int alo, int ahi,
+	       int blo, int bhi, struct best *best)
+{
+	int klo, khi, k;
+	int f;
+
+	struct v *valloc = malloc(sizeof(struct v)*((ahi-alo)+(bhi-blo)+5));
+	struct v *v = valloc + (bhi-alo+2);
+
+	k = klo = khi = alo-blo;	/* diagonal k = x-y */
+	f = alo+blo;			/* front f = x+y that moves forward */
+	v[k].val = 0;
+	v[k].c = -1;
+
+	while (f < ahi+bhi) {
+		int x,y;
+
+		f++;
+
+#if 0
+		if (f == ahi+bhi)
+			printf("f %d klo %d khi %d\n", f,klo,khi);
+#endif
+		for (k=klo+1; k <= khi-1 ; k+=2) {
+			struct v vnew, vnew2;
+			x = (k+f)/2;	/* the point on front f in diagonal k */
+			y = x-k;
+			/* first consider the diagonal */
+			if (match(&a->list[x-1], &b->list[y-1])) {
+				vnew = v[k];
+				update_value(&vnew, 0, k, x);
+#if 0
+				printf("new %d,%d %d,%d (%d) ...",
+				       vnew.x, vy(vnew), x, y, value(vnew,k,x));
+#endif
+				if (vnew.c < 0) abort();
+				if (vnew.val > best[vnew.c].val) {
+#if 0
+					printf("New best for %d at %d,%d %d,%d, val %d\n",
+					       vnew.c, vnew.x, vnew.y, x, y, vnew.val);
+#endif
+					best[vnew.c].xlo = vnew.x;
+					best[vnew.c].ylo = vnew.y;
+					best[vnew.c].xhi = x;
+					best[vnew.c].yhi = y;
+					best[vnew.c].val = vnew.val;
+				}
+				v[k] = vnew;
+			} else {
+				vnew = v[k+1];
+				update_value(&vnew, -1, k, x);
+				/* might cross a chunk boundary */
+				if (b->list[y-1].len && b->list[y-1].start[0]==0) {
+					vnew.c = atoi(b->list[y-1].start+1);
+					vnew.val = 0;
+				}
+				vnew2 = v[k-1];
+				update_value(&vnew2, 1, k, x);
+
+				if (vnew2.val > vnew.val)
+					v[k] = vnew2;
+				else
+					v[k] = vnew;
+			}
+		}
+		/* extend or contract range */
+		klo--;
+		v[klo] = v[klo+1];
+		x = (klo+f)/2; y = x-klo;
+		update_value(&v[klo],-1,klo,x);
+		if (y<=bhi && b->list[y-1].len && b->list[y-1].start[0]==0) {
+			v[klo].c = atoi(b->list[y-1].start+1);
+#if 0
+			printf("entered %d at %d,%d\n", v[klo].c, x, y);
+#endif
+			v[klo].val = 0;
+		}
+		while (klo+2 < (ahi-bhi) &&
+		       (y > bhi ||
+			(best_val(&v[klo], min(ahi-x,bhi-y)) < best[v[klo].c].val &&
+			 best_val(&v[klo+1], min(ahi-x,bhi-y+1)) < best[v[klo+1].c].val
+				))) {
+			klo += 2;
+			x = (klo+f)/2; y = x-klo;
+		}
+
+		khi++;
+		v[khi] = v[khi-1];
+		x = (khi+f)/2; y = x - khi;
+		update_value(&v[khi],-1,khi,x);
+		while (khi-2 > (ahi-bhi) &&
+		       (x > ahi ||
+			(best_val(&v[khi], min(ahi-x,bhi-y)) < best[v[khi].c].val &&
+			 /* the original compared v[khi-1] against best[v[khi].c];
+			  * that looks like a typo for v[khi-1].c (it then mirrors
+			  * the klo loop above), so it is written that way here */
+			 best_val(&v[khi-1], min(ahi-x+1,bhi-y)) < best[v[khi-1].c].val
+				))) {
+			khi -= 2;
+			x = (khi+f)/2; y = x - khi;
+		}
+
+	}
+	free(valloc);
+}
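+/* csl_join (descriptive comment added by the editor, not in the original):
+ * concatenate two lists of common sublists into one freshly allocated
+ * list, keeping the single zero-length sentinel record at the end.
+ * Both input lists are freed; either may be NULL, in which case the
+ * other is returned unchanged.
+ */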
= 1; /* the sentinal */ + for (c=c1; c->len; c++) cnt++; + for (c=c2; c->len; c++) cnt++; + cd = rv = malloc(sizeof(*rv)*cnt); + for (c=c1; c->len; c++) + *cd++ = *c; + for (c=c2; c->len; c++) + *cd++ = *c; + cd->len = 0; + free(c1); + free(c2); + return rv; +} + +#if 0 +static void printword(struct elmnt e) +{ + if (e.start[0]) + printf("%.*s", e.len, e.start); + else { + int a,b,c; + sscanf(e.start+1, "%d %d %d", &a, &b, &c); + printf("*** %d,%d **** %d\n", b,c,a); + } +} +#endif + +/* + * reduce a file by discarding less interesting words + * Words that end with a newline are interesting (so all words + * in line-mode are interesting) and words that start with + * and alphanumeric are interesting. This excludes spaces and + * special characters in word mode + * Doing a best-fit comparision on only interesting words is + * much fast than on all words, and it nearly as good + */ + +static inline int is_skipped(struct elmnt e) +{ + return !( ends_line(e) || + isalnum(e.start[0]) || + e.start[0] == '_'); +} +struct file reduce(struct file orig) +{ + int cnt=0; + int i; + struct file rv; + + for (i=0; i0 && is_skipped(a2.list[pa-1])) + pa--; + while (pb>0 && is_skipped(b2.list[pb-1])) + pb--; + +#if 0 + printf("-> %d,%d\n", pa,pb); +#endif + best[b].xlo = pa; + best[b].ylo = pb; + + while (pa %d,%d\n", pa,pb); +#endif + best[b].xhi = pa; + best[b].yhi = pb; + } +} + +static void find_best_inorder(struct file *a, struct file *b, + int alo, int ahi, int blo, int bhi, + struct best *best, int bestlo, int besthi) +{ + /* make sure the best matches we find are inorder. + * If they aren't we find a overall best, and + * recurse either side of that + */ + int i; + int bad=0; + int bestval, bestpos=0; + for (i=bestlo; i 0 && + best[i].val > 0 && + best[i-1].xhi >= best[i].xlo) + bad = 1; + + if (!bad) + return; + bestval = 0; + for (i=bestlo; i bestval) { + bestval = best[i].val; + bestpos = i; + } + if (bestpos > bestlo) { + /* move top down below chunk marker */ + int y = best[bestpos].ylo; + while (b->list[y].start[0]) y--; + find_best_inorder(a,b, + alo, best[bestpos].xlo, + blo, y, + best, bestlo, bestpos); + } + if (bestpos < besthi-1) { + /* move bottom up to chunk marker */ + int y = best[bestpos].yhi; + while (b->list[y].start[0]) y++; + find_best_inorder(a,b, + best[bestpos].xhi, ahi, + y, bhi, + best, bestpos+1, besthi); + } +} + +struct csl *pdiff(struct file a, struct file b, int chunks) +{ + int alo,ahi,blo,bhi; + struct csl *csl1, *csl2; + struct best *best = malloc(sizeof(struct best)*(chunks+1)); + int i; + struct file asmall, bsmall; + + asmall = reduce(a); + bsmall = reduce(b); + + alo = blo = 0; + ahi = asmall.elcnt; + bhi = bsmall.elcnt; +/* printf("start: %d,%d %d,%d\n", alo,blo,ahi,bhi); */ + + for (i=0; i<"); + printword(bsmall.list[best[i].yhi-1]);printf(">\n"); + } +#endif + remap(best,chunks+1,asmall,bsmall,a,b); +#if 0 +/* for(i=0; i0) { +#if 0 + int j; + printf("Before:\n"); + for (j=best[i].xlo; jlen; csl2++); + csl2->a = a.elcnt; + csl2->b = b.elcnt; + } else { + csl1 = malloc(sizeof(*csl1)); + csl1->len = 0; + csl1->a = a.elcnt; + csl1->b = b.elcnt; + } + free(best); + return csl1; +} diff --git a/diff.c b/diff.c new file mode 100644 index 0000000..c48340a --- /dev/null +++ b/diff.c @@ -0,0 +1,428 @@ +/* + * wiggle - apply rejected patches + * + * Copyright (C) 2003 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either 
version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * School of Computer Science and Engineering + * The University of New South Wales + * Sydney, 2052 + * Australia + */ + +/* + * calculate longest common sequence between two sequences + * + * Each sequence contains strings with + * hash start length + * We produce a list of tripples: a b len + * where A and B point to elements in the two sequences, and len is the number + * of common elements there + * + * To help keep matches close together (important when matching a changed fragment + * against a whole) we track the disagonal of the first and last match on any path. + * When choosing the best of two paths, we choose the furthest reaching unless + * the other has a match and it's absolute diagonal difference is significantly smaller. + * 'Significant' if the reduction in difference exceeds the loss of progress by a + * factor of 2. + * + */ + +#include +#include "wiggle.h" +#include +#include + + +struct v { + int x; /* x location of furthest reaching path of current cost */ + int md; /* diagonal location of midline crossing */ + int l; /* number of continuous common sequences found so far */ +}; + + +static int find_common(struct file *a, struct file *b, + int *alop, int *ahip, + int *blop, int *bhip, + int mid, + struct v *v) +{ + /* examine matrix from alo to ahi and blo to bhi + * finding longest subsequence. + * return new {a,b}{lo,hi} either side of midline. + * i.e. alo+blo <= mid <= ahi+bhi + * and alo,blo to ahi,bhi is a common (possibly empty) subseq + * + * v is scratch space each is indexable from + * alo-bhi to ahi-blo inclusive + */ + + int klo, khi; + int k; + int alo = *alop; + int ahi = *ahip; + int blo = *blop; + int bhi = *bhip; + int x,y; + + int best = (ahi-alo)+(bhi-blo); + int dist; + + klo = khi = alo-blo; + v[klo].x = alo; + v[klo].l = 0; + + while(1) { + + for (k=klo ; k <= khi ; k+= 2) { + int snake = 0; + struct v vnew = v[k]; + x = v[k].x; + y = x-k; + if (y > bhi) abort(); + while (x < ahi && y < bhi && + match(&a->list[x], &b->list[y]) + ) { + x++; + y++; + snake=1; + } + vnew.x = x; + vnew.l += snake; + dist = (ahi-x)+(bhi-y); + if (dist < best) best = dist; + if (x+y >= mid && + v[k].x+v[k].x-k <= mid) { + vnew.md = k; + } + v[k] = vnew; + + if (dist == 0) { + /* OK! We have arrived. 
+ * We crossed the midpoint at or after v[k].xm,v[k].ym + */ + if (x != ahi) abort(); + x = (v[k].md+mid)/2; + y = x-v[k].md; + *alop = x; + *blop = y; + + while (x < ahi && y < bhi && + match(&a->list[x], &b->list[y]) + ) { + x++; + y++; + } + + *ahip = x; + *bhip = y; + + return k; + } + } + + for (k=klo+1; k <= khi-1 ; k+= 2) { + if (v[k-1].x+1 >= v[k+1].x ) { + v[k] = v[k-1]; + v[k].x++; + } else { + v[k] = v[k+1]; + } + } + + x = v[klo].x; y = x -(klo-1); + dist = abs((ahi-x)-(bhi-y)); + if (dist <= best) { + v[klo-1] = v[klo]; + klo --; + } else + while (dist > best) { + klo ++; + x = v[klo].x; y = x -(klo-1); + dist = abs((ahi-x)-(bhi-y)); + } + + x = v[khi].x+1; y = x - (khi+1); + dist = abs((ahi-x)-(bhi-y)); + if (dist <= best) { + v[khi+1] = v[khi]; + v[khi+1].x++; + khi ++; + } else + while (dist > best) { + khi --; + x = v[khi].x+1; y = x - (khi+1); + dist = abs((ahi-x)-(bhi-y)); + } + } +} + +static struct csl *lcsl(struct file *a, int alo, int ahi, + struct file *b, int blo, int bhi, + struct csl *csl, + struct v *v) +{ + int len; + int alo1 = alo; + int ahi1 = ahi; + int blo1 = blo; + int bhi1 = bhi; + struct csl *rv = NULL; + int k; + + if (ahi <= alo || bhi <= blo) + return csl; + + + k = find_common(a,b, + &alo1, &ahi1, + &blo1, &bhi1, + (ahi+bhi+alo+blo)/2, + v); + if (k != ahi-bhi) abort(); + + len = v[k].l; + + if (csl == NULL) { + rv = csl = malloc((len+1)*sizeof(*csl)); + csl->len = 0; + } + if (len) { + csl = lcsl(a,alo,alo1, + b,blo,blo1, + csl, v); + + if (ahi1 > alo1) { + /* need to add this common seq, possibly attach + * to last + */ + if (csl->len && + csl->a+csl->len == alo1 && + csl->b+csl->len == blo1) { + csl->len += ahi1-alo1; + } else { + if (csl->len) csl++; + csl->len = ahi1-alo1; + csl->a = alo1; + csl->b = blo1; + csl[1].len = 0; + } + } + csl = lcsl(a,ahi1,ahi, + b,bhi1,bhi, + csl,v); + } + if (rv) { + if (csl->len) + csl++; + csl->a = ahi; + csl->b = bhi; +#if 1 + if (rv+len != csl || csl->len != 0) + abort(); /* number of runs was wrong */ +#endif + return rv; + } else + return csl; +} + +/* if two common sequences are separated by only an add or remove, + * and the first common ends the same as the middle text, + * extend the second and contract the first in the hope that the + * first might become empty. This ameliorates against the greedyness + * of the 'diff' algorithm. + * Once this is done, repeat the process but extend the first + * in favour of the second. The acknowledges that semantic units + * more often end with common text ("return 0;\n}\n", "\n") than + * start with it. 
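For reference, the search in find_common() above is a banded variant of Myers'
O(ND) shortest-edit-script algorithm: track the furthest-reaching x on each
diagonal k = x-y, follow "snakes" of matching words, and stop when a path spans
both files. A minimal, unbanded sketch of that core (illustrative only, not
wiggle's code; word_eq() stands in for match(), which compares hashed words):

    #include <stdlib.h>
    #include <string.h>

    /* Illustrative stand-in for wiggle's match(): compare two words. */
    static int word_eq(const char *a, const char *b)
    {
        return strcmp(a, b) == 0;
    }

    /* Myers O(ND): length of the shortest edit script turning a[0..n)
     * into b[0..m).  v[k+max] holds the furthest-reaching x on
     * diagonal k for the current edit cost d. */
    static int edit_distance(const char **a, int n, const char **b, int m)
    {
        int max = n + m, d, k;
        int *v;

        if (max == 0)
            return 0;
        v = calloc(2*max+1, sizeof(int));
        for (d = 0; d <= max; d++) {
            for (k = -d; k <= d; k += 2) {
                /* take the better of the two neighbouring diagonals */
                int x = (k == -d || (k != d && v[k-1+max] < v[k+1+max]))
                        ? v[k+1+max]       /* step down: word inserted */
                        : v[k-1+max] + 1;  /* step right: word deleted */
                int y = x - k;
                while (x < n && y < m && word_eq(a[x], b[y])) {
                    x++;                   /* follow the snake of matches */
                    y++;
                }
                v[k+max] = x;
                if (x >= n && y >= m) {
                    free(v);
                    return d;
                }
            }
        }
        free(v);
        return max;   /* not reached */
    }

On the word lists {"a","b","c"} and {"a","x","c"} this returns 2 (delete "b",
insert "x"). find_common() adds two refinements visible above: the v[k].md
bookkeeping that records where a path crosses the midline, so lcsl() can
recurse on either side of it, and the klo/khi trimming that narrows the band
of diagonals still worth extending.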
+ */ +static void fixup(struct file *a, struct file *b, struct csl *list) +{ + struct csl *list1, *orig; + int lasteol = -1; + if (!list) return; + orig = list; + list1 = list+1; + while (list->len && list1->len) { + if ((list->a+list->len == list1->a && + /* text at b inserted */ + match(&b->list[list->b+list->len-1], + &b->list[list1->b-1]) + ) + || + (list->b+list->len == list1->b && + /* text at a deleted */ + match(&a->list[list->a+list->len-1], + &a->list[list1->a-1]) + ) + ) { +/* printword(a->list[list1->a-1]); + printf("fixup %d,%d %d : %d,%d %d\n", + list->a,list->b,list->len, + list1->a,list1->b,list1->len); +*/ if (ends_line(a->list[list->a+list->len-1]) + && a->list[list->a+list->len-1].len==1 + && lasteol == -1 + ) { +/* printf("E\n");*/ + lasteol = list1->a-1; + } + list1->a--; + list1->b--; + list1->len++; + list->len--; + if (list->len == 0) { + lasteol = -1; + if (list > orig) + list--; + else { + *list = *list1++; +/* printf("C\n");*/ + } + } + } else { + if (lasteol >= 0) { +/* printf("seek %d\n", lasteol);*/ + while (list1->a <= lasteol && list1->len>1) { + list1->a++; + list1->b++; + list1->len--; + list->len++; + } + lasteol=-1; + } + *++list = *list1++; + } + } + list[1] = list1[0]; +} + +struct csl *diff(struct file a, struct file b) +{ + struct v *v; + struct csl *csl; + v = malloc(sizeof(struct v)*(a.elcnt+b.elcnt+2)); + v += b.elcnt+1; + + csl = lcsl(&a, 0, a.elcnt, + &b, 0, b.elcnt, + NULL, v); + free(v-(b.elcnt+1)); + fixup(&a, &b, csl); + if (!csl) { + csl = malloc(sizeof(*csl)); + csl->len = 0; + csl->a = a.elcnt; + csl->b = b.elcnt; + } + return csl; +} + +struct csl *diff_partial(struct file a, struct file b, + int alo, int ahi, int blo, int bhi) +{ + struct v *v; + struct csl *csl; + v = malloc(sizeof(struct v)*(ahi-alo+bhi-blo+2)); + v += bhi-alo+1; + + csl = lcsl(&a, alo, ahi, + &b, blo, bhi, + NULL, v); + free(v-(bhi-alo+1)); + fixup(&a, &b, csl); + return csl; +} + + +#ifdef MAIN + +main(int argc, char *argv[]) +{ + struct file a, b; + struct csl *csl; + struct elmnt *lst = malloc(argc*sizeof(*lst)); + int arg; + int alo, ahi, blo, bhi; + struct v *v; + int ln; + + arg = 1; + a.elcnt = 0; + a.list = lst; + while (argv[arg] && strcmp(argv[arg],"--")) { + lst->hash = 0; + lst->start = argv[arg]; + lst->len = strlen(argv[arg]); + a.elcnt++; + lst++; + arg++; + } + if (!argv[arg]) { + printf("AARGH\n"); + exit(1); + } + arg++; + b.elcnt = 0; + b.list = lst; + while (argv[arg] && strcmp(argv[arg],"--")) { + lst->hash = 0; + lst->start = argv[arg]; + lst->len = strlen(argv[arg]); + b.elcnt++; + lst++; + arg++; + } + + v = malloc(sizeof(struct v)*(a.elcnt+b.elcnt+2)); + v += b.elcnt+1; + alo = blo = 0; + ahi = a.elcnt; + bhi = b.elcnt; +#if 0 + ln = find_common(&a, &b, + &alo, &ahi, &blo, &bhi, + (ahi+bhi)/2, + v); + + printf("ln=%d (%d,%d) -> (%d,%d)\n", ln, + alo,blo,ahi,bhi); +#else + csl = lcsl(&a, 0, a.elcnt, + &b, 0, b.elcnt, + NULL, v); + fixup(&a, &b, csl); + while (csl && csl->len) { + int i; + printf("%d,%d for %d:\n", csl->a,csl->b,csl->len); + for (i=0; ilen; i++) { + printf(" %.*s (%.*s)\n", + a.list[csl->a+i].len, a.list[csl->a+i].start, + b.list[csl->b+i].len, b.list[csl->b+i].start); + } + csl++; + } +#endif + + exit(0); +} + +#endif + diff --git a/dotest b/dotest new file mode 100755 index 0000000..217c193 --- /dev/null +++ b/dotest @@ -0,0 +1,88 @@ +#!/bin/bash + +dir=$PWD + +while [ ! 
-f $dir/wiggle ] +do + case $dir in + / ) echo >&2 Cannot find wiggle program : $WIGGLE + exit 1;; + * ) dir=${dir%/*} + esac +done +export WIGGLE=$dir/wiggle + +if [ -d tests ] +then cd tests +fi + +status=0 +ok=0 +fail=0 + +find . -name core | xargs rm -f +list=$(find . -type f \( -name script -o -name diff -o -name ldiff \ + -o -name rediff -o -name merge -o -name wmerge -o -name lmerge -o -name replace \) + ) +for path in $list +do + dir=${path%/*} + base=${path##*/} + ( + cd $dir + > .time + case $base in + script ) ./script ;; + diff ) if [ -f new ] + then /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -dw orig new | diff -u diff - ; xit=$? + else /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -dwp1 orig patch | diff -u diff - ; xit=$? + fi + ;; + ldiff ) if [ -f new ] + then /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -dl orig new | diff -u ldiff - ; xit=$? + else /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -dlp1 orig patch | diff -u ldiff - ; xit=$? + fi + ;; + reldiff ) /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -dl patch | diff -u reldiff - ; xit=$? + ;; + rediff ) /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -dw patch | diff -u rediff - ; xit=$? + ;; + merge ) if [ -f patch ] + then /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -m orig patch | diff -u merge - ; xit=$? + elif [ -f new ] + then /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -m orig new new2 | diff -u merge - ; xit=$? + else /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -m orig | diff -u merge - ; xit=$? + fi + ;; + replace ) cp orig orig.tmp + if [ -f patch ] + then /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -mr orig.tmp patch + else /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -mr orig.tmp new new2 + fi + diff -u merge orig.tmp ; xit=$? + rm orig.tmp orig.tmp.porig + ;; + lmerge ) if [ -f patch ] + then /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -ml orig patch | diff -u lmerge - ; xit=$? + else /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -ml orig new new2 | diff -u lmerge - ; xit=$? + fi + ;; + wmerge ) if [ -f patch ] + then /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -mw orig patch | diff -u wmerge - ; xit=$? + else /usr/bin/time --quiet -o .time -f '%U' $WIGGLE -mw orig new new2 | diff -u wmerge - ; xit=$? + fi + ;; + esac + if [ $xit = 0 ]; then msg=SUCCEEDED; else msg=FAILED; fi + echo $path $msg `cat .time 2> /dev/null` + rm -f .time + exit $xit + ) + if [ $? = 0 ] + then let ok++; + else status=1 ; let fail++ + fi +done +find . -name core -ls +echo $ok succeeded and $fail failed +exit $status diff --git a/extract.c b/extract.c new file mode 100644 index 0000000..0959b44 --- /dev/null +++ b/extract.c @@ -0,0 +1,260 @@ +/* + * wiggle - apply rejected patches + * + * Copyright (C) 2003 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * School of Computer Science and Engineering + * The University of New South Wales + * Sydney, 2052 + * Australia + */ + +/* + * split patch or merge files. + * + */ + +#include "wiggle.h" +#include +#include + +void skip_eol(char **cp, char *end) +{ + char *c = *cp; + while (c < end && *c != '\n') + c++; + if (c < end) c++; + *cp = c; +} + +void copyline(struct stream *s, char **cp, char *end) +{ + char *from = *cp; + char *to = s->body+s->len; + + while (from < end && *from != '\n') + *to++ = *from++; + if (from < end) + *to++ = *from++; + s->len = to-s->body; + *cp = from; +} + +int split_patch(struct stream f, struct stream *f1, struct stream *f2) +{ + struct stream r1, r2; + int chunks=0; + char *cp, *end; + int state = 0; + int acnt=0, bcnt=0; + int a,b,c,d; + int lineno = 0; + + f1->body = f2->body = NULL; + + r1.body = malloc(f.len); + r2.body = malloc(f.len); + if (!r1.body || !r2.body) + die(); + + r1.len = r2.len = 0; + + cp = f.body; + end = f.body+f.len; + while (cp < end) { + /* state: + * 0 not in a patch + * 1 first half of context + * 2 second half of context + * 3 unified + */ + lineno++; + switch(state) { + case 0: + if (sscanf(cp, "@@ -%d,%d +%d,%d @@", &a, &b, &c, &d)==4) { + acnt = b; + bcnt = d; + state = 3; + } else if (sscanf(cp, "*** %d,%d ****", &a, &b)==2) { + acnt = b-a+1; + state = 1; + } else if (sscanf(cp, "--- %d,%d ----", &c, &d)==2) { + bcnt = d-c+1; + state = 2; + } + skip_eol(&cp, end); + if (state==1 || state == 3) { + char buf[20]; + buf[0] = 0; + chunks++; + sprintf(buf+1, "%5d %5d %5d\n", chunks, a, acnt); + memcpy(r1.body+r1.len, buf, 19); + r1.len += 19; + } + if (state==2 || state == 3) { + char buf[20]; + buf[0] = 0; + sprintf(buf+1, "%5d %5d %5d\n", chunks, c, bcnt); + memcpy(r2.body+r2.len, buf, 19); + r2.len += 19; + } + break; + case 1: + if ((*cp == ' ' || *cp=='!' || *cp == '-' || *cp == '+') + && cp[1] == ' ') { + cp+=2; + copyline(&r1, &cp, end); + acnt--; + if (acnt == 0) + state = 0; + } else { + fprintf(stderr, "wiggle: bad context patch at line %d\n", lineno); + return 0; + } + break; + case 2: + if ((*cp == ' ' || *cp=='!' 
|| *cp == '-' || *cp == '+') + && cp[1] == ' ') { + cp+= 2; + copyline(&r2, &cp, end); + bcnt--; + if (bcnt == 0) + state = 0; + } else { + fprintf(stderr, "wiggle: bad context patch/2 at line %d\n", lineno); + return 0; + } + break; + case 3: + if (*cp == ' ') { + char *cp2; + cp++; + cp2 = cp; + copyline(&r1, &cp, end); + copyline(&r2, &cp2, end); + acnt--; bcnt--; + } else if (*cp == '-') { + cp++; + copyline(&r1, &cp, end); + acnt--; + } else if (*cp == '+') { + cp++; + copyline(&r2, &cp, end); + bcnt--; + } else { + fprintf(stderr, "wiggle: bad unified patch at line %d\n", lineno); + return 0; + } + if (acnt <= 0 && bcnt <= 0) + state = 0; + break; + } + } + if (r1.len > f.len || r2.len > f.len) + abort(); + *f1 = r1; + *f2 = r2; + return chunks; +} + +/* + * extract parts of a "diff3 -m" or "wiggle -m" output + */ +int split_merge(struct stream f, struct stream *f1, struct stream *f2, struct stream *f3) +{ + int lineno; + int state = 0; + char *cp, *end; + struct stream r1,r2,r3; + f1->body = f2->body = f2->body = NULL; + + r1.body = malloc(f.len); + r2.body = malloc(f.len); + r3.body = malloc(f.len); + if (!r1.body || !r2.body || !r3.body) + die(); + + r1.len = r2.len = r3.len = 0; + + cp = f.body; + end = f.body+f.len; + while (cp < end) { + /* state: + * 0 not in conflict + * 1 in file 1 of conflict + * 2 in file 2 of conflict + * 3 in file 3 of conflict + */ + int len = end-cp; + lineno++; + switch(state) { + case 0: + if (len>8 && + strncmp(cp, "<<<<<<<", 7)==0 && + (cp[7] == ' ' || cp[7] == '\n') + ) { + state = 1; + skip_eol(&cp, end); + } else { + char *cp2= cp; + copyline(&r1, &cp2, end); + cp2 = cp; + copyline(&r2, &cp2, end); + copyline(&r3, &cp, end); + } + break; + case 1: + if (len>8 && + strncmp(cp, "|||||||", 7)==0 && + (cp[7] == ' ' || cp[7] == '\n') + ) { + state = 2; + skip_eol(&cp, end); + } else + copyline(&r1, &cp, end); + break; + case 2: + if (len>8 && + strncmp(cp, "=======", 7)==0 && + (cp[7] == ' ' || cp[7] == '\n') + ) { + state = 3; + skip_eol(&cp, end); + } else + copyline(&r2, &cp, end); + break; + case 3: + if (len>8 && + strncmp(cp, ">>>>>>>", 7)==0 && + (cp[7] == ' ' || cp[7] == '\n') + ) { + state = 0; + skip_eol(&cp, end); + } else + copyline(&r3, &cp, end); + break; + } + } + *f1 = r1; + *f2 = r2; + *f3 = r3; + return state == 0; +} diff --git a/get-p-options b/get-p-options new file mode 100644 index 0000000..dec5352 --- /dev/null +++ b/get-p-options @@ -0,0 +1,8 @@ +#!/bin/sh +# +# make sure all p commands are in the help file + +sed -n -e '/^case/,/^esac/p' p | grep ')$' | grep -v '(' | + tr '\011' '@' | grep -v '@' | tr -cs '[A-Za-z0-9]' '\n' | sort > /tmp/p.cmds +sed -n -e '/^[a-z]/p' p.help | sort > /tmp/p.hlp +comm -3 /tmp/p.cmds /tmp/p.hlp diff --git a/hash.h b/hash.h new file mode 100644 index 0000000..edd4271 --- /dev/null +++ b/hash.h @@ -0,0 +1,92 @@ +/* Fast hashing routine for a long. + (C) 2002 William Lee Irwin III, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. 
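As a concrete illustration of the technique (a stand-alone sketch mirroring the
32-bit path of hash_long() below, not a replacement for it):

    #include <stdio.h>
    #include <stdint.h>

    #define GOLDEN_RATIO_PRIME_32 0x9e370001U

    /* Multiplicative hashing: multiply by a bit-sparse prime close to
     * 2^32/phi, then keep the top 'bits' bits, which are best mixed. */
    static unsigned hash32(uint32_t val, unsigned bits)
    {
        return (uint32_t)(val * GOLDEN_RATIO_PRIME_32) >> (32 - bits);
    }

    int main(void)
    {
        uint32_t v;
        /* adjacent inputs scatter across a 64-bucket table */
        for (v = 1; v <= 4; v++)
            printf("hash32(%u, 6) = %u\n", v, hash32(v, 6));
        return 0;
    }

hash_str() and hash_mem() below reuse the same primitive: they fold input
bytes into a long, feed it through hash_long() every wordsize bytes, and keep
only the high bits at the end.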
+ */ +#if BITS_PER_LONG == 32 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e370001UL +#elif BITS_PER_LONG == 64 +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL +#else +#error Define GOLDEN_RATIO_PRIME for your wordsize. +#endif + +static inline unsigned long hash_long(unsigned long val, unsigned int bits) +{ + unsigned long hash = val; + +#if BITS_PER_LONG == 64 + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + unsigned long n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; +#else + /* On some cpus multiply is faster, on others gcc will do shifts */ + hash *= GOLDEN_RATIO_PRIME; +#endif + + /* High bits are more random, so use them. */ + return hash >> (BITS_PER_LONG - bits); +} + +static inline unsigned long hash_ptr(void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr, bits); +} + +static inline unsigned long hash_str(char *name, int bits) +{ + unsigned long hash = 0; + unsigned long l = 0; + int len = 0; + unsigned char c; + do { + if (!(c = *name++)) { + c = (char)len; len = -1; + } + l = (l << 8) | c; + len++; + if ((len & (BITS_PER_LONG/8-1))==0) + hash = hash_long(hash^l, BITS_PER_LONG); + } while (len); + return hash >> (BITS_PER_LONG - bits); +} + +static inline unsigned long hash_mem(char *buf, int length, int bits) +{ + unsigned long hash = 0; + unsigned long l = 0; + int len = 0; + unsigned char c; + do { + if (len == length) { + c = (char)len; len = -1; + } else + c = *buf++; + l = (l << 8) | c; + len++; + if ((len & (BITS_PER_LONG/8-1))==0) + hash = hash_long(hash^l, BITS_PER_LONG); + } while (len); + return hash >> (BITS_PER_LONG - bits); +} diff --git a/load.c b/load.c new file mode 100644 index 0000000..a6ef250 --- /dev/null +++ b/load.c @@ -0,0 +1,142 @@ +/* + * wiggle - apply rejected patches + * + * Copyright (C) 2003 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * School of Computer Science and Engineering + * The University of New South Wales + * Sydney, 2052 + * Australia + */ + +/* + * read in files + * + * Files are read in whole and stored in a + * struct stream {char*, len} + * + * + * loading the file "-" reads from stdin which might require + * reading into several buffers + */ + +#include "wiggle.h" +#include +#include +#include +#include +#include + +static void join_streams(struct stream list[], int cnt) +{ + /* join all the streams in the list (upto body=NULL) + * into one by re-allocing list[0].body and copying + */ + int len=0; + int i; + char *c; + + for (i=0; i + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * School of Computer Science and Engineering + * The University of New South Wales + * Sydney, 2052 + * Australia + */ + + +/* + * This file contains routines use to create a merge. + * The core process is to take two coincidence lists, A-B and B-C, + * which identify coincidences and, but ommission, changes, and + * to apply to replace every part of A that matches B with the + * part of C that aligns with that part of B. In the case where + * a B-C difference does not align completely with an A-B coincidence, + * we have a conflict. + * + * Throught the processing of merges we need a concept of a position in the + * overall merge. This is represented by an index into one of the files, and + * and indicator as to which file. + * If the point is in: + * A - then it is an unmatched part of A, before a coincidence. + * B - then it is in a section where A matches B and B matches C. + * C - then it is in an unmatched part of C, but the corresponding part + * of B completely coincides with A. + * With each position we keep indexes into the coincidence lists for + * the containing or next coincidence in each. + * + * + * The first stage of merge processing is to identify conflicts. + * A conflict is identified by a start point and an end point. + * The first approximation for the start point is the end + * of the last A-B coincidence that starts before the B start + * of the B-C difference that causes the conflict. + + * + * We have a concept of a 'point' + * The start and end of the file are each points. + * Also the start and end of any conflict is a point. + * Outside of a conflict, we can move points forwards or backwards + * through the merger. Inside a conflict movement is not well defined. + * Any point is 'forward looking' or 'backward looking'. + * A forward looking point can be moved forward but not backward. 
+ * A backward looking point can be moved backward, not forward.
+ *
+ * If a forward looking point is a tri-point in a double-coincidence,
+ * then c1/c2 will be set to the furthest forward double coincidence that is before
+ * or contains the point; thus it well-defines the start of a double coincidence
+ * or the end of a conflict.
+ * Inversely, a BL point well-defines the end of a DC or the start of a conflict.
+ *
+ * The start of a conflict is a backward looking point.
+ * The end of a conflict is a forward looking point.
+ *
+ * In normal (word/line) mode, we move the boundaries of a conflict out
+ * until they are at end-of-line.
+ * When moving forward, this is until we cross over a newline word.
+ * When moving backward, this is until one step before crossing over
+ * a newline word, so we need to remember our last position.
+ *
+ * Away from a conflict, every point can be clearly defined as a
+ * location either in A or in C. The 'point' is immediately before
+ * the word at that location.
+ * At the end of a conflict, this is still well defined as the 'next word'
+ * is outside a conflict.
+ * At the start of a conflict this may not be well defined as there may not
+ * be a clear 'next' word. We choose the point that would be reached by
+ * the step-forward algorithm so that it is easy to test if at start-of-conflict.
+ *
+ * A conflict is always bounded by a double coincidence, i.e. the word before a conflict
+ * is the same in all 3 texts, and the word after a conflict is the same in all
+ * 3 texts. To allow for conflicts at start and end of file, we consider the
+ * start and end of the three texts to each be double co-incidences.
+ *
+ * Each double co-incidence has a start and an end. When we find a conflict, it
+ * is taken to extend from the end of the previous double coincidence to the
+ * start of the next double co-incidence.
+ * Between conflicts we have mergers which can be printed simply by advancing the start
+ * point and printing each word as we go.
+ *
+ * The double co-incidence at the start begins at forward-looking A=0 or C=0,
+ * depending on which word is first, and ends at backward-looking A=0.
+ * The double co-incidence at the end begins at forward-looking
+ * C=max and ends at backward-looking A=max or C=max depending on which
+ * would be the last word.
+ *
+ * Each point is defined by a flag "in_a" which is true if the point is in A,
+ * an index 'pos' which gives the position in A or C depending on "in_a", and
+ * an index into each co-incidence list, c1 and c2.
+ *
+ * For forward looking points:
+ *  if in_a:
+ *   c1 is the first co-incidence that ends after pos, or is the tail co-incidence.
+ *   c2 is the first co-incidence that ends at or after c1.b
+ *  if in_c:
+ *   c2 is the first co-incidence that ends after pos, or is the tail co-incidence.
+ *   c1 is the first co-incidence that ends at or after c2.a
+ *
+ * For a backward looking point:
+ *  if in_a:
+ *   c1 is the last co-incidence that starts before pos, or -1
+ *   c2 is the last co-incidence that starts at or before c1.b
+ *  if in_c:
+ *   c2 is the last co-incidence that starts before pos, or -1
+ *   c1 is the last co-incidence that .. lines up properly.
+ *
+ * To advance a point we increment pos, then
+ *  if in_a and at start of c1:
+ *   slide up to c, and if at end of c2, advance c2, then c1, and repeat
+ *  if in_c and within c2 and the corresponding a is at end of c1, and c1->len != 0:
+ *   slide down to a, increment c1 and advance c2, then repeat.
+ *
+ * To retreat a backward facing point:
+ *  if in_a and at end of c1 and c1 != -1:
+ *   slide up to c, and if at start of c2, retreat c2, then c1, and repeat
+ *  if in_c and within c2 and the corresponding a is at start of c1:
+ *   slide down to a, decrement c1 and retreat c2, then repeat.
+ *  Then decrement pos.
+ *
+ * We never actually compare points for ordering. We should 'know' the likely order
+ * and only compare equal or not. This can be tested independent of direction,
+ * and done by simply comparing in_a and pos.
+ */
+
+
+/* Each point involves a location in each of A, B, and C.
+ * There is a conflict for each change in B-C where the B section
+ * is not wholly contained in an A-B co-incidence.
+ * The start point of a conflict is determined as:
+ *   C is the end of the C side of the previous B-C coincidence (or
+ *     start of file).
+ *   B is the end of the B side of the matching A-B coincidence if
+ *     the point is in an A-B coincidence, or the end of the previous
+ *     A-B coincidence if not.
+ *     As B moves backwards searching for an A-B coincidence, if it enters
+ *     a B-C coincidence, C is moved backwards too.
+ *   A is the matching point to B in the A-B coincidence that B is in.
+ *
+ * The end point of a conflict is determined in a similar way,
+ * except that B is in a coincidence that is at, or *follows*, the
+ * end of the next B-C coincidence.
+ *
+ * Once these coincidences have been enumerated, the endpoints are
+ * optionally moved to be at start-of-line. The start point is moved
+ * backwards and the endpoint forwards. The end-of-line must be in an
+ * A-B coincidence and may be in C if there is also a B-C coincidence.
+ *
+ * The next step is to merge adjacent conflicts where the B point
+ * from one overlaps the next.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "wiggle.h"
+
+/* A point is somewhere either in_a or not in_a (in which case, in C).
+ * if in_a, c1 points to the next a-b coincidence strictly after pos
+ *          c2 points to the b-c coincidence that contains (possibly as end point) or follows c1.b
+ * if !in_a, c2 points to the b-c coincidence that contains (possibly as endpoint) or follows pos
+ *           c1 points to the a-b coincidence that contains c2.b
+ *
+ * A point is not well defined inside a conflict, though it is at the
+ * 'start' and 'end' of a conflict.
+ *
+ * At the start of the file c1 and c2 will be the first match in A-B and B-C.
+ * If [c1]->a is 0, then !in_a and pos is [c2]->b+x where x is
+ * chosen such that [c1]->b == [c2]->a+x and x < [c2]->len. If such a choice
+ * is not possible, there is a conflict at the start of the file and so we choose
+ * a point as if [c1]->a were not 0.
+ *
+ * If [c1]->a is not 0, then in_a and pos == 0.
+ *
+ * To find the start of file, we set in_a and pos==-1, and advance one step.
+ *
+ * At the end of the file, c1 will be the EOF match in A-B, c2 will be the
+ * EOF match in B-C, !in_a and pos == [c2]->b
+ */
+struct point { int pos, in_a; int c1,c2; };
+
+
+static int tripoint(struct point *here,
+		    struct csl *c1, struct csl *c2,
+		    int *a, int *b, int *c)
+{
+	/* find a, b, and c for 'here'.
+	 * If any are not well defined, return 0.
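Both c1 and c2 are coincidence lists of the shape produced by diff.c's lcsl():
(a, b, len) triples sorted by position and terminated by a zero-length sentinel
at (elcnt, elcnt). A toy example, for illustration only, with word lists
A = "x y z w" and B = "x q z w":

    #include <stdio.h>

    /* mirrors struct csl from wiggle.h: A[a..a+len) matches B[b..b+len) */
    struct csl_toy { int a, b, len; };

    int main(void)
    {
        /* "x" matches at 0,0; "z w" matches at 2,2; then the sentinel */
        struct csl_toy list[] = { {0, 0, 1}, {2, 2, 2}, {4, 4, 0} };
        struct csl_toy *c;

        for (c = list; c->len; c++)
            printf("A[%d..%d) == B[%d..%d)\n",
                   c->a, c->a + c->len, c->b, c->b + c->len);
        return 0;
    }

tripoint() below resolves a point into coordinates in all three files by
indexing two such lists, one for A-B and one for B-C.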
+ */ + c1 += here->c1; + c2 += here->c2; + + if (here->in_a) { + *a = here->pos; + + if (here->c1 < 0) { + if (*a) return 0; + *b = 0; + } else if (c1->a <= *a && c1->a+c1->len >= *a) + *b = c1->b + (*a - c1->a); + else + return 0; + + if (here->c2 < 0) { + if (*b) return 0; + *c = 0; + } else if (c2->a <= *b && c2->a + c2->len >= *b) + *c = c2->b + *b - c2->a; + else + return 0; + } else { + *c = here->pos; + + if (here->c2 < 0) { + if (*c) return 0; + *b = 0; + } else if (c2->b <= *c && c2->b +c2->len >= *c) + *b = c2->a + *c - c2->b; + else + return 0; + + + if (here->c1 < 0) { + if (*b) return 0; + *a = 0; + } else if (c1->b <= *b && c1->b + c1->len >= *b) + *a = c1->a + *b - c1->b; + else + return 0; + } + return 1; +} + +static int retreat(struct csl *c1, struct csl *c2, struct point *p) +{ + int a,b,c; + int slid = 0; + + retry: + if (p->in_a) { + /* retreat c1 to first coincidence containing or after pos */ + a = p->pos; + while ((p->c1 == 0 && a == 0) || + (p->c1 > 0 && c1[p->c1-1].a + c1[p->c1-1].len >= a)) { + if (!slid) + if ( a >= c1[p->c1].a) + break; + p->c1--; + } + + /* if we aren't in a co-incidence, just return */ + if (p->c1 >=0 && + c1[p->c1].a > a) + return 1; + + /* retreat c2 to first coincidence containing or after pos->b */ + if (p->c1 == -1) + b = 0; + else + b = c1[p->c1].b + a - c1[p->c1].a; + while ((p->c2 == 0 && b == 0) || + (p->c1 > 0 && c2[p->c2-1].a + c2[p->c2-1].len >= b)) { + if (!slid) + if (b >= c2[p->c2].a) + break; + p->c2--; + } + + /* check if this is a conflict */ + if ((p->c2>=0 && c2[p->c2].a > b)) + return 2; + + if (p->c2 == -1) + c = 0; + else + c = c2[p->c2].b + b - c2[p->c2].a; + + /* ok, this is the furthest backward double coincidence + * if we are not at the start of the A-B coincidence, + * slip up to C + */ + if (p->c1 >= 0 && a > c1[p->c1].a) { + p->in_a = 0; + p->pos = c; + slid = 1; + goto retry; + } + } else { + /* retreat c2 to first coincidence containing or after pos */ + c = p->pos; + while ((p->c2 == 0 && c == 0) || + (p->c2 > 0 && c2[p->c2-1].b + c2[p->c2-1].len >= c)) { + if (!slid) + if (c >= c2[p->c2].b) + break; + p->c2--; + } + + /* if we aren't in a coincidence, return */ + if (p->c2 >= 0 && + c2[p->c2].b > c) + return 1; + + /* retreat c1 to first coincidence containing or afer pos->b */ + if (p->c2 == -1) + b = 0; + else + b = c2[p->c2].a + c - c2[p->c2].b; + while ((p->c1==0 && b == 0) || + (p->c1 > 0 && c1[p->c1-1].b + c1[p->c1-1].len >= b)) { + if (!slid) + if (b >= c1[p->c1].b) + break; + p->c1--; + } + + /* check if this is a conflict */ + if ((p->c1>=0 && c1[p->c1].b > b)) + return 2; + + if (p->c1 == -1) + a = 0; + else + a = c1[p->c1].a + b - c1[p->c1].b; + + /* ok, this is the furthest backward double coincidence + * if we are at the start of the A-B coincidence, slide down to A + */ + if (p->c1 == -1 || + a == c1[p->c1].a) { + p->in_a = 1; + p->pos = a; + slid = 1; + goto retry; + } + } + if (p->pos == 0) + return 0; /* StartOfFile */ + + if (!slid) { + slid = 1; + goto retry; + } + + return 1; +} + +static int advance(struct csl *c1, struct csl *c2, struct point *p) +{ + int a,b,c; + int slid = 0; + /* make next char at point is the 'right' one, either in a or c. 
+ * This might involve move p->c1 and p->c2 forward + * and changing pos/in_a to an 'equivalent' point + */ +/* + if (!p->in_a && c2[p->c2].b == p->pos && c2[p->c2].len == 0) + return 0; / * at end of file * / +*/ + retry: + if (p->in_a) { + /* advance c1 to last coincidence containing or before pos */ + a = p->pos; + while ((p->c1 == -1 || c1[p->c1].len) && + c1[p->c1+1].a <= a) { + if (!slid) + if ((p->c1== -1 && a ==0) || + (p->c1>=0 && a <= c1[p->c1].a+c1[p->c1].len)) + break; + p->c1++; + } + + /* if we aren't in a co-incidence, just return */ + if (p->c1 == -1 || c1[p->c1].a+c1[p->c1].len < a) + return 1; + + /* advance c2 to last coincidence containing or before pos->b */ + b = c1[p->c1].b + a- c1[p->c1].a; + while ((p->c2 == -1 || c2[p->c2].len) && + c2[p->c2+1].a <= b) { + if (!slid) + if ((p->c2 == -1 && b == 0) || + (p->c2 >= 0 && b <= c2[p->c2].a+c2[p->c2].len)) + break; + p->c2++; + } + + /* check if this is a conflict */ + if ((p->c2 == -1 && b >0) || + (p->c2>=0 && c2[p->c2].a + c2[p->c2].len < b)) + return 2; + + if (p->c2 == -1) + c = 0; + else + c = c2[p->c2].b + b - c2[p->c2].a; + + /* Ok, this is the furthest forward double coincidence + * If we are at eof, or the next char is in the coincidence + * slip up to c + */ + if (c1[p->c1].len == 0 || + a < c1[p->c1].a + c1[p->c1].len) { + p->in_a = 0; + p->pos = c; + slid = 1; + goto retry; + } + } else { + /* advance c2 to last coincidence containing or before pos */ + c = p->pos; + while ((p->c2 == -1 || c2[p->c2].len) && + c2[p->c2+1].b <= c) { + if (!slid) + if ((p->c2 == -1 && c == 0) || + (p->c2 >= 0 && c <= c2[p->c2].b+c2[p->c2].len)) + break; + p->c2++; + } + + /* if we aren't in a co-incidence then just return */ + if (p->c2 == -1 || c2[p->c2].b+c2[p->c2].len < c) + return 1; + + /* advance c1 to last coincidence containing or before pos->b */ + b = c2[p->c2].a + c - c2[p->c2].b; + while ((p->c1 == -1 || c1[p->c1].len) && + c1[p->c1+1].b <= b) { + if (!slid) + if ((p->c1 == -1 && b ==0) || + (p->c1 >= 0 && b <= c1[p->c1].b+c1[p->c1].len)) + break; + p->c1++; + } + + /* check if this is a conflict */ + if (p->c1 == -1 || c1[p->c1].b + c1[p->c1].len < b) + return 2; + + a = c1[p->c1].a + b - c1[p->c1].b; + + /* ok, this is the furthest forward double coincidence + * If it is the end of an A-B coincidence but not EOF, + * slide down to A + */ + if (a == c1[p->c1].a+ c1[p->c1].len && + c1[p->c1].len) { + p->in_a = 1; + p->pos = a; + slid = 1; + goto retry; + } + } + if (!p->in_a && c2[p->c2].b == p->pos && c2[p->c2].len == 0) + return 0; /* at end of file */ + if (!slid) { + slid = 1; + goto retry; + } + return 1; +} + +static int point_crossed(struct point first, struct point second, + struct csl *cs1, struct csl *cs2) +{ + int a1,b1,c1; + int a2,b2,c2; + + if (tripoint(&first, cs1,cs2, &a1,&b1,&c1) && + tripoint(&second, cs1,cs2, &a2,&b2,&c2)) + return a1>=a2 && b1>=b2 && c1>=c2; + return 0; +/* + return first.in_a == second.in_a && + first.pos == second.pos; +*/ +} + + +static void print_merger(FILE *out, struct file *a, struct file *c, + struct csl *cs1, struct csl *cs2, + struct point start, struct point end) +{ + while (!point_crossed(start, end, cs1,cs2)) { +#if 0 + printf("%c %d (%d,%d)\n", start.in_a?'A':'C', start.pos, start.c1,start.c2); +#endif + if (start.in_a) + printword(out, a->list[start.pos]); + else + printword(out, c->list[start.pos]); + fflush(out); /* DEBUG */ + + start.pos++; + if (point_crossed(start, end, cs1,cs2)) + break; + advance(cs1, cs2, &start); + + } +} + +static int inline at_sol(struct 
file *f, int i) +{ + return i == 0 || i == f->elcnt || + ends_line(f->list[i-1]); +} + +static void print_range(FILE *out, struct file *f, int start, int end) +{ + for (; start < end ; start++) + printword(out, f->list[start]); +} + +static int print_conflict(FILE *out, struct file *a, struct file *b, struct file *c, + struct csl *c1, struct csl *c2, + struct point start, struct point end, + int words) +{ + int astart, bstart, cstart; + int aend, bend, cend; + int bi; + +#if 0 + if (point_same(start,end)) + return 0; /* no conflict here !! */ +#endif + if (!tripoint(&start, c1,c2, &astart, &bstart, &cstart)) + abort(); + if (!tripoint(&end, c1,c2, &aend, &bend, &cend)) + abort(); + + + /* Now contract the conflict if possible, but insist on + * an end-of-line boundary unless 'words'. + */ + /* first contract leading removed text. + * so <<<--- X 1 ||| X 2 === 3 --->>> becomes <<<--- 1 ||| 2 === 3 --->>> + */ + bi = bstart; + while (bi < bend && start.c1 >= 0 && bi >= c1[start.c1].b && bi < c1[start.c1].b + c1[start.c1].len) { + bi++; + if (words || at_sol(b,bi)) { + astart += bi-bstart; + bstart = bi; + } + } + /* and contract trailing removed text */ + bi = bend; + while (bi > bstart && bi > c1[end.c1].b) { + bi--; + if (words || at_sol(b, bi)) { + aend -= bend-bi; + bend = bi; + } + } + + /* now contract leading unmatched text so + * <<<--- 1 ||| X 2 === X 3 --->>> becomes <<<--- 1 ||| 2 === 3 --->>> + */ + bi = bstart; + while (bi < bend && start.c2 >= 0 && bi >= c2[start.c2].a && bi < c2[start.c2].a + c2[start.c2].len) { + bi++; + if (words || at_sol(b,bi)) { + cstart += bi-bstart; + bstart = bi; + } + } + /* and trailing unmatched */ + bi = bend; + while (bi > bstart && bi > c2[end.c2].a) { + bi--; + if (words || at_sol(b,bi)) { + cend -= bend-bi; + bend = bi; + } + } + if (astart >= aend && bstart >= bend && cstart >= cend) + return 0; + + fputs(words?"<<<---":"<<<<<<<\n", out); + print_range(out, a, astart, aend); + fputs(words?"|||":"|||||||\n", out); + print_range(out, b, bstart, bend); + fputs(words?"===":"=======\n", out); + print_range(out, c, cstart, cend); + fputs(words?"--->>>":">>>>>>>\n", out); + return 1; +} + +static int end_of_file(struct point p, struct csl *c1, struct csl *c2) +{ + return advance(c1,c2,&p)==0; +} + +static int next_conflict(struct point here, struct csl *start_c1, struct csl *start_c2, + struct point *start, struct point *end, + struct file *a, struct file *b, struct file *c) +{ + /* We want to find the start and end of the 'next' conflict. + * There may not be another conflict, in which case set start and + * end to the end of the files. + * The start and end of a conflict must be the end and start of + * regions where A matches B and B matches C - except for + * The start which might be the start of the file. + * 'here' is a potentially valid starting point. Any other starting + * point must be the end of a double coincidence. + * + * So we walk c1 and c2 looking for double coincidences and conflicts. + * When we find a conflict, we remember the fact. + * When we find a double coincidence we: + * Set 'end' to the start of the DC. + * If conflict-found - return. + * Set 'start' to the end of the DC. + * If the DC was EOF, start will == end == EOF, and we return. + * + * A double coincidence is easily detected by just looking at a single + * entry in c1 and c2. If + * c1->b+c1->len > c2->a && c2->a+c2->len > c1->b + * || c1->len == c2->len == 0 + * then we have a double coincidence. + * + * A conflict is detected when stepping forward. 
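That double-coincidence test, written out as a predicate (a restatement for
clarity, assuming the struct csl fields used throughout; this mirrors the >=
form used in next_conflict() below, where coincidences meeting exactly at a
point also qualify):

    /* Sketch: do an A-B entry and a B-C entry overlap in B, or are
     * both the zero-length EOF sentinels? */
    static int double_coincidence(const struct csl *c1, const struct csl *c2)
    {
        if (c1->len == 0 && c2->len == 0)
            return 1;                          /* EOF counts as a DC */
        return c1->b + c1->len >= c2->a &&     /* A-B match reaches the B-C match */
               c2->a + c2->len >= c1->b;       /* and vice versa */
    }

With that in hand, conflict detection is a matter of watching the steps
between successive double coincidences: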
+ * If we step c2 forward and the new coincidence is beyond or at the + * end of c1, or we step forward c1 and it's start is beyond or at the end of c2, + * then that is a conflict. + * Also, we can detect a conflict at start-of-file (here.in_a, here.pos==0) if + * c2 doesn't start at 0. + * + * 'here' is significant only for its c1/c2 values. They will contain a + * double coincidence, though it might be start-of-file. + * start must be set to a backward-looking point at the end of a double-coincidence + * and end to a forward-looking point and the start of a double-coincidence + */ + + + int conflict_found = 0; + struct csl *c1 = start_c1; + struct csl *c2 = start_c2; + + + c1 += here.c1; + c2 += here.c2; + + *start = here; + + while (1) { + /* Step one of c1 or c2 forward + * depending on which ends earlier. + * Watch to see if we are stepping over a conflict. + */ + if (c2 < start_c2) { + /* start-of-file. + * Move both c1 and c2 forward. + * + * We have a conflict iff new c1->b > 0 and c2->a > 0 + * or c1->b >0 && c2->b > 0 + */ + c1++; c2++; + if (c1->b > 0 && + (c2->a > 0 || c2->b > 0)) + conflict_found = 1; + if (c2->a+c2->len < c1->b) + conflict_found = 1; + } else if (c1->b+c1->len == c2->a+c2->len) { + /* both coincidences end at same place. There is + * a conflict if there is a gap in c1->b or + * c2->a has no gap but c2->b does (implying insertion + * at undefined location + */ + if (c1->len && c2->len) { + if (c1[1].b > c1->b + c1->len || + (c2[1].a == c2->a + c2->len && + c2[1].b > c2->b + c2->len)) + conflict_found = 1; + } + if (c1->len) + c1++; + if (c2->len) + c2++; + } else if (c2->len ==0 || (c1->len && c1->b+c1->len < c2->a+c2->len)) { + /* c1 ends earlier. If the new start of c1 is + * beyond the current end of c2, we have a conflict + */ + c1++; + if (c1->b > c2->a+c2->len) + conflict_found = 1; + } else { + /* c2 ends earlier. If the new start of c2 is + * beyond the end of c1, we have a conflict. + * Also if the new start of c2 is at the end of c1, + * and the old end of c2 is also at end of c1, + * then have a conflict, as long as there was actually + * something inserted there... + */ + c2++; + if (c2->a > c1->b+c1->len) + conflict_found = 1; + } + if ((c1->len == 0 && c2->len ==0) || + (c1->b+c1->len >= c2->a && c2->a+c2->len >= c1->b) + ) { + /* double coincidence ! + * It starts at max of c1->b and c2->a, in c + * and ends at min of c1->b+len (in a), c2->a+len (in c) + */ + end->c1 = c1-start_c1; + end->c2 = c2-start_c2; + + if (conflict_found) { + /* end->c1/c2 holds the end of the conflict, + * and start->c1/c2 holds the start + * We need to set in_a and pos for each + * so that start is backward-looking and the end + * of a double-coincidence, and end is forward-looking + * at the start of a double-coincidence. + */ + + c1 = start_c1; + c2 = start_c2; + + if (start->c1 == -1) { + start->in_a = 1; + start->pos = 0; + } else if (c1[start->c1].b+c1[start->c1].len <= + c2[start->c2].a+c2[start->c2].len) { + start->in_a = 1; + start->pos = c1[start->c1].a+c1[start->c1].len; + } else { + start->in_a = 0; + start->pos = c2[start->c2].b+c2[start->c2].len; + } + retreat(c1,c2, start); + + if (c1[end->c1].b <= c2[end->c2].a) { + end->in_a = 0; + end->pos = c2[end->c2].b; + } else { + end->in_a = 0; + end->pos = c2[end->c2].b + + c1[end->c1].b - c2[end->c2].a; + } + advance(c1,c2, end); + return 1; + } + start->c1 = c1-start_c1; + start->c2 = c2-start_c2; + + if (c1->len == 0 && c2->len == 0) { + /* eof and no conflict found. 
+ * set start and end to eof + */ + start->in_a = end->in_a = 0; + start->pos = end->pos = c2->b; + return 0; + } + } + } +} + +static int already_applied(struct csl *cs1, struct csl *cs2, + struct point start, struct point end, + struct file *a, struct file *b, struct file *c) +{ + /* check if this conflict reflects and already-applied change + * i.e. the section in a matches the section in b + */ + int a1,b1,c1; + int a2,b2,c2; + + if (!tripoint(&start,cs1,cs2,&a1,&b1,&c1)) + abort(); + if (!tripoint(&end,cs1,cs2,&a2,&b2,&c2)) + abort(); + if (a1==a2 && b1==b2) return 0; + if ((a2-a1) != (c2-c1)) return 0; + + while (a1list[a1], &c->list[c1])) + return 0; + a1++; + c1++; + } + return 1; +} + +static int Startofline(struct point p, struct csl *cs1, struct csl *cs2, + struct file *a, struct file *b, struct file *c) +{ + int a1,b1,c1; + return + tripoint(&p,cs1,cs2,&a1,&b1,&c1) && + at_sol(a,a1) && at_sol(b,b1) && at_sol(c,c1); + +} + +struct ci print_merge(FILE *out, struct file *a, struct file *b, struct file *c, + struct csl *c1, struct csl *c2, + int words) +{ + struct point start_last, end_last, start_next, end_next; + + struct ci rv; + rv.ignored = rv.conflicts = 0; + +#if 0 + { int i; + for (i=0; c1[i].len; i++) printf("%2d c1 %d:%d %d\n", i, c1[i].a,c1[i].b,c1[i].len); + printf("%2d c1 %d:%d END\n", i, c1[i].a,c1[i].b); + for (i=0; c2[i].len; i++) printf("%2d c2 %d:%d %d\n", i, c2[i].a,c2[i].b,c2[i].len); + printf("%2d c2 %d:%d END\n", i, c2[i].a,c2[i].b); + } +#endif + /* end_last is a forward looking point */ + end_last.pos = 0; + end_last.in_a = 1; + end_last.c1 = end_last.c2 = -1; + advance(c1,c2, &end_last); + + /* start_last is a backward looking point */ + start_last.pos = 0; + start_last.in_a = 1; + start_last.c1 = start_last.c2 = 0; + retreat(c1,c2, &start_last); + + while (!end_of_file(end_last, c1, c2)) { + next_conflict(end_last, c1, c2, &start_next, &end_next, a, b, c); + while (already_applied(c1,c2,start_next,end_next,a,b,c)) { + rv.ignored++; + next_conflict(end_next, c1,c2,&start_next,&end_next,a,b,c); + } +#if 0 + printf("start %d %d (%d,%d) end %d %d (%d,%d)\n", + start_next.in_a, start_next.pos, start_next.c1, start_next.c2, + end_next.in_a, end_next.pos, end_next.c1, end_next.c2); +#endif + while (!point_crossed(end_last, start_next,c1,c2) && + !(words || Startofline(end_last, c1,c2, a,b,c))) { + end_last.pos++; + advance(c1,c2, &end_last); + } + + while (!point_crossed(end_last, start_next, c1,c2) && + !(words || Startofline(start_next, c1,c2, a,b,c))) { + start_next.pos--; + retreat(c1,c2, &start_next); + } + + if (point_crossed(end_last, start_next, c1,c2)) { + end_last = end_next; + continue; + } + if (print_conflict(out, a,b,c, c1,c2, start_last, end_last, words)) + rv.conflicts++; + + print_merger(out, a,c, c1,c2, end_last, start_next); + start_last = start_next; + end_last = end_next; + } + if (print_conflict(out,a,b,c, c1,c2, start_last, end_last, words)) + rv.conflicts++; + return rv; +} diff --git a/notes b/notes new file mode 100644 index 0000000..e5922d2 --- /dev/null +++ b/notes @@ -0,0 +1,101 @@ + +Wiggle - wiggle a mis-match patch into a file. + +Given + 1/ a file + 2/ a patch - which is two file fragments + + find the minimal differences between the fragments in the patch + and apply those to the file. + This requires us to do a word-diff of file with frag-A, and + frag-A with frag-B, and the merge the result. + + We read in the file and 2 frags and break them into words and keeping + an index and hash for each. 
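 In code terms the flow might read as below. This is an illustrative
 sketch only: split_words() is a hypothetical stand-in for the word
 splitter in split.c, load_file() is assumed from load.c's description,
 and only split_patch(), diff(), pdiff() and print_merge() use
 signatures that actually appear elsewhere in this patch.

    #include <stdio.h>
    #include "wiggle.h"

    /* Assumed helpers -- names and signatures are guesses; the real
     * ones live in load.c and split.c. */
    extern struct stream load_file(char *name);
    extern struct file split_words(struct stream s);

    static void wiggle_sketch(char *fname, char *pname)
    {
        struct stream file_s = load_file(fname);    /* the file          */
        struct stream patch_s = load_file(pname);   /* the failed patch  */
        struct stream frag_a, frag_b;
        int chunks = split_patch(patch_s, &frag_a, &frag_b);

        struct file a = split_words(file_s);        /* the file          */
        struct file b = split_words(frag_a);        /* patch "before"    */
        struct file c = split_words(frag_b);        /* patch "after"     */

        struct csl *c1 = pdiff(a, b, chunks);       /* best-match file vs before */
        struct csl *c2 = diff(b, c);                /* what the patch changes    */

        print_merge(stdout, &a, &b, &c, c1, c2, 0); /* 0 => line-mode conflicts */
    }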
+ + We then perform the two diffs producing lists of inserts and deletes. + + + +ToDo + + implement --replace + describe and implement correct replacement procedure + Reject matches that have a dis-proportionate cost + implement testing structure. DONE + + + +Testing: + A directory tree containing tests. We look for key files + and run the appropriate test. + Key files are: + script : run that script in that directory + diff : if new exists, diff orig with new + else diff 'orig' with -1 of 'patch' + ldiff : as above, but lines + rediff : rediff 'patch' + merge : if 'patch' merge 'orig' with 'patch' + else merge 'orig' 'new' 'new2' + + +Replacement procedure: + + Goal: Every change between A' and B' must be merged into + A somehow to produce B. + + We can think of changes as additions, deletions, or replacements. + + Every addition must be inserted somewhere, at the site of + best match for the context. If there is no good match... + I guess we insert at start or finish. + + Every deletion is merged either by deleting matching text, + or inserting the string <<<---deleted-text--->>> and some + reasonably appropriate location. + + Every replacement is merged either by removing the original + and replacing by the new, or by inserting + <<<---oldtext///newtext+++>>> + + + For each difference b->c between B and C: + if b precisely aligns with a in A, then replace a with c + else find some set of lines that b maybe is in and produce: + + <<<<<<<<<< + segment from A + |||||||||| + b, upto newlines + ========== + c, upto newlines + >>>>>>>>>> + + + Maybe several (two?) passes. + +-mw orig new new2 in tests/test dies. - FIXED + +in test5, -dw orig new + produces strange output FIXED + + +if no matches are found, core is domps as lcsl is NULL FIXED + +wdiff to look more like udiff + unchanged ++addition +-deletion +|change<<<+++additions+++>>> and <<<---deletions--->>>> + + +@@ line,numbers @@ in diff output + +Speed: us aproxword for pdiff lineup.DONE + +"refine" takes a diff and refines it, sortof + +return a lcsl when reading a patch and refine that +rather than computing from scratch. + +FIXME: pdiff should pick best bit, and rediff the two sides. DONE \ No newline at end of file diff --git a/p b/p new file mode 100755 index 0000000..755e0a2 --- /dev/null +++ b/p @@ -0,0 +1,727 @@ +#!/bin/bash + +# patch management +# +# Copyright (C) 2003 Neil Brown +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Author: Neil Brown +# Email: +# Paper: Neil Brown +# School of Computer Science and Engineering +# The University of New South Wales +# Sydney, 2052 +# Australia + + +# metadata is in .patches +# there is: +# files: list of all files checked out +# name: name of current patch +# status: status of current patch +# notes: notes on current patch +# applied/ patches applied nnn-name +# removed/ patches removed nnn-name +# included/ patches that have been included upstream +# patch: a recent copy of the 'current' patch +# get-version: a script which will report the version number of the base dist +# dest/ symlink to directory to publish snapshots to +# mail/ composed mail messages ready for sending +# maintainer who to email patches to (Linus etc) +# cc who to CC patches to: prefix address +# +# the nnn in names in applied and removed are sequence numbers +# whenever we add a file we choose one more than the highest used number +# patch files contain then name implicitly and start with +# Status: status +# then a blank line, normally a one line description, another blank, and more detail. +# + +# +# Todo - auto bk pull: +# bk pull +# bk export -t patch -r DEVEL, > /tmp/apatch +# bk tag DEVEL +# while p open last && p discard ; do : ; done +# p clean +# patch -p1 -f < /tmp/apatch + +find_home() +{ + # walk up directory tree until a .patches directory + # is found. + # set OrigDir to name of where we were .. not dots. + OrigDir= + dir=`pwd` + while [ ! -d .patches -a " $dir" != " /" ] + do + base=${dir##*/} + base=${base#/} + dir=${dir%/*} + case $dir in + "" ) dir=/ + esac + OrigDir=$base/$OrigDir + cd .. + done + test -d .patches +} + +get_meta() +{ + name=`cat .patches/name 2> /dev/null` + status=`cat .patches/status 2> /dev/null` +} + +forget_one() +{ + if cmp -s "$1" "$1~current~" && cmp -s "$1" "$1~orig~" + then + rm -f "$1~current~" "$1~orig~" + chmod -w "$1" + else + echo >&2 "ERROR $1 doesn't match original" + fi +} + + +snap_one() +{ + cp "$1" "$1~snapshot~" +} + +snap_diff() +{ + diff -u "$1" "$1~snapshot~" +} +snap_back() +{ + cp "$1~snapshot~" "$1" +} + +check_out() +{ + file=$1 + file=${file#./} + [ -f $file ] || >> $file + if [ -f $file ] + then + if [ ! -f "$file~orig~" ] ; then + mv "$file" "$file~orig~" + cp "$file~orig~" "$file" + echo $file >> .patches/files + sort -o .patches/files .patches/files + chmod u+w "$file" + fi + if [ ! 
-f "$file~current~" ] ; then + mv "$file" "$file~current~" + cp "$file~current~" "$file" + fi + else + echo >&2 Cannot checkout $file + fi +} + +all_files() +{ + >> .patches/files + while read file + do eval $1 $file + done < .patches/files +} + +diff_one() +{ + if cmp -s "$1~current~" "$1" + then : + else + echo + echo "diff ./$1~current~ ./$1" + diff --show-c-function -u ./$1~current~ ./$1 + fi +} + +diff_one_orig() +{ + if cmp -s "$1~orig~" "$1" + then : + else + echo + echo "diff ./$1~orig~ ./$1" + diff --show-c-function -u ./$1~orig~ ./$1 + fi +} + +commit_one() +{ + rm -f "$1~current~" + mv "$1" "$1~current~" + cp "$1~current~" $1 + chmod u+w $1 +} + +discard_one() +{ + rm -f "$1" + cp "$1~current~" $1 + chmod u+w $1 +} + +swap_one() +{ + mv "$1" "$1.tmp" + mv "$1~current~" "$1" + mv "$1.tmp" "$1~current~" +} + +make_diff() +{ + { + [ -s .patches/status ] && echo "Status: `cat .patches/status`" + echo + [ -s .patches/notes ] && { cat .patches/notes ; echo; } + all_files diff_one > .patches/tmp + echo " ----------- Diffstat output ------------" + diffstat -p0 2> /dev/null < .patches/tmp + cat .patches/tmp + [ -s .patches/tmp ] || rm .patches/patch + rm .patches/tmp + } > .patches/patch +} + +save_patch() +{ + dir=.patches/$1 + name=$2 + # move .patches/patch to $dir/nnn$name + #for some new nnn + [ -d $dir ] || mkdir $dir || exit 1 + largest=`ls $dir | sed -n -e 's/^\([0-9][0-9][0-9]\).*/\1/p' | sort -n | tail -1` + if [ "0$largest" -eq 999 ] + then echo >&2 'ARRG - too many patches!' ; exit 1 + fi + new=`expr "0$largest" + 1001` + new=${new#1} + mv .patches/patch $dir/$new$name +} + +find_prefix() +{ + # set "prefix" to number for -pn by looking at first file in given patch. + file=`lsdiff $1 | head -1` + orig=$file + prefix=0 + while [ -n "$file" -a ! -f "$file" ] + do + file=`expr "$file" : '[^/]*/\(.*\)'` + prefix=`expr $prefix + 1` + done + if [ -z "$file" ] + then echo "Cannot find $orig" >&2 ; exit 1; + fi + if [ " $orig" != " $file" ] + then + echo "Found $orig as $file - prefix $prefix" + fi +} + +extract_notes() +{ + # remove first line, Status: line, leading blanks, + # everything from ' *---' and trailing blanks + awk ' + BEGIN { head= 1; blanks=0 ; } + head == 1 && ( $1 == "Status:" || $0 == "" ) { + next; + } + { head = 0; } + $0 == "" { blanks++; next; } + $0 ~ /^ *---/ { exit } + { while (blanks > 0) { + blanks--; print ""; + } + print $0; + } + ' $1 +} + + +if [ $# -eq 0 ] +then + echo >&2 'Usage: p [help|co|make|discard|commit|status|name|...] args' + exit 1 +fi +cmd=$1 +shift + +if [ " $cmd" = " help" ] || find_home +then : +else echo >&2 "p $cmd: cannot find .patches directory" + exit 1 +fi + +case $cmd in + co ) + if [ $# -ne 1 ] ; then + echo >&2 Usage: p co file; exit 1 + fi + file=$1 + if [ ! -f "$OrigDir$file" ] + then + echo >&2 "p co: file $file not found"; exit 1; + fi + check_out "$OrigDir$file" + + ;; + make | view ) + case $1 in + "" ) + make_diff + if [ -s .patches/patch ] ; then + pfile=.patches/patch + else + echo >&2 "No current patch" ; exit 1; + fi + ;; + + */* ) pfile=$1;; + * ) pfile=`echo .patches/[ra][ep][mp]*/*$1*` + esac + if [ ! 
-f "$pfile" ] + then echo >&2 "Cannot find unique patch '$1' - found: $pfile"; exit 1; + fi + ${PAGER-less} $pfile; + ;; + + all ) + all_files diff_one_orig + ;; + status | name ) + case $# in + 1 ) + get_meta + if [ $cmd = name ] ; then + if [ -n "$name" ]; then + echo "changing name from '$name' to '$1'" + else + echo "Setting name to '$1'" + fi + echo "$1" > .patches/name + fi + if [ $cmd = status ] ; then + if [ -n "$status" ]; then + echo "changing status from '$status' to '$1'" + else + echo "Setting status to '$1'" + fi + echo "$1" > .patches/status + fi + ;; + 0 ) + get_meta + echo -n "Name ($name)? " ; read name + echo -n "Status ($status)? " ; read status + [ -n "$name" ] && { echo $name > .patches/name ; } + [ -n "$status" ] && { echo $status > .patches/status ; } + ;; + * ) + echo "Usage: p $cmd [new-$cmd]"; exit 1; + esac + ;; + note* ) + >> .patches/notes + ${EDITOR:-vi} .patches/notes + ;; + discard|commit ) + make_diff + if [ -s .patches/patch ] + then : + else echo >&2 No patch to $cmd ; exit 1 + fi + if [ -s .patches/to-resolv ] + then echo "Please resolve outstanding conflicts first with 'p resolve'" + exit 1 + fi + get_meta + if [ -z "$name" ] ; then + echo -n "Name? " ; read name + if [ -z "$name" ] ; then + echo >&2 "No current name, please set with 'p name'" + exit 1; + fi + echo $name > .patches/name + fi + if [ -z "$status" ] ; then + echo -n "Status? " ; read status + if [ -z "$status" ] ; then + echo >&2 "No current status, please set with 'p status'" + exit 1; + fi + echo $status > .patches/status + fi + if [ -s .patches/notes ] + then : + else + { echo "Title...." + echo + echo "Description..." + echo + echo "====Do Not Remove====" + cat .patches/patch + } > .patches/notes + ${EDITOR-vi} .patches/notes + mv .patches/notes .patches/tmp + sed '/^====Do Not Remove====/,$d' .patches/tmp > .patches/notes + rm .patches/tmp + fi + make_diff + + if [ $cmd = commit ] ; then + save_patch applied "$name" + echo Saved as $new$name + all_files commit_one + else + save_patch removed "$name" + echo Saved as $new$name + all_files discard_one + fi + rm -f .patches/name .patches/status .patches/notes + ;; + + purge ) + make_diff + mv .patches/patch .patches/last-purge + all_files discard_one + rm -f .patches/name .patches/status .patches/notes + ;; + open ) + make_diff + get_meta + if [ -s .patches/patch ] + then + echo >&2 Patch $name already open - please commit; exit 1; + fi + if [ $# -eq 0 ] + then + echo "Available patches are:" + ls .patches/applied + exit 0 + fi + if [ $# -ne 1 ] + then echo >&2 "Usage: p open patchname" ; exit 1 + fi + if [ " $1" = " last" ] + then + pfile=`ls -d .patches/applied/[0-9]* | tail -1` + else + pfile=`echo .patches/applied/*$1*` + fi + if [ ! -f "$pfile" ] + then echo >&2 "Cannot find unique patch '$1' - found: $pfile"; exit 1 + fi + # lets see if it applies cleanly + if patch -s --fuzz=0 --dry-run -R -f -p0 < "$pfile" + then echo Ok, it seems to apply + else echo >&2 "Sorry, that patch doesn't apply" ; exit 1 + fi + # lets go for it ... 
+ patch --fuzz=0 -R -f -p0 < "$pfile" + all_files swap_one + sed -n -e '2q' -e 's/^Status: *//p' $pfile > .patches/status + base=${pfile##*/[0-9][0-9][0-9]} + [ -s .patches/name ] || echo $base > .patches/name + extract_notes $pfile >> .patches/notes + mv $pfile .patches/patch + + ;; + included ) + force= + if [ " $1" = " -f" ] ; then + force=yes; shift + fi + make_diff; get_meta + if [ -s .patches/patch ] + then + echo >&2 Patch $name already open, please commit; exit 1; + fi + if [ $# -eq 0 ] + then + echo "Unapplied patches are:" + ls .patches/removed + exit 0; + fi + if [ $# -ne 1 ] + then + echo >&2 "Usage: p included patchname"; exit 1 + fi + case $1 in + last ) pfile=`ls -d .patches/removed/[0-9]* | tail -1` ;; + */* ) echo >&2 "Only local patches can have been included"; exit 1 ;; + *) pfile=`echo .patches/removed/*$1*` + esac + if [ ! -f "$pfile" ] + then echo >&2 "Cannot find unique patch '$1' - found $pfile"; exit 1 + fi + echo "Using $pfile..." + + # make sure patch applies in reverse + if patch -s --fuzz=0 --dry-run -f -p0 -R < "$pfile" + then echo "Yep, that seems to be included" + elif [ -n "$force" ] + then echo "It doesn't apply reverse-out cleanly, but you asked for it..." + else echo >&2 "Sorry, patch cannot be removed"; exit 1 + fi + mv "$pfile" .patches/patch + name=${pfile##*/[0-9][0-9][0-9]} + save_patch included $name + echo "Moved to $new$name" + ;; + list ) + echo "Applied patches are:" + ls .patches/applied + + echo "Unapplied patches are:" + ls .patches/removed + exit 0 + ;; + apply ) + force= append= + if [ " $1" = " -f" ]; then + force=yes; shift + fi + if [ " $1" = " -a" ]; then + append=yes; shift + fi + make_diff + get_meta + if [ -s .patches/patch -a -z "$append" ] + then + echo >&2 Patch $name already open - please commit ; exit 1; + fi + if [ $# -eq 0 ] + then + echo "Unapplied patches are:" + ls .patches/removed + exit 0 + fi + if [ $# -ne 1 ] + then echo >&2 "Usage: p apply patchname"; exit 1 + fi + case $1 in + last ) pfile=`ls -d .patches/removed/[0-9]* | tail -1` ; echo last is "$pfile";; + */* ) pfile=$1 ;; + * ) pfile=`echo .patches/removed/*$1*` + esac + if [ ! -f "$pfile" ] + then echo >&2 "Cannot find unique patch '$1' - found: $pfile"; exit 1 + fi + find_prefix "$pfile" + lsdiff --strip=$prefix "$pfile" | grep -v 'file.*changed' | while read a b + do check_out $a + done + # lets see if it applies cleanly + if patch -s --fuzz=0 --dry-run -f -p$prefix < "$pfile" + then echo OK, it seems to apply + elif [ -n "$force" ] + then echo "It doesn't apply cleanly, but you asked for it...." + echo "Saving original at .patches/last-applied" + cp $pfile .patches/last-applied + else echo >&2 "Sorry, patch doesn't apply"; exit 1 + fi + # lets go for it ... 
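+	# dry run (or -f) accepted: apply for real, capturing the output so
+	# any "saving rejects" messages can be found and wiggled below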
+ patch --fuzz=0 -f -p$prefix < "$pfile" | tee .patches/tmp + sed -n -e '2q' -e 's/^Status: *//p' $pfile > .patches/status + base=${pfile##*/} + base=${base##[0-9][0-9][0-9]} + base=${base##patch-?-} + [ -s .patches/name ] || echo $base > .patches/name + extract_notes $pfile >> .patches/notes + + sed -n -e 's/.*saving rejects to file \(.*\).rej/\1/p' .patches/tmp | + while read file + do echo Wiggling $file.rej into place + rm -f $file.porig + wiggle --replace --merge $file $file.rej || + echo $file >> .patches/to-resolve + done + + case $pfile in + .patches/removed/* ) + mv $pfile .patches/patch + esac + ;; + + publish ) + name=`date -u +%Y-%m-%d:%H` + if [ -d .patches/dest ] + then : good + else echo >&2 No destination specified at .patches/dest ; exit 1; + fi + if [ -d .patches/dest/$name ] + then + echo >&2 $name already exists ; exit 1 + fi + target=.patches/dest/$name + mkdir $target + if [ -f .patches/get-version ] ; + then ./.patches/get-version > $target/version + fi + [ -f .config ] && cp .config $target + cp .patches/applied/* $target + mkdir $target/misc + cp 2> /dev/null .patches/removed/* $target/misc || rmdir $target/misc + chmod -R a+rX $target + all_files diff_one_orig > $target/patch-all-$name + cd $target + echo Published at `/bin/pwd` + ;; + clean ) + all_files forget_one + > .patches/files + ;; + openall ) + while p open last && p discard ; do : ; done + ;; + snapshot ) + all_files snap_one + ;; + snapdiff ) + all_files snap_diff + ;; + snapback ) + all_files snap_back + ;; + resolve ) + if [ ! -s .patches/resolving ] + then sort -u .patches/to-resolve > .patches/resolving ; > .patches/to-resolve + fi + if [ ! -s .patches/resolving ] + then echo "Nothing to resolve" ; exit 0; + fi + echo "Resolving: " ; cat .patches/resolving + for file in `cat .patches/resolving` + do + ${EDITOR:-vi} $file + rm -f $file.porig + wiggle --replace --merge $file || + echo $file >> .patches/to-resolve + done + > .patches/resolving + ;; + pull ) + cd .patches/SOURCE && bk pull + ;; + update ) + p openall && p clean && + (cd .patches/SOURCE ; bk export -tpatch -rLATEST, ) > .patches/imported-patch && + patch --dry-run -f -p1 < .patches/imported-patch && + patch -f -p1 < .patches/imported-patch && + ( rm .patches/imported-patch ; cd .patches/SOURCE ; bk tag LATEST ) + ;; + + premail ) + # Convert some applied patches into email messages. + # Select patches that start with $1. Look in .patches/cc for who to Cc: to + rmdir .patches/mail 2>/dev/null + if [ -d .patches/mail ] ; then + echo >&2 There is already some email - run "email" or "nomail" + ls .patches/mail + exit 1; + fi + mkdir .patches/mail + if [ ! -s .patches/maintainer ] ; then + echo "No maintainer - please add one" + exit 1; + fi + if [ ! 
-s .patches/owner ] ; then
+		echo "Your address and other headers must be in .patches/owner"
+		exit 1;
+	fi
+	cnt=$(ls .patches/applied/???${1}* | wc -l)
+	cnt=$(echo $cnt) # discard spaces
+	this=1
+	for patch in .patches/applied/???${1}*
+	do
+		{
+		sprefix=
+		cat .patches/owner
+		echo "To: `cat .patches/maintainer`"
+		if [ -s .patches/cc ] ; then
+			while read word prefix addr
+			do if [ " $word" = " $1" ] ; then
+				echo "Cc: $addr"
+				sprefix="$prefix - "
+			   fi
+			done < .patches/cc
+		fi
+		head=`sed -e '/^Status/d' -e '/^$/d' -e q $patch`
+		if [ $cnt = 1 ]
+		then
+			echo "Subject: [PATCH] $sprefix $head"
+		else
+			echo "Subject: [PATCH] $sprefix$this of $cnt - $head"
+		fi
+		echo
+		echo '### Comments for ChangeSet'
+		sed -e '1,/^[^S]/d' $patch
+		} > .patches/mail/${patch#.patches/applied/}
+		this=$(expr $this + 1)
+	done
+	ls .patches/mail
+	;;
+
+  nomail )
+	echo "Removing .patches/mail directory"
+	rm -rf .patches/mail
+	;;
+
+  email )
+	PATH=/usr/lib:/usr/sbin:$PATH
+	for i in .patches/mail/*
+	do
+		if [ -f "$i" ]
+		then
+			echo Sending $i.
+			sendmail -t < $i && rm $i
+		fi
+	done
+	;;
+  help )
+	helpfile=$0.help
+	if [ ! -f $helpfile ]
+	then echo >&2 $helpfile not found: no help available ; exit 2;
+	fi
+	if [ -z "$1" ] ; then
+		echo
+		sed -n -e '/^ /p' -e '/^[^ ]/q' $helpfile
+		echo
+		echo "Available help topics are:"
+		sed -n '/^[^ ]/p' $helpfile | sort | column
+	else
+		echo
+		awk '$0 ~ /^[^ ]/ && printed {doprint=0; printed=0}
+		     doprint && $0 !~ /^[^ ]/ {print; printed=1;}
+		     $0 == "'$1'" {doprint=1; found=1}
+		     END { if (!found) print "No help available for '$1'"; }
+		' $helpfile
+		echo
+	fi
+	;;
+  * )
+	echo >&2 "p $cmd - unknown command - try 'p help'"; exit 1;
+esac
+exit 0;
diff --git a/p.help b/p.help
new file mode 100644
index 0000000..fc22772
--- /dev/null
+++ b/p.help
@@ -0,0 +1,327 @@
+  p is a tool for managing patches.  It contains many
+  subcommands.  To use a particular subcommand, give it
+  as the first argument to p, and then give any arguments
+  that subcommand requires.
+
+files
+  p keeps all its files and patches in a subdirectory of
+  the top-level directory of a project.  This subdirectory
+  is called ".patches".  It is often convenient for
+  ".patches" to actually be a symbolic link to somewhere
+  else altogether.
+
+  The files and directories contained in .patches are:
+    applied/      A directory containing applied patches
+    removed/      A directory containing removed patches
+    included/     A directory containing included patches
+       Files in these directories are prefixed by a 3-digit number
+       which indicates the order in which patches were added.
+       The remainder of the filename is the name of the patch.
+       Each file contains:
+          Status: status
+          ... notes ...
+          ... diffstat output ...
+          the actual patch
+    name          A file containing the name of the current patch
+    status        A file containing the status of the current patch
+    notes         A file with notes about the patch
+    patch         A recently generated copy of the current patch
+    files         A list of files that are 'checked out'
+    to-resolve    A list of files that might have conflicts that need resolving
+    tmp           A temporary file
+    last-applied  The most recently applied patch that had conflicts
+    last-purge    The patch most recently discarded by 'p purge'
+    dest/         A directory where 'p publish' puts patch sets.
+    SOURCE/       A directory where a bk repository lives.
+    mail/         A directory of patches converted to email messages
+    cc            A file listing: prefix name emailaddr
+                  When mailing patches which start with prefix, name
+                  is put on the subject line, and the mail is cc:ed to
+                  emailaddr
+    maintainer    This is where patches are mailed to
+    owner         These mail headers are included in each mail message
+    get-version   A script to get a base version number for use when publishing
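+
+  For illustration, in a project with two committed patches and one
+  removed patch (made-up names) you might see:
+
+     $ ls .patches/applied .patches/removed
+     .patches/applied:
+     001-first-fix   002-second-fix
+
+     .patches/removed:
+     003-experiment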
+
+
+model
+overview
+  What is 'p' ?
+
+  'p' is a patch management system, not a source code control system.
+  It allows you to create a set of patches against a base release, to
+  annotate those patches with comments, and to revisit and edit patches
+  after they have been committed.
+
+  It also allows you to update the base release that the patches are
+  against, and then re-apply all patches.
+
+  At any time, there are a number of applied patches, a number of
+  removed patches and possibly a current patch.
+  The sets of applied and removed patches act much like stacks.  The current
+  patch can be moved to the top of either (commit or discard), and the top
+  of either stack can be moved to the current patch (open or apply).
+  open and apply actually allow any patch in the corresponding stack to be
+  made current, and assume that the user won't re-order patches that
+  should not be re-ordered.
+
+  To enable 'p' for a project, you simply create a directory called ".patches"
+  in the top level directory of that project.  Files should be checked out
+  ("p co filename") before editing but never need to be checked in.  Applying
+  an external patch automatically checks out all modified files.
+
+  Often it is appropriate to have the .patches directory elsewhere (for
+  example in an http-export directory tree for public access) and have a
+  symlink from .patches to that location.
+
+  p can be run from any subdirectory of a project containing a .patches
+  directory.
+
+  To find out about the contents of the .patches directory, see
+     p help files
+
+  Some common commands are:
+    p co filename   # monitor changes to filename
+    p make          # create and view the current patch
+    p commit        # commit the current patch
+    p discard       # discard current patch, saving it as
+                    # a removed patch
+    p apply         # re-apply a removed patch, or apply
+                    # an external patch
+    p list          # list current patches
+
+co
+  Usage: p co filename
+
+  prepare filename for editing.  This makes sure there is a
+  copy of the file with a ~current~ suffix, and that the file
+  is listed in .patches/files.  This command can be run from
+  a subdirectory of the project, and it will still do the
+  right thing.
+
+make
+view
+  Usage: p make
+         p view [patchnamefragment]
+
+  make and view provide the same functionality.
+  When given a patch name fragment, they will allow the unique
+  patch with that name (either applied or removed) to be viewed
+  (using the pager $PAGER, or less).
+  Without an argument, the current patch is calculated and
+  displayed.  This explains the two names: with no argument,
+  they both make, and view, the current patch.
+
+all
+  Usage: p all
+
+  Generate a composite patch of all currently applied patches.
+  This involves creating a patch from the ~orig~ version of every
+  file to its current version.
+
+status
+name
+
+  Usage: p status [newstatus]
+         p name [newname]
+
+  If a new status or name is given, it is recorded as the current
+  status or name for the current patch.  If no argument is given,
+  the command will prompt for both a new name and a new status.
+  The current value is offered as a default in each case.
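+
+  For example (made-up values; the messages are those p prints when
+  no name or status was previously set):
+
+     $ p name md-bitmap
+     Setting name to 'md-bitmap'
+     $ p status Experimental
+     Setting status to 'Experimental'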
+
+note
+notes
+  Usage: p notes
+
+  Open the notes describing the current patch in an $EDITOR.
+  The notes should contain a simple one-line description,
+  a blank line, and then a detailed description.
+
+discard
+  Usage: p discard
+
+  The current patch is discarded: moved to the .patches/removed
+  directory.  If it doesn't have a name or status, these are
+  prompted for.
+
+commit
+  Usage: p commit
+
+  The current patch is committed: moved to the .patches/applied
+  directory.  If name or status aren't set, these are prompted
+  for.  If no notes have been written, an $EDITOR session is
+  started with a template for some notes.
+  The patch is presented in the file being edited for reference,
+  but will be removed from the notes on exit.
+
+open
+  Usage: p open [last | patch-name-fragment]
+
+  The open command is used to open a previously committed
+  patch for further editing.
+
+  Without any argument, a list of available committed patches
+  is given.
+  If the argument 'last' is given, then the most recently committed
+  patch is opened.
+  Otherwise a unique patch with a name containing the name fragment
+  is opened.  If there is no such unique patch, an error message
+  is given.
+
+included
+  Usage: p included [-f] [last | patch-name-fragment]
+
+  After updating the base release of a project, some of the patches
+  which are currently "removed" may already have been included in that
+  release and so don't need to be maintained any more.
+
+  The "included" command will check if a given patch appears to have
+  been included and if so, moves it to the .patches/included directory.
+  The test is performed by seeing if 'patch' is able to remove the
+  patch.  If it cannot, but you are sure that the patch has been included
+  (the problems patch reports are spurious) then using '-f' will cause
+  the patch to be moved to 'included' anyway.
+
+list
+  Usage: p list
+
+  List all the patches in either 'applied' or 'removed'.
+
+apply
+  Usage: p apply [-f] [-a] [last | patch-name-fragment | filename]
+
+  This command is used for applying a patch to the project.
+  If a patch in 'removed' is given, then it is moved out of 'removed'
+  and is applied.  If a filename is given, the patch in that file is
+  applied but the file is left unchanged.
+
+  When applying a patch, all affected files are checked out first.
+
+  If 'patch' cannot apply the patch without error, 'apply' will fail.
+  Giving the '-f' option will cause 'apply' to apply the patch anyway,
+  and then run 'wiggle' to merge any rejected patch chunks as best
+  as possible.  Any file for which wiggle finds unresolvable conflicts
+  will have its name saved in a file (.patches/to-resolve).  This
+  list is used by the 'p resolve' command.
+
+  Normally, 'apply' will not apply a patch if there is
+  one already open.  However the '-a' option may be given to ask
+  'apply' to "append" the patch to the current patch.
+
+resolve
+  Usage: p resolve
+
+  This is used to resolve any conflicts found by wiggle.  Each file
+  listed in .patches/to-resolve is presented for editing, and then
+  has wiggle run over it again to check that all conflicts have
+  been resolved.
+
+publish
+  Usage: p publish
+
+  The 'publish' command will create a new subdirectory of
+     .patches/dest
+  (which is often a symlink to a web-page area) and copy
+  all current applied and removed patches into that directory.
+  It also creates a complete patch (with "p all") and stores
+  that in the directory.
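+
+  For illustration, a publish run at a made-up date might create:
+
+     .patches/dest/2003-05-22:04/
+        version                   (only if .patches/get-version exists)
+        001-first-fix             (each applied patch)
+        misc/                     (copies of any removed patches)
+        patch-all-2003-05-22:04   (the composite patch)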
+
+clean
+  Usage: p clean
+
+  clean checks that no patches are currently applied, and
+  cleans up any ~current~ or ~orig~ files that have been left
+  in the source tree.  It also removes write permission from
+  all checked-out files.
+
+  It effectively undoes all check-outs.
+
+  It is run as part of 'update' which incorporates upstream
+  changes into a source tree.
+
+openall
+  Usage: p openall
+
+  This command repeatedly runs "p open last && p discard" until
+  that fails, which usually means that all patches have been
+  discarded.  This is part of the preparation for incorporating
+  upstream changes.
+
+snapshot
+  Usage: p snapshot
+
+  This command takes a snapshot of the current patch so that further
+  work can be done on the patch, but it can easily be removed if
+  there are problems.
+
+  This might be used before appending a patch in case something goes
+  wrong in the appending process.
+
+snapdiff
+  Usage: p snapdiff
+
+  Display the differences between the latest snapshot and the current
+  source.
+
+snapback
+  Usage: p snapback
+
+  Revert all changes since the last snapshot.
+
+pull
+  Usage: p pull
+
+  Update the local copy of the official source repository.  This
+  can be found by following the .patches/SOURCE link.
+
+  Currently the code assumes it is a BitKeeper repository and
+  runs "bk pull".  It should be enhanced to recognise CVS and
+  run "cvs update".
+
+update
+  Usage: p update
+
+  This command updates the base release of the package.  To
+  do this it removes all patches (p openall), cleans up (p clean),
+  creates a patch from information in .patches/SOURCE, and applies
+  that patch.  It currently makes no attempt to re-apply any
+  patches, or to "p included" any patches.
+
+  Currently the code assumes a BitKeeper repository and uses
+  "bk export -tpatch -rLATEST," to extract a patch, and then
+  retags the repository with "bk tag LATEST".  It should be
+  enhanced to recognise and work with CVS as well.
+
+premail
+  Usage: p premail [patch-name-prefix]
+
+  This command converts a selection of patches to Email messages.
+  The email messages are stored in .patches/mail.
+  SAY MORE HERE
+
+nomail
+  Usage: p nomail
+
+  Remove the .patches/mail directory and contents.
+
+email
+  Usage: p email
+
+  Send all mail messages in .patches/mail.  On success, each
+  email message is removed.
+
+help
+  Usage: p help [topic]
+
+  Print out help messages, which are contained in a file
+     p.help
+  in the same directory that p was run from.
+  Without a topic, a general introduction and a list of topics
+  is presented.  With a topic, help on that topic is presented.
+
+purge
+  Usage: p purge
+
+  Make a copy of the current patch in .patches/last-purge (just
+  in case) and then purge the current patch completely.
diff --git a/split.c b/split.c
new file mode 100644
index 0000000..b5987bd
--- /dev/null
+++ b/split.c
@@ -0,0 +1,124 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * School of Computer Science and Engineering + * The University of New South Wales + * Sydney, 2052 + * Australia + */ + +/* + * split a stream into words or line + * When splitting into words we can either be approximate or precise. + * Precise mode includes every char in a word. + * Approximate mode excluses white-space words and might unite some special chars + * + * In general, a word is one of: + * string of [A-Za-z0-9_] + * or string of [ \t] + * or single char. + * + * A line is any string that ends with \n + * + * As a special case to allow proper aligning of multiple chunks + * in a patch, a word starting \0 will include 5 chars and a newline + * + * + * We make two passes through the stream. + * Firstly we count the number of item so an array can be allocated, + * then we store start and length of each item in the array + * + */ + +#include "wiggle.h" +#include +#include +#include +#define BITS_PER_LONG 32 + +#include "hash.h" + +static int split_internal(char *start, char *end, int type, struct elmnt *list, int reverse) +{ + int cnt = 0; + + while (start < end) { + char *cp = start; + + if (*cp == '\0' && cp+16 < end && cp[18] == '\n') { + /* special word */ + cp += 19; + } else switch(type) { + case ByLine: + while (cp < end && *cp != '\n') + cp++; + if (cpstart = start; + list->len = cp-start; + if (*start) + list->hash = hash_mem(start, list->len, BITS_PER_LONG); + else + list->hash = atoi(start+1); + if (!reverse) list++; + } + cnt++; + } + start = cp; + } + return cnt; +} + +struct file split_stream(struct stream s, int type, int reverse) +{ + int cnt; + struct file f; + + char *c, *end; + + end = s.body+s.len; + c = s.body; + + cnt = split_internal(c, end, type, NULL, reverse); +/* fprintf(stderr, "cnt %d\n", cnt);*/ + f.list = malloc(cnt*sizeof(struct elmnt)); + + f.elcnt = split_internal(c, end, type, f.list + reverse*cnt, reverse); + return f; +} diff --git a/tests/linux/inode-fullpatch/diff b/tests/linux/inode-fullpatch/diff new file mode 100644 index 0000000..d2a8b0d --- /dev/null +++ b/tests/linux/inode-fullpatch/diff @@ -0,0 +1,1330 @@ +@@ -1,1323 +1,43 @@ +-/* +- * linux/fs/inode.c +- * +- * (C) 1997 Linus Torvalds +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * This is needed for the following functions: +- * - inode_has_buffers +- * - invalidate_inode_buffers +- * - fsync_bdev +- * - invalidate_bdev +- * +- * FIXME: remove all knowledge of the buffer layer from this file +- */ +-#include +- +-/* +- * New inode.c implementation. +- * +- * This implementation has the basic premise of trying +- * to be extremely low-overhead and SMP-safe, yet be +- * simple enough to be "obviously correct". +- * +- * Famous last words. +- */ +- +-/* inode dynamic allocation 1999, Andrea Arcangeli */ +- +-/* #define INODE_PARANOIA 1 */ +-/* #define INODE_DEBUG 1 */ +- +-/* +- * Inode lookup is no longer as critical as it used to be: +- * most of the lookups are going to be through the dcache. 
+- */ +-#define I_HASHBITS i_hash_shift +-#define I_HASHMASK i_hash_mask +- +-static unsigned int i_hash_mask; +-static unsigned int i_hash_shift; +- +-/* +- * Each inode can be on two separate lists. One is +- * the hash list of the inode, used for lookups. The +- * other linked list is the "type" list: +- * "in_use" - valid inode, i_count > 0, i_nlink > 0 +- * "dirty" - as "in_use" but also dirty +- * "unused" - valid inode, i_count = 0 +- * +- * A "dirty" list is maintained for each super block, +- * allowing for low-overhead inode sync() operations. +- */ +- +-LIST_HEAD(inode_in_use); +-LIST_HEAD(inode_unused); +-static struct hlist_head *inode_hashtable; +-static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ +- +-/* +- * A simple spinlock to protect the list manipulations. +- * +- * NOTE! You also have to own the lock if you change +- * the i_state of an inode while it is in use.. +- */ +-spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; +- +-/* +- * iprune_sem provides exclusion between the kswapd or try_to_free_pages +- * icache shrinking path, and the umount path. Without this exclusion, +- * by the time prune_icache calls iput for the inode whose pages it has +- * been invalidating, or by the time it calls clear_inode & destroy_inode +- * from its final dispose_list, the struct super_block they refer to +- * (for inode->i_sb->s_op) may already have been freed and reused. +- */ +-static DECLARE_MUTEX(iprune_sem); +- +-/* +- * Statistics gathering.. +- */ +-struct inodes_stat_t inodes_stat; +- +-static kmem_cache_t * inode_cachep; +- +-static struct inode *alloc_inode(struct super_block *sb) +-{ +- static struct address_space_operations empty_aops; +- static struct inode_operations empty_iops; +- static struct file_operations empty_fops; +- struct inode *inode; +- +- if (sb->s_op->alloc_inode) +- inode = sb->s_op->alloc_inode(sb); +- else +- inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); +- +- if (inode) { +- struct address_space * const mapping = &inode->i_data; +- +- inode->i_sb = sb; +- inode->i_blkbits = sb->s_blocksize_bits; +- inode->i_flags = 0; +- atomic_set(&inode->i_count, 1); +- inode->i_sock = 0; +- inode->i_op = &empty_iops; +- inode->i_fop = &empty_fops; +- inode->i_nlink = 1; +- atomic_set(&inode->i_writecount, 0); +- inode->i_size = 0; +- inode->i_blocks = 0; +- inode->i_bytes = 0; +- inode->i_generation = 0; +- memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); +- inode->i_pipe = NULL; +- inode->i_bdev = NULL; +- inode->i_rdev = to_kdev_t(0); +- inode->i_security = NULL; +- if (security_inode_alloc(inode)) { +- if (inode->i_sb->s_op->destroy_inode) +- inode->i_sb->s_op->destroy_inode(inode); +- else +- kmem_cache_free(inode_cachep, (inode)); +- return NULL; +- } +- +- mapping->a_ops = &empty_aops; +- mapping->host = inode; +- mapping->gfp_mask = GFP_HIGHUSER; +- mapping->dirtied_when = 0; +- mapping->assoc_mapping = NULL; +- mapping->backing_dev_info = &default_backing_dev_info; +- if (sb->s_bdev) +- mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; +- memset(&inode->u, 0, sizeof(inode->u)); +- inode->i_mapping = mapping; +- } +- return inode; +-} +- +-void destroy_inode(struct inode *inode) +-{ +- if (inode_has_buffers(inode)) +- BUG(); +- security_inode_free(inode); +- if (inode->i_sb->s_op->destroy_inode) +- inode->i_sb->s_op->destroy_inode(inode); +- else +- kmem_cache_free(inode_cachep, (inode)); +-} +- +- +-/* +- * These are initializations that only need to be done +- * once, because the fields are 
idempotent across use +- * of the inode, so let the slab aware of that. +- */ +-void inode_init_once(struct inode *inode) +-{ +- memset(inode, 0, sizeof(*inode)); +- INIT_HLIST_NODE(&inode->i_hash); +- INIT_LIST_HEAD(&inode->i_data.clean_pages); +- INIT_LIST_HEAD(&inode->i_data.dirty_pages); +- INIT_LIST_HEAD(&inode->i_data.locked_pages); +- INIT_LIST_HEAD(&inode->i_data.io_pages); +- INIT_LIST_HEAD(&inode->i_dentry); +- INIT_LIST_HEAD(&inode->i_devices); +- sema_init(&inode->i_sem, 1); +- INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); +- rwlock_init(&inode->i_data.page_lock); +- init_MUTEX(&inode->i_data.i_shared_sem); +- INIT_LIST_HEAD(&inode->i_data.private_list); +- spin_lock_init(&inode->i_data.private_lock); +- INIT_LIST_HEAD(&inode->i_data.i_mmap); +- INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); +- spin_lock_init(&inode->i_lock); +-} +- +-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +-{ +- struct inode * inode = (struct inode *) foo; +- +- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == +- SLAB_CTOR_CONSTRUCTOR) +- inode_init_once(inode); +-} +- +-/* +- * inode_lock must be held +- */ +-void __iget(struct inode * inode) +-{ +- if (atomic_read(&inode->i_count)) { +- atomic_inc(&inode->i_count); +- return; +- } +- atomic_inc(&inode->i_count); +- if (!(inode->i_state & (I_DIRTY|I_LOCK))) { +- list_del(&inode->i_list); +- list_add(&inode->i_list, &inode_in_use); +- } +- inodes_stat.nr_unused--; +-} +- +-/** +- * clear_inode - clear an inode +- * @inode: inode to clear +- * +- * This is called by the filesystem to tell us +- * that the inode is no longer useful. We just +- * terminate it with extreme prejudice. +- */ +- +-void clear_inode(struct inode *inode) +-{ +- invalidate_inode_buffers(inode); +- +- if (inode->i_data.nrpages) +- BUG(); +- if (!(inode->i_state & I_FREEING)) +- BUG(); +- if (inode->i_state & I_CLEAR) +- BUG(); +- wait_on_inode(inode); +- DQUOT_DROP(inode); +- if (inode->i_sb && inode->i_sb->s_op->clear_inode) +- inode->i_sb->s_op->clear_inode(inode); +- if (inode->i_bdev) +- bd_forget(inode); +- inode->i_state = I_CLEAR; +-} +- +-/* +- * Dispose-list gets a local list with local inodes in it, so it doesn't +- * need to worry about list corruption and SMP locks. +- */ +-static void dispose_list(struct list_head *head) +-{ +- int nr_disposed = 0; +- +- while (!list_empty(head)) { +- struct inode *inode; +- +- inode = list_entry(head->next, struct inode, i_list); +- list_del(&inode->i_list); +- +- if (inode->i_data.nrpages) +- truncate_inode_pages(&inode->i_data, 0); +- clear_inode(inode); +- destroy_inode(inode); +- nr_disposed++; +- } +- spin_lock(&inode_lock); +- inodes_stat.nr_inodes -= nr_disposed; +- spin_unlock(&inode_lock); +-} +- +-/* +- * Invalidate all inodes for a device. 
+- */ +-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +-{ +- struct list_head *next; +- int busy = 0, count = 0; +- +- next = head->next; +- for (;;) { +- struct list_head * tmp = next; +- struct inode * inode; +- +- next = next->next; +- if (tmp == head) +- break; +- inode = list_entry(tmp, struct inode, i_list); +- if (inode->i_sb != sb) +- continue; +- invalidate_inode_buffers(inode); +- if (!atomic_read(&inode->i_count)) { +- hlist_del_init(&inode->i_hash); +- list_del(&inode->i_list); +- list_add(&inode->i_list, dispose); +- inode->i_state |= I_FREEING; +- count++; +- continue; +- } +- busy = 1; +- } +- /* only unused inodes may be cached with i_count zero */ +- inodes_stat.nr_unused -= count; +- return busy; +-} +- +-/* +- * This is a two-stage process. First we collect all +- * offending inodes onto the throw-away list, and in +- * the second stage we actually dispose of them. This +- * is because we don't want to sleep while messing +- * with the global lists.. +- */ +- +-/** +- * invalidate_inodes - discard the inodes on a device +- * @sb: superblock +- * +- * Discard all of the inodes for a given superblock. If the discard +- * fails because there are busy inodes then a non zero value is returned. +- * If the discard is successful all the inodes have been discarded. +- */ +- +-int invalidate_inodes(struct super_block * sb) +-{ +- int busy; +- LIST_HEAD(throw_away); +- +- down(&iprune_sem); +- spin_lock(&inode_lock); +- busy = invalidate_list(&inode_in_use, sb, &throw_away); +- busy |= invalidate_list(&inode_unused, sb, &throw_away); +- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); +- busy |= invalidate_list(&sb->s_io, sb, &throw_away); +- spin_unlock(&inode_lock); +- +- dispose_list(&throw_away); +- up(&iprune_sem); +- +- return busy; +-} +- +-int invalidate_device(kdev_t dev, int do_sync) +-{ +- struct super_block *sb; +- struct block_device *bdev = bdget(kdev_t_to_nr(dev)); +- int res; +- +- if (!bdev) +- return 0; +- +- if (do_sync) +- fsync_bdev(bdev); +- +- res = 0; +- sb = get_super(bdev); +- if (sb) { +- /* +- * no need to lock the super, get_super holds the +- * read semaphore so the filesystem cannot go away +- * under us (->put_super runs with the write lock +- * hold). +- */ +- shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); +- drop_super(sb); +- } +- invalidate_bdev(bdev, 0); +- bdput(bdev); +- return res; +-} +- +-static int can_unuse(struct inode *inode) +-{ +- if (inode->i_state) +- return 0; +- if (inode_has_buffers(inode)) +- return 0; +- if (atomic_read(&inode->i_count)) +- return 0; +- if (inode->i_data.nrpages) +- return 0; +- return 1; +-} +- +-/* +- * Scan `goal' inodes on the unused list for freeable ones. They are moved to +- * a temporary list and then are freed outside inode_lock by dispose_list(). +- * +- * Any inodes which are pinned purely because of attached pagecache have their +- * pagecache removed. We expect the final iput() on that inode to add it to +- * the front of the inode_unused list. So look for it there and if the +- * inode is still freeable, proceed. The right inode is found 99.9% of the +- * time in testing on a 4-way. +- * +- * If the inode has metadata buffers attached to mapping->private_list then +- * try to remove them. 
+- */ +-static void prune_icache(int nr_to_scan) +-{ +- LIST_HEAD(freeable); +- int nr_pruned = 0; +- int nr_scanned; +- unsigned long reap = 0; +- +- down(&iprune_sem); +- spin_lock(&inode_lock); +- for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { +- struct inode *inode; +- +- if (list_empty(&inode_unused)) +- break; +- +- inode = list_entry(inode_unused.prev, struct inode, i_list); +- +- if (inode->i_state || atomic_read(&inode->i_count)) { +- list_move(&inode->i_list, &inode_unused); +- continue; +- } +- if (inode_has_buffers(inode) || inode->i_data.nrpages) { +- __iget(inode); +- spin_unlock(&inode_lock); +- if (remove_inode_buffers(inode)) +- reap += invalidate_inode_pages(&inode->i_data); +- iput(inode); +- spin_lock(&inode_lock); +- +- if (inode != list_entry(inode_unused.next, +- struct inode, i_list)) +- continue; /* wrong inode or list_empty */ +- if (!can_unuse(inode)) +- continue; +- } +- hlist_del_init(&inode->i_hash); +- list_move(&inode->i_list, &freeable); +- inode->i_state |= I_FREEING; +- nr_pruned++; +- } +- inodes_stat.nr_unused -= nr_pruned; +- spin_unlock(&inode_lock); +- +- dispose_list(&freeable); +- up(&iprune_sem); +- +- if (current_is_kswapd) +- mod_page_state(kswapd_inodesteal, reap); +- else +- mod_page_state(pginodesteal, reap); +-} +- +-/* +- * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, +- * "unused" means that no dentries are referring to the inodes: the files are +- * not open and the dcache references to those inodes have already been +- * reclaimed. +- * +- * This function is passed the number of inodes to scan, and it returns the +- * total number of remaining possibly-reclaimable inodes. +- */ +-static int shrink_icache_memory(int nr, unsigned int gfp_mask) +-{ +- if (nr) { +- /* +- * Nasty deadlock avoidance. We may hold various FS locks, +- * and we don't want to recurse into the FS that called us +- * in clear_inode() and friends.. +- */ +- if (gfp_mask & __GFP_FS) +- prune_icache(nr); +- } ++*** 470,6 **** 1 +| return inodes_stat.<<<--nr_unused-->>><<<++nr_inodes++>>>; + } + + /* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() +- * by hand after calling find_inode now! This simplifies iunique and won't +- * add any additional branch in the common code. +- */ +-static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +-{ +- struct hlist_node *node; +- struct inode * inode = NULL; +- +- hlist_for_each (node, head) { +- prefetch(node->next); +- inode = hlist_entry(node, struct inode, i_hash); +- if (inode->i_sb != sb) ++*** 492,6 **** 2 + continue; + if (!test(inode, data)) + continue; + break; + } +| return<<<-- node ?-->>> inode<<<-- : NULL-->>>; +-} +- +-/* +- * find_inode_fast is the fast path version of find_inode, see the comment at +- * iget_locked for details. +- */ +-static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +-{ +- struct hlist_node *node; +- struct inode * inode = NULL; +- +- hlist_for_each (node, head) { +- prefetch(node->next); +- inode = list_entry(node, struct inode, i_hash); +- if (inode->i_ino != ino) ++*** 517,6 **** 3 + continue; + if (inode->i_sb != sb) + continue; + break; + } +| return<<<-- node ?-->>> inode<<<-- : NULL-->>>; +-} +- +-/** +- * new_inode - obtain an inode +- * @sb: superblock +- * +- * Allocates a new inode for given superblock. 
+- */ +- +-struct inode *new_inode(struct super_block *sb) +-{ +- static unsigned long last_ino; +- struct inode * inode; +- +- spin_lock_prefetch(&inode_lock); +- +- inode = alloc_inode(sb); +- if (inode) { +- spin_lock(&inode_lock); +- inodes_stat.nr_inodes++; +- list_add(&inode->i_list, &inode_in_use); +- inode->i_ino = ++last_ino; +- inode->i_state = 0; +- spin_unlock(&inode_lock); +- } +- return inode; +-} +- +-void unlock_new_inode(struct inode *inode) +-{ +- /* +- * This is special! We do not need the spinlock +- * when clearing I_LOCK, because we're guaranteed +- * that nobody else tries to do anything about the +- * state of the inode when it is locked, as we +- * just created it (so there can be no old holders +- * that haven't tested I_LOCK). +- */ +- inode->i_state &= ~(I_LOCK|I_NEW); +- wake_up_inode(inode); +-} +-EXPORT_SYMBOL(unlock_new_inode); +- +-/* +- * This is called without the inode lock held.. Be careful. +- * +- * We no longer cache the sb_flags in i_flags - see fs.h +- * -- rmk@arm.uk.linux.org +- */ +-static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +-{ +- struct inode * inode; +- +- inode = alloc_inode(sb); +- if (inode) { +- struct inode * old; +- +- spin_lock(&inode_lock); +- /* We released the lock, so.. */ +- old = find_inode(sb, head, test, data); +- if (!old) { +- if (set(inode, data)) +- goto set_failed; +- +- inodes_stat.nr_inodes++; +- list_add(&inode->i_list, &inode_in_use); +- hlist_add_head(&inode->i_hash, head); +- inode->i_state = I_LOCK|I_NEW; +- spin_unlock(&inode_lock); +- +- /* Return the locked inode with I_NEW set, the +- * caller is responsible for filling in the contents +- */ +- return inode; +- } +- +- /* +- * Uhhuh, somebody else created the same inode under +- * us. Use the old inode instead of the one we just +- * allocated. +- */ +- __iget(old); +- spin_unlock(&inode_lock); +- destroy_inode(inode); +- inode = old; +- wait_on_inode(inode); +- } +- return inode; +- +-set_failed: +- spin_unlock(&inode_lock); +- destroy_inode(inode); +- return NULL; +-} +- +-/* +- * get_new_inode_fast is the fast path version of get_new_inode, see the +- * comment at iget_locked for details. +- */ +-static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +-{ +- struct inode * inode; +- +- inode = alloc_inode(sb); +- if (inode) { +- struct inode * old; +- +- spin_lock(&inode_lock); +- /* We released the lock, so.. */ +- old = find_inode_fast(sb, head, ino); +- if (!old) { +- inode->i_ino = ino; +- inodes_stat.nr_inodes++; +- list_add(&inode->i_list, &inode_in_use); +- hlist_add_head(&inode->i_hash, head); +- inode->i_state = I_LOCK|I_NEW; +- spin_unlock(&inode_lock); +- +- /* Return the locked inode with I_NEW set, the +- * caller is responsible for filling in the contents +- */ +- return inode; +- } +- +- /* +- * Uhhuh, somebody else created the same inode under +- * us. Use the old inode instead of the one we just +- * allocated. +- */ +- __iget(old); +- spin_unlock(&inode_lock); +- destroy_inode(inode); +- inode = old; +- wait_on_inode(inode); +- } +- return inode; +-} +- +-static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +-{ +- unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); +- tmp = tmp + (tmp >> I_HASHBITS); +- return tmp & I_HASHMASK; +-} +- +-/* Yeah, I know about quadratic hash. Maybe, later. 
*/ +- +-/** +- * iunique - get a unique inode number +- * @sb: superblock +- * @max_reserved: highest reserved inode number +- * +- * Obtain an inode number that is unique on the system for a given +- * superblock. This is used by file systems that have no natural +- * permanent inode numbering system. An inode number is returned that +- * is higher than the reserved limit but unique. +- * +- * BUGS: +- * With a large number of inodes live on the file system this function +- * currently becomes quite slow. +- */ +- +-ino_t iunique(struct super_block *sb, ino_t max_reserved) +-{ +- static ino_t counter = 0; +- struct inode *inode; +- struct hlist_head * head; +- ino_t res; +- spin_lock(&inode_lock); +-retry: +- if (counter > max_reserved) { +- head = inode_hashtable + hash(sb,counter); +- res = counter++; +- inode = find_inode_fast(sb, head, res); +- if (!inode) { +- spin_unlock(&inode_lock); +- return res; +- } +- } else { +- counter = max_reserved + 1; +- } +- goto retry; +- +-} +- +-struct inode *igrab(struct inode *inode) +-{ +- spin_lock(&inode_lock); +- if (!(inode->i_state & I_FREEING)) +- __iget(inode); +- else +- /* +- * Handle the case where s_op->clear_inode is not been +- * called yet, and somebody is calling igrab +- * while the inode is getting freed. +- */ +- inode = NULL; +- spin_unlock(&inode_lock); +- return inode; +-} +- +-/** +- * ifind - internal function, you want ilookup5() or iget5(). +- * @sb: super block of file system to search +- * @hashval: hash value (usually inode number) to search for +- * @test: callback used for comparisons between inodes +- * @data: opaque data pointer to pass to @test +- * +- * ifind() searches for the inode specified by @hashval and @data in the inode +- * cache. This is a generalized version of ifind_fast() for file systems where +- * the inode number is not sufficient for unique identification of an inode. +- * +- * If the inode is in the cache, the inode is returned with an incremented +- * reference count. +- * +- * Otherwise NULL is returned. +- * +- * Note, @test is called with the inode_lock held, so can't sleep. +- */ +-static inline struct inode *ifind(struct super_block *sb, +- struct hlist_head *head, int (*test)(struct inode *, void *), +- void *data) +-{ +- struct inode *inode; +- +- spin_lock(&inode_lock); +- inode = find_inode(sb, head, test, data); +- if (inode) { +- __iget(inode); +- spin_unlock(&inode_lock); +- wait_on_inode(inode); +- return inode; +- } +- spin_unlock(&inode_lock); +- return NULL; +-} +- +-/** +- * ifind_fast - internal function, you want ilookup() or iget(). +- * @sb: super block of file system to search +- * @ino: inode number to search for +- * +- * ifind_fast() searches for the inode @ino in the inode cache. This is for +- * file systems where the inode number is sufficient for unique identification +- * of an inode. +- * +- * If the inode is in the cache, the inode is returned with an incremented +- * reference count. +- * +- * Otherwise NULL is returned. 
+- */ +-static inline struct inode *ifind_fast(struct super_block *sb, +- struct hlist_head *head, unsigned long ino) +-{ +- struct inode *inode; +- +- spin_lock(&inode_lock); +- inode = find_inode_fast(sb, head, ino); +- if (inode) { +- __iget(inode); +- spin_unlock(&inode_lock); +- wait_on_inode(inode); +- return inode; +- } +- spin_unlock(&inode_lock); +- return NULL; +-} +- +-/** +- * ilookup5 - search for an inode in the inode cache +- * @sb: super block of file system to search +- * @hashval: hash value (usually inode number) to search for +- * @test: callback used for comparisons between inodes +- * @data: opaque data pointer to pass to @test +- * +- * ilookup5() uses ifind() to search for the inode specified by @hashval and +- * @data in the inode cache. This is a generalized version of ilookup() for +- * file systems where the inode number is not sufficient for unique +- * identification of an inode. +- * +- * If the inode is in the cache, the inode is returned with an incremented +- * reference count. +- * +- * Otherwise NULL is returned. +- * +- * Note, @test is called with the inode_lock held, so can't sleep. +- */ +-struct inode *ilookup5(struct super_block *sb, unsigned long hashval, +- int (*test)(struct inode *, void *), void *data) +-{ +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); +- +- return ifind(sb, head, test, data); +-} +-EXPORT_SYMBOL(ilookup5); +- +-/** +- * ilookup - search for an inode in the inode cache +- * @sb: super block of file system to search +- * @ino: inode number to search for +- * +- * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. +- * This is for file systems where the inode number is sufficient for unique +- * identification of an inode. +- * +- * If the inode is in the cache, the inode is returned with an incremented +- * reference count. +- * +- * Otherwise NULL is returned. +- */ +-struct inode *ilookup(struct super_block *sb, unsigned long ino) +-{ +- struct hlist_head *head = inode_hashtable + hash(sb, ino); +- +- return ifind_fast(sb, head, ino); +-} +-EXPORT_SYMBOL(ilookup); +- +-/** +- * iget5_locked - obtain an inode from a mounted file system +- * @sb: super block of file system +- * @hashval: hash value (usually inode number) to get +- * @test: callback used for comparisons between inodes +- * @set: callback used to initialize a new struct inode +- * @data: opaque data pointer to pass to @test and @set +- * +- * This is iget() without the read_inode() portion of get_new_inode(). +- * +- * iget5_locked() uses ifind() to search for the inode specified by @hashval +- * and @data in the inode cache and if present it is returned with an increased +- * reference count. This is a generalized version of iget_locked() for file +- * systems where the inode number is not sufficient for unique identification +- * of an inode. +- * +- * If the inode is not in cache, get_new_inode() is called to allocate a new +- * inode and this is returned locked, hashed, and with the I_NEW flag set. The +- * file system gets to fill it in before unlocking it via unlock_new_inode(). +- * +- * Note both @test and @set are called with the inode_lock held, so can't sleep. 
+- */ +-struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, +- int (*test)(struct inode *, void *), +- int (*set)(struct inode *, void *), void *data) +-{ +- struct hlist_head *head = inode_hashtable + hash(sb, hashval); +- struct inode *inode; +- +- inode = ifind(sb, head, test, data); +- if (inode) +- return inode; +- /* +- * get_new_inode() will do the right thing, re-trying the search +- * in case it had to block at any point. +- */ +- return get_new_inode(sb, head, test, set, data); +-} +-EXPORT_SYMBOL(iget5_locked); +- +-/** +- * iget_locked - obtain an inode from a mounted file system +- * @sb: super block of file system +- * @ino: inode number to get +- * +- * This is iget() without the read_inode() portion of get_new_inode_fast(). +- * +- * iget_locked() uses ifind_fast() to search for the inode specified by @ino in +- * the inode cache and if present it is returned with an increased reference +- * count. This is for file systems where the inode number is sufficient for +- * unique identification of an inode. +- * +- * If the inode is not in cache, get_new_inode_fast() is called to allocate a +- * new inode and this is returned locked, hashed, and with the I_NEW flag set. +- * The file system gets to fill it in before unlocking it via +- * unlock_new_inode(). +- */ +-struct inode *iget_locked(struct super_block *sb, unsigned long ino) +-{ +- struct hlist_head *head = inode_hashtable + hash(sb, ino); +- struct inode *inode; +- +- inode = ifind_fast(sb, head, ino); +- if (inode) +- return inode; +- /* +- * get_new_inode_fast() will do the right thing, re-trying the search +- * in case it had to block at any point. +- */ +- return get_new_inode_fast(sb, head, ino); +-} +-EXPORT_SYMBOL(iget_locked); +- +-/** +- * __insert_inode_hash - hash an inode +- * @inode: unhashed inode +- * @hashval: unsigned long value used to locate this object in the +- * inode_hashtable. +- * +- * Add an inode to the inode hash for this superblock. If the inode +- * has no superblock it is added to a separate anonymous chain. +- */ +- +-void __insert_inode_hash(struct inode *inode, unsigned long hashval) +-{ +- struct hlist_head *head = &anon_hash_chain; +- if (inode->i_sb) +- head = inode_hashtable + hash(inode->i_sb, hashval); +- spin_lock(&inode_lock); +- hlist_add_head(&inode->i_hash, head); +- spin_unlock(&inode_lock); +-} +- +-/** +- * remove_inode_hash - remove an inode from the hash +- * @inode: inode to unhash +- * +- * Remove an inode from the superblock or anonymous hash. 
+- */ +- +-void remove_inode_hash(struct inode *inode) +-{ +- spin_lock(&inode_lock); +- hlist_del_init(&inode->i_hash); +- spin_unlock(&inode_lock); +-} +- +-void generic_delete_inode(struct inode *inode) ++*** 949,7 **** 4 + { + struct super_operations *op = inode->i_sb->s_op; + +| <<<--hlist_del_init-->>><<<++list_del_init++>>>(&inode->i_hash); + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; +- spin_unlock(&inode_lock); +- +- if (inode->i_data.nrpages) +- truncate_inode_pages(&inode->i_data, 0); +- +- security_inode_delete(inode); +- +- if (op->delete_inode) { +- void (*delete)(struct inode *) = op->delete_inode; +- if (!is_bad_inode(inode)) +- DQUOT_INIT(inode); +- /* s_op->delete_inode internally recalls clear_inode() */ ++*** 968,6 **** 5 + delete(inode); + } else + clear_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +-} +-EXPORT_SYMBOL(generic_delete_inode); +- +-static void generic_forget_inode(struct inode *inode) +-{ +- struct super_block *sb = inode->i_sb; +- +- if (!hlist_unhashed(&inode->i_hash)) { +- if (!(inode->i_state & (I_DIRTY|I_LOCK))) { +- list_del(&inode->i_list); +- list_add(&inode->i_list, &inode_unused); +- } +- inodes_stat.nr_unused++; +- spin_unlock(&inode_lock); +- if (!sb || (sb->s_flags & MS_ACTIVE)) +- return; +- write_inode_now(inode, 1); +- spin_lock(&inode_lock); +- inodes_stat.nr_unused--; +- hlist_del_init(&inode->i_hash); +- } +- list_del_init(&inode->i_list); +- inode->i_state|=I_FREEING; +- inodes_stat.nr_inodes--; +- spin_unlock(&inode_lock); +- if (inode->i_data.nrpages) +- truncate_inode_pages(&inode->i_data, 0); +- clear_inode(inode); +- destroy_inode(inode); +-} +- +-/* +- * Normal UNIX filesystem behaviour: delete the +- * inode when the usage count drops to zero, and +- * i_nlink is zero. +- */ +-static void generic_drop_inode(struct inode *inode) +-{ +- if (!inode->i_nlink) +- generic_delete_inode(inode); +- else +- generic_forget_inode(inode); +-} +- +-/* +- * Called when we're dropping the last reference +- * to an inode. +- * +- * Call the FS "drop()" function, defaulting to +- * the legacy UNIX filesystem behaviour.. +- * +- * NOTE! NOTE! NOTE! We're called with the inode lock +- * held, and the drop function is supposed to release +- * the lock! +- */ +-static inline void iput_final(struct inode *inode) +-{ +- struct super_operations *op = inode->i_sb->s_op; +- void (*drop)(struct inode *) = generic_drop_inode; +- +- if (op && op->drop_inode) +- drop = op->drop_inode; +- drop(inode); +-} +- +-/** +- * iput - put an inode +- * @inode: inode to put +- * +- * Puts an inode, dropping its usage count. If the inode use count hits +- * zero the inode is also then freed and may be destroyed. +- */ +- +-void iput(struct inode *inode) +-{ +- if (inode) { +- struct super_operations *op = inode->i_sb->s_op; +- +- if (inode->i_state == I_CLEAR) +- BUG(); +- +- if (op && op->put_inode) +- op->put_inode(inode); +- +- if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) +- iput_final(inode); +- } +-} +- +-/** +- * bmap - find a block number in a file +- * @inode: inode of file +- * @block: block to find +- * +- * Returns the block number on the device holding the inode that +- * is the disk block number for the block of the file requested. +- * That is, asked for block 4 of inode 1 the function will return the +- * disk block relative to the disk start that holds that block of the +- * file. 
+- */ +- +-sector_t bmap(struct inode * inode, sector_t block) +-{ +- sector_t res = 0; +- if (inode->i_mapping->a_ops->bmap) +- res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); +- return res; +-} +- +-/* +- * Return true if the filesystem which backs this inode considers the two +- * passed timespecs to be sufficiently different to warrant flushing the +- * altered time out to disk. +- */ +-static int inode_times_differ(struct inode *inode, +- struct timespec *old, struct timespec *new) +-{ +- if (IS_ONE_SECOND(inode)) +- return old->tv_sec != new->tv_sec; +- return !timespec_equal(old, new); +-} +- +-/** +- * update_atime - update the access time +- * @inode: inode accessed +- * +- * Update the accessed time on an inode and mark it for writeback. +- * This function automatically handles read only file systems and media, +- * as well as the "noatime" flag and inode specific "noatime" markers. +- */ +- +-void update_atime(struct inode *inode) +-{ +- struct timespec now; +- +- if (IS_NOATIME(inode)) +- return; +- if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) +- return; +- if (IS_RDONLY(inode)) +- return; +- +- now = current_kernel_time(); +- if (inode_times_differ(inode, &inode->i_atime, &now)) { +- inode->i_atime = now; +- mark_inode_dirty_sync(inode); +- } else { +- if (!timespec_equal(&inode->i_atime, &now)) +- inode->i_atime = now; +- } +-} +- +-/** +- * inode_update_time - update mtime and ctime time +- * @inode: inode accessed +- * @ctime_too: update ctime too +- * +- * Update the mtime time on an inode and mark it for writeback. +- * When ctime_too is specified update the ctime too. +- */ +- +-void inode_update_time(struct inode *inode, int ctime_too) +-{ +- struct timespec now = current_kernel_time(); +- int sync_it = 0; +- +- if (inode_times_differ(inode, &inode->i_mtime, &now)) +- sync_it = 1; +- inode->i_mtime = now; +- +- if (ctime_too) { +- if (inode_times_differ(inode, &inode->i_ctime, &now)) +- sync_it = 1; +- inode->i_ctime = now; +- } +- if (sync_it) +- mark_inode_dirty_sync(inode); +-} +-EXPORT_SYMBOL(inode_update_time); +- +-int inode_needs_sync(struct inode *inode) +-{ +- if (IS_SYNC(inode)) +- return 1; +- if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) +- return 1; +- return 0; +-} +-EXPORT_SYMBOL(inode_needs_sync); +- +-/* +- * Quota functions that want to walk the inode lists.. +- */ +-#ifdef CONFIG_QUOTA +- +-/* Functions back in dquot.c */ +-void put_dquot_list(struct list_head *); +-int remove_inode_dquot_ref(struct inode *, int, struct list_head *); +- +-void remove_dquot_ref(struct super_block *sb, int type) +-{ +- struct inode *inode; +- struct list_head *act_head; +- LIST_HEAD(tofree_head); +- +- if (!sb->dq_op) +- return; /* nothing to do */ +- spin_lock(&inode_lock); /* This lock is for inodes code */ +- /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/
+-
+- list_for_each(act_head, &inode_in_use) {
+- inode = list_entry(act_head, struct inode, i_list);
+- if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+- remove_inode_dquot_ref(inode, type, &tofree_head);
+- }
+- list_for_each(act_head, &inode_unused) {
+- inode = list_entry(act_head, struct inode, i_list);
+- if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+- remove_inode_dquot_ref(inode, type, &tofree_head);
+- }
+- list_for_each(act_head, &sb->s_dirty) {
+- inode = list_entry(act_head, struct inode, i_list);
+- if (IS_QUOTAINIT(inode))
+- remove_inode_dquot_ref(inode, type, &tofree_head);
+- }
+- list_for_each(act_head, &sb->s_io) {
+- inode = list_entry(act_head, struct inode, i_list);
+- if (IS_QUOTAINIT(inode))
+- remove_inode_dquot_ref(inode, type, &tofree_head);
+- }
+- spin_unlock(&inode_lock);
+-
+- put_dquot_list(&tofree_head);
+-}
+-
+-#endif
+-
+-/*
+- * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+- * kernel doesn't lock many inodes at the same time.
+- */
+-#define I_WAIT_TABLE_ORDER 3
+-static struct i_wait_queue_head {
+- wait_queue_head_t wqh;
+-} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+-
+-/*
+- * Return the address of the waitqueue_head to be used for this inode
+- */
+-static wait_queue_head_t *i_waitq_head(struct inode *inode)
+-{
+- return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+-}
+-
+-void __wait_on_inode(struct inode *inode)
+-{
+- DECLARE_WAITQUEUE(wait, current);
+- wait_queue_head_t *wq = i_waitq_head(inode);
+-
+- add_wait_queue(wq, &wait);
+-repeat:
+- set_current_state(TASK_UNINTERRUPTIBLE);
+- if (inode->i_state & I_LOCK) {
+- schedule();
+- goto repeat;
+- }
+- remove_wait_queue(wq, &wait);
+|<<<-- __set_current_state(-->>><<<++*** 1219,6 **** 6
+| current->state = ++>>>TASK_RUNNING<<<--)-->>>;
+ }
+
+ void wake_up_inode(struct inode *inode)
+ {
+ wait_queue_head_t *wq = i_waitq_head(inode);
+-
+- /*
+- * Prevent speculative execution through spin_unlock(&inode_lock);
+- */
+- smp_mb();
+- if (waitqueue_active(wq))
+- wake_up_all(wq);
+-}
+-
+-/*
+- * Initialize the waitqueues and inode hash table.
+- */
+-void __init inode_init(unsigned long mempages)
+-{
+- struct hlist_head *head;
+- unsigned long order;
+- unsigned int nr_hash;
+- int i;
+-
+- for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+- init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+-
+- mempages >>= (14 - PAGE_SHIFT);
+- mempages *= sizeof(struct list_head);
+- for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+- ;
+-
+- do {
+- unsigned long tmp;
+-
+- nr_hash = (1UL << order) * PAGE_SIZE /
+- sizeof(struct hlist_head);
+- i_hash_mask = (nr_hash - 1);
+-
+- tmp = nr_hash;
+- i_hash_shift = 0;
+- while ((tmp >>= 1UL) != 0UL)
+- i_hash_shift++;
+-
+- inode_hashtable = (struct hlist_head *)
+- __get_free_pages(GFP_ATOMIC, order);
+- } while (inode_hashtable == NULL && --order >= 0);
+-
+- printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+- nr_hash, order, (PAGE_SIZE << order));
+-
+- if (!inode_hashtable)
+- panic("Failed to allocate inode hash table\n");
+-
+- head = inode_hashtable;
+- i = nr_hash;
+- do {
+- INIT_HLIST_HEAD(head);
+- head++;
+- i--;
+- } while (i);
+-
+- /* inode slab cache */
+- inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+- 0, SLAB_HWCACHE_ALIGN, init_once,
+- NULL);
+- if (!inode_cachep)
+- panic("cannot create inode slab cache");
+-
+- set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+-}
+-
+-void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+-{
+- inode->i_mode = mode;
+- if (S_ISCHR(mode)) {
+- inode->i_fop = &def_chr_fops;
+- inode->i_rdev = to_kdev_t(rdev);
+- } else if (S_ISBLK(mode)) {
+- inode->i_fop = &def_blk_fops;
+- inode->i_rdev = to_kdev_t(rdev);
+- } else if (S_ISFIFO(mode))
+- inode->i_fop = &def_fifo_fops;
+- else if (S_ISSOCK(mode))
+- inode->i_fop = &bad_sock_fops;
+- else
+- printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+- mode);
+-}
diff
--git a/tests/linux/inode-fullpatch/merge b/tests/linux/inode-fullpatch/merge new file mode 100644 index 0000000..685b14e --- /dev/null +++ b/tests/linux/inode-fullpatch/merge @@ -0,0 +1,1358 @@ +/* + * linux/fs/inode.c + * + * (C) 1997 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This is needed for the following functions: + * - inode_has_buffers + * - invalidate_inode_buffers + * - fsync_bdev + * - invalidate_bdev + * + * FIXME: remove all knowledge of the buffer layer from this file + */ +#include + +/* + * New inode.c implementation. + * + * This implementation has the basic premise of trying + * to be extremely low-overhead and SMP-safe, yet be + * simple enough to be "obviously correct". + * + * Famous last words. + */ + +/* inode dynamic allocation 1999, Andrea Arcangeli */ + +/* #define INODE_PARANOIA 1 */ +/* #define INODE_DEBUG 1 */ + +/* + * Inode lookup is no longer as critical as it used to be: + * most of the lookups are going to be through the dcache. + */ +#define I_HASHBITS i_hash_shift +#define I_HASHMASK i_hash_mask + +static unsigned int i_hash_mask; +static unsigned int i_hash_shift; + +/* + * Each inode can be on two separate lists. One is + * the hash list of the inode, used for lookups. The + * other linked list is the "type" list: + * "in_use" - valid inode, i_count > 0, i_nlink > 0 + * "dirty" - as "in_use" but also dirty + * "unused" - valid inode, i_count = 0 + * + * A "dirty" list is maintained for each super block, + * allowing for low-overhead inode sync() operations. + */ + +LIST_HEAD(inode_in_use); +LIST_HEAD(inode_unused); +static struct hlist_head *inode_hashtable; +static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ + +/* + * A simple spinlock to protect the list manipulations. + * + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; + +/* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages + * icache shrinking path, and the umount path. Without this exclusion, + * by the time prune_icache calls iput for the inode whose pages it has + * been invalidating, or by the time it calls clear_inode & destroy_inode + * from its final dispose_list, the struct super_block they refer to + * (for inode->i_sb->s_op) may already have been freed and reused. + */ +static DECLARE_MUTEX(iprune_sem); + +/* + * Statistics gathering.. 
+ */ +struct inodes_stat_t inodes_stat; + +static kmem_cache_t * inode_cachep; + +static struct inode *alloc_inode(struct super_block *sb) +{ + static struct address_space_operations empty_aops; + static struct inode_operations empty_iops; + static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + + if (inode) { + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_sock = 0; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_rdev = to_kdev_t(0); + inode->i_security = NULL; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + mapping->dirtied_when = 0; + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + if (sb->s_bdev) + mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; + } + return inode; +} + +void destroy_inode(struct inode *inode) +{ + if (inode_has_buffers(inode)) + BUG(); + security_inode_free(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); +} + + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. 
+ */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_data.clean_pages); + INIT_LIST_HEAD(&inode->i_data.dirty_pages); + INIT_LIST_HEAD(&inode->i_data.locked_pages); + INIT_LIST_HEAD(&inode->i_data.io_pages); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + rwlock_init(&inode->i_data.page_lock); + init_MUTEX(&inode->i_data.i_shared_sem); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); + INIT_LIST_HEAD(&inode->i_data.i_mmap); + INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + spin_lock_init(&inode->i_lock); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct inode * inode = (struct inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(inode); +} + +/* + * inode_lock must be held + */ +void __iget(struct inode * inode) +{ + if (atomic_read(&inode->i_count)) { + atomic_inc(&inode->i_count); + return; + } + atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_in_use); + } + inodes_stat.nr_unused--; +} + +/** + * clear_inode - clear an inode + * @inode: inode to clear + * + * This is called by the filesystem to tell us + * that the inode is no longer useful. We just + * terminate it with extreme prejudice. + */ + +void clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + + if (inode->i_data.nrpages) + BUG(); + if (!(inode->i_state & I_FREEING)) + BUG(); + if (inode->i_state & I_CLEAR) + BUG(); + wait_on_inode(inode); + DQUOT_DROP(inode); + if (inode->i_sb && inode->i_sb->s_op->clear_inode) + inode->i_sb->s_op->clear_inode(inode); + if (inode->i_bdev) + bd_forget(inode); + inode->i_state = I_CLEAR; +} + +/* + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + int nr_disposed = 0; + + while (!list_empty(head)) { + struct inode *inode; + + inode = list_entry(head->next, struct inode, i_list); + list_del(&inode->i_list); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); + nr_disposed++; + } + spin_lock(&inode_lock); + inodes_stat.nr_inodes -= nr_disposed; + spin_unlock(&inode_lock); +} + +/* + * Invalidate all inodes for a device. + */ +static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +{ + struct list_head *next; + int busy = 0, count = 0; + + next = head->next; + for (;;) { + struct list_head * tmp = next; + struct inode * inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_list); + if (inode->i_sb != sb) + continue; + invalidate_inode_buffers(inode); + if (!atomic_read(&inode->i_count)) { + hlist_del_init(&inode->i_hash); + list_del(&inode->i_list); + list_add(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + count++; + continue; + } + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; + return busy; +} + +/* + * This is a two-stage process. First we collect all + * offending inodes onto the throw-away list, and in + * the second stage we actually dispose of them. 
This + * is because we don't want to sleep while messing + * with the global lists.. + */ + +/** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +int invalidate_inodes(struct super_block * sb) +{ + int busy; + LIST_HEAD(throw_away); + + down(&iprune_sem); + spin_lock(&inode_lock); + busy = invalidate_list(&inode_in_use, sb, &throw_away); + busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); + up(&iprune_sem); + + return busy; +} + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + struct block_device *bdev = bdget(kdev_t_to_nr(dev)); + int res; + + if (!bdev) + return 0; + + if (do_sync) + fsync_bdev(bdev); + + res = 0; + sb = get_super(bdev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev, 0); + bdput(bdev); + return res; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to + * a temporary list and then are freed outside inode_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. We expect the final iput() on that inode to add it to + * the front of the inode_unused list. So look for it there and if the + * inode is still freeable, proceed. The right inode is found 99.9% of the + * time in testing on a 4-way. + * + * If the inode has metadata buffers attached to mapping->private_list then + * try to remove them. 
+ */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_pruned = 0; + int nr_scanned; + unsigned long reap = 0; + + down(&iprune_sem); + spin_lock(&inode_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_unused)) + break; + + inode = list_entry(inode_unused.prev, struct inode, i_list); + + if (inode->i_state || atomic_read(&inode->i_count)) { + list_move(&inode->i_list, &inode_unused); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_inode_pages(&inode->i_data); + iput(inode); + spin_lock(&inode_lock); + + if (inode != list_entry(inode_unused.next, + struct inode, i_list)) + continue; /* wrong inode or list_empty */ + if (!can_unuse(inode)) + continue; + } + hlist_del_init(&inode->i_hash); + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + nr_pruned++; + } + inodes_stat.nr_unused -= nr_pruned; + spin_unlock(&inode_lock); + + dispose_list(&freeable); + up(&iprune_sem); + + if (current_is_kswapd) + mod_page_state(kswapd_inodesteal, reap); + else + mod_page_state(pginodesteal, reap); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(int nr, unsigned int gfp_mask) +{ + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (gfp_mask & __GFP_FS) + prune_icache(nr); + } + return inodes_stat.nr_unused; +} + +void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() + * by hand after calling find_inode now! This simplifies iunique and won't + * add any additional branch in the common code. + */ +static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = hlist_entry(node, struct inode, i_hash); + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = list_entry(node, struct inode, i_hash); + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. 
+ */ + +struct inode *new_inode(struct super_block *sb) +{ + static unsigned long last_ino; + struct inode * inode; + + spin_lock_prefetch(&inode_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode_lock); + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + inode->i_ino = ++last_ino; + inode->i_state = 0; + spin_unlock(&inode_lock); + } + return inode; +} + +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up_inode(inode); +} +EXPORT_SYMBOL(unlock_new_inode); + +/* + * This is called without the inode lock held.. Be careful. + * + * We no longer cache the sb_flags in i_flags - see fs.h + * -- rmk@arm.uk.linux.org + */ +static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; +} + +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); + tmp = tmp + (tmp >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/* Yeah, I know about quadratic hash. Maybe, later. 
*/ + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ + +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + static ino_t counter = 0; + struct inode *inode; + struct hlist_head * head; + ino_t res; + spin_lock(&inode_lock); +retry: + if (counter > max_reserved) { + head = inode_hashtable + hash(sb,counter); + res = counter++; + inode = find_inode_fast(sb, head, res); + if (!inode) { + spin_unlock(&inode_lock); + return res; + } + } else { + counter = max_reserved + 1; + } + goto retry; + +} + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode_lock); + if (!(inode->i_state & I_FREEING)) + __iget(inode); + else + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + spin_unlock(&inode_lock); + return inode; +} + +/** + * ifind - internal function, you want ilookup5() or iget5(). + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ifind() searches for the inode specified by @hashval and @data in the inode + * cache. This is a generalized version of ifind_fast() for file systems where + * the inode number is not sufficient for unique identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +static inline struct inode *ifind(struct super_block *sb, + struct hlist_head *head, int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode(sb, head, test, data); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ifind_fast - internal function, you want ilookup() or iget(). + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ifind_fast() searches for the inode @ino in the inode cache. This is for + * file systems where the inode number is sufficient for unique identification + * of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. 
+ */ +static inline struct inode *ifind_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data); +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. + * This is for file systems where the inode number is sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + return ifind_fast(sb, head, ino); +} +EXPORT_SYMBOL(ilookup); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is iget() without the read_inode() portion of get_new_inode(). + * + * iget5_locked() uses ifind() to search for the inode specified by @hashval + * and @data in the inode cache and if present it is returned with an increased + * reference count. This is a generalized version of iget_locked() for file + * systems where the inode number is not sufficient for unique identification + * of an inode. + * + * If the inode is not in cache, get_new_inode() is called to allocate a new + * inode and this is returned locked, hashed, and with the I_NEW flag set. The + * file system gets to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_lock held, so can't sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + inode = ifind(sb, head, test, data); + if (inode) + return inode; + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. 
+ */ + return get_new_inode(sb, head, test, set, data); +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * This is iget() without the read_inode() portion of get_new_inode_fast(). + * + * iget_locked() uses ifind_fast() to search for the inode specified by @ino in + * the inode cache and if present it is returned with an increased reference + * count. This is for file systems where the inode number is sufficient for + * unique identification of an inode. + * + * If the inode is not in cache, get_new_inode_fast() is called to allocate a + * new inode and this is returned locked, hashed, and with the I_NEW flag set. + * The file system gets to fill it in before unlocking it via + * unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + inode = ifind_fast(sb, head, ino); + if (inode) + return inode; + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); +} +EXPORT_SYMBOL(iget_locked); + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. If the inode + * has no superblock it is added to a separate anonymous chain. + */ + +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = &anon_hash_chain; + if (inode->i_sb) + head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock or anonymous hash. 
+ */ + +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} + +void generic_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + +<<<<<<< + hlist_del_init(&inode->i_hash); +||||||| + list_del_init(&inode->i_hash); +======= +>>>>>>> + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* s_op->delete_inode internally recalls clear_inode() */ + delete(inode); + } else + clear_inode(inode); + spin_lock(&inode_lock); + list_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +} +EXPORT_SYMBOL(generic_delete_inode); + +static void generic_forget_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + spin_unlock(&inode_lock); + if (!sb || (sb->s_flags & MS_ACTIVE)) + return; + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); + } + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); +} + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +static void generic_drop_inode(struct inode *inode) +{ + if (!inode->i_nlink) + generic_delete_inode(inode); + else + generic_forget_inode(inode); +} + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop()" function, defaulting to + * the legacy UNIX filesystem behaviour.. + * + * NOTE! NOTE! NOTE! We're called with the inode lock + * held, and the drop function is supposed to release + * the lock! + */ +static inline void iput_final(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + void (*drop)(struct inode *) = generic_drop_inode; + + if (op && op->drop_inode) + drop = op->drop_inode; + drop(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero the inode is also then freed and may be destroyed. + */ + +void iput(struct inode *inode) +{ + if (inode) { + struct super_operations *op = inode->i_sb->s_op; + + if (inode->i_state == I_CLEAR) + BUG(); + + if (op && op->put_inode) + op->put_inode(inode); + + if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + iput_final(inode); + } +} + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. 
+ */ + +sector_t bmap(struct inode * inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} + +/* + * Return true if the filesystem which backs this inode considers the two + * passed timespecs to be sufficiently different to warrant flushing the + * altered time out to disk. + */ +static int inode_times_differ(struct inode *inode, + struct timespec *old, struct timespec *new) +{ + if (IS_ONE_SECOND(inode)) + return old->tv_sec != new->tv_sec; + return !timespec_equal(old, new); +} + +/** + * update_atime - update the access time + * @inode: inode accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ + +void update_atime(struct inode *inode) +{ + struct timespec now; + + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + if (inode_times_differ(inode, &inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { + if (!timespec_equal(&inode->i_atime, &now)) + inode->i_atime = now; + } +} + +/** + * inode_update_time - update mtime and ctime time + * @inode: inode accessed + * @ctime_too: update ctime too + * + * Update the mtime time on an inode and mark it for writeback. + * When ctime_too is specified update the ctime too. + */ + +void inode_update_time(struct inode *inode, int ctime_too) +{ + struct timespec now = current_kernel_time(); + int sync_it = 0; + + if (inode_times_differ(inode, &inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { + if (inode_times_differ(inode, &inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } + if (sync_it) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL(inode_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * Quota functions that want to walk the inode lists.. + */ +#ifdef CONFIG_QUOTA + +/* Functions back in dquot.c */ +void put_dquot_list(struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); + +void remove_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode; + struct list_head *act_head; + LIST_HEAD(tofree_head); + + if (!sb->dq_op) + return; /* nothing to do */ + spin_lock(&inode_lock); /* This lock is for inodes code */ + /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/
+
+ list_for_each(act_head, &inode_in_use) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &inode_unused) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_dirty) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_io) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ spin_unlock(&inode_lock);
+
+ put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER 3
+static struct i_wait_queue_head {
+ wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+ return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (inode->i_state & I_LOCK) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(wq, &wait);
+ __set_current_state(TASK_RUNNING);
+}
+
+void __wait_on_freeing_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode_lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+ current->state = TASK_RUNNING;
+ spin_lock(&inode_lock);
+}
+
+
+void wake_up_inode(struct inode *inode)
+{
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ /*
+ * Prevent speculative execution through spin_unlock(&inode_lock);
+ */
+ smp_mb();
+ if (waitqueue_active(wq))
+ wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */ +void __init inode_init(unsigned long mempages) +{ + struct hlist_head *head; + unsigned long order; + unsigned int nr_hash; + int i; + + for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) + init_waitqueue_head(&i_wait_queue_heads[i].wqh); + + mempages >>= (14 - PAGE_SHIFT); + mempages *= sizeof(struct list_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) + ; + + do { + unsigned long tmp; + + nr_hash = (1UL << order) * PAGE_SIZE / + sizeof(struct hlist_head); + i_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + i_hash_shift = 0; + while ((tmp >>= 1UL) != 0UL) + i_hash_shift++; + + inode_hashtable = (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, order); + } while (inode_hashtable == NULL && --order >= 0); + + printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!inode_hashtable) + panic("Failed to allocate inode hash table\n"); + + head = inode_hashtable; + i = nr_hash; + do { + INIT_HLIST_HEAD(head); + head++; + i--; + } while (i); + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), + 0, SLAB_HWCACHE_ALIGN, init_once, + NULL); + if (!inode_cachep) + panic("cannot create inode slab cache"); + + set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", + mode); +} diff --git a/tests/linux/inode-fullpatch/orig b/tests/linux/inode-fullpatch/orig new file mode 100644 index 0000000..47e7429 --- /dev/null +++ b/tests/linux/inode-fullpatch/orig @@ -0,0 +1,1323 @@ +/* + * linux/fs/inode.c + * + * (C) 1997 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This is needed for the following functions: + * - inode_has_buffers + * - invalidate_inode_buffers + * - fsync_bdev + * - invalidate_bdev + * + * FIXME: remove all knowledge of the buffer layer from this file + */ +#include + +/* + * New inode.c implementation. + * + * This implementation has the basic premise of trying + * to be extremely low-overhead and SMP-safe, yet be + * simple enough to be "obviously correct". + * + * Famous last words. + */ + +/* inode dynamic allocation 1999, Andrea Arcangeli */ + +/* #define INODE_PARANOIA 1 */ +/* #define INODE_DEBUG 1 */ + +/* + * Inode lookup is no longer as critical as it used to be: + * most of the lookups are going to be through the dcache. + */ +#define I_HASHBITS i_hash_shift +#define I_HASHMASK i_hash_mask + +static unsigned int i_hash_mask; +static unsigned int i_hash_shift; + +/* + * Each inode can be on two separate lists. One is + * the hash list of the inode, used for lookups. The + * other linked list is the "type" list: + * "in_use" - valid inode, i_count > 0, i_nlink > 0 + * "dirty" - as "in_use" but also dirty + * "unused" - valid inode, i_count = 0 + * + * A "dirty" list is maintained for each super block, + * allowing for low-overhead inode sync() operations. 
+ */ + +LIST_HEAD(inode_in_use); +LIST_HEAD(inode_unused); +static struct hlist_head *inode_hashtable; +static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ + +/* + * A simple spinlock to protect the list manipulations. + * + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; + +/* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages + * icache shrinking path, and the umount path. Without this exclusion, + * by the time prune_icache calls iput for the inode whose pages it has + * been invalidating, or by the time it calls clear_inode & destroy_inode + * from its final dispose_list, the struct super_block they refer to + * (for inode->i_sb->s_op) may already have been freed and reused. + */ +static DECLARE_MUTEX(iprune_sem); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static kmem_cache_t * inode_cachep; + +static struct inode *alloc_inode(struct super_block *sb) +{ + static struct address_space_operations empty_aops; + static struct inode_operations empty_iops; + static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + + if (inode) { + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_sock = 0; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_rdev = to_kdev_t(0); + inode->i_security = NULL; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + mapping->dirtied_when = 0; + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + if (sb->s_bdev) + mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; + } + return inode; +} + +void destroy_inode(struct inode *inode) +{ + if (inode_has_buffers(inode)) + BUG(); + security_inode_free(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); +} + + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. 
+ */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_data.clean_pages); + INIT_LIST_HEAD(&inode->i_data.dirty_pages); + INIT_LIST_HEAD(&inode->i_data.locked_pages); + INIT_LIST_HEAD(&inode->i_data.io_pages); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + rwlock_init(&inode->i_data.page_lock); + init_MUTEX(&inode->i_data.i_shared_sem); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); + INIT_LIST_HEAD(&inode->i_data.i_mmap); + INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + spin_lock_init(&inode->i_lock); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct inode * inode = (struct inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(inode); +} + +/* + * inode_lock must be held + */ +void __iget(struct inode * inode) +{ + if (atomic_read(&inode->i_count)) { + atomic_inc(&inode->i_count); + return; + } + atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_in_use); + } + inodes_stat.nr_unused--; +} + +/** + * clear_inode - clear an inode + * @inode: inode to clear + * + * This is called by the filesystem to tell us + * that the inode is no longer useful. We just + * terminate it with extreme prejudice. + */ + +void clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + + if (inode->i_data.nrpages) + BUG(); + if (!(inode->i_state & I_FREEING)) + BUG(); + if (inode->i_state & I_CLEAR) + BUG(); + wait_on_inode(inode); + DQUOT_DROP(inode); + if (inode->i_sb && inode->i_sb->s_op->clear_inode) + inode->i_sb->s_op->clear_inode(inode); + if (inode->i_bdev) + bd_forget(inode); + inode->i_state = I_CLEAR; +} + +/* + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + int nr_disposed = 0; + + while (!list_empty(head)) { + struct inode *inode; + + inode = list_entry(head->next, struct inode, i_list); + list_del(&inode->i_list); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); + nr_disposed++; + } + spin_lock(&inode_lock); + inodes_stat.nr_inodes -= nr_disposed; + spin_unlock(&inode_lock); +} + +/* + * Invalidate all inodes for a device. + */ +static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +{ + struct list_head *next; + int busy = 0, count = 0; + + next = head->next; + for (;;) { + struct list_head * tmp = next; + struct inode * inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_list); + if (inode->i_sb != sb) + continue; + invalidate_inode_buffers(inode); + if (!atomic_read(&inode->i_count)) { + hlist_del_init(&inode->i_hash); + list_del(&inode->i_list); + list_add(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + count++; + continue; + } + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; + return busy; +} + +/* + * This is a two-stage process. First we collect all + * offending inodes onto the throw-away list, and in + * the second stage we actually dispose of them. 
This + * is because we don't want to sleep while messing + * with the global lists.. + */ + +/** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +int invalidate_inodes(struct super_block * sb) +{ + int busy; + LIST_HEAD(throw_away); + + down(&iprune_sem); + spin_lock(&inode_lock); + busy = invalidate_list(&inode_in_use, sb, &throw_away); + busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); + up(&iprune_sem); + + return busy; +} + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + struct block_device *bdev = bdget(kdev_t_to_nr(dev)); + int res; + + if (!bdev) + return 0; + + if (do_sync) + fsync_bdev(bdev); + + res = 0; + sb = get_super(bdev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev, 0); + bdput(bdev); + return res; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to + * a temporary list and then are freed outside inode_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. We expect the final iput() on that inode to add it to + * the front of the inode_unused list. So look for it there and if the + * inode is still freeable, proceed. The right inode is found 99.9% of the + * time in testing on a 4-way. + * + * If the inode has metadata buffers attached to mapping->private_list then + * try to remove them. 
+ */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_pruned = 0; + int nr_scanned; + unsigned long reap = 0; + + down(&iprune_sem); + spin_lock(&inode_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_unused)) + break; + + inode = list_entry(inode_unused.prev, struct inode, i_list); + + if (inode->i_state || atomic_read(&inode->i_count)) { + list_move(&inode->i_list, &inode_unused); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_inode_pages(&inode->i_data); + iput(inode); + spin_lock(&inode_lock); + + if (inode != list_entry(inode_unused.next, + struct inode, i_list)) + continue; /* wrong inode or list_empty */ + if (!can_unuse(inode)) + continue; + } + hlist_del_init(&inode->i_hash); + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + nr_pruned++; + } + inodes_stat.nr_unused -= nr_pruned; + spin_unlock(&inode_lock); + + dispose_list(&freeable); + up(&iprune_sem); + + if (current_is_kswapd) + mod_page_state(kswapd_inodesteal, reap); + else + mod_page_state(pginodesteal, reap); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(int nr, unsigned int gfp_mask) +{ + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (gfp_mask & __GFP_FS) + prune_icache(nr); + } + return inodes_stat.nr_unused; +} + +/* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() + * by hand after calling find_inode now! This simplifies iunique and won't + * add any additional branch in the common code. + */ +static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = hlist_entry(node, struct inode, i_hash); + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + break; + } + return node ? inode : NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = list_entry(node, struct inode, i_hash); + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + break; + } + return node ? inode : NULL; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. 
+ */ + +struct inode *new_inode(struct super_block *sb) +{ + static unsigned long last_ino; + struct inode * inode; + + spin_lock_prefetch(&inode_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode_lock); + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + inode->i_ino = ++last_ino; + inode->i_state = 0; + spin_unlock(&inode_lock); + } + return inode; +} + +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up_inode(inode); +} +EXPORT_SYMBOL(unlock_new_inode); + +/* + * This is called without the inode lock held.. Be careful. + * + * We no longer cache the sb_flags in i_flags - see fs.h + * -- rmk@arm.uk.linux.org + */ +static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; +} + +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); + tmp = tmp + (tmp >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/* Yeah, I know about quadratic hash. Maybe, later. 
*/ + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ + +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + static ino_t counter = 0; + struct inode *inode; + struct hlist_head * head; + ino_t res; + spin_lock(&inode_lock); +retry: + if (counter > max_reserved) { + head = inode_hashtable + hash(sb,counter); + res = counter++; + inode = find_inode_fast(sb, head, res); + if (!inode) { + spin_unlock(&inode_lock); + return res; + } + } else { + counter = max_reserved + 1; + } + goto retry; + +} + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode_lock); + if (!(inode->i_state & I_FREEING)) + __iget(inode); + else + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + spin_unlock(&inode_lock); + return inode; +} + +/** + * ifind - internal function, you want ilookup5() or iget5(). + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ifind() searches for the inode specified by @hashval and @data in the inode + * cache. This is a generalized version of ifind_fast() for file systems where + * the inode number is not sufficient for unique identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +static inline struct inode *ifind(struct super_block *sb, + struct hlist_head *head, int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode(sb, head, test, data); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ifind_fast - internal function, you want ilookup() or iget(). + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ifind_fast() searches for the inode @ino in the inode cache. This is for + * file systems where the inode number is sufficient for unique identification + * of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. 
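+ *
+ * Illustration only, not part of the original source.  Code outside this
+ * file reaches ifind_fast() through the exported wrapper, and must drop
+ * the reference it returns:
+ *
+ *	struct inode *inode = ilookup(sb, ino);
+ *	if (inode)
+ *		iput(inode);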
+ */ +static inline struct inode *ifind_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data); +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. + * This is for file systems where the inode number is sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + return ifind_fast(sb, head, ino); +} +EXPORT_SYMBOL(ilookup); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is iget() without the read_inode() portion of get_new_inode(). + * + * iget5_locked() uses ifind() to search for the inode specified by @hashval + * and @data in the inode cache and if present it is returned with an increased + * reference count. This is a generalized version of iget_locked() for file + * systems where the inode number is not sufficient for unique identification + * of an inode. + * + * If the inode is not in cache, get_new_inode() is called to allocate a new + * inode and this is returned locked, hashed, and with the I_NEW flag set. The + * file system gets to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_lock held, so can't sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + inode = ifind(sb, head, test, data); + if (inode) + return inode; + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. 
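+ *
+ * Purely illustrative, not part of the original source.  A filesystem
+ * caller of iget5_locked() fills in any inode that comes back with
+ * I_NEW set (my_test, my_set and my_read_inode are hypothetical
+ * filesystem helpers):
+ *
+ *	inode = iget5_locked(sb, hashval, my_test, my_set, &key);
+ *	if (inode && (inode->i_state & I_NEW)) {
+ *		my_read_inode(inode);
+ *		unlock_new_inode(inode);
+ *	}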
+ */ + return get_new_inode(sb, head, test, set, data); +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * This is iget() without the read_inode() portion of get_new_inode_fast(). + * + * iget_locked() uses ifind_fast() to search for the inode specified by @ino in + * the inode cache and if present it is returned with an increased reference + * count. This is for file systems where the inode number is sufficient for + * unique identification of an inode. + * + * If the inode is not in cache, get_new_inode_fast() is called to allocate a + * new inode and this is returned locked, hashed, and with the I_NEW flag set. + * The file system gets to fill it in before unlocking it via + * unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + inode = ifind_fast(sb, head, ino); + if (inode) + return inode; + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); +} +EXPORT_SYMBOL(iget_locked); + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. If the inode + * has no superblock it is added to a separate anonymous chain. + */ + +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = &anon_hash_chain; + if (inode->i_sb) + head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock or anonymous hash. 
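+ *
+ * Illustration only, not in the original source.  A filesystem that
+ * re-keys an inode pairs this call with __insert_inode_hash(), where
+ * new_hashval is a hypothetical replacement key:
+ *
+ *	remove_inode_hash(inode);
+ *	__insert_inode_hash(inode, new_hashval);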
+ */ + +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} + +void generic_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + + hlist_del_init(&inode->i_hash); + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* s_op->delete_inode internally recalls clear_inode() */ + delete(inode); + } else + clear_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +} +EXPORT_SYMBOL(generic_delete_inode); + +static void generic_forget_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + spin_unlock(&inode_lock); + if (!sb || (sb->s_flags & MS_ACTIVE)) + return; + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); + } + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); +} + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +static void generic_drop_inode(struct inode *inode) +{ + if (!inode->i_nlink) + generic_delete_inode(inode); + else + generic_forget_inode(inode); +} + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop()" function, defaulting to + * the legacy UNIX filesystem behaviour.. + * + * NOTE! NOTE! NOTE! We're called with the inode lock + * held, and the drop function is supposed to release + * the lock! + */ +static inline void iput_final(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + void (*drop)(struct inode *) = generic_drop_inode; + + if (op && op->drop_inode) + drop = op->drop_inode; + drop(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero the inode is also then freed and may be destroyed. + */ + +void iput(struct inode *inode) +{ + if (inode) { + struct super_operations *op = inode->i_sb->s_op; + + if (inode->i_state == I_CLEAR) + BUG(); + + if (op && op->put_inode) + op->put_inode(inode); + + if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + iput_final(inode); + } +} + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. 
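+ *
+ * For illustration only, not part of the original source.  Mapping the
+ * first block of a file; a result of zero means a hole, or a filesystem
+ * that provides no bmap operation:
+ *
+ *	sector_t disk_block = bmap(inode, 0);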
+ */ + +sector_t bmap(struct inode * inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} + +/* + * Return true if the filesystem which backs this inode considers the two + * passed timespecs to be sufficiently different to warrant flushing the + * altered time out to disk. + */ +static int inode_times_differ(struct inode *inode, + struct timespec *old, struct timespec *new) +{ + if (IS_ONE_SECOND(inode)) + return old->tv_sec != new->tv_sec; + return !timespec_equal(old, new); +} + +/** + * update_atime - update the access time + * @inode: inode accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ + +void update_atime(struct inode *inode) +{ + struct timespec now; + + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + if (inode_times_differ(inode, &inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { + if (!timespec_equal(&inode->i_atime, &now)) + inode->i_atime = now; + } +} + +/** + * inode_update_time - update mtime and ctime time + * @inode: inode accessed + * @ctime_too: update ctime too + * + * Update the mtime time on an inode and mark it for writeback. + * When ctime_too is specified update the ctime too. + */ + +void inode_update_time(struct inode *inode, int ctime_too) +{ + struct timespec now = current_kernel_time(); + int sync_it = 0; + + if (inode_times_differ(inode, &inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { + if (inode_times_differ(inode, &inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } + if (sync_it) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL(inode_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * Quota functions that want to walk the inode lists.. + */ +#ifdef CONFIG_QUOTA + +/* Functions back in dquot.c */ +void put_dquot_list(struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); + +void remove_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode; + struct list_head *act_head; + LIST_HEAD(tofree_head); + + if (!sb->dq_op) + return; /* nothing to do */ + spin_lock(&inode_lock); /* This lock is for inodes code */ + /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/ + + list_for_each(act_head, &inode_in_use) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &inode_unused) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_dirty) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_io) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + spin_unlock(&inode_lock); + + put_dquot_list(&tofree_head); +} + +#endif + +/* + * Hashed waitqueues for wait_on_inode(). The table is pretty small - the + * kernel doesn't lock many inodes at the same time. + */ +#define I_WAIT_TABLE_ORDER 3 +static struct i_wait_queue_head { + wait_queue_head_t wqh; +} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER]; + +/* + * Return the address of the waitqueue that the inode hashes to. + */ +static wait_queue_head_t *i_waitq_head(struct inode *inode) +{ + return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh; +} + +void __wait_on_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (inode->i_state & I_LOCK) { + schedule(); + goto repeat; + } + remove_wait_queue(wq, &wait); + __set_current_state(TASK_RUNNING); +} + +void wake_up_inode(struct inode *inode) +{ + wait_queue_head_t *wq = i_waitq_head(inode); + + /* + * Prevent speculative execution through spin_unlock(&inode_lock); + */ + smp_mb(); + if (waitqueue_active(wq)) + wake_up_all(wq); +} + +/* + * Initialize the waitqueues and inode hash table. + */ +void __init inode_init(unsigned long mempages) +{ + struct hlist_head *head; + unsigned long order; + unsigned int nr_hash; + int i; + + for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) + init_waitqueue_head(&i_wait_queue_heads[i].wqh); + + mempages >>= (14 - PAGE_SHIFT); + mempages *= sizeof(struct list_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) + ; + + do { + unsigned long tmp; + + nr_hash = (1UL << order) * PAGE_SIZE / + sizeof(struct hlist_head); + i_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + i_hash_shift = 0; + while ((tmp >>= 1UL) != 0UL) + i_hash_shift++; + + inode_hashtable = (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, order); + } while (inode_hashtable == NULL && --order >= 0); + + printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!inode_hashtable) + panic("Failed to allocate inode hash table\n"); + + head = inode_hashtable; + i = nr_hash; + do { + INIT_HLIST_HEAD(head); + head++; + i--; + } while (i); + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), + 0, SLAB_HWCACHE_ALIGN, init_once, + NULL); + if (!inode_cachep) + panic("cannot create inode slab cache"); + + set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", + mode); +} diff --git a/tests/linux/inode-fullpatch/patch b/tests/linux/inode-fullpatch/patch new file mode 100644 index 0000000..aeafa41 --- /dev/null +++ b/tests/linux/inode-fullpatch/patch @@ -0,0 +1,77 @@ + 
+diff ./fs/inode.c~current~ ./fs/inode.c +--- ./fs/inode.c~current~ 2003-03-10 15:13:52.000000000 +1100 ++++ ./fs/inode.c 2003-03-10 15:13:53.000000000 +1100 +@@ -470,6 +470,7 @@ static int shrink_icache_memory(int nr, + return inodes_stat.nr_inodes; + } + ++void __wait_on_freeing_inode(struct inode *inode); + /* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() +@@ -492,6 +493,11 @@ static struct inode * find_inode(struct + continue; + if (!test(inode, data)) + continue; ++ if (inode->i_state & (I_FREEING|I_CLEAR)) { ++ __wait_on_freeing_inode(inode); ++ tmp = head; ++ continue; ++ } + break; + } + return inode; +@@ -517,6 +523,11 @@ static struct inode * find_inode_fast(st + continue; + if (inode->i_sb != sb) + continue; ++ if (inode->i_state & (I_FREEING|I_CLEAR)) { ++ __wait_on_freeing_inode(inode); ++ tmp = head; ++ continue; ++ } + break; + } + return inode; +@@ -949,7 +960,6 @@ void generic_delete_inode(struct inode * + { + struct super_operations *op = inode->i_sb->s_op; + +- list_del_init(&inode->i_hash); + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; +@@ -968,6 +978,10 @@ void generic_delete_inode(struct inode * + delete(inode); + } else + clear_inode(inode); ++ spin_lock(&inode_lock); ++ list_del_init(&inode->i_hash); ++ spin_unlock(&inode_lock); ++ wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +@@ -1219,6 +1233,21 @@ repeat: + current->state = TASK_RUNNING; + } + ++void __wait_on_freeing_inode(struct inode *inode) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ wait_queue_head_t *wq = i_waitq_head(inode); ++ ++ add_wait_queue(wq, &wait); ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ spin_unlock(&inode_lock); ++ schedule(); ++ remove_wait_queue(wq, &wait); ++ current->state = TASK_RUNNING; ++ spin_lock(&inode_lock); ++} ++ ++ + void wake_up_inode(struct inode *inode) + { + wait_queue_head_t *wq = i_waitq_head(inode); diff --git a/tests/linux/inode-fullpatch/rediff b/tests/linux/inode-fullpatch/rediff new file mode 100644 index 0000000..ea080cf --- /dev/null +++ b/tests/linux/inode-fullpatch/rediff @@ -0,0 +1,73 @@ +@@ -470,6 +470,7 @@ + return inodes_stat.nr_inodes; + } + ++void __wait_on_freeing_inode(struct inode *inode); + /* + * Called with the inode lock held. 
+ * NOTE: we are not increasing the inode-refcount, you must call __iget() +@@ -492,6 +493,11 @@ + continue; + if (!test(inode, data)) + continue; ++ if (inode->i_state & (I_FREEING|I_CLEAR)) { ++ __wait_on_freeing_inode(inode); ++ tmp = head; ++ continue; ++ } + break; + } + return inode; +@@ -517,6 +523,11 @@ + continue; + if (inode->i_sb != sb) + continue; ++ if (inode->i_state & (I_FREEING|I_CLEAR)) { ++ __wait_on_freeing_inode(inode); ++ tmp = head; ++ continue; ++ } + break; + } + return inode; +@@ -949,7 +960,6 @@ + { + struct super_operations *op = inode->i_sb->s_op; + +- list_del_init(&inode->i_hash); + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; +@@ -968,6 +978,10 @@ + delete(inode); + } else + clear_inode(inode); ++ spin_lock(&inode_lock); ++ list_del_init(&inode->i_hash); ++ spin_unlock(&inode_lock); ++ wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +@@ -1219,6 +1233,21 @@ + current->state = TASK_RUNNING; + } + ++void __wait_on_freeing_inode(struct inode *inode) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ wait_queue_head_t *wq = i_waitq_head(inode); ++ ++ add_wait_queue(wq, &wait); ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ spin_unlock(&inode_lock); ++ schedule(); ++ remove_wait_queue(wq, &wait); ++ current->state = TASK_RUNNING; ++ spin_lock(&inode_lock); ++} ++ ++ + void wake_up_inode(struct inode *inode) + { + wait_queue_head_t *wq = i_waitq_head(inode); diff --git a/tests/linux/inode-fullpatch/wmerge b/tests/linux/inode-fullpatch/wmerge new file mode 100644 index 0000000..1ffda02 --- /dev/null +++ b/tests/linux/inode-fullpatch/wmerge @@ -0,0 +1,1352 @@ +/* + * linux/fs/inode.c + * + * (C) 1997 Linus Torvalds + */ + +#include <linux/config.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/dcache.h> +#include <linux/init.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <linux/hash.h> +#include <linux/swap.h> +#include <linux/security.h> + +/* + * This is needed for the following functions: + * - inode_has_buffers + * - invalidate_inode_buffers + * - fsync_bdev + * - invalidate_bdev + * + * FIXME: remove all knowledge of the buffer layer from this file + */ +#include <linux/buffer_head.h> + +/* + * New inode.c implementation. + * + * This implementation has the basic premise of trying + * to be extremely low-overhead and SMP-safe, yet be + * simple enough to be "obviously correct". + * + * Famous last words. + */ + +/* inode dynamic allocation 1999, Andrea Arcangeli */ + +/* #define INODE_PARANOIA 1 */ +/* #define INODE_DEBUG 1 */ + +/* + * Inode lookup is no longer as critical as it used to be: + * most of the lookups are going to be through the dcache. + */ +#define I_HASHBITS i_hash_shift +#define I_HASHMASK i_hash_mask + +static unsigned int i_hash_mask; +static unsigned int i_hash_shift; + +/* + * Each inode can be on two separate lists. One is + * the hash list of the inode, used for lookups. The + * other linked list is the "type" list: + * "in_use" - valid inode, i_count > 0, i_nlink > 0 + * "dirty" - as "in_use" but also dirty + * "unused" - valid inode, i_count = 0 + * + * A "dirty" list is maintained for each super block, + * allowing for low-overhead inode sync() operations. + */ + +LIST_HEAD(inode_in_use); +LIST_HEAD(inode_unused); +static struct hlist_head *inode_hashtable; +static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ + +/* + * A simple spinlock to protect the list manipulations. + * + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. 
+ */ +spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; + +/* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages + * icache shrinking path, and the umount path. Without this exclusion, + * by the time prune_icache calls iput for the inode whose pages it has + * been invalidating, or by the time it calls clear_inode & destroy_inode + * from its final dispose_list, the struct super_block they refer to + * (for inode->i_sb->s_op) may already have been freed and reused. + */ +static DECLARE_MUTEX(iprune_sem); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static kmem_cache_t * inode_cachep; + +static struct inode *alloc_inode(struct super_block *sb) +{ + static struct address_space_operations empty_aops; + static struct inode_operations empty_iops; + static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + + if (inode) { + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_sock = 0; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_rdev = to_kdev_t(0); + inode->i_security = NULL; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + mapping->dirtied_when = 0; + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + if (sb->s_bdev) + mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; + } + return inode; +} + +void destroy_inode(struct inode *inode) +{ + if (inode_has_buffers(inode)) + BUG(); + security_inode_free(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); +} + + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. 
+ */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_data.clean_pages); + INIT_LIST_HEAD(&inode->i_data.dirty_pages); + INIT_LIST_HEAD(&inode->i_data.locked_pages); + INIT_LIST_HEAD(&inode->i_data.io_pages); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + rwlock_init(&inode->i_data.page_lock); + init_MUTEX(&inode->i_data.i_shared_sem); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); + INIT_LIST_HEAD(&inode->i_data.i_mmap); + INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + spin_lock_init(&inode->i_lock); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct inode * inode = (struct inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(inode); +} + +/* + * inode_lock must be held + */ +void __iget(struct inode * inode) +{ + if (atomic_read(&inode->i_count)) { + atomic_inc(&inode->i_count); + return; + } + atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_in_use); + } + inodes_stat.nr_unused--; +} + +/** + * clear_inode - clear an inode + * @inode: inode to clear + * + * This is called by the filesystem to tell us + * that the inode is no longer useful. We just + * terminate it with extreme prejudice. + */ + +void clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + + if (inode->i_data.nrpages) + BUG(); + if (!(inode->i_state & I_FREEING)) + BUG(); + if (inode->i_state & I_CLEAR) + BUG(); + wait_on_inode(inode); + DQUOT_DROP(inode); + if (inode->i_sb && inode->i_sb->s_op->clear_inode) + inode->i_sb->s_op->clear_inode(inode); + if (inode->i_bdev) + bd_forget(inode); + inode->i_state = I_CLEAR; +} + +/* + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + int nr_disposed = 0; + + while (!list_empty(head)) { + struct inode *inode; + + inode = list_entry(head->next, struct inode, i_list); + list_del(&inode->i_list); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); + nr_disposed++; + } + spin_lock(&inode_lock); + inodes_stat.nr_inodes -= nr_disposed; + spin_unlock(&inode_lock); +} + +/* + * Invalidate all inodes for a device. + */ +static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +{ + struct list_head *next; + int busy = 0, count = 0; + + next = head->next; + for (;;) { + struct list_head * tmp = next; + struct inode * inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_list); + if (inode->i_sb != sb) + continue; + invalidate_inode_buffers(inode); + if (!atomic_read(&inode->i_count)) { + hlist_del_init(&inode->i_hash); + list_del(&inode->i_list); + list_add(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + count++; + continue; + } + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; + return busy; +} + +/* + * This is a two-stage process. First we collect all + * offending inodes onto the throw-away list, and in + * the second stage we actually dispose of them. 
This + * is because we don't want to sleep while messing + * with the global lists.. + */ + +/** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +int invalidate_inodes(struct super_block * sb) +{ + int busy; + LIST_HEAD(throw_away); + + down(&iprune_sem); + spin_lock(&inode_lock); + busy = invalidate_list(&inode_in_use, sb, &throw_away); + busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); + up(&iprune_sem); + + return busy; +} + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + struct block_device *bdev = bdget(kdev_t_to_nr(dev)); + int res; + + if (!bdev) + return 0; + + if (do_sync) + fsync_bdev(bdev); + + res = 0; + sb = get_super(bdev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev, 0); + bdput(bdev); + return res; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to + * a temporary list and then are freed outside inode_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. We expect the final iput() on that inode to add it to + * the front of the inode_unused list. So look for it there and if the + * inode is still freeable, proceed. The right inode is found 99.9% of the + * time in testing on a 4-way. + * + * If the inode has metadata buffers attached to mapping->private_list then + * try to remove them. 
+ */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_pruned = 0; + int nr_scanned; + unsigned long reap = 0; + + down(&iprune_sem); + spin_lock(&inode_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_unused)) + break; + + inode = list_entry(inode_unused.prev, struct inode, i_list); + + if (inode->i_state || atomic_read(&inode->i_count)) { + list_move(&inode->i_list, &inode_unused); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_inode_pages(&inode->i_data); + iput(inode); + spin_lock(&inode_lock); + + if (inode != list_entry(inode_unused.next, + struct inode, i_list)) + continue; /* wrong inode or list_empty */ + if (!can_unuse(inode)) + continue; + } + hlist_del_init(&inode->i_hash); + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + nr_pruned++; + } + inodes_stat.nr_unused -= nr_pruned; + spin_unlock(&inode_lock); + + dispose_list(&freeable); + up(&iprune_sem); + + if (current_is_kswapd) + mod_page_state(kswapd_inodesteal, reap); + else + mod_page_state(pginodesteal, reap); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(int nr, unsigned int gfp_mask) +{ + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (gfp_mask & __GFP_FS) + prune_icache(nr); + } + return inodes_stat.nr_unused; +} + +void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() + * by hand after calling find_inode now! This simplifies iunique and won't + * add any additional branch in the common code. + */ +static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = hlist_entry(node, struct inode, i_hash); + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = list_entry(node, struct inode, i_hash); + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. 
+ */ + +struct inode *new_inode(struct super_block *sb) +{ + static unsigned long last_ino; + struct inode * inode; + + spin_lock_prefetch(&inode_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode_lock); + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + inode->i_ino = ++last_ino; + inode->i_state = 0; + spin_unlock(&inode_lock); + } + return inode; +} + +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up_inode(inode); +} +EXPORT_SYMBOL(unlock_new_inode); + +/* + * This is called without the inode lock held.. Be careful. + * + * We no longer cache the sb_flags in i_flags - see fs.h + * -- rmk@arm.uk.linux.org + */ +static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; +} + +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); + tmp = tmp + (tmp >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/* Yeah, I know about quadratic hash. Maybe, later. 
*/ + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ + +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + static ino_t counter = 0; + struct inode *inode; + struct hlist_head * head; + ino_t res; + spin_lock(&inode_lock); +retry: + if (counter > max_reserved) { + head = inode_hashtable + hash(sb,counter); + res = counter++; + inode = find_inode_fast(sb, head, res); + if (!inode) { + spin_unlock(&inode_lock); + return res; + } + } else { + counter = max_reserved + 1; + } + goto retry; + +} + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode_lock); + if (!(inode->i_state & I_FREEING)) + __iget(inode); + else + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + spin_unlock(&inode_lock); + return inode; +} + +/** + * ifind - internal function, you want ilookup5() or iget5(). + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ifind() searches for the inode specified by @hashval and @data in the inode + * cache. This is a generalized version of ifind_fast() for file systems where + * the inode number is not sufficient for unique identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +static inline struct inode *ifind(struct super_block *sb, + struct hlist_head *head, int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode(sb, head, test, data); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ifind_fast - internal function, you want ilookup() or iget(). + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ifind_fast() searches for the inode @ino in the inode cache. This is for + * file systems where the inode number is sufficient for unique identification + * of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. 
+ */ +static inline struct inode *ifind_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data); +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. + * This is for file systems where the inode number is sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + return ifind_fast(sb, head, ino); +} +EXPORT_SYMBOL(ilookup); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is iget() without the read_inode() portion of get_new_inode(). + * + * iget5_locked() uses ifind() to search for the inode specified by @hashval + * and @data in the inode cache and if present it is returned with an increased + * reference count. This is a generalized version of iget_locked() for file + * systems where the inode number is not sufficient for unique identification + * of an inode. + * + * If the inode is not in cache, get_new_inode() is called to allocate a new + * inode and this is returned locked, hashed, and with the I_NEW flag set. The + * file system gets to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_lock held, so can't sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + inode = ifind(sb, head, test, data); + if (inode) + return inode; + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. 
+ */ + return get_new_inode(sb, head, test, set, data); +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * This is iget() without the read_inode() portion of get_new_inode_fast(). + * + * iget_locked() uses ifind_fast() to search for the inode specified by @ino in + * the inode cache and if present it is returned with an increased reference + * count. This is for file systems where the inode number is sufficient for + * unique identification of an inode. + * + * If the inode is not in cache, get_new_inode_fast() is called to allocate a + * new inode and this is returned locked, hashed, and with the I_NEW flag set. + * The file system gets to fill it in before unlocking it via + * unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + inode = ifind_fast(sb, head, ino); + if (inode) + return inode; + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); +} +EXPORT_SYMBOL(iget_locked); + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. If the inode + * has no superblock it is added to a separate anonymous chain. + */ + +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = &anon_hash_chain; + if (inode->i_sb) + head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock or anonymous hash. 
+ */ + +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} + +void generic_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + +<<<---hlist_del_init|||list_del_init===--->>> list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* s_op->delete_inode internally recalls clear_inode() */ + delete(inode); + } else + clear_inode(inode); + spin_lock(&inode_lock); + list_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +} +EXPORT_SYMBOL(generic_delete_inode); + +static void generic_forget_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + spin_unlock(&inode_lock); + if (!sb || (sb->s_flags & MS_ACTIVE)) + return; + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); + } + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); +} + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +static void generic_drop_inode(struct inode *inode) +{ + if (!inode->i_nlink) + generic_delete_inode(inode); + else + generic_forget_inode(inode); +} + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop()" function, defaulting to + * the legacy UNIX filesystem behaviour.. + * + * NOTE! NOTE! NOTE! We're called with the inode lock + * held, and the drop function is supposed to release + * the lock! + */ +static inline void iput_final(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + void (*drop)(struct inode *) = generic_drop_inode; + + if (op && op->drop_inode) + drop = op->drop_inode; + drop(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero the inode is also then freed and may be destroyed. + */ + +void iput(struct inode *inode) +{ + if (inode) { + struct super_operations *op = inode->i_sb->s_op; + + if (inode->i_state == I_CLEAR) + BUG(); + + if (op && op->put_inode) + op->put_inode(inode); + + if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + iput_final(inode); + } +} + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. 
+ */ + +sector_t bmap(struct inode * inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} + +/* + * Return true if the filesystem which backs this inode considers the two + * passed timespecs to be sufficiently different to warrant flushing the + * altered time out to disk. + */ +static int inode_times_differ(struct inode *inode, + struct timespec *old, struct timespec *new) +{ + if (IS_ONE_SECOND(inode)) + return old->tv_sec != new->tv_sec; + return !timespec_equal(old, new); +} + +/** + * update_atime - update the access time + * @inode: inode accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ + +void update_atime(struct inode *inode) +{ + struct timespec now; + + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + if (inode_times_differ(inode, &inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { + if (!timespec_equal(&inode->i_atime, &now)) + inode->i_atime = now; + } +} + +/** + * inode_update_time - update mtime and ctime time + * @inode: inode accessed + * @ctime_too: update ctime too + * + * Update the mtime time on an inode and mark it for writeback. + * When ctime_too is specified update the ctime too. + */ + +void inode_update_time(struct inode *inode, int ctime_too) +{ + struct timespec now = current_kernel_time(); + int sync_it = 0; + + if (inode_times_differ(inode, &inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { + if (inode_times_differ(inode, &inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } + if (sync_it) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL(inode_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * Quota functions that want to walk the inode lists.. + */ +#ifdef CONFIG_QUOTA + +/* Functions back in dquot.c */ +void put_dquot_list(struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); + +void remove_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode; + struct list_head *act_head; + LIST_HEAD(tofree_head); + + if (!sb->dq_op) + return; /* nothing to do */ + spin_lock(&inode_lock); /* This lock is for inodes code */ + /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/ + + list_for_each(act_head, &inode_in_use) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &inode_unused) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_dirty) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_io) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + spin_unlock(&inode_lock); + + put_dquot_list(&tofree_head); +} + +#endif + +/* + * Hashed waitqueues for wait_on_inode(). The table is pretty small - the + * kernel doesn't lock many inodes at the same time. + */ +#define I_WAIT_TABLE_ORDER 3 +static struct i_wait_queue_head { + wait_queue_head_t wqh; +} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER]; + +/* + * Return the address of the waitqueue that the inode hashes to. + */ +static wait_queue_head_t *i_waitq_head(struct inode *inode) +{ + return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh; +} + +void __wait_on_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (inode->i_state & I_LOCK) { + schedule(); + goto repeat; + } + remove_wait_queue(wq, &wait); + __set_current_state(TASK_RUNNING); +} + +void __wait_on_freeing_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&inode_lock); + schedule(); + remove_wait_queue(wq, &wait); + current->state = TASK_RUNNING; + spin_lock(&inode_lock); +} + + +void wake_up_inode(struct inode *inode) +{ + wait_queue_head_t *wq = i_waitq_head(inode); + + /* + * Prevent speculative execution through spin_unlock(&inode_lock); + */ + smp_mb(); + if (waitqueue_active(wq)) + wake_up_all(wq); +} + +/* + * Initialize the waitqueues and inode hash table. 
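+ *
+ * Worked example, illustration only (not in the original source): on a
+ * 32-bit machine with PAGE_SHIFT == 12 and mempages == 4096 (16MiB),
+ * the scaling below yields 4096 >> 2 == 1024, then 1024 * 8 == 8192
+ * bytes, so the sizing loop settles on order 1, giving
+ * 8192 / sizeof(struct hlist_head) == 2048 hash chains.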
+ */ +void __init inode_init(unsigned long mempages) +{ + struct hlist_head *head; + unsigned long order; + unsigned int nr_hash; + int i; + + for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) + init_waitqueue_head(&i_wait_queue_heads[i].wqh); + + mempages >>= (14 - PAGE_SHIFT); + mempages *= sizeof(struct list_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) + ; + + do { + unsigned long tmp; + + nr_hash = (1UL << order) * PAGE_SIZE / + sizeof(struct hlist_head); + i_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + i_hash_shift = 0; + while ((tmp >>= 1UL) != 0UL) + i_hash_shift++; + + inode_hashtable = (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, order); + } while (inode_hashtable == NULL && --order >= 0); + + printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!inode_hashtable) + panic("Failed to allocate inode hash table\n"); + + head = inode_hashtable; + i = nr_hash; + do { + INIT_HLIST_HEAD(head); + head++; + i--; + } while (i); + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), + 0, SLAB_HWCACHE_ALIGN, init_once, + NULL); + if (!inode_cachep) + panic("cannot create inode slab cache"); + + set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", + mode); +} diff --git a/tests/linux/inode-justrej/lmerge b/tests/linux/inode-justrej/lmerge new file mode 100644 index 0000000..685b14e --- /dev/null +++ b/tests/linux/inode-justrej/lmerge @@ -0,0 +1,1358 @@ +/* + * linux/fs/inode.c + * + * (C) 1997 Linus Torvalds + */ + +#include <linux/config.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/dcache.h> +#include <linux/init.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <linux/hash.h> +#include <linux/swap.h> +#include <linux/security.h> + +/* + * This is needed for the following functions: + * - inode_has_buffers + * - invalidate_inode_buffers + * - fsync_bdev + * - invalidate_bdev + * + * FIXME: remove all knowledge of the buffer layer from this file + */ +#include <linux/buffer_head.h> + +/* + * New inode.c implementation. + * + * This implementation has the basic premise of trying + * to be extremely low-overhead and SMP-safe, yet be + * simple enough to be "obviously correct". + * + * Famous last words. + */ + +/* inode dynamic allocation 1999, Andrea Arcangeli */ + +/* #define INODE_PARANOIA 1 */ +/* #define INODE_DEBUG 1 */ + +/* + * Inode lookup is no longer as critical as it used to be: + * most of the lookups are going to be through the dcache. + */ +#define I_HASHBITS i_hash_shift +#define I_HASHMASK i_hash_mask + +static unsigned int i_hash_mask; +static unsigned int i_hash_shift; + +/* + * Each inode can be on two separate lists. One is + * the hash list of the inode, used for lookups. The + * other linked list is the "type" list: + * "in_use" - valid inode, i_count > 0, i_nlink > 0 + * "dirty" - as "in_use" but also dirty + * "unused" - valid inode, i_count = 0 + * + * A "dirty" list is maintained for each super block, + * allowing for low-overhead inode sync() operations. 
+ */ + +LIST_HEAD(inode_in_use); +LIST_HEAD(inode_unused); +static struct hlist_head *inode_hashtable; +static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ + +/* + * A simple spinlock to protect the list manipulations. + * + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; + +/* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages + * icache shrinking path, and the umount path. Without this exclusion, + * by the time prune_icache calls iput for the inode whose pages it has + * been invalidating, or by the time it calls clear_inode & destroy_inode + * from its final dispose_list, the struct super_block they refer to + * (for inode->i_sb->s_op) may already have been freed and reused. + */ +static DECLARE_MUTEX(iprune_sem); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static kmem_cache_t * inode_cachep; + +static struct inode *alloc_inode(struct super_block *sb) +{ + static struct address_space_operations empty_aops; + static struct inode_operations empty_iops; + static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + + if (inode) { + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_sock = 0; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_rdev = to_kdev_t(0); + inode->i_security = NULL; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + mapping->dirtied_when = 0; + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + if (sb->s_bdev) + mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; + } + return inode; +} + +void destroy_inode(struct inode *inode) +{ + if (inode_has_buffers(inode)) + BUG(); + security_inode_free(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); +} + + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. 
+ */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_data.clean_pages); + INIT_LIST_HEAD(&inode->i_data.dirty_pages); + INIT_LIST_HEAD(&inode->i_data.locked_pages); + INIT_LIST_HEAD(&inode->i_data.io_pages); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + rwlock_init(&inode->i_data.page_lock); + init_MUTEX(&inode->i_data.i_shared_sem); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); + INIT_LIST_HEAD(&inode->i_data.i_mmap); + INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + spin_lock_init(&inode->i_lock); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct inode * inode = (struct inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(inode); +} + +/* + * inode_lock must be held + */ +void __iget(struct inode * inode) +{ + if (atomic_read(&inode->i_count)) { + atomic_inc(&inode->i_count); + return; + } + atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_in_use); + } + inodes_stat.nr_unused--; +} + +/** + * clear_inode - clear an inode + * @inode: inode to clear + * + * This is called by the filesystem to tell us + * that the inode is no longer useful. We just + * terminate it with extreme prejudice. + */ + +void clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + + if (inode->i_data.nrpages) + BUG(); + if (!(inode->i_state & I_FREEING)) + BUG(); + if (inode->i_state & I_CLEAR) + BUG(); + wait_on_inode(inode); + DQUOT_DROP(inode); + if (inode->i_sb && inode->i_sb->s_op->clear_inode) + inode->i_sb->s_op->clear_inode(inode); + if (inode->i_bdev) + bd_forget(inode); + inode->i_state = I_CLEAR; +} + +/* + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + int nr_disposed = 0; + + while (!list_empty(head)) { + struct inode *inode; + + inode = list_entry(head->next, struct inode, i_list); + list_del(&inode->i_list); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); + nr_disposed++; + } + spin_lock(&inode_lock); + inodes_stat.nr_inodes -= nr_disposed; + spin_unlock(&inode_lock); +} + +/* + * Invalidate all inodes for a device. + */ +static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +{ + struct list_head *next; + int busy = 0, count = 0; + + next = head->next; + for (;;) { + struct list_head * tmp = next; + struct inode * inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_list); + if (inode->i_sb != sb) + continue; + invalidate_inode_buffers(inode); + if (!atomic_read(&inode->i_count)) { + hlist_del_init(&inode->i_hash); + list_del(&inode->i_list); + list_add(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + count++; + continue; + } + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; + return busy; +} + +/* + * This is a two-stage process. First we collect all + * offending inodes onto the throw-away list, and in + * the second stage we actually dispose of them. 
This + * is because we don't want to sleep while messing + * with the global lists.. + */ + +/** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +int invalidate_inodes(struct super_block * sb) +{ + int busy; + LIST_HEAD(throw_away); + + down(&iprune_sem); + spin_lock(&inode_lock); + busy = invalidate_list(&inode_in_use, sb, &throw_away); + busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); + up(&iprune_sem); + + return busy; +} + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + struct block_device *bdev = bdget(kdev_t_to_nr(dev)); + int res; + + if (!bdev) + return 0; + + if (do_sync) + fsync_bdev(bdev); + + res = 0; + sb = get_super(bdev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev, 0); + bdput(bdev); + return res; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to + * a temporary list and then are freed outside inode_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. We expect the final iput() on that inode to add it to + * the front of the inode_unused list. So look for it there and if the + * inode is still freeable, proceed. The right inode is found 99.9% of the + * time in testing on a 4-way. + * + * If the inode has metadata buffers attached to mapping->private_list then + * try to remove them. 
+ */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_pruned = 0; + int nr_scanned; + unsigned long reap = 0; + + down(&iprune_sem); + spin_lock(&inode_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_unused)) + break; + + inode = list_entry(inode_unused.prev, struct inode, i_list); + + if (inode->i_state || atomic_read(&inode->i_count)) { + list_move(&inode->i_list, &inode_unused); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_inode_pages(&inode->i_data); + iput(inode); + spin_lock(&inode_lock); + + if (inode != list_entry(inode_unused.next, + struct inode, i_list)) + continue; /* wrong inode or list_empty */ + if (!can_unuse(inode)) + continue; + } + hlist_del_init(&inode->i_hash); + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + nr_pruned++; + } + inodes_stat.nr_unused -= nr_pruned; + spin_unlock(&inode_lock); + + dispose_list(&freeable); + up(&iprune_sem); + + if (current_is_kswapd) + mod_page_state(kswapd_inodesteal, reap); + else + mod_page_state(pginodesteal, reap); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(int nr, unsigned int gfp_mask) +{ + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (gfp_mask & __GFP_FS) + prune_icache(nr); + } + return inodes_stat.nr_unused; +} + +void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() + * by hand after calling find_inode now! This simplifies iunique and won't + * add any additional branch in the common code. + */ +static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = hlist_entry(node, struct inode, i_hash); + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = list_entry(node, struct inode, i_hash); + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. 
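+ * + * A typical use (sketch only, not part of this file): a simple in-memory + * filesystem might create its root directory with + * + * struct inode *inode = new_inode(sb); + * if (inode) { + * inode->i_mode = S_IFDIR | 0755; + * inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + * } + * + * The returned inode sits on the in_use list with i_count == 1 and a + * freshly assigned inode number.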
+ */ + +struct inode *new_inode(struct super_block *sb) +{ + static unsigned long last_ino; + struct inode * inode; + + spin_lock_prefetch(&inode_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode_lock); + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + inode->i_ino = ++last_ino; + inode->i_state = 0; + spin_unlock(&inode_lock); + } + return inode; +} + +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up_inode(inode); +} +EXPORT_SYMBOL(unlock_new_inode); + +/* + * This is called without the inode lock held.. Be careful. + * + * We no longer cache the sb_flags in i_flags - see fs.h + * -- rmk@arm.uk.linux.org + */ +static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; +} + +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); + tmp = tmp + (tmp >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/* Yeah, I know about quadratic hash. Maybe, later. 
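+ * + * For illustration (assuming i_hash_shift == 14, so i_hash_mask == 0x3fff), + * an inode with hash value hashval on superblock sb lands in bucket + * + * t = hashval + ((unsigned long)sb / L1_CACHE_BYTES); + * bucket = (t + (t >> 14)) & 0x3fff; + * + * Folding in the superblock pointer keeps equal inode numbers from + * different filesystems on different hash chains.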
*/ + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ + +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + static ino_t counter = 0; + struct inode *inode; + struct hlist_head * head; + ino_t res; + spin_lock(&inode_lock); +retry: + if (counter > max_reserved) { + head = inode_hashtable + hash(sb,counter); + res = counter++; + inode = find_inode_fast(sb, head, res); + if (!inode) { + spin_unlock(&inode_lock); + return res; + } + } else { + counter = max_reserved + 1; + } + goto retry; + +} + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode_lock); + if (!(inode->i_state & I_FREEING)) + __iget(inode); + else + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + spin_unlock(&inode_lock); + return inode; +} + +/** + * ifind - internal function, you want ilookup5() or iget5(). + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ifind() searches for the inode specified by @hashval and @data in the inode + * cache. This is a generalized version of ifind_fast() for file systems where + * the inode number is not sufficient for unique identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +static inline struct inode *ifind(struct super_block *sb, + struct hlist_head *head, int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode(sb, head, test, data); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ifind_fast - internal function, you want ilookup() or iget(). + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ifind_fast() searches for the inode @ino in the inode cache. This is for + * file systems where the inode number is sufficient for unique identification + * of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. 
+ */ +static inline struct inode *ifind_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data); +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. + * This is for file systems where the inode number is sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + return ifind_fast(sb, head, ino); +} +EXPORT_SYMBOL(ilookup); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is iget() without the read_inode() portion of get_new_inode(). + * + * iget5_locked() uses ifind() to search for the inode specified by @hashval + * and @data in the inode cache and if present it is returned with an increased + * reference count. This is a generalized version of iget_locked() for file + * systems where the inode number is not sufficient for unique identification + * of an inode. + * + * If the inode is not in cache, get_new_inode() is called to allocate a new + * inode and this is returned locked, hashed, and with the I_NEW flag set. The + * file system gets to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_lock held, so can't sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + inode = ifind(sb, head, test, data); + if (inode) + return inode; + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. 
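+ * + * A typical caller (sketch, not from this file) looks like: + * + * inode = iget5_locked(sb, hashval, test, set, data); + * if (inode && (inode->i_state & I_NEW)) { + * ... read the inode from disk and fill it in ... + * unlock_new_inode(inode); + * }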
+ */ + return get_new_inode(sb, head, test, set, data); +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * This is iget() without the read_inode() portion of get_new_inode_fast(). + * + * iget_locked() uses ifind_fast() to search for the inode specified by @ino in + * the inode cache and if present it is returned with an increased reference + * count. This is for file systems where the inode number is sufficient for + * unique identification of an inode. + * + * If the inode is not in cache, get_new_inode_fast() is called to allocate a + * new inode and this is returned locked, hashed, and with the I_NEW flag set. + * The file system gets to fill it in before unlocking it via + * unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + inode = ifind_fast(sb, head, ino); + if (inode) + return inode; + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); +} +EXPORT_SYMBOL(iget_locked); + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. If the inode + * has no superblock it is added to a separate anonymous chain. + */ + +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = &anon_hash_chain; + if (inode->i_sb) + head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock or anonymous hash. 
+ */ + +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} + +void generic_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + +<<<<<<< + hlist_del_init(&inode->i_hash); +||||||| + list_del_init(&inode->i_hash); +======= +>>>>>>> + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* s_op->delete_inode internally recalls clear_inode() */ + delete(inode); + } else + clear_inode(inode); + spin_lock(&inode_lock); + list_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +} +EXPORT_SYMBOL(generic_delete_inode); + +static void generic_forget_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + spin_unlock(&inode_lock); + if (!sb || (sb->s_flags & MS_ACTIVE)) + return; + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); + } + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); +} + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +static void generic_drop_inode(struct inode *inode) +{ + if (!inode->i_nlink) + generic_delete_inode(inode); + else + generic_forget_inode(inode); +} + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop()" function, defaulting to + * the legacy UNIX filesystem behaviour.. + * + * NOTE! NOTE! NOTE! We're called with the inode lock + * held, and the drop function is supposed to release + * the lock! + */ +static inline void iput_final(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + void (*drop)(struct inode *) = generic_drop_inode; + + if (op && op->drop_inode) + drop = op->drop_inode; + drop(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero the inode is also then freed and may be destroyed. + */ + +void iput(struct inode *inode) +{ + if (inode) { + struct super_operations *op = inode->i_sb->s_op; + + if (inode->i_state == I_CLEAR) + BUG(); + + if (op && op->put_inode) + op->put_inode(inode); + + if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + iput_final(inode); + } +} + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. 
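+ * + * For example (illustrative only), an FIBMAP-style lookup of block 4 of a + * file is simply + * + * sector_t phys = bmap(inode, 4); + * + * where a result of 0 means the block is unmapped or the address_space + * provides no bmap method.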
+ */ + +sector_t bmap(struct inode * inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} + +/* + * Return true if the filesystem which backs this inode considers the two + * passed timespecs to be sufficiently different to warrant flushing the + * altered time out to disk. + */ +static int inode_times_differ(struct inode *inode, + struct timespec *old, struct timespec *new) +{ + if (IS_ONE_SECOND(inode)) + return old->tv_sec != new->tv_sec; + return !timespec_equal(old, new); +} + +/** + * update_atime - update the access time + * @inode: inode accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ + +void update_atime(struct inode *inode) +{ + struct timespec now; + + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + if (inode_times_differ(inode, &inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { + if (!timespec_equal(&inode->i_atime, &now)) + inode->i_atime = now; + } +} + +/** + * inode_update_time - update mtime and ctime time + * @inode: inode accessed + * @ctime_too: update ctime too + * + * Update the mtime time on an inode and mark it for writeback. + * When ctime_too is specified update the ctime too. + */ + +void inode_update_time(struct inode *inode, int ctime_too) +{ + struct timespec now = current_kernel_time(); + int sync_it = 0; + + if (inode_times_differ(inode, &inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { + if (inode_times_differ(inode, &inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } + if (sync_it) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL(inode_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * Quota functions that want to walk the inode lists.. + */ +#ifdef CONFIG_QUOTA + +/* Functions back in dquot.c */ +void put_dquot_list(struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); + +void remove_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode; + struct list_head *act_head; + LIST_HEAD(tofree_head); + + if (!sb->dq_op) + return; /* nothing to do */ + spin_lock(&inode_lock); /* This lock is for inodes code */ + /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/ + + list_for_each(act_head, &inode_in_use) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &inode_unused) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_dirty) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_io) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + spin_unlock(&inode_lock); + + put_dquot_list(&tofree_head); +} + +#endif + +/* + * Hashed waitqueues for wait_on_inode(). The table is pretty small - the + * kernel doesn't lock many inodes at the same time. + */ +#define I_WAIT_TABLE_ORDER 3 +static struct i_wait_queue_head { + wait_queue_head_t wqh; +} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER]; + +/* + * Return the address of the waitqueue_head to be used for this inode + */ +static inline wait_queue_head_t *i_waitq_head(struct inode *inode) +{ + return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh; +} + +void __wait_on_inode(struct inode * inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (inode->i_state & I_LOCK) { + schedule(); + goto repeat; + } + remove_wait_queue(wq, &wait); + __set_current_state(TASK_RUNNING); +} + +void __wait_on_freeing_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&inode_lock); + schedule(); + remove_wait_queue(wq, &wait); + current->state = TASK_RUNNING; + spin_lock(&inode_lock); +} + + +void wake_up_inode(struct inode *inode) +{ + wait_queue_head_t *wq = i_waitq_head(inode); + + /* + * Prevent speculative execution through spin_unlock(&inode_lock); + */ + smp_mb(); + if (waitqueue_active(wq)) + wake_up_all(wq); +} + +/* + * Initialize the waitqueues and inode hash table. 
+ */ +void __init inode_init(unsigned long mempages) +{ + struct hlist_head *head; + unsigned long order; + unsigned int nr_hash; + int i; + + for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) + init_waitqueue_head(&i_wait_queue_heads[i].wqh); + + mempages >>= (14 - PAGE_SHIFT); + mempages *= sizeof(struct list_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) + ; + + do { + unsigned long tmp; + + nr_hash = (1UL << order) * PAGE_SIZE / + sizeof(struct hlist_head); + i_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + i_hash_shift = 0; + while ((tmp >>= 1UL) != 0UL) + i_hash_shift++; + + inode_hashtable = (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, order); + } while (inode_hashtable == NULL && --order >= 0); + + printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!inode_hashtable) + panic("Failed to allocate inode hash table\n"); + + head = inode_hashtable; + i = nr_hash; + do { + INIT_HLIST_HEAD(head); + head++; + i--; + } while (i); + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), + 0, SLAB_HWCACHE_ALIGN, init_once, + NULL); + if (!inode_cachep) + panic("cannot create inode slab cache"); + + set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", + mode); +} diff --git a/tests/linux/inode-justrej/merge b/tests/linux/inode-justrej/merge new file mode 100644 index 0000000..685b14e --- /dev/null +++ b/tests/linux/inode-justrej/merge @@ -0,0 +1,1358 @@ +/* + * linux/fs/inode.c + * + * (C) 1997 Linus Torvalds + */ + +#include <linux/config.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/dcache.h> +#include <linux/init.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <linux/hash.h> +#include <linux/swap.h> +#include <linux/security.h> + +/* + * This is needed for the following functions: + * - inode_has_buffers + * - invalidate_inode_buffers + * - fsync_bdev + * - invalidate_bdev + * + * FIXME: remove all knowledge of the buffer layer from this file + */ +#include <linux/buffer_head.h> + +/* + * New inode.c implementation. + * + * This implementation has the basic premise of trying + * to be extremely low-overhead and SMP-safe, yet be + * simple enough to be "obviously correct". + * + * Famous last words. + */ + +/* inode dynamic allocation 1999, Andrea Arcangeli */ + +/* #define INODE_PARANOIA 1 */ +/* #define INODE_DEBUG 1 */ + +/* + * Inode lookup is no longer as critical as it used to be: + * most of the lookups are going to be through the dcache. + */ +#define I_HASHBITS i_hash_shift +#define I_HASHMASK i_hash_mask + +static unsigned int i_hash_mask; +static unsigned int i_hash_shift; + +/* + * Each inode can be on two separate lists. One is + * the hash list of the inode, used for lookups. The + * other linked list is the "type" list: + * "in_use" - valid inode, i_count > 0, i_nlink > 0 + * "dirty" - as "in_use" but also dirty + * "unused" - valid inode, i_count = 0 + * + * A "dirty" list is maintained for each super block, + * allowing for low-overhead inode sync() operations. 
+ */ + +LIST_HEAD(inode_in_use); +LIST_HEAD(inode_unused); +static struct hlist_head *inode_hashtable; +static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ + +/* + * A simple spinlock to protect the list manipulations. + * + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; + +/* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages + * icache shrinking path, and the umount path. Without this exclusion, + * by the time prune_icache calls iput for the inode whose pages it has + * been invalidating, or by the time it calls clear_inode & destroy_inode + * from its final dispose_list, the struct super_block they refer to + * (for inode->i_sb->s_op) may already have been freed and reused. + */ +static DECLARE_MUTEX(iprune_sem); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static kmem_cache_t * inode_cachep; + +static struct inode *alloc_inode(struct super_block *sb) +{ + static struct address_space_operations empty_aops; + static struct inode_operations empty_iops; + static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + + if (inode) { + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_sock = 0; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_rdev = to_kdev_t(0); + inode->i_security = NULL; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + mapping->dirtied_when = 0; + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + if (sb->s_bdev) + mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; + } + return inode; +} + +void destroy_inode(struct inode *inode) +{ + if (inode_has_buffers(inode)) + BUG(); + security_inode_free(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); +} + + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. 
+ */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_data.clean_pages); + INIT_LIST_HEAD(&inode->i_data.dirty_pages); + INIT_LIST_HEAD(&inode->i_data.locked_pages); + INIT_LIST_HEAD(&inode->i_data.io_pages); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + rwlock_init(&inode->i_data.page_lock); + init_MUTEX(&inode->i_data.i_shared_sem); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); + INIT_LIST_HEAD(&inode->i_data.i_mmap); + INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + spin_lock_init(&inode->i_lock); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct inode * inode = (struct inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(inode); +} + +/* + * inode_lock must be held + */ +void __iget(struct inode * inode) +{ + if (atomic_read(&inode->i_count)) { + atomic_inc(&inode->i_count); + return; + } + atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_in_use); + } + inodes_stat.nr_unused--; +} + +/** + * clear_inode - clear an inode + * @inode: inode to clear + * + * This is called by the filesystem to tell us + * that the inode is no longer useful. We just + * terminate it with extreme prejudice. + */ + +void clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + + if (inode->i_data.nrpages) + BUG(); + if (!(inode->i_state & I_FREEING)) + BUG(); + if (inode->i_state & I_CLEAR) + BUG(); + wait_on_inode(inode); + DQUOT_DROP(inode); + if (inode->i_sb && inode->i_sb->s_op->clear_inode) + inode->i_sb->s_op->clear_inode(inode); + if (inode->i_bdev) + bd_forget(inode); + inode->i_state = I_CLEAR; +} + +/* + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + int nr_disposed = 0; + + while (!list_empty(head)) { + struct inode *inode; + + inode = list_entry(head->next, struct inode, i_list); + list_del(&inode->i_list); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); + nr_disposed++; + } + spin_lock(&inode_lock); + inodes_stat.nr_inodes -= nr_disposed; + spin_unlock(&inode_lock); +} + +/* + * Invalidate all inodes for a device. + */ +static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +{ + struct list_head *next; + int busy = 0, count = 0; + + next = head->next; + for (;;) { + struct list_head * tmp = next; + struct inode * inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_list); + if (inode->i_sb != sb) + continue; + invalidate_inode_buffers(inode); + if (!atomic_read(&inode->i_count)) { + hlist_del_init(&inode->i_hash); + list_del(&inode->i_list); + list_add(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + count++; + continue; + } + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; + return busy; +} + +/* + * This is a two-stage process. First we collect all + * offending inodes onto the throw-away list, and in + * the second stage we actually dispose of them. 
This + * is because we don't want to sleep while messing + * with the global lists.. + */ + +/** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +int invalidate_inodes(struct super_block * sb) +{ + int busy; + LIST_HEAD(throw_away); + + down(&iprune_sem); + spin_lock(&inode_lock); + busy = invalidate_list(&inode_in_use, sb, &throw_away); + busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); + up(&iprune_sem); + + return busy; +} + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + struct block_device *bdev = bdget(kdev_t_to_nr(dev)); + int res; + + if (!bdev) + return 0; + + if (do_sync) + fsync_bdev(bdev); + + res = 0; + sb = get_super(bdev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev, 0); + bdput(bdev); + return res; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to + * a temporary list and then are freed outside inode_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. We expect the final iput() on that inode to add it to + * the front of the inode_unused list. So look for it there and if the + * inode is still freeable, proceed. The right inode is found 99.9% of the + * time in testing on a 4-way. + * + * If the inode has metadata buffers attached to mapping->private_list then + * try to remove them. 
+ */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_pruned = 0; + int nr_scanned; + unsigned long reap = 0; + + down(&iprune_sem); + spin_lock(&inode_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_unused)) + break; + + inode = list_entry(inode_unused.prev, struct inode, i_list); + + if (inode->i_state || atomic_read(&inode->i_count)) { + list_move(&inode->i_list, &inode_unused); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_inode_pages(&inode->i_data); + iput(inode); + spin_lock(&inode_lock); + + if (inode != list_entry(inode_unused.next, + struct inode, i_list)) + continue; /* wrong inode or list_empty */ + if (!can_unuse(inode)) + continue; + } + hlist_del_init(&inode->i_hash); + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + nr_pruned++; + } + inodes_stat.nr_unused -= nr_pruned; + spin_unlock(&inode_lock); + + dispose_list(&freeable); + up(&iprune_sem); + + if (current_is_kswapd) + mod_page_state(kswapd_inodesteal, reap); + else + mod_page_state(pginodesteal, reap); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(int nr, unsigned int gfp_mask) +{ + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (gfp_mask & __GFP_FS) + prune_icache(nr); + } + return inodes_stat.nr_unused; +} + +void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() + * by hand after calling find_inode now! This simplifies iunique and won't + * add any additional branch in the common code. + */ +static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = hlist_entry(node, struct inode, i_hash); + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = list_entry(node, struct inode, i_hash); + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. 
+ */ + +struct inode *new_inode(struct super_block *sb) +{ + static unsigned long last_ino; + struct inode * inode; + + spin_lock_prefetch(&inode_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode_lock); + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + inode->i_ino = ++last_ino; + inode->i_state = 0; + spin_unlock(&inode_lock); + } + return inode; +} + +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up_inode(inode); +} +EXPORT_SYMBOL(unlock_new_inode); + +/* + * This is called without the inode lock held.. Be careful. + * + * We no longer cache the sb_flags in i_flags - see fs.h + * -- rmk@arm.uk.linux.org + */ +static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; +} + +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); + tmp = tmp + (tmp >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/* Yeah, I know about quadratic hash. Maybe, later. 
*/ + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ + +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + static ino_t counter = 0; + struct inode *inode; + struct hlist_head * head; + ino_t res; + spin_lock(&inode_lock); +retry: + if (counter > max_reserved) { + head = inode_hashtable + hash(sb,counter); + res = counter++; + inode = find_inode_fast(sb, head, res); + if (!inode) { + spin_unlock(&inode_lock); + return res; + } + } else { + counter = max_reserved + 1; + } + goto retry; + +} + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode_lock); + if (!(inode->i_state & I_FREEING)) + __iget(inode); + else + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + spin_unlock(&inode_lock); + return inode; +} + +/** + * ifind - internal function, you want ilookup5() or iget5(). + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ifind() searches for the inode specified by @hashval and @data in the inode + * cache. This is a generalized version of ifind_fast() for file systems where + * the inode number is not sufficient for unique identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +static inline struct inode *ifind(struct super_block *sb, + struct hlist_head *head, int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode(sb, head, test, data); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ifind_fast - internal function, you want ilookup() or iget(). + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ifind_fast() searches for the inode @ino in the inode cache. This is for + * file systems where the inode number is sufficient for unique identification + * of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. 
+ */ +static inline struct inode *ifind_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data); +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. + * This is for file systems where the inode number is sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + return ifind_fast(sb, head, ino); +} +EXPORT_SYMBOL(ilookup); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is iget() without the read_inode() portion of get_new_inode(). + * + * iget5_locked() uses ifind() to search for the inode specified by @hashval + * and @data in the inode cache and if present it is returned with an increased + * reference count. This is a generalized version of iget_locked() for file + * systems where the inode number is not sufficient for unique identification + * of an inode. + * + * If the inode is not in cache, get_new_inode() is called to allocate a new + * inode and this is returned locked, hashed, and with the I_NEW flag set. The + * file system gets to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_lock held, so can't sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + inode = ifind(sb, head, test, data); + if (inode) + return inode; + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. 
+ */ + return get_new_inode(sb, head, test, set, data); +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * This is iget() without the read_inode() portion of get_new_inode_fast(). + * + * iget_locked() uses ifind_fast() to search for the inode specified by @ino in + * the inode cache and if present it is returned with an increased reference + * count. This is for file systems where the inode number is sufficient for + * unique identification of an inode. + * + * If the inode is not in cache, get_new_inode_fast() is called to allocate a + * new inode and this is returned locked, hashed, and with the I_NEW flag set. + * The file system gets to fill it in before unlocking it via + * unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + inode = ifind_fast(sb, head, ino); + if (inode) + return inode; + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); +} +EXPORT_SYMBOL(iget_locked); + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. If the inode + * has no superblock it is added to a separate anonymous chain. + */ + +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = &anon_hash_chain; + if (inode->i_sb) + head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock or anonymous hash. 
+ */ + +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} + +void generic_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + +<<<<<<< + hlist_del_init(&inode->i_hash); +||||||| + list_del_init(&inode->i_hash); +======= +>>>>>>> + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* s_op->delete_inode internally recalls clear_inode() */ + delete(inode); + } else + clear_inode(inode); + spin_lock(&inode_lock); + list_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +} +EXPORT_SYMBOL(generic_delete_inode); + +static void generic_forget_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + spin_unlock(&inode_lock); + if (!sb || (sb->s_flags & MS_ACTIVE)) + return; + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); + } + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); +} + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +static void generic_drop_inode(struct inode *inode) +{ + if (!inode->i_nlink) + generic_delete_inode(inode); + else + generic_forget_inode(inode); +} + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop()" function, defaulting to + * the legacy UNIX filesystem behaviour.. + * + * NOTE! NOTE! NOTE! We're called with the inode lock + * held, and the drop function is supposed to release + * the lock! + */ +static inline void iput_final(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + void (*drop)(struct inode *) = generic_drop_inode; + + if (op && op->drop_inode) + drop = op->drop_inode; + drop(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero the inode is also then freed and may be destroyed. + */ + +void iput(struct inode *inode) +{ + if (inode) { + struct super_operations *op = inode->i_sb->s_op; + + if (inode->i_state == I_CLEAR) + BUG(); + + if (op && op->put_inode) + op->put_inode(inode); + + if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + iput_final(inode); + } +} + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. 
+ */ + +sector_t bmap(struct inode * inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} + +/* + * Return true if the filesystem which backs this inode considers the two + * passed timespecs to be sufficiently different to warrant flushing the + * altered time out to disk. + */ +static int inode_times_differ(struct inode *inode, + struct timespec *old, struct timespec *new) +{ + if (IS_ONE_SECOND(inode)) + return old->tv_sec != new->tv_sec; + return !timespec_equal(old, new); +} + +/** + * update_atime - update the access time + * @inode: inode accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ + +void update_atime(struct inode *inode) +{ + struct timespec now; + + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + if (inode_times_differ(inode, &inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { + if (!timespec_equal(&inode->i_atime, &now)) + inode->i_atime = now; + } +} + +/** + * inode_update_time - update mtime and ctime time + * @inode: inode accessed + * @ctime_too: update ctime too + * + * Update the mtime time on an inode and mark it for writeback. + * When ctime_too is specified update the ctime too. + */ + +void inode_update_time(struct inode *inode, int ctime_too) +{ + struct timespec now = current_kernel_time(); + int sync_it = 0; + + if (inode_times_differ(inode, &inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { + if (inode_times_differ(inode, &inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } + if (sync_it) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL(inode_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * Quota functions that want to walk the inode lists.. + */ +#ifdef CONFIG_QUOTA + +/* Functions back in dquot.c */ +void put_dquot_list(struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); + +void remove_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode; + struct list_head *act_head; + LIST_HEAD(tofree_head); + + if (!sb->dq_op) + return; /* nothing to do */ + spin_lock(&inode_lock); /* This lock is for inodes code */ + /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/ + + list_for_each(act_head, &inode_in_use) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &inode_unused) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_dirty) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_io) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + spin_unlock(&inode_lock); + + put_dquot_list(&tofree_head); +} + +#endif + +/* + * Hashed waitqueues for wait_on_inode(). The table is pretty small - the + * kernel doesn't lock many inodes at the same time. + */ +#define I_WAIT_TABLE_ORDER 3 +static struct i_wait_queue_head { + wait_queue_head_t wqh; +} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER]; + +/* + * Return the address of the waitqueue_head to be used for this inode + */ +static wait_queue_head_t *i_waitq_head(struct inode *inode) +{ + return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh; +} + +void __wait_on_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (inode->i_state & I_LOCK) { + schedule(); + goto repeat; + } + remove_wait_queue(wq, &wait); + __set_current_state(TASK_RUNNING); +} + +void __wait_on_freeing_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&inode_lock); + schedule(); + remove_wait_queue(wq, &wait); + current->state = TASK_RUNNING; + spin_lock(&inode_lock); +} + + +void wake_up_inode(struct inode *inode) +{ + wait_queue_head_t *wq = i_waitq_head(inode); + + /* + * Prevent speculative execution through spin_unlock(&inode_lock); + */ + smp_mb(); + if (waitqueue_active(wq)) + wake_up_all(wq); +} + +/* + * Initialize the waitqueues and inode hash table.
+ */ +void __init inode_init(unsigned long mempages) +{ + struct hlist_head *head; + unsigned long order; + unsigned int nr_hash; + int i; + + for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) + init_waitqueue_head(&i_wait_queue_heads[i].wqh); + + mempages >>= (14 - PAGE_SHIFT); + mempages *= sizeof(struct list_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) + ; + + do { + unsigned long tmp; + + nr_hash = (1UL << order) * PAGE_SIZE / + sizeof(struct hlist_head); + i_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + i_hash_shift = 0; + while ((tmp >>= 1UL) != 0UL) + i_hash_shift++; + + inode_hashtable = (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, order); + } while (inode_hashtable == NULL && --order >= 0); + + printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!inode_hashtable) + panic("Failed to allocate inode hash table\n"); + + head = inode_hashtable; + i = nr_hash; + do { + INIT_HLIST_HEAD(head); + head++; + i--; + } while (i); + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), + 0, SLAB_HWCACHE_ALIGN, init_once, + NULL); + if (!inode_cachep) + panic("cannot create inode slab cache"); + + set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", + mode); +} diff --git a/tests/linux/inode-justrej/orig b/tests/linux/inode-justrej/orig new file mode 100644 index 0000000..299c900 --- /dev/null +++ b/tests/linux/inode-justrej/orig @@ -0,0 +1,1353 @@ +/* + * linux/fs/inode.c + * + * (C) 1997 Linus Torvalds + */ + +#include <linux/config.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/dcache.h> +#include <linux/init.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <linux/hash.h> +#include <linux/swap.h> +#include <linux/security.h> + +/* + * This is needed for the following functions: + * - inode_has_buffers + * - invalidate_inode_buffers + * - fsync_bdev + * - invalidate_bdev + * + * FIXME: remove all knowledge of the buffer layer from this file + */ +#include <linux/buffer_head.h> + +/* + * New inode.c implementation. + * + * This implementation has the basic premise of trying + * to be extremely low-overhead and SMP-safe, yet be + * simple enough to be "obviously correct". + * + * Famous last words. + */ + +/* inode dynamic allocation 1999, Andrea Arcangeli */ + +/* #define INODE_PARANOIA 1 */ +/* #define INODE_DEBUG 1 */ + +/* + * Inode lookup is no longer as critical as it used to be: + * most of the lookups are going to be through the dcache. + */ +#define I_HASHBITS i_hash_shift +#define I_HASHMASK i_hash_mask + +static unsigned int i_hash_mask; +static unsigned int i_hash_shift; + +/* + * Each inode can be on two separate lists. One is + * the hash list of the inode, used for lookups. The + * other linked list is the "type" list: + * "in_use" - valid inode, i_count > 0, i_nlink > 0 + * "dirty" - as "in_use" but also dirty + * "unused" - valid inode, i_count = 0 + * + * A "dirty" list is maintained for each super block, + * allowing for low-overhead inode sync() operations.
+ */ + +LIST_HEAD(inode_in_use); +LIST_HEAD(inode_unused); +static struct hlist_head *inode_hashtable; +static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ + +/* + * A simple spinlock to protect the list manipulations. + * + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; + +/* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages + * icache shrinking path, and the umount path. Without this exclusion, + * by the time prune_icache calls iput for the inode whose pages it has + * been invalidating, or by the time it calls clear_inode & destroy_inode + * from its final dispose_list, the struct super_block they refer to + * (for inode->i_sb->s_op) may already have been freed and reused. + */ +static DECLARE_MUTEX(iprune_sem); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static kmem_cache_t * inode_cachep; + +static struct inode *alloc_inode(struct super_block *sb) +{ + static struct address_space_operations empty_aops; + static struct inode_operations empty_iops; + static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + + if (inode) { + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_sock = 0; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_rdev = to_kdev_t(0); + inode->i_security = NULL; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + mapping->dirtied_when = 0; + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + if (sb->s_bdev) + mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; + } + return inode; +} + +void destroy_inode(struct inode *inode) +{ + if (inode_has_buffers(inode)) + BUG(); + security_inode_free(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); +} + + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. 
+ */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_data.clean_pages); + INIT_LIST_HEAD(&inode->i_data.dirty_pages); + INIT_LIST_HEAD(&inode->i_data.locked_pages); + INIT_LIST_HEAD(&inode->i_data.io_pages); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + rwlock_init(&inode->i_data.page_lock); + init_MUTEX(&inode->i_data.i_shared_sem); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); + INIT_LIST_HEAD(&inode->i_data.i_mmap); + INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + spin_lock_init(&inode->i_lock); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct inode * inode = (struct inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(inode); +} + +/* + * inode_lock must be held + */ +void __iget(struct inode * inode) +{ + if (atomic_read(&inode->i_count)) { + atomic_inc(&inode->i_count); + return; + } + atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_in_use); + } + inodes_stat.nr_unused--; +} + +/** + * clear_inode - clear an inode + * @inode: inode to clear + * + * This is called by the filesystem to tell us + * that the inode is no longer useful. We just + * terminate it with extreme prejudice. + */ + +void clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + + if (inode->i_data.nrpages) + BUG(); + if (!(inode->i_state & I_FREEING)) + BUG(); + if (inode->i_state & I_CLEAR) + BUG(); + wait_on_inode(inode); + DQUOT_DROP(inode); + if (inode->i_sb && inode->i_sb->s_op->clear_inode) + inode->i_sb->s_op->clear_inode(inode); + if (inode->i_bdev) + bd_forget(inode); + inode->i_state = I_CLEAR; +} + +/* + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + int nr_disposed = 0; + + while (!list_empty(head)) { + struct inode *inode; + + inode = list_entry(head->next, struct inode, i_list); + list_del(&inode->i_list); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); + nr_disposed++; + } + spin_lock(&inode_lock); + inodes_stat.nr_inodes -= nr_disposed; + spin_unlock(&inode_lock); +} + +/* + * Invalidate all inodes for a device. + */ +static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +{ + struct list_head *next; + int busy = 0, count = 0; + + next = head->next; + for (;;) { + struct list_head * tmp = next; + struct inode * inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_list); + if (inode->i_sb != sb) + continue; + invalidate_inode_buffers(inode); + if (!atomic_read(&inode->i_count)) { + hlist_del_init(&inode->i_hash); + list_del(&inode->i_list); + list_add(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + count++; + continue; + } + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; + return busy; +} + +/* + * This is a two-stage process. First we collect all + * offending inodes onto the throw-away list, and in + * the second stage we actually dispose of them. 
This + * is because we don't want to sleep while messing + * with the global lists.. + */ + +/** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +int invalidate_inodes(struct super_block * sb) +{ + int busy; + LIST_HEAD(throw_away); + + down(&iprune_sem); + spin_lock(&inode_lock); + busy = invalidate_list(&inode_in_use, sb, &throw_away); + busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); + up(&iprune_sem); + + return busy; +} + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + struct block_device *bdev = bdget(kdev_t_to_nr(dev)); + int res; + + if (!bdev) + return 0; + + if (do_sync) + fsync_bdev(bdev); + + res = 0; + sb = get_super(bdev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev, 0); + bdput(bdev); + return res; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to + * a temporary list and then are freed outside inode_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. We expect the final iput() on that inode to add it to + * the front of the inode_unused list. So look for it there and if the + * inode is still freeable, proceed. The right inode is found 99.9% of the + * time in testing on a 4-way. + * + * If the inode has metadata buffers attached to mapping->private_list then + * try to remove them. 
+ */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_pruned = 0; + int nr_scanned; + unsigned long reap = 0; + + down(&iprune_sem); + spin_lock(&inode_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_unused)) + break; + + inode = list_entry(inode_unused.prev, struct inode, i_list); + + if (inode->i_state || atomic_read(&inode->i_count)) { + list_move(&inode->i_list, &inode_unused); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_inode_pages(&inode->i_data); + iput(inode); + spin_lock(&inode_lock); + + if (inode != list_entry(inode_unused.next, + struct inode, i_list)) + continue; /* wrong inode or list_empty */ + if (!can_unuse(inode)) + continue; + } + hlist_del_init(&inode->i_hash); + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + nr_pruned++; + } + inodes_stat.nr_unused -= nr_pruned; + spin_unlock(&inode_lock); + + dispose_list(&freeable); + up(&iprune_sem); + + if (current_is_kswapd) + mod_page_state(kswapd_inodesteal, reap); + else + mod_page_state(pginodesteal, reap); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(int nr, unsigned int gfp_mask) +{ + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (gfp_mask & __GFP_FS) + prune_icache(nr); + } + return inodes_stat.nr_unused; +} + +void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() + * by hand after calling find_inode now! This simplifies iunique and won't + * add any additional branch in the common code. + */ +static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = hlist_entry(node, struct inode, i_hash); + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode * inode = NULL; + + hlist_for_each (node, head) { + prefetch(node->next); + inode = list_entry(node, struct inode, i_hash); + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + tmp = head; + continue; + } + break; + } + return node ? inode : NULL; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. 
+ */ + +struct inode *new_inode(struct super_block *sb) +{ + static unsigned long last_ino; + struct inode * inode; + + spin_lock_prefetch(&inode_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode_lock); + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + inode->i_ino = ++last_ino; + inode->i_state = 0; + spin_unlock(&inode_lock); + } + return inode; +} + +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up_inode(inode); +} +EXPORT_SYMBOL(unlock_new_inode); + +/* + * This is called without the inode lock held.. Be careful. + * + * We no longer cache the sb_flags in i_flags - see fs.h + * -- rmk@arm.uk.linux.org + */ +static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; +} + +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); + tmp = tmp + (tmp >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/* Yeah, I know about quadratic hash. Maybe, later. 
*/ + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ + +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + static ino_t counter = 0; + struct inode *inode; + struct hlist_head * head; + ino_t res; + spin_lock(&inode_lock); +retry: + if (counter > max_reserved) { + head = inode_hashtable + hash(sb,counter); + res = counter++; + inode = find_inode_fast(sb, head, res); + if (!inode) { + spin_unlock(&inode_lock); + return res; + } + } else { + counter = max_reserved + 1; + } + goto retry; + +} + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode_lock); + if (!(inode->i_state & I_FREEING)) + __iget(inode); + else + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + spin_unlock(&inode_lock); + return inode; +} + +/** + * ifind - internal function, you want ilookup5() or iget5(). + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ifind() searches for the inode specified by @hashval and @data in the inode + * cache. This is a generalized version of ifind_fast() for file systems where + * the inode number is not sufficient for unique identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +static inline struct inode *ifind(struct super_block *sb, + struct hlist_head *head, int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode(sb, head, test, data); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ifind_fast - internal function, you want ilookup() or iget(). + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ifind_fast() searches for the inode @ino in the inode cache. This is for + * file systems where the inode number is sufficient for unique identification + * of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. 
+ */ +static inline struct inode *ifind_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data); +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. + * This is for file systems where the inode number is sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + return ifind_fast(sb, head, ino); +} +EXPORT_SYMBOL(ilookup); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is iget() without the read_inode() portion of get_new_inode(). + * + * iget5_locked() uses ifind() to search for the inode specified by @hashval + * and @data in the inode cache and if present it is returned with an increased + * reference count. This is a generalized version of iget_locked() for file + * systems where the inode number is not sufficient for unique identification + * of an inode. + * + * If the inode is not in cache, get_new_inode() is called to allocate a new + * inode and this is returned locked, hashed, and with the I_NEW flag set. The + * file system gets to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_lock held, so can't sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + inode = ifind(sb, head, test, data); + if (inode) + return inode; + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. 
+ */ + return get_new_inode(sb, head, test, set, data); +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * This is iget() without the read_inode() portion of get_new_inode_fast(). + * + * iget_locked() uses ifind_fast() to search for the inode specified by @ino in + * the inode cache and if present it is returned with an increased reference + * count. This is for file systems where the inode number is sufficient for + * unique identification of an inode. + * + * If the inode is not in cache, get_new_inode_fast() is called to allocate a + * new inode and this is returned locked, hashed, and with the I_NEW flag set. + * The file system gets to fill it in before unlocking it via + * unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + inode = ifind_fast(sb, head, ino); + if (inode) + return inode; + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); +} +EXPORT_SYMBOL(iget_locked); + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. If the inode + * has no superblock it is added to a separate anonymous chain. + */ + +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = &anon_hash_chain; + if (inode->i_sb) + head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock or anonymous hash. 
+ */ + +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} + +void generic_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + + hlist_del_init(&inode->i_hash); + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* s_op->delete_inode internally recalls clear_inode() */ + delete(inode); + } else + clear_inode(inode); + spin_lock(&inode_lock); + list_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +} +EXPORT_SYMBOL(generic_delete_inode); + +static void generic_forget_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + spin_unlock(&inode_lock); + if (!sb || (sb->s_flags & MS_ACTIVE)) + return; + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); + } + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); +} + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +static void generic_drop_inode(struct inode *inode) +{ + if (!inode->i_nlink) + generic_delete_inode(inode); + else + generic_forget_inode(inode); +} + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop()" function, defaulting to + * the legacy UNIX filesystem behaviour.. + * + * NOTE! NOTE! NOTE! We're called with the inode lock + * held, and the drop function is supposed to release + * the lock! + */ +static inline void iput_final(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + void (*drop)(struct inode *) = generic_drop_inode; + + if (op && op->drop_inode) + drop = op->drop_inode; + drop(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero the inode is also then freed and may be destroyed. + */ + +void iput(struct inode *inode) +{ + if (inode) { + struct super_operations *op = inode->i_sb->s_op; + + if (inode->i_state == I_CLEAR) + BUG(); + + if (op && op->put_inode) + op->put_inode(inode); + + if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + iput_final(inode); + } +} + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. 
+ */ + +sector_t bmap(struct inode * inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} + +/* + * Return true if the filesystem which backs this inode considers the two + * passed timespecs to be sufficiently different to warrant flushing the + * altered time out to disk. + */ +static int inode_times_differ(struct inode *inode, + struct timespec *old, struct timespec *new) +{ + if (IS_ONE_SECOND(inode)) + return old->tv_sec != new->tv_sec; + return !timespec_equal(old, new); +} + +/** + * update_atime - update the access time + * @inode: inode accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ + +void update_atime(struct inode *inode) +{ + struct timespec now; + + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + if (inode_times_differ(inode, &inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { + if (!timespec_equal(&inode->i_atime, &now)) + inode->i_atime = now; + } +} + +/** + * inode_update_time - update mtime and ctime time + * @inode: inode accessed + * @ctime_too: update ctime too + * + * Update the mtime time on an inode and mark it for writeback. + * When ctime_too is specified update the ctime too. + */ + +void inode_update_time(struct inode *inode, int ctime_too) +{ + struct timespec now = current_kernel_time(); + int sync_it = 0; + + if (inode_times_differ(inode, &inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { + if (inode_times_differ(inode, &inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } + if (sync_it) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL(inode_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * Quota functions that want to walk the inode lists.. + */ +#ifdef CONFIG_QUOTA + +/* Functions back in dquot.c */ +void put_dquot_list(struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); + +void remove_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode; + struct list_head *act_head; + LIST_HEAD(tofree_head); + + if (!sb->dq_op) + return; /* nothing to do */ + spin_lock(&inode_lock); /* This lock is for inodes code */ + /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/ + + list_for_each(act_head, &inode_in_use) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &inode_unused) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_dirty) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_io) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + spin_unlock(&inode_lock); + + put_dquot_list(&tofree_head); +} + +#endif + +/* + * Hashed waitqueues for wait_on_inode(). The table is pretty small - the + * kernel doesn't lock many inodes at the same time. + */ +#define I_WAIT_TABLE_ORDER 3 +static struct i_wait_queue_head { + wait_queue_head_t wqh; +} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER]; + +/* + * Return the address of the waitqueue_head to be used for this inode + */ +static wait_queue_head_t *i_waitq_head(struct inode *inode) +{ + return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh; +} + +void __wait_on_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (inode->i_state & I_LOCK) { + schedule(); + goto repeat; + } + remove_wait_queue(wq, &wait); + __set_current_state(TASK_RUNNING); +} + +void __wait_on_freeing_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&inode_lock); + schedule(); + remove_wait_queue(wq, &wait); + current->state = TASK_RUNNING; + spin_lock(&inode_lock); +} + + +void wake_up_inode(struct inode *inode) +{ + wait_queue_head_t *wq = i_waitq_head(inode); + + /* + * Prevent speculative execution through spin_unlock(&inode_lock); + */ + smp_mb(); + if (waitqueue_active(wq)) + wake_up_all(wq); +} + +/* + * Initialize the waitqueues and inode hash table.
+ */ +void __init inode_init(unsigned long mempages) +{ + struct hlist_head *head; + unsigned long order; + unsigned int nr_hash; + int i; + + for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) + init_waitqueue_head(&i_wait_queue_heads[i].wqh); + + mempages >>= (14 - PAGE_SHIFT); + mempages *= sizeof(struct list_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) + ; + + do { + unsigned long tmp; + + nr_hash = (1UL << order) * PAGE_SIZE / + sizeof(struct hlist_head); + i_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + i_hash_shift = 0; + while ((tmp >>= 1UL) != 0UL) + i_hash_shift++; + + inode_hashtable = (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, order); + } while (inode_hashtable == NULL && --order >= 0); + + printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!inode_hashtable) + panic("Failed to allocate inode hash table\n"); + + head = inode_hashtable; + i = nr_hash; + do { + INIT_HLIST_HEAD(head); + head++; + i--; + } while (i); + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), + 0, SLAB_HWCACHE_ALIGN, init_once, + NULL); + if (!inode_cachep) + panic("cannot create inode slab cache"); + + set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = to_kdev_t(rdev); + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", + mode); +} diff --git a/tests/linux/inode-justrej/patch b/tests/linux/inode-justrej/patch new file mode 100644 index 0000000..ec42e22 --- /dev/null +++ b/tests/linux/inode-justrej/patch @@ -0,0 +1,16 @@ +*************** +*** 942,948 **** + { + struct super_operations *op = inode->i_sb->s_op; + +- list_del_init(&inode->i_hash); + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; +--- 953,958 ---- + { + struct super_operations *op = inode->i_sb->s_op; + + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; diff --git a/tests/linux/inode-justrej/wmerge b/tests/linux/inode-justrej/wmerge new file mode 100644 index 0000000..1ffda02 --- /dev/null +++ b/tests/linux/inode-justrej/wmerge @@ -0,0 +1,1352 @@ +/* + * linux/fs/inode.c + * + * (C) 1997 Linus Torvalds + */ + +#include <linux/config.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/dcache.h> +#include <linux/init.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <linux/hash.h> +#include <linux/swap.h> +#include <linux/security.h> + +/* + * This is needed for the following functions: + * - inode_has_buffers + * - invalidate_inode_buffers + * - fsync_bdev + * - invalidate_bdev + * + * FIXME: remove all knowledge of the buffer layer from this file + */ +#include <linux/buffer_head.h> + +/* + * New inode.c implementation. + * + * This implementation has the basic premise of trying + * to be extremely low-overhead and SMP-safe, yet be + * simple enough to be "obviously correct". + * + * Famous last words. + */ + +/* inode dynamic allocation 1999, Andrea Arcangeli */ + +/* #define INODE_PARANOIA 1 */ +/* #define INODE_DEBUG 1 */ + +/* + * Inode lookup is no longer as critical as it used to be: + * most of the lookups are going to be through the dcache.
+ */ +#define I_HASHBITS i_hash_shift +#define I_HASHMASK i_hash_mask + +static unsigned int i_hash_mask; +static unsigned int i_hash_shift; + +/* + * Each inode can be on two separate lists. One is + * the hash list of the inode, used for lookups. The + * other linked list is the "type" list: + * "in_use" - valid inode, i_count > 0, i_nlink > 0 + * "dirty" - as "in_use" but also dirty + * "unused" - valid inode, i_count = 0 + * + * A "dirty" list is maintained for each super block, + * allowing for low-overhead inode sync() operations. + */ + +LIST_HEAD(inode_in_use); +LIST_HEAD(inode_unused); +static struct hlist_head *inode_hashtable; +static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */ + +/* + * A simple spinlock to protect the list manipulations. + * + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; + +/* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages + * icache shrinking path, and the umount path. Without this exclusion, + * by the time prune_icache calls iput for the inode whose pages it has + * been invalidating, or by the time it calls clear_inode & destroy_inode + * from its final dispose_list, the struct super_block they refer to + * (for inode->i_sb->s_op) may already have been freed and reused. + */ +static DECLARE_MUTEX(iprune_sem); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static kmem_cache_t * inode_cachep; + +static struct inode *alloc_inode(struct super_block *sb) +{ + static struct address_space_operations empty_aops; + static struct inode_operations empty_iops; + static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + + if (inode) { + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_sock = 0; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_rdev = to_kdev_t(0); + inode->i_security = NULL; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->gfp_mask = GFP_HIGHUSER; + mapping->dirtied_when = 0; + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + if (sb->s_bdev) + mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; + } + return inode; +} + +void destroy_inode(struct inode *inode) +{ + if (inode_has_buffers(inode)) + BUG(); + security_inode_free(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); +} + + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. 
+ */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_data.clean_pages); + INIT_LIST_HEAD(&inode->i_data.dirty_pages); + INIT_LIST_HEAD(&inode->i_data.locked_pages); + INIT_LIST_HEAD(&inode->i_data.io_pages); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + rwlock_init(&inode->i_data.page_lock); + init_MUTEX(&inode->i_data.i_shared_sem); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); + INIT_LIST_HEAD(&inode->i_data.i_mmap); + INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + spin_lock_init(&inode->i_lock); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct inode * inode = (struct inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(inode); +} + +/* + * inode_lock must be held + */ +void __iget(struct inode * inode) +{ + if (atomic_read(&inode->i_count)) { + atomic_inc(&inode->i_count); + return; + } + atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_in_use); + } + inodes_stat.nr_unused--; +} + +/** + * clear_inode - clear an inode + * @inode: inode to clear + * + * This is called by the filesystem to tell us + * that the inode is no longer useful. We just + * terminate it with extreme prejudice. + */ + +void clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + + if (inode->i_data.nrpages) + BUG(); + if (!(inode->i_state & I_FREEING)) + BUG(); + if (inode->i_state & I_CLEAR) + BUG(); + wait_on_inode(inode); + DQUOT_DROP(inode); + if (inode->i_sb && inode->i_sb->s_op->clear_inode) + inode->i_sb->s_op->clear_inode(inode); + if (inode->i_bdev) + bd_forget(inode); + inode->i_state = I_CLEAR; +} + +/* + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + int nr_disposed = 0; + + while (!list_empty(head)) { + struct inode *inode; + + inode = list_entry(head->next, struct inode, i_list); + list_del(&inode->i_list); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); + nr_disposed++; + } + spin_lock(&inode_lock); + inodes_stat.nr_inodes -= nr_disposed; + spin_unlock(&inode_lock); +} + +/* + * Invalidate all inodes for a device. + */ +static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +{ + struct list_head *next; + int busy = 0, count = 0; + + next = head->next; + for (;;) { + struct list_head * tmp = next; + struct inode * inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_list); + if (inode->i_sb != sb) + continue; + invalidate_inode_buffers(inode); + if (!atomic_read(&inode->i_count)) { + hlist_del_init(&inode->i_hash); + list_del(&inode->i_list); + list_add(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + count++; + continue; + } + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; + return busy; +} + +/* + * This is a two-stage process. First we collect all + * offending inodes onto the throw-away list, and in + * the second stage we actually dispose of them. 
This + * is because we don't want to sleep while messing + * with the global lists.. + */ + +/** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +int invalidate_inodes(struct super_block * sb) +{ + int busy; + LIST_HEAD(throw_away); + + down(&iprune_sem); + spin_lock(&inode_lock); + busy = invalidate_list(&inode_in_use, sb, &throw_away); + busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); + up(&iprune_sem); + + return busy; +} + +int invalidate_device(kdev_t dev, int do_sync) +{ + struct super_block *sb; + struct block_device *bdev = bdget(kdev_t_to_nr(dev)); + int res; + + if (!bdev) + return 0; + + if (do_sync) + fsync_bdev(bdev); + + res = 0; + sb = get_super(bdev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev, 0); + bdput(bdev); + return res; +} + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to + * a temporary list and then are freed outside inode_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. We expect the final iput() on that inode to add it to + * the front of the inode_unused list. So look for it there and if the + * inode is still freeable, proceed. The right inode is found 99.9% of the + * time in testing on a 4-way. + * + * If the inode has metadata buffers attached to mapping->private_list then + * try to remove them. 
+ */
+static void prune_icache(int nr_to_scan)
+{
+	LIST_HEAD(freeable);
+	int nr_pruned = 0;
+	int nr_scanned;
+	unsigned long reap = 0;
+
+	down(&iprune_sem);
+	spin_lock(&inode_lock);
+	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+		struct inode *inode;
+
+		if (list_empty(&inode_unused))
+			break;
+
+		inode = list_entry(inode_unused.prev, struct inode, i_list);
+
+		if (inode->i_state || atomic_read(&inode->i_count)) {
+			list_move(&inode->i_list, &inode_unused);
+			continue;
+		}
+		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+			__iget(inode);
+			spin_unlock(&inode_lock);
+			if (remove_inode_buffers(inode))
+				reap += invalidate_inode_pages(&inode->i_data);
+			iput(inode);
+			spin_lock(&inode_lock);
+
+			if (inode != list_entry(inode_unused.next,
+						struct inode, i_list))
+				continue;	/* wrong inode or list_empty */
+			if (!can_unuse(inode))
+				continue;
+		}
+		hlist_del_init(&inode->i_hash);
+		list_move(&inode->i_list, &freeable);
+		inode->i_state |= I_FREEING;
+		nr_pruned++;
+	}
+	inodes_stat.nr_unused -= nr_pruned;
+	spin_unlock(&inode_lock);
+
+	dispose_list(&freeable);
+	up(&iprune_sem);
+
+	if (current_is_kswapd())
+		mod_page_state(kswapd_inodesteal, reap);
+	else
+		mod_page_state(pginodesteal, reap);
+}
+
+/*
+ * shrink_icache_memory() will attempt to reclaim some unused inodes.  Here,
+ * "unused" means that no dentries are referring to the inodes: the files are
+ * not open and the dcache references to those inodes have already been
+ * reclaimed.
+ *
+ * This function is passed the number of inodes to scan, and it returns the
+ * total number of remaining possibly-reclaimable inodes.
+ */
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+{
+	if (nr) {
+		/*
+		 * Nasty deadlock avoidance.  We may hold various FS locks,
+		 * and we don't want to recurse into the FS that called us
+		 * in clear_inode() and friends..
+		 */
+		if (gfp_mask & __GFP_FS)
+			prune_icache(nr);
+	}
+	return inodes_stat.nr_unused;
+}
+
+void __wait_on_freeing_inode(struct inode *inode);
+/*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+ * by hand after calling find_inode now! This simplifies iunique and won't
+ * add any additional branch in the common code.
+ */
+static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+{
+	struct hlist_node *node;
+	struct inode * inode = NULL;
+
+repeat:
+	hlist_for_each (node, head) {
+		prefetch(node->next);
+		inode = hlist_entry(node, struct inode, i_hash);
+		if (inode->i_sb != sb)
+			continue;
+		if (!test(inode, data))
+			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
+		break;
+	}
+	return node ? inode : NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+{
+	struct hlist_node *node;
+	struct inode * inode = NULL;
+
+repeat:
+	hlist_for_each (node, head) {
+		prefetch(node->next);
+		inode = list_entry(node, struct inode, i_hash);
+		if (inode->i_ino != ino)
+			continue;
+		if (inode->i_sb != sb)
+			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
+		break;
+	}
+	return node ? inode : NULL;
+}
+
+/**
+ * new_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for given superblock.
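+ *
+ * Typical use by a filesystem (a sketch; the mode value and the error
+ * handling are hypothetical):
+ *
+ *	inode = new_inode(sb);
+ *	if (!inode)
+ *		return -ENOMEM;
+ *	inode->i_mode = S_IFREG | 0644;
+ *	insert_inode_hash(inode);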
+ */ + +struct inode *new_inode(struct super_block *sb) +{ + static unsigned long last_ino; + struct inode * inode; + + spin_lock_prefetch(&inode_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode_lock); + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + inode->i_ino = ++last_ino; + inode->i_state = 0; + spin_unlock(&inode_lock); + } + return inode; +} + +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up_inode(inode); +} +EXPORT_SYMBOL(unlock_new_inode); + +/* + * This is called without the inode lock held.. Be careful. + * + * We no longer cache the sb_flags in i_flags - see fs.h + * -- rmk@arm.uk.linux.org + */ +static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; +} + +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); + tmp = tmp + (tmp >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/* Yeah, I know about quadratic hash. Maybe, later. 
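+ *
+ * Worked example with small numbers (assuming an 8-bit table, i.e.
+ * I_HASHBITS = 8, I_HASHMASK = 255): with hashval = 300 and
+ * (unsigned long) sb / L1_CACHE_BYTES = 1000,
+ *	tmp = 1300, tmp >> 8 = 5, (1300 + 5) & 255 = 25,
+ * so the inode lands on hash chain 25.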
*/ + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ + +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + static ino_t counter = 0; + struct inode *inode; + struct hlist_head * head; + ino_t res; + spin_lock(&inode_lock); +retry: + if (counter > max_reserved) { + head = inode_hashtable + hash(sb,counter); + res = counter++; + inode = find_inode_fast(sb, head, res); + if (!inode) { + spin_unlock(&inode_lock); + return res; + } + } else { + counter = max_reserved + 1; + } + goto retry; + +} + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode_lock); + if (!(inode->i_state & I_FREEING)) + __iget(inode); + else + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + spin_unlock(&inode_lock); + return inode; +} + +/** + * ifind - internal function, you want ilookup5() or iget5(). + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ifind() searches for the inode specified by @hashval and @data in the inode + * cache. This is a generalized version of ifind_fast() for file systems where + * the inode number is not sufficient for unique identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +static inline struct inode *ifind(struct super_block *sb, + struct hlist_head *head, int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode(sb, head, test, data); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ifind_fast - internal function, you want ilookup() or iget(). + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ifind_fast() searches for the inode @ino in the inode cache. This is for + * file systems where the inode number is sufficient for unique identification + * of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. 
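+ *
+ * Sketch of the calling pattern (ino is whatever the caller has at
+ * hand; ilookup() below is the public wrapper):
+ *
+ *	inode = ifind_fast(sb, head, ino);
+ *	if (inode) {
+ *		... use the inode ...
+ *		iput(inode);
+ *	}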
+ */ +static inline struct inode *ifind_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data); +} +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. + * This is for file systems where the inode number is sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + return ifind_fast(sb, head, ino); +} +EXPORT_SYMBOL(ilookup); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is iget() without the read_inode() portion of get_new_inode(). + * + * iget5_locked() uses ifind() to search for the inode specified by @hashval + * and @data in the inode cache and if present it is returned with an increased + * reference count. This is a generalized version of iget_locked() for file + * systems where the inode number is not sufficient for unique identification + * of an inode. + * + * If the inode is not in cache, get_new_inode() is called to allocate a new + * inode and this is returned locked, hashed, and with the I_NEW flag set. The + * file system gets to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_lock held, so can't sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + inode = ifind(sb, head, test, data); + if (inode) + return inode; + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. 
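+ *
+ * A full lookup-or-create cycle then looks like this (a sketch;
+ * my_test, my_set and key stand in for filesystem-supplied pieces):
+ *
+ *	inode = iget5_locked(sb, hashval, my_test, my_set, &key);
+ *	if (inode && (inode->i_state & I_NEW)) {
+ *		... read the on-disk inode and fill things in ...
+ *		unlock_new_inode(inode);
+ *	}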
+ */ + return get_new_inode(sb, head, test, set, data); +} +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * This is iget() without the read_inode() portion of get_new_inode_fast(). + * + * iget_locked() uses ifind_fast() to search for the inode specified by @ino in + * the inode cache and if present it is returned with an increased reference + * count. This is for file systems where the inode number is sufficient for + * unique identification of an inode. + * + * If the inode is not in cache, get_new_inode_fast() is called to allocate a + * new inode and this is returned locked, hashed, and with the I_NEW flag set. + * The file system gets to fill it in before unlocking it via + * unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + inode = ifind_fast(sb, head, ino); + if (inode) + return inode; + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); +} +EXPORT_SYMBOL(iget_locked); + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. If the inode + * has no superblock it is added to a separate anonymous chain. + */ + +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = &anon_hash_chain; + if (inode->i_sb) + head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock or anonymous hash. 
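+ *
+ * Paired with __insert_inode_hash() this lets an inode be renumbered
+ * (a sketch; new_ino is hypothetical):
+ *
+ *	remove_inode_hash(inode);
+ *	inode->i_ino = new_ino;
+ *	__insert_inode_hash(inode, new_ino);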
+ */ + +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} + +void generic_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + +<<<---hlist_del_init|||list_del_init===--->>> list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* s_op->delete_inode internally recalls clear_inode() */ + delete(inode); + } else + clear_inode(inode); + spin_lock(&inode_lock); + list_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +} +EXPORT_SYMBOL(generic_delete_inode); + +static void generic_forget_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + spin_unlock(&inode_lock); + if (!sb || (sb->s_flags & MS_ACTIVE)) + return; + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); + } + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); +} + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +static void generic_drop_inode(struct inode *inode) +{ + if (!inode->i_nlink) + generic_delete_inode(inode); + else + generic_forget_inode(inode); +} + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop()" function, defaulting to + * the legacy UNIX filesystem behaviour.. + * + * NOTE! NOTE! NOTE! We're called with the inode lock + * held, and the drop function is supposed to release + * the lock! + */ +static inline void iput_final(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + void (*drop)(struct inode *) = generic_drop_inode; + + if (op && op->drop_inode) + drop = op->drop_inode; + drop(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero the inode is also then freed and may be destroyed. + */ + +void iput(struct inode *inode) +{ + if (inode) { + struct super_operations *op = inode->i_sb->s_op; + + if (inode->i_state == I_CLEAR) + BUG(); + + if (op && op->put_inode) + op->put_inode(inode); + + if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + iput_final(inode); + } +} + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. 
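+ *
+ * Sketch of the classic FIBMAP-style use (block numbers illustrative):
+ *
+ *	sector_t phys = bmap(inode, 4);
+ *	if (phys == 0)
+ *		... hole, or the filesystem has no ->bmap method ...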
+ */ + +sector_t bmap(struct inode * inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} + +/* + * Return true if the filesystem which backs this inode considers the two + * passed timespecs to be sufficiently different to warrant flushing the + * altered time out to disk. + */ +static int inode_times_differ(struct inode *inode, + struct timespec *old, struct timespec *new) +{ + if (IS_ONE_SECOND(inode)) + return old->tv_sec != new->tv_sec; + return !timespec_equal(old, new); +} + +/** + * update_atime - update the access time + * @inode: inode accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ + +void update_atime(struct inode *inode) +{ + struct timespec now; + + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + if (inode_times_differ(inode, &inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { + if (!timespec_equal(&inode->i_atime, &now)) + inode->i_atime = now; + } +} + +/** + * inode_update_time - update mtime and ctime time + * @inode: inode accessed + * @ctime_too: update ctime too + * + * Update the mtime time on an inode and mark it for writeback. + * When ctime_too is specified update the ctime too. + */ + +void inode_update_time(struct inode *inode, int ctime_too) +{ + struct timespec now = current_kernel_time(); + int sync_it = 0; + + if (inode_times_differ(inode, &inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { + if (inode_times_differ(inode, &inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } + if (sync_it) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL(inode_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * Quota functions that want to walk the inode lists.. + */ +#ifdef CONFIG_QUOTA + +/* Functions back in dquot.c */ +void put_dquot_list(struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); + +void remove_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode; + struct list_head *act_head; + LIST_HEAD(tofree_head); + + if (!sb->dq_op) + return; /* nothing to do */ + spin_lock(&inode_lock); /* This lock is for inodes code */ + /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/
+
+	list_for_each(act_head, &inode_in_use) {
+		inode = list_entry(act_head, struct inode, i_list);
+		if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+			remove_inode_dquot_ref(inode, type, &tofree_head);
+	}
+	list_for_each(act_head, &inode_unused) {
+		inode = list_entry(act_head, struct inode, i_list);
+		if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+			remove_inode_dquot_ref(inode, type, &tofree_head);
+	}
+	list_for_each(act_head, &sb->s_dirty) {
+		inode = list_entry(act_head, struct inode, i_list);
+		if (IS_QUOTAINIT(inode))
+			remove_inode_dquot_ref(inode, type, &tofree_head);
+	}
+	list_for_each(act_head, &sb->s_io) {
+		inode = list_entry(act_head, struct inode, i_list);
+		if (IS_QUOTAINIT(inode))
+			remove_inode_dquot_ref(inode, type, &tofree_head);
+	}
+	spin_unlock(&inode_lock);
+
+	put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode().  The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER	3
+static struct i_wait_queue_head {
+	wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+	return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	wait_queue_head_t *wq = i_waitq_head(inode);
+
+	add_wait_queue(wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (inode->i_state & I_LOCK) {
+		schedule();
+		goto repeat;
+	}
+	remove_wait_queue(wq, &wait);
+	__set_current_state(TASK_RUNNING);
+}
+
+void __wait_on_freeing_inode(struct inode *inode)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	wait_queue_head_t *wq = i_waitq_head(inode);
+
+	add_wait_queue(wq, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode_lock);
+	schedule();
+	remove_wait_queue(wq, &wait);
+	current->state = TASK_RUNNING;
+	spin_lock(&inode_lock);
+}
+
+
+void wake_up_inode(struct inode *inode)
+{
+	wait_queue_head_t *wq = i_waitq_head(inode);
+
+	/*
+	 * Prevent speculative execution through spin_unlock(&inode_lock);
+	 */
+	smp_mb();
+	if (waitqueue_active(wq))
+		wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
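+/*
+ * Worked example of the sizing below (assuming PAGE_SHIFT = 12, 32-bit
+ * pointers and mempages = 32768, i.e. 128MB): mempages >>= 2 gives
+ * 8192, times sizeof(struct list_head) = 8 gives 64KB, so order ends
+ * up 4 and the table gets (1 << 4) * 4096 / sizeof(struct hlist_head)
+ * = 16384 hash chains.
+ */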
+void __init inode_init(unsigned long mempages)
+{
+	struct hlist_head *head;
+	unsigned long order;
+	unsigned int nr_hash;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+		init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+
+	mempages >>= (14 - PAGE_SHIFT);
+	mempages *= sizeof(struct list_head);
+	for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+		;
+
+	do {
+		unsigned long tmp;
+
+		nr_hash = (1UL << order) * PAGE_SIZE /
+			sizeof(struct hlist_head);
+		i_hash_mask = (nr_hash - 1);
+
+		tmp = nr_hash;
+		i_hash_shift = 0;
+		while ((tmp >>= 1UL) != 0UL)
+			i_hash_shift++;
+
+		inode_hashtable = (struct hlist_head *)
+			__get_free_pages(GFP_ATOMIC, order);
+	} while (inode_hashtable == NULL && --order >= 0);
+
+	printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+			nr_hash, order, (PAGE_SIZE << order));
+
+	if (!inode_hashtable)
+		panic("Failed to allocate inode hash table\n");
+
+	head = inode_hashtable;
+	i = nr_hash;
+	do {
+		INIT_HLIST_HEAD(head);
+		head++;
+		i--;
+	} while (i);
+
+	/* inode slab cache */
+	inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+					 0, SLAB_HWCACHE_ALIGN, init_once,
+					 NULL);
+	if (!inode_cachep)
+		panic("cannot create inode slab cache");
+
+	set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+}
+
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+	inode->i_mode = mode;
+	if (S_ISCHR(mode)) {
+		inode->i_fop = &def_chr_fops;
+		inode->i_rdev = to_kdev_t(rdev);
+	} else if (S_ISBLK(mode)) {
+		inode->i_fop = &def_blk_fops;
+		inode->i_rdev = to_kdev_t(rdev);
+	} else if (S_ISFIFO(mode))
+		inode->i_fop = &def_fifo_fops;
+	else if (S_ISSOCK(mode))
+		inode->i_fop = &bad_sock_fops;
+	else
+		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+		       mode);
+}
diff --git a/tests/linux/md-autostart/merge b/tests/linux/md-autostart/merge
new file mode 100644
index 0000000..b3bde61
--- /dev/null
+++ b/tests/linux/md-autostart/merge
@@ -0,0 +1,4025 @@
+/*
+   md.c : Multiple Devices driver for Linux
+	  Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+     completely rewritten, based on the MD driver code from Marc Zyngier
+
+   Changes:
+
+   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+   - boot support for linear and striped mode by Harald Hoyer
+   - kerneld support by Boris Tobotras
+   - kmod support by: Cyrus Durgin
+   - RAID0 bugfixes: Mark Anthony Lisher
+   - Devfs support by Richard Gooch
+
+   - lots of fixes and improvements to the RAID1/RAID5 and generic
+     RAID code (such as request based resynchronization):
+
+     Neil Brown .
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/raid/xor.h>
+#include <linux/devfs_fs_kernel.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#if DEBUG
+# define dprintk(x...) printk(x)
+#else
+# define dprintk(x...) do { } while(0)
+#endif
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 100;
+static int sysctl_speed_limit_max = 100000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+	{DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
+	 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
+	{DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
+	 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
+	{0}
+};
+
+static ctl_table raid_dir_table[] = {
+	{DEV_RAID, "raid", NULL, 0, 0555, raid_table},
+	{0}
+};
+
+static ctl_table raid_root_table[] = {
+	{CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
+	{0}
+};
+
+/*
+ * these have to be allocated separately because external
+ * subsystems want to have a pre-defined structure
+ */
+struct hd_struct md_hd_struct[MAX_MD_DEVS];
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_hardsect_sizes[MAX_MD_DEVS];
+static void md_recover_arrays(void);
+static mdk_thread_t *md_recovery_thread;
+
+int md_size[MAX_MD_DEVS];
+
+static struct block_device_operations md_fops;
+static devfs_handle_t devfs_handle;
+
+static struct gendisk md_gendisk=
+{
+	major: MD_MAJOR,
+	major_name: "md",
+	minor_shift: 0,
+	max_p: 1,
+	part: md_hd_struct,
+	sizes: md_size,
+	nr_real: MAX_MD_DEVS,
+	real_devices: NULL,
+	next: NULL,
+	fops: &md_fops,
+};
+
+/*
+ * Allows iteration over all existing md arrays.
+ * all_mddevs_lock protects this list as well as mddev_map.
+ */
+static MD_LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while owning
+ * a reference to the current mddev must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp)					\
+									\
+	for (spin_lock(&all_mddevs_lock),				\
+	     (tmp = all_mddevs.next),					\
+	     (mddev = NULL);						\
+	     (void)(tmp != &all_mddevs &&				\
+		    mddev_get(list_entry(tmp, mddev_t, all_mddevs))),	\
+	     spin_unlock(&all_mddevs_lock),				\
+	     (mddev ?
mddev_put(mddev):(void)NULL), \ + (mddev = list_entry(tmp, mddev_t, all_mddevs)), \ + (tmp != &all_mddevs); \ + spin_lock(&all_mddevs_lock), \ + (tmp = tmp->next) \ + ) + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio); + return 0; +} + +static inline mddev_t *mddev_get(mddev_t *mddev) +{ + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_put(mddev_t *mddev) +{ + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->sb && list_empty(&mddev->disks)) { + list_del(&mddev->all_mddevs); + mddev_map[mdidx(mddev)] = NULL; + kfree(mddev); + MOD_DEC_USE_COUNT; + } + spin_unlock(&all_mddevs_lock); +} + +static mddev_t * mddev_find(int unit) +{ + mddev_t *mddev, *new = NULL; + + retry: + spin_lock(&all_mddevs_lock); + if (mddev_map[unit]) { + mddev = mddev_get(mddev_map[unit]); + spin_unlock(&all_mddevs_lock); + if (new) + kfree(new); + return mddev; + } + if (new) { + mddev_map[unit] = new; + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + MOD_INC_USE_COUNT; + return new; + } + spin_unlock(&all_mddevs_lock); + + new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + memset(new, 0, sizeof(*new)); + + new->__minor = unit; + init_MUTEX(&new->reconfig_sem); + MD_INIT_LIST_HEAD(&new->disks); + MD_INIT_LIST_HEAD(&new->all_mddevs); + atomic_set(&new->active, 1); + + goto retry; +} + +static inline int mddev_lock(mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +static inline int mddev_trylock(mddev_t * mddev) +{ + return down_trylock(&mddev->reconfig_sem); +} + +static inline void mddev_unlock(mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +char * partition_name(kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = ""; + dev_name_t *dname; + struct md_list_head *tmp; + + list_for_each(tmp, &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = get_gendisk (dev); + dname->name = NULL; + if (hd) + dname->name = disk_name (hd, MINOR(dev), dname->namebuf); + if (!dname->name) { + sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); + dname->name = dname->namebuf; + } + + dname->dev = dev; + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int 
zoned_raid_size(mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + md_size[mdidx(mddev)] += rdev->size; + } + return 0; +} + +static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} + +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" + +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb(mddev_t * mddev) +{ + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page(mddev->sb); + return 0; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(OUT_OF_MEM); + return -EINVAL; + } + rdev->sb = (mdp_super_t *) page_address(rdev->sb_page); + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb = NULL; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + + +static void bh_complete(struct buffer_head *bh, int uptodate) +{ + + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + + complete((struct completion*)bh->b_private); +} + +static int sync_page_io(kdev_t dev, unsigned long sector, int size, + struct page *page, int rw) +{ + struct buffer_head bh; + struct completion event; + + init_completion(&event); + init_buffer(&bh, bh_complete, &event); + bh.b_rdev = dev; + bh.b_rsector = sector; + bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock); + bh.b_size = size; + bh.b_page = page; + bh.b_reqnext = NULL; + bh.b_data = page_address(page); + generic_make_request(rw, &bh); + + run_task_queue(&tq_disk); + wait_for_completion(&event); + + return test_bit(BH_Uptodate, &bh.b_state); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + kdev_t dev = rdev->dev; + unsigned long sb_offset; + + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) { + printk(NO_SB,partition_name(dev)); + return -EINVAL; + } + printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + ret = 0; +abort: + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb(mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = 
-EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk(BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(BAD_CSUM, partition_name(rdev->dev)); + goto abort; + } + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = get_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev)); +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. 
[simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev)
+{
+	int err = 0;
+	struct block_device *bdev;
+
+	bdev = bdget(rdev->dev);
+	if (!bdev)
+		return -ENOMEM;
+	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+	if (!err)
+		rdev->bdev = bdev;
+	return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+	struct block_device *bdev = rdev->bdev;
+	rdev->bdev = NULL;
+	if (!bdev)
+		MD_BUG();
+	blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(kdev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+	printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
+	if (rdev->mddev)
+		MD_BUG();
+	unlock_rdev(rdev);
+	free_disk_sb(rdev);
+	list_del_init(&rdev->all);
+	if (!list_empty(&rdev->pending)) {
+		printk(KERN_INFO "md: (%s was pending)\n",
+			partition_name(rdev->dev));
+		list_del_init(&rdev->pending);
+	}
+#ifndef MODULE
+	md_autodetect_dev(rdev->dev);
+#endif
+	rdev->dev = 0;
+	rdev->faulty = 0;
+	kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+	unbind_rdev_from_array(rdev);
+	export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+	mdp_super_t *sb = mddev->sb;
+
+	if (mddev->sb) {
+		mddev->sb = NULL;
+		free_page((unsigned long) sb);
+	}
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (!rdev->mddev) {
+			MD_BUG();
+			continue;
+		}
+		kick_rdev_from_array(rdev);
+	}
+	if (!list_empty(&mddev->disks))
+		MD_BUG();
+}
+
+static void free_mddev(mddev_t *mddev)
+{
+	if (!mddev) {
+		MD_BUG();
+		return;
+	}
+
+	export_array(mddev);
+	md_size[mdidx(mddev)] = 0;
+	md_hd_struct[mdidx(mddev)].nr_sects = 0;
+}
+
+#undef BAD_CSUM
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_SB
+
+static void print_desc(mdp_disk_t *desc)
+{
+	printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+		partition_name(MKDEV(desc->major,desc->minor)),
+		desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+	int i;
+
+	printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+		sb->major_version, sb->minor_version, sb->patch_version,
+		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+		sb->ctime);
+	printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+		sb->layout, sb->chunk_size);
+	printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
+		sb->utime, sb->state, sb->active_disks, sb->working_disks,
+		sb->failed_disks, sb->spare_disks,
+		sb->sb_csum, (unsigned long)sb->events_lo);
+
+	printk(KERN_INFO);
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc;
+
+		desc = sb->disks + i;
+		if (desc->number || desc->major || desc->minor ||
+		    desc->raid_disk || (desc->state && (desc->state != 4))) {
+			printk(" D %2d: ", i);
+			print_desc(desc);
+		}
+	}
+	printk(KERN_INFO "md: THIS: ");
+	print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+	printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
+		partition_name(rdev->dev), partition_name(rdev->old_dev),
+		rdev->size, rdev->faulty, rdev->desc_nr);
+	if (rdev->sb) {
+		printk(KERN_INFO "md: rdev superblock:\n");
+		print_sb(rdev->sb);
+	} else
+		printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+	struct md_list_head *tmp, *tmp2;
+	mdk_rdev_t *rdev;
+	mddev_t *mddev;
+
+	printk("\n");
+	printk("md: **********************************\n");
+	printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+	printk("md: **********************************\n");
+	ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
+		printk("md%d: ", mdidx(mddev));
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			printk("<%s>", partition_name(rdev->dev));
+
+		if (mddev->sb) {
+			printk(" array superblock:\n");
+			print_sb(mddev->sb);
+		} else
+			printk(" no array superblock.\n");
+
+		ITERATE_RDEV(mddev,rdev,tmp2)
+			print_rdev(rdev);
+		mddev_unlock(mddev);
+	}
+	printk("md: **********************************\n");
+	printk("\n");
+}
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+	int ret;
+	mdp_super_t *tmp1, *tmp2;
+
+	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+	if (!tmp1 || !tmp2) {
+		ret = 0;
+		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+		goto abort;
+	}
+
+	*tmp1 = *sb1;
+	*tmp2 = *sb2;
+
+	/*
+	 * nr_disks is not constant
+	 */
+	tmp1->nr_disks = 0;
+	tmp2->nr_disks = 0;
+
+	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+		ret = 0;
+	else
+		ret = 1;
+
+abort:
+	if (tmp1)
+		kfree(tmp1);
+	if (tmp2)
+		kfree(tmp2);
+
+	return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+	if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+	     (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+	     (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+	     (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+		return 1;
+
+	return 0;
+}
+
+static mdk_rdev_t * find_rdev_all(kdev_t dev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	list_for_each(tmp, &all_raid_disks) {
+		rdev = md_list_entry(tmp, mdk_rdev_t, all);
+		if (rdev->dev == dev)
+			return rdev;
+	}
+	return NULL;
+}
+#define GETBLK_FAILED KERN_ERR \
+"md: getblk failed for device %s\n"
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+	kdev_t dev;
+	unsigned long sb_offset, size;
+
+	if (!rdev->sb) {
+		MD_BUG();
+		return 1;
+	}
+	if (rdev->faulty) {
+		MD_BUG();
+		return 1;
+	}
+	if (rdev->sb->md_magic != MD_SB_MAGIC) {
+		MD_BUG();
+		return 1;
+	}
+
+	dev = rdev->dev;
+	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
+	if (rdev->sb_offset != sb_offset) {
+		printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
+		       partition_name(dev), rdev->sb_offset, sb_offset);
+		goto skip;
+	}
+	/*
+	 * If the disk went offline meanwhile and it's just a spare, then
+	 * its size has changed to zero silently, and the MD code does
+	 * not yet know that it's faulty.
+	 */
+	size = calc_dev_size(dev, rdev->mddev, 1);
+	if (size != rdev->size) {
+		printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
+		       partition_name(dev), rdev->size, size);
+		goto skip;
+	}
+
+	printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
+
+	if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
+		printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
+		return 1;
+	}
+skip:
+	return 0;
+}
+#undef GETBLK_FAILED
+
+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	int i, ok = 0;
+	mdp_disk_t *desc;
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		desc = mddev->sb->disks + i;
+#if 0
+		if (disk_faulty(desc)) {
+			if (MKDEV(desc->major,desc->minor) == rdev->dev)
+				ok = 1;
+			continue;
+		}
+#endif
+		if (MKDEV(desc->major,desc->minor) == rdev->dev) {
+			rdev->sb->this_disk = *desc;
+			rdev->desc_nr = desc->number;
+			ok = 1;
+			break;
+		}
+	}
+
+	if (!ok) {
+		MD_BUG();
+	}
+}
+
+static int sync_sbs(mddev_t * mddev)
+{
+	mdk_rdev_t *rdev;
+	mdp_super_t *sb;
+	struct md_list_head *tmp;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty || rdev->alias_device)
+			continue;
+		sb = rdev->sb;
+		*sb = *mddev->sb;
+		set_this_disk(mddev, rdev);
+		sb->sb_csum = calc_sb_csum(sb);
+	}
+	return 0;
+}
+
+void __md_update_sb(mddev_t * mddev)
+{
+	int err, count = 100;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	if (!mddev->sb_dirty) {
+		printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
+		return;
+	}
+	mddev->sb_dirty = 0;
+repeat:
+	mddev->sb->utime = CURRENT_TIME;
+	if ((++mddev->sb->events_lo)==0)
+		++mddev->sb->events_hi;
+
+	if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
+		/*
+		 * oops, this 64-bit counter should never wrap.
+		 * Either we are in around ~1 trillion A.C., assuming
+		 * 1 reboot per second, or we have a bug:
+		 */
+		MD_BUG();
+		mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
+	}
+	sync_sbs(mddev);
+
+	/*
+	 * do not write anything to disk if using
+	 * nonpersistent superblocks
+	 */
+	if (mddev->sb->not_persistent)
+		return;
+
+	printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
+	       mdidx(mddev));
+
+	err = 0;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		printk(KERN_INFO "md: ");
+		if (rdev->faulty)
+			printk("(skipping faulty ");
+		if (rdev->alias_device)
+			printk("(skipping alias ");
+		if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
+			printk("(skipping new-faulty %s )\n",
+			       partition_name(rdev->dev));
+			continue;
+		}
+		printk("%s ", partition_name(rdev->dev));
+		if (!rdev->faulty && !rdev->alias_device) {
+			printk("[events: %08lx]",
+			       (unsigned long)rdev->sb->events_lo);
+			err += write_disk_sb(rdev);
+		} else
+			printk(")\n");
+	}
+	if (err) {
+		if (--count) {
+			printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
+			goto repeat;
+		}
+		printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
+	}
+}
+
+void md_update_sb(mddev_t *mddev)
+{
+	if (mddev_lock(mddev))
+		return;
+	if (mddev->sb_dirty)
+		__md_update_sb(mddev);
+	mddev_unlock(mddev);
+}
+
+
+/*
+ * Import a device.
If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + */ +static int md_import_device(kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (is_mounted(newdev)) { + printk(KERN_WARNING "md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk(KERN_WARNING "md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + if (rdev->sb->level != -4) { + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } else { + rdev->old_dev = MKDEV(0, 0); + rdev->desc_nr = -1; + } + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + INIT_LIST_HEAD(&rdev->same_set); + + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs(mddev_t * mddev) +{ + int out_of_date = 0, i, first; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk(INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. 
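+ *
+ * Worked example (illustrative values): if two members carry
+ * events == 0x24 and one carries events == 0x25, the 0x25 disk is
+ * "freshest"; its superblock is copied to mddev->sb and, further down,
+ * the 0x24 disks survive the non-fresh test (ev1 + 1 < ev2 is false
+ * there), so disks exactly one event behind are still accepted.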
+ */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. (decrease it's age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + if (rdev->sb->events_lo || rdev->sb->events_hi) + if ((rdev->sb->events_lo--)==0) + rdev->sb->events_hi--; + } + + printk(KERN_INFO "md: %s's event counter: %08lx\n", + partition_name(rdev->dev), + (unsigned long)rdev->sb->events_lo); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = md_event(rdev->sb); + ev2 = md_event(freshest->sb); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices + */ + __u64 ev1, ev2; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ++ev1; + if (ev1 < ev2) { + printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + if (mddev->sb->level != -4) + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty || rdev->alias_device) { + MD_BUG(); + goto abort; + } + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n", + partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + * + * Note: multipath devices are a special case. Since we + * were able to read the superblock on the path, we don't + * care if it was previously marked as faulty, it's up now + * so enable it. 
+ */ + if (disk_faulty(desc) && mddev->sb->level != -4) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk(KERN_WARNING "md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk(KERN_WARNING "md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } else if (disk_faulty(desc)) { + /* + * multipath entry marked as faulty, unfaulty it + */ + rdev = find_rdev(mddev, dev); + if(rdev) + mark_disk_spare(desc); + else + remove_descriptor(desc, sb); + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Multi-path IO special-case: since we have no + * this_disk descriptor at auto-detect time, + * we cannot check rdev->number. + * We can check the device though. + */ + if ((sb->level == -4) && (rdev->dev == + MKDEV(desc->major,desc->minor))) { + found = 1; + break; + } + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n", + mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); + } + + /* + * Double check wether all devices mentioned in the + * superblock are in the rdev ring. + */ + first = 1; + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); + goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + /* + * In the case of Multipath-IO, we have no + * other information source to find out which + * disk is which, only the position of the device + * in the superblock: + */ + if (mddev->sb->level == -4) { + if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) { + MD_BUG(); + goto abort; + } + rdev->desc_nr = i; + if (!first) + rdev->alias_device = 1; + else + first = 0; + } + } + + /* + * Kick all rdevs that are not in the + * descriptor array: + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) + kick_rdev_from_array(rdev); + } + + /* + * Do a final reality check. + */ + if (mddev->sb->level != -4) { + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique? 
+ */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk(OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk(NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size: %ldk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -4: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { + readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (sb->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. 
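+ * md_size for this array is cleared here and recomputed from
+ * the per-device sizes by device_size_calculation() once the
+ * superblocks have been analyzed and validated.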
+ */
+ md_size[mdidx(mddev)] = 0;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->sb->chunk_size;
+ pnum = level_to_pers(mddev->sb->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(BAD_CHUNKSIZE);
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+ * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+ } else
+ if (chunk_size)
+ printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
+ mddev->sb->level);
+
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (!pers[pnum])
+ {
+#ifdef CONFIG_KMOD
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ if (!pers[pnum])
+#endif
+ {
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+ }
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ md_hardsect_sizes[mdidx(mddev)] = 512;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ invalidate_device(rdev->dev, 1);
+ if (get_hardsect_size(rdev->dev)
+ > md_hardsect_sizes[mdidx(mddev)])
+ md_hardsect_sizes[mdidx(mddev)] =
+ get_hardsect_size(rdev->dev);
+ }
+ md_blocksizes[mdidx(mddev)] = 1024;
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+ mddev->pers = pers[pnum];
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+
+ mddev->in_sync = (mddev->sb->state & (1<<MD_SB_CLEAN));
+ if (mddev->pers->sync_request)
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+ mddev->sb_dirty = 1;
+ __md_update_sb(mddev);
+
+ md_recover_arrays();
+ /*
+ * md_size has units of 1K blocks, which are
+ * twice as large as sectors.
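+ * Hence the "md_size[mdidx(mddev)]<<1" below when handing the
+ * capacity, in sectors, to register_disk().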
+ */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)), + 1, &md_fops, md_size[mdidx(mddev)]<<1); + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +static int restart_array(mddev_t *mddev) +{ + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk(KERN_INFO + "md: md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" +#define STILL_IN_USE \ +"md: md%d still in use.\n" + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (atomic_read(&mddev->active)>1) { + printk(STILL_IN_USE, mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + if (mddev->sync_thread) { + if (mddev->recovery_running > 0) + mddev->recovery_running = -EINTR; + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (mddev->spare) { + mddev->pers->diskop(mddev, &mddev->spare, + DISKOP_SPARE_INACTIVE); + mddev->spare = NULL; + } + } + + invalidate_device(dev, 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_device_ro(dev, 1); + goto out; + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (mddev->in_sync) { + printk(KERN_INFO "md: marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + mddev->sb_dirty = 1; + __md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + free_mddev(mddev); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev)); + err = 0; +out: + return err; +} + +/* + * We have to safely support old arrays too. + */ +int detect_old_array(mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. 
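+ *
+ * For example: three newly arrived partitions carrying the same
+ * set UUID are moved onto the 'candidates' list, bound to the
+ * md minor recorded in their superblocks (rdev0->sb->md_minor),
+ * and handed to autorun_array().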
+ * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(void) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk(KERN_WARNING + "md: %s has same UUID as %s, but superblocks differ ...\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + + mddev = mddev_find(rdev0->sb->md_minor); + if (!mddev) { + printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); + break; + } + if (mddev_lock(mddev)) + printk(KERN_WARNING "md: md%d locked, cannot run\n", + mdidx(mddev)); + else if (mddev->sb || !list_empty(&mddev->disks)) { + printk(KERN_WARNING "md: md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + mddev_unlock(mddev); + } else { + printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + list_del_init(&rdev->pending); + } + autorun_array(mddev); + mddev_unlock(mddev); + } + /* on success, candidates will be empty, on error + * it wont... + */ + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + mddev_put(mddev); + } + printk(KERN_INFO "md: ... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. 
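+ * This is the START_ARRAY ioctl path: autostart_array() below
+ * imports the named partition, then every other device listed in
+ * its superblock, and finally calls autorun_devices().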
+ */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array(kdev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk(KERN_WARNING "md: array version is too old to be autostarted ," + "use raidtools 0.90 mkraid --upgrade to upgrade the array " + "without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n", + partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + + +static int get_version(void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info(mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info(mddev_t * mddev, void * arg) +{ + 
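/*
+ * Read a single disk descriptor out of the superblock. A minimal
+ * user-space sketch of the call - the fd and device path are
+ * illustrative only, not part of this driver:
+ *
+ *	mdu_disk_info_t info = { .number = 0 };
+ *	int fd = open("/dev/md0", O_RDONLY);
+ *	if (fd >= 0 && ioctl(fd, GET_DISK_INFO, &info) == 0)
+ *		printf("disk 0 is %d:%d\n", info.major, info.minor);
+ */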
+ mdu_disk_info_t info;
+ unsigned int nr;
+
+ if (!mddev->sb)
+ return -EINVAL;
+
+ if (md_copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+ if (nr >= MD_SB_DISKS)
+ return -EINVAL;
+
+ SET_FROM_SB(major);
+ SET_FROM_SB(minor);
+ SET_FROM_SB(raid_disk);
+ SET_FROM_SB(state);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ int err, size, persistent;
+ mdk_rdev_t *rdev;
+ unsigned int nr;
+ kdev_t dev;
+ dev = MKDEV(info->major,info->minor);
+
+ if (find_rdev_all(dev)) {
+ printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+ partition_name(dev));
+ return -EBUSY;
+ }
+ if (!mddev->sb) {
+ /* expecting a device which has a superblock */
+ err = md_import_device(dev, 1);
+ if (err) {
+ printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ if (!uuid_equal(rdev0, rdev)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ bind_rdev_to_array(rdev, mddev);
+ return 0;
+ }
+
+ nr = info->number;
+ if (nr >= mddev->sb->nr_disks) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+
+ SET_SB(number);
+ SET_SB(major);
+ SET_SB(minor);
+ SET_SB(raid_disk);
+ SET_SB(state);
+
+ if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+ err = md_import_device(dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ rdev->old_dev = dev;
+ rdev->desc_nr = info->number;
+
+ bind_rdev_to_array(rdev, mddev);
+
+ persistent = !mddev->sb->not_persistent;
+ if (!persistent)
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+ size = calc_dev_size(dev, mddev, persistent);
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ if (!mddev->sb->size || (mddev->sb->size > size))
+ mddev->sb->size = size;
+ }
+
+ /*
+ * sync all other superblocks with the main superblock
+ */
+ sync_sbs(mddev);
+
+ return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (!disk_active(disk))
+ return -ENODEV;
+
+ q = blk_get_queue(rdev->dev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ...
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + + if (disk_removed(disk)) + return -EINVAL; + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + __md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + err = -ENOSPC; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { + /* + * reuse slot + */ + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + } else { + disk->number = i; + } + + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + __md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
+ */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +#define SET_SB(x) mddev->sb->x = info->x +static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +{ + + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; +} +#undef SET_SB + +static int set_disk_faulty(mddev_t *mddev, kdev_t dev) +{ + int ret; + + ret = md_error(mddev, dev); + return ret; +} + +static int md_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!md_capable_admin()) + return -EACCES; + + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) { + MD_BUG(); + return -EINVAL; + } + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; + +#ifndef MODULE + case RAID_AUTORUN: + err = 0; + autostart_arrays(); + goto done; +#endif + + case BLKGETSIZE: + case BLKGETSIZE64: + case BLKRAGET: + case BLKRASET: + case BLKFLSBUF: + case BLKBSZGET: + case BLKBSZSET: + err = blk_ioctl (dev, cmd, arg); + goto abort; + + default:; + } + + /* + * Commands creating/starting a new array: + */ + + mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) { + BUG(); + goto abort; + } + + + if (cmd == START_ARRAY) { + /* START_ARRAY doesn't need to lock the array as autostart_array + * does the locking, and it could even be a different array + */ + err = autostart_array(val_to_kdev(arg)); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name(val_to_kdev(arg))); + goto abort; + } + goto done; + } + + err = mddev_lock(mddev); + if (err) { + printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n", + err, cmd); + goto abort; + } + + switch (cmd) + { + case SET_ARRAY_INFO: + + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING "md: array md%d already has disks!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->sb) { + printk(KERN_WARNING "md: array md%d already has a superblock!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (arg) { + mdu_array_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldnt set array info. 
%d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + +<<<<<<< + err = autostart_array((kdev_t)arg); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name((kdev_t)arg)); +||||||| + err = autostart_array(val_to_kdev(arg)); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name(val_to_kdev(arg))); +======= +>>>>>>> + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, (kdev_t)arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... + */ + if (err) { + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } + goto done_unlock; + } + + default: + printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, " + "upgrade your software to use new ictls.\n", + current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + mddev_unlock(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ + /* + * Succeed if we can find or allocate a mddev structure. 
+ */ + mddev_t *mddev = mddev_find(minor(inode->i_rdev)); + int err = -ENOMEM; + + if (!mddev) + goto out; + + if ((err = mddev_lock(mddev))) + goto put; + + err = 0; + mddev_unlock(mddev); + inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev); + put: + mddev_put(mddev); + out: + return err; +} + +static int md_release(struct inode *inode, struct file * file) +{ + mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) + BUG(); + mddev_put(mddev); + + return 0; +} + +static struct block_device_operations md_fops= +{ + owner: THIS_MODULE, + open: md_open, + release: md_release, + ioctl: md_ioctl, +}; + + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + md_lock_kernel(); + + /* + * Detach thread + */ + + daemonize(); + reparent_to_init(); + + sprintf(current->comm, thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + current->policy = SCHED_OTHER; + current->nice = -20; + md_unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run)(void *data); + + wait_event_interruptible(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags)); + + clear_bit(THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->data); + run_task_queue(&tq_disk); + } + if (md_signal_pending(current)) + md_flush_signals(); + } + complete(thread->event); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + dprintk("md: waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); +} + +mdk_thread_t *md_register_thread(void (*run) (void *), + void *data, const char *name) +{ + mdk_thread_t *thread; + int ret; + struct completion event; + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + md_init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->data = data; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +void md_interrupt_thread(mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + md_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +static void md_recover_arrays(void) +{ + if (!md_recovery_thread) { + MD_BUG(); + return; + } + md_wakeup_thread(md_recovery_thread); +} + + +int md_error(mddev_t *mddev, kdev_t rdev) +{ + mdk_rdev_t * rrdev; + + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev), + __builtin_return_address(0),__builtin_return_address(1), + 
__builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return 0;
+ }
+ rrdev = find_rdev(mddev, rdev);
+ if (!rrdev || rrdev->faulty)
+ return 0;
+ if (!mddev->pers->error_handler
+ || mddev->pers->error_handler(mddev,rdev) <= 0) {
+ rrdev->faulty = 1;
+ } else
+ return 1;
+ /*
+ * if recovery was running, stop it now.
+ */
+ if (mddev->recovery_running)
+ mddev->recovery_running = -EIO;
+ md_recover_arrays();
+
+ return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_ALL(rdev,tmp) {
+ if (list_empty(&rdev->same_set)) {
+ /*
+ * The device is not yet used by any array.
+ */
+ i++;
+ seq_printf(seq, "%s ",
+ partition_name(rdev->dev));
+ }
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->sb->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks)
+ MD_BUG();
+
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+ (mddev->spare ? "recovery" : "resync"),
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+
+}
+
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ return mddev;
+ }
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = list_entry(tmp,mddev_t,all_mddevs);
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ int j, size;
+ struct md_list_head *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev = v;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ for (j = 0; j < MAX_PERSONALITY; j++)
+ if (pers[j])
+ seq_printf(seq, "[%s] ", pers[j]->name);
+
+ seq_printf(seq, "\n");
+ seq_printf(seq, "read_ahead ");
+ if (read_ahead[MD_MAJOR] == INT_MAX)
+ seq_printf(seq, "not set\n");
+ else
+ seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
+ return 0;
+ }
+ if (v == (void*)2) {
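+ /* tail sentinel from md_seq_start/md_seq_next: finish with
+ * the "unused devices:" summary */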
status_unused(seq); + return 0; + } + + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? "" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %d blocks", + md_size[mdidx(mddev)]); + else + seq_printf(seq, "\n %d blocks", size); + } + + if (mddev->pers) { + + mddev->pers->status (seq, mddev); + + seq_printf(seq, "\n "); + if (mddev->curr_resync > 1) + status_resync (seq, mddev); + else if (mddev->curr_resync == 1) + seq_printf(seq, " resync=DELAYED"); + + } + seq_printf(seq, "\n"); + return 0; +} + + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (pers[pnum]) { + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; +} + +mdp_disk_t *get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} + +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; +void md_sync_acct(kdev_t dev, unsigned long nr_sectors) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + sync_io[major][index] += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + int major = MAJOR(rdev->dev); + int idx = disk_index(rdev->dev); + + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + continue; + + curr_events = kstat.dk_drive_rblk[major][idx] + + kstat.dk_drive_wblk[major][idx] ; + curr_events -= sync_io[major][idx]; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + mddev->recovery_running = -EIO; + md_recover_arrays(); + // stop recovery, signal do_sync .... 
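+ // (md_do_sync then sees the error, or is interrupted below,
+ // and bails out through its 'out' path)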
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ }
+}
+
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+static void md_do_sync(void *data)
+{
+ mddev_t *mddev = data;
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed,
+ j, window, err;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct md_list_head *tmp;
+ unsigned long last_check;
+
+ /* just in case the thread restarts... */
+ if (mddev->recovery_running <= 0)
+ return;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+ * commence
+ * other == active in resync - this many blocks
+ */
+ do {
+ mddev->curr_resync = 2;
+
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync &&
+ match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d until md%d "
+ "has finished resync (they share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ if (mddev < mddev2) /* arbitrarily yield */
+ mddev->curr_resync = 1;
+ if (wait_event_interruptible(resync_wait,
+ mddev2->curr_resync < 2)) {
+ md_flush_signals();
+ err = -EINTR;
+ mddev_put(mddev2);
+ goto out;
+ }
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ max_sectors = mddev->sb->size<<1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+ sysctl_speed_limit_min);
+ printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ /*
+ * Resync has low priority.
+ */
+ current->nice = 19;
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = 0;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = vm_max_readahead*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+ for (j = 0; j < max_sectors;) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j);
+
+ if (sectors < 0) {
+ err = sectors;
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ if (j>1) mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ run_task_queue(&tq_disk);
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (md_signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+
+ /*
+ * this loop exits only when either we are slower than
+ * the 'hard' speed limit, or the system was IO-idle for
+ * a jiffy.
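+ * (With the usual defaults - speed_limit_min=100 and
+ * speed_limit_max=100000 KB/sec, settable via /proc/sys/dev/raid/ -
+ * that means: throttle to ~100 KB/sec per disc while other IO is
+ * seen, and burst up to ~100 MB/sec while the members are idle.)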
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ if (md_need_resched(current))
+ schedule();
+
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ current->nice = 19;
+
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ md_schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ } else
+ current->nice = -20;
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ err = 0;
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+out:
+ wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, 1);
+
+ mddev->curr_resync = 0;
+ if (err)
+ mddev->recovery_running = err;
+ if (mddev->recovery_running > 0)
+ mddev->recovery_running = 0;
+ if (mddev->recovery_running == 0)
+ mddev->in_sync = 1;
+ md_recover_arrays();
+}
+
+
+/*
+ * This is the kernel thread that watches all md arrays for re-sync action
+ * that might be needed.
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set "->recovery_running" and
+ * create a thread at ->sync_thread.
+ * When the thread finishes it clears recovery_running (or sets it to an error)
+ * and wakes up this thread, which will reap the thread and finish up.
+ */
+void md_do_recovery(void *data)
+{
+ mddev_t *mddev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
+ dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+ ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
+ sb = mddev->sb;
+ if (!sb || !mddev->pers || !mddev->pers->diskop || mddev->ro)
+ goto unlock;
+ if (mddev->recovery_running > 0)
+ /* resync/recovery still happening */
+ goto unlock;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (mddev->sync_thread) {
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (mddev->recovery_running < 0) {
+ /* some sort of failure.
+ * If we were doing a reconstruction,
+ * we need to retrieve the spare
+ */
+ if (mddev->spare) {
+ mddev->pers->diskop(mddev, &mddev->spare,
+ DISKOP_SPARE_INACTIVE);
+ mddev->spare = NULL;
+ }
+ } else {
+ /* success...*/
+ if (mddev->spare) {
+ mddev->pers->diskop(mddev, &mddev->spare,
+ DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(mddev->spare);
+ mark_disk_active(mddev->spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ mddev->spare = NULL;
+ }
+ }
+ __md_update_sb(mddev);
+ mddev->recovery_running = 0;
+ wake_up(&resync_wait);
+ goto unlock;
+ }
+ if (mddev->recovery_running) {
+ /* that's odd.. */
+ mddev->recovery_running = 0;
+ wake_up(&resync_wait);
+ }
+
+ if (sb->active_disks < sb->raid_disks) {
+ mddev->spare = get_spare(mddev);
+ if (!mddev->spare)
+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+ "-- continuing in degraded mode\n", mdidx(mddev));
+ else
+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+ mdidx(mddev), partition_name(MKDEV(mddev->spare->major,mddev->spare->minor)));
+ }
+ if (!mddev->spare && mddev->in_sync) {
+ /* nothing we can do ...
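+ * (no spare to reconstruct onto and no resync pending:
+ * stay degraded until a disk is hot-added)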
*/ + goto unlock; + } + if (mddev->pers->sync_request) { + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "md_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev)); + if (mddev->spare) + mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_INACTIVE); + mddev->spare = NULL; + mddev->recovery_running = 0; + } else { + if (mddev->spare) + mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_WRITE); + mddev->recovery_running = 1; + md_wakeup_thread(mddev->sync_thread); + } + } + unlock: + mddev_unlock(mddev); + } + dprintk(KERN_INFO "md: recovery thread finished ...\n"); + +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + if (mddev_trylock(mddev)==0) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + notifier_call: md_notify_reboot, + next: NULL, + priority: INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_size[i] = 0; + md_hardsect_sizes[i] = 512; + } + blksize_size[MAJOR_NR] = md_blocksizes; + blk_size[MAJOR_NR] = md_size; + max_readahead[MAJOR_NR] = md_maxreadahead; + hardsect_size[MAJOR_NR] = md_hardsect_sizes; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +request_queue_t * md_queue_proc(kdev_t dev) +{ + mddev_t *mddev = mddev_find(minor(dev)); + request_queue_t *q = BLK_DEFAULT_QUEUE(MAJOR_NR); + if (!mddev || atomic_read(&mddev->active)<2) + BUG(); + if (mddev->pers) + q = &mddev->queue; + mddev_put(mddev); /* the caller must hold a reference... */ + return q; +} + +int md__init md_init(void) +{ + static char * name = "mdrecoveryd"; + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) + { + printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR); + return (-1); + } + devfs_handle = devfs_mk_dir (NULL, "md", NULL); + /* we don't use devfs_register_series because we want to fill md_hd_struct */ + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char devname[128]; + sprintf (devname, "%u", minor); + md_hd_struct[minor].de = devfs_register (devfs_handle, + devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + /* all requests on an uninitialised device get failed... 
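+ * md_fail_request() simply completes every bio with an error;
+ * do_md_run() later installs the personality's make_request
+ * function via blk_queue_make_request().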
+ */
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+ blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+ read_ahead[MAJOR_NR] = INT_MAX;
+
+ add_gendisk(&md_gendisk);
+
+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+ if (!md_recovery_thread)
+ printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+ md_register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not a module), arrays can be assembled at boot time, either with
+ * AUTODETECT where specially marked partitions are registered with
+ * md_autodetect_dev(), or with MD_BOOT where devices to be collected
+ * are given on the boot line with md=.....
+ * The code for that is here.
+ */
+
+struct {
+ int set;
+ int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ kdev_t dev = detected_devices[i];
+
+ if (md_import_device(dev,1)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ /*
+ * Sanity checks:
+ */
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices();
+}
+
+static struct {
+ char device_set [MAX_MD_DEVS];
+ int pers[MAX_MD_DEVS];
+ int chunk[MAX_MD_DEVS];
+ char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ * elements in device-list are read by name_to_kdev_t so can be
+ * a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege
+ * Shifted name_to_kdev_t() and related operations to md_set_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
+static int md__init md_setup(char *str)
+{
+ int minor, level, factor, fault;
+ char *pername = "";
+ char *str1 = str;
+
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ if (minor >= MAX_MD_DEVS) {
+ printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+ return 0;
+ } else if (md_setup_args.device_names[minor]) {
+ printk(KERN_WARNING "md: md=%d, Specified more than once.
" + "Replacing previous definition.\n", minor); + } + switch (get_option(&str, &level)) { /* RAID Personality */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == -1) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args.pers[minor] = level; + md_setup_args.chunk[minor] = 1 << (factor+12); + switch(level) { + case -1: + level = LINEAR; + pername = "linear"; + break; + case 0: + level = RAID0; + pername = "raid0"; + break; + default: + printk(KERN_WARNING + "md: The kernel has not been configured for raid%d support!\n", + level); + return 0; + } + md_setup_args.pers[minor] = level; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args.pers[minor] = 0; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args.device_names[minor] = str; + + return 1; +} + +extern kdev_t name_to_kdev_t(char *line) md__init; +void md__init md_setup_drive(void) +{ + int minor, i; + kdev_t dev; + mddev_t*mddev; + kdev_t devices[MD_SB_DISKS+1]; + + for (minor = 0; minor < MAX_MD_DEVS; minor++) { + int err = 0; + char *devname; + mdu_disk_info_t dinfo; + + if ((devname = md_setup_args.device_names[minor]) == 0) continue; + + for (i = 0; i < MD_SB_DISKS && devname != 0; i++) { + + char *p; + void *handle; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_kdev_t(devname); + handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev), + DEVFS_SPECIAL_BLK, 1); + if (handle != 0) { + unsigned major, minor; + devfs_get_maj_min(handle, &major, &minor); + dev = MKDEV(major, minor); + } + if (dev == 0) { + printk(KERN_WARNING "md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + md_setup_args.device_set[minor] = 1; + + devname = p; + } + devices[i] = 0; + + if (md_setup_args.device_set[minor] == 0) + continue; + + printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]); + + mddev = mddev_find(minor); + if (!mddev) { + printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor); + continue; + } + if (mddev_lock(mddev)) { + printk(KERN_WARNING + "md: Ignoring md=%d, cannot lock!\n", + minor); + mddev_put(mddev); + continue; + } + + if (mddev->sb || !list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: Ignoring md=%d, already autodetected. 
(Use raid=noautodetect)\n",
+ minor);
+ mddev_unlock(mddev);
+ mddev_put(mddev);
+ continue;
+ }
+ if (md_setup_args.pers[minor]) {
+ /* non-persistent */
+ mdu_array_info_t ainfo;
+ ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+ ainfo.size = 0;
+ ainfo.nr_disks =0;
+ ainfo.raid_disks =0;
+ ainfo.md_minor =minor;
+ ainfo.not_persistent = 1;
+
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.active_disks = 0;
+ ainfo.working_disks = 0;
+ ainfo.failed_disks = 0;
+ ainfo.spare_disks = 0;
+ ainfo.layout = 0;
+ ainfo.chunk_size = md_setup_args.chunk[minor];
+ err = set_array_info(mddev, &ainfo);
+ for (i = 0; !err && (dev = devices[i]); i++) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ mddev->sb->nr_disks++;
+ mddev->sb->raid_disks++;
+ mddev->sb->active_disks++;
+ mddev->sb->working_disks++;
+ err = add_new_disk (mddev, &dinfo);
+ }
+ } else {
+ /* persistent */
+ for (i = 0; (dev = devices[i]); i++) {
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ add_new_disk (mddev, &dinfo);
+ }
+ }
+ if (!err)
+ err = do_md_run(mddev);
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop(mddev, 0);
+ printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ }
+ mddev_unlock(mddev);
+ mddev_put(mddev);
+ }
+}
+
+static int md__init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (strncmp(str, "noautodetect", wlen) == 0)
+ raid_setup_args.noautodetect = 1;
+ pos += wlen+1;
+ }
+ raid_setup_args.set = 1;
+ return 1;
+}
+
+int md__init md_run_setup(void)
+{
+ if (raid_setup_args.noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
+ else
+ autostart_arrays();
+ md_setup_drive();
+ return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+ return md_init();
+}
+
+static void free_device_names(void)
+{
+ while (!list_empty(&device_names)) {
+ struct dname *tmp = list_entry(device_names.next,
+ dev_name_t, list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+}
+
+
+void cleanup_module(void)
+{
+ md_unregister_thread(md_recovery_thread);
+ devfs_unregister(devfs_handle);
+
+ devfs_unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+
+ del_gendisk(&md_gendisk);
+
+ blk_dev[MAJOR_NR].queue = NULL;
+ blksize_size[MAJOR_NR] = NULL;
+ blk_size[MAJOR_NR] = NULL;
+ max_readahead[MAJOR_NR] = NULL;
+ hardsect_size[MAJOR_NR] = NULL;
+
+ free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-autostart/orig b/tests/linux/md-autostart/orig
new file mode 100644
index 0000000..12a3519
--- /dev/null
+++ b/tests/linux/md-autostart/orig
@@ -0,0 +1,4025 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely
rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher + - Devfs support by Richard Gooch + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown . + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/raid/md.h> +#include <linux/sysctl.h> +#include <linux/raid/xor.h> +#include <linux/devfs_fs_kernel.h> + +#include <linux/init.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> + +#include <asm/unaligned.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER + +#include <linux/blk.h> + +#define DEBUG 0 +#if DEBUG +# define dprintk(x...) printk(x) +#else +# define dprintk(x...) do { } while(0) +#endif + +#ifndef MODULE +static void autostart_arrays (void); +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY]; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwidth if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min", + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max", + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table raid_dir_table[] = { + {DEV_RAID, "raid", NULL, 0, 0555, raid_table}, + {0} +}; + +static ctl_table raid_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table}, + {0} +}; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_hardsect_sizes[MAX_MD_DEVS]; +static void md_recover_arrays(void); +static mdk_thread_t *md_recovery_thread; + +int md_size[MAX_MD_DEVS]; + +static struct block_device_operations md_fops; +static devfs_handle_t devfs_handle; + +static struct gendisk md_gendisk= +{ + major: MD_MAJOR, + major_name: "md", + minor_shift: 0, + max_p: 1, + part: md_hd_struct, + sizes: md_size, + nr_real: MAX_MD_DEVS, + real_devices: NULL, + next: NULL, + fops: &md_fops, +}; + +/* + * Enables iteration over all existing md arrays. + * all_mddevs_lock protects this list as well as mddev_map. + */ +static MD_LIST_HEAD(all_mddevs); +static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; + + +/* + * iterates through all used mddevs in the system. 
+ * We take care to grab the all_mddevs_lock whenever navigating + * the list, and to always hold a refcount when unlocked. + * Any code which breaks out of this loop while owning + * a reference to the current mddev must mddev_put it. + */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (spin_lock(&all_mddevs_lock), \ + (tmp = all_mddevs.next), \ + (mddev = NULL); \ + (void)(tmp != &all_mddevs && \ + mddev_get(list_entry(tmp, mddev_t, all_mddevs))),\ + spin_unlock(&all_mddevs_lock), \ + (mddev ? mddev_put(mddev):(void)NULL), \ + (mddev = list_entry(tmp, mddev_t, all_mddevs)), \ + (tmp != &all_mddevs); \ + spin_lock(&all_mddevs_lock), \ + (tmp = tmp->next) \ + ) + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio); + return 0; +} + +static inline mddev_t *mddev_get(mddev_t *mddev) +{ + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_put(mddev_t *mddev) +{ + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->sb && list_empty(&mddev->disks)) { + list_del(&mddev->all_mddevs); + mddev_map[mdidx(mddev)] = NULL; + kfree(mddev); + MOD_DEC_USE_COUNT; + } + spin_unlock(&all_mddevs_lock); +} + +static mddev_t * mddev_find(int unit) +{ + mddev_t *mddev, *new = NULL; + + retry: + spin_lock(&all_mddevs_lock); + if (mddev_map[unit]) { + mddev = mddev_get(mddev_map[unit]); + spin_unlock(&all_mddevs_lock); + if (new) + kfree(new); + return mddev; + } + if (new) { + mddev_map[unit] = new; + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + MOD_INC_USE_COUNT; + return new; + } + spin_unlock(&all_mddevs_lock); + + new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + memset(new, 0, sizeof(*new)); + + new->__minor = unit; + init_MUTEX(&new->reconfig_sem); + MD_INIT_LIST_HEAD(&new->disks); + MD_INIT_LIST_HEAD(&new->all_mddevs); + atomic_set(&new->active, 1); + + goto retry; +} + +static inline int mddev_lock(mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +static inline int mddev_trylock(mddev_t * mddev) +{ + return down_trylock(&mddev->reconfig_sem); +} + +static inline void mddev_unlock(mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +char * partition_name(kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = "<nomem>"; + dev_name_t *dname; + struct md_list_head *tmp; + + list_for_each(tmp, &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = get_gendisk (dev); + dname->name = NULL; + if (hd) + dname->name = disk_name (hd, MINOR(dev), dname->namebuf); + if (!dname->name) { + sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); + dname->name = dname->namebuf; + } + + dname->dev = dev; + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev, + int 
persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int zoned_raid_size(mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + md_size[mdidx(mddev)] += rdev->size; + } + return 0; +} + +static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} + +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" + +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb(mddev_t * mddev) +{ + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page(mddev->sb); + return 0; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(OUT_OF_MEM); + return -EINVAL; + } + rdev->sb = (mdp_super_t *) page_address(rdev->sb_page); + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb = NULL; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + + +static void bh_complete(struct buffer_head *bh, int uptodate) +{ + + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + + complete((struct completion*)bh->b_private); +} + +static int sync_page_io(kdev_t dev, unsigned long sector, int size, + struct page *page, int rw) +{ + struct buffer_head bh; + struct completion event; + + init_completion(&event); + init_buffer(&bh, bh_complete, &event); + bh.b_rdev = dev; + bh.b_rsector = sector; + bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock); + bh.b_size = size; + bh.b_page = page; + bh.b_reqnext = NULL; + bh.b_data = page_address(page); + generic_make_request(rw, &bh); + + run_task_queue(&tq_disk); + wait_for_completion(&event); + + return test_bit(BH_Uptodate, &bh.b_state); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + kdev_t dev = rdev->dev; + unsigned long sb_offset; + + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) { + printk(NO_SB,partition_name(dev)); + return 
-EINVAL; + } + printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + ret = 0; +abort: + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb(mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk(BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(BAD_CSUM, partition_name(rdev->dev)); + goto abort; + } + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = get_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev)); +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. 
[simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev(mdk_rdev_t *rdev) +{ + int err = 0; + struct block_device *bdev; + + bdev = bdget(rdev->dev); + if (!bdev) + return -ENOMEM; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + if (!err) + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + blkdev_put(bdev, BDEV_RAW); +} + +void md_autodetect_dev(kdev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ + printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + list_del_init(&rdev->all); + if (!list_empty(&rdev->pending)) { + printk(KERN_INFO "md: (%s was pending)\n", + partition_name(rdev->dev)); + list_del_init(&rdev->pending); + } +#ifndef MODULE + md_autodetect_dev(rdev->dev); +#endif + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); +} + +static void free_mddev(mddev_t *mddev) +{ + if (!mddev) { + MD_BUG(); + return; + } + + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + printk(KERN_INFO); + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || + desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(KERN_INFO "md: THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk(KERN_INFO "md: rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk("md: **********************************\n"); + 
printk("md: * *\n"); + printk("md: **********************************\n"); + ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + mddev_unlock(mddev); + } + printk("md: **********************************\n"); + printk("\n"); +} + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all(kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + list_for_each(tmp, &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + kdev_t dev; + unsigned long sb_offset, size; + + if (!rdev->sb) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return 1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n", + partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * its size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n", + partition_name(dev), rdev->size, size); + goto skip; + } + + printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset); + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) { + printk("md: write_disk_sb failed for device %s\n", partition_name(dev)); + return 1; + } +skip: + return 0; +} +#undef GETBLK_FAILED + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty || rdev->alias_device) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +void __md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + if (!mddev->sb_dirty) { + printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0)); + return; + } + mddev->sb_dirty = 0; +repeat: + mddev->sb->utime = CURRENT_TIME; + if ((++mddev->sb->events_lo)==0) + ++mddev->sb->events_hi; + + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + printk(KERN_INFO "md: "); + if (rdev->faulty) + printk("(skipping faulty "); + if (rdev->alias_device) + printk("(skipping alias "); + if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) { + printk("(skipping new-faulty %s )\n", + partition_name(rdev->dev)); + continue; + } + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty && !rdev->alias_device) { + printk("[events: %08lx]", + (unsigned long)rdev->sb->events_lo); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + if (err) { + if (--count) { + printk(KERN_ERR "md: errors occurred during superblock update, repeating\n"); + goto repeat; + } + printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n"); + } +} + +void md_update_sb(mddev_t *mddev) +{ + if (mddev_lock(mddev)) + return; + if (mddev->sb_dirty) + __md_update_sb(mddev); + mddev_unlock(mddev); +} + + +/* + * Import a device. 
If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + */ +static int md_import_device(kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (is_mounted(newdev)) { + printk(KERN_WARNING "md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk(KERN_WARNING "md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + if (rdev->sb->level != -4) { + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } else { + rdev->old_dev = MKDEV(0, 0); + rdev->desc_nr = -1; + } + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + INIT_LIST_HEAD(&rdev->same_set); + + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs(mddev_t * mddev) +{ + int out_of_date = 0, i, first; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk(INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. 
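+ * ("Freshest" is decided by the per-superblock event counter (events_lo/events_hi), which every superblock update increments; see the md_event() comparisons below.)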
+ */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. (decrease its age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + if (rdev->sb->events_lo || rdev->sb->events_hi) + if ((rdev->sb->events_lo--)==0) + rdev->sb->events_hi--; + } + + printk(KERN_INFO "md: %s's event counter: %08lx\n", + partition_name(rdev->dev), + (unsigned long)rdev->sb->events_lo); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = md_event(rdev->sb); + ev2 = md_event(freshest->sb); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices + */ + __u64 ev1, ev2; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ++ev1; + if (ev1 < ev2) { + printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + if (mddev->sb->level != -4) + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty || rdev->alias_device) { + MD_BUG(); + goto abort; + } + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n", + partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + * + * Note: multipath devices are a special case. Since we + * were able to read the superblock on the path, we don't + * care if it was previously marked as faulty, it's up now + * so enable it. 
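+ * (Throughout this file a superblock level of -4 appears to denote the MULTIPATH personality; the level_to_pers()/MULTIPATH check in do_md_run() below relies on the same convention.)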
+ */ + if (disk_faulty(desc) && mddev->sb->level != -4) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk(KERN_WARNING "md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk(KERN_WARNING "md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } else if (disk_faulty(desc)) { + /* + * multipath entry marked as faulty, unfaulty it + */ + rdev = find_rdev(mddev, dev); + if(rdev) + mark_disk_spare(desc); + else + remove_descriptor(desc, sb); + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Multi-path IO special-case: since we have no + * this_disk descriptor at auto-detect time, + * we cannot check rdev->number. + * We can check the device though. + */ + if ((sb->level == -4) && (rdev->dev == + MKDEV(desc->major,desc->minor))) { + found = 1; + break; + } + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n", + mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); + } + + /* + * Double check whether all devices mentioned in the + * superblock are in the rdev ring. + */ + first = 1; + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); + goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + /* + * In the case of Multipath-IO, we have no + * other information source to find out which + * disk is which, only the position of the device + * in the superblock: + */ + if (mddev->sb->level == -4) { + if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) { + MD_BUG(); + goto abort; + } + rdev->desc_nr = i; + if (!first) + rdev->alias_device = 1; + else + first = 0; + } + } + + /* + * Kick all rdevs that are not in the + * descriptor array: + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) + kick_rdev_from_array(rdev); + } + + /* + * Do a final reality check. + */ + if (mddev->sb->level != -4) { + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique? 
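+ * (Two rdevs claiming the same slot or the same device would make the array unsafe to run, hence the MD_BUG()/abort bail-outs in these two loops.)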
+ */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk(OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk(NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size: %ldk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -4: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { + readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (sb->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. 
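+ * (The actual rounding happens in calc_dev_size() above, which masks each device's usable size down to a chunk_size multiple.)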
+ */ + md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We don't + * want to continue the bad practice. + */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + } else + if (chunk_size) + printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n", + mddev->sb->level); + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + { + printk(KERN_ERR "md: personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + } + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsect size + */ + md_hardsect_sizes[mdidx(mddev)] = 512; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + invalidate_device(rdev->dev, 1); + if (get_hardsect_size(rdev->dev) + > md_hardsect_sizes[mdidx(mddev)]) + md_hardsect_sizes[mdidx(mddev)] = + get_hardsect_size(rdev->dev); + } + md_blocksizes[mdidx(mddev)] = 1024; + if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)]) + md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)]; + mddev->pers = pers[pnum]; + + blk_queue_make_request(&mddev->queue, mddev->pers->make_request); + mddev->queue.queuedata = mddev; + + err = mddev->pers->run(mddev); + if (err) { + printk(KERN_ERR "md: pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } + + mddev->in_sync = (mddev->sb->state & (1<<MD_SB_CLEAN)); + if (mddev->pers->sync_request) + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + mddev->sb_dirty = 1; + __md_update_sb(mddev); + + md_recover_arrays(); + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors. 
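+ * (Hence the <<1 when the size is passed to register_disk() just below.)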
+ */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)), + 1, &md_fops, md_size[mdidx(mddev)]<<1); + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +static int restart_array(mddev_t *mddev) +{ + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk(KERN_INFO + "md: md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" +#define STILL_IN_USE \ +"md: md%d still in use.\n" + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (atomic_read(&mddev->active)>1) { + printk(STILL_IN_USE, mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + if (mddev->sync_thread) { + if (mddev->recovery_running > 0) + mddev->recovery_running = -EINTR; + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (mddev->spare) { + mddev->pers->diskop(mddev, &mddev->spare, + DISKOP_SPARE_INACTIVE); + mddev->spare = NULL; + } + } + + invalidate_device(dev, 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_device_ro(dev, 1); + goto out; + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (mddev->in_sync) { + printk(KERN_INFO "md: marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + mddev->sb_dirty = 1; + __md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + free_mddev(mddev); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev)); + err = 0; +out: + return err; +} + +/* + * We have to safely support old arrays too. + */ +int detect_old_array(mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md: do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * let's try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. 
+ * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(void) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk(KERN_WARNING + "md: %s has same UUID as %s, but superblocks differ ...\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + + mddev = mddev_find(rdev0->sb->md_minor); + if (!mddev) { + printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); + break; + } + if (mddev_lock(mddev)) + printk(KERN_WARNING "md: md%d locked, cannot run\n", + mdidx(mddev)); + else if (mddev->sb || !list_empty(&mddev->disks)) { + printk(KERN_WARNING "md: md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + mddev_unlock(mddev); + } else { + printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + list_del_init(&rdev->pending); + } + autorun_array(mddev); + mddev_unlock(mddev); + } + /* on success, candidates will be empty, on error + * it wont... + */ + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + mddev_put(mddev); + } + printk(KERN_INFO "md: ... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. 
+ */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array(kdev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk(KERN_WARNING "md: array version is too old to be autostarted ," + "use raidtools 0.90 mkraid --upgrade to upgrade the array " + "without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n", + partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + + +static int get_version(void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info(mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info(mddev_t * mddev, void * arg) +{ + 
mdu_disk_info_t info; + unsigned int nr; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= MD_SB_DISKS) + return -EINVAL; + + SET_FROM_SB(major); + SET_FROM_SB(minor); + SET_FROM_SB(raid_disk); + SET_FROM_SB(state); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_SB(x) mddev->sb->disks[nr].x = info->x + +static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) +{ + int err, size, persistent; + mdk_rdev_t *rdev; + unsigned int nr; + kdev_t dev; + dev = MKDEV(info->major,info->minor); + + if (find_rdev_all(dev)) { + printk(KERN_WARNING "md: device %s already used in a RAID array!\n", + partition_name(dev)); + return -EBUSY; + } + if (!mddev->sb) { + /* expecting a device which has a superblock */ + err = md_import_device(dev, 1); + if (err) { + printk(KERN_WARNING "md: md_import_device returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (!list_empty(&mddev->disks)) { + mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next, + mdk_rdev_t, same_set); + if (!uuid_equal(rdev0, rdev)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + } + bind_rdev_to_array(rdev, mddev); + return 0; + } + + nr = info->number; + if (nr >= mddev->sb->nr_disks) { + MD_BUG(); + return -EINVAL; + } + + + SET_SB(number); + SET_SB(major); + SET_SB(minor); + SET_SB(raid_disk); + SET_SB(state); + + if ((info->state & (1<<MD_DISK_FAULTY))==0) { + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + rdev->old_dev = dev; + rdev->desc_nr = info->number; + + bind_rdev_to_array(rdev, mddev); + + persistent = !mddev->sb->not_persistent; + if (!persistent) + printk(KERN_INFO "md: nonpersistent superblock ...\n"); + + size = calc_dev_size(dev, mddev, persistent); + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + if (!mddev->sb->size || (mddev->sb->size > size)) + mddev->sb->size = size; + } + + /* + * sync all other superblocks with the main superblock + */ + sync_sbs(mddev); + + return 0; +} +#undef SET_SB + +static int hot_generate_error(mddev_t * mddev, kdev_t dev) +{ + struct request_queue *q; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to generate %s error in md%d ... \n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + return -ENXIO; + } + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (!disk_active(disk)) + return -ENODEV; + + q = blk_get_queue(rdev->dev); + if (!q) { + MD_BUG(); + return -ENODEV; + } + printk(KERN_INFO "md: okay, generating error!\n"); +// q->oneshot_error = 1; // disabled for now + + return 0; +} + +static int hot_remove_disk(mddev_t * mddev, kdev_t dev) +{ + int err; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to remove %s from md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + + if (disk_removed(disk)) + return -EINVAL; + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + __md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + err = -ENOSPC; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { + /* + * reuse slot + */ + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + } else { + disk->number = i; + } + + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + __md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
+ */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +#define SET_SB(x) mddev->sb->x = info->x +static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +{ + + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; +} +#undef SET_SB + +static int set_disk_faulty(mddev_t *mddev, kdev_t dev) +{ + int ret; + + ret = md_error(mddev, dev); + return ret; +} + +static int md_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!md_capable_admin()) + return -EACCES; + + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) { + MD_BUG(); + return -EINVAL; + } + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; + +#ifndef MODULE + case RAID_AUTORUN: + err = 0; + autostart_arrays(); + goto done; +#endif + + case BLKGETSIZE: + case BLKGETSIZE64: + case BLKRAGET: + case BLKRASET: + case BLKFLSBUF: + case BLKBSZGET: + case BLKBSZSET: + err = blk_ioctl (dev, cmd, arg); + goto abort; + + default:; + } + + /* + * Commands creating/starting a new array: + */ + + mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) { + BUG(); + goto abort; + } + + + if (cmd == START_ARRAY) { + /* START_ARRAY doesn't need to lock the array as autostart_array + * does the locking, and it could even be a different array + */ + err = autostart_array(val_to_kdev(arg)); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name(val_to_kdev(arg))); + goto abort; + } + goto done; + } + + err = mddev_lock(mddev); + if (err) { + printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n", + err, cmd); + goto abort; + } + + switch (cmd) + { + case SET_ARRAY_INFO: + + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING "md: array md%d already has disks!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->sb) { + printk(KERN_WARNING "md: array md%d already has a superblock!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (arg) { + mdu_array_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't set array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + case START_ARRAY: + /* + * possibly make it lock the array ... 
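+ * (See the unlocked START_ARRAY case above: autostart_array() takes the lock itself, possibly on a different array.)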
+ */ + err = autostart_array((kdev_t)arg); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort_unlock; + } + goto done_unlock; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, (kdev_t)arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... + */ + if (err) { + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } + goto done_unlock; + } + + default: + printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, " + "upgrade your software to use new ictls.\n", + current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + mddev_unlock(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ + /* + * Succeed if we can find or allocate a mddev structure. 
+ */ + mddev_t *mddev = mddev_find(minor(inode->i_rdev)); + int err = -ENOMEM; + + if (!mddev) + goto out; + + if ((err = mddev_lock(mddev))) + goto put; + + err = 0; + mddev_unlock(mddev); + inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev); + put: + mddev_put(mddev); + out: + return err; +} + +static int md_release(struct inode *inode, struct file * file) +{ + mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) + BUG(); + mddev_put(mddev); + + return 0; +} + +static struct block_device_operations md_fops= +{ + owner: THIS_MODULE, + open: md_open, + release: md_release, + ioctl: md_ioctl, +}; + + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + md_lock_kernel(); + + /* + * Detach thread + */ + + daemonize(); + reparent_to_init(); + + sprintf(current->comm, thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + current->policy = SCHED_OTHER; + current->nice = -20; + md_unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run)(void *data); + + wait_event_interruptible(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags)); + + clear_bit(THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->data); + run_task_queue(&tq_disk); + } + if (md_signal_pending(current)) + md_flush_signals(); + } + complete(thread->event); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + dprintk("md: waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); +} + +mdk_thread_t *md_register_thread(void (*run) (void *), + void *data, const char *name) +{ + mdk_thread_t *thread; + int ret; + struct completion event; + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + md_init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->data = data; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +void md_interrupt_thread(mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + md_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +static void md_recover_arrays(void) +{ + if (!md_recovery_thread) { + MD_BUG(); + return; + } + md_wakeup_thread(md_recovery_thread); +} + + +int md_error(mddev_t *mddev, kdev_t rdev) +{ + mdk_rdev_t * rrdev; + + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev), + __builtin_return_address(0),__builtin_return_address(1), + 
__builtin_return_address(2),__builtin_return_address(3)); + + if (!mddev) { + MD_BUG(); + return 0; + } + rrdev = find_rdev(mddev, rdev); + if (!rrdev || rrdev->faulty) + return 0; + if (!mddev->pers->error_handler + || mddev->pers->error_handler(mddev,rdev) <= 0) { + rrdev->faulty = 1; + } else + return 1; + /* + * if recovery was running, stop it now. + */ + if (mddev->recovery_running) + mddev->recovery_running = -EIO; + md_recover_arrays(); + + return 0; +} + +static void status_unused(struct seq_file *seq) +{ + int i = 0; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + seq_printf(seq, "unused devices: "); + + ITERATE_RDEV_ALL(rdev,tmp) { + if (list_empty(&rdev->same_set)) { + /* + * The device is not yet used by any array. + */ + i++; + seq_printf(seq, "%s ", + partition_name(rdev->dev)); + } + } + if (!i) + seq_printf(seq, ""); + + seq_printf(seq, "\n"); +} + + +static void status_resync(struct seq_file *seq, mddev_t * mddev) +{ + unsigned long max_blocks, resync, res, dt, db, rt; + + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + max_blocks = mddev->sb->size; + + /* + * Should not happen. + */ + if (!max_blocks) + MD_BUG(); + + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + seq_printf(seq, "["); + for (i = 0; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", + (mddev->spare ? "recovery" : "resync"), + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/dt); + +} + + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + mddev_t *mddev; + + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, mddev_t, all_mddevs); + return mddev; + } + return (void*)2;/* tail */ +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + mddev_t *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = list_entry(tmp,mddev_t,all_mddevs); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + int j, size; + struct md_list_head *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev = v; + + if (v == (void*)1) { + seq_printf(seq, "Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + seq_printf(seq, "[%s] ", pers[j]->name); + + seq_printf(seq, "\n"); + seq_printf(seq, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + seq_printf(seq, "not set\n"); + else + seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]); + return 0; + } + if (v == (void*)2) { + 
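+		/*
+		 * (void*)1 and (void*)2 are sentinel cursors handed out by
+		 * md_seq_start()/md_seq_next() above: 1 asks for the header
+		 * (personalities and read_ahead), 2 asks for the trailing
+		 * "unused devices" line once every array has been shown.
+		 */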
status_unused(seq); + return 0; + } + + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? "" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %d blocks", + md_size[mdidx(mddev)]); + else + seq_printf(seq, "\n %d blocks", size); + } + + if (mddev->pers) { + + mddev->pers->status (seq, mddev); + + seq_printf(seq, "\n "); + if (mddev->curr_resync > 1) + status_resync (seq, mddev); + else if (mddev->curr_resync == 1) + seq_printf(seq, " resync=DELAYED"); + + } + seq_printf(seq, "\n"); + return 0; +} + + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (pers[pnum]) { + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; +} + +mdp_disk_t *get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} + +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; +void md_sync_acct(kdev_t dev, unsigned long nr_sectors) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + sync_io[major][index] += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + int major = MAJOR(rdev->dev); + int idx = disk_index(rdev->dev); + + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + continue; + + curr_events = kstat.dk_drive_rblk[major][idx] + + kstat.dk_drive_wblk[major][idx] ; + curr_events -= sync_io[major][idx]; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + mddev->recovery_running = -EIO; + md_recover_arrays(); + // stop recovery, signal do_sync .... 
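+		/*
+		 * A failed write poisons the whole pass: record -EIO in
+		 * recovery_running, then (below) ask the personality to
+		 * stop the resync and interrupt the recovery thread.
+		 */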
+		if (mddev->pers->stop_resync)
+			mddev->pers->stop_resync(mddev);
+		if (mddev->recovery_running)
+			md_interrupt_thread(md_recovery_thread);
+	}
+}
+
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS	10
+#define	SYNC_MARK_STEP	(3*HZ)
+static void md_do_sync(void *data)
+{
+	mddev_t *mddev = data;
+	mddev_t *mddev2;
+	unsigned int max_sectors, currspeed,
+		j, window, err;
+	unsigned long mark[SYNC_MARKS];
+	unsigned long mark_cnt[SYNC_MARKS];
+	int last_mark,m;
+	struct md_list_head *tmp;
+	unsigned long last_check;
+
+	/* just in case thread restarts... */
+	if (mddev->recovery_running <= 0)
+		return;
+
+	/* we overload curr_resync somewhat here.
+	 * 0 == not engaged in resync at all
+	 * 2 == checking that there is no conflict with another sync
+	 * 1 == like 2, but have yielded to allow conflicting resync to
+	 *	commence
+	 * other == active in resync - this many blocks
+	 */
+	do {
+		mddev->curr_resync = 2;
+
+		ITERATE_MDDEV(mddev2,tmp) {
+			if (mddev2 == mddev)
+				continue;
+			if (mddev2->curr_resync &&
+			    match_mddev_units(mddev,mddev2)) {
+				printk(KERN_INFO "md: delaying resync of md%d until md%d "
+				       "has finished resync (they share one or more physical units)\n",
+				       mdidx(mddev), mdidx(mddev2));
+				if (mddev < mddev2) /* arbitrarily yield */
+					mddev->curr_resync = 1;
+				if (wait_event_interruptible(resync_wait,
+							     mddev2->curr_resync < 2)) {
+					md_flush_signals();
+					err = -EINTR;
+					mddev_put(mddev2);
+					goto out;
+				}
+			}
+		}
+	} while (mddev->curr_resync < 2);
+
+	max_sectors = mddev->sb->size<<1;
+
+	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+	       sysctl_speed_limit_min);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+	       "(but not more than %d KB/sec) for reconstruction.\n",
+	       sysctl_speed_limit_max);
+
+	/*
+	 * Resync has low priority.
+	 */
+	current->nice = 19;
+
+	is_mddev_idle(mddev); /* this also initializes IO event counters */
+	for (m = 0; m < SYNC_MARKS; m++) {
+		mark[m] = jiffies;
+		mark_cnt[m] = 0;
+	}
+	last_mark = 0;
+	mddev->resync_mark = mark[last_mark];
+	mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+	/*
+	 * Tune reconstruction:
+	 */
+	window = vm_max_readahead*(PAGE_SIZE/512);
+	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+	       window/2,max_sectors/2);
+
+	atomic_set(&mddev->recovery_active, 0);
+	init_waitqueue_head(&mddev->recovery_wait);
+	last_check = 0;
+	for (j = 0; j < max_sectors;) {
+		int sectors;
+
+		sectors = mddev->pers->sync_request(mddev, j);
+
+		if (sectors < 0) {
+			err = sectors;
+			goto out;
+		}
+		atomic_add(sectors, &mddev->recovery_active);
+		j += sectors;
+		if (j>1) mddev->curr_resync = j;
+
+		if (last_check + window > j)
+			continue;
+
+		last_check = j;
+
+		run_task_queue(&tq_disk);
+
+	repeat:
+		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+			/* step marks */
+			int next = (last_mark+1) % SYNC_MARKS;
+
+			mddev->resync_mark = mark[next];
+			mddev->resync_mark_cnt = mark_cnt[next];
+			mark[next] = jiffies;
+			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+			last_mark = next;
+		}
+
+
+		if (md_signal_pending(current)) {
+			/*
+			 * got a signal, exit.
+			 */
+			printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
+		}
+
+		/*
+		 * this loop exits only when we are slower than
+		 * the 'hard' speed limit, or the system was IO-idle for
+		 * a jiffy.
+		 * the system might be non-idle CPU-wise, but we only care
+		 * about not overloading the IO subsystem. (things like an
+		 * e2fsck being done on the RAID array should execute fast)
+		 */
+		if (md_need_resched(current))
+			schedule();
+
+		currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+		if (currspeed > sysctl_speed_limit_min) {
+			current->nice = 19;
+
+			if ((currspeed > sysctl_speed_limit_max) ||
+					!is_mddev_idle(mddev)) {
+				current->state = TASK_INTERRUPTIBLE;
+				md_schedule_timeout(HZ/4);
+				goto repeat;
+			}
+		} else
+			current->nice = -20;
+	}
+	printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+	err = 0;
+	/*
+	 * this also signals 'finished resyncing' to md_stop
+	 */
+out:
+	wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+	/* tell personality that we are finished */
+	mddev->pers->sync_request(mddev, max_sectors, 1);
+
+	mddev->curr_resync = 0;
+	if (err)
+		mddev->recovery_running = err;
+	if (mddev->recovery_running > 0)
+		mddev->recovery_running = 0;
+	if (mddev->recovery_running == 0)
+		mddev->in_sync = 1;
+	md_recover_arrays();
+}
+
+
+/*
+ * This is the kernel thread that watches all md arrays for re-sync action
+ * that might be needed.
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set "->recovery_running" and
+ * create a thread at ->sync_thread.
+ * When the thread finishes it clears recovery_running (or sets an error)
+ * and wakes up this thread, which will reap the thread and finish up.
+ */
+void md_do_recovery(void *data)
+{
+	mddev_t *mddev;
+	mdp_super_t *sb;
+	struct md_list_head *tmp;
+
+	dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+	ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
+		sb = mddev->sb;
+		if (!sb || !mddev->pers || !mddev->pers->diskop || mddev->ro)
+			goto unlock;
+		if (mddev->recovery_running > 0)
+			/* resync/recovery still happening */
+			goto unlock;
+		if (mddev->sb_dirty)
+			md_update_sb(mddev);
+		if (mddev->sync_thread) {
+			/* resync has finished, collect result */
+			md_unregister_thread(mddev->sync_thread);
+			mddev->sync_thread = NULL;
+			if (mddev->recovery_running < 0) {
+				/* some sort of failure.
+				 * If we were doing a reconstruction,
+				 * we need to retrieve the spare
+				 */
+				if (mddev->spare) {
+					mddev->pers->diskop(mddev, &mddev->spare,
+							    DISKOP_SPARE_INACTIVE);
+					mddev->spare = NULL;
+				}
+			} else {
+				/* success...*/
+				if (mddev->spare) {
+					mddev->pers->diskop(mddev, &mddev->spare,
+							    DISKOP_SPARE_ACTIVE);
+					mark_disk_sync(mddev->spare);
+					mark_disk_active(mddev->spare);
+					sb->active_disks++;
+					sb->spare_disks--;
+					mddev->spare = NULL;
+				}
+			}
+			__md_update_sb(mddev);
+			mddev->recovery_running = 0;
+			wake_up(&resync_wait);
+			goto unlock;
+		}
+		if (mddev->recovery_running) {
+			/* that's odd.. */
+			mddev->recovery_running = 0;
+			wake_up(&resync_wait);
+		}
+
+		if (sb->active_disks < sb->raid_disks) {
+			mddev->spare = get_spare(mddev);
+			if (!mddev->spare)
+				printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+				       "-- continuing in degraded mode\n", mdidx(mddev));
+			else
+				printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+				       mdidx(mddev), partition_name(MKDEV(mddev->spare->major,mddev->spare->minor)));
+		}
+		if (!mddev->spare && mddev->in_sync) {
+			/* nothing we can do ... 
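+			 * -- no spare to rebuild onto and the array is
+			 * already in sync: just drop the lock and wait
+			 * for the next wakeup.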
*/ + goto unlock; + } + if (mddev->pers->sync_request) { + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "md_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev)); + if (mddev->spare) + mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_INACTIVE); + mddev->spare = NULL; + mddev->recovery_running = 0; + } else { + if (mddev->spare) + mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_WRITE); + mddev->recovery_running = 1; + md_wakeup_thread(mddev->sync_thread); + } + } + unlock: + mddev_unlock(mddev); + } + dprintk(KERN_INFO "md: recovery thread finished ...\n"); + +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + if (mddev_trylock(mddev)==0) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + notifier_call: md_notify_reboot, + next: NULL, + priority: INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_size[i] = 0; + md_hardsect_sizes[i] = 512; + } + blksize_size[MAJOR_NR] = md_blocksizes; + blk_size[MAJOR_NR] = md_size; + max_readahead[MAJOR_NR] = md_maxreadahead; + hardsect_size[MAJOR_NR] = md_hardsect_sizes; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +request_queue_t * md_queue_proc(kdev_t dev) +{ + mddev_t *mddev = mddev_find(minor(dev)); + request_queue_t *q = BLK_DEFAULT_QUEUE(MAJOR_NR); + if (!mddev || atomic_read(&mddev->active)<2) + BUG(); + if (mddev->pers) + q = &mddev->queue; + mddev_put(mddev); /* the caller must hold a reference... */ + return q; +} + +int md__init md_init(void) +{ + static char * name = "mdrecoveryd"; + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) + { + printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR); + return (-1); + } + devfs_handle = devfs_mk_dir (NULL, "md", NULL); + /* we don't use devfs_register_series because we want to fill md_hd_struct */ + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char devname[128]; + sprintf (devname, "%u", minor); + md_hd_struct[minor].de = devfs_register (devfs_handle, + devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + /* all requests on an uninitialised device get failed... 
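+	 * the default queue wired up below runs md_fail_request(), and
+	 * md_queue_proc() above swaps in the per-array queue once a
+	 * personality has been attached.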
 */
+	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+	blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+	read_ahead[MAJOR_NR] = INT_MAX;
+
+	add_gendisk(&md_gendisk);
+
+	md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+	if (!md_recovery_thread)
+		printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+	md_register_reboot_notifier(&md_notifier);
+	raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+	md_geninit();
+	return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) is compiled into the kernel
+ * (not as a module), arrays can be assembled at boot time, either with
+ * AUTODETECT, where specially marked partitions are registered with
+ * md_autodetect_dev(), or with MD_BOOT, where the devices to be collected
+ * are given on the boot line with md=.....
+ * The code for that is here.
+ */
+
+struct {
+	int set;
+	int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+	if (dev_cnt >= 0 && dev_cnt < 127)
+		detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+	mdk_rdev_t *rdev;
+	int i;
+
+	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+	for (i = 0; i < dev_cnt; i++) {
+		kdev_t dev = detected_devices[i];
+
+		if (md_import_device(dev,1)) {
+			printk(KERN_ALERT "md: could not import %s!\n",
+			       partition_name(dev));
+			continue;
+		}
+		/*
+		 * Sanity checks:
+		 */
+		rdev = find_rdev_all(dev);
+		if (!rdev) {
+			MD_BUG();
+			continue;
+		}
+		if (rdev->faulty) {
+			MD_BUG();
+			continue;
+		}
+		md_list_add(&rdev->pending, &pending_raid_disks);
+	}
+	dev_cnt = 0;
+
+	autorun_devices();
+}
+
+static struct {
+	char device_set [MAX_MD_DEVS];
+	int pers[MAX_MD_DEVS];
+	int chunk[MAX_MD_DEVS];
+	char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ *             assigns the task of parsing integer arguments to the
+ *             invoked program now).  Added ability to initialise all
+ *             the MD devices (by specifying multiple "md=" lines)
+ *             instead of just one.  -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ *    md=n,0,factor,fault,device-list   uses RAID0 for device n
+ *    md=n,-1,factor,fault,device-list  uses LINEAR for device n
+ *    md=n,device-list      reads a RAID superblock from the devices
+ *    elements in device-list are read by name_to_kdev_t so can be
+ *    a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege
+ *		Shifted name_to_kdev_t() and related operations to md_set_drive()
+ *		for later execution. Rewrote section to make devfs compatible.
+ */
+static int md__init md_setup(char *str)
+{
+	int minor, level, factor, fault;
+	char *pername = "";
+	char *str1 = str;
+
+	if (get_option(&str, &minor) != 2) {	/* MD Number */
+		printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+		return 0;
+	}
+	if (minor >= MAX_MD_DEVS) {
+		printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+		return 0;
+	} else if (md_setup_args.device_names[minor]) {
+		printk(KERN_WARNING "md: md=%d, Specified more than once. 
" + "Replacing previous definition.\n", minor); + } + switch (get_option(&str, &level)) { /* RAID Personality */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == -1) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args.pers[minor] = level; + md_setup_args.chunk[minor] = 1 << (factor+12); + switch(level) { + case -1: + level = LINEAR; + pername = "linear"; + break; + case 0: + level = RAID0; + pername = "raid0"; + break; + default: + printk(KERN_WARNING + "md: The kernel has not been configured for raid%d support!\n", + level); + return 0; + } + md_setup_args.pers[minor] = level; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args.pers[minor] = 0; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args.device_names[minor] = str; + + return 1; +} + +extern kdev_t name_to_kdev_t(char *line) md__init; +void md__init md_setup_drive(void) +{ + int minor, i; + kdev_t dev; + mddev_t*mddev; + kdev_t devices[MD_SB_DISKS+1]; + + for (minor = 0; minor < MAX_MD_DEVS; minor++) { + int err = 0; + char *devname; + mdu_disk_info_t dinfo; + + if ((devname = md_setup_args.device_names[minor]) == 0) continue; + + for (i = 0; i < MD_SB_DISKS && devname != 0; i++) { + + char *p; + void *handle; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_kdev_t(devname); + handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev), + DEVFS_SPECIAL_BLK, 1); + if (handle != 0) { + unsigned major, minor; + devfs_get_maj_min(handle, &major, &minor); + dev = MKDEV(major, minor); + } + if (dev == 0) { + printk(KERN_WARNING "md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + md_setup_args.device_set[minor] = 1; + + devname = p; + } + devices[i] = 0; + + if (md_setup_args.device_set[minor] == 0) + continue; + + printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]); + + mddev = mddev_find(minor); + if (!mddev) { + printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor); + continue; + } + if (mddev_lock(mddev)) { + printk(KERN_WARNING + "md: Ignoring md=%d, cannot lock!\n", + minor); + mddev_put(mddev); + continue; + } + + if (mddev->sb || !list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: Ignoring md=%d, already autodetected. 
(Use raid=noautodetect)\n",
+			       minor);
+			mddev_unlock(mddev);
+			mddev_put(mddev);
+			continue;
+		}
+		if (md_setup_args.pers[minor]) {
+			/* non-persistent */
+			mdu_array_info_t ainfo;
+			ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+			ainfo.size = 0;
+			ainfo.nr_disks =0;
+			ainfo.raid_disks =0;
+			ainfo.md_minor =minor;
+			ainfo.not_persistent = 1;
+
+			ainfo.state = (1 << MD_SB_CLEAN);
+			ainfo.active_disks = 0;
+			ainfo.working_disks = 0;
+			ainfo.failed_disks = 0;
+			ainfo.spare_disks = 0;
+			ainfo.layout = 0;
+			ainfo.chunk_size = md_setup_args.chunk[minor];
+			err = set_array_info(mddev, &ainfo);
+			for (i = 0; !err && (dev = devices[i]); i++) {
+				dinfo.number = i;
+				dinfo.raid_disk = i;
+				dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+				dinfo.major = MAJOR(dev);
+				dinfo.minor = MINOR(dev);
+				mddev->sb->nr_disks++;
+				mddev->sb->raid_disks++;
+				mddev->sb->active_disks++;
+				mddev->sb->working_disks++;
+				err = add_new_disk (mddev, &dinfo);
+			}
+		} else {
+			/* persistent */
+			for (i = 0; (dev = devices[i]); i++) {
+				dinfo.major = MAJOR(dev);
+				dinfo.minor = MINOR(dev);
+				add_new_disk (mddev, &dinfo);
+			}
+		}
+		if (!err)
+			err = do_md_run(mddev);
+		if (err) {
+			mddev->sb_dirty = 0;
+			do_md_stop(mddev, 0);
+			printk(KERN_WARNING "md: starting md%d failed\n", minor);
+		}
+		mddev_unlock(mddev);
+		mddev_put(mddev);
+	}
+}
+
+static int md__init raid_setup(char *str)
+{
+	int len, pos;
+
+	len = strlen(str) + 1;
+	pos = 0;
+
+	while (pos < len) {
+		char *comma = strchr(str+pos, ',');
+		int wlen;
+		if (comma)
+			wlen = (comma-str)-pos;
+		else	wlen = (len-1)-pos;
+
+		if (strncmp(str, "noautodetect", wlen) == 0)
+			raid_setup_args.noautodetect = 1;
+		pos += wlen+1;
+	}
+	raid_setup_args.set = 1;
+	return 1;
+}
+
+int md__init md_run_setup(void)
+{
+	if (raid_setup_args.noautodetect)
+		printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
+	else
+		autostart_arrays();
+	md_setup_drive();
+	return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+	return md_init();
+}
+
+static void free_device_names(void)
+{
+	while (!list_empty(&device_names)) {
+		struct dname *tmp = list_entry(device_names.next,
+					       dev_name_t, list);
+		list_del(&tmp->list);
+		kfree(tmp);
+	}
+}
+
+
+void cleanup_module(void)
+{
+	md_unregister_thread(md_recovery_thread);
+	devfs_unregister(devfs_handle);
+
+	devfs_unregister_blkdev(MAJOR_NR,"md");
+	unregister_reboot_notifier(&md_notifier);
+	unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("mdstat", NULL);
+#endif
+
+	del_gendisk(&md_gendisk);
+
+	blk_dev[MAJOR_NR].queue = NULL;
+	blksize_size[MAJOR_NR] = NULL;
+	blk_size[MAJOR_NR] = NULL;
+	max_readahead[MAJOR_NR] = NULL;
+	hardsect_size[MAJOR_NR] = NULL;
+
+	free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-autostart/patch b/tests/linux/md-autostart/patch
new file mode 100644
index 0000000..9d6d660
--- /dev/null
+++ b/tests/linux/md-autostart/patch
@@ -0,0 +1,27 @@
+***************
+*** 2584,2601 ****
+  	printk(KERN_WARNING "md: couldnt set array info. 
%d\n", err); + goto abort_unlock; + } +- } +- goto done_unlock; +- +- case START_ARRAY: +- /* +- * possibly make it lock the array ... +- */ +- err = autostart_array(val_to_kdev(arg)); +- if (err) { +- printk(KERN_WARNING "md: autostart %s failed!\n", +- partition_name(val_to_kdev(arg))); +- goto abort_unlock; + } + goto done_unlock; + +--- 2598,2603 ---- + printk(KERN_WARNING "md: couldnt set array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + diff --git a/tests/linux/md-loop/1 b/tests/linux/md-loop/1 new file mode 100644 index 0000000..f0abb8e --- /dev/null +++ b/tests/linux/md-loop/1 @@ -0,0 +1,3949 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher + - Devfs support by Richard Gooch + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown . + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_KMOD +#include +#endif + +#define __KERNEL_SYSCALLS__ +#include + +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER + +#include + +#define DEBUG 0 +#if DEBUG +# define dprintk(x...) printk(x) +#else +# define dprintk(x...) do { } while(0) +#endif + +#ifndef MODULE +static void autostart_arrays (void); +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY]; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 
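+ * For example (the numbers here are purely illustrative):
+ *
+ *	echo 5000   > /proc/sys/dev/raid/speed_limit_min
+ *	echo 100000 > /proc/sys/dev/raid/speed_limit_max
+ *
+ * raises the guaranteed floor to 5000 KB/sec per disc while keeping
+ * the burst ceiling at its default.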
+ */ + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min", + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max", + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table raid_dir_table[] = { + {DEV_RAID, "raid", NULL, 0, 0555, raid_table}, + {0} +}; + +static ctl_table raid_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table}, + {0} +}; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_hardsect_sizes[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread; + +int md_size[MAX_MD_DEVS]; + +static struct block_device_operations md_fops; +static devfs_handle_t devfs_handle; + +static struct gendisk md_gendisk= +{ + major: MD_MAJOR, + major_name: "md", + minor_shift: 0, + max_p: 1, + part: md_hd_struct, + sizes: md_size, + nr_real: MAX_MD_DEVS, + real_devices: NULL, + next: NULL, + fops: &md_fops, +}; + +/* + * Enables to iterate over all existing md arrays + */ +static MD_LIST_HEAD(all_mddevs); + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static inline mddev_t * kdev_to_mddev (kdev_t dev) +{ + if (MAJOR(dev) != MD_MAJOR) + BUG(); + return mddev_map[MINOR(dev)]; +} + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio); + return 0; +} + +static mddev_t * alloc_mddev(kdev_t dev) +{ + mddev_t *mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); + + mddev->__minor = MINOR(dev); + init_MUTEX(&mddev->reconfig_sem); + init_MUTEX(&mddev->recovery_sem); + init_MUTEX(&mddev->resync_sem); + MD_INIT_LIST_HEAD(&mddev->disks); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + atomic_set(&mddev->active, 0); + + mddev_map[mdidx(mddev)] = mddev; + md_list_add(&mddev->all_mddevs, &all_mddevs); + + MOD_INC_USE_COUNT; + + return mddev; +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +char * partition_name(kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = ""; + dev_name_t *dname; + struct md_list_head *tmp; + + list_for_each(tmp, &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = get_gendisk (dev); + dname->name = NULL; + if (hd) + dname->name = disk_name (hd, MINOR(dev), dname->namebuf); + if (!dname->name) { + sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); + dname->name = dname->namebuf; + } + + dname->dev = dev; + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev, + int 
persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int zoned_raid_size(mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + md_size[mdidx(mddev)] += rdev->size; + } + return 0; +} + +static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} + +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" + +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb(mddev_t * mddev) +{ + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page(mddev->sb); + return 0; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(OUT_OF_MEM); + return -EINVAL; + } + rdev->sb = (mdp_super_t *) page_address(rdev->sb_page); + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb = NULL; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + + +static void bh_complete(struct buffer_head *bh, int uptodate) +{ + + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + + complete((struct completion*)bh->b_private); +} + +static int sync_page_io(kdev_t dev, unsigned long sector, int size, + struct page *page, int rw) +{ + struct buffer_head bh; + struct completion event; + + init_completion(&event); + init_buffer(&bh, bh_complete, &event); + bh.b_rdev = dev; + bh.b_rsector = sector; + bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock); + bh.b_size = size; + bh.b_page = page; + bh.b_reqnext = NULL; + bh.b_data = page_address(page); + generic_make_request(rw, &bh); + + run_task_queue(&tq_disk); + wait_for_completion(&event); + + return test_bit(BH_Uptodate, &bh.b_state); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + kdev_t dev = rdev->dev; + unsigned long sb_offset; + + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) { + printk(NO_SB,partition_name(dev)); + return 
-EINVAL; + } + printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + ret = 0; +abort: + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb(mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk(BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(BAD_CSUM, partition_name(rdev->dev)); + goto abort; + } + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = get_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev)); +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. 
[simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev(mdk_rdev_t *rdev) +{ + int err = 0; + struct block_device *bdev; + + bdev = bdget(rdev->dev); + if (!bdev) + return -ENOMEM; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + if (!err) + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + blkdev_put(bdev, BDEV_RAW); +} + +void md_autodetect_dev(kdev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ + printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + list_del_init(&rdev->all); + if (!list_empty(&rdev->pending)) { + printk(KERN_INFO "md: (%s was pending)\n", + partition_name(rdev->dev)); + list_del_init(&rdev->pending); + } +#ifndef MODULE + md_autodetect_dev(rdev->dev); +#endif + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); +} + +static void free_mddev(mddev_t *mddev) +{ + if (!mddev) { + MD_BUG(); + return; + } + + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; + + /* + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) + */ + while (sem_getcount(&mddev->resync_sem) != 1) + schedule(); + while (sem_getcount(&mddev->recovery_sem) != 1) + schedule(); + + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); + md_list_del(&mddev->all_mddevs); + kfree(mddev); + MOD_DEC_USE_COUNT; +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + printk(KERN_INFO); + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || + desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(KERN_INFO "md: THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + 
rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk(KERN_INFO "md: rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk("md: **********************************\n"); + printk("md: * *\n"); + printk("md: **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk("md: **********************************\n"); + printk("\n"); +} + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all(kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + list_for_each(tmp, &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + kdev_t dev; + unsigned long sb_offset, size; + + if (!rdev->sb) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return 1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n", + partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * its size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
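+	 * Re-deriving the size here and comparing it with what was
+	 * recorded at import time catches that case: on a mismatch we
+	 * skip the superblock write rather than scribble on a device
+	 * we no longer understand.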
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n", + partition_name(dev), rdev->size, size); + goto skip; + } + + printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset); + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) { + printk("md: write_disk_sb failed for device %s\n", partition_name(dev)); + return 1; + } +skip: + return 0; +} +#undef GETBLK_FAILED + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty || rdev->alias_device) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +int md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + if (!mddev->sb_dirty) { + printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0)); + return 0; + } + mddev->sb_dirty = 0; +repeat: + mddev->sb->utime = CURRENT_TIME; + if ((++mddev->sb->events_lo)==0) + ++mddev->sb->events_hi; + + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + printk(KERN_INFO "md: "); + if (rdev->faulty) + printk("(skipping faulty "); + if (rdev->alias_device) + printk("(skipping alias "); + if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) { + printk("(skipping new-faulty %s )\n", + partition_name(rdev->dev)); + continue; + } + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty && !rdev->alias_device) { + printk("[events: %08lx]", + (unsigned long)rdev->sb->events_lo); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + if (err) { + if (--count) { + printk(KERN_ERR "md: errors occurred during superblock update, repeating\n"); + goto repeat; + } + printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n"); + } + return 0; +} + +/* + * Import a device. 
If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + */ +static int md_import_device(kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (is_mounted(newdev)) { + printk(KERN_WARNING "md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk(KERN_WARNING "md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + if (rdev->sb->level != -4) { + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } else { + rdev->old_dev = MKDEV(0, 0); + rdev->desc_nr = -1; + } + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + INIT_LIST_HEAD(&rdev->same_set); + + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs(mddev_t * mddev) +{ + int out_of_date = 0, i, first; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk(INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. 
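+ * For example (illustrative numbers): with three members whose event
+ * counters read 0x42, 0x42 and 0x40, a 0x42 superblock is the freshest;
+ * the 0x40 member is more than one event behind and will be kicked as
+ * non-fresh further down.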
+	 */
+	if (!mddev->sb)
+		if (alloc_array_sb(mddev))
+			goto abort;
+	sb = mddev->sb;
+	freshest = NULL;
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		__u64 ev1, ev2;
+		/*
+		 * if the checksum is invalid, use the superblock
+		 * only as a last resort. (decrease its age by
+		 * one event)
+		 */
+		if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
+			if (rdev->sb->events_lo || rdev->sb->events_hi)
+				if ((rdev->sb->events_lo--)==0)
+					rdev->sb->events_hi--;
+		}
+
+		printk(KERN_INFO "md: %s's event counter: %08lx\n",
+			partition_name(rdev->dev),
+			(unsigned long)rdev->sb->events_lo);
+		if (!freshest) {
+			freshest = rdev;
+			continue;
+		}
+		/*
+		 * Find the newest superblock version
+		 */
+		ev1 = md_event(rdev->sb);
+		ev2 = md_event(freshest->sb);
+		if (ev1 != ev2) {
+			out_of_date = 1;
+			if (ev1 > ev2)
+				freshest = rdev;
+		}
+	}
+	if (out_of_date) {
+		printk(OUT_OF_DATE);
+		printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
+	}
+	memcpy (sb, freshest->sb, sizeof(*sb));
+
+	/*
+	 * at this point we have picked the 'best' superblock
+	 * from all available superblocks.
+	 * now we validate this superblock and kick out possibly
+	 * failed disks.
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		/*
+		 * Kick all non-fresh devices
+		 */
+		__u64 ev1, ev2;
+		ev1 = md_event(rdev->sb);
+		ev2 = md_event(sb);
+		++ev1;
+		if (ev1 < ev2) {
+			printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
+				partition_name(rdev->dev));
+			kick_rdev_from_array(rdev);
+			continue;
+		}
+	}
+
+	/*
+	 * Fix up changed device names ... but only if this disk has a
+	 * recent update time. Superblocks with a faulty checksum are
+	 * used too.
+	 */
+	if (mddev->sb->level != -4)
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		__u64 ev1, ev2, ev3;
+		if (rdev->faulty || rdev->alias_device) {
+			MD_BUG();
+			goto abort;
+		}
+		ev1 = md_event(rdev->sb);
+		ev2 = md_event(sb);
+		ev3 = ev2;
+		--ev3;
+		if ((rdev->dev != rdev->old_dev) &&
+			((ev1 == ev2) || (ev1 == ev3))) {
+			mdp_disk_t *desc;
+
+			printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
+				partition_name(rdev->old_dev), partition_name(rdev->dev));
+			if (rdev->desc_nr == -1) {
+				MD_BUG();
+				goto abort;
+			}
+			desc = &sb->disks[rdev->desc_nr];
+			if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
+				MD_BUG();
+				goto abort;
+			}
+			desc->major = MAJOR(rdev->dev);
+			desc->minor = MINOR(rdev->dev);
+			desc = &rdev->sb->this_disk;
+			desc->major = MAJOR(rdev->dev);
+			desc->minor = MINOR(rdev->dev);
+		}
+	}
+
+	/*
+	 * Remove unavailable and faulty devices ...
+	 *
+	 * note that if an array becomes completely unrunnable due to
+	 * missing devices, we do not write the superblock back, so the
+	 * administrator has a chance to fix things up. The removal thus
+	 * only happens if it's nonfatal to the contents of the array.
+	 */
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		int found;
+		mdp_disk_t *desc;
+		kdev_t dev;
+
+		desc = sb->disks + i;
+		dev = MKDEV(desc->major, desc->minor);
+
+		/*
+		 * We kick faulty devices/descriptors immediately.
+		 *
+		 * Note: multipath devices are a special case. Since we
+		 * were able to read the superblock on the path, we don't
+		 * care if it was previously marked as faulty, it's up now
+		 * so enable it.
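+		 *
+		 * (Summary added in editing of the branches below: on a
+		 * non-multipath array a faulty descriptor kicks the rdev
+		 * and removes the descriptor; on multipath (level -4) it
+		 * is downgraded to a spare while a path rdev still
+		 * exists, and removed only when none is left.)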
+		 */
+		if (disk_faulty(desc) && mddev->sb->level != -4) {
+			found = 0;
+			ITERATE_RDEV(mddev,rdev,tmp) {
+				if (rdev->desc_nr != desc->number)
+					continue;
+				printk(KERN_WARNING "md%d: kicking faulty %s!\n",
+					mdidx(mddev),partition_name(rdev->dev));
+				kick_rdev_from_array(rdev);
+				found = 1;
+				break;
+			}
+			if (!found) {
+				if (dev == MKDEV(0,0))
+					continue;
+				printk(KERN_WARNING "md%d: removing former faulty %s!\n",
+					mdidx(mddev), partition_name(dev));
+			}
+			remove_descriptor(desc, sb);
+			continue;
+		} else if (disk_faulty(desc)) {
+			/*
+			 * multipath entry marked as faulty, unfaulty it
+			 */
+			rdev = find_rdev(mddev, dev);
+			if(rdev)
+				mark_disk_spare(desc);
+			else
+				remove_descriptor(desc, sb);
+		}
+
+		if (dev == MKDEV(0,0))
+			continue;
+		/*
+		 * Is this device present in the rdev ring?
+		 */
+		found = 0;
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			/*
+			 * Multi-path IO special-case: since we have no
+			 * this_disk descriptor at auto-detect time,
+			 * we cannot check rdev->number.
+			 * We can check the device though.
+			 */
+			if ((sb->level == -4) && (rdev->dev ==
+					MKDEV(desc->major,desc->minor))) {
+				found = 1;
+				break;
+			}
+			if (rdev->desc_nr == desc->number) {
+				found = 1;
+				break;
+			}
+		}
+		if (found)
+			continue;
+
+		printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
+			mdidx(mddev), partition_name(dev));
+		remove_descriptor(desc, sb);
+	}
+
+	/*
+	 * Double check whether all devices mentioned in the
+	 * superblock are in the rdev ring.
+	 */
+	first = 1;
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc;
+		kdev_t dev;
+
+		desc = sb->disks + i;
+		dev = MKDEV(desc->major, desc->minor);
+
+		if (dev == MKDEV(0,0))
+			continue;
+
+		if (disk_faulty(desc)) {
+			MD_BUG();
+			goto abort;
+		}
+
+		rdev = find_rdev(mddev, dev);
+		if (!rdev) {
+			MD_BUG();
+			goto abort;
+		}
+		/*
+		 * In the case of Multipath-IO, we have no
+		 * other information source to find out which
+		 * disk is which, only the position of the device
+		 * in the superblock:
+		 */
+		if (mddev->sb->level == -4) {
+			if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
+				MD_BUG();
+				goto abort;
+			}
+			rdev->desc_nr = i;
+			if (!first)
+				rdev->alias_device = 1;
+			else
+				first = 0;
+		}
+	}
+
+	/*
+	 * Kick all rdevs that are not in the
+	 * descriptor array:
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->desc_nr == -1)
+			kick_rdev_from_array(rdev);
+	}
+
+	/*
+	 * Do a final reality check.
+	 */
+	if (mddev->sb->level != -4) {
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			if (rdev->desc_nr == -1) {
+				MD_BUG();
+				goto abort;
+			}
+			/*
+			 * is the desc_nr unique?
+			 */
+			ITERATE_RDEV(mddev,rdev2,tmp2) {
+				if ((rdev2 != rdev) &&
+						(rdev2->desc_nr == rdev->desc_nr)) {
+					MD_BUG();
+					goto abort;
+				}
+			}
+			/*
+			 * is the device unique?
+ */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk(OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk(NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size: %ldk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -4: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { + readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (sb->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. 
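+	 *
+	 * (Worked example added in editing, figures illustrative: a
+	 * member holding 10007 KB with a 64 KB chunk_size should only
+	 * contribute whole chunks, i.e. 9984 KB; a member smaller than
+	 * one chunk is rejected outright by device_size_calculation()'s
+	 * check against sb->chunk_size / 1024 above.)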
+	 */
+	md_size[mdidx(mddev)] = 0;
+
+	/*
+	 * Analyze all RAID superblock(s)
+	 */
+	if (analyze_sbs(mddev)) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	chunk_size = mddev->sb->chunk_size;
+	pnum = level_to_pers(mddev->sb->level);
+
+	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+		if (!chunk_size) {
+			/*
+			 * 'default chunksize' in the old md code used to
+			 * be PAGE_SIZE, baaad.
+			 * we abort here to be on the safe side. We don't
+			 * want to continue the bad practice.
+			 */
+			printk(BAD_CHUNKSIZE);
+			return -EINVAL;
+		}
+		if (chunk_size > MAX_CHUNK_SIZE) {
+			printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+			return -EINVAL;
+		}
+		/*
+		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+		 */
+		if ( (1 << ffz(~chunk_size)) != chunk_size) {
+			MD_BUG();
+			return -EINVAL;
+		}
+		if (chunk_size < PAGE_SIZE) {
+			printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+			return -EINVAL;
+		}
+	} else
+		if (chunk_size)
+			printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
+			       mddev->sb->level);
+
+	if (pnum >= MAX_PERSONALITY) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	if (!pers[pnum])
+	{
+#ifdef CONFIG_KMOD
+		char module_name[80];
+		sprintf (module_name, "md-personality-%d", pnum);
+		request_module (module_name);
+		if (!pers[pnum])
+#endif
+		{
+			printk(KERN_ERR "md: personality %d is not loaded!\n",
+				pnum);
+			return -EINVAL;
+		}
+	}
+
+	if (device_size_calculation(mddev))
+		return -EINVAL;
+
+	/*
+	 * Drop all container device buffers, from now on
+	 * the only valid external interface is through the md
+	 * device.
+	 * Also find largest hardsector size
+	 */
+	md_hardsect_sizes[mdidx(mddev)] = 512;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty)
+			continue;
+		invalidate_device(rdev->dev, 1);
+		if (get_hardsect_size(rdev->dev)
+			> md_hardsect_sizes[mdidx(mddev)])
+			md_hardsect_sizes[mdidx(mddev)] =
+				get_hardsect_size(rdev->dev);
+	}
+	md_blocksizes[mdidx(mddev)] = 1024;
+	if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+		md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+	mddev->pers = pers[pnum];
+
+	blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+	mddev->queue.queuedata = mddev;
+
+	err = mddev->pers->run(mddev);
+	if (err) {
+		printk(KERN_ERR "md: pers->run() failed ...\n");
+		mddev->pers = NULL;
+		return -EINVAL;
+	}
+
+	mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+	mddev->sb_dirty = 1;
+	md_update_sb(mddev);
+
+	/*
+	 * md_size has units of 1K blocks, which are
+	 * twice as large as sectors.
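+	 * (Example added in editing: an array whose md_size is 4194304
+	 * 1K blocks, i.e. 4 GB, is registered below as 4194304<<1 =
+	 * 8388608 sectors of 512 bytes each.)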
+ */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)), + 1, &md_fops, md_size[mdidx(mddev)]<<1); + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +static int restart_array(mddev_t *mddev) +{ + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk(KERN_INFO + "md: md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" +#define STILL_IN_USE \ +"md: md%d still in use.\n" + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (atomic_read(&mddev->active)>1) { + printk(STILL_IN_USE, mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + invalidate_device(dev, 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_device_ro(dev, 1); + goto out; + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (!mddev->recovery_running && !resync_interrupted) { + printk(KERN_INFO "md: marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + free_mddev(mddev); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev)); + err = 0; +out: + return err; +} + +/* + * We have to safely support old arrays too. 
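+ *
+ * (Illustration added in editing: a v0.90.0 superblock passes
+ * detect_old_array() below, while a pre-0.90 one, say v0.63.0,
+ * is rejected with -EINVAL so autostart refuses to touch it.)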
+ */
+int detect_old_array(mdp_super_t *sb)
+{
+	if (sb->major_version > 0)
+		return 0;
+	if (sb->minor_version >= 90)
+		return 0;
+
+	return -EINVAL;
+}
+
+
+static void autorun_array(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
+	int err;
+
+	if (list_empty(&mddev->disks)) {
+		MD_BUG();
+		return;
+	}
+
+	printk(KERN_INFO "md: running: ");
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		printk("<%s>", partition_name(rdev->dev));
+	}
+	printk("\n");
+
+	err = do_md_run (mddev);
+	if (err) {
+		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+		/*
+		 * prevent the writeback of an unrunnable array
+		 */
+		mddev->sb_dirty = 0;
+		do_md_stop (mddev, 0);
+	}
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in the ->pending list)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(kdev_t countdev)
+{
+	struct md_list_head candidates;
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev0, *rdev;
+	mddev_t *mddev;
+	kdev_t md_kdev;
+
+
+	printk(KERN_INFO "md: autorun ...\n");
+	while (!list_empty(&pending_raid_disks)) {
+		rdev0 = md_list_entry(pending_raid_disks.next,
+					 mdk_rdev_t, pending);
+
+		printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
+		MD_INIT_LIST_HEAD(&candidates);
+		ITERATE_RDEV_PENDING(rdev,tmp) {
+			if (uuid_equal(rdev0, rdev)) {
+				if (!sb_equal(rdev0->sb, rdev->sb)) {
+					printk(KERN_WARNING
+					       "md: %s has same UUID as %s, but superblocks differ ...\n",
+					       partition_name(rdev->dev), partition_name(rdev0->dev));
+					continue;
+				}
+				printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
+				md_list_del(&rdev->pending);
+				md_list_add(&rdev->pending, &candidates);
+			}
+		}
+		/*
+		 * now we have a set of devices, with all of them having
+		 * mostly sane superblocks. It's time to allocate the
+		 * mddev.
+		 */
+		md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
+		mddev = kdev_to_mddev(md_kdev);
+		if (mddev) {
+			printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
+			       mdidx(mddev), partition_name(rdev0->dev));
+			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+				export_rdev(rdev);
+			continue;
+		}
+		mddev = alloc_mddev(md_kdev);
+		if (!mddev) {
+			printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
+			break;
+		}
+		if (md_kdev == countdev)
+			atomic_inc(&mddev->active);
+		printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+			bind_rdev_to_array(rdev, mddev);
+			list_del_init(&rdev->pending);
+		}
+		autorun_array(mddev);
+	}
+	printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
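+ *
+ * (Flow summary added in editing: autostart_array() imports the
+ * partition it was handed, checks its superblock version, then
+ * imports every other device named in that superblock's disk
+ * table and passes the whole pending set to autorun_devices().)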
+ */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array(kdev_t startdev, kdev_t countdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk(KERN_WARNING "md: array version is too old to be autostarted ," + "use raidtools 0.90 mkraid --upgrade to upgrade the array " + "without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n", + partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(countdev); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + + +static int get_version(void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info(mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info(mddev_t * 
mddev, void * arg)
+{
+	mdu_disk_info_t info;
+	unsigned int nr;
+
+	if (!mddev->sb)
+		return -EINVAL;
+
+	if (md_copy_from_user(&info, arg, sizeof(info)))
+		return -EFAULT;
+
+	nr = info.number;
+	if (nr >= MD_SB_DISKS)
+		return -EINVAL;
+
+	SET_FROM_SB(major);
+	SET_FROM_SB(minor);
+	SET_FROM_SB(raid_disk);
+	SET_FROM_SB(state);
+
+	if (md_copy_to_user(arg, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+	int err, size, persistent;
+	mdk_rdev_t *rdev;
+	unsigned int nr;
+	kdev_t dev;
+	dev = MKDEV(info->major,info->minor);
+
+	if (find_rdev_all(dev)) {
+		printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+		       partition_name(dev));
+		return -EBUSY;
+	}
+	if (!mddev->sb) {
+		/* expecting a device which has a superblock */
+		err = md_import_device(dev, 1);
+		if (err) {
+			printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+			return -EINVAL;
+		}
+		rdev = find_rdev_all(dev);
+		if (!rdev) {
+			MD_BUG();
+			return -EINVAL;
+		}
+		if (!list_empty(&mddev->disks)) {
+			mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+							mdk_rdev_t, same_set);
+			if (!uuid_equal(rdev0, rdev)) {
+				printk(KERN_WARNING "md: %s has different UUID to %s\n",
+				       partition_name(rdev->dev), partition_name(rdev0->dev));
+				export_rdev(rdev);
+				return -EINVAL;
+			}
+			if (!sb_equal(rdev0->sb, rdev->sb)) {
+				printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+				       partition_name(rdev->dev), partition_name(rdev0->dev));
+				export_rdev(rdev);
+				return -EINVAL;
+			}
+		}
+		bind_rdev_to_array(rdev, mddev);
+		return 0;
+	}
+
+	nr = info->number;
+	if (nr >= mddev->sb->nr_disks) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+
+	SET_SB(number);
+	SET_SB(major);
+	SET_SB(minor);
+	SET_SB(raid_disk);
+	SET_SB(state);
+
+	if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+		err = md_import_device (dev, 0);
+		if (err) {
+			printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+			return -EINVAL;
+		}
+		rdev = find_rdev_all(dev);
+		if (!rdev) {
+			MD_BUG();
+			return -EINVAL;
+		}
+
+		rdev->old_dev = dev;
+		rdev->desc_nr = info->number;
+
+		bind_rdev_to_array(rdev, mddev);
+
+		persistent = !mddev->sb->not_persistent;
+		if (!persistent)
+			printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+		size = calc_dev_size(dev, mddev, persistent);
+		rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+		if (!mddev->sb->size || (mddev->sb->size > size))
+			mddev->sb->size = size;
+	}
+
+	/*
+	 * sync all other superblocks with the main superblock
+	 */
+	sync_sbs(mddev);
+
+	return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+	struct request_queue *q;
+	mdk_rdev_t *rdev;
+	mdp_disk_t *disk;
+
+	if (!mddev->pers)
+		return -ENODEV;
+
+	printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+		partition_name(dev), mdidx(mddev));
+
+	rdev = find_rdev(mddev, dev);
+	if (!rdev) {
+		MD_BUG();
+		return -ENXIO;
+	}
+
+	if (rdev->desc_nr == -1) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	disk = &mddev->sb->disks[rdev->desc_nr];
+	if (!disk_active(disk))
+		return -ENODEV;
+
+	q = blk_get_queue(rdev->dev);
+	if (!q) {
+		MD_BUG();
+		return -ENODEV;
+	}
+	printk(KERN_INFO "md: okay, generating error!\n");
+//	q->oneshot_error = 1; // disabled for now
+
+	return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+	int err;
+	mdk_rdev_t *rdev;
+	mdp_disk_t *disk;
+
+	if (!mddev->pers)
+		return -ENODEV;
+
+	printk(KERN_INFO "md: trying to remove %s from md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + + if (disk_removed(disk)) + return -EINVAL; + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + err = -ENOSPC; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { + /* + * reuse slot + */ + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + } else { + disk->number = i; + } + + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
+	 */
+	md_recover_arrays();
+
+	return 0;
+
+abort_unbind_export:
+	unbind_rdev_from_array(rdev);
+
+abort_export:
+	export_rdev(rdev);
+	return err;
+}
+
+#define SET_SB(x) mddev->sb->x = info->x
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+	if (alloc_array_sb(mddev))
+		return -ENOMEM;
+
+	mddev->sb->major_version = MD_MAJOR_VERSION;
+	mddev->sb->minor_version = MD_MINOR_VERSION;
+	mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
+	mddev->sb->ctime = CURRENT_TIME;
+
+	SET_SB(level);
+	SET_SB(size);
+	SET_SB(nr_disks);
+	SET_SB(raid_disks);
+	SET_SB(md_minor);
+	SET_SB(not_persistent);
+
+	SET_SB(state);
+	SET_SB(active_disks);
+	SET_SB(working_disks);
+	SET_SB(failed_disks);
+	SET_SB(spare_disks);
+
+	SET_SB(layout);
+	SET_SB(chunk_size);
+
+	mddev->sb->md_magic = MD_SB_MAGIC;
+
+	/*
+	 * Generate a 128 bit UUID
+	 */
+	get_random_bytes(&mddev->sb->set_uuid0, 4);
+	get_random_bytes(&mddev->sb->set_uuid1, 4);
+	get_random_bytes(&mddev->sb->set_uuid2, 4);
+	get_random_bytes(&mddev->sb->set_uuid3, 4);
+
+	return 0;
+}
+#undef SET_SB
+
+static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
+{
+	int ret;
+
+	ret = md_error(mddev, dev);
+	return ret;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg)
+{
+	unsigned int minor;
+	int err = 0;
+	struct hd_geometry *loc = (struct hd_geometry *) arg;
+	mddev_t *mddev = NULL;
+	kdev_t dev;
+
+	if (!md_capable_admin())
+		return -EACCES;
+
+	dev = inode->i_rdev;
+	minor = MINOR(dev);
+	if (minor >= MAX_MD_DEVS) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	/*
+	 * Commands dealing with the RAID driver but not any
+	 * particular array:
+	 */
+	switch (cmd)
+	{
+		case RAID_VERSION:
+			err = get_version((void *)arg);
+			goto done;
+
+		case PRINT_RAID_DEBUG:
+			err = 0;
+			md_print_devices();
+			goto done_unlock;
+
+#ifndef MODULE
+		case RAID_AUTORUN:
+			err = 0;
+			autostart_arrays();
+			goto done;
+#endif
+
+		case BLKGETSIZE:
+		case BLKGETSIZE64:
+		case BLKRAGET:
+		case BLKRASET:
+		case BLKFLSBUF:
+		case BLKBSZGET:
+		case BLKBSZSET:
+			err = blk_ioctl (dev, cmd, arg);
+			goto abort;
+
+		default:;
+	}
+
+	/*
+	 * Commands creating/starting a new array:
+	 */
+
+	mddev = kdev_to_mddev(dev);
+
+	switch (cmd)
+	{
+		case SET_ARRAY_INFO:
+		case START_ARRAY:
+			if (mddev) {
+				printk(KERN_WARNING "md: array md%d already exists!\n",
+				       mdidx(mddev));
+				err = -EEXIST;
+				goto abort;
+			}
+		default:;
+	}
+	switch (cmd)
+	{
+		case SET_ARRAY_INFO:
+			mddev = alloc_mddev(dev);
+			if (!mddev) {
+				err = -ENOMEM;
+				goto abort;
+			}
+			atomic_inc(&mddev->active);
+
+			/*
+			 * alloc_mddev() should possibly self-lock.
+			 */
+			err = lock_mddev(mddev);
+			if (err) {
+				printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
+				       err, cmd);
+				goto abort;
+			}
+
+			if (mddev->sb) {
+				printk(KERN_WARNING "md: array md%d already has a superblock!\n",
+					mdidx(mddev));
+				err = -EBUSY;
+				goto abort_unlock;
+			}
+			if (arg) {
+				mdu_array_info_t info;
+				if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
+					err = -EFAULT;
+					goto abort_unlock;
+				}
+				err = set_array_info(mddev, &info);
+				if (err) {
+					printk(KERN_WARNING "md: couldn't set array info. %d\n", err);
+					goto abort_unlock;
+				}
+			}
+			goto done_unlock;
+
+		case START_ARRAY:
+			/*
+			 * possibly make it lock the array ...
+ */ + err = autostart_array((kdev_t)arg, dev); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + if (!(err = do_md_stop (mddev, 0))) + mddev = NULL; + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, (kdev_t)arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... 
+			 */
+			if (err) {
+				mddev->sb_dirty = 0;
+				if (!do_md_stop (mddev, 0))
+					mddev = NULL;
+			}
+			goto done_unlock;
+		}
+
+		default:
+			printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+			       "upgrade your software to use new ioctls.\n",
+			       current->comm, current->pid);
+			err = -EINVAL;
+			goto abort_unlock;
+	}
+
+done_unlock:
+abort_unlock:
+	if (mddev)
+		unlock_mddev(mddev);
+
+	return err;
+done:
+	if (err)
+		MD_BUG();
+abort:
+	return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * Always succeed, but increment the usage count
+	 */
+	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+	if (mddev)
+		atomic_inc(&mddev->active);
+	return (0);
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+	if (mddev)
+		atomic_dec(&mddev->active);
+	return 0;
+}
+
+static struct block_device_operations md_fops=
+{
+	owner:		THIS_MODULE,
+	open:		md_open,
+	release:	md_release,
+	ioctl:		md_ioctl,
+};
+
+
+int md_thread(void * arg)
+{
+	mdk_thread_t *thread = arg;
+
+	md_lock_kernel();
+
+	/*
+	 * Detach thread
+	 */
+
+	daemonize();
+
+	sprintf(current->comm, thread->name);
+	md_init_signals();
+	md_flush_signals();
+	thread->tsk = current;
+
+	/*
+	 * md_thread is a 'system-thread', its priority should be very
+	 * high. We avoid resource deadlocks individually in each
+	 * raid personality. (RAID5 does preallocation) We also use RR and
+	 * the very same RT priority as kswapd, thus we will never get
+	 * into a priority inversion deadlock.
+	 *
+	 * we definitely have to have equal or higher priority than
+	 * bdflush, otherwise bdflush will deadlock if there are too
+	 * many dirty RAID5 blocks.
+	 */
+	current->policy = SCHED_OTHER;
+	current->nice = -20;
+	md_unlock_kernel();
+
+	complete(thread->event);
+	while (thread->run) {
+		void (*run)(void *data);
+
+		wait_event_interruptible(thread->wqueue,
+					 test_bit(THREAD_WAKEUP, &thread->flags));
+
+		clear_bit(THREAD_WAKEUP, &thread->flags);
+
+		run = thread->run;
+		if (run) {
+			run(thread->data);
+			run_task_queue(&tq_disk);
+		}
+		if (md_signal_pending(current))
+			md_flush_signals();
+	}
+	complete(thread->event);
+	return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+	dprintk("md: waking up MD thread %p.\n", thread);
+	set_bit(THREAD_WAKEUP, &thread->flags);
+	wake_up(&thread->wqueue);
+}
+
+mdk_thread_t *md_register_thread(void (*run) (void *),
+						void *data, const char *name)
+{
+	mdk_thread_t *thread;
+	int ret;
+	struct completion event;
+
+	thread = (mdk_thread_t *) kmalloc
+				(sizeof(mdk_thread_t), GFP_KERNEL);
+	if (!thread)
+		return NULL;
+
+	memset(thread, 0, sizeof(mdk_thread_t));
+	md_init_waitqueue_head(&thread->wqueue);
+
+	init_completion(&event);
+	thread->event = &event;
+	thread->run = run;
+	thread->data = data;
+	thread->name = name;
+	ret = kernel_thread(md_thread, thread, 0);
+	if (ret < 0) {
+		kfree(thread);
+		return NULL;
+	}
+	wait_for_completion(&event);
+	return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+	if (!thread->tsk) {
+		MD_BUG();
+		return;
+	}
+	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+	send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+	struct completion event;
+
+	init_completion(&event);
+
+	thread->event = &event;
+	thread->run = NULL;
+	thread->name = NULL;
+	md_interrupt_thread(thread);
+	wait_for_completion(&event);
+	kfree(thread);
+}
+
+void md_recover_arrays(void)
+{
+	if (!md_recovery_thread) {
+		MD_BUG();
+		return;
+	}
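+	/* note added in editing: this wakes the singleton recovery
+	 * thread registered by md_init() below as "mdrecoveryd",
+	 * whose work function is md_do_recovery() */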
+	md_wakeup_thread(md_recovery_thread);
+}
+
+
+int md_error(mddev_t *mddev, kdev_t rdev)
+{
+	mdk_rdev_t * rrdev;
+
+	dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+		MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
+		__builtin_return_address(0),__builtin_return_address(1),
+		__builtin_return_address(2),__builtin_return_address(3));
+
+	if (!mddev) {
+		MD_BUG();
+		return 0;
+	}
+	rrdev = find_rdev(mddev, rdev);
+	if (!rrdev || rrdev->faulty)
+		return 0;
+	if (!mddev->pers->error_handler
+			|| mddev->pers->error_handler(mddev,rdev) <= 0) {
+		rrdev->faulty = 1;
+	} else
+		return 1;
+	/*
+	 * if recovery was running, stop it now.
+	 */
+	if (mddev->pers->stop_resync)
+		mddev->pers->stop_resync(mddev);
+	if (mddev->recovery_running)
+		md_interrupt_thread(md_recovery_thread);
+	md_recover_arrays();
+
+	return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+	int i = 0;
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
+
+	seq_printf(seq, "unused devices: ");
+
+	ITERATE_RDEV_ALL(rdev,tmp) {
+		if (list_empty(&rdev->same_set)) {
+			/*
+			 * The device is not yet used by any array.
+			 */
+			i++;
+			seq_printf(seq, "%s ",
+				partition_name(rdev->dev));
+		}
+	}
+	if (!i)
+		seq_printf(seq, "<none>");
+
+	seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+	unsigned long max_blocks, resync, res, dt, db, rt;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+	max_blocks = mddev->sb->size;
+
+	/*
+	 * Should not happen.
+	 */
+	if (!max_blocks)
+		MD_BUG();
+
+	res = (resync/1024)*1000/(max_blocks/1024 + 1);
+	{
+		int i, x = res/50, y = 20-x;
+		seq_printf(seq, "[");
+		for (i = 0; i < x; i++)
+			seq_printf(seq, "=");
+		seq_printf(seq, ">");
+		for (i = 0; i < y; i++)
+			seq_printf(seq, ".");
+		seq_printf(seq, "] ");
+	}
+	if (!mddev->recovery_running)
+		/*
+		 * true resync
+		 */
+		seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
+				res/10, res % 10, resync, max_blocks);
+	else
+		/*
+		 * recovery ...
+		 */
+		seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
+				res/10, res % 10, resync, max_blocks);
+
+	/*
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
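+	 * (Worked example added in editing, numbers illustrative: with
+	 * dt = 100s, db = 2000 blocks and 50000 blocks still to go,
+	 * rt = (100 * (50000 / (2000/100 + 1))) / 100 = 2380s; dividing
+	 * db by 100 first and multiplying by 100 last keeps every
+	 * intermediate product comfortably inside an unsigned long.)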
+ * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/dt); + +} + + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + mddev_t *mddev; + + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, mddev_t, all_mddevs); + return mddev; + } + return (void*)2;/* tail */ +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + mddev_t *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = list_entry(tmp,mddev_t,all_mddevs); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + int j, size; + struct md_list_head *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev = v; + + if (v == (void*)1) { + seq_printf(seq, "Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + seq_printf(seq, "[%s] ", pers[j]->name); + + seq_printf(seq, "\n"); + seq_printf(seq, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + seq_printf(seq, "not set\n"); + else + seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? 
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %d blocks", + md_size[mdidx(mddev)]); + else + seq_printf(seq, "\n %d blocks", size); + } + + if (mddev->pers) { + + mddev->pers->status (seq, mddev); + + seq_printf(seq, "\n "); + if (mddev->curr_resync) { + status_resync (seq, mddev); + } else { + if (sem_getcount(&mddev->resync_sem) != 1) + seq_printf(seq, " resync=DELAYED"); + } + } + seq_printf(seq, "\n"); + + return 0; +} + + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (pers[pnum]) { + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; +} + +mdp_disk_t *get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} + +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; +void md_sync_acct(kdev_t dev, unsigned long nr_sectors) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + sync_io[major][index] += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + int major = MAJOR(rdev->dev); + int idx = disk_index(rdev->dev); + + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + continue; + + curr_events = kstat.dk_drive_rblk[major][idx] + + kstat.dk_drive_wblk[major][idx] ; + curr_events -= sync_io[major][idx]; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + // stop recovery, signal do_sync .... 
+		if (mddev->pers->stop_resync)
+			mddev->pers->stop_resync(mddev);
+		if (mddev->recovery_running)
+			md_interrupt_thread(md_recovery_thread);
+	}
+}
+
+#define SYNC_MARKS	10
+#define	SYNC_MARK_STEP	(3*HZ)
+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+{
+	mddev_t *mddev2;
+	unsigned int max_sectors, currspeed,
+		j, window, err, serialize;
+	unsigned long mark[SYNC_MARKS];
+	unsigned long mark_cnt[SYNC_MARKS];
+	int last_mark,m;
+	struct md_list_head *tmp;
+	unsigned long last_check;
+
+
+	err = down_interruptible(&mddev->resync_sem);
+	if (err)
+		goto out_nolock;
+
+recheck:
+	serialize = 0;
+	ITERATE_MDDEV(mddev2,tmp) {
+		if (mddev2 == mddev)
+			continue;
+		if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
+			printk(KERN_INFO "md: delaying resync of md%d until md%d "
+			       "has finished resync (they share one or more physical units)\n",
+			       mdidx(mddev), mdidx(mddev2));
+			serialize = 1;
+			break;
+		}
+	}
+	if (serialize) {
+		interruptible_sleep_on(&resync_wait);
+		if (md_signal_pending(current)) {
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
+		}
+		goto recheck;
+	}
+
+	mddev->curr_resync = 1;
+
+	max_sectors = mddev->sb->size<<1;
+
+	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+	       sysctl_speed_limit_min);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+	       "(but not more than %d KB/sec) for reconstruction.\n",
+	       sysctl_speed_limit_max);
+
+	/*
+	 * Resync has low priority.
+	 */
+	current->nice = 19;
+
+	is_mddev_idle(mddev); /* this also initializes IO event counters */
+	for (m = 0; m < SYNC_MARKS; m++) {
+		mark[m] = jiffies;
+		mark_cnt[m] = 0;
+	}
+	last_mark = 0;
+	mddev->resync_mark = mark[last_mark];
+	mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+	/*
+	 * Tune reconstruction:
+	 */
+	window = vm_max_readahead*(PAGE_SIZE/512);
+	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+	       window/2,max_sectors/2);
+
+	atomic_set(&mddev->recovery_active, 0);
+	init_waitqueue_head(&mddev->recovery_wait);
+	last_check = 0;
+	for (j = 0; j < max_sectors;) {
+		int sectors;
+
+		sectors = mddev->pers->sync_request(mddev, j);
+
+		if (sectors < 0) {
+			err = sectors;
+			goto out;
+		}
+		atomic_add(sectors, &mddev->recovery_active);
+		j += sectors;
+		mddev->curr_resync = j;
+
+		if (last_check + window > j)
+			continue;
+
+		last_check = j;
+
+		run_task_queue(&tq_disk);
+
+	repeat:
+		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+			/* step marks */
+			int next = (last_mark+1) % SYNC_MARKS;
+
+			mddev->resync_mark = mark[next];
+			mddev->resync_mark_cnt = mark_cnt[next];
+			mark[next] = jiffies;
+			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+			last_mark = next;
+		}
+
+
+		if (md_signal_pending(current)) {
+			/*
+			 * got a signal, exit.
+			 */
+			mddev->curr_resync = 0;
+			printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
+		}
+
+		/*
+		 * this loop exits only when we are slower than
+		 * the 'hard' speed limit, or the system was IO-idle for
+		 * a jiffy.
+		 * the system might be non-idle CPU-wise, but we only care
+		 * about not overloading the IO subsystem. 
(things like an + * e2fsck being done on the RAID array should execute fast) + */ + if (md_need_resched(current)) + schedule(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + current->nice = 19; + + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(HZ/4); + goto repeat; + } + } else + current->nice = -20; + } + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0); + up(&mddev->resync_sem); +out_nolock: + mddev->curr_resync = 0; + wake_up(&resync_wait); + return err; +} + + +/* + * This is a kernel thread which syncs a spare disk with the active array + * + * the amount of foolproofing might seem to be a tad excessive, but an + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs + * of my root partition with the first 0.5 gigs of my /home partition ... so + * i'm a bit nervous ;) + */ +void md_do_recovery(void *data) +{ + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; + struct md_list_head *tmp; + + printk(KERN_INFO "md: recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) + continue; + if (sb->active_disks == sb->raid_disks) + continue; + if (mddev->sb_dirty) + md_update_sb(mddev); + if (!sb->spare_disks) { + printk(KERN_ERR "md%d: no spare disk to reconstruct array! " + "-- continuing in degraded mode\n", mdidx(mddev)); + continue; + } + /* + * now here we get the spare and resync it. + */ + spare = get_spare(mddev); + if (!spare) + continue; + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", + mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) + continue; + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) + continue; + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = md_do_sync(mddev, spare); + if (err == -EIO) { + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", + mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR || err == -ENOMEM) { + /* + * Recovery got interrupted, or ran out of mem ... + * signal back that we have finished using the array. 
+ */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; + continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); + } + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + goto restart; + } + printk(KERN_INFO "md: recovery thread finished ...\n"); + +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + notifier_call: md_notify_reboot, + next: NULL, + priority: INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_size[i] = 0; + md_hardsect_sizes[i] = 512; + } + blksize_size[MAJOR_NR] = md_blocksizes; + blk_size[MAJOR_NR] = md_size; + max_readahead[MAJOR_NR] = md_maxreadahead; + hardsect_size[MAJOR_NR] = md_hardsect_sizes; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +request_queue_t * md_queue_proc(kdev_t dev) +{ + mddev_t *mddev = kdev_to_mddev(dev); + if (mddev == NULL) + return BLK_DEFAULT_QUEUE(MAJOR_NR); + else + return &mddev->queue; +} + +int md__init md_init(void) +{ + static char * name = "mdrecoveryd"; + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) + { + printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR); + return (-1); + } + devfs_handle = devfs_mk_dir (NULL, "md", NULL); + /* we don't use devfs_register_series because we want to fill md_hd_struct */ + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char devname[128]; + sprintf (devname, "%u", minor); + md_hd_struct[minor].de = devfs_register (devfs_handle, + devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + /* all requests on an uninitialised device get failed... 
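-- md_fail_request simply errors them out, and md_queue_proc() above hands this default queue back for any minor without an mddev (note added in editing) 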
*/
+	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+	blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+	read_ahead[MAJOR_NR] = INT_MAX;
+
+	add_gendisk(&md_gendisk);
+
+	md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+	if (!md_recovery_thread)
+		printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+	md_register_reboot_notifier(&md_notifier);
+	raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+	md_geninit();
+	return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not a module), arrays can be assembled at boot time using AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT, where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+struct {
+	int set;
+	int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+	if (dev_cnt >= 0 && dev_cnt < 127)
+		detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+	mdk_rdev_t *rdev;
+	int i;
+
+	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+	for (i = 0; i < dev_cnt; i++) {
+		kdev_t dev = detected_devices[i];
+
+		if (md_import_device(dev,1)) {
+			printk(KERN_ALERT "md: could not import %s!\n",
+				partition_name(dev));
+			continue;
+		}
+		/*
+		 * Sanity checks:
+		 */
+		rdev = find_rdev_all(dev);
+		if (!rdev) {
+			MD_BUG();
+			continue;
+		}
+		if (rdev->faulty) {
+			MD_BUG();
+			continue;
+		}
+		md_list_add(&rdev->pending, &pending_raid_disks);
+	}
+	dev_cnt = 0;
+
+	autorun_devices(-1);
+}
+
+static struct {
+	char device_set [MAX_MD_DEVS];
+	int pers[MAX_MD_DEVS];
+	int chunk[MAX_MD_DEVS];
+	char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ *             assigns the task of parsing integer arguments to the
+ *             invoked program now). Added ability to initialise all
+ *             the MD devices (by specifying multiple "md=" lines)
+ *             instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ *             md=n,0,factor,fault,device-list   uses RAID0 for device n
+ *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
+ *             md=n,device-list      reads a RAID superblock from the devices
+ *             elements in device-list are read by name_to_kdev_t so can be
+ *             a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ *             Shifted name_to_kdev_t() and related operations to md_set_drive()
+ *             for later execution. Rewrote section to make devfs compatible.
+ */
+static int md__init md_setup(char *str)
+{
+	int minor, level, factor, fault;
+	char *pername = "";
+	char *str1 = str;
+
+	if (get_option(&str, &minor) != 2) {	/* MD Number */
+		printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+		return 0;
+	}
+	if (minor >= MAX_MD_DEVS) {
+		printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+		return 0;
+	} else if (md_setup_args.device_names[minor]) {
+		printk(KERN_WARNING "md: md=%d, Specified more than once. 
" + "Replacing previous definition.\n", minor); + } + switch (get_option(&str, &level)) { /* RAID Personality */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == -1) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args.pers[minor] = level; + md_setup_args.chunk[minor] = 1 << (factor+12); + switch(level) { + case -1: + level = LINEAR; + pername = "linear"; + break; + case 0: + level = RAID0; + pername = "raid0"; + break; + default: + printk(KERN_WARNING + "md: The kernel has not been configured for raid%d support!\n", + level); + return 0; + } + md_setup_args.pers[minor] = level; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args.pers[minor] = 0; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args.device_names[minor] = str; + + return 1; +} + +extern kdev_t name_to_kdev_t(char *line) md__init; +void md__init md_setup_drive(void) +{ + int minor, i; + kdev_t dev; + mddev_t*mddev; + kdev_t devices[MD_SB_DISKS+1]; + + for (minor = 0; minor < MAX_MD_DEVS; minor++) { + int err = 0; + char *devname; + mdu_disk_info_t dinfo; + + if ((devname = md_setup_args.device_names[minor]) == 0) continue; + + for (i = 0; i < MD_SB_DISKS && devname != 0; i++) { + + char *p; + void *handle; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_kdev_t(devname); + handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev), + DEVFS_SPECIAL_BLK, 1); + if (handle != 0) { + unsigned major, minor; + devfs_get_maj_min(handle, &major, &minor); + dev = MKDEV(major, minor); + } + if (dev == 0) { + printk(KERN_WARNING "md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + md_setup_args.device_set[minor] = 1; + + devname = p; + } + devices[i] = 0; + + if (md_setup_args.device_set[minor] == 0) + continue; + + if (mddev_map[minor]) { + printk(KERN_WARNING + "md: Ignoring md=%d, already autodetected. 
(Use raid=noautodetect)\n", + minor); + continue; + } + printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]); + + mddev = alloc_mddev(MKDEV(MD_MAJOR,minor)); + if (!mddev) { + printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor); + continue; + } + if (md_setup_args.pers[minor]) { + /* non-persistent */ + mdu_array_info_t ainfo; + ainfo.level = pers_to_level(md_setup_args.pers[minor]); + ainfo.size = 0; + ainfo.nr_disks =0; + ainfo.raid_disks =0; + ainfo.md_minor =minor; + ainfo.not_persistent = 1; + + ainfo.state = (1 << MD_SB_CLEAN); + ainfo.active_disks = 0; + ainfo.working_disks = 0; + ainfo.failed_disks = 0; + ainfo.spare_disks = 0; + ainfo.layout = 0; + ainfo.chunk_size = md_setup_args.chunk[minor]; + err = set_array_info(mddev, &ainfo); + for (i = 0; !err && (dev = devices[i]); i++) { + dinfo.number = i; + dinfo.raid_disk = i; + dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC); + dinfo.major = MAJOR(dev); + dinfo.minor = MINOR(dev); + mddev->sb->nr_disks++; + mddev->sb->raid_disks++; + mddev->sb->active_disks++; + mddev->sb->working_disks++; + err = add_new_disk (mddev, &dinfo); + } + } else { + /* persistent */ + for (i = 0; (dev = devices[i]); i++) { + dinfo.major = MAJOR(dev); + dinfo.minor = MINOR(dev); + add_new_disk (mddev, &dinfo); + } + } + if (!err) + err = do_md_run(mddev); + if (err) { + mddev->sb_dirty = 0; + do_md_stop(mddev, 0); + printk(KERN_WARNING "md: starting md%d failed\n", minor); + } + } +} + +static int md__init raid_setup(char *str) +{ + int len, pos; + + len = strlen(str) + 1; + pos = 0; + + while (pos < len) { + char *comma = strchr(str+pos, ','); + int wlen; + if (comma) + wlen = (comma-str)-pos; + else wlen = (len-1)-pos; + + if (strncmp(str, "noautodetect", wlen) == 0) + raid_setup_args.noautodetect = 1; + pos += wlen+1; + } + raid_setup_args.set = 1; + return 1; +} + +int md__init md_run_setup(void) +{ + if (raid_setup_args.noautodetect) + printk(KERN_INFO "md: Skipping autodetection of RAID arrays. 
(raid=noautodetect)\n"); + else + autostart_arrays(); + md_setup_drive(); + return 0; +} + +__setup("raid=", raid_setup); +__setup("md=", md_setup); + +__initcall(md_init); +__initcall(md_run_setup); + +#else /* It is a MODULE */ + +int init_module(void) +{ + return md_init(); +} + +static void free_device_names(void) +{ + while (!list_empty(&device_names)) { + struct dname *tmp = list_entry(device_names.next, + dev_name_t, list); + list_del(&tmp->list); + kfree(tmp); + } +} + + +void cleanup_module(void) +{ + md_unregister_thread(md_recovery_thread); + devfs_unregister(devfs_handle); + + devfs_unregister_blkdev(MAJOR_NR,"md"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("mdstat", NULL); +#endif + + del_gendisk(&md_gendisk); + + blk_dev[MAJOR_NR].queue = NULL; + blksize_size[MAJOR_NR] = NULL; + blk_size[MAJOR_NR] = NULL; + max_readahead[MAJOR_NR] = NULL; + hardsect_size[MAJOR_NR] = NULL; + + free_device_names(); + +} +#endif + +MD_EXPORT_SYMBOL(md_size); +MD_EXPORT_SYMBOL(register_md_personality); +MD_EXPORT_SYMBOL(unregister_md_personality); +MD_EXPORT_SYMBOL(partition_name); +MD_EXPORT_SYMBOL(md_error); +MD_EXPORT_SYMBOL(md_do_sync); +MD_EXPORT_SYMBOL(md_sync_acct); +MD_EXPORT_SYMBOL(md_done_sync); +MD_EXPORT_SYMBOL(md_recover_arrays); +MD_EXPORT_SYMBOL(md_register_thread); +MD_EXPORT_SYMBOL(md_unregister_thread); +MD_EXPORT_SYMBOL(md_update_sb); +MD_EXPORT_SYMBOL(md_wakeup_thread); +MD_EXPORT_SYMBOL(md_print_devices); +MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(md_interrupt_thread); +MD_EXPORT_SYMBOL(mddev_map); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/md-loop/2 b/tests/linux/md-loop/2 new file mode 100644 index 0000000..fc01423 --- /dev/null +++ b/tests/linux/md-loop/2 @@ -0,0 +1,3949 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> + - kerneld support by Boris Tobotras <boris@xtalk.msk.su> + - kmod support by: Cyrus Durgin <cider@speakeasy.org> + - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> + - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown <neilb@cse.unsw.edu.au>. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/raid/md.h> +#include <linux/sysctl.h> +#include <linux/raid/xor.h> +#include <linux/devfs_fs_kernel.h> + +#include <linux/init.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> + +#include <asm/unaligned.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER + +#include <linux/blk.h> + +#define DEBUG 0 +#if DEBUG +# define dprintk(x...) printk(x) +#else +# define dprintk(x...) do { } while(0) +#endif + +#ifndef MODULE +static void autostart_arrays (void); +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY]; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. 
Note that + * the RAID driver will use the maximum available bandwidth if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min", + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max", + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table raid_dir_table[] = { + {DEV_RAID, "raid", NULL, 0, 0555, raid_table}, + {0} +}; + +static ctl_table raid_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table}, + {0} +}; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_hardsect_sizes[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread; + +int md_size[MAX_MD_DEVS]; + +static struct block_device_operations md_fops; +static devfs_handle_t devfs_handle; + +static struct gendisk md_gendisk= +{ + major: MD_MAJOR, + major_name: "md", + minor_shift: 0, + max_p: 1, + part: md_hd_struct, + sizes: md_size, + nr_real: MAX_MD_DEVS, + real_devices: NULL, + next: NULL, + fops: &md_fops, +}; + +/* + * Enables iteration over all existing md arrays + */ +static MD_LIST_HEAD(all_mddevs); + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static inline mddev_t * kdev_to_mddev (kdev_t dev) +{ + if (MAJOR(dev) != MD_MAJOR) + BUG(); + return mddev_map[MINOR(dev)]; +} + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio); + return 0; +} + +static mddev_t * alloc_mddev(kdev_t dev) +{ + mddev_t *mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); + + mddev->__minor = MINOR(dev); + init_MUTEX(&mddev->reconfig_sem); + init_MUTEX(&mddev->recovery_sem); + init_MUTEX(&mddev->resync_sem); + MD_INIT_LIST_HEAD(&mddev->disks); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + atomic_set(&mddev->active, 0); + + mddev_map[mdidx(mddev)] = mddev; + md_list_add(&mddev->all_mddevs, &all_mddevs); + + MOD_INC_USE_COUNT; + + return mddev; +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +char * partition_name(kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = "<nomem>"; + dev_name_t *dname; + struct md_list_head *tmp; + + list_for_each(tmp, &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = get_gendisk (dev); + dname->name = NULL; + if (hd) + dname->name = 
disk_name (hd, MINOR(dev), dname->namebuf); + if (!dname->name) { + sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); + dname->name = dname->namebuf; + } + + dname->dev = dev; + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int zoned_raid_size(mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + md_size[mdidx(mddev)] += rdev->size; + } + return 0; +} + +static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} + +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" + +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb(mddev_t * mddev) +{ + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page(mddev->sb); + return 0; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(OUT_OF_MEM); + return -EINVAL; + } + rdev->sb = (mdp_super_t *) page_address(rdev->sb_page); + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb = NULL; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + + +static void bh_complete(struct buffer_head *bh, int uptodate) +{ + + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + + complete((struct completion*)bh->b_private); +} + +static int sync_page_io(kdev_t dev, unsigned long sector, int size, + struct page *page, int rw) +{ + struct buffer_head bh; + struct completion event; + + init_completion(&event); + init_buffer(&bh, bh_complete, &event); + bh.b_rdev = dev; + bh.b_rsector = sector; + bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock); + bh.b_size = size; + bh.b_page = page; + bh.b_reqnext = NULL; + bh.b_data = page_address(page); + generic_make_request(rw, &bh); + + run_task_queue(&tq_disk); + wait_for_completion(&event); + + return test_bit(BH_Uptodate, &bh.b_state); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + kdev_t dev = rdev->dev; + unsigned long sb_offset; + + if (!rdev->sb) { + MD_BUG(); 
+ goto abort; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) { + printk(NO_SB,partition_name(dev)); + return -EINVAL; + } + printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + ret = 0; +abort: + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb(mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk(BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(BAD_CSUM, partition_name(rdev->dev)); + goto abort; + } + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = get_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev)); +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. 
[simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev(mdk_rdev_t *rdev) +{ + int err = 0; + struct block_device *bdev; + + bdev = bdget(rdev->dev); + if (!bdev) + return -ENOMEM; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + if (!err) + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + blkdev_put(bdev, BDEV_RAW); +} + +void md_autodetect_dev(kdev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ + printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + list_del_init(&rdev->all); + if (!list_empty(&rdev->pending)) { + printk(KERN_INFO "md: (%s was pending)\n", + partition_name(rdev->dev)); + list_del_init(&rdev->pending); + } +#ifndef MODULE + md_autodetect_dev(rdev->dev); +#endif + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); +} + +static void free_mddev(mddev_t *mddev) +{ + if (!mddev) { + MD_BUG(); + return; + } + + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; + + /* + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) + */ + while (sem_getcount(&mddev->resync_sem) != 1) + schedule(); + while (sem_getcount(&mddev->recovery_sem) != 1) + schedule(); + + del_mddev_mapping(mddev, mk_kdev(MD_MAJOR, mdidx(mddev))); + md_list_del(&mddev->all_mddevs); + kfree(mddev); + MOD_DEC_USE_COUNT; +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + printk(KERN_INFO); + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || + desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(KERN_INFO "md: THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + 
rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk(KERN_INFO "md: rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk("md: **********************************\n"); + printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); + printk("md: **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk("md: **********************************\n"); + printk("\n"); +} + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all(kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + list_for_each(tmp, &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + kdev_t dev; + unsigned long sb_offset, size; + + if (!rdev->sb) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return 1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n", + partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * its size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n", + partition_name(dev), rdev->size, size); + goto skip; + } + + printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset); + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) { + printk("md: write_disk_sb failed for device %s\n", partition_name(dev)); + return 1; + } +skip: + return 0; +} +#undef GETBLK_FAILED + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty || rdev->alias_device) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +int md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + if (!mddev->sb_dirty) { + printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0)); + return 0; + } + mddev->sb_dirty = 0; +repeat: + mddev->sb->utime = CURRENT_TIME; + if ((++mddev->sb->events_lo)==0) + ++mddev->sb->events_hi; + + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + printk(KERN_INFO "md: "); + if (rdev->faulty) + printk("(skipping faulty "); + if (rdev->alias_device) + printk("(skipping alias "); + if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) { + printk("(skipping new-faulty %s )\n", + partition_name(rdev->dev)); + continue; + } + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty && !rdev->alias_device) { + printk("[events: %08lx]", + (unsigned long)rdev->sb->events_lo); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + if (err) { + if (--count) { + printk(KERN_ERR "md: errors occurred during superblock update, repeating\n"); + goto repeat; + } + printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n"); + } + return 0; +} + +/* + * Import a device. 
If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + */ +static int md_import_device(kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (is_mounted(newdev)) { + printk(KERN_WARNING "md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk(KERN_WARNING "md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + if (rdev->sb->level != -4) { + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } else { + rdev->old_dev = MKDEV(0, 0); + rdev->desc_nr = -1; + } + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + INIT_LIST_HEAD(&rdev->same_set); + + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs(mddev_t * mddev) +{ + int out_of_date = 0, i, first; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk(INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. 
+ */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. (decrease it's age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + if (rdev->sb->events_lo || rdev->sb->events_hi) + if ((rdev->sb->events_lo--)==0) + rdev->sb->events_hi--; + } + + printk(KERN_INFO "md: %s's event counter: %08lx\n", + partition_name(rdev->dev), + (unsigned long)rdev->sb->events_lo); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = md_event(rdev->sb); + ev2 = md_event(freshest->sb); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices + */ + __u64 ev1, ev2; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ++ev1; + if (ev1 < ev2) { + printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + if (mddev->sb->level != -4) + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty || rdev->alias_device) { + MD_BUG(); + goto abort; + } + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n", + partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + * + * Note: multipath devices are a special case. Since we + * were able to read the superblock on the path, we don't + * care if it was previously marked as faulty, it's up now + * so enable it. 
+ */ + if (disk_faulty(desc) && mddev->sb->level != -4) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk(KERN_WARNING "md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk(KERN_WARNING "md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } else if (disk_faulty(desc)) { + /* + * multipath entry marked as faulty, unfaulty it + */ + rdev = find_rdev(mddev, dev); + if(rdev) + mark_disk_spare(desc); + else + remove_descriptor(desc, sb); + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Multi-path IO special-case: since we have no + * this_disk descriptor at auto-detect time, + * we cannot check rdev->number. + * We can check the device though. + */ + if ((sb->level == -4) && (rdev->dev == + MKDEV(desc->major,desc->minor))) { + found = 1; + break; + } + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n", + mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); + } + + /* + * Double check whether all devices mentioned in the + * superblock are in the rdev ring. + */ + first = 1; + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); + goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + /* + * In the case of Multipath-IO, we have no + * other information source to find out which + * disk is which, only the position of the device + * in the superblock: + */ + if (mddev->sb->level == -4) { + if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) { + MD_BUG(); + goto abort; + } + rdev->desc_nr = i; + if (!first) + rdev->alias_device = 1; + else + first = 0; + } + } + + /* + * Kick all rdevs that are not in the + * descriptor array: + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) + kick_rdev_from_array(rdev); + } + + /* + * Do a final reality check. + */ + if (mddev->sb->level != -4) { + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique? 
+ */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk(OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk(NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size: %ldk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -4: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { + readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (sb->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. 
+ */ + md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We dont + * want to continue the bad practice. + */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + } else + if (chunk_size) + printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n", + mddev->sb->level); + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + { + printk(KERN_ERR "md: personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + } + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsector size + */ + md_hardsect_sizes[mdidx(mddev)] = 512; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + invalidate_device(rdev->dev, 1); + if (get_hardsect_size(rdev->dev) + > md_hardsect_sizes[mdidx(mddev)]) + md_hardsect_sizes[mdidx(mddev)] = + get_hardsect_size(rdev->dev); + } + md_blocksizes[mdidx(mddev)] = 1024; + if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)]) + md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)]; + mddev->pers = pers[pnum]; + + blk_queue_make_request(&mddev->queue, mddev->pers->make_request); + mddev->queue.queuedata = mddev; + + err = mddev->pers->run(mddev); + if (err) { + printk(KERN_ERR "md: pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } + + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors. 
+ */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)), + 1, &md_fops, md_size[mdidx(mddev)]<<1); + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +static int restart_array(mddev_t *mddev) +{ + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk(KERN_INFO + "md: md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" +#define STILL_IN_USE \ +"md: md%d still in use.\n" + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (atomic_read(&mddev->active)>1) { + printk(STILL_IN_USE, mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + invalidate_device(dev, 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_device_ro(dev, 1); + goto out; + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (!mddev->recovery_running && !resync_interrupted) { + printk(KERN_INFO "md: marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + free_mddev(mddev); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev)); + err = 0; +out: + return err; +} + +/* + * We have to safely support old arrays too. 
+ */ +int detect_old_array(mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(kdev_t countdev) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk(KERN_WARNING + "md: %s has same UUID as %s, but superblocks differ ...\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev) { + printk(KERN_WARNING "md: md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + continue; + } + mddev = alloc_mddev(md_kdev); + if (!mddev) { + printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); + break; + } + if (md_kdev == countdev) + atomic_inc(&mddev->active); + printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + list_del_init(&rdev->pending); + } + autorun_array(mddev); + } + printk(KERN_INFO "md: ... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. 
+ */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array(kdev_t startdev, kdev_t countdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk(KERN_WARNING "md: array version is too old to be autostarted ," + "use raidtools 0.90 mkraid --upgrade to upgrade the array " + "without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n", + partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(countdev); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + + +static int get_version(void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info(mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info(mddev_t * 
mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= MD_SB_DISKS) + return -EINVAL; + + SET_FROM_SB(major); + SET_FROM_SB(minor); + SET_FROM_SB(raid_disk); + SET_FROM_SB(state); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_SB(x) mddev->sb->disks[nr].x = info->x + +static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) +{ + int err, size, persistent; + mdk_rdev_t *rdev; + unsigned int nr; + kdev_t dev; + dev = MKDEV(info->major,info->minor); + + if (find_rdev_all(dev)) { + printk(KERN_WARNING "md: device %s already used in a RAID array!\n", + partition_name(dev)); + return -EBUSY; + } + if (!mddev->sb) { + /* expecting a device which has a superblock */ + err = md_import_device(dev, 1); + if (err) { + printk(KERN_WARNING "md: md_import_device returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (!list_empty(&mddev->disks)) { + mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next, + mdk_rdev_t, same_set); + if (!uuid_equal(rdev0, rdev)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + } + bind_rdev_to_array(rdev, mddev); + return 0; + } + + nr = info->number; + if (nr >= mddev->sb->nr_disks) { + MD_BUG(); + return -EINVAL; + } + + + SET_SB(number); + SET_SB(major); + SET_SB(minor); + SET_SB(raid_disk); + SET_SB(state); + + if ((info->state & (1<<MD_DISK_FAULTY))==0) { + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + + rdev->old_dev = dev; + rdev->desc_nr = info->number; + + bind_rdev_to_array(rdev, mddev); + + persistent = !mddev->sb->not_persistent; + if (!persistent) + printk(KERN_INFO "md: nonpersistent superblock ...\n"); + + size = calc_dev_size(dev, mddev, persistent); + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + if (!mddev->sb->size || (mddev->sb->size > size)) + mddev->sb->size = size; + } + + /* + * sync all other superblocks with the main superblock + */ + sync_sbs(mddev); + + return 0; +} +#undef SET_SB + +static int hot_generate_error(mddev_t * mddev, kdev_t dev) +{ + struct request_queue *q; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to generate %s error in md%d ... \n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + return -ENXIO; + } + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (!disk_active(disk)) + return -ENODEV; + + q = blk_get_queue(rdev->dev); + if (!q) { + MD_BUG(); + return -ENODEV; + } + printk(KERN_INFO "md: okay, generating error!\n"); +// q->oneshot_error = 1; // disabled for now + + return 0; +} + +static int hot_remove_disk(mddev_t * mddev, kdev_t dev) +{ + int err; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to remove %s from md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + + if (disk_removed(disk)) + return -EINVAL; + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + err = -ENOSPC; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { + /* + * reuse slot + */ + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + } else { + disk->number = i; + } + + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
+ */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +#define SET_SB(x) mddev->sb->x = info->x +static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +{ + + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; +} +#undef SET_SB + +static int set_disk_faulty(mddev_t *mddev, kdev_t dev) +{ + int ret; + + ret = md_error(mddev, dev); + return ret; +} + +static int md_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!md_capable_admin()) + return -EACCES; + + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) { + MD_BUG(); + return -EINVAL; + } + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done_unlock; + +#ifndef MODULE + case RAID_AUTORUN: + err = 0; + autostart_arrays(); + goto done; +#endif + + case BLKGETSIZE: + case BLKGETSIZE64: + case BLKRAGET: + case BLKRASET: + case BLKFLSBUF: + case BLKBSZGET: + case BLKBSZSET: + err = blk_ioctl (dev, cmd, arg); + goto abort; + + default:; + } + + /* + * Commands creating/starting a new array: + */ + + mddev = kdev_to_mddev(dev); + + switch (cmd) + { + case SET_ARRAY_INFO: + case START_ARRAY: + if (mddev) { + printk(KERN_WARNING "md: array md%d already exists!\n", + mdidx(mddev)); + err = -EEXIST; + goto abort; + } + default:; + } + switch (cmd) + { + case SET_ARRAY_INFO: + mddev = alloc_mddev(dev); + if (!mddev) { + err = -ENOMEM; + goto abort; + } + atomic_inc(&mddev->active); + + /* + * alloc_mddev() should possibly self-lock. + */ + err = lock_mddev(mddev); + if (err) { + printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n", + err, cmd); + goto abort; + } + + if (mddev->sb) { + printk(KERN_WARNING "md: array md%d already has a superblock!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (arg) { + mdu_array_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldnt set array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + case START_ARRAY: + /* + * possibly make it lock the array ... 
+ */ + err = autostart_array((kdev_t)arg, dev); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + if (!(err = do_md_stop (mddev, 0))) + mddev = NULL; + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, (kdev_t)arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... 
+ */ + if (err) { + mddev->sb_dirty = 0; + if (!do_md_stop (mddev, 0)) + mddev = NULL; + } + goto done_unlock; + } + + default: + printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, " + "upgrade your software to use new ictls.\n", + current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + if (mddev) + unlock_mddev(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ + /* + * Always succeed, but increment the usage count + */ + mddev_t *mddev = kdev_to_mddev(inode->i_rdev); + if (mddev) + atomic_inc(&mddev->active); + return (0); +} + +static int md_release(struct inode *inode, struct file * file) +{ + mddev_t *mddev = kdev_to_mddev(inode->i_rdev); + if (mddev) + atomic_dec(&mddev->active); + return 0; +} + +static struct block_device_operations md_fops= +{ + owner: THIS_MODULE, + open: md_open, + release: md_release, + ioctl: md_ioctl, +}; + + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + md_lock_kernel(); + + /* + * Detach thread + */ + + daemonize(); + + sprintf(current->comm, thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + current->policy = SCHED_OTHER; + current->nice = -20; + md_unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run)(void *data); + + wait_event_interruptible(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags)); + + clear_bit(THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->data); + run_task_queue(&tq_disk); + } + if (md_signal_pending(current)) + md_flush_signals(); + } + complete(thread->event); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + dprintk("md: waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); +} + +mdk_thread_t *md_register_thread(void (*run) (void *), + void *data, const char *name) +{ + mdk_thread_t *thread; + int ret; + struct completion event; + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + md_init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->data = data; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +void md_interrupt_thread(mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + md_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +void md_recover_arrays(void) +{ + if (!md_recovery_thread) { + MD_BUG(); + return; + } + 
md_wakeup_thread(md_recovery_thread); +} + + +int md_error(mddev_t *mddev, kdev_t rdev) +{ + mdk_rdev_t * rrdev; + + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev), + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); + + if (!mddev) { + MD_BUG(); + return 0; + } + rrdev = find_rdev(mddev, rdev); + if (!rrdev || rrdev->faulty) + return 0; + if (!mddev->pers->error_handler + || mddev->pers->error_handler(mddev,rdev) <= 0) { + rrdev->faulty = 1; + } else + return 1; + /* + * if recovery was running, stop it now. + */ + if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + md_recover_arrays(); + + return 0; +} + +static void status_unused(struct seq_file *seq) +{ + int i = 0; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + seq_printf(seq, "unused devices: "); + + ITERATE_RDEV_ALL(rdev,tmp) { + if (list_empty(&rdev->same_set)) { + /* + * The device is not yet used by any array. + */ + i++; + seq_printf(seq, "%s ", + partition_name(rdev->dev)); + } + } + if (!i) + seq_printf(seq, "<none>"); + + seq_printf(seq, "\n"); +} + + +static void status_resync(struct seq_file *seq, mddev_t * mddev) +{ + unsigned long max_blocks, resync, res, dt, db, rt; + + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + max_blocks = mddev->sb->size; + + /* + * Should not happen. + */ + if (!max_blocks) + MD_BUG(); + + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + seq_printf(seq, "["); + for (i = 0; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + if (!mddev->recovery_running) + /* + * true resync + */ + seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)", + res/10, res % 10, resync, max_blocks); + else + /* + * recovery ... + */ + seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)", + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. 
+ * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/dt); + +} + + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + mddev_t *mddev; + + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, mddev_t, all_mddevs); + return mddev; + } + return (void*)2;/* tail */ +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + mddev_t *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = list_entry(tmp,mddev_t,all_mddevs); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + int j, size; + struct md_list_head *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev = v; + + if (v == (void*)1) { + seq_printf(seq, "Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + seq_printf(seq, "[%s] ", pers[j]->name); + + seq_printf(seq, "\n"); + seq_printf(seq, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + seq_printf(seq, "not set\n"); + else + seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? 
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %d blocks", + md_size[mdidx(mddev)]); + else + seq_printf(seq, "\n %d blocks", size); + } + + if (mddev->pers) { + + mddev->pers->status (seq, mddev); + + seq_printf(seq, "\n "); + if (mddev->curr_resync) { + status_resync (seq, mddev); + } else { + if (sem_getcount(&mddev->resync_sem) != 1) + seq_printf(seq, " resync=DELAYED"); + } + } + seq_printf(seq, "\n"); + + return 0; +} + + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (pers[pnum]) { + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; +} + +mdp_disk_t *get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} + +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; +void md_sync_acct(kdev_t dev, unsigned long nr_sectors) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + sync_io[major][index] += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + int major = MAJOR(rdev->dev); + int idx = disk_index(rdev->dev); + + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + continue; + + curr_events = kstat.dk_drive_rblk[major][idx] + + kstat.dk_drive_wblk[major][idx] ; + curr_events -= sync_io[major][idx]; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + // stop recovery, signal do_sync .... 
+ if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + } +} + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +int md_do_sync(mddev_t *mddev, mdp_disk_t *spare) +{ + mddev_t *mddev2; + unsigned int max_sectors, currspeed, + j, window, err, serialize; + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct md_list_head *tmp; + unsigned long last_check; + + + err = down_interruptible(&mddev->resync_sem); + if (err) + goto out_nolock; + +recheck: + serialize = 0; + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: delaying resync of md%d until md%d " + "has finished resync (they share one or more physical units)\n", + mdidx(mddev), mdidx(mddev2)); + serialize = 1; + break; + } + } + if (serialize) { + interruptible_sleep_on(&resync_wait); + if (md_signal_pending(current)) { + md_flush_signals(); + err = -EINTR; + goto out; + } + goto recheck; + } + + mddev->curr_resync = 1; + + max_sectors = mddev->sb->size<<1; + + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n", + sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwith " + "(but not more than %d KB/sec) for reconstruction.\n", + sysctl_speed_limit_max); + + /* + * Resync has low priority. + */ + current->nice = 19; + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = 0; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = vm_max_readahead*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", + window/2,max_sectors/2); + + atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); + last_check = 0; + for (j = 0; j < max_sectors;) { + int sectors; + + sectors = mddev->pers->sync_request(mddev, j); + + if (sectors < 0) { + err = sectors; + goto out; + } + atomic_add(sectors, &mddev->recovery_active); + j += sectors; + mddev->curr_resync = j; + + if (last_check + window > j) + continue; + + last_check = j; + + run_task_queue(&tq_disk); + + repeat: + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (md_signal_pending(current)) { + /* + * got a signal, exit. + */ + mddev->curr_resync = 0; + printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n"); + md_flush_signals(); + err = -EINTR; + goto out; + } + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. 
(things like an + * e2fsck being done on the RAID array should execute fast) + */ + if (md_need_resched(current)) + schedule(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + current->nice = 19; + + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(HZ/4); + goto repeat; + } + } else + current->nice = -20; + } + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0); + up(&mddev->resync_sem); +out_nolock: + mddev->curr_resync = 0; + wake_up(&resync_wait); + return err; +} + + +/* + * This is a kernel thread which syncs a spare disk with the active array + * + * the amount of foolproofing might seem to be a tad excessive, but an + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs + * of my root partition with the first 0.5 gigs of my /home partition ... so + * i'm a bit nervous ;) + */ +void md_do_recovery(void *data) +{ + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; + struct md_list_head *tmp; + + printk(KERN_INFO "md: recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) + continue; + if (sb->active_disks == sb->raid_disks) + continue; + if (mddev->sb_dirty) + md_update_sb(mddev); + if (!sb->spare_disks) { + printk(KERN_ERR "md%d: no spare disk to reconstruct array! " + "-- continuing in degraded mode\n", mdidx(mddev)); + continue; + } + /* + * now here we get the spare and resync it. + */ + spare = get_spare(mddev); + if (!spare) + continue; + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", + mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) + continue; + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) + continue; + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = md_do_sync(mddev, spare); + if (err == -EIO) { + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", + mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR || err == -ENOMEM) { + /* + * Recovery got interrupted, or ran out of mem ... + * signal back that we have finished using the array. 
+ */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; + continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); + } + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + goto restart; + } + printk(KERN_INFO "md: recovery thread finished ...\n"); + +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + notifier_call: md_notify_reboot, + next: NULL, + priority: INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_size[i] = 0; + md_hardsect_sizes[i] = 512; + } + blksize_size[MAJOR_NR] = md_blocksizes; + blk_size[MAJOR_NR] = md_size; + max_readahead[MAJOR_NR] = md_maxreadahead; + hardsect_size[MAJOR_NR] = md_hardsect_sizes; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +request_queue_t * md_queue_proc(kdev_t dev) +{ + mddev_t *mddev = kdev_to_mddev(dev); + if (mddev == NULL) + return BLK_DEFAULT_QUEUE(MAJOR_NR); + else + return &mddev->queue; +} + +int md__init md_init(void) +{ + static char * name = "mdrecoveryd"; + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) + { + printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR); + return (-1); + } + devfs_handle = devfs_mk_dir (NULL, "md", NULL); + /* we don't use devfs_register_series because we want to fill md_hd_struct */ + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char devname[128]; + sprintf (devname, "%u", minor); + md_hd_struct[minor].de = devfs_register (devfs_handle, + devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + /* all requests on an uninitialised device get failed... 
*/ + blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request); + blk_dev[MAJOR_NR].queue = md_queue_proc; + + + read_ahead[MAJOR_NR] = INT_MAX; + + add_gendisk(&md_gendisk); + + md_recovery_thread = md_register_thread(md_do_recovery, NULL, name); + if (!md_recovery_thread) + printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n"); + + md_register_reboot_notifier(&md_notifier); + raid_table_header = register_sysctl_table(raid_root_table, 1); + + md_geninit(); + return (0); +} + + +#ifndef MODULE + +/* + * When md (and any require personalities) are compiled into the kernel + * (not a module), arrays can be assembles are boot time using with AUTODETECT + * where specially marked partitions are registered with md_autodetect_dev(), + * and with MD_BOOT where devices to be collected are given on the boot line + * with md=..... + * The code for that is here. + */ + +struct { + int set; + int noautodetect; +} raid_setup_args md__initdata; + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +static kdev_t detected_devices[128]; +static int dev_cnt; + +void md_autodetect_dev(kdev_t dev) +{ + if (dev_cnt >= 0 && dev_cnt < 127) + detected_devices[dev_cnt++] = dev; +} + + +static void autostart_arrays(void) +{ + mdk_rdev_t *rdev; + int i; + + printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + + for (i = 0; i < dev_cnt; i++) { + kdev_t dev = detected_devices[i]; + + if (md_import_device(dev,1)) { + printk(KERN_ALERT "md: could not import %s!\n", + partition_name(dev)); + continue; + } + /* + * Sanity checks: + */ + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + dev_cnt = 0; + + autorun_devices(-1); +} + +static struct { + char device_set [MAX_MD_DEVS]; + int pers[MAX_MD_DEVS]; + int chunk[MAX_MD_DEVS]; + char *device_names[MAX_MD_DEVS]; +} md_setup_args md__initdata; + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the MD device now; that is handled by + * md_setup_drive after the low-level disk drivers have initialised. + * + * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which + * assigns the task of parsing integer arguments to the + * invoked program now). Added ability to initialise all + * the MD devices (by specifying multiple "md=" lines) + * instead of just one. -- KTK + * 18May2000: Added support for persistant-superblock arrays: + * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n + * md=n,device-list reads a RAID superblock from the devices + * elements in device-list are read by name_to_kdev_t so can be + * a hex number or something like /dev/hda1 /dev/sdb + * 2001-06-03: Dave Cinege + * Shifted name_to_kdev_t() and related operations to md_set_drive() + * for later execution. Rewrote section to make devfs compatible. + */ +static int md__init md_setup(char *str) +{ + int minor, level, factor, fault; + char *pername = ""; + char *str1 = str; + + if (get_option(&str, &minor) != 2) { /* MD Number */ + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + if (minor >= MAX_MD_DEVS) { + printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor); + return 0; + } else if (md_setup_args.device_names[minor]) { + printk(KERN_WARNING "md: md=%d, Specified more then once. 
" + "Replacing previous definition.\n", minor); + } + switch (get_option(&str, &level)) { /* RAID Personality */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == -1) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args.pers[minor] = level; + md_setup_args.chunk[minor] = 1 << (factor+12); + switch(level) { + case -1: + level = LINEAR; + pername = "linear"; + break; + case 0: + level = RAID0; + pername = "raid0"; + break; + default: + printk(KERN_WARNING + "md: The kernel has not been configured for raid%d support!\n", + level); + return 0; + } + md_setup_args.pers[minor] = level; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args.pers[minor] = 0; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args.device_names[minor] = str; + + return 1; +} + +extern kdev_t name_to_kdev_t(char *line) md__init; +void md__init md_setup_drive(void) +{ + int minor, i; + kdev_t dev; + mddev_t*mddev; + kdev_t devices[MD_SB_DISKS+1]; + + for (minor = 0; minor < MAX_MD_DEVS; minor++) { + int err = 0; + char *devname; + mdu_disk_info_t dinfo; + + if ((devname = md_setup_args.device_names[minor]) == 0) continue; + + for (i = 0; i < MD_SB_DISKS && devname != 0; i++) { + + char *p; + void *handle; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_kdev_t(devname); + handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev), + DEVFS_SPECIAL_BLK, 1); + if (handle != 0) { + unsigned major, minor; + devfs_get_maj_min(handle, &major, &minor); + dev = MKDEV(major, minor); + } + if (dev == 0) { + printk(KERN_WARNING "md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + md_setup_args.device_set[minor] = 1; + + devname = p; + } + devices[i] = 0; + + if (md_setup_args.device_set[minor] == 0) + continue; + + if (mddev_map[minor]) { + printk(KERN_WARNING + "md: Ignoring md=%d, already autodetected. 
(Use raid=noautodetect)\n", + minor); + continue; + } + printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]); + + mddev = alloc_mddev(MKDEV(MD_MAJOR,minor)); + if (!mddev) { + printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor); + continue; + } + if (md_setup_args.pers[minor]) { + /* non-persistent */ + mdu_array_info_t ainfo; + ainfo.level = pers_to_level(md_setup_args.pers[minor]); + ainfo.size = 0; + ainfo.nr_disks =0; + ainfo.raid_disks =0; + ainfo.md_minor =minor; + ainfo.not_persistent = 1; + + ainfo.state = (1 << MD_SB_CLEAN); + ainfo.active_disks = 0; + ainfo.working_disks = 0; + ainfo.failed_disks = 0; + ainfo.spare_disks = 0; + ainfo.layout = 0; + ainfo.chunk_size = md_setup_args.chunk[minor]; + err = set_array_info(mddev, &ainfo); + for (i = 0; !err && (dev = devices[i]); i++) { + dinfo.number = i; + dinfo.raid_disk = i; + dinfo.state = (1<sb->nr_disks++; + mddev->sb->raid_disks++; + mddev->sb->active_disks++; + mddev->sb->working_disks++; + err = add_new_disk (mddev, &dinfo); + } + } else { + /* persistent */ + for (i = 0; (dev = devices[i]); i++) { + dinfo.major = MAJOR(dev); + dinfo.minor = MINOR(dev); + add_new_disk (mddev, &dinfo); + } + } + if (!err) + err = do_md_run(mddev); + if (err) { + mddev->sb_dirty = 0; + do_md_stop(mddev, 0); + printk(KERN_WARNING "md: starting md%d failed\n", minor); + } + } +} + +static int md__init raid_setup(char *str) +{ + int len, pos; + + len = strlen(str) + 1; + pos = 0; + + while (pos < len) { + char *comma = strchr(str+pos, ','); + int wlen; + if (comma) + wlen = (comma-str)-pos; + else wlen = (len-1)-pos; + + if (strncmp(str, "noautodetect", wlen) == 0) + raid_setup_args.noautodetect = 1; + pos += wlen+1; + } + raid_setup_args.set = 1; + return 1; +} + +int md__init md_run_setup(void) +{ + if (raid_setup_args.noautodetect) + printk(KERN_INFO "md: Skipping autodetection of RAID arrays. 
(raid=noautodetect)\n"); + else + autostart_arrays(); + md_setup_drive(); + return 0; +} + +__setup("raid=", raid_setup); +__setup("md=", md_setup); + +__initcall(md_init); +__initcall(md_run_setup); + +#else /* It is a MODULE */ + +int init_module(void) +{ + return md_init(); +} + +static void free_device_names(void) +{ + while (!list_empty(&device_names)) { + struct dname *tmp = list_entry(device_names.next, + dev_name_t, list); + list_del(&tmp->list); + kfree(tmp); + } +} + + +void cleanup_module(void) +{ + md_unregister_thread(md_recovery_thread); + devfs_unregister(devfs_handle); + + devfs_unregister_blkdev(MAJOR_NR,"md"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("mdstat", NULL); +#endif + + del_gendisk(&md_gendisk); + + blk_dev[MAJOR_NR].queue = NULL; + blksize_size[MAJOR_NR] = NULL; + blk_size[MAJOR_NR] = NULL; + max_readahead[MAJOR_NR] = NULL; + hardsect_size[MAJOR_NR] = NULL; + + free_device_names(); + +} +#endif + +MD_EXPORT_SYMBOL(md_size); +MD_EXPORT_SYMBOL(register_md_personality); +MD_EXPORT_SYMBOL(unregister_md_personality); +MD_EXPORT_SYMBOL(partition_name); +MD_EXPORT_SYMBOL(md_error); +MD_EXPORT_SYMBOL(md_do_sync); +MD_EXPORT_SYMBOL(md_sync_acct); +MD_EXPORT_SYMBOL(md_done_sync); +MD_EXPORT_SYMBOL(md_recover_arrays); +MD_EXPORT_SYMBOL(md_register_thread); +MD_EXPORT_SYMBOL(md_unregister_thread); +MD_EXPORT_SYMBOL(md_update_sb); +MD_EXPORT_SYMBOL(md_wakeup_thread); +MD_EXPORT_SYMBOL(md_print_devices); +MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(md_interrupt_thread); +EXPORT_SYMBOL(mddev_map); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/md-loop/merge b/tests/linux/md-loop/merge new file mode 100644 index 0000000..682ed20 --- /dev/null +++ b/tests/linux/md-loop/merge @@ -0,0 +1,3960 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher + - Devfs support by Richard Gooch + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown . + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_KMOD +#include +#endif + +#define __KERNEL_SYSCALLS__ +#include + +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER + +#include + +#define DEBUG 0 +#if DEBUG +# define dprintk(x...) printk(x) +#else +# define dprintk(x...) do { } while(0) +#endif + +#ifndef MODULE +static void autostart_arrays (void); +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY]; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. 
Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min", + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max", + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table raid_dir_table[] = { + {DEV_RAID, "raid", NULL, 0, 0555, raid_table}, + {0} +}; + +static ctl_table raid_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table}, + {0} +}; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_hardsect_sizes[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread; + +int md_size[MAX_MD_DEVS]; + +static struct block_device_operations md_fops; +static devfs_handle_t devfs_handle; + +static struct gendisk md_gendisk= +{ + major: MD_MAJOR, + major_name: "md", + minor_shift: 0, + max_p: 1, + part: md_hd_struct, + sizes: md_size, + nr_real: MAX_MD_DEVS, + real_devices: NULL, + next: NULL, + fops: &md_fops, +}; + +/* + * Enables to iterate over all existing md arrays + */ +static MD_LIST_HEAD(all_mddevs); + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static inline mddev_t * kdev_to_mddev (kdev_t dev) +{ + if (MAJOR(dev) != MD_MAJOR) + BUG(); + return mddev_map[MINOR(dev)]; +} + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio); + return 0; +} + +static mddev_t * alloc_mddev(kdev_t dev) +{ + mddev_t *mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); + + mddev->__minor = MINOR(dev); + init_MUTEX(&mddev->reconfig_sem); + init_MUTEX(&mddev->recovery_sem); + init_MUTEX(&mddev->resync_sem); + MD_INIT_LIST_HEAD(&mddev->disks); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + atomic_set(&mddev->active, 0); + + mddev_map[mdidx(mddev)] = mddev; + md_list_add(&mddev->all_mddevs, &all_mddevs); + + MOD_INC_USE_COUNT; + + return mddev; +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +char * partition_name(kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = "<nomem>"; + dev_name_t *dname; + struct md_list_head *tmp; + + list_for_each(tmp, &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = get_gendisk (dev); + dname->name = NULL; + if (hd) + dname->name = 
disk_name (hd, MINOR(dev), dname->namebuf); + if (!dname->name) { + sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); + dname->name = dname->namebuf; + } + + dname->dev = dev; + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int zoned_raid_size(mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + md_size[mdidx(mddev)] += rdev->size; + } + return 0; +} + +static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} + +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" + +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb(mddev_t * mddev) +{ + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page(mddev->sb); + return 0; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(OUT_OF_MEM); + return -EINVAL; + } + rdev->sb = (mdp_super_t *) page_address(rdev->sb_page); + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb = NULL; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + + +static void bh_complete(struct buffer_head *bh, int uptodate) +{ + + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + + complete((struct completion*)bh->b_private); +} + +static int sync_page_io(kdev_t dev, unsigned long sector, int size, + struct page *page, int rw) +{ + struct buffer_head bh; + struct completion event; + + init_completion(&event); + init_buffer(&bh, bh_complete, &event); + bh.b_rdev = dev; + bh.b_rsector = sector; + bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock); + bh.b_size = size; + bh.b_page = page; + bh.b_reqnext = NULL; + bh.b_data = page_address(page); + generic_make_request(rw, &bh); + + run_task_queue(&tq_disk); + wait_for_completion(&event); + + return test_bit(BH_Uptodate, &bh.b_state); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + kdev_t dev = rdev->dev; + unsigned long sb_offset; + + if (!rdev->sb) { + MD_BUG(); 
+ goto abort; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) { + printk(NO_SB,partition_name(dev)); + return -EINVAL; + } + printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + ret = 0; +abort: + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb(mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk(BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(BAD_CSUM, partition_name(rdev->dev)); + goto abort; + } + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = get_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev)); +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. 
[simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev(mdk_rdev_t *rdev) +{ + int err = 0; + struct block_device *bdev; + + bdev = bdget(rdev->dev); + if (!bdev) + return -ENOMEM; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + if (!err) + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + blkdev_put(bdev, BDEV_RAW); +} + +void md_autodetect_dev(kdev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ + printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + list_del_init(&rdev->all); + if (!list_empty(&rdev->pending)) { + printk(KERN_INFO "md: (%s was pending)\n", + partition_name(rdev->dev)); + list_del_init(&rdev->pending); + } +#ifndef MODULE + md_autodetect_dev(rdev->dev); +#endif + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); +} + +static void free_mddev(mddev_t *mddev) +{ + if (!mddev) { + MD_BUG(); + return; + } + + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; + + /* + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) + */ + while (sem_getcount(&mddev->resync_sem) != 1) + schedule(); + while (sem_getcount(&mddev->recovery_sem) != 1) + schedule(); + +<<<<<<< + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); +||||||| + del_mddev_mapping(mddev, mk_kdev(MD_MAJOR, mdidx(mddev))); +======= + mddev_map[mdidx(mddev)] = NULL; +>>>>>>> + md_list_del(&mddev->all_mddevs); + kfree(mddev); + MOD_DEC_USE_COUNT; +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + printk(KERN_INFO); + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || + desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(KERN_INFO "md: THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ 
+ printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk(KERN_INFO "md: rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk("md: **********************************\n"); + printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); + printk("md: **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk("md: **********************************\n"); + printk("\n"); +} + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all(kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + list_for_each(tmp, &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + kdev_t dev; + unsigned long sb_offset, size; + + if (!rdev->sb) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return 1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n", + partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * its size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n", + partition_name(dev), rdev->size, size); + goto skip; + } + + printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset); + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) { + printk("md: write_disk_sb failed for device %s\n", partition_name(dev)); + return 1; + } +skip: + return 0; +} +#undef GETBLK_FAILED + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty || rdev->alias_device) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +int md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + if (!mddev->sb_dirty) { + printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0)); + return 0; + } + mddev->sb_dirty = 0; +repeat: + mddev->sb->utime = CURRENT_TIME; + if ((++mddev->sb->events_lo)==0) + ++mddev->sb->events_hi; + + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + printk(KERN_INFO "md: "); + if (rdev->faulty) + printk("(skipping faulty "); + if (rdev->alias_device) + printk("(skipping alias "); + if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) { + printk("(skipping new-faulty %s )\n", + partition_name(rdev->dev)); + continue; + } + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty && !rdev->alias_device) { + printk("[events: %08lx]", + (unsigned long)rdev->sb->events_lo); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + if (err) { + if (--count) { + printk(KERN_ERR "md: errors occurred during superblock update, repeating\n"); + goto repeat; + } + printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n"); + } + return 0; +} + +/* + * Import a device. 
If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + */ +static int md_import_device(kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (is_mounted(newdev)) { + printk(KERN_WARNING "md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk(KERN_WARNING "md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + if (rdev->sb->level != -4) { + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } else { + rdev->old_dev = MKDEV(0, 0); + rdev->desc_nr = -1; + } + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + INIT_LIST_HEAD(&rdev->same_set); + + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs(mddev_t * mddev) +{ + int out_of_date = 0, i, first; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk(INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. 
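+ * (Example with assumed counters: members at events 42, 42 and 40 -> + * the events == 42 superblock is copied into mddev->sb, and the + * events == 40 member is kicked below as non-fresh, since 40+1 < 42.)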
+ */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. (decrease its age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + if (rdev->sb->events_lo || rdev->sb->events_hi) + if ((rdev->sb->events_lo--)==0) + rdev->sb->events_hi--; + } + + printk(KERN_INFO "md: %s's event counter: %08lx\n", + partition_name(rdev->dev), + (unsigned long)rdev->sb->events_lo); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = md_event(rdev->sb); + ev2 = md_event(freshest->sb); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices + */ + __u64 ev1, ev2; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ++ev1; + if (ev1 < ev2) { + printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + if (mddev->sb->level != -4) + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty || rdev->alias_device) { + MD_BUG(); + goto abort; + } + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n", + partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + * + * Note: multipath devices are a special case. Since we + * were able to read the superblock on the path, we don't + * care if it was previously marked as faulty, it's up now + * so enable it.
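+ * (A superblock level of -4 denotes multipath throughout this file.)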
+ */ + if (disk_faulty(desc) && mddev->sb->level != -4) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk(KERN_WARNING "md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk(KERN_WARNING "md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } else if (disk_faulty(desc)) { + /* + * multipath entry marked as faulty, unfaulty it + */ + rdev = find_rdev(mddev, dev); + if (rdev) + mark_disk_spare(desc); + else + remove_descriptor(desc, sb); + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Multi-path IO special-case: since we have no + * this_disk descriptor at auto-detect time, + * we cannot check rdev->number. + * We can check the device though. + */ + if ((sb->level == -4) && (rdev->dev == + MKDEV(desc->major,desc->minor))) { + found = 1; + break; + } + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n", + mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); + } + + /* + * Double check whether all devices mentioned in the + * superblock are in the rdev ring. + */ + first = 1; + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); + goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + /* + * In the case of Multipath-IO, we have no + * other information source to find out which + * disk is which, only the position of the device + * in the superblock: + */ + if (mddev->sb->level == -4) { + if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) { + MD_BUG(); + goto abort; + } + rdev->desc_nr = i; + if (!first) + rdev->alias_device = 1; + else + first = 0; + } + } + + /* + * Kick all rdevs that are not in the + * descriptor array: + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) + kick_rdev_from_array(rdev); + } + + /* + * Do a final reality check. + */ + if (mddev->sb->level != -4) { + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique?
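+ * (no two rdevs of one array may share a kdev_t; the loop below + * calls MD_BUG() if they do)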
+ */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk(OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk(NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size: %ldk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -4: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { + readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (sb->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. 
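+ * (the actual per-device sizes are recomputed by + * device_size_calculation() below, which also rejects devices + * smaller than the chunk size)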
+ */ + md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We don't + * want to continue the bad practice. + */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + } else + if (chunk_size) + printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n", + mddev->sb->level); + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + { + printk(KERN_ERR "md: personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + } + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsector size + */ + md_hardsect_sizes[mdidx(mddev)] = 512; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + invalidate_device(rdev->dev, 1); + if (get_hardsect_size(rdev->dev) + > md_hardsect_sizes[mdidx(mddev)]) + md_hardsect_sizes[mdidx(mddev)] = + get_hardsect_size(rdev->dev); + } + md_blocksizes[mdidx(mddev)] = 1024; + if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)]) + md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)]; + mddev->pers = pers[pnum]; + + blk_queue_make_request(&mddev->queue, mddev->pers->make_request); + mddev->queue.queuedata = mddev; + + err = mddev->pers->run(mddev); + if (err) { + printk(KERN_ERR "md: pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } + + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors.
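+ * (hence md_size[...]<<1 in the register_disk() call below; + * e.g. a 4096k array registers as 8192 sectors)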
+ */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)), + 1, &md_fops, md_size[mdidx(mddev)]<<1); + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +static int restart_array(mddev_t *mddev) +{ + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk(KERN_INFO + "md: md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" +#define STILL_IN_USE \ +"md: md%d still in use.\n" + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (atomic_read(&mddev->active)>1) { + printk(STILL_IN_USE, mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + invalidate_device(dev, 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_device_ro(dev, 1); + goto out; + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (!mddev->recovery_running && !resync_interrupted) { + printk(KERN_INFO "md: marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + free_mddev(mddev); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev)); + err = 0; +out: + return err; +} + +/* + * We have to safely support old arrays too. 
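+ * (detect_old_array() below rejects any 0.x superblock older than + * 0.90, and autostart_array() then refuses to autostart it)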
+ */ +int detect_old_array(mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md: do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * let's try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(kdev_t countdev) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk(KERN_WARNING + "md: %s has same UUID as %s, but superblocks differ ...\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev) { + printk(KERN_WARNING "md: md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + continue; + } + mddev = alloc_mddev(md_kdev); + if (!mddev) { + printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); + break; + } + if (md_kdev == countdev) + atomic_inc(&mddev->active); + printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + list_del_init(&rdev->pending); + } + autorun_array(mddev); + } + printk(KERN_INFO "md: ... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well.
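+ * (autostart_array() below imports the partition handed to it, then + * walks that partition's superblock disk table and imports every + * other member before handing the set to autorun_devices())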
+ */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array(kdev_t startdev, kdev_t countdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk(KERN_WARNING "md: array version is too old to be autostarted, " + "use raidtools 0.90 mkraid --upgrade to upgrade the array " + "without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n", + partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(countdev); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + + +static int get_version(void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info(mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info(mddev_t *
mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= MD_SB_DISKS) + return -EINVAL; + + SET_FROM_SB(major); + SET_FROM_SB(minor); + SET_FROM_SB(raid_disk); + SET_FROM_SB(state); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_SB(x) mddev->sb->disks[nr].x = info->x + +static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) +{ + int err, size, persistent; + mdk_rdev_t *rdev; + unsigned int nr; + kdev_t dev; + dev = MKDEV(info->major,info->minor); + + if (find_rdev_all(dev)) { + printk(KERN_WARNING "md: device %s already used in a RAID array!\n", + partition_name(dev)); + return -EBUSY; + } + if (!mddev->sb) { + /* expecting a device which has a superblock */ + err = md_import_device(dev, 1); + if (err) { + printk(KERN_WARNING "md: md_import_device returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (!list_empty(&mddev->disks)) { + mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next, + mdk_rdev_t, same_set); + if (!uuid_equal(rdev0, rdev)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + } + bind_rdev_to_array(rdev, mddev); + return 0; + } + + nr = info->number; + if (nr >= mddev->sb->nr_disks) { + MD_BUG(); + return -EINVAL; + } + + + SET_SB(number); + SET_SB(major); + SET_SB(minor); + SET_SB(raid_disk); + SET_SB(state); + + if ((info->state & (1<<MD_DISK_FAULTY))==0) { + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + + rdev->old_dev = dev; + rdev->desc_nr = info->number; + + bind_rdev_to_array(rdev, mddev); + + persistent = !mddev->sb->not_persistent; + if (!persistent) + printk(KERN_INFO "md: nonpersistent superblock ...\n"); + + size = calc_dev_size(dev, mddev, persistent); + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + if (!mddev->sb->size || (mddev->sb->size > size)) + mddev->sb->size = size; + } + + /* + * sync all other superblocks with the main superblock + */ + sync_sbs(mddev); + + return 0; +} +#undef SET_SB + +static int hot_generate_error(mddev_t * mddev, kdev_t dev) +{ + struct request_queue *q; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to generate %s error in md%d ... \n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + return -ENXIO; + } + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (!disk_active(disk)) + return -ENODEV; + + q = blk_get_queue(rdev->dev); + if (!q) { + MD_BUG(); + return -ENODEV; + } + printk(KERN_INFO "md: okay, generating error!\n"); +// q->oneshot_error = 1; // disabled for now + + return 0; +} + +static int hot_remove_disk(mddev_t * mddev, kdev_t dev) +{ + int err; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to remove %s from md%d ...
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + + if (disk_removed(disk)) + return -EINVAL; + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + err = -ENOSPC; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { + /* + * reuse slot + */ + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + } else { + disk->number = i; + } + + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
+ */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +#define SET_SB(x) mddev->sb->x = info->x +static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +{ + + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; +} +#undef SET_SB + +static int set_disk_faulty(mddev_t *mddev, kdev_t dev) +{ + int ret; + + ret = md_error(mddev, dev); + return ret; +} + +static int md_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!md_capable_admin()) + return -EACCES; + + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) { + MD_BUG(); + return -EINVAL; + } + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done_unlock; + +#ifndef MODULE + case RAID_AUTORUN: + err = 0; + autostart_arrays(); + goto done; +#endif + + case BLKGETSIZE: + case BLKGETSIZE64: + case BLKRAGET: + case BLKRASET: + case BLKFLSBUF: + case BLKBSZGET: + case BLKBSZSET: + err = blk_ioctl (dev, cmd, arg); + goto abort; + + default:; + } + + /* + * Commands creating/starting a new array: + */ + + mddev = kdev_to_mddev(dev); + + switch (cmd) + { + case SET_ARRAY_INFO: + case START_ARRAY: + if (mddev) { + printk(KERN_WARNING "md: array md%d already exists!\n", + mdidx(mddev)); + err = -EEXIST; + goto abort; + } + default:; + } + switch (cmd) + { + case SET_ARRAY_INFO: + mddev = alloc_mddev(dev); + if (!mddev) { + err = -ENOMEM; + goto abort; + } + atomic_inc(&mddev->active); + + /* + * alloc_mddev() should possibly self-lock. + */ + err = lock_mddev(mddev); + if (err) { + printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n", + err, cmd); + goto abort; + } + + if (mddev->sb) { + printk(KERN_WARNING "md: array md%d already has a superblock!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (arg) { + mdu_array_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't set array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + case START_ARRAY: + /* + * possibly make it lock the array ...
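+ * (the ioctl argument is the kdev_t of one component partition; + * autostart_array() above does the actual work)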
+ */ + err = autostart_array((kdev_t)arg, dev); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + /* if we don't have a superblock yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */ + if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + if (!(err = do_md_stop (mddev, 0))) + mddev = NULL; + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here: there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have 2 heads and + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, (kdev_t)arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ...
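+ * (sb_dirty is cleared first so that do_md_stop() below will not + * write superblocks back for an array that never ran; + * autorun_array() uses the same trick)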
+ */ + if (err) { + mddev->sb_dirty = 0; + if (!do_md_stop (mddev, 0)) + mddev = NULL; + } + goto done_unlock; + } + + default: + printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, " + "upgrade your software to use new ioctls.\n", + current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + if (mddev) + unlock_mddev(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ + /* + * Always succeed, but increment the usage count + */ + mddev_t *mddev = kdev_to_mddev(inode->i_rdev); + if (mddev) + atomic_inc(&mddev->active); + return (0); +} + +static int md_release(struct inode *inode, struct file * file) +{ + mddev_t *mddev = kdev_to_mddev(inode->i_rdev); + if (mddev) + atomic_dec(&mddev->active); + return 0; +} + +static struct block_device_operations md_fops= +{ + owner: THIS_MODULE, + open: md_open, + release: md_release, + ioctl: md_ioctl, +}; + + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + md_lock_kernel(); + + /* + * Detach thread + */ + + daemonize(); + + sprintf(current->comm, "%s", thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', its priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + current->policy = SCHED_OTHER; + current->nice = -20; + md_unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run)(void *data); + + wait_event_interruptible(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags)); + + clear_bit(THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->data); + run_task_queue(&tq_disk); + } + if (md_signal_pending(current)) + md_flush_signals(); + } + complete(thread->event); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + dprintk("md: waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); +} + +mdk_thread_t *md_register_thread(void (*run) (void *), + void *data, const char *name) +{ + mdk_thread_t *thread; + int ret; + struct completion event; + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + md_init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->data = data; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +void md_interrupt_thread(mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + md_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +void md_recover_arrays(void) +{ + if (!md_recovery_thread) { + MD_BUG(); + return; + } +
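+ /* the actual work happens in md_do_recovery(), run by the mdrecoveryd thread; all we do here is wake it up */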
md_wakeup_thread(md_recovery_thread); +} + + +int md_error(mddev_t *mddev, kdev_t rdev) +{ + mdk_rdev_t * rrdev; + + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev), + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); + + if (!mddev) { + MD_BUG(); + return 0; + } + rrdev = find_rdev(mddev, rdev); + if (!rrdev || rrdev->faulty) + return 0; + if (!mddev->pers->error_handler + || mddev->pers->error_handler(mddev,rdev) <= 0) { + rrdev->faulty = 1; + } else + return 1; + /* + * if recovery was running, stop it now. + */ + if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + md_recover_arrays(); + + return 0; +} + +static void status_unused(struct seq_file *seq) +{ + int i = 0; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + seq_printf(seq, "unused devices: "); + + ITERATE_RDEV_ALL(rdev,tmp) { + if (list_empty(&rdev->same_set)) { + /* + * The device is not yet used by any array. + */ + i++; + seq_printf(seq, "%s ", + partition_name(rdev->dev)); + } + } + if (!i) + seq_printf(seq, "<none>"); + + seq_printf(seq, "\n"); +} + + +static void status_resync(struct seq_file *seq, mddev_t * mddev) +{ + unsigned long max_blocks, resync, res, dt, db, rt; + + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + max_blocks = mddev->sb->size; + + /* + * Should not happen. + */ + if (!max_blocks) + MD_BUG(); + + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + seq_printf(seq, "["); + for (i = 0; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + if (!mddev->recovery_running) + /* + * true resync + */ + seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)", + res/10, res % 10, resync, max_blocks); + else + /* + * recovery ... + */ + seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)", + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway.
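+ * + * (Worked example with assumed numbers: dt = 10s, db = 1000 blocks + * and 9000 blocks remaining give rt = (10 * (9000 / (1000/100 + 1))) + * / 100 = 81 seconds, a little under the exact 90s because of the +1.)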
+ * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/dt); + +} + + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + mddev_t *mddev; + + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, mddev_t, all_mddevs); + return mddev; + } + return (void*)2;/* tail */ +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + mddev_t *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = list_entry(tmp,mddev_t,all_mddevs); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + int j, size; + struct md_list_head *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev = v; + + if (v == (void*)1) { + seq_printf(seq, "Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + seq_printf(seq, "[%s] ", pers[j]->name); + + seq_printf(seq, "\n"); + seq_printf(seq, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + seq_printf(seq, "not set\n"); + else + seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? 
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %d blocks", + md_size[mdidx(mddev)]); + else + seq_printf(seq, "\n %d blocks", size); + } + + if (mddev->pers) { + + mddev->pers->status (seq, mddev); + + seq_printf(seq, "\n "); + if (mddev->curr_resync) { + status_resync (seq, mddev); + } else { + if (sem_getcount(&mddev->resync_sem) != 1) + seq_printf(seq, " resync=DELAYED"); + } + } + seq_printf(seq, "\n"); + + return 0; +} + + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (pers[pnum]) { + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; +} + +mdp_disk_t *get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} + +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; +void md_sync_acct(kdev_t dev, unsigned long nr_sectors) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + sync_io[major][index] += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + int major = MAJOR(rdev->dev); + int idx = disk_index(rdev->dev); + + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + continue; + + curr_events = kstat.dk_drive_rblk[major][idx] + + kstat.dk_drive_wblk[major][idx] ; + curr_events -= sync_io[major][idx]; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + // stop recovery, signal do_sync .... 
+ if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + } +} + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +int md_do_sync(mddev_t *mddev, mdp_disk_t *spare) +{ + mddev_t *mddev2; + unsigned int max_sectors, currspeed, + j, window, err, serialize; + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct md_list_head *tmp; + unsigned long last_check; + + + err = down_interruptible(&mddev->resync_sem); + if (err) + goto out_nolock; + +recheck: + serialize = 0; + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: delaying resync of md%d until md%d " + "has finished resync (they share one or more physical units)\n", + mdidx(mddev), mdidx(mddev2)); + serialize = 1; + break; + } + } + if (serialize) { + interruptible_sleep_on(&resync_wait); + if (md_signal_pending(current)) { + md_flush_signals(); + err = -EINTR; + goto out; + } + goto recheck; + } + + mddev->curr_resync = 1; + + max_sectors = mddev->sb->size<<1; + + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n", + sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwidth " + "(but not more than %d KB/sec) for reconstruction.\n", + sysctl_speed_limit_max); + + /* + * Resync has low priority. + */ + current->nice = 19; + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = 0; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = vm_max_readahead*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", + window/2,max_sectors/2); + + atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); + last_check = 0; + for (j = 0; j < max_sectors;) { + int sectors; + + sectors = mddev->pers->sync_request(mddev, j); + + if (sectors < 0) { + err = sectors; + goto out; + } + atomic_add(sectors, &mddev->recovery_active); + j += sectors; + mddev->curr_resync = j; + + if (last_check + window > j) + continue; + + last_check = j; + + run_task_queue(&tq_disk); + + repeat: + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (md_signal_pending(current)) { + /* + * got a signal, exit. + */ + mddev->curr_resync = 0; + printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n"); + md_flush_signals(); + err = -EINTR; + goto out; + } + + /* + * this loop only exits if we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem.
(things like an + * e2fsck being done on the RAID array should execute fast) + */ + if (md_need_resched(current)) + schedule(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + current->nice = 19; + + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(HZ/4); + goto repeat; + } + } else + current->nice = -20; + } + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0); + up(&mddev->resync_sem); +out_nolock: + mddev->curr_resync = 0; + wake_up(&resync_wait); + return err; +} + + +/* + * This is a kernel thread which syncs a spare disk with the active array + * + * the amount of foolproofing might seem to be a tad excessive, but an + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs + * of my root partition with the first 0.5 gigs of my /home partition ... so + * i'm a bit nervous ;) + */ +void md_do_recovery(void *data) +{ + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; + struct md_list_head *tmp; + + printk(KERN_INFO "md: recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) + continue; + if (sb->active_disks == sb->raid_disks) + continue; + if (mddev->sb_dirty) + md_update_sb(mddev); + if (!sb->spare_disks) { + printk(KERN_ERR "md%d: no spare disk to reconstruct array! " + "-- continuing in degraded mode\n", mdidx(mddev)); + continue; + } + /* + * now here we get the spare and resync it. + */ + spare = get_spare(mddev); + if (!spare) + continue; + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", + mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) + continue; + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) + continue; + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = md_do_sync(mddev, spare); + if (err == -EIO) { + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", + mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR || err == -ENOMEM) { + /* + * Recovery got interrupted, or ran out of mem ... + * signal back that we have finished using the array. 
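+ * (the DISKOP_SPARE_INACTIVE below undoes the DISKOP_SPARE_WRITE + * issued before md_do_sync() was started)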
+ */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; + continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); + } + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + goto restart; + } + printk(KERN_INFO "md: recovery thread finished ...\n"); + +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + notifier_call: md_notify_reboot, + next: NULL, + priority: INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_size[i] = 0; + md_hardsect_sizes[i] = 512; + } + blksize_size[MAJOR_NR] = md_blocksizes; + blk_size[MAJOR_NR] = md_size; + max_readahead[MAJOR_NR] = md_maxreadahead; + hardsect_size[MAJOR_NR] = md_hardsect_sizes; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +request_queue_t * md_queue_proc(kdev_t dev) +{ + mddev_t *mddev = kdev_to_mddev(dev); + if (mddev == NULL) + return BLK_DEFAULT_QUEUE(MAJOR_NR); + else + return &mddev->queue; +} + +int md__init md_init(void) +{ + static char * name = "mdrecoveryd"; + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) + { + printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR); + return (-1); + } + devfs_handle = devfs_mk_dir (NULL, "md", NULL); + /* we don't use devfs_register_series because we want to fill md_hd_struct */ + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char devname[128]; + sprintf (devname, "%u", minor); + md_hd_struct[minor].de = devfs_register (devfs_handle, + devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + /* all requests on an uninitialised device get failed... 
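(md_fail_request remains the default make_request function until do_md_run() points mddev->queue at the personality's own make_request)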
*/ + blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request); + blk_dev[MAJOR_NR].queue = md_queue_proc; + + + read_ahead[MAJOR_NR] = INT_MAX; + + add_gendisk(&md_gendisk); + + md_recovery_thread = md_register_thread(md_do_recovery, NULL, name); + if (!md_recovery_thread) + printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n"); + + md_register_reboot_notifier(&md_notifier); + raid_table_header = register_sysctl_table(raid_root_table, 1); + + md_geninit(); + return (0); +} + + +#ifndef MODULE + +/* + * When md (and any require personalities) are compiled into the kernel + * (not a module), arrays can be assembles are boot time using with AUTODETECT + * where specially marked partitions are registered with md_autodetect_dev(), + * and with MD_BOOT where devices to be collected are given on the boot line + * with md=..... + * The code for that is here. + */ + +struct { + int set; + int noautodetect; +} raid_setup_args md__initdata; + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +static kdev_t detected_devices[128]; +static int dev_cnt; + +void md_autodetect_dev(kdev_t dev) +{ + if (dev_cnt >= 0 && dev_cnt < 127) + detected_devices[dev_cnt++] = dev; +} + + +static void autostart_arrays(void) +{ + mdk_rdev_t *rdev; + int i; + + printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + + for (i = 0; i < dev_cnt; i++) { + kdev_t dev = detected_devices[i]; + + if (md_import_device(dev,1)) { + printk(KERN_ALERT "md: could not import %s!\n", + partition_name(dev)); + continue; + } + /* + * Sanity checks: + */ + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + dev_cnt = 0; + + autorun_devices(-1); +} + +static struct { + char device_set [MAX_MD_DEVS]; + int pers[MAX_MD_DEVS]; + int chunk[MAX_MD_DEVS]; + char *device_names[MAX_MD_DEVS]; +} md_setup_args md__initdata; + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the MD device now; that is handled by + * md_setup_drive after the low-level disk drivers have initialised. + * + * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which + * assigns the task of parsing integer arguments to the + * invoked program now). Added ability to initialise all + * the MD devices (by specifying multiple "md=" lines) + * instead of just one. -- KTK + * 18May2000: Added support for persistant-superblock arrays: + * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n + * md=n,device-list reads a RAID superblock from the devices + * elements in device-list are read by name_to_kdev_t so can be + * a hex number or something like /dev/hda1 /dev/sdb + * 2001-06-03: Dave Cinege + * Shifted name_to_kdev_t() and related operations to md_set_drive() + * for later execution. Rewrote section to make devfs compatible. + */ +static int md__init md_setup(char *str) +{ + int minor, level, factor, fault; + char *pername = ""; + char *str1 = str; + + if (get_option(&str, &minor) != 2) { /* MD Number */ + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + if (minor >= MAX_MD_DEVS) { + printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor); + return 0; + } else if (md_setup_args.device_names[minor]) { + printk(KERN_WARNING "md: md=%d, Specified more then once. 
" + "Replacing previous definition.\n", minor); + } + switch (get_option(&str, &level)) { /* RAID Personality */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == -1) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args.pers[minor] = level; + md_setup_args.chunk[minor] = 1 << (factor+12); + switch(level) { + case -1: + level = LINEAR; + pername = "linear"; + break; + case 0: + level = RAID0; + pername = "raid0"; + break; + default: + printk(KERN_WARNING + "md: The kernel has not been configured for raid%d support!\n", + level); + return 0; + } + md_setup_args.pers[minor] = level; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args.pers[minor] = 0; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args.device_names[minor] = str; + + return 1; +} + +extern kdev_t name_to_kdev_t(char *line) md__init; +void md__init md_setup_drive(void) +{ + int minor, i; + kdev_t dev; + mddev_t*mddev; + kdev_t devices[MD_SB_DISKS+1]; + + for (minor = 0; minor < MAX_MD_DEVS; minor++) { + int err = 0; + char *devname; + mdu_disk_info_t dinfo; + + if ((devname = md_setup_args.device_names[minor]) == 0) continue; + + for (i = 0; i < MD_SB_DISKS && devname != 0; i++) { + + char *p; + void *handle; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_kdev_t(devname); + handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev), + DEVFS_SPECIAL_BLK, 1); + if (handle != 0) { + unsigned major, minor; + devfs_get_maj_min(handle, &major, &minor); + dev = MKDEV(major, minor); + } + if (dev == 0) { + printk(KERN_WARNING "md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + md_setup_args.device_set[minor] = 1; + + devname = p; + } + devices[i] = 0; + + if (md_setup_args.device_set[minor] == 0) + continue; + + if (mddev_map[minor]) { + printk(KERN_WARNING + "md: Ignoring md=%d, already autodetected. 
(Use raid=noautodetect)\n",
+			minor);
+		continue;
+	}
+	printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+	mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
+	if (!mddev) {
+		printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+		continue;
+	}
+	if (md_setup_args.pers[minor]) {
+		/* non-persistent */
+		mdu_array_info_t ainfo;
+		ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+		ainfo.size = 0;
+		ainfo.nr_disks =0;
+		ainfo.raid_disks =0;
+		ainfo.md_minor =minor;
+		ainfo.not_persistent = 1;
+
+		ainfo.state = (1 << MD_SB_CLEAN);
+		ainfo.active_disks = 0;
+		ainfo.working_disks = 0;
+		ainfo.failed_disks = 0;
+		ainfo.spare_disks = 0;
+		ainfo.layout = 0;
+		ainfo.chunk_size = md_setup_args.chunk[minor];
+		err = set_array_info(mddev, &ainfo);
+		for (i = 0; !err && (dev = devices[i]); i++) {
+			dinfo.number = i;
+			dinfo.raid_disk = i;
+			dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+			dinfo.major = MAJOR(dev);
+			dinfo.minor = MINOR(dev);
+			mddev->sb->nr_disks++;
+			mddev->sb->raid_disks++;
+			mddev->sb->active_disks++;
+			mddev->sb->working_disks++;
+			err = add_new_disk (mddev, &dinfo);
+		}
+	} else {
+		/* persistent */
+		for (i = 0; (dev = devices[i]); i++) {
+			dinfo.major = MAJOR(dev);
+			dinfo.minor = MINOR(dev);
+			add_new_disk (mddev, &dinfo);
+		}
+	}
+	if (!err)
+		err = do_md_run(mddev);
+	if (err) {
+		mddev->sb_dirty = 0;
+		do_md_stop(mddev, 0);
+		printk(KERN_WARNING "md: starting md%d failed\n", minor);
+	}
+	}
+}
+
+static int md__init raid_setup(char *str)
+{
+	int len, pos;
+
+	len = strlen(str) + 1;
+	pos = 0;
+
+	while (pos < len) {
+		char *comma = strchr(str+pos, ',');
+		int wlen;
+		if (comma)
+			wlen = (comma-str)-pos;
+		else	wlen = (len-1)-pos;
+
+		if (strncmp(str, "noautodetect", wlen) == 0)
+			raid_setup_args.noautodetect = 1;
+		pos += wlen+1;
+	}
+	raid_setup_args.set = 1;
+	return 1;
+}
+
+int md__init md_run_setup(void)
+{
+	if (raid_setup_args.noautodetect)
+		printk(KERN_INFO "md: Skipping autodetection of RAID arrays. 
(raid=noautodetect)\n");
+	else
+		autostart_arrays();
+	md_setup_drive();
+	return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+	return md_init();
+}
+
+static void free_device_names(void)
+{
+	while (!list_empty(&device_names)) {
+		struct dname *tmp = list_entry(device_names.next,
+					dev_name_t, list);
+		list_del(&tmp->list);
+		kfree(tmp);
+	}
+}
+
+
+void cleanup_module(void)
+{
+	md_unregister_thread(md_recovery_thread);
+	devfs_unregister(devfs_handle);
+
+	devfs_unregister_blkdev(MAJOR_NR,"md");
+	unregister_reboot_notifier(&md_notifier);
+	unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("mdstat", NULL);
+#endif
+
+	del_gendisk(&md_gendisk);
+
+	blk_dev[MAJOR_NR].queue = NULL;
+	blksize_size[MAJOR_NR] = NULL;
+	blk_size[MAJOR_NR] = NULL;
+	max_readahead[MAJOR_NR] = NULL;
+	hardsect_size[MAJOR_NR] = NULL;
+
+	free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_do_sync);
+MD_EXPORT_SYMBOL(md_sync_acct);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_recover_arrays);
+MD_EXPORT_SYMBOL(md_register_thread);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+<<<<<<<
+MD_EXPORT_SYMBOL(mddev_map);
+|||||||
+EXPORT_SYMBOL(mddev_map);
+=======
+>>>>>>>
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-loop/orig b/tests/linux/md-loop/orig
new file mode 100644
index 0000000..682ed20
--- /dev/null
+++ b/tests/linux/md-loop/orig
@@ -0,0 +1,3960 @@
+/*
+   md.c : Multiple Devices driver for Linux
+	  Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+     completely rewritten, based on the MD driver code from Marc Zyngier
+
+   Changes:
+
+   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+   - kmod support by: Cyrus Durgin <cider@speakeasy.org>
+   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+   - lots of fixes and improvements to the RAID1/RAID5 and generic
+     RAID code (such as request based resynchronization):
+
+     Neil Brown <neilb@cse.unsw.edu.au>.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/raid/xor.h>
+#include <linux/devfs_fs_kernel.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#if DEBUG
+# define dprintk(x...) printk(x)
+#else
+# define dprintk(x...) do { } while(0)
+#endif
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
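The speed_limit_min/speed_limit_max variables declared just below are wired into the sysctl tree built from raid_root_table, so they surface as /proc/sys/dev/raid/speed_limit_min and speed_limit_max, as this comment notes. A minimal userspace sketch of reading both knobs (paths taken from the comment; error handling trimmed):

#include <stdio.h>

int main(void)
{
    const char *knobs[] = {
        "/proc/sys/dev/raid/speed_limit_min",
        "/proc/sys/dev/raid/speed_limit_max",
    };
    int i, val;

    for (i = 0; i < 2; i++) {
        FILE *f = fopen(knobs[i], "r");
        if (f && fscanf(f, "%d", &val) == 1)
            printf("%s = %d KB/sec\n", knobs[i], val);
        if (f)
            fclose(f);
    }
    /* raising the floor (needs root): echo 1000 > .../speed_limit_min */
    return 0;
}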
+ * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min", + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max", + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table raid_dir_table[] = { + {DEV_RAID, "raid", NULL, 0, 0555, raid_table}, + {0} +}; + +static ctl_table raid_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table}, + {0} +}; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_hardsect_sizes[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread; + +int md_size[MAX_MD_DEVS]; + +static struct block_device_operations md_fops; +static devfs_handle_t devfs_handle; + +static struct gendisk md_gendisk= +{ + major: MD_MAJOR, + major_name: "md", + minor_shift: 0, + max_p: 1, + part: md_hd_struct, + sizes: md_size, + nr_real: MAX_MD_DEVS, + real_devices: NULL, + next: NULL, + fops: &md_fops, +}; + +/* + * Enables to iterate over all existing md arrays + */ +static MD_LIST_HEAD(all_mddevs); + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static inline mddev_t * kdev_to_mddev (kdev_t dev) +{ + if (MAJOR(dev) != MD_MAJOR) + BUG(); + return mddev_map[MINOR(dev)]; +} + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio); + return 0; +} + +static mddev_t * alloc_mddev(kdev_t dev) +{ + mddev_t *mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); + + mddev->__minor = MINOR(dev); + init_MUTEX(&mddev->reconfig_sem); + init_MUTEX(&mddev->recovery_sem); + init_MUTEX(&mddev->resync_sem); + MD_INIT_LIST_HEAD(&mddev->disks); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + atomic_set(&mddev->active, 0); + + mddev_map[mdidx(mddev)] = mddev; + md_list_add(&mddev->all_mddevs, &all_mddevs); + + MOD_INC_USE_COUNT; + + return mddev; +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +char * partition_name(kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = ""; + dev_name_t *dname; + struct md_list_head *tmp; + + list_for_each(tmp, &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = 
get_gendisk (dev); + dname->name = NULL; + if (hd) + dname->name = disk_name (hd, MINOR(dev), dname->namebuf); + if (!dname->name) { + sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); + dname->name = dname->namebuf; + } + + dname->dev = dev; + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int zoned_raid_size(mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + md_size[mdidx(mddev)] += rdev->size; + } + return 0; +} + +static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} + +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" + +#define BAD_MINOR KERN_ERR \ +"md: %s: invalid raid minor (%x)\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb(mddev_t * mddev) +{ + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page(mddev->sb); + return 0; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(OUT_OF_MEM); + return -EINVAL; + } + rdev->sb = (mdp_super_t *) page_address(rdev->sb_page); + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb = NULL; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + + +static void bh_complete(struct buffer_head *bh, int uptodate) +{ + + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + + complete((struct completion*)bh->b_private); +} + +static int sync_page_io(kdev_t dev, unsigned long sector, int size, + struct page *page, int rw) +{ + struct buffer_head bh; + struct completion event; + + init_completion(&event); + init_buffer(&bh, bh_complete, &event); + bh.b_rdev = dev; + bh.b_rsector = sector; + bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock); + bh.b_size = size; + bh.b_page = page; + bh.b_reqnext = NULL; + bh.b_data = page_address(page); + generic_make_request(rw, &bh); + + run_task_queue(&tq_disk); + wait_for_completion(&event); + + return test_bit(BH_Uptodate, &bh.b_state); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + kdev_t dev = 
rdev->dev; + unsigned long sb_offset; + + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) { + printk(NO_SB,partition_name(dev)); + return -EINVAL; + } + printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + ret = 0; +abort: + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb(mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk(BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(BAD_CSUM, partition_name(rdev->dev)); + goto abort; + } + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = get_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev)); +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. 
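lock_rdev(), which follows, implements this by taking a blkdev reference for the lifetime of the array. As a loose userspace analogue (assuming Linux 2.6+ open(2) semantics, where O_EXCL on a block device node fails with EBUSY while the device is claimed or mounted), one can probe whether a device is free like this:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    const char *dev = argc > 1 ? argv[1] : "/dev/sda1";  /* example node */
    int fd = open(dev, O_RDONLY | O_EXCL);  /* EBUSY if claimed/mounted */

    if (fd < 0) {
        fprintf(stderr, "%s: %s\n", dev, strerror(errno));
        return 1;
    }
    printf("%s is free; claimed it exclusively\n", dev);
    close(fd);  /* dropping the fd releases the claim */
    return 0;
}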
[simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev)
+{
+	int err = 0;
+	struct block_device *bdev;
+
+	bdev = bdget(rdev->dev);
+	if (!bdev)
+		return -ENOMEM;
+	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+	if (!err)
+		rdev->bdev = bdev;
+	return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+	struct block_device *bdev = rdev->bdev;
+	rdev->bdev = NULL;
+	if (!bdev)
+		MD_BUG();
+	blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(kdev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+	printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
+	if (rdev->mddev)
+		MD_BUG();
+	unlock_rdev(rdev);
+	free_disk_sb(rdev);
+	list_del_init(&rdev->all);
+	if (!list_empty(&rdev->pending)) {
+		printk(KERN_INFO "md: (%s was pending)\n",
+			partition_name(rdev->dev));
+		list_del_init(&rdev->pending);
+	}
+#ifndef MODULE
+	md_autodetect_dev(rdev->dev);
+#endif
+	rdev->dev = 0;
+	rdev->faulty = 0;
+	kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+	unbind_rdev_from_array(rdev);
+	export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+	struct md_list_head *tmp;
+	mdk_rdev_t *rdev;
+	mdp_super_t *sb = mddev->sb;
+
+	if (mddev->sb) {
+		mddev->sb = NULL;
+		free_page((unsigned long) sb);
+	}
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (!rdev->mddev) {
+			MD_BUG();
+			continue;
+		}
+		kick_rdev_from_array(rdev);
+	}
+	if (!list_empty(&mddev->disks))
+		MD_BUG();
+}
+
+static void free_mddev(mddev_t *mddev)
+{
+	if (!mddev) {
+		MD_BUG();
+		return;
+	}
+
+	export_array(mddev);
+	md_size[mdidx(mddev)] = 0;
+	md_hd_struct[mdidx(mddev)].nr_sects = 0;
+
+	/*
+	 * Make sure nobody else is using this mddev
+	 * (careful, we rely on the global kernel lock here)
+	 */
+	while (sem_getcount(&mddev->resync_sem) != 1)
+		schedule();
+	while (sem_getcount(&mddev->recovery_sem) != 1)
+		schedule();
+
+<<<<<<<
+	del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
+|||||||
+	del_mddev_mapping(mddev, mk_kdev(MD_MAJOR, mdidx(mddev)));
+=======
+	mddev_map[mdidx(mddev)] = NULL;
+>>>>>>>
+	md_list_del(&mddev->all_mddevs);
+	kfree(mddev);
+	MOD_DEC_USE_COUNT;
+}
+
+#undef BAD_CSUM
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_SB
+
+static void print_desc(mdp_disk_t *desc)
+{
+	printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+		partition_name(MKDEV(desc->major,desc->minor)),
+		desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+	int i;
+
+	printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+		sb->major_version, sb->minor_version, sb->patch_version,
+		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+		sb->ctime);
+	printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+		sb->layout, sb->chunk_size);
+	printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
+		sb->utime, sb->state, sb->active_disks, sb->working_disks,
+		sb->failed_disks, sb->spare_disks,
+		sb->sb_csum, (unsigned long)sb->events_lo);
+
+	printk(KERN_INFO);
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc;
+
+		desc = sb->disks + i;
+		if (desc->number || desc->major || desc->minor ||
+		    desc->raid_disk || (desc->state && (desc->state != 4))) {
+			printk(" D %2d: ", i);
+			print_desc(desc);
+		}
+	}
+	printk(KERN_INFO "md: THIS: ");
+	print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk(KERN_INFO "md: rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk("md: **********************************\n"); + printk("md: * *\n"); + printk("md: **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk("md: **********************************\n"); + printk("\n"); +} + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all(kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + list_for_each(tmp, &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + kdev_t dev; + unsigned long sb_offset, size; + + if (!rdev->sb) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return 1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n", + partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * its size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n", + partition_name(dev), rdev->size, size); + goto skip; + } + + printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset); + + if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) { + printk("md: write_disk_sb failed for device %s\n", partition_name(dev)); + return 1; + } +skip: + return 0; +} +#undef GETBLK_FAILED + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty || rdev->alias_device) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +int md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + if (!mddev->sb_dirty) { + printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0)); + return 0; + } + mddev->sb_dirty = 0; +repeat: + mddev->sb->utime = CURRENT_TIME; + if ((++mddev->sb->events_lo)==0) + ++mddev->sb->events_hi; + + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + printk(KERN_INFO "md: "); + if (rdev->faulty) + printk("(skipping faulty "); + if (rdev->alias_device) + printk("(skipping alias "); + if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) { + printk("(skipping new-faulty %s )\n", + partition_name(rdev->dev)); + continue; + } + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty && !rdev->alias_device) { + printk("[events: %08lx]", + (unsigned long)rdev->sb->events_lo); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + if (err) { + if (--count) { + printk(KERN_ERR "md: errors occurred during superblock update, repeating\n"); + goto repeat; + } + printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n"); + } + return 0; +} + +/* + * Import a device. 
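md_update_sb() above maintains the superblock event count as one logical 64-bit counter split across the 32-bit events_lo/events_hi words, carrying into the high word when the low word wraps. A self-contained sketch of that carry (struct and function names are illustrative):

#include <stdio.h>
#include <stdint.h>

struct sb_events { uint32_t lo, hi; };  /* stand-in for the two sb words */

static void events_inc(struct sb_events *e)
{
    if (++e->lo == 0)  /* low word wrapped ... */
        ++e->hi;       /* ... carry into the high word */
}

int main(void)
{
    struct sb_events e = { 0xffffffffu, 0 };
    events_inc(&e);
    /* prints lo=00000000 hi=00000001 */
    printf("lo=%08x hi=%08x\n", (unsigned)e.lo, (unsigned)e.hi);
    return 0;
}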
If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + */ +static int md_import_device(kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (is_mounted(newdev)) { + printk(KERN_WARNING "md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk(KERN_WARNING "md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + if (rdev->sb->level != -4) { + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } else { + rdev->old_dev = MKDEV(0, 0); + rdev->desc_nr = -1; + } + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + INIT_LIST_HEAD(&rdev->same_set); + + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs(mddev_t * mddev) +{ + int out_of_date = 0, i, first; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk(INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. 
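A reduced model of that selection: keep the member with the highest event count, and treat anything lagging by more than one event as stale, mirroring the ++ev1; if (ev1 < ev2) kick test that follows (device names and counts here are made up for illustration):

#include <stdio.h>
#include <stdint.h>

struct member { const char *name; uint64_t events; };

int main(void)
{
    struct member disk[] = { {"sda1", 42}, {"sdb1", 44}, {"sdc1", 44} };
    int i, freshest = 0;

    for (i = 1; i < 3; i++)           /* highest event count wins */
        if (disk[i].events > disk[freshest].events)
            freshest = i;

    for (i = 0; i < 3; i++)           /* lagging by one event is tolerated */
        if (disk[i].events + 1 < disk[freshest].events)
            printf("kicking non-fresh %s from array!\n", disk[i].name);

    printf("freshest: %s\n", disk[freshest].name);
    return 0;
}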
+ */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. (decrease it's age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + if (rdev->sb->events_lo || rdev->sb->events_hi) + if ((rdev->sb->events_lo--)==0) + rdev->sb->events_hi--; + } + + printk(KERN_INFO "md: %s's event counter: %08lx\n", + partition_name(rdev->dev), + (unsigned long)rdev->sb->events_lo); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = md_event(rdev->sb); + ev2 = md_event(freshest->sb); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices + */ + __u64 ev1, ev2; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ++ev1; + if (ev1 < ev2) { + printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + if (mddev->sb->level != -4) + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty || rdev->alias_device) { + MD_BUG(); + goto abort; + } + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n", + partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. + * + * Note: multipath devices are a special case. Since we + * were able to read the superblock on the path, we don't + * care if it was previously marked as faulty, it's up now + * so enable it. 
+ */ + if (disk_faulty(desc) && mddev->sb->level != -4) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk(KERN_WARNING "md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk(KERN_WARNING "md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } else if (disk_faulty(desc)) { + /* + * multipath entry marked as faulty, unfaulty it + */ + rdev = find_rdev(mddev, dev); + if(rdev) + mark_disk_spare(desc); + else + remove_descriptor(desc, sb); + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Multi-path IO special-case: since we have no + * this_disk descriptor at auto-detect time, + * we cannot check rdev->number. + * We can check the device though. + */ + if ((sb->level == -4) && (rdev->dev == + MKDEV(desc->major,desc->minor))) { + found = 1; + break; + } + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n", + mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); + } + + /* + * Double check wether all devices mentioned in the + * superblock are in the rdev ring. + */ + first = 1; + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); + goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + /* + * In the case of Multipath-IO, we have no + * other information source to find out which + * disk is which, only the position of the device + * in the superblock: + */ + if (mddev->sb->level == -4) { + if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) { + MD_BUG(); + goto abort; + } + rdev->desc_nr = i; + if (!first) + rdev->alias_device = 1; + else + first = 0; + } + } + + /* + * Kick all rdevs that are not in the + * descriptor array: + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) + kick_rdev_from_array(rdev); + } + + /* + * Do a final reality check. + */ + if (mddev->sb->level != -4) { + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique? 
+ */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk(OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk(NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size: %ldk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -4: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) { + readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (sb->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. 
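do_md_run() below validates the chunk size with (1 << ffz(~chunk_size)) != chunk_size. Since ffz() returns the index of the first zero bit, ffz(~x) is the index of the lowest set bit of x, so the test accepts exactly the powers of two. A userspace sketch with ffz() re-implemented (the kernel provides it as an arch helper):

#include <stdio.h>

/* index of the first zero bit, as the kernel's ffz() computes */
static int ffz(unsigned long word)
{
    int bit = 0;
    while (word & 1) {
        word >>= 1;
        bit++;
    }
    return bit;
}

int main(void)
{
    unsigned long sizes[] = { 4096, 65536, 24576 };  /* 24576 = 3 * 8192 */
    int i;

    for (i = 0; i < 3; i++)
        printf("%lu: %s\n", sizes[i],
               (1UL << ffz(~sizes[i])) == sizes[i]
                   ? "power of two, accepted" : "rejected");
    return 0;
}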
+ */ + md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We dont + * want to continue the bad practice. + */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + } else + if (chunk_size) + printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n", + mddev->sb->level); + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + { + printk(KERN_ERR "md: personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + } + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsector size + */ + md_hardsect_sizes[mdidx(mddev)] = 512; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + invalidate_device(rdev->dev, 1); + if (get_hardsect_size(rdev->dev) + > md_hardsect_sizes[mdidx(mddev)]) + md_hardsect_sizes[mdidx(mddev)] = + get_hardsect_size(rdev->dev); + } + md_blocksizes[mdidx(mddev)] = 1024; + if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)]) + md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)]; + mddev->pers = pers[pnum]; + + blk_queue_make_request(&mddev->queue, mddev->pers->make_request); + mddev->queue.queuedata = mddev; + + err = mddev->pers->run(mddev); + if (err) { + printk(KERN_ERR "md: pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } + + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors. 
+ */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)), + 1, &md_fops, md_size[mdidx(mddev)]<<1); + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +static int restart_array(mddev_t *mddev) +{ + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk(KERN_INFO + "md: md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" +#define STILL_IN_USE \ +"md: md%d still in use.\n" + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (atomic_read(&mddev->active)>1) { + printk(STILL_IN_USE, mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + invalidate_device(dev, 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_device_ro(dev, 1); + goto out; + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (!mddev->recovery_running && !resync_interrupted) { + printk(KERN_INFO "md: marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + free_mddev(mddev); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev)); + err = 0; +out: + return err; +} + +/* + * We have to safely support old arrays too. 
+ */ +int detect_old_array(mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(kdev_t countdev) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk(KERN_WARNING + "md: %s has same UUID as %s, but superblocks differ ...\n", + partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev) { + printk(KERN_WARNING "md: md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + continue; + } + mddev = alloc_mddev(md_kdev); + if (!mddev) { + printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); + break; + } + if (md_kdev == countdev) + atomic_inc(&mddev->active); + printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + list_del_init(&rdev->pending); + } + autorun_array(mddev); + } + printk(KERN_INFO "md: ... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. 
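The grouping step of autorun_devices() above can be pictured as a tiny clustering pass: take the first pending disk, sweep the rest of the pending list for matching array UUIDs, and hand each cluster to its array. A simplified sketch (single-integer UUIDs and static data, purely illustrative):

#include <stdio.h>

struct pending { const char *name; unsigned uuid; int used; };

int main(void)
{
    struct pending p[] = {
        { "sda1", 0xaaaa, 0 }, { "sdb1", 0xbbbb, 0 },
        { "sdc1", 0xaaaa, 0 }, { "sdd1", 0xbbbb, 0 },
    };
    int i, j, n = 4;

    for (i = 0; i < n; i++) {
        if (p[i].used)
            continue;
        printf("candidates for array %04x:", p[i].uuid);
        for (j = i; j < n; j++)
            if (!p[j].used && p[j].uuid == p[i].uuid) {
                p[j].used = 1;          /* pulled off the pending list */
                printf(" %s", p[j].name);
            }
        printf("\n");
    }
    return 0;
}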
+ */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array(kdev_t startdev, kdev_t countdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk(KERN_WARNING "md: array version is too old to be autostarted ," + "use raidtools 0.90 mkraid --upgrade to upgrade the array " + "without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n", + partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(countdev); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + + +static int get_version(void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info(mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); + SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info(mddev_t * 
mddev, void * arg)
+{
+	mdu_disk_info_t info;
+	unsigned int nr;
+
+	if (!mddev->sb)
+		return -EINVAL;
+
+	if (md_copy_from_user(&info, arg, sizeof(info)))
+		return -EFAULT;
+
+	nr = info.number;
+	if (nr >= MD_SB_DISKS)
+		return -EINVAL;
+
+	SET_FROM_SB(major);
+	SET_FROM_SB(minor);
+	SET_FROM_SB(raid_disk);
+	SET_FROM_SB(state);
+
+	if (md_copy_to_user(arg, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+	int err, size, persistent;
+	mdk_rdev_t *rdev;
+	unsigned int nr;
+	kdev_t dev;
+	dev = MKDEV(info->major,info->minor);
+
+	if (find_rdev_all(dev)) {
+		printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+			partition_name(dev));
+		return -EBUSY;
+	}
+	if (!mddev->sb) {
+		/* expecting a device which has a superblock */
+		err = md_import_device(dev, 1);
+		if (err) {
+			printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+			return -EINVAL;
+		}
+		rdev = find_rdev_all(dev);
+		if (!rdev) {
+			MD_BUG();
+			return -EINVAL;
+		}
+		if (!list_empty(&mddev->disks)) {
+			mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+							mdk_rdev_t, same_set);
+			if (!uuid_equal(rdev0, rdev)) {
+				printk(KERN_WARNING "md: %s has different UUID to %s\n",
+					partition_name(rdev->dev), partition_name(rdev0->dev));
+				export_rdev(rdev);
+				return -EINVAL;
+			}
+			if (!sb_equal(rdev0->sb, rdev->sb)) {
+				printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+					partition_name(rdev->dev), partition_name(rdev0->dev));
+				export_rdev(rdev);
+				return -EINVAL;
+			}
+		}
+		bind_rdev_to_array(rdev, mddev);
+		return 0;
+	}
+
+	nr = info->number;
+	if (nr >= mddev->sb->nr_disks) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+
+	SET_SB(number);
+	SET_SB(major);
+	SET_SB(minor);
+	SET_SB(raid_disk);
+	SET_SB(state);
+
+	if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+		err = md_import_device (dev, 0);
+		if (err) {
+			printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+			return -EINVAL;
+		}
+		rdev = find_rdev_all(dev);
+		if (!rdev) {
+			MD_BUG();
+			return -EINVAL;
+		}
+		rdev->old_dev = dev;
+		rdev->desc_nr = info->number;
+
+		bind_rdev_to_array(rdev, mddev);
+
+		persistent = !mddev->sb->not_persistent;
+		if (!persistent)
+			printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+		size = calc_dev_size(dev, mddev, persistent);
+		rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+		if (!mddev->sb->size || (mddev->sb->size > size))
+			mddev->sb->size = size;
+	}
+
+	/*
+	 * sync all other superblocks with the main superblock
+	 */
+	sync_sbs(mddev);
+
+	return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+	struct request_queue *q;
+	mdk_rdev_t *rdev;
+	mdp_disk_t *disk;
+
+	if (!mddev->pers)
+		return -ENODEV;
+
+	printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+		partition_name(dev), mdidx(mddev));
+
+	rdev = find_rdev(mddev, dev);
+	if (!rdev) {
+		MD_BUG();
+		return -ENXIO;
+	}
+
+	if (rdev->desc_nr == -1) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	disk = &mddev->sb->disks[rdev->desc_nr];
+	if (!disk_active(disk))
+		return -ENODEV;
+
+	q = blk_get_queue(rdev->dev);
+	if (!q) {
+		MD_BUG();
+		return -ENODEV;
+	}
+	printk(KERN_INFO "md: okay, generating error!\n");
+//	q->oneshot_error = 1; // disabled for now
+
+	return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+	int err;
+	mdk_rdev_t *rdev;
+	mdp_disk_t *disk;
+
+	if (!mddev->pers)
+		return -ENODEV;
+
+	printk(KERN_INFO "md: trying to remove %s from md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + + if (disk_removed(disk)) + return -EINVAL; + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk(KERN_WARNING "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + err = -ENOSPC; + goto abort_export; + } + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + if (disk_removed(disk)) { + /* + * reuse slot + */ + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + } else { + disk->number = i; + } + + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
+	 */
+	md_recover_arrays();
+
+	return 0;
+
+abort_unbind_export:
+	unbind_rdev_from_array(rdev);
+
+abort_export:
+	export_rdev(rdev);
+	return err;
+}
+
+#define SET_SB(x) mddev->sb->x = info->x
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+	if (alloc_array_sb(mddev))
+		return -ENOMEM;
+
+	mddev->sb->major_version = MD_MAJOR_VERSION;
+	mddev->sb->minor_version = MD_MINOR_VERSION;
+	mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
+	mddev->sb->ctime = CURRENT_TIME;
+
+	SET_SB(level);
+	SET_SB(size);
+	SET_SB(nr_disks);
+	SET_SB(raid_disks);
+	SET_SB(md_minor);
+	SET_SB(not_persistent);
+
+	SET_SB(state);
+	SET_SB(active_disks);
+	SET_SB(working_disks);
+	SET_SB(failed_disks);
+	SET_SB(spare_disks);
+
+	SET_SB(layout);
+	SET_SB(chunk_size);
+
+	mddev->sb->md_magic = MD_SB_MAGIC;
+
+	/*
+	 * Generate a 128 bit UUID
+	 */
+	get_random_bytes(&mddev->sb->set_uuid0, 4);
+	get_random_bytes(&mddev->sb->set_uuid1, 4);
+	get_random_bytes(&mddev->sb->set_uuid2, 4);
+	get_random_bytes(&mddev->sb->set_uuid3, 4);
+
+	return 0;
+}
+#undef SET_SB
+
+static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
+{
+	int ret;
+
+	ret = md_error(mddev, dev);
+	return ret;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg)
+{
+	unsigned int minor;
+	int err = 0;
+	struct hd_geometry *loc = (struct hd_geometry *) arg;
+	mddev_t *mddev = NULL;
+	kdev_t dev;
+
+	if (!md_capable_admin())
+		return -EACCES;
+
+	dev = inode->i_rdev;
+	minor = MINOR(dev);
+	if (minor >= MAX_MD_DEVS) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	/*
+	 * Commands dealing with the RAID driver but not any
+	 * particular array:
+	 */
+	switch (cmd)
+	{
+		case RAID_VERSION:
+			err = get_version((void *)arg);
+			goto done;
+
+		case PRINT_RAID_DEBUG:
+			err = 0;
+			md_print_devices();
+			goto done_unlock;
+
+#ifndef MODULE
+		case RAID_AUTORUN:
+			err = 0;
+			autostart_arrays();
+			goto done;
+#endif
+
+		case BLKGETSIZE:
+		case BLKGETSIZE64:
+		case BLKRAGET:
+		case BLKRASET:
+		case BLKFLSBUF:
+		case BLKBSZGET:
+		case BLKBSZSET:
+			err = blk_ioctl (dev, cmd, arg);
+			goto abort;
+
+		default:;
+	}
+
+	/*
+	 * Commands creating/starting a new array:
+	 */
+
+	mddev = kdev_to_mddev(dev);
+
+	switch (cmd)
+	{
+		case SET_ARRAY_INFO:
+		case START_ARRAY:
+			if (mddev) {
+				printk(KERN_WARNING "md: array md%d already exists!\n",
+					mdidx(mddev));
+				err = -EEXIST;
+				goto abort;
+			}
+		default:;
+	}
+	switch (cmd)
+	{
+		case SET_ARRAY_INFO:
+			mddev = alloc_mddev(dev);
+			if (!mddev) {
+				err = -ENOMEM;
+				goto abort;
+			}
+			atomic_inc(&mddev->active);
+
+			/*
+			 * alloc_mddev() should possibly self-lock.
+			 */
+			err = lock_mddev(mddev);
+			if (err) {
+				printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
+					err, cmd);
+				goto abort;
+			}
+
+			if (mddev->sb) {
+				printk(KERN_WARNING "md: array md%d already has a superblock!\n",
+					mdidx(mddev));
+				err = -EBUSY;
+				goto abort_unlock;
+			}
+			if (arg) {
+				mdu_array_info_t info;
+				if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
+					err = -EFAULT;
+					goto abort_unlock;
+				}
+				err = set_array_info(mddev, &info);
+				if (err) {
+					printk(KERN_WARNING "md: couldn't set array info. %d\n", err);
+					goto abort_unlock;
+				}
+			}
+			goto done_unlock;
+
+		case START_ARRAY:
+			/*
+			 * possibly make it lock the array ...
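+			 *
+			 * For reference: arg here is the kdev_t of one component
+			 * device; autostart_array() above imports it, reads its
+			 * superblock, and queues every other device that
+			 * superblock lists before calling autorun_devices().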
+ */ + err = autostart_array((kdev_t)arg, dev); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + if (!(err = do_md_stop (mddev, 0))) + mddev = NULL; + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, (kdev_t)arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... 
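+			 * (sb_dirty is cleared before the do_md_stop() below,
+			 * presumably so that stopping a half-started array does
+			 * not write its superblocks back; this is an inference,
+			 * the original gives no reason)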
+			 */
+			if (err) {
+				mddev->sb_dirty = 0;
+				if (!do_md_stop (mddev, 0))
+					mddev = NULL;
+			}
+			goto done_unlock;
+		}
+
+		default:
+			printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+			       "upgrade your software to use new ioctls.\n",
+			       current->comm, current->pid);
+			err = -EINVAL;
+			goto abort_unlock;
+	}
+
+done_unlock:
+abort_unlock:
+	if (mddev)
+		unlock_mddev(mddev);
+
+	return err;
+done:
+	if (err)
+		MD_BUG();
+abort:
+	return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * Always succeed, but increment the usage count
+	 */
+	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+	if (mddev)
+		atomic_inc(&mddev->active);
+	return (0);
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+	if (mddev)
+		atomic_dec(&mddev->active);
+	return 0;
+}
+
+static struct block_device_operations md_fops=
+{
+	owner: THIS_MODULE,
+	open: md_open,
+	release: md_release,
+	ioctl: md_ioctl,
+};
+
+
+int md_thread(void * arg)
+{
+	mdk_thread_t *thread = arg;
+
+	md_lock_kernel();
+
+	/*
+	 * Detach thread
+	 */
+
+	daemonize();
+
+	sprintf(current->comm, thread->name);
+	md_init_signals();
+	md_flush_signals();
+	thread->tsk = current;
+
+	/*
+	 * md_thread is a 'system-thread', its priority should be very
+	 * high. We avoid resource deadlocks individually in each
+	 * raid personality. (RAID5 does preallocation) We also use RR and
+	 * the very same RT priority as kswapd, thus we will never get
+	 * into a priority inversion deadlock.
+	 *
+	 * we definitely have to have equal or higher priority than
+	 * bdflush, otherwise bdflush will deadlock if there are too
+	 * many dirty RAID5 blocks.
+	 */
+	current->policy = SCHED_OTHER;
+	current->nice = -20;
+	md_unlock_kernel();
+
+	complete(thread->event);
+	while (thread->run) {
+		void (*run)(void *data);
+
+		wait_event_interruptible(thread->wqueue,
+					 test_bit(THREAD_WAKEUP, &thread->flags));
+
+		clear_bit(THREAD_WAKEUP, &thread->flags);
+
+		run = thread->run;
+		if (run) {
+			run(thread->data);
+			run_task_queue(&tq_disk);
+		}
+		if (md_signal_pending(current))
+			md_flush_signals();
+	}
+	complete(thread->event);
+	return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+	dprintk("md: waking up MD thread %p.\n", thread);
+	set_bit(THREAD_WAKEUP, &thread->flags);
+	wake_up(&thread->wqueue);
+}
+
+mdk_thread_t *md_register_thread(void (*run) (void *),
+				void *data, const char *name)
+{
+	mdk_thread_t *thread;
+	int ret;
+	struct completion event;
+
+	thread = (mdk_thread_t *) kmalloc
+				(sizeof(mdk_thread_t), GFP_KERNEL);
+	if (!thread)
+		return NULL;
+
+	memset(thread, 0, sizeof(mdk_thread_t));
+	md_init_waitqueue_head(&thread->wqueue);
+
+	init_completion(&event);
+	thread->event = &event;
+	thread->run = run;
+	thread->data = data;
+	thread->name = name;
+	ret = kernel_thread(md_thread, thread, 0);
+	if (ret < 0) {
+		kfree(thread);
+		return NULL;
+	}
+	wait_for_completion(&event);
+	return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+	if (!thread->tsk) {
+		MD_BUG();
+		return;
+	}
+	dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+	send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+	struct completion event;
+
+	init_completion(&event);
+
+	thread->event = &event;
+	thread->run = NULL;
+	thread->name = NULL;
+	md_interrupt_thread(thread);
+	wait_for_completion(&event);
+	kfree(thread);
+}
+
+void md_recover_arrays(void)
+{
+	if (!md_recovery_thread) {
+		MD_BUG();
+		return;
+	}
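+	/*
+	 * Wakeup protocol, per md_thread() above: set THREAD_WAKEUP and
+	 * wake the thread's wait queue; the daemon loop then re-runs
+	 * thread->run, which for md_recovery_thread is md_do_recovery().
+	 */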
+	md_wakeup_thread(md_recovery_thread);
+}
+
+
+int md_error(mddev_t *mddev, kdev_t rdev)
+{
+	mdk_rdev_t * rrdev;
+
+	dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+		MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
+		__builtin_return_address(0),__builtin_return_address(1),
+		__builtin_return_address(2),__builtin_return_address(3));
+
+	if (!mddev) {
+		MD_BUG();
+		return 0;
+	}
+	rrdev = find_rdev(mddev, rdev);
+	if (!rrdev || rrdev->faulty)
+		return 0;
+	if (!mddev->pers->error_handler
+			|| mddev->pers->error_handler(mddev,rdev) <= 0) {
+		rrdev->faulty = 1;
+	} else
+		return 1;
+	/*
+	 * if recovery was running, stop it now.
+	 */
+	if (mddev->pers->stop_resync)
+		mddev->pers->stop_resync(mddev);
+	if (mddev->recovery_running)
+		md_interrupt_thread(md_recovery_thread);
+	md_recover_arrays();
+
+	return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+	int i = 0;
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
+
+	seq_printf(seq, "unused devices: ");
+
+	ITERATE_RDEV_ALL(rdev,tmp) {
+		if (list_empty(&rdev->same_set)) {
+			/*
+			 * The device is not yet used by any array.
+			 */
+			i++;
+			seq_printf(seq, "%s ",
+				partition_name(rdev->dev));
+		}
+	}
+	if (!i)
+		seq_printf(seq, "<none>");
+
+	seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+	unsigned long max_blocks, resync, res, dt, db, rt;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+	max_blocks = mddev->sb->size;
+
+	/*
+	 * Should not happen.
+	 */
+	if (!max_blocks)
+		MD_BUG();
+
+	res = (resync/1024)*1000/(max_blocks/1024 + 1);
+	{
+		int i, x = res/50, y = 20-x;
+		seq_printf(seq, "[");
+		for (i = 0; i < x; i++)
+			seq_printf(seq, "=");
+		seq_printf(seq, ">");
+		for (i = 0; i < y; i++)
+			seq_printf(seq, ".");
+		seq_printf(seq, "] ");
+	}
+	if (!mddev->recovery_running)
+		/*
+		 * true resync
+		 */
+		seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
+				res/10, res % 10, resync, max_blocks);
+	else
+		/*
+		 * recovery ...
+		 */
+		seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
+				res/10, res % 10, resync, max_blocks);
+
+	/*
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
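+	 *
+	 * Worked example with hypothetical numbers, using the quantities
+	 * defined below: dt = 30 seconds since the mark, db = 15000 blocks
+	 * written since then, and 150000 blocks still to go give
+	 *     rt = (30 * (150000 / (15000/100 + 1))) / 100 = 297 seconds,
+	 * which the printk below formats as "finish=4.9min".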
+ * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/dt); + +} + + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + mddev_t *mddev; + + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, mddev_t, all_mddevs); + return mddev; + } + return (void*)2;/* tail */ +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + mddev_t *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = list_entry(tmp,mddev_t,all_mddevs); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + int j, size; + struct md_list_head *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev = v; + + if (v == (void*)1) { + seq_printf(seq, "Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + seq_printf(seq, "[%s] ", pers[j]->name); + + seq_printf(seq, "\n"); + seq_printf(seq, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + seq_printf(seq, "not set\n"); + else + seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? 
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %d blocks", + md_size[mdidx(mddev)]); + else + seq_printf(seq, "\n %d blocks", size); + } + + if (mddev->pers) { + + mddev->pers->status (seq, mddev); + + seq_printf(seq, "\n "); + if (mddev->curr_resync) { + status_resync (seq, mddev); + } else { + if (sem_getcount(&mddev->resync_sem) != 1) + seq_printf(seq, " resync=DELAYED"); + } + } + seq_printf(seq, "\n"); + + return 0; +} + + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (pers[pnum]) { + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; +} + +mdp_disk_t *get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} + +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; +void md_sync_acct(kdev_t dev, unsigned long nr_sectors) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + sync_io[major][index] += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + int major = MAJOR(rdev->dev); + int idx = disk_index(rdev->dev); + + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + continue; + + curr_events = kstat.dk_drive_rblk[major][idx] + + kstat.dk_drive_wblk[major][idx] ; + curr_events -= sync_io[major][idx]; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + // stop recovery, signal do_sync .... 
+		if (mddev->pers->stop_resync)
+			mddev->pers->stop_resync(mddev);
+		if (mddev->recovery_running)
+			md_interrupt_thread(md_recovery_thread);
+	}
+}
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+{
+	mddev_t *mddev2;
+	unsigned int max_sectors, currspeed,
+		j, window, err, serialize;
+	unsigned long mark[SYNC_MARKS];
+	unsigned long mark_cnt[SYNC_MARKS];
+	int last_mark,m;
+	struct md_list_head *tmp;
+	unsigned long last_check;
+
+
+	err = down_interruptible(&mddev->resync_sem);
+	if (err)
+		goto out_nolock;
+
+recheck:
+	serialize = 0;
+	ITERATE_MDDEV(mddev2,tmp) {
+		if (mddev2 == mddev)
+			continue;
+		if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
+			printk(KERN_INFO "md: delaying resync of md%d until md%d "
+			       "has finished resync (they share one or more physical units)\n",
+			       mdidx(mddev), mdidx(mddev2));
+			serialize = 1;
+			break;
+		}
+	}
+	if (serialize) {
+		interruptible_sleep_on(&resync_wait);
+		if (md_signal_pending(current)) {
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
+		}
+		goto recheck;
+	}
+
+	mddev->curr_resync = 1;
+
+	max_sectors = mddev->sb->size<<1;
+
+	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+		sysctl_speed_limit_min);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+	       "(but not more than %d KB/sec) for reconstruction.\n",
+	       sysctl_speed_limit_max);
+
+	/*
+	 * Resync has low priority.
+	 */
+	current->nice = 19;
+
+	is_mddev_idle(mddev); /* this also initializes IO event counters */
+	for (m = 0; m < SYNC_MARKS; m++) {
+		mark[m] = jiffies;
+		mark_cnt[m] = 0;
+	}
+	last_mark = 0;
+	mddev->resync_mark = mark[last_mark];
+	mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+	/*
+	 * Tune reconstruction:
+	 */
+	window = vm_max_readahead*(PAGE_SIZE/512);
+	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+	       window/2,max_sectors/2);
+
+	atomic_set(&mddev->recovery_active, 0);
+	init_waitqueue_head(&mddev->recovery_wait);
+	last_check = 0;
+	for (j = 0; j < max_sectors;) {
+		int sectors;
+
+		sectors = mddev->pers->sync_request(mddev, j);
+
+		if (sectors < 0) {
+			err = sectors;
+			goto out;
+		}
+		atomic_add(sectors, &mddev->recovery_active);
+		j += sectors;
+		mddev->curr_resync = j;
+
+		if (last_check + window > j)
+			continue;
+
+		last_check = j;
+
+		run_task_queue(&tq_disk);
+
+	repeat:
+		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+			/* step marks */
+			int next = (last_mark+1) % SYNC_MARKS;
+
+			mddev->resync_mark = mark[next];
+			mddev->resync_mark_cnt = mark_cnt[next];
+			mark[next] = jiffies;
+			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+			last_mark = next;
+		}
+
+
+		if (md_signal_pending(current)) {
+			/*
+			 * got a signal, exit.
+			 */
+			mddev->curr_resync = 0;
+			printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+			md_flush_signals();
+			err = -EINTR;
+			goto out;
+		}
+
+		/*
+		 * this loop exits only when either we are slower than
+		 * the 'hard' speed limit, or the system was IO-idle for
+		 * a jiffy.
+		 * the system might be non-idle CPU-wise, but we only care
+		 * about not overloading the IO subsystem. 
(things like an + * e2fsck being done on the RAID array should execute fast) + */ + if (md_need_resched(current)) + schedule(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + current->nice = 19; + + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(HZ/4); + goto repeat; + } + } else + current->nice = -20; + } + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0); + up(&mddev->resync_sem); +out_nolock: + mddev->curr_resync = 0; + wake_up(&resync_wait); + return err; +} + + +/* + * This is a kernel thread which syncs a spare disk with the active array + * + * the amount of foolproofing might seem to be a tad excessive, but an + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs + * of my root partition with the first 0.5 gigs of my /home partition ... so + * i'm a bit nervous ;) + */ +void md_do_recovery(void *data) +{ + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; + struct md_list_head *tmp; + + printk(KERN_INFO "md: recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) + continue; + if (sb->active_disks == sb->raid_disks) + continue; + if (mddev->sb_dirty) + md_update_sb(mddev); + if (!sb->spare_disks) { + printk(KERN_ERR "md%d: no spare disk to reconstruct array! " + "-- continuing in degraded mode\n", mdidx(mddev)); + continue; + } + /* + * now here we get the spare and resync it. + */ + spare = get_spare(mddev); + if (!spare) + continue; + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", + mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) + continue; + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) + continue; + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = md_do_sync(mddev, spare); + if (err == -EIO) { + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", + mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR || err == -ENOMEM) { + /* + * Recovery got interrupted, or ran out of mem ... + * signal back that we have finished using the array. 
+ */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; + continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); + } + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + goto restart; + } + printk(KERN_INFO "md: recovery thread finished ...\n"); + +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + notifier_call: md_notify_reboot, + next: NULL, + priority: INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_size[i] = 0; + md_hardsect_sizes[i] = 512; + } + blksize_size[MAJOR_NR] = md_blocksizes; + blk_size[MAJOR_NR] = md_size; + max_readahead[MAJOR_NR] = md_maxreadahead; + hardsect_size[MAJOR_NR] = md_hardsect_sizes; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +request_queue_t * md_queue_proc(kdev_t dev) +{ + mddev_t *mddev = kdev_to_mddev(dev); + if (mddev == NULL) + return BLK_DEFAULT_QUEUE(MAJOR_NR); + else + return &mddev->queue; +} + +int md__init md_init(void) +{ + static char * name = "mdrecoveryd"; + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) + { + printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR); + return (-1); + } + devfs_handle = devfs_mk_dir (NULL, "md", NULL); + /* we don't use devfs_register_series because we want to fill md_hd_struct */ + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char devname[128]; + sprintf (devname, "%u", minor); + md_hd_struct[minor].de = devfs_register (devfs_handle, + devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + /* all requests on an uninitialised device get failed... 
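+	 * (md_fail_request). Once an array is running, the md_queue_proc()
+	 * hook above routes requests to that mddev's own queue instead of
+	 * the default one.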
+	 */
+	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+	blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+	read_ahead[MAJOR_NR] = INT_MAX;
+
+	add_gendisk(&md_gendisk);
+
+	md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+	if (!md_recovery_thread)
+		printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+	md_register_reboot_notifier(&md_notifier);
+	raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+	md_geninit();
+	return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) is compiled into the kernel
+ * (not as a module), arrays can be assembled at boot time, either with
+ * AUTODETECT, where specially marked partitions are registered with
+ * md_autodetect_dev(), or with MD_BOOT, where devices to be collected
+ * are given on the boot line with md=.....
+ * The code for that is here.
+ */
+
+struct {
+	int set;
+	int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+	if (dev_cnt >= 0 && dev_cnt < 127)
+		detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+	mdk_rdev_t *rdev;
+	int i;
+
+	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+	for (i = 0; i < dev_cnt; i++) {
+		kdev_t dev = detected_devices[i];
+
+		if (md_import_device(dev,1)) {
+			printk(KERN_ALERT "md: could not import %s!\n",
+				partition_name(dev));
+			continue;
+		}
+		/*
+		 * Sanity checks:
+		 */
+		rdev = find_rdev_all(dev);
+		if (!rdev) {
+			MD_BUG();
+			continue;
+		}
+		if (rdev->faulty) {
+			MD_BUG();
+			continue;
+		}
+		md_list_add(&rdev->pending, &pending_raid_disks);
+	}
+	dev_cnt = 0;
+
+	autorun_devices(-1);
+}
+
+static struct {
+	char device_set [MAX_MD_DEVS];
+	int pers[MAX_MD_DEVS];
+	int chunk[MAX_MD_DEVS];
+	char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ *             assigns the task of parsing integer arguments to the
+ *             invoked program now). Added ability to initialise all
+ *             the MD devices (by specifying multiple "md=" lines)
+ *             instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ *             md=n,0,factor,fault,device-list uses RAID0 for device n
+ *             md=n,-1,factor,fault,device-list uses LINEAR for device n
+ *             md=n,device-list reads a RAID superblock from the devices
+ *             elements in device-list are read by name_to_kdev_t so can be
+ *             a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ *             Shifted name_to_kdev_t() and related operations to md_set_drive()
+ *             for later execution. Rewrote section to make devfs compatible.
+ */
+static int md__init md_setup(char *str)
+{
+	int minor, level, factor, fault;
+	char *pername = "";
+	char *str1 = str;
+
+	if (get_option(&str, &minor) != 2) {	/* MD Number */
+		printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+		return 0;
+	}
+	if (minor >= MAX_MD_DEVS) {
+		printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+		return 0;
+	} else if (md_setup_args.device_names[minor]) {
+		printk(KERN_WARNING "md: md=%d, Specified more than once. 
" + "Replacing previous definition.\n", minor); + } + switch (get_option(&str, &level)) { /* RAID Personality */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == -1) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args.pers[minor] = level; + md_setup_args.chunk[minor] = 1 << (factor+12); + switch(level) { + case -1: + level = LINEAR; + pername = "linear"; + break; + case 0: + level = RAID0; + pername = "raid0"; + break; + default: + printk(KERN_WARNING + "md: The kernel has not been configured for raid%d support!\n", + level); + return 0; + } + md_setup_args.pers[minor] = level; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args.pers[minor] = 0; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args.device_names[minor] = str; + + return 1; +} + +extern kdev_t name_to_kdev_t(char *line) md__init; +void md__init md_setup_drive(void) +{ + int minor, i; + kdev_t dev; + mddev_t*mddev; + kdev_t devices[MD_SB_DISKS+1]; + + for (minor = 0; minor < MAX_MD_DEVS; minor++) { + int err = 0; + char *devname; + mdu_disk_info_t dinfo; + + if ((devname = md_setup_args.device_names[minor]) == 0) continue; + + for (i = 0; i < MD_SB_DISKS && devname != 0; i++) { + + char *p; + void *handle; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_kdev_t(devname); + handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev), + DEVFS_SPECIAL_BLK, 1); + if (handle != 0) { + unsigned major, minor; + devfs_get_maj_min(handle, &major, &minor); + dev = MKDEV(major, minor); + } + if (dev == 0) { + printk(KERN_WARNING "md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + md_setup_args.device_set[minor] = 1; + + devname = p; + } + devices[i] = 0; + + if (md_setup_args.device_set[minor] == 0) + continue; + + if (mddev_map[minor]) { + printk(KERN_WARNING + "md: Ignoring md=%d, already autodetected. 
(Use raid=noautodetect)\n",
+		       minor);
+		continue;
+	}
+	printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+	mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
+	if (!mddev) {
+		printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+		continue;
+	}
+	if (md_setup_args.pers[minor]) {
+		/* non-persistent */
+		mdu_array_info_t ainfo;
+		ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+		ainfo.size = 0;
+		ainfo.nr_disks =0;
+		ainfo.raid_disks =0;
+		ainfo.md_minor =minor;
+		ainfo.not_persistent = 1;
+
+		ainfo.state = (1 << MD_SB_CLEAN);
+		ainfo.active_disks = 0;
+		ainfo.working_disks = 0;
+		ainfo.failed_disks = 0;
+		ainfo.spare_disks = 0;
+		ainfo.layout = 0;
+		ainfo.chunk_size = md_setup_args.chunk[minor];
+		err = set_array_info(mddev, &ainfo);
+		for (i = 0; !err && (dev = devices[i]); i++) {
+			dinfo.number = i;
+			dinfo.raid_disk = i;
+			dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+			dinfo.major = MAJOR(dev);
+			dinfo.minor = MINOR(dev);
+			mddev->sb->nr_disks++;
+			mddev->sb->raid_disks++;
+			mddev->sb->active_disks++;
+			mddev->sb->working_disks++;
+			err = add_new_disk (mddev, &dinfo);
+		}
+	} else {
+		/* persistent */
+		for (i = 0; (dev = devices[i]); i++) {
+			dinfo.major = MAJOR(dev);
+			dinfo.minor = MINOR(dev);
+			add_new_disk (mddev, &dinfo);
+		}
+	}
+	if (!err)
+		err = do_md_run(mddev);
+	if (err) {
+		mddev->sb_dirty = 0;
+		do_md_stop(mddev, 0);
+		printk(KERN_WARNING "md: starting md%d failed\n", minor);
+	}
+	}
+}
+
+static int md__init raid_setup(char *str)
+{
+	int len, pos;
+
+	len = strlen(str) + 1;
+	pos = 0;
+
+	while (pos < len) {
+		char *comma = strchr(str+pos, ',');
+		int wlen;
+		if (comma)
+			wlen = (comma-str)-pos;
+		else	wlen = (len-1)-pos;
+
+		if (strncmp(str, "noautodetect", wlen) == 0)
+			raid_setup_args.noautodetect = 1;
+		pos += wlen+1;
+	}
+	raid_setup_args.set = 1;
+	return 1;
+}
+
+int md__init md_run_setup(void)
+{
+	if (raid_setup_args.noautodetect)
+		printk(KERN_INFO "md: Skipping autodetection of RAID arrays. 
(raid=noautodetect)\n"); + else + autostart_arrays(); + md_setup_drive(); + return 0; +} + +__setup("raid=", raid_setup); +__setup("md=", md_setup); + +__initcall(md_init); +__initcall(md_run_setup); + +#else /* It is a MODULE */ + +int init_module(void) +{ + return md_init(); +} + +static void free_device_names(void) +{ + while (!list_empty(&device_names)) { + struct dname *tmp = list_entry(device_names.next, + dev_name_t, list); + list_del(&tmp->list); + kfree(tmp); + } +} + + +void cleanup_module(void) +{ + md_unregister_thread(md_recovery_thread); + devfs_unregister(devfs_handle); + + devfs_unregister_blkdev(MAJOR_NR,"md"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("mdstat", NULL); +#endif + + del_gendisk(&md_gendisk); + + blk_dev[MAJOR_NR].queue = NULL; + blksize_size[MAJOR_NR] = NULL; + blk_size[MAJOR_NR] = NULL; + max_readahead[MAJOR_NR] = NULL; + hardsect_size[MAJOR_NR] = NULL; + + free_device_names(); + +} +#endif + +MD_EXPORT_SYMBOL(md_size); +MD_EXPORT_SYMBOL(register_md_personality); +MD_EXPORT_SYMBOL(unregister_md_personality); +MD_EXPORT_SYMBOL(partition_name); +MD_EXPORT_SYMBOL(md_error); +MD_EXPORT_SYMBOL(md_do_sync); +MD_EXPORT_SYMBOL(md_sync_acct); +MD_EXPORT_SYMBOL(md_done_sync); +MD_EXPORT_SYMBOL(md_recover_arrays); +MD_EXPORT_SYMBOL(md_register_thread); +MD_EXPORT_SYMBOL(md_unregister_thread); +MD_EXPORT_SYMBOL(md_update_sb); +MD_EXPORT_SYMBOL(md_wakeup_thread); +MD_EXPORT_SYMBOL(md_print_devices); +MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(md_interrupt_thread); +<<<<<<< +MD_EXPORT_SYMBOL(mddev_map); +||||||| +EXPORT_SYMBOL(mddev_map); +======= +>>>>>>> +MODULE_LICENSE("GPL"); diff --git a/tests/linux/md-messy/diff b/tests/linux/md-messy/diff new file mode 100644 index 0000000..35d56b7 --- /dev/null +++ b/tests/linux/md-messy/diff @@ -0,0 +1,93 @@ +@@ -1,90 +1,90 @@ +| return <<<--0-->>><<<++1++>>>; +|<<<--abort:-->>><<<++}++>>> +|<<<-- return-->>><<<++ +|#undef++>>> <<<--1; +|}-->>><<<++OLD_LEVEL++>>> + + static int device_size_calculation(mddev_t * mddev) + { + int data_disks = 0; + unsigned int readahead; + struct list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. 
+ * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size < mddev->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size:" + " %lluk < %dk\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->size, + mddev->chunk_size / 1024); + return -EINVAL; + } + } + + switch (mddev->level) { + case LEVEL_MULTIPATH: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case LEVEL_LINEAR: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = mddev->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = mddev->raid_disks-1; + break; + default: + printk(KERN_ERR "md: md%d: unsupported raid level %d\n", + mdidx(mddev), mddev->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = mddev->size * data_disks; + + readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) { + readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (mddev->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; + abort: + return 1; + } + + static struct gendisk *md_probe(dev_t dev, int *part, void *data) + { + static DECLARE_MUTEX(disks_sem); +|<<<-- -->>> \ No newline at end of file diff --git a/tests/linux/md-messy/new b/tests/linux/md-messy/new new file mode 100644 index 0000000..c9b96f5 --- /dev/null +++ b/tests/linux/md-messy/new @@ -0,0 +1,90 @@ + return 1; +} + +#undef OLD_LEVEL + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0; + unsigned int readahead; + struct list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. 
+ * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size < mddev->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size:" + " %lluk < %dk\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->size, + mddev->chunk_size / 1024); + return -EINVAL; + } + } + + switch (mddev->level) { + case LEVEL_MULTIPATH: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case LEVEL_LINEAR: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = mddev->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = mddev->raid_disks-1; + break; + default: + printk(KERN_ERR "md: md%d: unsupported raid level %d\n", + mdidx(mddev), mddev->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = mddev->size * data_disks; + + readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) { + readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (mddev->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + +static struct gendisk *md_probe(dev_t dev, int *part, void *data) +{ + static DECLARE_MUTEX(disks_sem); diff --git a/tests/linux/md-messy/orig b/tests/linux/md-messy/orig new file mode 100644 index 0000000..252f9de --- /dev/null +++ b/tests/linux/md-messy/orig @@ -0,0 +1,91 @@ + return 0; +abort: + return 1; +} + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0; + unsigned int readahead; + struct list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. 
+	 * (we have to do this after having validated chunk_size,
+	 * because device size has to be modulo chunk_size)
+	 */
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty)
+			continue;
+		if (rdev->size < mddev->chunk_size / 1024) {
+			printk(KERN_WARNING
+				"md: Dev %s smaller than chunk_size:"
+				" %lluk < %dk\n",
+				bdev_partition_name(rdev->bdev),
+				(unsigned long long)rdev->size,
+				mddev->chunk_size / 1024);
+			return -EINVAL;
+		}
+	}
+
+	switch (mddev->level) {
+		case LEVEL_MULTIPATH:
+			data_disks = 1;
+			break;
+		case -3:
+			data_disks = 1;
+			break;
+		case -2:
+			data_disks = 1;
+			break;
+		case LEVEL_LINEAR:
+			zoned_raid_size(mddev);
+			data_disks = 1;
+			break;
+		case 0:
+			zoned_raid_size(mddev);
+			data_disks = mddev->raid_disks;
+			break;
+		case 1:
+			data_disks = 1;
+			break;
+		case 4:
+		case 5:
+			data_disks = mddev->raid_disks-1;
+			break;
+		default:
+			printk(KERN_ERR "md: md%d: unsupported raid level %d\n",
+				mdidx(mddev), mddev->level);
+			goto abort;
+	}
+	if (!md_size[mdidx(mddev)])
+		md_size[mdidx(mddev)] = mddev->size * data_disks;
+
+	readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+	if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
+		readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+		if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+			readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+	} else {
+		// (no multipath branch - it uses the default setting)
+		if (mddev->level == -3)
+			readahead = 0;
+	}
+
+	printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+		mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+	printk(KERN_INFO
+		"md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+		mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+	return 0;
+abort:
+	return 1;
+}
+
+static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+{
+	static DECLARE_MUTEX(disks_sem);
+ 
\ No newline at end of file
diff --git a/tests/linux/md-resync/merge b/tests/linux/md-resync/merge
new file mode 100644
index 0000000..eb8379e
--- /dev/null
+++ b/tests/linux/md-resync/merge
@@ -0,0 +1,1911 @@
+/*
+ * raid1.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/raid/raid1.h>
+#include <asm/atomic.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+#define MAX_WORK_PER_DISK 128
+
+#define NR_RESERVED_BUFS 32
+
+
+/*
+ * The following can be used to debug the driver
+ */
+#define RAID1_DEBUG 0
+
+#if RAID1_DEBUG
+#define PRINTK(x...) printk(x)
+#define inline
+#define __inline__
+#else
+#define PRINTK(x...) 
do { } while (0) +#endif + + +static mdk_personality_t raid1_personality; +static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; +struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; + +static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) +{ + /* return a linked list of "cnt" struct buffer_heads. + * don't take any off the free list unless we know we can + * get all we need, otherwise we could deadlock + */ + struct buffer_head *bh=NULL; + + while(cnt) { + struct buffer_head *t; + md_spin_lock_irq(&conf->device_lock); + if (!conf->freebh_blocked && conf->freebh_cnt >= cnt) + while (cnt) { + t = conf->freebh; + conf->freebh = t->b_next; + t->b_next = bh; + bh = t; + t->b_state = 0; + conf->freebh_cnt--; + cnt--; + } + md_spin_unlock_irq(&conf->device_lock); + if (cnt == 0) + break; + t = kmem_cache_alloc(bh_cachep, SLAB_NOIO); + if (t) { + t->b_next = bh; + bh = t; + cnt--; + } else { + PRINTK("raid1: waiting for %d bh\n", cnt); + conf->freebh_blocked = 1; + wait_disk_event(conf->wait_buffer, + !conf->freebh_blocked || + conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2); + conf->freebh_blocked = 0; + } + } + return bh; +} + +static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) +{ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + while (bh) { + struct buffer_head *t = bh; + bh=bh->b_next; + if (t->b_pprev == NULL) + kmem_cache_free(bh_cachep, t); + else { + t->b_next= conf->freebh; + conf->freebh = t; + conf->freebh_cnt++; + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + wake_up(&conf->wait_buffer); +} + +static int raid1_grow_bh(raid1_conf_t *conf, int cnt) +{ + /* allocate cnt buffer_heads, possibly less if kmalloc fails */ + int i = 0; + + while (i < cnt) { + struct buffer_head *bh; + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); + if (!bh) break; + + md_spin_lock_irq(&conf->device_lock); + bh->b_pprev = &conf->freebh; + bh->b_next = conf->freebh; + conf->freebh = bh; + conf->freebh_cnt++; + md_spin_unlock_irq(&conf->device_lock); + + i++; + } + return i; +} + +static void raid1_shrink_bh(raid1_conf_t *conf) +{ + /* discard all buffer_heads */ + + md_spin_lock_irq(&conf->device_lock); + while (conf->freebh) { + struct buffer_head *bh = conf->freebh; + conf->freebh = bh->b_next; + kmem_cache_free(bh_cachep, bh); + conf->freebh_cnt--; + } + md_spin_unlock_irq(&conf->device_lock); +} + + +static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh = NULL; + + do { + md_spin_lock_irq(&conf->device_lock); + if (!conf->freer1_blocked && conf->freer1) { + r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + conf->freer1_cnt--; + r1_bh->next_r1 = NULL; + r1_bh->state = (1 << R1BH_PreAlloc); + r1_bh->bh_req.b_state = 0; + } + md_spin_unlock_irq(&conf->device_lock); + if (r1_bh) + return r1_bh; + r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO); + if (r1_bh) { + memset(r1_bh, 0, sizeof(*r1_bh)); + return r1_bh; + } + conf->freer1_blocked = 1; + wait_disk_event(conf->wait_buffer, + !conf->freer1_blocked || + conf->freer1_cnt > NR_RESERVED_BUFS/2 + ); + conf->freer1_blocked = 0; + } while (1); +} + +static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) +{ + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + + r1_bh->mirror_bh_list = NULL; + + if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + r1_bh->next_r1 = 
conf->freer1; + conf->freer1 = r1_bh; + conf->freer1_cnt++; + spin_unlock_irqrestore(&conf->device_lock, flags); + /* don't need to wakeup wait_buffer because + * raid1_free_bh below will do that + */ + } else { + kfree(r1_bh); + } + raid1_free_bh(conf, bh); +} + +static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + while (i < cnt) { + struct raid1_bh *r1_bh; + r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) + break; + memset(r1_bh, 0, sizeof(*r1_bh)); + set_bit(R1BH_PreAlloc, &r1_bh->state); + r1_bh->mddev = conf->mddev; + + raid1_free_r1bh(r1_bh); + i++; + } + return i; +} + +static void raid1_shrink_r1bh(raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freer1) { + struct raid1_bh *r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + conf->freer1_cnt--; + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + + + +static inline void raid1_free_buf(struct raid1_bh *r1_bh) +{ + unsigned long flags; + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + r1_bh->mirror_bh_list = NULL; + + spin_lock_irqsave(&conf->device_lock, flags); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + spin_unlock_irqrestore(&conf->device_lock, flags); + raid1_free_bh(conf, bh); +} + +static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh; + + md_spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); + r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + r1_bh->next_r1= NULL; + md_spin_unlock_irq(&conf->device_lock); + + return r1_bh; +} + +static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) +{ + int i = 0; + struct raid1_bh *head = NULL, **tail; + tail = &head; + + while (i < cnt) { + struct raid1_bh *r1_bh; + struct page *page; + + page = alloc_page(GFP_KERNEL); + if (!page) + break; + + r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) { + __free_page(page); + break; + } + memset(r1_bh, 0, sizeof(*r1_bh)); + r1_bh->bh_req.b_page = page; + r1_bh->bh_req.b_data = page_address(page); + *tail = r1_bh; + r1_bh->next_r1 = NULL; + tail = & r1_bh->next_r1; + i++; + } + /* this lock probably isn't needed, as at the time when + * we are allocating buffers, nobody else will be touching the + * freebuf list. But it doesn't hurt.... + */ + md_spin_lock_irq(&conf->device_lock); + *tail = conf->freebuf; + conf->freebuf = head; + md_spin_unlock_irq(&conf->device_lock); + return i; +} + +static void raid1_shrink_buffers (raid1_conf_t *conf) +{ + struct raid1_bh *head; + md_spin_lock_irq(&conf->device_lock); + head = conf->freebuf; + conf->freebuf = NULL; + md_spin_unlock_irq(&conf->device_lock); + + while (head) { + struct raid1_bh *r1_bh = head; + head = r1_bh->next_r1; + __free_page(r1_bh->bh_req.b_page); + kfree(r1_bh); + } +} + +static int raid1_map (mddev_t *mddev, kdev_t *rdev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i, disks = MD_SB_DISKS; + + /* + * Later we do read balancing on the read side + * now we use the first available disk. 
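+	 * (So, for example, with mirror 0 failed and mirror 1 operational,
+	 * *rdev simply becomes mirror 1's device.)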
+	 */
+
+	for (i = 0; i < disks; i++) {
+		if (conf->mirrors[i].operational) {
+			*rdev = conf->mirrors[i].dev;
+			return (0);
+		}
+	}
+
+	printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
+	return (-1);
+}
+
+static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
+{
+	unsigned long flags;
+	mddev_t *mddev = r1_bh->mddev;
+	raid1_conf_t *conf = mddev_to_conf(mddev);
+
+	md_spin_lock_irqsave(&retry_list_lock, flags);
+	if (raid1_retry_list == NULL)
+		raid1_retry_tail = &raid1_retry_list;
+	*raid1_retry_tail = r1_bh;
+	raid1_retry_tail = &r1_bh->next_r1;
+	r1_bh->next_r1 = NULL;
+	md_spin_unlock_irqrestore(&retry_list_lock, flags);
+	md_wakeup_thread(conf->thread);
+}
+
+
+static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->segment_lock, flags);
+	if (sector < conf->start_active)
+		conf->cnt_done--;
+	else if (sector >= conf->start_future && conf->phase == phase)
+		conf->cnt_future--;
+	else if (!--conf->cnt_pending)
+		wake_up(&conf->wait_ready);
+
+	spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->segment_lock, flags);
+	if (sector >= conf->start_ready)
+		--conf->cnt_ready;
+	else if (sector >= conf->start_active) {
+		if (!--conf->cnt_active) {
+			conf->start_active = conf->start_ready;
+			wake_up(&conf->wait_done);
+		}
+	}
+	spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+/*
+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
+{
+	struct buffer_head *bh = r1_bh->master_bh;
+
+	io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
+			test_bit(R1BH_SyncPhase, &r1_bh->state));
+
+	bh->b_end_io(bh, uptodate);
+	raid1_free_r1bh(r1_bh);
+}
+void raid1_end_request (struct buffer_head *bh, int uptodate)
+{
+	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+	/*
+	 * this branch is our 'one mirror IO has finished' event handler:
+	 */
+	if (!uptodate)
+		md_error (r1_bh->mddev, bh->b_dev);
+	else
+		/*
+		 * Set R1BH_Uptodate in our master buffer_head, so that
+		 * we will return a good error code to the higher
+		 * levels even if IO on some other mirrored buffer fails.
+		 *
+		 * The 'master' represents the complex operation to
+		 * user-side. So if something waits for IO, then it will
+		 * wait for the 'master' buffer_head.
+		 */
+		set_bit (R1BH_Uptodate, &r1_bh->state);
+
+	/*
+	 * We split up the read and write side, imho they are
+	 * conceptually different.
+	 */
+
+	if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
+		/*
+		 * we have only one buffer_head on the read side
+		 */
+
+		if (uptodate) {
+			raid1_end_bh_io(r1_bh, uptodate);
+			return;
+		}
+		/*
+		 * oops, read error:
+		 */
+		printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
+			partition_name(bh->b_dev), bh->b_blocknr);
+		raid1_reschedule_retry(r1_bh);
+		return;
+	}
+
+	/*
+	 * WRITE:
+	 *
+	 * Let's see if all mirrored write operations have finished
+	 * already.
+	 */
+
+	if (atomic_dec_and_test(&r1_bh->remaining))
+		raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
+}
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. It bookkeeps the last read position for every disk
It bookkeeps the last read position for every disk + * in array and when new read requests come, the disk which last + * position is nearest to the request, is chosen. + * + * TODO: now if there are 2 mirrors in the same 2 devices, performance + * degrades dramatically because position is mirror, not device based. + * This should be changed to be device based. Also atomic sequential + * reads should be somehow balanced. + */ + +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) +{ + int new_disk = conf->last_used; + const int sectors = bh->b_size >> 9; + const unsigned long this_sector = bh->b_rsector; + int disk = new_disk; + unsigned long new_distance; + unsigned long current_distance; + + /* + * Check if it is sane at all to balance + */ + + if (!conf->mddev->in_sync) + goto rb_out; + + + /* make sure that disk is operational */ + while( !conf->mirrors[new_disk].operational) { + if (new_disk <= 0) new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) { + /* + * This means no working disk was found + * Nothing much to do, lets not change anything + * and hope for the best... + */ + + new_disk = conf->last_used; + + goto rb_out; + } + } + disk = new_disk; + /* now disk == new_disk == starting point for search */ + + /* + * Don't touch anything for sequential reads. + */ + + if (this_sector == conf->mirrors[new_disk].head_position) + goto rb_out; + + /* + * If reads have been done only on a single disk + * for a time, lets give another disk a change. + * This is for kicking those idling disks so that + * they would find work near some hotspot. + */ + + if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { + conf->sect_count = 0; + +#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92) + /* Work around a compiler bug in egcs-2.92.11 19980921 */ + new_disk = *(volatile int *)&new_disk; +#endif + do { + if (new_disk<=0) + new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) + break; + } while ((conf->mirrors[new_disk].write_only) || + (!conf->mirrors[new_disk].operational)); + + goto rb_out; + } + + current_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + /* Find the disk which is closest */ + + do { + if (disk <= 0) + disk = conf->raid_disks; + disk--; + + if ((conf->mirrors[disk].write_only) || + (!conf->mirrors[disk].operational)) + continue; + + new_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + if (new_distance < current_distance) { + conf->sect_count = 0; + current_distance = new_distance; + new_disk = disk; + } + } while (disk != conf->last_used); + +rb_out: + conf->mirrors[new_disk].head_position = this_sector + sectors; + + conf->last_used = new_disk; + conf->sect_count += sectors; + + return new_disk; +} + +static int raid1_make_request (request_queue_t *q, + struct buffer_head * bh) +{ + mddev_t *mddev = q->queuedata; + raid1_conf_t *conf = mddev_to_conf(mddev); + struct buffer_head *bh_req, *bhl; + struct raid1_bh * r1_bh; + int disks = MD_SB_DISKS; + int i, sum_bhs = 0; + struct mirror_info *mirror; + + if (!buffer_locked(bh)) + BUG(); + +/* + * make_request() can abort the operation when READA is being + * used and no empty request is available. + * + * Currently, just replace the command with READ/WRITE. 
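
raid1_read_balance above picks the mirror whose recorded head position is nearest the requested sector, keeping the previously used disk when nothing better turns up. A freestanding sketch of the core selection rule (hypothetical names; the real function also honours write_only, the sect_limit streak-breaking, and the resync state):

    #include <stdio.h>
    #include <stdlib.h>

    /* One entry per mirror: where the last I/O left the disk head. */
    struct mirror {
            unsigned long head_position;
            int operational;
    };

    /* Pick the operational mirror whose head is closest to 'sector';
     * 'last' is the previously used mirror and the search start point. */
    static int read_balance(struct mirror *m, int nmirrors, int last,
                            unsigned long sector)
    {
            int disk = last, best = last;
            unsigned long best_dist =
                    labs((long)(sector - m[last].head_position));

            do {
                    unsigned long d;
                    if (disk <= 0)
                            disk = nmirrors;
                    disk--;
                    if (!m[disk].operational)
                            continue;
                    d = labs((long)(sector - m[disk].head_position));
                    if (d < best_dist) {
                            best_dist = d;
                            best = disk;
                    }
            } while (disk != last);

            m[best].head_position = sector; /* caller would add the I/O size */
            return best;
    }

    int main(void)
    {
            struct mirror m[2] = { { 1000, 1 }, { 5000, 1 } };
            printf("%d\n", read_balance(m, 2, 0, 4900)); /* 1: closest head */
            return 0;
    }
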
+ */ + r1_bh = raid1_alloc_r1bh (conf); + + spin_lock_irq(&conf->segment_lock); + wait_event_lock_irq(conf->wait_done, + bh->b_rsector < conf->start_active || + bh->b_rsector >= conf->start_future, + conf->segment_lock); + if (bh->b_rsector < conf->start_active) + conf->cnt_done++; + else { + conf->cnt_future++; + if (conf->phase) + set_bit(R1BH_SyncPhase, &r1_bh->state); + } + spin_unlock_irq(&conf->segment_lock); + + /* + * i think the read and write branch should be separated completely, + * since we want to do read balancing on the read side for example. + * Alternative implementations? :) --mingo + */ + + r1_bh->master_bh = bh; + r1_bh->mddev = mddev; + r1_bh->cmd = rw; + + if (rw == READ) { + /* + * read balancing logic: + */ + mirror = conf->mirrors + raid1_read_balance(conf, bh); + + bh_req = &r1_bh->bh_req; + memcpy(bh_req, bh, sizeof(*bh)); + bh_req->b_blocknr = bh->b_rsector; + bh_req->b_dev = mirror->dev; + bh_req->b_rdev = mirror->dev; + /* bh_req->b_rsector = bh->n_rsector; */ + bh_req->b_end_io = raid1_end_request; + bh_req->b_private = r1_bh; + generic_make_request (rw, bh_req); + return 0; + } + + /* + * WRITE: + */ + + bhl = raid1_alloc_bh(conf, conf->raid_disks); + for (i = 0; i < disks; i++) { + struct buffer_head *mbh; + if (!conf->mirrors[i].operational) + continue; + + /* + * We should use a private pool (size depending on NR_REQUEST), + * to avoid writes filling up the memory with bhs + * + * Such pools are much faster than kmalloc anyways (so we waste + * almost nothing by not using the master bh when writing and + * win a lot of cleanness) but for now we are cool enough. --mingo + * + * It's safe to sleep here, buffer heads cannot be used in a shared + * manner in the write branch. Look how we lock the buffer at the + * beginning of this function to grok the difference ;) + */ + mbh = bhl; + if (mbh == NULL) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_next = NULL; + mbh->b_this_page = (struct buffer_head *)1; + + /* + * prepare mirrored mbh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_rsector; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_rsector; + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | (1<<BH_Mapped) | (1<<BH_Lock); + atomic_set(&mbh->b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = raid1_end_request; + mbh->b_private = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; + sum_bhs++; + } + if (bhl) raid1_free_bh(conf,bhl); + if (!sum_bhs) { + /* Gag - all mirrors non-operational.. */ + raid1_end_bh_io(r1_bh, 0); + return 0; + } + md_atomic_set(&r1_bh->remaining, sum_bhs); + + /* + * We have to be a bit careful about the semaphore above, that's + * why we start the requests separately. Since kmalloc() could + * fail, sleep and make_request() can sleep too, this is the + * safer solution. Imagine, end_request decreasing the semaphore + * before we could have set it up ... We could play tricks with + * the semaphore (presetting it and correcting at the end if + * sum_bhs is not 'n' but we have to do end_request by hand if + * all requests finish until we had a chance to set up the + * semaphore correctly ... lots of races).
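
The write path above fans one request out to every operational mirror and lets whichever completion drops 'remaining' to zero finish the master request; atomic_dec_and_test guarantees exactly one caller sees the transition. A small sketch of the same completion pattern using C11 atomics (hypothetical names; user-space threads stand in for the end_io callbacks):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define NMIRRORS 3

    static atomic_int remaining;

    /* Completion handler: runs once per mirror write.  Exactly one
     * caller, the one that brings the count to zero, completes the
     * master request, as in raid1_end_request(). */
    static void *mirror_write_done(void *arg)
    {
            (void)arg;
            if (atomic_fetch_sub(&remaining, 1) == 1)   /* dec-and-test */
                    printf("all %d mirror writes done; completing master\n",
                           NMIRRORS);
            return NULL;
    }

    int main(void)
    {
            pthread_t t[NMIRRORS];
            int i;

            atomic_store(&remaining, NMIRRORS); /* md_atomic_set() analogue */
            for (i = 0; i < NMIRRORS; i++)
                    pthread_create(&t[i], NULL, mirror_write_done, NULL);
            for (i = 0; i < NMIRRORS; i++)
                    pthread_join(t[i], NULL);
            return 0;
    }

Setting the counter only after every sub-request has been built, and issuing them afterwards, is what closes the race the comment above describes: a completion can never see a half-initialised count.
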
+ */ + bh = r1_bh->mirror_bh_list; + while(bh) { + struct buffer_head *bh2 = bh; + bh = bh->b_next; + generic_make_request(rw, bh2); + } + return (0); +} + +static void raid1_status(struct seq_file *seq, mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i; + + seq_printf(seq, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + seq_printf(seq, "%s", + conf->mirrors[i].operational ? "U" : "_"); + seq_printf(seq, "]"); +} + +#define LAST_DISK KERN_ALERT \ +"raid1: only one disk left and IO error.\n" + +#define NO_SPARE_DISK KERN_ALERT \ +"raid1: no spare disk left, degrading mirror level by one.\n" + +#define DISK_FAILED KERN_ALERT \ +"raid1: Disk failure on %s, disabling device. \n" \ +" Operation continuing on %d devices\n" + +#define START_SYNCING KERN_ALERT \ +"raid1: start syncing spare disk.\n" + +#define ALREADY_SYNCING KERN_INFO \ +"raid1: syncing already in progress.\n" + +static void mark_disk_bad (mddev_t *mddev, int failed) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + if (!mirror->write_only) + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + if (!mirror->write_only) + conf->working_disks--; + printk (DISK_FAILED, partition_name (mirror->dev), + conf->working_disks); +} + +static int raid1_error (mddev_t *mddev, kdev_t dev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; + + /* Find the drive. + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know. + * else mark the drive as failed + */ + + for (i = 0; i < disks; i++) + if (mirrors[i].dev==dev && mirrors[i].operational) + break; + if (i == disks) + return 0; + + if (i < conf->raid_disks && conf->working_disks == 1) { + /* Don't fail the drive, act as though we were just a + * normal single drive + */ + + return 1; + } + mark_disk_bad(mddev, i); + return 0; +} + +#undef LAST_DISK +#undef NO_SPARE_DISK +#undef DISK_FAILED +#undef START_SYNCING + + +static void print_raid1_conf (raid1_conf_t *conf) +{ + int i; + struct mirror_info *tmp; + + printk("RAID1 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, + conf->raid_disks, conf->nr_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static void close_sync(raid1_conf_t *conf) +{ + mddev_t *mddev = conf->mddev; + /* If reconstruction was interrupted, we need to close the "active" and "pending" + * holes. + * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 + */ + /* this is really needed when recovery stops too... 
*/ + spin_lock_irq(&conf->segment_lock); + conf->start_active = conf->start_pending; + conf->start_ready = conf->start_pending; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; + conf->start_future = (mddev->sb->size<<1)+1; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + conf->phase = conf->phase ^1; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; + conf->phase = 0; + conf->cnt_future = conf->cnt_done;; + conf->cnt_done = 0; + spin_unlock_irq(&conf->segment_lock); + wake_up(&conf->wait_done); + + mempool_destroy(conf->r1buf_pool); + conf->r1buf_pool = NULL; +} + +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + mdk_rdev_t *spare_rdev, *failed_rdev; + + print_raid1_conf(conf); + + switch (state) { + case DISKOP_SPARE_ACTIVE: + case DISKOP_SPARE_INACTIVE: + /* need to wait for pending sync io before locking device */ + close_sync(conf); + } + + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ + switch (state) { + + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... 
(can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: +<<<<<<< + if (conf->start_future > 0) { + MD_BUG(); +||||||| + close_sync(conf); +======= +>>>>>>> + err = -EBUSY; + break; + } + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: +<<<<<<< + if (conf->start_future > 0) { + MD_BUG(); +||||||| + close_sync(conf); +======= +>>>>>>> + err = -EBUSY; + break; + } + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + spare_rdev = find_rdev_nr(mddev, spare_desc->number); + failed_rdev = find_rdev_nr(mddev, failed_desc->number); + + /* There must be a spare_rdev, but there may not be a + * failed_rdev. That slot might be empty... + */ + spare_rdev->desc_nr = failed_desc->number; + if (failed_rdev) + failed_rdev->desc_nr = spare_desc->number; + + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. 
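
The activation path above swaps the whole superblock descriptors and mirror_info records, then swaps the identity fields (number, raid_disk) back, so only the payload trades places. A sketch of the idiom; the macro body is an assumption about how md of this era defines xchg_values, not something quoted from this patch:

    #include <stdio.h>

    /* Plausible definition of the swap macro (assumption, see above). */
    #define xchg_values(x, y) do { __typeof__(x) __tmp = (x); \
                                   (x) = (y); (y) = __tmp; } while (0)

    struct desc {
            int number;             /* identity: stays with the slot   */
            int raid_disk;          /* identity: stays with the slot   */
            int operational;        /* payload: actually trades places */
    };

    int main(void)
    {
            struct desc spare  = { 5, 5, 1 };
            struct desc failed = { 1, 1, 0 };

            xchg_values(spare, failed);                     /* swap records */
            xchg_values(spare.raid_disk, failed.raid_disk); /* restore slot */
            xchg_values(spare.number, failed.number);       /* numbering    */

            printf("spare={n%d rd%d op%d} failed={n%d rd%d op%d}\n",
                   spare.number, spare.raid_disk, spare.operational,
                   failed.number, failed.raid_disk, failed.operational);
            /* -> spare={n5 rd5 op0} failed={n1 rd1 op1} */
            return 0;
    }

Only the non-identity field has moved between the two records, which is exactly the "switch back these values" step the comment above explains.
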
+ */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. + */ + + conf->working_disks++; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->mirrors + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + adisk->head_position = 0; + conf->nr_disks++; + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); +<<<<<<< + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) + /* should move to "END_REBUILD" when such exists */ + raid1_shrink_buffers(conf); + + print_raid1_conf(conf); +||||||| + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) { + mempool_destroy(conf->r1buf_pool); + conf->r1buf_pool = NULL; + } +======= +>>>>>>> + return err; +} + + +#define IO_ERROR KERN_ALERT \ +"raid1: %s: unrecoverable I/O read error for block %lu\n" + +#define REDIRECT_SECTOR KERN_ERR \ +"raid1: %s: redirecting sector %lu to another mirror\n" + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems are encountered. + * 3. Performs writes following reads for array synchronising.
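
The daemon described above pops work off the shared retry list while holding the spinlock, but does the actual handling with the lock dropped. A user-space sketch of that drain-and-dispatch shape (a pthread mutex stands in for retry_list_lock; names hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    enum cmd { CMD_READ, CMD_SPECIAL };

    struct req {
            enum cmd cmd;
            struct req *next;
    };

    static pthread_mutex_t retry_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct req *retry_list;

    /* One daemon pass: pop under the lock, handle with the lock
     * dropped, the way raid1d holds retry_list_lock only around the
     * list surgery itself. */
    static void daemon_pass(void)
    {
            for (;;) {
                    struct req *r;

                    pthread_mutex_lock(&retry_lock);
                    r = retry_list;
                    if (r)
                            retry_list = r->next;
                    pthread_mutex_unlock(&retry_lock);
                    if (!r)
                            break;

                    switch (r->cmd) {       /* dispatch outside the lock */
                    case CMD_SPECIAL:
                            printf("resync: schedule mirror write-out\n");
                            break;
                    case CMD_READ:
                            printf("read error: retry on another mirror\n");
                            break;
                    }
            }
    }

    int main(void)
    {
            struct req a = { CMD_READ, NULL };
            struct req b = { CMD_SPECIAL, &a };

            retry_list = &b;
            daemon_pass();
            return 0;
    }
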
+ */ +static void end_sync_write(struct buffer_head *bh, int uptodate); +static void end_sync_read(struct buffer_head *bh, int uptodate); + +static void raid1d (void *data) +{ + struct raid1_bh *r1_bh; + struct buffer_head *bh; + unsigned long flags; + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + kdev_t dev; + + if (mddev->sb_dirty) + md_update_sb(mddev); + + for (;;) { + md_spin_lock_irqsave(&retry_list_lock, flags); + r1_bh = raid1_retry_list; + if (!r1_bh) + break; + raid1_retry_list = r1_bh->next_r1; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + + mddev = r1_bh->mddev; + bh = &r1_bh->bh_req; + switch(r1_bh->cmd) { + case SPECIAL: + /* have to allocate lots of bh structures and + * schedule writes + */ + if (test_bit(R1BH_Uptodate, &r1_bh->state)) { + int i, sum_bhs = 0; + int disks = MD_SB_DISKS; + struct buffer_head *bhl, *mbh; + + conf = mddev_to_conf(mddev); + bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ + for (i = 0; i < disks ; i++) { + if (!conf->mirrors[i].operational) + continue; + if (i==conf->last_used) + /* we read from here, no need to write */ + continue; + if (i < conf->raid_disks + && mddev->in_sync) + /* don't need to write this, + * we are just rebuilding */ + continue; + mbh = bhl; + if (!mbh) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_this_page = (struct buffer_head *)1; + + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_blocknr; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_blocknr; + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | (1<<BH_Mapped) | (1<<BH_Lock); + atomic_set(&mbh->b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = end_sync_write; + mbh->b_private = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; + + sum_bhs++; + } + md_atomic_set(&r1_bh->remaining, sum_bhs); + if (bhl) raid1_free_bh(conf, bhl); + mbh = r1_bh->mirror_bh_list; + + if (!sum_bhs) { + /* nowhere to write this to...
I guess we + * must be done + */ + sync_request_done(bh->b_blocknr, conf); + md_done_sync(mddev, bh->b_size>>9, 0); + raid1_free_buf(r1_bh); + } else + while (mbh) { + struct buffer_head *bh1 = mbh; + mbh = mbh->b_next; + generic_make_request(WRITE, bh1); + md_sync_acct(bh1->b_dev, bh1->b_size/512); + } + } else { + /* There is no point trying a read-for-reconstruct + * as reconstruct is about to be aborted + */ + + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + md_done_sync(mddev, bh->b_size>>9, 0); + } + + break; + case READ: + case READA: + dev = bh->b_dev; + raid1_map (mddev, &bh->b_dev); + if (bh->b_dev == dev) { + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + raid1_end_bh_io(r1_bh, 0); + } else { + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + bh->b_rdev = bh->b_dev; + bh->b_rsector = bh->b_blocknr; + generic_make_request (r1_bh->cmd, bh); + } + break; + } + } + md_spin_unlock_irqrestore(&retry_list_lock, flags); +} +#undef IO_ERROR +#undef REDIRECT_SECTOR + + +static int init_resync (conf_t *conf) +{ +*** 1144,16 **** 8 +<<<<<<< + raid1_conf_t *conf = data; +||||||| + conf_t *conf = data; +======= + sector_t max_sector, nr_sectors; +>>>>>>> + int disk, partial; + + if (sector_nr == 0) + if (init_resync(conf)) + return -ENOMEM; + +<<<<<<< +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * This is achieved by conceptually dividing the device space into a + * number of sections: + * DONE: 0 .. a-1 These blocks are in-sync + * ACTIVE: a.. b-1 These blocks may have active sync requests, but + * no normal IO requests + * READY: b .. c-1 These blocks have no normal IO requests - sync + * request may be happening + * PENDING: c .. d-1 These blocks may have IO requests, but no new + * ones will be added + * FUTURE: d .. end These blocks are not to be considered yet. IO may + * be happening, but not sync + * + * We keep a + * phase which flips (0 or 1) each time d moves and + * a count of: + * z = active io requests in FUTURE since d moved - marked with + * current phase + * y = active io requests in FUTURE before d moved, or PENDING - + * marked with previous phase + * x = active sync requests in READY + * w = active sync requests in ACTIVE + * v = active io requests in DONE + * + * Normally, a=b=c=d=0 and z= active io requests + * or a=b=c=d=END and v= active io requests + * Allowed changes to a,b,c,d: + * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase + * B: y==0 -> c=d + * C: b=c, w+=x, x=0 + * D: w==0 -> a=b + * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 + * + * At start of sync we apply A. + * When y reaches 0, we apply B then A then begin sync requests + * When sync point reaches c-1, we wait for y==0, and w==0, and + * then apply B then A then D then C.
+ * Finally, we apply E + * + * The sync request simply issues a "read" against a working drive + * This is marked so that on completion the raid1d thread is woken to + * issue suitable write requests + */ + +static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror; + struct raid1_bh *r1_bh; + struct buffer_head *bh; + int bsize; + int disk; + int block_nr; + int buffs; + + if (!sector_nr) { + /* we want enough buffers to hold twice the window of 128*/ + buffs = 128 *2 / (PAGE_SIZE>>9); + buffs = raid1_grow_buffers(conf, buffs); + if (buffs < 2) + goto nomem; + conf->window = buffs*(PAGE_SIZE>>9)/2; + } + spin_lock_irq(&conf->segment_lock); + if (!sector_nr) { + /* initialize ...*/ + conf->start_active = 0; + conf->start_ready = 0; + conf->start_pending = 0; + conf->start_future = 0; + conf->phase = 0; + + conf->cnt_future += conf->cnt_done+conf->cnt_pending; + conf->cnt_done = conf->cnt_pending = 0; + if (conf->cnt_ready || conf->cnt_active) + MD_BUG(); + } + while (sector_nr >= conf->start_pending) { + PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", + sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future, + conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future); + wait_event_lock_irq(conf->wait_done, + !conf->cnt_active, + conf->segment_lock); + wait_event_lock_irq(conf->wait_ready, + !conf->cnt_pending, + conf->segment_lock); + conf->start_active = conf->start_ready; + conf->start_ready = conf->start_pending; + conf->start_pending = conf->start_future; + conf->start_future = conf->start_future+conf->window; + // Note: falling off the end is not a problem + conf->phase = conf->phase ^1; + conf->cnt_active = conf->cnt_ready; + conf->cnt_ready = 0; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + wake_up(&conf->wait_done); + } + conf->cnt_ready++; + spin_unlock_irq(&conf->segment_lock); + + + /* If reconstructing, and >1 working disc, + * could dedicate one to rebuild and others to + * service read requests .. 
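
The state machine described in the comment above advances the window one region at a time: each step promotes READY to ACTIVE, PENDING to READY, FUTURE to PENDING, extends FUTURE, flips the phase and reclassifies the in-flight counts. A toy, single-threaded model of that advance (an assumption-level sketch: the real code does this under segment_lock and sleeps until cnt_active and cnt_pending reach zero rather than asserting it):

    #include <stdio.h>

    /* Toy model of the resync window bookkeeping. */
    struct window {
            unsigned long start_active, start_ready,
                          start_pending, start_future;
            int phase;
            int cnt_active, cnt_ready, cnt_pending, cnt_future;
            unsigned long size;     /* sectors gained per advance */
    };

    /* One advance: ACTIVE <- READY <- PENDING <- FUTURE, extend FUTURE,
     * flip the phase, and reclassify the in-flight counts (rules A-D). */
    static void advance(struct window *w)
    {
            w->start_active  = w->start_ready;
            w->start_ready   = w->start_pending;
            w->start_pending = w->start_future;
            w->start_future += w->size;
            w->phase ^= 1;
            w->cnt_active  = w->cnt_ready;   w->cnt_ready  = 0;
            w->cnt_pending = w->cnt_future;  w->cnt_future = 0;
    }

    int main(void)
    {
            struct window w = { .size = 128 };
            int i;

            for (i = 0; i < 3; i++) {
                    advance(&w);
                    printf("a=%lu b=%lu c=%lu d=%lu phase=%d\n",
                           w.start_active, w.start_ready,
                           w.start_pending, w.start_future, w.phase);
            }
            return 0;
    }

Three advances print a=0 b=0 c=0 d=128, then a=0 b=0 c=128 d=256, then a=0 b=128 c=256 d=384: the regions only start to spread out once the window has moved a few times, just as the boundaries a, b, c, d do in the comment's notation.
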
+ */ + disk = conf->last_used; + /* make sure disk is operational */ + while (!conf->mirrors[disk].operational) { + if (disk <= 0) disk = conf->raid_disks; + disk--; + if (disk == conf->last_used) + break; + } + conf->last_used = disk; + + mirror = conf->mirrors+conf->last_used; + + r1_bh = raid1_alloc_buf (conf); + r1_bh->master_bh = NULL; + r1_bh->mddev = mddev; + r1_bh->cmd = SPECIAL; + bh = &r1_bh->bh_req; + + block_nr = sector_nr; + bsize = 512; + while (!(block_nr & 1) && bsize < PAGE_SIZE + && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) { + block_nr >>= 1; + bsize <<= 1; + } + bh->b_size = bsize; + bh->b_list = BUF_LOCKED; + bh->b_dev = mirror->dev; + bh->b_rdev = mirror->dev; + bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock); + if (!bh->b_page) + BUG(); + if (!bh->b_data) + BUG(); + if (bh->b_data != page_address(bh->b_page)) + BUG(); + bh->b_end_io = end_sync_read; + bh->b_private = r1_bh; + bh->b_blocknr = sector_nr; + bh->b_rsector = sector_nr; + init_waitqueue_head(&bh->b_wait); + + generic_make_request(READ, bh); + md_sync_acct(bh->b_dev, bh->b_size/512); + + return (bsize >> 9); +||||||| +static int init_resync(conf_t *conf) +{ +*** 1170,9 **** 8 + sector_t max_sector, nr_sectors; + int disk, partial; +======= + max_sector = mddev->sb->size << 1; +>>>>>>> +nomem: +<<<<<<< + raid1_shrink_buffers(conf); + return -ENOMEM; +} + +static void end_sync_read(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + /* we have read a block, now it needs to be re-written, + * or re-read if the read failed. + * We don't do much here, just schedule handling by raid1d + */ + if (!uptodate) + md_error (r1_bh->mddev, bh->b_dev); + else + set_bit(R1BH_Uptodate, &r1_bh->state); + raid1_reschedule_retry(r1_bh); +} + +static void end_sync_write(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + if (!uptodate) + md_error (r1_bh->mddev, bh->b_dev); + if (atomic_dec_and_test(&r1_bh->remaining)) { + mddev_t *mddev = r1_bh->mddev; + unsigned long sect = bh->b_blocknr; + int size = bh->b_size; + raid1_free_buf(r1_bh); + sync_request_done(sect, mddev_to_conf(mddev)); + md_done_sync(mddev,size>>9, uptodate); +||||||| + if (!sector_nr) + if (init_resync(conf)) + return -ENOMEM; + /* + * If there is non-resync activity waiting for us then + * put in a delay to throttle resync.
+*** 1190,6 **** 9 + r1_bio->sector = sector_nr; + r1_bio->cmd = SPECIAL; + +>>>>>>> + } +} + +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) +{ + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + MOD_INC_USE_COUNT; + + if (sb->level != 1) { + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. [whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] + */ + + conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; + } + memset(conf, 0, sizeof(*conf)); + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); + continue; + } + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + continue; + } + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); + continue; + } + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); + continue; + } + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); + continue; + } + printk(OPERATIONAL, partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + conf->working_disks++; + } else { + /* + * Must be a spare disk .. 
+ */ + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + disk->head_position = 0; + } + } + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + + conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_buffer); + init_waitqueue_head(&conf->wait_done); + init_waitqueue_head(&conf->wait_ready); + + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + + /* pre-allocate some buffer_head structures. + * As a minimum, 1 r1bh and raid_disks buffer_heads + * would probably get us by in tight memory situations, + * but a few more is probably a good idea. + * For now, try NR_RESERVED_BUFS r1bh and + * NR_RESERVED_BUFS*raid_disks bufferheads + * This will allow at least NR_RESERVED_BUFS concurrent + * reads or writes even if kmalloc starts failing + */ + if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS || + raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks) + < NR_RESERVED_BUFS*conf->raid_disks) { + printk(MEM_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++) + /* nothing */; + conf->last_used = j; + + + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + +<<<<<<< + (conf->working_disks > 1)) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); +||||||| + (conf->working_disks > 1)) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf, name); +======= +>>>>>>> + + /* + * Regenerate the "device is in sync with the raid set" bit for + * each device. 
+ */ + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); + for (j = 0; j < sb->raid_disks; j++) { + if (!conf->mirrors[j].operational) + continue; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); + } + } + sb->active_disks = conf->working_disks; + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef ARRAY_IS_ACTIVE + +<<<<<<< +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + md_interrupt_thread(conf->resync_thread); + + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); +||||||| +static int stop_resync(mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + md_interrupt_thread(conf->resync_thread); + + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int restart_resync(mddev_t *mddev) +{ + conf_t *conf = mddev_to_conf(mddev); +======= +>>>>>>> +static int raid1_stop (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_unregister_thread(conf->thread); + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; + MOD_DEC_USE_COUNT; + return 0; +} + +static mdk_personality_t raid1_personality= +{ + name: "raid1", + make_request: raid1_make_request, + run: raid1_run, + stop: raid1_stop, + status: raid1_status, + error_handler: raid1_error, + diskop: raid1_diskop, +<<<<<<< + stop_resync: raid1_stop_resync, + restart_resync: raid1_restart_resync, +||||||| + stop_resync: stop_resync, + restart_resync: restart_resync, +======= +>>>>>>> + sync_request: raid1_sync_request +}; + +static int md__init raid1_init (void) +{ + return register_md_personality (RAID1, &raid1_personality); +} + +static void raid1_exit (void) +{ + unregister_md_personality (RAID1); +} + +module_init(raid1_init); +module_exit(raid1_exit); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/md-resync/orig b/tests/linux/md-resync/orig new file mode 100644 index 0000000..375e485 --- /dev/null +++ b/tests/linux/md-resync/orig @@ -0,0 +1,1848 @@ +/* + * raid1.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * RAID-1 management functions. + * + * Better read-balancing code written by Mika Kuoppala , 2000 + * + * Fixes to reconstruction by Jakob Østergaard" + * Various fixes by Neil Brown + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/slab.h> +#include <linux/raid/raid1.h> +#include <asm/atomic.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +#define MAX_WORK_PER_DISK 128 + +#define NR_RESERVED_BUFS 32 + + +/* + * The following can be used to debug the driver + */ +#define RAID1_DEBUG 0 + +#if RAID1_DEBUG +#define PRINTK(x...) printk(x) +#define inline +#define __inline__ +#else +#define PRINTK(x...) do { } while (0) +#endif + + +static mdk_personality_t raid1_personality; +static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; +struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; + +static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) +{ + /* return a linked list of "cnt" struct buffer_heads. + * don't take any off the free list unless we know we can + * get all we need, otherwise we could deadlock + */ + struct buffer_head *bh=NULL; + + while(cnt) { + struct buffer_head *t; + md_spin_lock_irq(&conf->device_lock); + if (!conf->freebh_blocked && conf->freebh_cnt >= cnt) + while (cnt) { + t = conf->freebh; + conf->freebh = t->b_next; + t->b_next = bh; + bh = t; + t->b_state = 0; + conf->freebh_cnt--; + cnt--; + } + md_spin_unlock_irq(&conf->device_lock); + if (cnt == 0) + break; + t = kmem_cache_alloc(bh_cachep, SLAB_NOIO); + if (t) { + t->b_next = bh; + bh = t; + cnt--; + } else { + PRINTK("raid1: waiting for %d bh\n", cnt); + conf->freebh_blocked = 1; + wait_disk_event(conf->wait_buffer, + !conf->freebh_blocked || + conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2); + conf->freebh_blocked = 0; + } + } + return bh; +} + +static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) +{ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + while (bh) { + struct buffer_head *t = bh; + bh=bh->b_next; + if (t->b_pprev == NULL) + kmem_cache_free(bh_cachep, t); + else { + t->b_next= conf->freebh; + conf->freebh = t; + conf->freebh_cnt++; + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + wake_up(&conf->wait_buffer); +} + +static int raid1_grow_bh(raid1_conf_t *conf, int cnt) +{ + /* allocate cnt buffer_heads, possibly less if kmalloc fails */ + int i = 0; + + while (i < cnt) { + struct buffer_head *bh; + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); + if (!bh) break; + + md_spin_lock_irq(&conf->device_lock); + bh->b_pprev = &conf->freebh; + bh->b_next = conf->freebh; + conf->freebh = bh; + conf->freebh_cnt++; + md_spin_unlock_irq(&conf->device_lock); + + i++; + } + return i; +} + +static void raid1_shrink_bh(raid1_conf_t *conf) +{ + /* discard all buffer_heads */ + + md_spin_lock_irq(&conf->device_lock); + while (conf->freebh) { + struct buffer_head *bh = conf->freebh; + conf->freebh = bh->b_next; + kmem_cache_free(bh_cachep, bh); + conf->freebh_cnt--; + } + md_spin_unlock_irq(&conf->device_lock); +} + + +static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh = NULL; + + do { + md_spin_lock_irq(&conf->device_lock); + if (!conf->freer1_blocked && conf->freer1) { + r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + conf->freer1_cnt--; + r1_bh->next_r1 = NULL; + r1_bh->state = (1 << R1BH_PreAlloc); + r1_bh->bh_req.b_state = 0; + } + md_spin_unlock_irq(&conf->device_lock); + if (r1_bh) + return r1_bh; + r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
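
raid1_alloc_bh above never takes anything off the shared free list unless the list can satisfy the whole request, falling back to the slab allocator otherwise; two writers that each grabbed half of the reserve could otherwise deadlock waiting for the remainder. A sketch of that all-or-nothing rule (hypothetical names; the sleep-and-retry half of the kernel loop is omitted):

    #include <stdio.h>
    #include <stdlib.h>

    struct node { struct node *next; };

    static struct node *freelist;
    static int free_cnt;

    /* Take 'cnt' nodes, but touch the shared reserve only if it can
     * satisfy the whole request; otherwise allocate fresh ones. */
    static struct node *alloc_chain(int cnt)
    {
            struct node *out = NULL;

            if (free_cnt >= cnt) {                  /* all-or-nothing */
                    while (cnt--) {
                            struct node *t = freelist;
                            freelist = t->next;
                            free_cnt--;
                            t->next = out;
                            out = t;
                    }
                    return out;
            }
            while (cnt--) {                         /* fallback path */
                    struct node *t = malloc(sizeof(*t));
                    if (!t)
                            abort();                /* example only */
                    t->next = out;
                    out = t;
            }
            return out;
    }

    int main(void)
    {
            int i;
            struct node *c;

            for (i = 0; i < 2; i++) {               /* seed a 2-node reserve */
                    struct node *t = malloc(sizeof(*t));
                    t->next = freelist;
                    freelist = t;
                    free_cnt++;
            }
            c = alloc_chain(3);                     /* too big: reserve untouched */
            printf("reserve still holds %d nodes\n", free_cnt); /* 2 */
            while (c) {
                    struct node *t = c;
                    c = c->next;
                    free(t);
            }
            return 0;
    }
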
+ if (r1_bh) { + memset(r1_bh, 0, sizeof(*r1_bh)); + return r1_bh; + } + conf->freer1_blocked = 1; + wait_disk_event(conf->wait_buffer, + !conf->freer1_blocked || + conf->freer1_cnt > NR_RESERVED_BUFS/2 + ); + conf->freer1_blocked = 0; + } while (1); +} + +static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) +{ + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + + r1_bh->mirror_bh_list = NULL; + + if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + r1_bh->next_r1 = conf->freer1; + conf->freer1 = r1_bh; + conf->freer1_cnt++; + spin_unlock_irqrestore(&conf->device_lock, flags); + /* don't need to wakeup wait_buffer because + * raid1_free_bh below will do that + */ + } else { + kfree(r1_bh); + } + raid1_free_bh(conf, bh); +} + +static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + while (i < cnt) { + struct raid1_bh *r1_bh; + r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) + break; + memset(r1_bh, 0, sizeof(*r1_bh)); + set_bit(R1BH_PreAlloc, &r1_bh->state); + r1_bh->mddev = conf->mddev; + + raid1_free_r1bh(r1_bh); + i++; + } + return i; +} + +static void raid1_shrink_r1bh(raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freer1) { + struct raid1_bh *r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + conf->freer1_cnt--; + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + + + +static inline void raid1_free_buf(struct raid1_bh *r1_bh) +{ + unsigned long flags; + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + r1_bh->mirror_bh_list = NULL; + + spin_lock_irqsave(&conf->device_lock, flags); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + spin_unlock_irqrestore(&conf->device_lock, flags); + raid1_free_bh(conf, bh); +} + +static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh; + + md_spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); + r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + r1_bh->next_r1= NULL; + md_spin_unlock_irq(&conf->device_lock); + + return r1_bh; +} + +static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) +{ + int i = 0; + struct raid1_bh *head = NULL, **tail; + tail = &head; + + while (i < cnt) { + struct raid1_bh *r1_bh; + struct page *page; + + page = alloc_page(GFP_KERNEL); + if (!page) + break; + + r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) { + __free_page(page); + break; + } + memset(r1_bh, 0, sizeof(*r1_bh)); + r1_bh->bh_req.b_page = page; + r1_bh->bh_req.b_data = page_address(page); + *tail = r1_bh; + r1_bh->next_r1 = NULL; + tail = & r1_bh->next_r1; + i++; + } + /* this lock probably isn't needed, as at the time when + * we are allocating buffers, nobody else will be touching the + * freebuf list. But it doesn't hurt.... 
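
raid1_free_r1bh above recycles headers that carry the R1BH_PreAlloc bit back into the reserve and kfree()s on-demand ones, so the reserve built by raid1_grow_r1bh never shrinks; the grow path even seeds the reserve by pushing each freshly marked header through the free path. A minimal sketch of that flag-based recycling (hypothetical names):

    #include <stdio.h>
    #include <stdlib.h>

    #define PREALLOC 0x1

    struct hdr {
            unsigned flags;
            struct hdr *next;
    };

    static struct hdr *reserve;
    static int reserve_cnt;

    /* Free path: recycled into the reserve iff it was preallocated. */
    static void put_hdr(struct hdr *h)
    {
            if (h->flags & PREALLOC) {
                    h->next = reserve;
                    reserve = h;
                    reserve_cnt++;
            } else {
                    free(h);
            }
    }

    /* Grow path: mark headers so put_hdr() keeps them forever. */
    static int grow(int cnt)
    {
            int i;
            for (i = 0; i < cnt; i++) {
                    struct hdr *h = calloc(1, sizeof(*h));
                    if (!h)
                            break;
                    h->flags |= PREALLOC;
                    put_hdr(h);     /* seeding goes through the free path */
            }
            return i;
    }

    int main(void)
    {
            struct hdr *extra;

            grow(4);
            printf("reserve=%d\n", reserve_cnt);    /* 4 */

            extra = calloc(1, sizeof(*extra));      /* demand allocation */
            put_hdr(extra);                         /* freed, not kept   */
            printf("reserve=%d\n", reserve_cnt);    /* still 4 */
            return 0;
    }
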
+ */ + md_spin_lock_irq(&conf->device_lock); + *tail = conf->freebuf; + conf->freebuf = head; + md_spin_unlock_irq(&conf->device_lock); + return i; +} + +static void raid1_shrink_buffers (raid1_conf_t *conf) +{ + struct raid1_bh *head; + md_spin_lock_irq(&conf->device_lock); + head = conf->freebuf; + conf->freebuf = NULL; + md_spin_unlock_irq(&conf->device_lock); + + while (head) { + struct raid1_bh *r1_bh = head; + head = r1_bh->next_r1; + __free_page(r1_bh->bh_req.b_page); + kfree(r1_bh); + } +} + +static int raid1_map (mddev_t *mddev, kdev_t *rdev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i, disks = MD_SB_DISKS; + + /* + * Later we do read balancing on the read side + * now we use the first available disk. + */ + + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].operational) { + *rdev = conf->mirrors[i].dev; + return (0); + } + } + + printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n"); + return (-1); +} + +static void raid1_reschedule_retry (struct raid1_bh *r1_bh) +{ + unsigned long flags; + mddev_t *mddev = r1_bh->mddev; + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_spin_lock_irqsave(&retry_list_lock, flags); + if (raid1_retry_list == NULL) + raid1_retry_tail = &raid1_retry_list; + *raid1_retry_tail = r1_bh; + raid1_retry_tail = &r1_bh->next_r1; + r1_bh->next_r1 = NULL; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + md_wakeup_thread(conf->thread); +} + + +static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector < conf->start_active) + conf->cnt_done--; + else if (sector >= conf->start_future && conf->phase == phase) + conf->cnt_future--; + else if (!--conf->cnt_pending) + wake_up(&conf->wait_ready); + + spin_unlock_irqrestore(&conf->segment_lock, flags); +} + +static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector >= conf->start_ready) + --conf->cnt_ready; + else if (sector >= conf->start_active) { + if (!--conf->cnt_active) { + conf->start_active = conf->start_ready; + wake_up(&conf->wait_done); + } + } + spin_unlock_irqrestore(&conf->segment_lock, flags); +} + +/* + * raid1_end_bh_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) +{ + struct buffer_head *bh = r1_bh->master_bh; + + io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), + test_bit(R1BH_SyncPhase, &r1_bh->state)); + + bh->b_end_io(bh, uptodate); + raid1_free_r1bh(r1_bh); +} +void raid1_end_request (struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + /* + * this branch is our 'one mirror IO has finished' event handler: + */ + if (!uptodate) + md_error (r1_bh->mddev, bh->b_dev); + else + /* + * Set R1BH_Uptodate in our master buffer_head, so that + * we will return a good error code for to the higher + * levels even if IO on some other mirrored buffer fails. + * + * The 'master' represents the complex operation to + * user-side. So if something waits for IO, then it will + * wait for the 'master' buffer_head. + */ + set_bit (R1BH_Uptodate, &r1_bh->state); + + /* + * We split up the read and write side, imho they are + * conceptually different. 
+ */ + + if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { + /* + * we have only one buffer_head on the read side + */ + + if (uptodate) { + raid1_end_bh_io(r1_bh, uptodate); + return; + } + /* + * oops, read error: + */ + printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", + partition_name(bh->b_dev), bh->b_blocknr); + raid1_reschedule_retry(r1_bh); + return; + } + + /* + * WRITE: + * + * Let's see if all mirrored write operations have finished + * already. + */ + + if (atomic_dec_and_test(&r1_bh->remaining)) + raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); +} + +/* + * This routine returns the disk from which the requested read should + * be done. It bookkeeps the last read position for every disk + * in array and when new read requests come, the disk which last + * position is nearest to the request, is chosen. + * + * TODO: now if there are 2 mirrors in the same 2 devices, performance + * degrades dramatically because position is mirror, not device based. + * This should be changed to be device based. Also atomic sequential + * reads should be somehow balanced. + */ + +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) +{ + int new_disk = conf->last_used; + const int sectors = bh->b_size >> 9; + const unsigned long this_sector = bh->b_rsector; + int disk = new_disk; + unsigned long new_distance; + unsigned long current_distance; + + /* + * Check if it is sane at all to balance + */ + + if (conf->resync_mirrors) + goto rb_out; + + + /* make sure that disk is operational */ + while( !conf->mirrors[new_disk].operational) { + if (new_disk <= 0) new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) { + /* + * This means no working disk was found + * Nothing much to do, lets not change anything + * and hope for the best... + */ + + new_disk = conf->last_used; + + goto rb_out; + } + } + disk = new_disk; + /* now disk == new_disk == starting point for search */ + + /* + * Don't touch anything for sequential reads. + */ + + if (this_sector == conf->mirrors[new_disk].head_position) + goto rb_out; + + /* + * If reads have been done only on a single disk + * for a time, lets give another disk a change. + * This is for kicking those idling disks so that + * they would find work near some hotspot. 
+ */ + + if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { + conf->sect_count = 0; + +#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92) + /* Work around a compiler bug in egcs-2.92.11 19980921 */ + new_disk = *(volatile int *)&new_disk; +#endif + do { + if (new_disk<=0) + new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) + break; + } while ((conf->mirrors[new_disk].write_only) || + (!conf->mirrors[new_disk].operational)); + + goto rb_out; + } + + current_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + /* Find the disk which is closest */ + + do { + if (disk <= 0) + disk = conf->raid_disks; + disk--; + + if ((conf->mirrors[disk].write_only) || + (!conf->mirrors[disk].operational)) + continue; + + new_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + if (new_distance < current_distance) { + conf->sect_count = 0; + current_distance = new_distance; + new_disk = disk; + } + } while (disk != conf->last_used); + +rb_out: + conf->mirrors[new_disk].head_position = this_sector + sectors; + + conf->last_used = new_disk; + conf->sect_count += sectors; + + return new_disk; +} + +static int raid1_make_request (request_queue_t *q, + struct buffer_head * bh) +{ + mddev_t *mddev = q->queuedata; + raid1_conf_t *conf = mddev_to_conf(mddev); + struct buffer_head *bh_req, *bhl; + struct raid1_bh * r1_bh; + int disks = MD_SB_DISKS; + int i, sum_bhs = 0; + struct mirror_info *mirror; + + if (!buffer_locked(bh)) + BUG(); + +/* + * make_request() can abort the operation when READA is being + * used and no empty request is available. + * + * Currently, just replace the command with READ/WRITE. + */ + r1_bh = raid1_alloc_r1bh (conf); + + spin_lock_irq(&conf->segment_lock); + wait_event_lock_irq(conf->wait_done, + bh->b_rsector < conf->start_active || + bh->b_rsector >= conf->start_future, + conf->segment_lock); + if (bh->b_rsector < conf->start_active) + conf->cnt_done++; + else { + conf->cnt_future++; + if (conf->phase) + set_bit(R1BH_SyncPhase, &r1_bh->state); + } + spin_unlock_irq(&conf->segment_lock); + + /* + * i think the read and write branch should be separated completely, + * since we want to do read balancing on the read side for example. + * Alternative implementations? :) --mingo + */ + + r1_bh->master_bh = bh; + r1_bh->mddev = mddev; + r1_bh->cmd = rw; + + if (rw == READ) { + /* + * read balancing logic: + */ + mirror = conf->mirrors + raid1_read_balance(conf, bh); + + bh_req = &r1_bh->bh_req; + memcpy(bh_req, bh, sizeof(*bh)); + bh_req->b_blocknr = bh->b_rsector; + bh_req->b_dev = mirror->dev; + bh_req->b_rdev = mirror->dev; + /* bh_req->b_rsector = bh->n_rsector; */ + bh_req->b_end_io = raid1_end_request; + bh_req->b_private = r1_bh; + generic_make_request (rw, bh_req); + return 0; + } + + /* + * WRITE: + */ + + bhl = raid1_alloc_bh(conf, conf->raid_disks); + for (i = 0; i < disks; i++) { + struct buffer_head *mbh; + if (!conf->mirrors[i].operational) + continue; + + /* + * We should use a private pool (size depending on NR_REQUEST), + * to avoid writes filling up the memory with bhs + * + * Such pools are much faster than kmalloc anyways (so we waste + * almost nothing by not using the master bh when writing and + * win alot of cleanness) but for now we are cool enough. --mingo + * + * It's safe to sleep here, buffer heads cannot be used in a shared + * manner in the write branch. 
Look how we lock the buffer at the + * beginning of this function to grok the difference ;) + */ + mbh = bhl; + if (mbh == NULL) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_next = NULL; + mbh->b_this_page = (struct buffer_head *)1; + + /* + * prepare mirrored mbh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_rsector; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_rsector; + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | (1<<BH_Mapped) | (1<<BH_Lock); + atomic_set(&mbh->b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = raid1_end_request; + mbh->b_private = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; + sum_bhs++; + } + if (bhl) raid1_free_bh(conf,bhl); + if (!sum_bhs) { + /* Gag - all mirrors non-operational.. */ + raid1_end_bh_io(r1_bh, 0); + return 0; + } + md_atomic_set(&r1_bh->remaining, sum_bhs); + + /* + * We have to be a bit careful about the semaphore above, that's + * why we start the requests separately. Since kmalloc() could + * fail, sleep and make_request() can sleep too, this is the + * safer solution. Imagine, end_request decreasing the semaphore + * before we could have set it up ... We could play tricks with + * the semaphore (presetting it and correcting at the end if + * sum_bhs is not 'n' but we have to do end_request by hand if + * all requests finish until we had a chance to set up the + * semaphore correctly ... lots of races). + */ + bh = r1_bh->mirror_bh_list; + while(bh) { + struct buffer_head *bh2 = bh; + bh = bh->b_next; + generic_make_request(rw, bh2); + } + return (0); +} + +static void raid1_status(struct seq_file *seq, mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i; + + seq_printf(seq, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + seq_printf(seq, "%s", + conf->mirrors[i].operational ? "U" : "_"); + seq_printf(seq, "]"); +} + +#define LAST_DISK KERN_ALERT \ +"raid1: only one disk left and IO error.\n" + +#define NO_SPARE_DISK KERN_ALERT \ +"raid1: no spare disk left, degrading mirror level by one.\n" + +#define DISK_FAILED KERN_ALERT \ +"raid1: Disk failure on %s, disabling device. \n" \ +" Operation continuing on %d devices\n" + +#define START_SYNCING KERN_ALERT \ +"raid1: start syncing spare disk.\n" + +#define ALREADY_SYNCING KERN_INFO \ +"raid1: syncing already in progress.\n" + +static void mark_disk_bad (mddev_t *mddev, int failed) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + if (!mirror->write_only) + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + if (!mirror->write_only) + conf->working_disks--; + printk (DISK_FAILED, partition_name (mirror->dev), + conf->working_disks); +} + +static int raid1_error (mddev_t *mddev, kdev_t dev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; + + /* Find the drive. + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know.
+ * else mark the drive as failed + */ + + for (i = 0; i < disks; i++) + if (mirrors[i].dev==dev && mirrors[i].operational) + break; + if (i == disks) + return 0; + + if (i < conf->raid_disks && conf->working_disks == 1) { + /* Don't fail the drive, act as though we were just a + * normal single drive + */ + + return 1; + } + mark_disk_bad(mddev, i); + return 0; +} + +#undef LAST_DISK +#undef NO_SPARE_DISK +#undef DISK_FAILED +#undef START_SYNCING + + +static void print_raid1_conf (raid1_conf_t *conf) +{ + int i; + struct mirror_info *tmp; + + printk("RAID1 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, + conf->raid_disks, conf->nr_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static void close_sync(raid1_conf_t *conf) +{ + mddev_t *mddev = conf->mddev; + /* If reconstruction was interrupted, we need to close the "active" and "pending" + * holes. + * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 + */ + /* this is really needed when recovery stops too... */ + spin_lock_irq(&conf->segment_lock); + conf->start_active = conf->start_pending; + conf->start_ready = conf->start_pending; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; + conf->start_future = (mddev->sb->size<<1)+1; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + conf->phase = conf->phase ^1; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; + conf->phase = 0; + conf->cnt_future = conf->cnt_done;; + conf->cnt_done = 0; + spin_unlock_irq(&conf->segment_lock); + wake_up(&conf->wait_done); +} + +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + mdk_rdev_t *spare_rdev, *failed_rdev; + + print_raid1_conf(conf); + + switch (state) { + case DISKOP_SPARE_ACTIVE: + case DISKOP_SPARE_INACTIVE: + /* need to wait for pending sync io before locking device */ + close_sync(conf); + } + + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ + switch (state) { + + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... 
(can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + if (conf->start_future > 0) { + MD_BUG(); + err = -EBUSY; + break; + } + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + if (conf->start_future > 0) { + MD_BUG(); + err = -EBUSY; + break; + } + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + spare_rdev = find_rdev_nr(mddev, spare_desc->number); + failed_rdev = find_rdev_nr(mddev, failed_desc->number); + + /* There must be a spare_rdev, but there may not be a + * failed_rdev. That slot might be empty... + */ + spare_rdev->desc_nr = failed_desc->number; + if (failed_rdev) + failed_rdev->desc_nr = spare_desc->number; + + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
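 + * (For example: if the mirror in slot 1 fails and the spare being + * activated sits in slot 3, the xchg_values() pairs above leave the + * newly synced device in slot 1 and the failed record in slot 3, so + * the first conf->raid_disks slots keep holding the 'real' disks.)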
 + */ + + conf->working_disks++; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->mirrors + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + adisk->head_position = 0; + conf->nr_disks++; + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) + /* should move to "END_REBUILD" when such exists */ + raid1_shrink_buffers(conf); + + print_raid1_conf(conf); + return err; +} + + +#define IO_ERROR KERN_ALERT \ +"raid1: %s: unrecoverable I/O read error for block %lu\n" + +#define REDIRECT_SECTOR KERN_ERR \ +"raid1: %s: redirecting sector %lu to another mirror\n" + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. + */ +static void end_sync_write(struct buffer_head *bh, int uptodate); +static void end_sync_read(struct buffer_head *bh, int uptodate); + +static void raid1d (void *data) +{ + struct raid1_bh *r1_bh; + struct buffer_head *bh; + unsigned long flags; + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + kdev_t dev; + + if (mddev->sb_dirty) + md_update_sb(mddev); + + for (;;) { + md_spin_lock_irqsave(&retry_list_lock, flags); + r1_bh = raid1_retry_list; + if (!r1_bh) + break; + raid1_retry_list = r1_bh->next_r1; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + + mddev = r1_bh->mddev; + bh = &r1_bh->bh_req; + switch(r1_bh->cmd) { + case SPECIAL: + /* have to allocate lots of bh structures and + * schedule writes + */ + if (test_bit(R1BH_Uptodate, &r1_bh->state)) { + int i, sum_bhs = 0; + int disks = MD_SB_DISKS; + struct buffer_head *bhl, *mbh; + + conf = mddev_to_conf(mddev); + bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ + for (i = 0; i < disks ; i++) { + if (!conf->mirrors[i].operational) + continue; + if (i==conf->last_used) + /* we read from here, no need to write */ + continue; + if (i < conf->raid_disks + && !conf->resync_mirrors) + /* don't need to write this, + * we are just rebuilding */ + continue; + mbh = bhl; + if (!mbh) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_this_page = (struct buffer_head *)1; + + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_blocknr; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_blocknr; + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) | + (1<<BH_Mapped) | (1<<BH_Lock); + atomic_set(&mbh->b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = end_sync_write; + mbh->b_private = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; + + sum_bhs++; + } + md_atomic_set(&r1_bh->remaining, sum_bhs); + if (bhl) raid1_free_bh(conf, bhl); + mbh = r1_bh->mirror_bh_list; + + if (!sum_bhs) { + /* nowhere to
write this too... I guess we + * must be done + */ + sync_request_done(bh->b_blocknr, conf); + md_done_sync(mddev, bh->b_size>>9, 0); + raid1_free_buf(r1_bh); + } else + while (mbh) { + struct buffer_head *bh1 = mbh; + mbh = mbh->b_next; + generic_make_request(WRITE, bh1); + md_sync_acct(bh1->b_dev, bh1->b_size/512); + } + } else { + /* There is no point trying a read-for-reconstruct + * as reconstruct is about to be aborted + */ + + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + md_done_sync(mddev, bh->b_size>>9, 0); + } + + break; + case READ: + case READA: + dev = bh->b_dev; + raid1_map (mddev, &bh->b_dev); + if (bh->b_dev == dev) { + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + raid1_end_bh_io(r1_bh, 0); + } else { + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + bh->b_rdev = bh->b_dev; + bh->b_rsector = bh->b_blocknr; + generic_make_request (r1_bh->cmd, bh); + } + break; + } + } + md_spin_unlock_irqrestore(&retry_list_lock, flags); +} +#undef IO_ERROR +#undef REDIRECT_SECTOR + +/* + * Private kernel thread to reconstruct mirrors after an unclean + * shutdown. + */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (mddev->recovery_running != 2) + return; + if (!md_do_sync(mddev, NULL)) { + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + } + + close_sync(conf); + +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * This is achieved by conceptually dividing the device space into a + * number of sections: + * DONE: 0 .. a-1 These blocks are in-sync + * ACTIVE: a.. b-1 These blocks may have active sync requests, but + * no normal IO requests + * READY: b .. c-1 These blocks have no normal IO requests - sync + * request may be happening + * PENDING: c .. d-1 These blocks may have IO requests, but no new + * ones will be added + * FUTURE: d .. end These blocks are not to be considered yet. IO may + * be happening, but not sync + * + * We keep a + * phase which flips (0 or 1) each time d moves and + * a count of: + * z = active io requests in FUTURE since d moved - marked with + * current phase + * y = active io requests in FUTURE before d moved, or PENDING - + * marked with previous phase + * x = active sync requests in READY + * w = active sync requests in ACTIVE + * v = active io requests in DONE + * + * Normally, a=b=c=d=0 and z= active io requests + * or a=b=c=d=END and v= active io requests + * Allowed changes to a,b,c,d: + * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase + * B: y==0 -> c=d + * C: b=c, w+=x, x=0 + * D: w==0 -> a=b + * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 + * + * At start of sync we apply A. + * When y reaches 0, we apply B then A then being sync requests + * When sync point reaches c-1, we wait for y==0, and W==0, and + * then apply apply B then A then D then C. 
+ * Finally, we apply E + * + * The sync request simply issues a "read" against a working drive + * This is marked so that on completion the raid1d thread is woken to + * issue suitable write requests + */ + +static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror; + struct raid1_bh *r1_bh; + struct buffer_head *bh; + int bsize; + int disk; + int block_nr; + int buffs; + + if (!sector_nr) { + /* we want enough buffers to hold twice the window of 128*/ + buffs = 128 *2 / (PAGE_SIZE>>9); + buffs = raid1_grow_buffers(conf, buffs); + if (buffs < 2) + goto nomem; + conf->window = buffs*(PAGE_SIZE>>9)/2; + } + spin_lock_irq(&conf->segment_lock); + if (!sector_nr) { + /* initialize ...*/ + conf->start_active = 0; + conf->start_ready = 0; + conf->start_pending = 0; + conf->start_future = 0; + conf->phase = 0; + + conf->cnt_future += conf->cnt_done+conf->cnt_pending; + conf->cnt_done = conf->cnt_pending = 0; + if (conf->cnt_ready || conf->cnt_active) + MD_BUG(); + } + while (sector_nr >= conf->start_pending) { + PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", + sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future, + conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future); + wait_event_lock_irq(conf->wait_done, + !conf->cnt_active, + conf->segment_lock); + wait_event_lock_irq(conf->wait_ready, + !conf->cnt_pending, + conf->segment_lock); + conf->start_active = conf->start_ready; + conf->start_ready = conf->start_pending; + conf->start_pending = conf->start_future; + conf->start_future = conf->start_future+conf->window; + // Note: falling off the end is not a problem + conf->phase = conf->phase ^1; + conf->cnt_active = conf->cnt_ready; + conf->cnt_ready = 0; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + wake_up(&conf->wait_done); + } + conf->cnt_ready++; + spin_unlock_irq(&conf->segment_lock); + + + /* If reconstructing, and >1 working disc, + * could dedicate one to rebuild and others to + * service read requests .. 
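 + * As written, the code below simply starts at conf->last_used and + * steps backwards (wrapping past slot 0) until it finds an + * operational mirror, giving up after one full cycle.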
 + */ + disk = conf->last_used; + /* make sure disk is operational */ + while (!conf->mirrors[disk].operational) { + if (disk <= 0) disk = conf->raid_disks; + disk--; + if (disk == conf->last_used) + break; + } + conf->last_used = disk; + + mirror = conf->mirrors+conf->last_used; + + r1_bh = raid1_alloc_buf (conf); + r1_bh->master_bh = NULL; + r1_bh->mddev = mddev; + r1_bh->cmd = SPECIAL; + bh = &r1_bh->bh_req; + + block_nr = sector_nr; + bsize = 512; + while (!(block_nr & 1) && bsize < PAGE_SIZE + && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) { + block_nr >>= 1; + bsize <<= 1; + } + bh->b_size = bsize; + bh->b_list = BUF_LOCKED; + bh->b_dev = mirror->dev; + bh->b_rdev = mirror->dev; + bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock); + if (!bh->b_page) + BUG(); + if (!bh->b_data) + BUG(); + if (bh->b_data != page_address(bh->b_page)) + BUG(); + bh->b_end_io = end_sync_read; + bh->b_private = r1_bh; + bh->b_blocknr = sector_nr; + bh->b_rsector = sector_nr; + init_waitqueue_head(&bh->b_wait); + + generic_make_request(READ, bh); + md_sync_acct(bh->b_dev, bh->b_size/512); + + return (bsize >> 9); + +nomem: + raid1_shrink_buffers(conf); + return -ENOMEM; +} + +static void end_sync_read(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + /* we have read a block, now it needs to be re-written, + * or re-read if the read failed. + * We don't do much here, just schedule handling by raid1d + */ + if (!uptodate) + md_error (r1_bh->mddev, bh->b_dev); + else + set_bit(R1BH_Uptodate, &r1_bh->state); + raid1_reschedule_retry(r1_bh); +} + +static void end_sync_write(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + if (!uptodate) + md_error (r1_bh->mddev, bh->b_dev); + if (atomic_dec_and_test(&r1_bh->remaining)) { + mddev_t *mddev = r1_bh->mddev; + unsigned long sect = bh->b_blocknr; + int size = bh->b_size; + raid1_free_buf(r1_bh); + sync_request_done(sect, mddev_to_conf(mddev)); + md_done_sync(mddev,size>>9, uptodate); + } +} + +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) +{ + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; + + MOD_INC_USE_COUNT; + + if (sb->level != 1) { + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our
private RAID1 + * bookkeeping area. [whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] + */ + + conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; + } + memset(conf, 0, sizeof(*conf)); + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); + continue; + } + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + continue; + } + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); + continue; + } + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); + continue; + } + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); + continue; + } + printk(OPERATIONAL, partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + conf->working_disks++; + } else { + /* + * Must be a spare disk .. + */ + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + disk->head_position = 0; + } + } + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + + conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_buffer); + init_waitqueue_head(&conf->wait_done); + init_waitqueue_head(&conf->wait_ready); + + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + + /* pre-allocate some buffer_head structures. + * As a minimum, 1 r1bh and raid_disks buffer_heads + * would probably get us by in tight memory situations, + * but a few more is probably a good idea. 
+ * For now, try NR_RESERVED_BUFS r1bh and + * NR_RESERVED_BUFS*raid_disks bufferheads + * This will allow at least NR_RESERVED_BUFS concurrent + * reads or writes even if kmalloc starts failing + */ + if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS || + raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks) + < NR_RESERVED_BUFS*conf->raid_disks) { + printk(MEM_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++) + /* nothing */; + conf->last_used = j; + + + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) && + (conf->working_disks > 1)) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + printk(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + mddev->recovery_running = 2; + md_wakeup_thread(conf->resync_thread); + } + + /* + * Regenerate the "device is in sync with the raid set" bit for + * each device. 
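 + * (Each descriptor is first marked nonsync, then marked sync again + * only when its number matches one of the operational mirrors in + * the loop below.)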
+ */ + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); + for (j = 0; j < sb->raid_disks; j++) { + if (!conf->mirrors[j].operational) + continue; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); + } + } + sb->active_disks = conf->working_disks; + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + md_interrupt_thread(conf->resync_thread); + + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + mddev->recovery_running = 2; + md_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; +} + +static int raid1_stop (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; + MOD_DEC_USE_COUNT; + return 0; +} + +static mdk_personality_t raid1_personality= +{ + name: "raid1", + make_request: raid1_make_request, + run: raid1_run, + stop: raid1_stop, + status: raid1_status, + error_handler: raid1_error, + diskop: raid1_diskop, + stop_resync: raid1_stop_resync, + restart_resync: raid1_restart_resync, + sync_request: raid1_sync_request +}; + +static int md__init raid1_init (void) +{ + return register_md_personality (RAID1, &raid1_personality); +} + +static void raid1_exit (void) +{ + unregister_md_personality (RAID1); +} + +module_init(raid1_init); +module_exit(raid1_exit); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/md-resync/patch b/tests/linux/md-resync/patch new file mode 100644 index 0000000..1ed2ab1 --- /dev/null +++ b/tests/linux/md-resync/patch @@ -0,0 +1,312 @@ +*************** +*** 333,339 **** + * device if no resync is going on, or below the resync window. + * We take the first readable disk when above the resync window. + */ +- if (conf->resync_mirrors && (this_sector + sectors >= conf->next_resync)) { + /* make sure that disk is operational */ + new_disk = 0; + while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) { +--- 333,339 ---- + * device if no resync is going on, or below the resync window. + * We take the first readable disk when above the resync window. 
+ */ ++ if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) { + /* make sure that disk is operational */ + new_disk = 0; + while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) { +*************** +*** 652,657 **** + if (conf->barrier) BUG(); + if (waitqueue_active(&conf->wait_idle)) BUG(); + if (waitqueue_active(&conf->wait_resume)) BUG(); + } + + static int diskop(mddev_t *mddev, mdp_disk_t **d, int state) +--- 652,660 ---- + if (conf->barrier) BUG(); + if (waitqueue_active(&conf->wait_idle)) BUG(); + if (waitqueue_active(&conf->wait_resume)) BUG(); ++ ++ mempool_destroy(conf->r1buf_pool); ++ conf->r1buf_pool = NULL; + } + + static int diskop(mddev_t *mddev, mdp_disk_t **d, int state) +*************** +*** 768,774 **** + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: +- close_sync(conf); + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; +--- 771,776 ---- + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; +*************** +*** 781,787 **** + * property) + */ + case DISKOP_SPARE_ACTIVE: +- close_sync(conf); + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; + +--- 783,788 ---- + * property) + */ + case DISKOP_SPARE_ACTIVE: + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; + +*************** +*** 915,924 **** + } + abort: + spin_unlock_irq(&conf->device_lock); +- if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) { +- mempool_destroy(conf->r1buf_pool); +- conf->r1buf_pool = NULL; +- } + + print_conf(conf); + return err; +--- 916,921 ---- + } + abort: + spin_unlock_irq(&conf->device_lock); + + print_conf(conf); + return err; +*************** +*** 1008,1014 **** + * we read from here, no need to write + */ + continue; +- if (i < conf->raid_disks && !conf->resync_mirrors) + /* + * don't need to write this we are just rebuilding + */ +--- 1005,1011 ---- + * we read from here, no need to write + */ + continue; ++ if (i < conf->raid_disks && mddev->in_sync) + /* + * don't need to write this we are just rebuilding + */ +*************** +*** 1113,1141 **** + spin_unlock_irqrestore(&retry_list_lock, flags); + } + +- /* +- * Private kernel thread to reconstruct mirrors after an unclean +- * shutdown. +- */ +- static void raid1syncd(void *data) +- { +- conf_t *conf = data; +- mddev_t *mddev = conf->mddev; +- +- if (!conf->resync_mirrors) +- return; +- if (mddev->recovery_running != 2) +- return; +- if (!md_do_sync(mddev, NULL)) { +- /* +- * Only if everything went Ok. +- */ +- conf->resync_mirrors = 0; +- } +- +- close_sync(conf); +- +- } + + static int init_resync(conf_t *conf) + { +--- 1110,1115 ---- + spin_unlock_irqrestore(&retry_list_lock, flags); + } + + + static int init_resync(conf_t *conf) + { +*************** +*** 1170,1178 **** + sector_t max_sector, nr_sectors; + int disk, partial; + +- if (!sector_nr) + if (init_resync(conf)) + return -ENOMEM; + /* + * If there is non-resync activity waiting for us then + * put in a delay to throttle resync. +--- 1144,1159 ---- + sector_t max_sector, nr_sectors; + int disk, partial; + ++ if (sector_nr == 0) + if (init_resync(conf)) + return -ENOMEM; ++ ++ max_sector = mddev->sb->size << 1; ++ if (sector_nr >= max_sector) { ++ close_sync(conf); ++ return 0; ++ } ++ + /* + * If there is non-resync activity waiting for us then + * put in a delay to throttle resync. 
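The hunks around this point retire the separate raid1syncd thread: the bounds check moves to the top of sync_request(), which now calls close_sync() and returns 0 once sector_nr reaches the end of the device. A rough user-space model of that call pattern follows; all names and numbers are hypothetical, and this is a sketch, not code from this patch:

    #include <stdio.h>

    #define MAX_SECTOR 1024UL   /* hypothetical device size in sectors */
    #define WINDOW 128          /* hypothetical resync window */

    /* models the reworked sync_request(): returning 0 means "all
     * done" and is where close_sync() would run; otherwise it
     * returns the number of sectors submitted for this window */
    static int sync_request(unsigned long sector_nr)
    {
        if (sector_nr >= MAX_SECTOR)
            return 0;
        return WINDOW;
    }

    int main(void)
    {
        unsigned long sector = 0;
        int n;

        /* md_do_sync()-style caller: keep calling until 0 comes back */
        while ((n = sync_request(sector)) > 0)
            sector += n;
        printf("resync finished at sector %lu\n", sector);
        return 0;
    }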
+*************** +*** 1209,1218 **** + r1_bio->sector = sector_nr; + r1_bio->cmd = SPECIAL; + +- max_sector = mddev->sb->size << 1; +- if (sector_nr >= max_sector) +- BUG(); +- + bio = r1_bio->master_bio; + nr_sectors = RESYNC_BLOCK_SIZE >> 9; + if (max_sector - sector_nr < nr_sectors) +--- 1190,1195 ---- + r1_bio->sector = sector_nr; + r1_bio->cmd = SPECIAL; + + bio = r1_bio->master_bio; + nr_sectors = RESYNC_BLOCK_SIZE >> 9; + if (max_sector - sector_nr < nr_sectors) +*************** +*** 1295,1301 **** + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct list_head *tmp; +- int start_recovery = 0; + + MOD_INC_USE_COUNT; + +--- 1272,1277 ---- + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct list_head *tmp; + + MOD_INC_USE_COUNT; + +*************** +*** 1716,1736 **** + } + } + +- if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) && +- (conf->working_disks > 1)) { +- const char * name = "raid1syncd"; +- +- conf->resync_thread = md_register_thread(raid1syncd, conf, name); +- if (!conf->resync_thread) { +- printk(THREAD_ERROR, mdidx(mddev)); +- goto out_free_conf; +- } +- +- printk(START_RESYNC, mdidx(mddev)); +- conf->resync_mirrors = 1; +- mddev->recovery_running = 2; +- md_wakeup_thread(conf->resync_thread); +- } + + /* + * Regenerate the "device is in sync with the raid set" bit for +--- 1688,1693 ---- + } + } + + + /* + * Regenerate the "device is in sync with the raid set" bit for +*************** +*** 1770,1815 **** + return -EIO; + } + +- static int stop_resync(mddev_t *mddev) +- { +- conf_t *conf = mddev_to_conf(mddev); +- +- if (conf->resync_thread) { +- if (conf->resync_mirrors) { +- md_interrupt_thread(conf->resync_thread); +- +- printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); +- return 1; +- } +- return 0; +- } +- return 0; +- } +- +- static int restart_resync(mddev_t *mddev) +- { +- conf_t *conf = mddev_to_conf(mddev); +- +- if (conf->resync_mirrors) { +- if (!conf->resync_thread) { +- MD_BUG(); +- return 0; +- } +- mddev->recovery_running = 2; +- md_wakeup_thread(conf->resync_thread); +- return 1; +- } +- return 0; +- } +- + static int stop(mddev_t *mddev) + { + conf_t *conf = mddev_to_conf(mddev); + int i; + + md_unregister_thread(conf->thread); +- if (conf->resync_thread) +- md_unregister_thread(conf->resync_thread); + if (conf->r1bio_pool) + mempool_destroy(conf->r1bio_pool); + for (i = 0; i < MD_SB_DISKS; i++) +--- 1723,1734 ---- + return -EIO; + } + + static int stop(mddev_t *mddev) + { + conf_t *conf = mddev_to_conf(mddev); + int i; + + md_unregister_thread(conf->thread); + if (conf->r1bio_pool) + mempool_destroy(conf->r1bio_pool); + for (i = 0; i < MD_SB_DISKS; i++) +*************** +*** 1830,1837 **** + status: status, + error_handler: error, + diskop: diskop, +- stop_resync: stop_resync, +- restart_resync: restart_resync, + sync_request: sync_request + }; + +--- 1749,1754 ---- + status: status, + error_handler: error, + diskop: diskop, + sync_request: sync_request + }; + diff --git a/tests/linux/md/diff b/tests/linux/md/diff new file mode 100644 index 0000000..77e3f76 --- /dev/null +++ b/tests/linux/md/diff @@ -0,0 +1,3680 @@ +@@ -1,3674 +1,101 @@ +-/* +- md.c : Multiple Devices driver for Linux +- Copyright (C) 1998, 1999, 2000 Ingo Molnar +- +- completely rewritten, based on the MD driver code from Marc Zyngier +- +- Changes: +- +- - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar +- - boot support for linear and striped mode by Harald Hoyer +- - kerneld support by Boris Tobotras +- - kmod 
support by: Cyrus Durgin +- - RAID0 bugfixes: Mark Anthony Lisher +- - Devfs support by Richard Gooch +- +- - lots of fixes and improvements to the RAID1/RAID5 and generic +- RAID code (such as request based resynchronization): +- +- Neil Brown . +- +- This program is free software; you can redistribute it and/or modify +- it under the terms of the GNU General Public License as published by +- the Free Software Foundation; either version 2, or (at your option) +- any later version. +- +- You should have received a copy of the GNU General Public License +- (for example /usr/src/linux/COPYING); if not, write to the Free +- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +-*/ +- +-#include <linux/module.h> +-#include <linux/config.h> +-#include <linux/linkage.h> +-#include <linux/raid/md.h> +-#include <linux/sysctl.h> +-#include <linux/bio.h> +-#include <linux/devfs_fs_kernel.h> +-#include <linux/buffer_head.h> /* for invalidate_bdev */ +-#include <linux/suspend.h> +- +-#include <linux/init.h> +- +-#ifdef CONFIG_KMOD +-#include <linux/kmod.h> +-#endif +- +-#define __KERNEL_SYSCALLS__ +-#include <linux/unistd.h> +- +-#include <asm/unaligned.h> +- +-#define MAJOR_NR MD_MAJOR +-#define MD_DRIVER +-#define DEVICE_NR(device) (minor(device)) +- +-#include <linux/blk.h> +- +-#define DEBUG 0 +-#define dprintk(x...) ((void)(DEBUG && printk(x))) +- +- +-#ifndef MODULE +-static void autostart_arrays (void); +-#endif +- +-static mdk_personality_t *pers[MAX_PERSONALITY]; +-static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED; +- +-/* +- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' +- * is 1000 KB/sec, so the extra system load does not show up that much. +- * Increase it if you want to have more _guaranteed_ speed. Note that +- * the RAID driver will use the maximum available bandwith if the IO +- * subsystem is idle. There is also an 'absolute maximum' reconstruction +- * speed limit - in case reconstruction slows down your system despite +- * idle IO detection. +- * +- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. +- */ +- +-static int sysctl_speed_limit_min = 1000; +-static int sysctl_speed_limit_max = 200000; +- +-static struct ctl_table_header *raid_table_header; +- +-static ctl_table raid_table[] = { +- { +- .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, +- .procname = "speed_limit_min", +- .data = &sysctl_speed_limit_min, +- .maxlen = sizeof(int), +- .mode = 0644, +- .proc_handler = &proc_dointvec, +- }, +- { +- .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, +- .procname = "speed_limit_max", +- .data = &sysctl_speed_limit_max, +- .maxlen = sizeof(int), +- .mode = 0644, +- .proc_handler = &proc_dointvec, +- }, +- { .ctl_name = 0 } +-}; +- +-static ctl_table raid_dir_table[] = { +- { +- .ctl_name = DEV_RAID, +- .procname = "raid", +- .maxlen = 0, +- .mode = 0555, +- .child = raid_table, +- }, +- { .ctl_name = 0 } +-}; +- +-static ctl_table raid_root_table[] = { +- { +- .ctl_name = CTL_DEV, +- .procname = "dev", +- .maxlen = 0, +- .mode = 0555, +- .child = raid_dir_table, +- }, +- { .ctl_name = 0 } +-}; +- +-static struct block_device_operations md_fops; +- +-static struct gendisk *disks[MAX_MD_DEVS]; +- +-/* +- * Enables to iterate over all existing md arrays +- * all_mddevs_lock protects this list as well as mddev_map. +- */ +-static LIST_HEAD(all_mddevs); +-static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; +- +- +-/* +- * iterates through all used mddevs in the system. +- * We take care to grab the all_mddevs_lock whenever navigating +- * the list, and to always hold a refcount when unlocked. +- * Any code which breaks out of this loop while own +- * a reference to the current mddev and must mddev_put it.
+- */ +-#define ITERATE_MDDEV(mddev,tmp) \ +- \ +- for (({ spin_lock(&all_mddevs_lock); \ +- tmp = all_mddevs.next; \ +- mddev = NULL;}); \ +- ({ if (tmp != &all_mddevs) \ +- mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ +- spin_unlock(&all_mddevs_lock); \ +- if (mddev) mddev_put(mddev); \ +- mddev = list_entry(tmp, mddev_t, all_mddevs); \ +- tmp != &all_mddevs;}); \ +- ({ spin_lock(&all_mddevs_lock); \ +- tmp = tmp->next;}) \ +- ) +- +-static mddev_t *mddev_map[MAX_MD_DEVS]; +- +-static int md_fail_request (request_queue_t *q, struct bio *bio) +-{ +- bio_io_error(bio, bio->bi_size); +- return 0; +-} +- +-static inline mddev_t *mddev_get(mddev_t *mddev) +-{ +- atomic_inc(&mddev->active); +- return mddev; +-} +- +-static void mddev_put(mddev_t *mddev) +-{ +- if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) +- return; +- if (!mddev->raid_disks && list_empty(&mddev->disks)) { +- list_del(&mddev->all_mddevs); +- mddev_map[mdidx(mddev)] = NULL; +- kfree(mddev); +- MOD_DEC_USE_COUNT; +- } +- spin_unlock(&all_mddevs_lock); +-} +- +-static mddev_t * mddev_find(int unit) +-{ +- mddev_t *mddev, *new = NULL; +- +- retry: +- spin_lock(&all_mddevs_lock); +- if (mddev_map[unit]) { +- mddev = mddev_get(mddev_map[unit]); +- spin_unlock(&all_mddevs_lock); +- if (new) +- kfree(new); +- return mddev; +- } +- if (new) { +- mddev_map[unit] = new; +- list_add(&new->all_mddevs, &all_mddevs); +- spin_unlock(&all_mddevs_lock); +- MOD_INC_USE_COUNT; +- return new; +- } +- spin_unlock(&all_mddevs_lock); +- +- new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); +- if (!new) +- return NULL; +- +- memset(new, 0, sizeof(*new)); +- +- new->__minor = unit; +- init_MUTEX(&new->reconfig_sem); +- INIT_LIST_HEAD(&new->disks); +- INIT_LIST_HEAD(&new->all_mddevs); +- init_timer(&new->safemode_timer); +- atomic_set(&new->active, 1); +- blk_queue_make_request(&new->queue, md_fail_request); +- +- goto retry; +-} +- +-static inline int mddev_lock(mddev_t * mddev) +-{ +- return down_interruptible(&mddev->reconfig_sem); +-} +- +-static inline void mddev_lock_uninterruptible(mddev_t * mddev) +-{ +- down(&mddev->reconfig_sem); +-} +- +-static inline int mddev_trylock(mddev_t * mddev) +-{ +- return down_trylock(&mddev->reconfig_sem); +-} +- +-static inline void mddev_unlock(mddev_t * mddev) +-{ +- up(&mddev->reconfig_sem); +-} +- +-mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +-{ +- mdk_rdev_t * rdev; +- struct list_head *tmp; +- +- ITERATE_RDEV(mddev,rdev,tmp) { +- if (rdev->desc_nr == nr) +- return rdev; +- } +- return NULL; +-} +- +-static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +-{ +- struct list_head *tmp; +- mdk_rdev_t *rdev; +- +- ITERATE_RDEV(mddev,rdev,tmp) { +- if (rdev->bdev->bd_dev == dev) +- return rdev; +- } +- return NULL; +-} +- +-inline static sector_t calc_dev_sboffset(struct block_device *bdev) +-{ +- sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; +- return MD_NEW_SIZE_BLOCKS(size); +-} +- +-static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) +-{ +- sector_t size; +- +- size = rdev->sb_offset; +- +- if (chunk_size) +- size &= ~((sector_t)chunk_size/1024 - 1); +- return size; +-} +- +-static int alloc_disk_sb(mdk_rdev_t * rdev) +-{ +- if (rdev->sb_page) +- MD_BUG(); +- +- rdev->sb_page = alloc_page(GFP_KERNEL); +- if (!rdev->sb_page) { +- printk(KERN_ALERT "md: out of memory.\n"); +- return -EINVAL; +- } +- +- return 0; +-} +- +-static void free_disk_sb(mdk_rdev_t * rdev) +-{ +- if (rdev->sb_page) { +- page_cache_release(rdev->sb_page); +- rdev->sb_loaded = 
0; +- rdev->sb_page = NULL; +- rdev->sb_offset = 0; +- rdev->size = 0; +- } +-} +- +- +-static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) +-{ +- if (bio->bi_size) +- return 1; +- +- complete((struct completion*)bio->bi_private); +- return 0; +-} +- +-static int sync_page_io(struct block_device *bdev, sector_t sector, int size, +- struct page *page, int rw) +-{ +- struct bio bio; +- struct bio_vec vec; +- struct completion event; +- +- bio_init(&bio); +- bio.bi_io_vec = &vec; +- vec.bv_page = page; +- vec.bv_len = size; +- vec.bv_offset = 0; +- bio.bi_vcnt = 1; +- bio.bi_idx = 0; +- bio.bi_size = size; +- bio.bi_bdev = bdev; +- bio.bi_sector = sector; +- init_completion(&event); +- bio.bi_private = &event; +- bio.bi_end_io = bi_complete; +- submit_bio(rw, &bio); +- blk_run_queues(); +- wait_for_completion(&event); +- +- return test_bit(BIO_UPTODATE, &bio.bi_flags); +-} +- +-static int read_disk_sb(mdk_rdev_t * rdev) +-{ +- +- if (!rdev->sb_page) { +- MD_BUG(); +- return -EINVAL; +- } +- if (rdev->sb_loaded) +- return 0; +- +- +- if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) +- goto fail; +- rdev->sb_loaded = 1; +- return 0; +- +-fail: +- printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", +- bdev_partition_name(rdev->bdev)); +- return -EINVAL; +-} +- +-static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +-{ +- if ( (sb1->set_uuid0 == sb2->set_uuid0) && +- (sb1->set_uuid1 == sb2->set_uuid1) && +- (sb1->set_uuid2 == sb2->set_uuid2) && +- (sb1->set_uuid3 == sb2->set_uuid3)) +- +- return 1; +- +- return 0; +-} +- +- +-static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +-{ +- int ret; +- mdp_super_t *tmp1, *tmp2; +- +- tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); +- tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); +- +- if (!tmp1 || !tmp2) { +- ret = 0; +- printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); +- goto abort; +- } +- +- *tmp1 = *sb1; +- *tmp2 = *sb2; +- +- /* +- * nr_disks is not constant +- */ +- tmp1->nr_disks = 0; +- tmp2->nr_disks = 0; +- +- if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) +- ret = 0; +- else +- ret = 1; +- +-abort: +- if (tmp1) +- kfree(tmp1); +- if (tmp2) +- kfree(tmp2); +- +- return ret; +-} +- +-static unsigned int calc_sb_csum(mdp_super_t * sb) +-{ +- unsigned int disk_csum, csum; +- +- disk_csum = sb->sb_csum; +- sb->sb_csum = 0; +- csum = csum_partial((void *)sb, MD_SB_BYTES, 0); +- sb->sb_csum = disk_csum; +- return csum; +-} +- +-/* +- * Handle superblock details. +- * We want to be able to handle multiple superblock formats +- * so we have a common interface to them all, and an array of +- * different handlers. +- * We rely on user-space to write the initial superblock, and support +- * reading and updating of superblocks. +- * Interface methods are: +- * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) +- * loads and validates a superblock on dev. +- * if refdev != NULL, compare superblocks on both devices +- * Return: +- * 0 - dev has a superblock that is compatible with refdev +- * 1 - dev has a superblock that is compatible and newer than refdev +- * so dev should be used as the refdev in future +- * -EINVAL superblock incompatible or invalid +- * -othererror e.g. -EIO +- * +- * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) +- * Verify that dev is acceptable into mddev. +- * The first time, mddev->raid_disks will be 0, and data from +- * dev should be merged in. Subsequent calls check that dev +- * is new enough. 
Return 0 or -EINVAL +- * +- * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) +- * Update the superblock for rdev with data in mddev +- * This does not write to disc. +- * +- */ +- +-struct super_type { +- char *name; +- struct module *owner; +- int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); +- int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); +- void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); +-}; +- +-/* +- * load_super for 0.90.0 +- */ +-static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +-{ +- mdp_super_t *sb; +- int ret; +- sector_t sb_offset; +- +- /* +- * Calculate the position of the superblock, +- * it's at the end of the disk. +- * +- * It also happens to be a multiple of 4Kb. +- */ +- sb_offset = calc_dev_sboffset(rdev->bdev); +- rdev->sb_offset = sb_offset; +- +- ret = read_disk_sb(rdev); +- if (ret) return ret; +- +- ret = -EINVAL; +- +- sb = (mdp_super_t*)page_address(rdev->sb_page); +- +- if (sb->md_magic != MD_SB_MAGIC) { +- printk(KERN_ERR "md: invalid raid superblock magic on %s\n", +- bdev_partition_name(rdev->bdev)); +- goto abort; +- } +- +- if (sb->major_version != 0 || +- sb->minor_version != 90) { +- printk(KERN_WARNING "Bad version number %d.%d on %s\n", +- sb->major_version, sb->minor_version, +- bdev_partition_name(rdev->bdev)); +- goto abort; +- } +- +- if (sb->md_minor >= MAX_MD_DEVS) { +- printk(KERN_ERR "md: %s: invalid raid minor (%x)\n", +- bdev_partition_name(rdev->bdev), sb->md_minor); +- goto abort; +- } +- if (sb->raid_disks <= 0) +- goto abort; +- +- if (calc_sb_csum(sb) != sb->sb_csum) { +- printk(KERN_WARNING "md: invalid superblock checksum on %s\n", +- bdev_partition_name(rdev->bdev)); +- goto abort; +- } +- +- rdev->preferred_minor = sb->md_minor; +- rdev->data_offset = 0; +- +- if (sb->level == MULTIPATH) +- rdev->desc_nr = -1; +- else +- rdev->desc_nr = sb->this_disk.number; +- +- if (refdev == 0) +- ret = 1; +- else { +- __u64 ev1, ev2; +- mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); +- if (!uuid_equal(refsb, sb)) { +- printk(KERN_WARNING "md: %s has different UUID to %s\n", +- bdev_partition_name(rdev->bdev), +- bdev_partition_name(refdev->bdev)); +- goto abort; +- } +- if (!sb_equal(refsb, sb)) { +- printk(KERN_WARNING "md: %s has same UUID" +- " but different superblock to %s\n", +- bdev_partition_name(rdev->bdev), +- bdev_partition_name(refdev->bdev)); +- goto abort; +- } +- ev1 = md_event(sb); +- ev2 = md_event(refsb); +- if (ev1 > ev2) +- ret = 1; +- else +- ret = 0; +- } +- rdev->size = calc_dev_size(rdev, sb->chunk_size); +- +- abort: +- return ret; +-} +- +-/* +- * validate_super for 0.90.0 +- */ +-static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +-{ +- mdp_disk_t *desc; +- mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); +- +- if (mddev->raid_disks == 0) { +- mddev->major_version = 0; +- mddev->minor_version = sb->minor_version; +- mddev->patch_version = sb->patch_version; +- mddev->persistent = ! 
sb->not_persistent; +- mddev->chunk_size = sb->chunk_size; +- mddev->ctime = sb->ctime; +- mddev->utime = sb->utime; +- mddev->level = sb->level; +- mddev->layout = sb->layout; +- mddev->raid_disks = sb->raid_disks; +- mddev->size = sb->size; +- mddev->events = md_event(sb); +- +- if (sb->state & (1<<MD_SB_CLEAN)) +- mddev->recovery_cp = MaxSector; +- else { +- if (sb->events_hi == sb->cp_events_hi && +- sb->events_lo == sb->cp_events_lo) { +- mddev->recovery_cp = sb->recovery_cp; +- } else +- mddev->recovery_cp = 0; +- } +- +- memcpy(mddev->uuid+0, &sb->set_uuid0, 4); +- memcpy(mddev->uuid+4, &sb->set_uuid1, 4); +- memcpy(mddev->uuid+8, &sb->set_uuid2, 4); +- memcpy(mddev->uuid+12,&sb->set_uuid3, 4); +- +- mddev->max_disks = MD_SB_DISKS; +- } else { +- __u64 ev1; +- ev1 = md_event(sb); +- ++ev1; +- if (ev1 < mddev->events) +- return -EINVAL; +- } +- if (mddev->level != LEVEL_MULTIPATH) { +- rdev->raid_disk = -1; +- rdev->in_sync = rdev->faulty = 0; +- desc = sb->disks + rdev->desc_nr; +- +- if (desc->state & (1<<MD_DISK_FAULTY)) +- rdev->faulty = 1; +- else if (desc->state & (1<<MD_DISK_SYNC) && +- desc->raid_disk < mddev->raid_disks) { +- rdev->in_sync = 1; +- rdev->raid_disk = desc->raid_disk; +- } +- } +- return 0; +-} +- +-/* +- * sync_super for 0.90.0 +- */ +-static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +-{ +- mdp_super_t *sb; +- struct list_head *tmp; +- mdk_rdev_t *rdev2; +- int next_spare = mddev->raid_disks; +- +- /* make rdev->sb match mddev data.. +- * +- * 1/ zero out disks +- * 2/ Add info for each disk, keeping track of highest desc_nr +- * 3/ any empty disks < highest become removed +- * +- * disks[0] gets initialised to REMOVED because +- * we cannot be sure from other fields if it has +- * been initialised or not. +- */ +- int highest = 0; +- int i; +- int active=0, working=0,failed=0,spare=0,nr_disks=0; +- +- sb = (mdp_super_t*)page_address(rdev->sb_page); +- +- memset(sb, 0, sizeof(*sb)); +- +- sb->md_magic = MD_SB_MAGIC; +- sb->major_version = mddev->major_version; +- sb->minor_version = mddev->minor_version; +- sb->patch_version = mddev->patch_version; +- sb->gvalid_words = 0; /* ignored */ +- memcpy(&sb->set_uuid0, mddev->uuid+0, 4); +- memcpy(&sb->set_uuid1, mddev->uuid+4, 4); +- memcpy(&sb->set_uuid2, mddev->uuid+8, 4); +- memcpy(&sb->set_uuid3, mddev->uuid+12,4); +- +- sb->ctime = mddev->ctime; +- sb->level = mddev->level; +- sb->size = mddev->size; +- sb->raid_disks = mddev->raid_disks; +- sb->md_minor = mddev->__minor; +- sb->not_persistent = !mddev->persistent; +- sb->utime = mddev->utime; +- sb->state = 0; +- sb->events_hi = (mddev->events>>32); +- sb->events_lo = (u32)mddev->events; +- +- if (mddev->in_sync) +- { +- sb->recovery_cp = mddev->recovery_cp; +- sb->cp_events_hi = (mddev->events>>32); +- sb->cp_events_lo = (u32)mddev->events; +- if (mddev->recovery_cp == MaxSector) +- sb->state = (1<< MD_SB_CLEAN); +- } else +- sb->recovery_cp = 0; +- +- sb->layout = mddev->layout; +- sb->chunk_size = mddev->chunk_size; +- +- sb->disks[0].state = (1<<MD_DISK_REMOVED); +- ITERATE_RDEV(mddev,rdev2,tmp) { +- mdp_disk_t *d; +- if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) +- rdev2->desc_nr = rdev2->raid_disk; +- else +- rdev2->desc_nr = next_spare++; +- d = &sb->disks[rdev2->desc_nr]; +- nr_disks++; +- d->number = rdev2->desc_nr; +- d->major = MAJOR(rdev2->bdev->bd_dev); +- d->minor = MINOR(rdev2->bdev->bd_dev); +- if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty) +- d->raid_disk = rdev2->raid_disk; +- else +- d->raid_disk = rdev2->desc_nr; /* compatibility */ +- if (rdev2->faulty) { +- d->state = (1<<MD_DISK_FAULTY); +- failed++; +- } else if (rdev2->in_sync) { +- d->state = (1<<MD_DISK_ACTIVE); +- d->state |= (1<<MD_DISK_SYNC); +- active++; +- working++; +- } else { +- d->state = 0; +- spare++; +- working++; +- } +- if
(rdev2->desc_nr > highest) +- highest = rdev2->desc_nr; +- } +- +- /* now set the "removed" bit on any non-trailing holes */ +- for (i=0; i<highest; i++) { +- mdp_disk_t *d = &sb->disks[i]; +- if (d->state == 0 && d->number == 0) { +- d->number = i; +- d->raid_disk = i; +- d->state = (1<<MD_DISK_REMOVED); +- } +- } +- sb->nr_disks = nr_disks; +- sb->active_disks = active; +- sb->working_disks = working; +- sb->failed_disks = failed; +- sb->spare_disks = spare; +- +- sb->this_disk = sb->disks[rdev->desc_nr]; +- sb->sb_csum = calc_sb_csum(sb); +-} +- +-/* +- * version 1 superblock +- */ +- +-static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +-{ +- unsigned int disk_csum, csum; +- int size = 256 + sb->max_dev*2; +- +- disk_csum = sb->sb_csum; +- sb->sb_csum = 0; +- csum = csum_partial((void *)sb, size, 0); +- sb->sb_csum = disk_csum; +- return csum; +-} +- +-static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +-{ +- struct mdp_superblock_1 *sb; +- int ret; +- sector_t sb_offset; +- +- /* +- * Calculate the position of the superblock. +- * It is always aligned to a 4K boundary and +- * depeding on minor_version, it can be: +- * 0: At least 8K, but less than 12K, from end of device +- * 1: At start of device +- * 2: 4K from start of device. +- */ +- switch(minor_version) { +- case 0: +- sb_offset = rdev->bdev->bd_inode->i_size >> 9; +- sb_offset -= 8*2; +- sb_offset &= ~(4*2); +- /* convert from sectors to K */ +- sb_offset /= 2; +- break; +- case 1: +- sb_offset = 0; +- break; +- case 2: +- sb_offset = 4; +- break; +- default: +- return -EINVAL; +- } +- rdev->sb_offset = sb_offset; +- +- ret = read_disk_sb(rdev); +- if (ret) return ret; +- +- +- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); +- +- if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || +- sb->major_version != cpu_to_le32(1) || +- le32_to_cpu(sb->max_dev) > (4096-256)/2 || +- le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || +- sb->feature_map != 0) +- return -EINVAL; +- +- if (calc_sb_1_csum(sb) != sb->sb_csum) { +- printk("md: invalid superblock checksum on %s\n", +- bdev_partition_name(rdev->bdev)); +- return -EINVAL; +- } +- rdev->preferred_minor = 0xffff; +- rdev->data_offset = le64_to_cpu(sb->data_offset); +- +- if (refdev == 0) +- return 1; +- else { +- __u64 ev1, ev2; +- struct mdp_superblock_1 *refsb = +- (struct mdp_superblock_1*)page_address(refdev->sb_page); +- +- if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || +- sb->level != refsb->level || +- sb->layout != refsb->layout || +- sb->chunksize != refsb->chunksize) { +- printk(KERN_WARNING "md: %s has strangely different" +- " superblock to %s\n", +- bdev_partition_name(rdev->bdev), +- bdev_partition_name(refdev->bdev)); +- return -EINVAL; +- } +- ev1 = le64_to_cpu(sb->events); +- ev2 = le64_to_cpu(refsb->events); +- +- if (ev1 > ev2) +- return 1; +- } +- if (minor_version) +- rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; +- else +- rdev->size = rdev->sb_offset; +- if (rdev->size < le64_to_cpu(sb->data_size)/2) +- return -EINVAL; +- rdev->size = le64_to_cpu(sb->data_size)/2; +- if (le32_to_cpu(sb->chunksize)) +- rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); +- return 0; +-} +- +-static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) +-{ +- struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); +- +- if (mddev->raid_disks == 0) { +- mddev->major_version = 1; +- mddev->minor_version = 0; +- mddev->patch_version = 0; +- mddev->persistent = 1; +- mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
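 +- /* v1 superblock fields are little-endian on disk; the loads here +- * go through le32_to_cpu()/le64_to_cpu() so the in-core mddev +- * fields are native-endian. */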
 +- mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); +- mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); +- mddev->level = le32_to_cpu(sb->level); +- mddev->layout = le32_to_cpu(sb->layout); +- mddev->raid_disks = le32_to_cpu(sb->raid_disks); +- mddev->size = (u32)le64_to_cpu(sb->size); +- mddev->events = le64_to_cpu(sb->events); +- +- mddev->recovery_cp = le64_to_cpu(sb->resync_offset); +- memcpy(mddev->uuid, sb->set_uuid, 16); +- +- mddev->max_disks = (4096-256)/2; +- } else { +- __u64 ev1; +- ev1 = le64_to_cpu(sb->events); +- ++ev1; +- if (ev1 < mddev->events) +- return -EINVAL; +- } +- +- if (mddev->level != LEVEL_MULTIPATH) { +- int role; +- rdev->desc_nr = le32_to_cpu(sb->dev_number); +- role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); +- switch(role) { +- case 0xffff: /* spare */ +- rdev->in_sync = 0; +- rdev->faulty = 0; +- rdev->raid_disk = -1; +- break; +- case 0xfffe: /* faulty */ +- rdev->in_sync = 0; +- rdev->faulty = 1; +- rdev->raid_disk = -1; +- break; +- default: +- rdev->in_sync = 1; +- rdev->faulty = 0; +- rdev->raid_disk = role; +- break; +- } +- } +- return 0; +-} +- +-static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) +-{ +- struct mdp_superblock_1 *sb; +- struct list_head *tmp; +- mdk_rdev_t *rdev2; +- int max_dev, i; +- /* make rdev->sb match mddev and rdev data. */ +- +- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); +- +- sb->feature_map = 0; +- sb->pad0 = 0; +- memset(sb->pad1, 0, sizeof(sb->pad1)); +- memset(sb->pad2, 0, sizeof(sb->pad2)); +- memset(sb->pad3, 0, sizeof(sb->pad3)); +- +- sb->utime = cpu_to_le64((__u64)mddev->utime); +- sb->events = cpu_to_le64(mddev->events); +- if (mddev->in_sync) +- sb->resync_offset = cpu_to_le64(mddev->recovery_cp); +- else +- sb->resync_offset = cpu_to_le64(0); +- +- max_dev = 0; +- ITERATE_RDEV(mddev,rdev2,tmp) +- if (rdev2->desc_nr > max_dev) +- max_dev = rdev2->desc_nr; +- +- sb->max_dev = max_dev; +- for (i=0; i<max_dev; i++) +- sb->dev_roles[max_dev] = cpu_to_le16(0xfffe); +- +- ITERATE_RDEV(mddev,rdev2,tmp) { +- i = rdev2->desc_nr; +- if (rdev2->faulty) +- sb->dev_roles[i] = cpu_to_le16(0xfffe); +- else if (rdev2->in_sync) +- sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); +- else +- sb->dev_roles[i] = cpu_to_le16(0xffff); +- } +- +- sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ +-} +- +- +-struct super_type super_types[] = { +- [0] = { +- .name = "0.90.0", +- .owner = THIS_MODULE, +- .load_super = super_90_load, +- .validate_super = super_90_validate, +- .sync_super = super_90_sync, +- }, +- [1] = { +- .name = "md-1", +- .owner = THIS_MODULE, +- .load_super = super_1_load, +- .validate_super = super_1_validate, +- .sync_super = super_1_sync, +- }, +-}; +- +-static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) +-{ +- struct list_head *tmp; +- mdk_rdev_t *rdev; +- +- ITERATE_RDEV(mddev,rdev,tmp) +- if (rdev->bdev->bd_contains == dev->bdev->bd_contains) +- return rdev; +- +- return NULL; +-} +- +-static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +-{ +- struct list_head *tmp; +- mdk_rdev_t *rdev; +- +- ITERATE_RDEV(mddev1,rdev,tmp) +- if (match_dev_unit(mddev2, rdev)) +- return 1; +- +- return 0; +-} +- +-static LIST_HEAD(pending_raid_disks); +- +-static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +-{ +- mdk_rdev_t *same_pdev; +- +- if (rdev->mddev) { +- MD_BUG(); +- return -EINVAL; +- } +- same_pdev = match_dev_unit(mddev, rdev); +- if (same_pdev) +- printk(KERN_WARNING +- "md%d: WARNING: %s appears to be on the same physical" +- " disk as %s. 
True\n protection against single-disk" +- " failure might be compromised.\n", +- mdidx(mddev), bdev_partition_name(rdev->bdev), +- bdev_partition_name(same_pdev->bdev)); +- +- /* Verify rdev->desc_nr is unique. +- * If it is -1, assign a free number, else +- * check number is not in use +- */ +- if (rdev->desc_nr < 0) { +- int choice = 0; +- if (mddev->pers) choice = mddev->raid_disks; +- while (find_rdev_nr(mddev, choice)) +- choice++; +- rdev->desc_nr = choice; +- } else { +- if (find_rdev_nr(mddev, rdev->desc_nr)) +- return -EBUSY; +- } +- +- list_add(&rdev->same_set, &mddev->disks); +- rdev->mddev = mddev; +- printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev)); +- return 0; +-} +- +-static void unbind_rdev_from_array(mdk_rdev_t * rdev) +-{ +- if (!rdev->mddev) { +- MD_BUG(); +- return; +- } +- list_del_init(&rdev->same_set); +- printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev)); +- rdev->mddev = NULL; +-} +- +-/* +- * prevent the device from being mounted, repartitioned or +- * otherwise reused by a RAID array (or any other kernel +- * subsystem), by opening the device. [simply getting an +- * inode is not enough, the SCSI module usage code needs +- * an explicit open() on the device] +- */ +-static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +-{ +- int err = 0; +- struct block_device *bdev; +- +- bdev = bdget(dev); +- if (!bdev) +- return -ENOMEM; +- err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); +- if (err) +- return err; +- err = bd_claim(bdev, rdev); +- if (err) { +- blkdev_put(bdev, BDEV_RAW); +- return err; +- } +- rdev->bdev = bdev; +- return err; +-} +- +-static void unlock_rdev(mdk_rdev_t *rdev) +-{ +- struct block_device *bdev = rdev->bdev; +- rdev->bdev = NULL; +- if (!bdev) +- MD_BUG(); +- bd_release(bdev); +- blkdev_put(bdev, BDEV_RAW); +-} +- +-void md_autodetect_dev(dev_t dev); +- +-static void export_rdev(mdk_rdev_t * rdev) +-{ +- printk(KERN_INFO "md: export_rdev(%s)\n", +- bdev_partition_name(rdev->bdev)); +- if (rdev->mddev) +- MD_BUG(); +- free_disk_sb(rdev); +- list_del_init(&rdev->same_set); +-#ifndef MODULE +- md_autodetect_dev(rdev->bdev->bd_dev); +-#endif +- unlock_rdev(rdev); +- kfree(rdev); +-} +- +-static void kick_rdev_from_array(mdk_rdev_t * rdev) +-{ +- unbind_rdev_from_array(rdev); +- export_rdev(rdev); +-} +- +-static void export_array(mddev_t *mddev) +-{ +- struct list_head *tmp; +- mdk_rdev_t *rdev; +- +- ITERATE_RDEV(mddev,rdev,tmp) { +- if (!rdev->mddev) { +- MD_BUG(); +- continue; +- } +- kick_rdev_from_array(rdev); +- } +- if (!list_empty(&mddev->disks)) +- MD_BUG(); +- mddev->raid_disks = 0; +- mddev->major_version = 0; +-} +- +-static void print_desc(mdp_disk_t *desc) +-{ +- printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, +- partition_name(MKDEV(desc->major,desc->minor)), +- desc->major,desc->minor,desc->raid_disk,desc->state); +-} +- +-static void print_sb(mdp_super_t *sb) +-{ +- int i; +- +- printk(KERN_INFO +- "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", +- sb->major_version, sb->minor_version, sb->patch_version, +- sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, +- sb->ctime); +- printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", +- sb->level, sb->size, sb->nr_disks, sb->raid_disks, +- sb->md_minor, sb->layout, sb->chunk_size); +- printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" +- " FD:%d SD:%d CSUM:%08x E:%08lx\n", +- sb->utime, sb->state, sb->active_disks, sb->working_disks, +- sb->failed_disks, sb->spare_disks, +- sb->sb_csum, (unsigned long)sb->events_lo); +- +- 
printk(KERN_INFO); +- for (i = 0; i < MD_SB_DISKS; i++) { +- mdp_disk_t *desc; +- +- desc = sb->disks + i; +- if (desc->number || desc->major || desc->minor || +- desc->raid_disk || (desc->state && (desc->state != 4))) { +- printk(" D %2d: ", i); +- print_desc(desc); +- } +- } +- printk(KERN_INFO "md: THIS: "); +- print_desc(&sb->this_disk); +- +-} +- +-static void print_rdev(mdk_rdev_t *rdev) +-{ +- printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ", +- bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size, +- rdev->faulty, rdev->in_sync, rdev->desc_nr); +- if (rdev->sb_loaded) { +- printk(KERN_INFO "md: rdev superblock:\n"); +- print_sb((mdp_super_t*)page_address(rdev->sb_page)); +- } else +- printk(KERN_INFO "md: no rdev superblock!\n"); +-} +- +-void md_print_devices(void) +-{ +- struct list_head *tmp, *tmp2; +- mdk_rdev_t *rdev; +- mddev_t *mddev; +- +- printk("\n"); +- printk("md: **********************************\n"); +- printk("md: * *\n"); +- printk("md: **********************************\n"); +- ITERATE_MDDEV(mddev,tmp) { +- printk("md%d: ", mdidx(mddev)); +- +- ITERATE_RDEV(mddev,rdev,tmp2) +- printk("<%s>", bdev_partition_name(rdev->bdev)); +- +- ITERATE_RDEV(mddev,rdev,tmp2) +- print_rdev(rdev); +- } +- printk("md: **********************************\n"); +- printk("\n"); +-} +- +- +-static int write_disk_sb(mdk_rdev_t * rdev) +-{ +- +- if (!rdev->sb_loaded) { +- MD_BUG(); +- return 1; +- } +- if (rdev->faulty) { +- MD_BUG(); +- return 1; +- } +- +- dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", +- bdev_partition_name(rdev->bdev), +- (unsigned long long)rdev->sb_offset); +- +- if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) +- return 0; +- +- printk("md: write_disk_sb failed for device %s\n", +- bdev_partition_name(rdev->bdev)); +- return 1; +-} +- +-static void sync_sbs(mddev_t * mddev) +-{ +- mdk_rdev_t *rdev; +- struct list_head *tmp; +- +- ITERATE_RDEV(mddev,rdev,tmp) { +- super_types[mddev->major_version]. +- sync_super(mddev, rdev); +- rdev->sb_loaded = 1; +- } +-} +- +-static void md_update_sb(mddev_t * mddev) +-{ +- int err, count = 100; +- struct list_head *tmp; +- mdk_rdev_t *rdev; +- +- mddev->sb_dirty = 0; +-repeat: +- mddev->utime = get_seconds(); +- mddev->events ++; +- +- if (!mddev->events) { +- /* +- * oops, this 64-bit counter should never wrap. +- * Either we are in around ~1 trillion A.C., assuming +- * 1 reboot per second, or we have a bug: +- */ +- MD_BUG(); +- mddev->events --; +- } +- sync_sbs(mddev); +- +- /* +- * do not write anything to disk if using +- * nonpersistent superblocks +- */ +- if (!mddev->persistent) +- return; +- +- dprintk(KERN_INFO +- "md: updating md%d RAID superblock on device (in sync %d)\n", +- mdidx(mddev),mddev->in_sync); +- +- err = 0; +- ITERATE_RDEV(mddev,rdev,tmp) { +- dprintk(KERN_INFO "md: "); +- if (rdev->faulty) +- dprintk("(skipping faulty "); +- +- dprintk("%s ", bdev_partition_name(rdev->bdev)); +- if (!rdev->faulty) { +- err += write_disk_sb(rdev); +- } else +- dprintk(")\n"); +- if (!err && mddev->level == LEVEL_MULTIPATH) +- /* only need to write one superblock... */ +- break; +- } +- if (err) { +- if (--count) { +- printk(KERN_ERR "md: errors occurred during superblock" +- " update, repeating\n"); +- goto repeat; +- } +- printk(KERN_ERR \ +- "md: excessive errors occurred during superblock update, exiting\n"); +- } +-} +- +-/* +- * Import a device. 
If 'super_format' >= 0, then sanity check the superblock +- * +- * mark the device faulty if: +- * +- * - the device is nonexistent (zero size) +- * - the device has no valid superblock +- * +- * a faulty rdev _never_ has rdev->sb set. +- */ +-static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +-{ +- int err; +- mdk_rdev_t *rdev; +- sector_t size; +- +- rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); +- if (!rdev) { +- printk(KERN_ERR "md: could not alloc mem for %s!\n", +- partition_name(newdev)); +- return ERR_PTR(-ENOMEM); +- } +- memset(rdev, 0, sizeof(*rdev)); +- +- if ((err = alloc_disk_sb(rdev))) +- goto abort_free; +- +- err = lock_rdev(rdev, newdev); +- if (err) { +- printk(KERN_ERR "md: could not lock %s.\n", +- partition_name(newdev)); +- goto abort_free; +- } +- rdev->desc_nr = -1; +- rdev->faulty = 0; +- rdev->in_sync = 0; +- rdev->data_offset = 0; +- atomic_set(&rdev->nr_pending, 0); +- +- size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; +- if (!size) { +- printk(KERN_WARNING +- "md: %s has zero or unknown size, marking faulty!\n", +- bdev_partition_name(rdev->bdev)); +- err = -EINVAL; +- goto abort_free; +- } +- +- if (super_format >= 0) { +- err = super_types[super_format]. +- load_super(rdev, NULL, super_minor); +- if (err == -EINVAL) { +- printk(KERN_WARNING +- "md: %s has invalid sb, not importing!\n", +- bdev_partition_name(rdev->bdev)); +- goto abort_free; +- } +- if (err < 0) { +- printk(KERN_WARNING +- "md: could not read %s's sb, not importing!\n", +- bdev_partition_name(rdev->bdev)); +- goto abort_free; +- } +- } +- INIT_LIST_HEAD(&rdev->same_set); +- +- return rdev; +- +-abort_free: +- if (rdev->sb_page) { +- if (rdev->bdev) +- unlock_rdev(rdev); +- free_disk_sb(rdev); +- } +- kfree(rdev); +- return ERR_PTR(err); +-} +- +-/* +- * Check a full RAID array for plausibility +- */ +- +- +-static int analyze_sbs(mddev_t * mddev) +-{ +- int i; +- struct list_head *tmp; +- mdk_rdev_t *rdev, *freshest; +- +- freshest = NULL; +- ITERATE_RDEV(mddev,rdev,tmp) +- switch (super_types[mddev->major_version]. +- load_super(rdev, freshest, mddev->minor_version)) { +- case 1: +- freshest = rdev; +- break; +- case 0: +- break; +- default: +- printk( KERN_ERR \ +- "md: fatal superblock inconsistency in %s" +- " -- removing from array\n", +- bdev_partition_name(rdev->bdev)); +- kick_rdev_from_array(rdev); +- } +- +- +- super_types[mddev->major_version]. +- validate_super(mddev, freshest); +- +- i = 0; +- ITERATE_RDEV(mddev,rdev,tmp) { +- if (rdev != freshest) +- if (super_types[mddev->major_version]. 
+- validate_super(mddev, rdev)) { +- printk(KERN_WARNING "md: kicking non-fresh %s" +- " from array!\n", +- bdev_partition_name(rdev->bdev)); +- kick_rdev_from_array(rdev); +- continue; +- } +- if (mddev->level == LEVEL_MULTIPATH) { +- rdev->desc_nr = i++; +- rdev->raid_disk = rdev->desc_nr; +- rdev->in_sync = 1; +- } +- } +- +- +- /* +- * Check if we can support this RAID array +- */ +- if (mddev->major_version != MD_MAJOR_VERSION || +- mddev->minor_version > MD_MINOR_VERSION) { +- printk(KERN_ALERT +- "md: md%d: unsupported raid array version %d.%d.%d\n", +- mdidx(mddev), mddev->major_version, +- mddev->minor_version, mddev->patch_version); +- goto abort; +- } +- +- if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) || +- (mddev->level == 4) || (mddev->level == 5))) +- printk(KERN_ERR "md: md%d: raid array is not clean" +- " -- starting background reconstruction\n", +- mdidx(mddev)); +- +- return 0; +-abort: ++*** 1453,90 **** 1 + return 1; + } + ++#undef OLD_LEVEL ++ + static int device_size_calculation(mddev_t * mddev) + { + int data_disks = 0; + unsigned int readahead; + struct list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size < mddev->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size:" + " %lluk < %dk\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->size, + mddev->chunk_size / 1024); + return -EINVAL; + } + } + + switch (mddev->level) { + case LEVEL_MULTIPATH: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case LEVEL_LINEAR: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = mddev->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = mddev->raid_disks-1; + break; + default: + printk(KERN_ERR "md: md%d: unsupported raid level %d\n", + mdidx(mddev), mddev->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = mddev->size * data_disks; + + readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) { + readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (mddev->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; + abort: + return 1; + } + + static struct gendisk *md_probe(dev_t dev, int *part, void *data) + { + static DECLARE_MUTEX(disks_sem); +- int unit = MINOR(dev); +- mddev_t *mddev = mddev_find(unit); +- struct gendisk *disk; +- +- if (!mddev) +- return NULL; +- +- down(&disks_sem); +- if (disks[unit]) { +- up(&disks_sem); +- mddev_put(mddev); +- return NULL; +- } +- disk = alloc_disk(1); +- if (!disk) { +- up(&disks_sem); +- mddev_put(mddev); +- return NULL; +- } +- disk->major = MD_MAJOR; +- disk->first_minor = mdidx(mddev); +- sprintf(disk->disk_name, "md%d", mdidx(mddev)); +- 
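/* Illustrative sketch (hypothetical helper, not in md.c or this patch):
 * device_size_calculation() above reduces each personality to a count of
 * data-bearing disks, so usable capacity is simply the common per-device
 * size times that count. */
static long usable_size_kb(int level, int raid_disks, long dev_size_kb)
{
	int data_disks;

	switch (level) {
	case 0:			/* RAID0: every member carries data */
		data_disks = raid_disks;
		break;
	case 1:			/* RAID1: mirrors hold one copy's worth */
		data_disks = 1;
		break;
	case 4:
	case 5:			/* one member's worth goes to parity */
		data_disks = raid_disks - 1;
		break;
	default:
		return -1;	/* levels not covered by this sketch */
	}
	return dev_size_kb * data_disks;
}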
disk->fops = &md_fops; +- disk->private_data = mddev; +- disk->queue = &mddev->queue; +- add_disk(disk); +- disks[mdidx(mddev)] = disk; +- up(&disks_sem); +- return NULL; +-} +- +-void md_wakeup_thread(mdk_thread_t *thread); +- +-static void md_safemode_timeout(unsigned long data) +-{ +- mddev_t *mddev = (mddev_t *) data; +- +- mddev->safemode = 1; +- md_wakeup_thread(mddev->thread); +-} +- +- +-static int do_md_run(mddev_t * mddev) +-{ +- int pnum, err; +- int chunk_size; +- struct list_head *tmp; +- mdk_rdev_t *rdev; +- struct gendisk *disk; +- +- if (list_empty(&mddev->disks)) { +- MD_BUG(); +- return -EINVAL; +- } +- +- if (mddev->pers) +- return -EBUSY; +- +- /* +- * Analyze all RAID superblock(s) +- */ +- if (!mddev->raid_disks && analyze_sbs(mddev)) { +- MD_BUG(); +- return -EINVAL; +- } +- +- chunk_size = mddev->chunk_size; +- pnum = level_to_pers(mddev->level); +- +- if ((pnum != MULTIPATH) && (pnum != RAID1)) { +- if (!chunk_size) { +- /* +- * 'default chunksize' in the old md code used to +- * be PAGE_SIZE, baaad. +- * we abort here to be on the safe side. We don't +- * want to continue the bad practice. +- */ +- printk(KERN_ERR +- "no chunksize specified, see 'man raidtab'\n"); +- return -EINVAL; +- } +- if (chunk_size > MAX_CHUNK_SIZE) { +- printk(KERN_ERR "too big chunk_size: %d > %d\n", +- chunk_size, MAX_CHUNK_SIZE); +- return -EINVAL; +- } +- /* +- * chunk-size has to be a power of 2 and multiples of PAGE_SIZE +- */ +- if ( (1 << ffz(~chunk_size)) != chunk_size) { +- MD_BUG(); +- return -EINVAL; +- } +- if (chunk_size < PAGE_SIZE) { +- printk(KERN_ERR "too small chunk_size: %d < %ld\n", +- chunk_size, PAGE_SIZE); +- return -EINVAL; +- } +- +- /* devices must have minimum size of one chunk */ +- ITERATE_RDEV(mddev,rdev,tmp) { +- if (rdev->faulty) +- continue; +- if (rdev->size < chunk_size / 1024) { +- printk(KERN_WARNING +- "md: Dev %s smaller than chunk_size:" +- " %lluk < %dk\n", +- bdev_partition_name(rdev->bdev), +- (unsigned long long)rdev->size, +- chunk_size / 1024); +- return -EINVAL; +- } +- } +- } +- if (pnum >= MAX_PERSONALITY) { +- MD_BUG(); +- return -EINVAL; +- } +- +-#ifdef CONFIG_KMOD +- if (!pers[pnum]) +- { +- char module_name[80]; +- sprintf (module_name, "md-personality-%d", pnum); +- request_module (module_name); ++*** 1664,9 **** 2 ++ } + } +-#endif + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md +- * device. 
+- * Also find largest hardsector size +- */ +- ITERATE_RDEV(mddev,rdev,tmp) { +- if (rdev->faulty) +- continue; +- sync_blockdev(rdev->bdev); +- invalidate_bdev(rdev->bdev, 0); +- } +- +- md_probe(mdidx(mddev), NULL, NULL); +- disk = disks[mdidx(mddev)]; +- if (!disk) +- return -ENOMEM; +- +- spin_lock(&pers_lock); +- if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { +- spin_unlock(&pers_lock); +- printk(KERN_ERR "md: personality %d is not loaded!\n", +- pnum); +- return -EINVAL; +- } +- +- mddev->pers = pers[pnum]; +- spin_unlock(&pers_lock); +- +- blk_queue_make_request(&mddev->queue, mddev->pers->make_request); +- printk("%s: setting max_sectors to %d, segment boundary to %d\n", +- disk->disk_name, +- chunk_size >> 9, +- (chunk_size>>1)-1); +- blk_queue_max_sectors(&mddev->queue, chunk_size >> 9); +- blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1); +- mddev->queue.queuedata = mddev; +- +- err = mddev->pers->run(mddev); +- if (err) { +- printk(KERN_ERR "md: pers->run() failed ...\n"); +- module_put(mddev->pers->owner); +- mddev->pers = NULL; +- return -EINVAL; +- } +- atomic_set(&mddev->writes_pending,0); +- mddev->safemode = 0; +- mddev->safemode_timer.function = md_safemode_timeout; +- mddev->safemode_timer.data = (unsigned long) mddev; +- mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ +- mddev->in_sync = 1; +- +- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +- md_wakeup_thread(mddev->thread); +- set_capacity(disk, mddev->array_size<<1); +- return 0; +-} +- +-static int restart_array(mddev_t *mddev) +-{ +- struct gendisk *disk = disks[mdidx(mddev)]; +- int err; +- +- /* +- * Complain if it has no devices +- */ +- err = -ENXIO; +- if (list_empty(&mddev->disks)) +- goto out; +- +- if (mddev->pers) { +- err = -EBUSY; +- if (!mddev->ro) +- goto out; +- +- mddev->safemode = 0; +- mddev->ro = 0; +- set_disk_ro(disk, 0); +- +- printk(KERN_INFO "md: md%d switched to read-write mode.\n", +- mdidx(mddev)); +- /* +- * Kick recovery or resync if necessary +- */ +- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +- md_wakeup_thread(mddev->thread); +- err = 0; +- } else { +- printk(KERN_ERR "md: md%d has no personality assigned.\n", +- mdidx(mddev)); +- err = -EINVAL; +- } +- +-out: +- return err; +-} +- +-static int do_md_stop(mddev_t * mddev, int ro) +-{ +- int err = 0; +- struct gendisk *disk = disks[mdidx(mddev)]; +- +- if (atomic_read(&mddev->active)>2) { +- printk("md: md%d still in use.\n",mdidx(mddev)); +- err = -EBUSY; +- goto out; +- } +- +- if (mddev->pers) { +- if (mddev->sync_thread) { +- set_bit(MD_RECOVERY_INTR, &mddev->recovery); +- md_unregister_thread(mddev->sync_thread); +- mddev->sync_thread = NULL; +- } +- +- del_timer_sync(&mddev->safemode_timer); +- +- invalidate_device(mk_kdev(disk->major, disk->first_minor), 1); +- +- if (ro) { +- err = -ENXIO; +- if (mddev->ro) +- goto out; +- mddev->ro = 1; +- } else { +- if (mddev->ro) +- set_disk_ro(disk, 0); +- if (mddev->pers->stop(mddev)) { +- err = -EBUSY; +- if (mddev->ro) +- set_disk_ro(disk, 1); +- goto out; +- } +- module_put(mddev->pers->owner); +- mddev->pers = NULL; +- if (mddev->ro) +- mddev->ro = 0; +- } +- if (mddev->raid_disks) { +- /* mark array as shutdown cleanly */ +- mddev->in_sync = 1; +- md_update_sb(mddev); +- } +- if (ro) +- set_disk_ro(disk, 1); +- } +- /* +- * Free resources if final stop +- */ +- if (!ro) { +- struct gendisk *disk; +- printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); +- +- export_array(mddev); +- +- mddev->array_size = 0; +- disk = 
disks[mdidx(mddev)]; +- if (disk) +- set_capacity(disk, 0); +- } else +- printk(KERN_INFO "md: md%d switched to read-only mode.\n", +- mdidx(mddev)); +- err = 0; +-out: +- return err; +-} +- +-static void autorun_array(mddev_t *mddev) +-{ +- mdk_rdev_t *rdev; +- struct list_head *tmp; +- int err; +- +- if (list_empty(&mddev->disks)) { +- MD_BUG(); +- return; +- } +- +- printk(KERN_INFO "md: running: "); +- +- ITERATE_RDEV(mddev,rdev,tmp) { +- printk("<%s>", bdev_partition_name(rdev->bdev)); +- } +- printk("\n"); +- +- err = do_md_run (mddev); +- if (err) { +- printk(KERN_WARNING "md :do_md_run() returned %d\n", err); +- do_md_stop (mddev, 0); +- } +-} +- +-/* +- * lets try to run arrays based on all disks that have arrived +- * until now. (those are in pending_raid_disks) +- * +- * the method: pick the first pending disk, collect all disks with +- * the same UUID, remove all from the pending list and put them into +- * the 'same_array' list. Then order this list based on superblock +- * update time (freshest comes first), kick out 'old' disks and +- * compare superblocks. If everything's fine then run it. +- * +- * If "unit" is allocated, then bump its reference count +- */ +-static void autorun_devices(void) +-{ +- struct list_head candidates; +- struct list_head *tmp; +- mdk_rdev_t *rdev0, *rdev; +- mddev_t *mddev; +- +- printk(KERN_INFO "md: autorun ...\n"); +- while (!list_empty(&pending_raid_disks)) { +- rdev0 = list_entry(pending_raid_disks.next, +- mdk_rdev_t, same_set); +- +- printk(KERN_INFO "md: considering %s ...\n", +- bdev_partition_name(rdev0->bdev)); +- INIT_LIST_HEAD(&candidates); +- ITERATE_RDEV_PENDING(rdev,tmp) +- if (super_90_load(rdev, rdev0, 0) >= 0) { +- printk(KERN_INFO "md: adding %s ...\n", +- bdev_partition_name(rdev->bdev)); +- list_move(&rdev->same_set, &candidates); +- } +- /* +- * now we have a set of devices, with all of them having +- * mostly sane superblocks. It's time to allocate the +- * mddev. +- */ +- +- mddev = mddev_find(rdev0->preferred_minor); +- if (!mddev) { +- printk(KERN_ERR +- "md: cannot allocate memory for md drive.\n"); +- break; +- } +- if (mddev_lock(mddev)) +- printk(KERN_WARNING "md: md%d locked, cannot run\n", +- mdidx(mddev)); +- else if (mddev->raid_disks || mddev->major_version +- || !list_empty(&mddev->disks)) { +- printk(KERN_WARNING +- "md: md%d already running, cannot run %s\n", +- mdidx(mddev), bdev_partition_name(rdev0->bdev)); +- mddev_unlock(mddev); +- } else { +- printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); +- ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { +- list_del_init(&rdev->same_set); +- if (bind_rdev_to_array(rdev, mddev)) +- export_rdev(rdev); +- } +- autorun_array(mddev); +- mddev_unlock(mddev); +- } +- /* on success, candidates will be empty, on error +- * it won't... +- */ +- ITERATE_RDEV_GENERIC(candidates,rdev,tmp) +- export_rdev(rdev); +- mddev_put(mddev); +- } +- printk(KERN_INFO "md: ... autorun DONE.\n"); +-} +- +-/* +- * import RAID devices based on one partition +- * if possible, the array gets run as well. 
+- */
+-
+-static int autostart_array(dev_t startdev)
+-{
+- int err = -EINVAL, i;
+- mdp_super_t *sb = NULL;
+- mdk_rdev_t *start_rdev = NULL, *rdev;
+-
+- start_rdev = md_import_device(startdev, 0, 0);
+- if (IS_ERR(start_rdev)) {
+- printk(KERN_WARNING "md: could not import %s!\n",
+- partition_name(startdev));
+- return err;
+- }
+-
+- /* NOTE: this can only work for 0.90.0 superblocks */
+- sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+- if (sb->major_version != 0 ||
+- sb->minor_version != 90 ) {
+- printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+- export_rdev(start_rdev);
+- return err;
+- }
+-
+- if (start_rdev->faulty) {
+- printk(KERN_WARNING
+- "md: can not autostart based on faulty %s!\n",
+- bdev_partition_name(start_rdev->bdev));
+- export_rdev(start_rdev);
+- return err;
+- }
+- list_add(&start_rdev->same_set, &pending_raid_disks);
+-
+- for (i = 0; i < MD_SB_DISKS; i++) {
+- mdp_disk_t *desc;
+- dev_t dev;
+-
+- desc = sb->disks + i;
+- dev = MKDEV(desc->major, desc->minor);
+-
+- if (!dev)
+- continue;
+- if (dev == startdev)
+- continue;
+- rdev = md_import_device(dev, 0, 0);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING "md: could not import %s,"
+- " trying to run array nevertheless.\n",
+- partition_name(dev));
+- continue;
+- }
+- list_add(&rdev->same_set, &pending_raid_disks);
+- }
+-
+- /*
+- * possibly return codes
+- */
+- autorun_devices();
+- return 0;
+-
+-}
+-
+-
+-static int get_version(void * arg)
+-{
+- mdu_version_t ver;
+-
+- ver.major = MD_MAJOR_VERSION;
+- ver.minor = MD_MINOR_VERSION;
+- ver.patchlevel = MD_PATCHLEVEL_VERSION;
+-
+- if (copy_to_user(arg, &ver, sizeof(ver)))
+- return -EFAULT;
+-
+- return 0;
+-}
+-
+-static int get_array_info(mddev_t * mddev, void * arg)
+-{
+- mdu_array_info_t info;
+- int nr,working,active,failed,spare;
+- mdk_rdev_t *rdev;
+- struct list_head *tmp;
+-
+- nr=working=active=failed=spare=0;
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- nr++;
+- if (rdev->faulty)
+- failed++;
+- else {
+- working++;
+- if (rdev->in_sync)
+- active++;
+- else
+- spare++;
+- }
+- }
+-
+- info.major_version = mddev->major_version;
+- info.minor_version = mddev->minor_version;
+- info.patch_version = 1;
+- info.ctime = mddev->ctime;
+- info.level = mddev->level;
+- info.size = mddev->size;
+- info.nr_disks = nr;
+- info.raid_disks = mddev->raid_disks;
+- info.md_minor = mddev->__minor;
+- info.not_persistent= !mddev->persistent;
+-
+- info.utime = mddev->utime;
+- info.state = 0;
+- if (mddev->in_sync)
+- info.state = (1<<MD_SB_CLEAN);
+- info.active_disks = active;
+- info.working_disks = working;
+- info.failed_disks = failed;
+- info.spare_disks = spare;
+-
+- info.layout = mddev->layout;
+- info.chunk_size = mddev->chunk_size;
+-
+- if (copy_to_user(arg, &info, sizeof(info)))
+- return -EFAULT;
+-
+- return 0;
+-}
+-
+-static int get_disk_info(mddev_t * mddev, void * arg)
+-{
+- mdu_disk_info_t info;
+- unsigned int nr;
+- mdk_rdev_t *rdev;
+-
+- if (copy_from_user(&info, arg, sizeof(info)))
+- return -EFAULT;
+-
+- nr = info.number;
+-
+- rdev = find_rdev_nr(mddev, nr);
+- if (rdev) {
+- info.major = MAJOR(rdev->bdev->bd_dev);
+- info.minor = MINOR(rdev->bdev->bd_dev);
+- info.raid_disk = rdev->raid_disk;
+- info.state = 0;
+- if (rdev->faulty)
+- info.state |= (1<<MD_DISK_FAULTY);
+- else if (rdev->in_sync) {
+- info.state |= (1<<MD_DISK_ACTIVE);
+- info.state |= (1<<MD_DISK_SYNC);
+- }
+- } else {
+- info.major = info.minor = 0;
+- info.raid_disk = -1;
+- info.state = (1<<MD_DISK_REMOVED);
+- }
+-
+- if (copy_to_user(arg, &info, sizeof(info)))
+- return -EFAULT;
+-
+- return 0;
+-}
+-
+-static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+-{
+- mdk_rdev_t *rdev;
+- dev_t dev;
+- dev = MKDEV(info->major,info->minor);
+-
+- if (!mddev->raid_disks) {
+- int err;
+- /* expecting a device which has a superblock */
+- rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING
+- "md: md_import_device returned %ld\n",
+- PTR_ERR(rdev));
+- return PTR_ERR(rdev);
+- }
+- if (!list_empty(&mddev->disks)) {
+- mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+- mdk_rdev_t, same_set);
+- int err = super_types[mddev->major_version]
+- .load_super(rdev, rdev0, mddev->minor_version);
+- if (err < 0) {
+- printk(KERN_WARNING
+- "md: %s has different UUID to %s\n",
+- bdev_partition_name(rdev->bdev),
+- bdev_partition_name(rdev0->bdev));
+- export_rdev(rdev);
+- return -EINVAL;
+- }
+- }
+- err = bind_rdev_to_array(rdev, mddev);
+- if (err)
+- export_rdev(rdev);
+- return err;
+- }
+-
+- /*
+- * add_new_disk can be used once the array is assembled
+- * to add "hot spares". They must already have a superblock
+- * written
+- */
+- if (mddev->pers) {
+- int err;
+- if (!mddev->pers->hot_add_disk) {
+- printk(KERN_WARNING
+- "md%d: personality does not support diskops!\n",
+- mdidx(mddev));
+- return -EINVAL;
+- }
+- rdev = md_import_device(dev, mddev->major_version,
+- mddev->minor_version);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING
+- "md: md_import_device returned %ld\n",
+- PTR_ERR(rdev));
+- return PTR_ERR(rdev);
+- }
+- rdev->in_sync = 0; /* just to be sure */
+- rdev->raid_disk = -1;
+- err = bind_rdev_to_array(rdev, mddev);
+- if (err)
+- export_rdev(rdev);
+- if (mddev->thread)
+- md_wakeup_thread(mddev->thread);
+- return err;
+- }
+-
+- /* otherwise, add_new_disk is only allowed
+- * for major_version==0 superblocks
+- */
+- if (mddev->major_version != 0) {
+- printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
+- mdidx(mddev));
+- return -EINVAL;
+- }
+-
+- if (!(info->state & (1<<MD_DISK_FAULTY))) {
+- int err;
+- rdev = md_import_device (dev, -1, 0);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING
+- "md: error, md_import_device() returned %ld\n",
+- PTR_ERR(rdev));
+- return PTR_ERR(rdev);
+- }
+- rdev->desc_nr = info->number;
+- if (info->raid_disk < mddev->raid_disks)
+- rdev->raid_disk = info->raid_disk;
+- else
+- rdev->raid_disk = -1;
+-
+- rdev->faulty = 0;
+- if (rdev->raid_disk < mddev->raid_disks)
+- rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
+- else
+- rdev->in_sync = 0;
+-
+- err = bind_rdev_to_array(rdev, mddev);
+- if (err) {
+- export_rdev(rdev);
+- return err;
+- }
+-
+- if (!mddev->persistent) {
+- printk(KERN_INFO "md: nonpersistent superblock ...\n");
+- rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+- } else
+- rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+- rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+-
+- if (!mddev->size || (mddev->size > rdev->size))
+- mddev->size = rdev->size;
+- }
+-
+- return 0;
+-}
+-
+-static int hot_generate_error(mddev_t * mddev, dev_t dev)
+-{
+- struct request_queue *q;
+- mdk_rdev_t *rdev;
+-
+- if (!mddev->pers)
+- return -ENODEV;
+-
+- printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+- partition_name(dev), mdidx(mddev));
+-
+- rdev = find_rdev(mddev, dev);
+- if (!rdev) {
+- MD_BUG();
+- return -ENXIO;
+- }
+-
+- if (rdev->desc_nr == -1) {
+- MD_BUG();
+- return -EINVAL;
+- }
+- if (!rdev->in_sync)
+- return -ENODEV;
+-
+- q = bdev_get_queue(rdev->bdev);
+- if (!q) {
+- MD_BUG();
+- return -ENODEV;
+- }
+- printk(KERN_INFO "md: okay, generating error!\n");
+-// q->oneshot_error = 1; // disabled for now
+-
+- return 0;
+-}
+-
+-static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+-{
+- mdk_rdev_t *rdev;
+-
+- if (!mddev->pers)
+- return -ENODEV;
+-
+- printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+- partition_name(dev), mdidx(mddev));
+-
+- rdev = find_rdev(mddev, dev);
+- if (!rdev)
+- return -ENXIO;
+-
+- if (rdev->raid_disk >= 0)
+- goto busy;
+-
+- kick_rdev_from_array(rdev);
+- md_update_sb(mddev);
+-
+- return 0;
+-busy:
+- printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+- bdev_partition_name(rdev->bdev), mdidx(mddev));
+- return -EBUSY;
+-}
+-
+-static int hot_add_disk(mddev_t * mddev, dev_t dev)
+-{
+- int err;
+- unsigned int size;
+- mdk_rdev_t *rdev;
+-
+- if (!mddev->pers)
+- return -ENODEV;
+-
+- printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+- partition_name(dev), mdidx(mddev));
+-
+- if (mddev->major_version != 0) {
+- printk(KERN_WARNING "md%d: HOT_ADD may only be used with"
+- " version-0 superblocks.\n",
+- mdidx(mddev));
+- return -EINVAL;
+- }
+- if (!mddev->pers->hot_add_disk) {
+- printk(KERN_WARNING
+- "md%d: personality does not support diskops!\n",
+- mdidx(mddev));
+- return -EINVAL;
+- }
+-
+- rdev = md_import_device (dev, -1, 0);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING
+- "md: error, md_import_device() returned %ld\n",
+- PTR_ERR(rdev));
+- return -EINVAL;
+- }
+-
+- rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+- size = calc_dev_size(rdev, mddev->chunk_size);
+- rdev->size = size;
+-
+- if (size < mddev->size) {
+- printk(KERN_WARNING
+- "md%d: disk size %llu blocks < array size %llu\n",
+- mdidx(mddev), (unsigned long long)size,
+- (unsigned long long)mddev->size);
+- err = -ENOSPC;
+- goto abort_export;
+- }
+-
+- if (rdev->faulty) {
+- printk(KERN_WARNING
+- "md: can not hot-add faulty %s disk to md%d!\n",
+- bdev_partition_name(rdev->bdev), mdidx(mddev));
+- err = -EINVAL;
+- goto abort_export;
+- }
+- rdev->in_sync = 0;
+- rdev->desc_nr = -1;
+- bind_rdev_to_array(rdev, mddev);
+-
+- /*
+- * The rest should better be atomic, we can have disk failures
+- * noticed in interrupt contexts ...
+- */
+-
+- if (rdev->desc_nr == mddev->max_disks) {
+- printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+- mdidx(mddev));
+- err = -EBUSY;
+- goto abort_unbind_export;
+- }
+-
+- rdev->raid_disk = -1;
+-
+- md_update_sb(mddev);
+-
+- /*
+- * Kick recovery, maybe this spare has to be added to the
+- * array immediately.
+- */
+- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+- md_wakeup_thread(mddev->thread);
+-
+- return 0;
+-
+-abort_unbind_export:
+- unbind_rdev_from_array(rdev);
+-
+-abort_export:
+- export_rdev(rdev);
+- return err;
+-}
+-
+-/*
+- * set_array_info is used two different ways
+- * The original usage is when creating a new array.
+- * In this usage, raid_disks is > 0 and it together with
+- * level, size, not_persistent,layout,chunksize determine the
+- * shape of the array.
+- * This will always create an array with a type-0.90.0 superblock.
+- * The newer usage is when assembling an array.
+- * In this case raid_disks will be 0, and the major_version field is
+- * use to determine which style super-blocks are to be found on the devices.
+- * The minor and patch _version numbers are also kept incase the
+- * super_block handler wishes to interpret them.
+- */
+-static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+-{
+-
+- if (info->raid_disks == 0) {
+- /* just setting version number for superblock loading */
+- if (info->major_version < 0 ||
+- info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+- super_types[info->major_version].name == NULL) {
+- /* maybe try to auto-load a module? */
+- printk(KERN_INFO
+- "md: superblock version %d not known\n",
+- info->major_version);
+- return -EINVAL;
+- }
+- mddev->major_version = info->major_version;
+- mddev->minor_version = info->minor_version;
+- mddev->patch_version = info->patch_version;
+- return 0;
+- }
+- mddev->major_version = MD_MAJOR_VERSION;
+- mddev->minor_version = MD_MINOR_VERSION;
+- mddev->patch_version = MD_PATCHLEVEL_VERSION;
+- mddev->ctime = get_seconds();
+-
+- mddev->level = info->level;
+- mddev->size = info->size;
+- mddev->raid_disks = info->raid_disks;
+- /* don't set __minor, it is determined by which /dev/md* was
+- * openned
+- */
+- if (info->state & (1<<MD_SB_CLEAN))
+- mddev->recovery_cp = MaxSector;
+- else
+- mddev->recovery_cp = 0;
+- mddev->persistent = ! info->not_persistent;
+-
+- mddev->layout = info->layout;
+- mddev->chunk_size = info->chunk_size;
+-
+- mddev->max_disks = MD_SB_DISKS;
+-
+-
+- /*
+- * Generate a 128 bit UUID
+- */
+- get_random_bytes(mddev->uuid, 16);
+-
+- return 0;
+-}
+-
+-static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+-{
+- mdk_rdev_t *rdev;
+-
+- rdev = find_rdev(mddev, dev);
+- if (!rdev)
+- return 0;
+-
+- md_error(mddev, rdev);
+- return 1;
+-}
+-
+-static int md_ioctl(struct inode *inode, struct file *file,
+- unsigned int cmd, unsigned long arg)
+-{
+- unsigned int minor;
+- int err = 0;
+- struct hd_geometry *loc = (struct hd_geometry *) arg;
+- mddev_t *mddev = NULL;
+- kdev_t dev;
+-
+- if (!capable(CAP_SYS_ADMIN))
+- return -EACCES;
+-
+- dev = inode->i_rdev;
+- minor = minor(dev);
+- if (minor >= MAX_MD_DEVS) {
+- MD_BUG();
+- return -EINVAL;
+- }
+-
+- /*
+- * Commands dealing with the RAID driver but not any
+- * particular array:
+- */
+- switch (cmd)
+- {
+- case RAID_VERSION:
+- err = get_version((void *)arg);
+- goto done;
+-
+- case PRINT_RAID_DEBUG:
+- err = 0;
+- md_print_devices();
+- goto done;
+-
+-#ifndef MODULE
+- case RAID_AUTORUN:
+- err = 0;
+- autostart_arrays();
+- goto done;
+-#endif
+- default:;
+- }
+-
+- /*
+- * Commands creating/starting a new array:
+- */
+-
+- mddev = inode->i_bdev->bd_inode->u.generic_ip;
+-
+- if (!mddev) {
+- BUG();
+- goto abort;
+- }
+-
+-
+- if (cmd == START_ARRAY) {
+- /* START_ARRAY doesn't need to lock the array as autostart_array
+- * does the locking, and it could even be a different array
+- */
+- err = autostart_array(arg);
+- if (err) {
+- printk(KERN_WARNING "md: autostart %s failed!\n",
+- partition_name(arg));
+- goto abort;
+- }
+- goto done;
+- }
+-
+- err = mddev_lock(mddev);
+- if (err) {
+- printk(KERN_INFO
+- "md: ioctl lock interrupted, reason %d, cmd %d\n",
+- err, cmd);
+- goto abort;
+- }
+-
+- switch (cmd)
+- {
+- case SET_ARRAY_INFO:
+-
+- if (!list_empty(&mddev->disks)) {
+- printk(KERN_WARNING
+- "md: array md%d already has disks!\n",
+- mdidx(mddev));
+- err = -EBUSY;
+- goto abort_unlock;
+- }
+- if (mddev->raid_disks) {
+- printk(KERN_WARNING
+- "md: array md%d already initialised!\n",
+- mdidx(mddev));
+- err = -EBUSY;
+- goto abort_unlock;
+- }
+- {
+- mdu_array_info_t info;
+- if (!arg)
+- memset(&info, 0, sizeof(info));
+- else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+- err = -EFAULT;
+- goto abort_unlock;
+- }
+- err = set_array_info(mddev, &info);
+- if (err) {
+- printk(KERN_WARNING "md: couldn't set"
+- " array info. 
%d\n", err); +- goto abort_unlock; +- } +- } +- goto done_unlock; +- +- default:; +- } +- +- /* +- * Commands querying/configuring an existing array: +- */ +- /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ +- if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { +- err = -ENODEV; +- goto abort_unlock; +- } +- +- /* +- * Commands even a read-only array can execute: +- */ +- switch (cmd) +- { +- case GET_ARRAY_INFO: +- err = get_array_info(mddev, (void *)arg); +- goto done_unlock; +- +- case GET_DISK_INFO: +- err = get_disk_info(mddev, (void *)arg); +- goto done_unlock; +- +- case RESTART_ARRAY_RW: +- err = restart_array(mddev); +- goto done_unlock; +- +- case STOP_ARRAY: +- err = do_md_stop (mddev, 0); +- goto done_unlock; +- +- case STOP_ARRAY_RO: +- err = do_md_stop (mddev, 1); +- goto done_unlock; +- +- /* +- * We have a problem here : there is no easy way to give a CHS +- * virtual geometry. We currently pretend that we have a 2 heads +- * 4 sectors (with a BIG number of cylinders...). This drives +- * dosfs just mad... ;-) +- */ +- case HDIO_GETGEO: +- if (!loc) { +- err = -EINVAL; +- goto abort_unlock; +- } +- err = put_user (2, (char *) &loc->heads); +- if (err) +- goto abort_unlock; +- err = put_user (4, (char *) &loc->sectors); +- if (err) +- goto abort_unlock; +- err = put_user(get_capacity(disks[mdidx(mddev)])/8, +- (short *) &loc->cylinders); +- if (err) +- goto abort_unlock; +- err = put_user (get_start_sect(inode->i_bdev), +- (long *) &loc->start); +- goto done_unlock; +- } +- +- /* +- * The remaining ioctls are changing the state of the +- * superblock, so we do not allow read-only arrays +- * here: +- */ +- if (mddev->ro) { +- err = -EROFS; +- goto abort_unlock; +- } +- +- switch (cmd) +- { +- case ADD_NEW_DISK: +- { +- mdu_disk_info_t info; +- if (copy_from_user(&info, (void*)arg, sizeof(info))) +- err = -EFAULT; +- else +- err = add_new_disk(mddev, &info); +- goto done_unlock; +- } +- case HOT_GENERATE_ERROR: +- err = hot_generate_error(mddev, arg); +- goto done_unlock; +- case HOT_REMOVE_DISK: +- err = hot_remove_disk(mddev, arg); +- goto done_unlock; +- +- case HOT_ADD_DISK: +- err = hot_add_disk(mddev, arg); +- goto done_unlock; +- +- case SET_DISK_FAULTY: +- err = set_disk_faulty(mddev, arg); +- goto done_unlock; +- +- case RUN_ARRAY: +- { +- err = do_md_run (mddev); +- /* +- * we have to clean up the mess if +- * the array cannot be run for some +- * reason ... +- * ->pers will not be set, to superblock will +- * not be updated. +- */ +- if (err) +- do_md_stop (mddev, 0); +- goto done_unlock; +- } +- +- default: +- if (_IOC_TYPE(cmd) == MD_MAJOR) +- printk(KERN_WARNING "md: %s(pid %d) used" +- " obsolete MD ioctl, upgrade your" +- " software to use new ictls.\n", +- current->comm, current->pid); +- err = -EINVAL; +- goto abort_unlock; +- } +- +-done_unlock: +-abort_unlock: +- mddev_unlock(mddev); +- +- return err; +-done: +- if (err) +- MD_BUG(); +-abort: +- return err; +-} +- +-static int md_open(struct inode *inode, struct file *file) +-{ +- /* +- * Succeed if we can find or allocate a mddev structure. 
+- */ +- mddev_t *mddev = mddev_find(minor(inode->i_rdev)); +- int err = -ENOMEM; +- +- if (!mddev) +- goto out; +- +- if ((err = mddev_lock(mddev))) +- goto put; +- +- err = 0; +- mddev_unlock(mddev); +- inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev); +- put: +- mddev_put(mddev); +- out: +- return err; +-} +- +-static int md_release(struct inode *inode, struct file * file) +-{ +- mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip; +- +- if (!mddev) +- BUG(); +- mddev_put(mddev); +- +- return 0; +-} +- +-static struct block_device_operations md_fops = +-{ +- .owner = THIS_MODULE, +- .open = md_open, +- .release = md_release, +- .ioctl = md_ioctl, +-}; +- +-int md_thread(void * arg) +-{ +- mdk_thread_t *thread = arg; +- +- lock_kernel(); +- +- /* +- * Detach thread +- */ +- +- daemonize(thread->name, mdidx(thread->mddev)); +- +- current->exit_signal = SIGCHLD; +- allow_signal(SIGKILL); +- thread->tsk = current; +- +- /* +- * md_thread is a 'system-thread', it's priority should be very +- * high. We avoid resource deadlocks individually in each +- * raid personality. (RAID5 does preallocation) We also use RR and +- * the very same RT priority as kswapd, thus we will never get +- * into a priority inversion deadlock. +- * +- * we definitely have to have equal or higher priority than +- * bdflush, otherwise bdflush will deadlock if there are too +- * many dirty RAID5 blocks. +- */ +- unlock_kernel(); +- +- complete(thread->event); +- while (thread->run) { +- void (*run)(mddev_t *); +- +- wait_event_interruptible(thread->wqueue, +- test_bit(THREAD_WAKEUP, &thread->flags)); +- if (current->flags & PF_FREEZE) +- refrigerator(PF_IOTHREAD); +- +- clear_bit(THREAD_WAKEUP, &thread->flags); +- +- run = thread->run; +- if (run) { +- run(thread->mddev); +- blk_run_queues(); +- } +- if (signal_pending(current)) +- flush_signals(current); +- } +- complete(thread->event); +- return 0; +-} +- +-void md_wakeup_thread(mdk_thread_t *thread) +-{ +- if (thread) { +- dprintk("md: waking up MD thread %p.\n", thread); +- set_bit(THREAD_WAKEUP, &thread->flags); +- wake_up(&thread->wqueue); +- } +-} +- +-mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, +- const char *name) +-{ +- mdk_thread_t *thread; +- int ret; +- struct completion event; +- +- thread = (mdk_thread_t *) kmalloc +- (sizeof(mdk_thread_t), GFP_KERNEL); +- if (!thread) +- return NULL; +- +- memset(thread, 0, sizeof(mdk_thread_t)); +- init_waitqueue_head(&thread->wqueue); +- +- init_completion(&event); +- thread->event = &event; +- thread->run = run; +- thread->mddev = mddev; +- thread->name = name; +- ret = kernel_thread(md_thread, thread, 0); +- if (ret < 0) { +- kfree(thread); +- return NULL; +- } +- wait_for_completion(&event); +- return thread; +-} +- +-void md_interrupt_thread(mdk_thread_t *thread) +-{ +- if (!thread->tsk) { +- MD_BUG(); +- return; +- } +- dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); +- send_sig(SIGKILL, thread->tsk, 1); +-} +- +-void md_unregister_thread(mdk_thread_t *thread) +-{ +- struct completion event; +- +- init_completion(&event); +- +- thread->event = &event; +- thread->run = NULL; +- thread->name = NULL; +- md_interrupt_thread(thread); +- wait_for_completion(&event); +- kfree(thread); +-} +- +-void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +-{ +- dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", +- MD_MAJOR,mdidx(mddev), +- MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), +- __builtin_return_address(0),__builtin_return_address(1), +- 
__builtin_return_address(2),__builtin_return_address(3));
+-
+- if (!mddev) {
+- MD_BUG();
+- return;
+- }
+-
+- if (!rdev || rdev->faulty)
+- return;
+- if (!mddev->pers->error_handler)
+- return;
+- mddev->pers->error_handler(mddev,rdev);
+- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+- md_wakeup_thread(mddev->thread);
+-}
+-
+-/* seq_file implementation /proc/mdstat */
+-
+-static void status_unused(struct seq_file *seq)
+-{
+- int i = 0;
+- mdk_rdev_t *rdev;
+- struct list_head *tmp;
+-
+- seq_printf(seq, "unused devices: ");
+-
+- ITERATE_RDEV_PENDING(rdev,tmp) {
+- i++;
+- seq_printf(seq, "%s ",
+- bdev_partition_name(rdev->bdev));
+- }
+- if (!i)
+- seq_printf(seq, "<none>");
+-
+- seq_printf(seq, "\n");
+-}
+-
+-
+-static void status_resync(struct seq_file *seq, mddev_t * mddev)
+-{
+- unsigned long max_blocks, resync, res, dt, db, rt;
+-
+- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+- max_blocks = mddev->size;
+-
+- /*
+- * Should not happen.
+- */
+- if (!max_blocks) {
+- MD_BUG();
+- return;
+- }
+- res = (resync/1024)*1000/(max_blocks/1024 + 1);
+- {
+- int i, x = res/50, y = 20-x;
+- seq_printf(seq, "[");
+- for (i = 0; i < x; i++)
+- seq_printf(seq, "=");
+- seq_printf(seq, ">");
+- for (i = 0; i < y; i++)
+- seq_printf(seq, ".");
+- seq_printf(seq, "] ");
+- }
+- seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+- (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+- "resync" : "recovery"),
+- res/10, res % 10, resync, max_blocks);
+-
+- /*
+- * We do not want to overflow, so the order of operands and
+- * the * 100 / 100 trick are important. We do a +1 to be
+- * safe against division by zero. We only estimate anyway.
+- *
+- * dt: time from mark until now
+- * db: blocks written from mark until now
+- * rt: remaining time
+- */
+- dt = ((jiffies - mddev->resync_mark) / HZ);
+- if (!dt) dt++;
+- db = resync - (mddev->resync_mark_cnt/2);
+- rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+-
+- seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+-
+- seq_printf(seq, " speed=%ldK/sec", db/dt);
+-}
+-
+-static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+-{
+- struct list_head *tmp;
+- loff_t l = *pos;
+- mddev_t *mddev;
+-
+- if (l > 0x10000)
+- return NULL;
+- if (!l--)
+- /* header */
+- return (void*)1;
+-
+- spin_lock(&all_mddevs_lock);
+- list_for_each(tmp,&all_mddevs)
+- if (!l--) {
+- mddev = list_entry(tmp, mddev_t, all_mddevs);
+- mddev_get(mddev);
+- spin_unlock(&all_mddevs_lock);
+- return mddev;
+- }
+- spin_unlock(&all_mddevs_lock);
+- return (void*)2;/* tail */
+-}
+-
+-static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+-{
+- struct list_head *tmp;
+- mddev_t *next_mddev, *mddev = v;
+-
+- ++*pos;
+- if (v == (void*)2)
+- return NULL;
+-
+- spin_lock(&all_mddevs_lock);
+- if (v == (void*)1)
+- tmp = all_mddevs.next;
+- else
+- tmp = mddev->all_mddevs.next;
+- if (tmp != &all_mddevs)
+- next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+- else {
+- next_mddev = (void*)2;
+- *pos = 0x10000;
+- }
+- spin_unlock(&all_mddevs_lock);
+-
+- if (v != (void*)1)
+- mddev_put(mddev);
+- return next_mddev;
+-
+-}
+-
+-static void md_seq_stop(struct seq_file *seq, void *v)
+-{
+- mddev_t *mddev = v;
+-
+- if (mddev && v != (void*)1 && v != (void*)2)
+- mddev_put(mddev);
+-}
+-
+-static int md_seq_show(struct seq_file *seq, void *v)
+-{
+- mddev_t *mddev = v;
+- sector_t size;
+- struct list_head *tmp2;
+- mdk_rdev_t *rdev;
+- int i;
+-
+- if (v == (void*)1) {
+- seq_printf(seq, "Personalities : ");
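/* Worked example of status_resync()'s rate/ETA arithmetic above (a
 * standalone sketch; the sample numbers are made up):
 *   dt = seconds since the last rate mark
 *   db = blocks completed since that mark
 *   rt = dt * ((max_blocks - resync) / (db/100 + 1)) / 100
 * The paired /100 and *100 keep the intermediate products inside an
 * unsigned long, and the +1 guards against dividing by zero. */
#include <stdio.h>

int main(void)
{
	unsigned long dt = 120;		/* seconds since mark */
	unsigned long db = 240000;	/* blocks done since mark */
	unsigned long max_blocks = 4000000, resync = 1000000;
	unsigned long rt = (dt * ((max_blocks - resync) / (db/100 + 1))) / 100;

	/* prints "speed=2000K/sec finish=24.9min" (about 1498s remaining) */
	printf("speed=%luK/sec finish=%lu.%lumin\n",
	       db/dt, rt / 60, (rt % 60) / 6);
	return 0;
}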
spin_lock(&pers_lock); +- for (i = 0; i < MAX_PERSONALITY; i++) +- if (pers[i]) +- seq_printf(seq, "[%s] ", pers[i]->name); +- +- spin_unlock(&pers_lock); +- seq_printf(seq, "\n"); +- return 0; +- } +- if (v == (void*)2) { +- status_unused(seq); +- return 0; +- } +- +- if (mddev_lock(mddev)!=0) +- return -EINTR; +- if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { +- seq_printf(seq, "md%d : %sactive", mdidx(mddev), +- mddev->pers ? "" : "in"); +- if (mddev->pers) { +- if (mddev->ro) +- seq_printf(seq, " (read-only)"); +- seq_printf(seq, " %s", mddev->pers->name); +- } +- +- size = 0; +- ITERATE_RDEV(mddev,rdev,tmp2) { +- seq_printf(seq, " %s[%d]", +- bdev_partition_name(rdev->bdev), rdev->desc_nr); +- if (rdev->faulty) { +- seq_printf(seq, "(F)"); +- continue; +- } +- size += rdev->size; +- } +- +- if (!list_empty(&mddev->disks)) { +- if (mddev->pers) +- seq_printf(seq, "\n %llu blocks", +- (unsigned long long)mddev->array_size); +- else +- seq_printf(seq, "\n %llu blocks", +- (unsigned long long)size); +- } +- +- if (mddev->pers) { +- mddev->pers->status (seq, mddev); +- seq_printf(seq, "\n "); +- if (mddev->curr_resync > 2) +- status_resync (seq, mddev); +- else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) +- seq_printf(seq, " resync=DELAYED"); +- } +- +- seq_printf(seq, "\n"); +- } +- mddev_unlock(mddev); +- +- return 0; +-} +- +-static struct seq_operations md_seq_ops = { +- .start = md_seq_start, +- .next = md_seq_next, +- .stop = md_seq_stop, +- .show = md_seq_show, +-}; +- +-static int md_seq_open(struct inode *inode, struct file *file) +-{ +- int error; +- +- error = seq_open(file, &md_seq_ops); +- return error; +-} +- +-static struct file_operations md_seq_fops = { +- .open = md_seq_open, +- .read = seq_read, +- .llseek = seq_lseek, +- .release = seq_release, +-}; +- +-int register_md_personality(int pnum, mdk_personality_t *p) +-{ +- if (pnum >= MAX_PERSONALITY) { +- MD_BUG(); +- return -EINVAL; +- } +- +- spin_lock(&pers_lock); +- if (pers[pnum]) { +- spin_unlock(&pers_lock); +- MD_BUG(); +- return -EBUSY; +- } +- +- pers[pnum] = p; +- printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); +- spin_unlock(&pers_lock); +- return 0; +-} +- +-int unregister_md_personality(int pnum) +-{ +- if (pnum >= MAX_PERSONALITY) { +- MD_BUG(); +- return -EINVAL; +- } +- +- printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); +- spin_lock(&pers_lock); +- pers[pnum] = NULL; +- spin_unlock(&pers_lock); +- return 0; +-} +- +-void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) +-{ +- rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; +-} +- +-static int is_mddev_idle(mddev_t *mddev) +-{ +- mdk_rdev_t * rdev; +- struct list_head *tmp; +- int idle; +- unsigned long curr_events; +- +- idle = 1; +- ITERATE_RDEV(mddev,rdev,tmp) { +- struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; +- curr_events = disk_stat_read(disk, read_sectors) + +- disk_stat_read(disk, write_sectors) - +- disk->sync_io; +- if ((curr_events - rdev->last_events) > 32) { +- rdev->last_events = curr_events; +- idle = 0; +- } +- } +- return idle; +-} +- +-void md_done_sync(mddev_t *mddev, int blocks, int ok) +-{ +- /* another "blocks" (512byte) blocks have been synced */ +- atomic_sub(blocks, &mddev->recovery_active); +- wake_up(&mddev->recovery_wait); +- if (!ok) { +- set_bit(MD_RECOVERY_ERR, &mddev->recovery); +- md_wakeup_thread(mddev->thread); +- // stop recovery, signal do_sync .... 
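/* Illustrative sketch of the idle test in is_mddev_idle() above
 * (hypothetical helper, not in md.c): a member disk counts as busy when
 * I/O other than the resync's own - total sectors moved minus the
 * sectors md_sync_acct() credited to sync_io - has advanced by more
 * than a small slack since the previous check. */
static int saw_foreign_io(unsigned long read_sectors,
                          unsigned long write_sectors,
                          unsigned long sync_io,
                          unsigned long *last_events)
{
	unsigned long curr_events = read_sectors + write_sectors - sync_io;

	if (curr_events - *last_events > 32) {
		*last_events = curr_events;	/* remember the new baseline */
		return 1;			/* non-resync I/O happened */
	}
	return 0;
}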
+- } +-} +- +- +-void md_write_start(mddev_t *mddev) +-{ +- if (!atomic_read(&mddev->writes_pending)) { +- mddev_lock_uninterruptible(mddev); +- if (mddev->in_sync) { +- mddev->in_sync = 0; +- del_timer(&mddev->safemode_timer); +- md_update_sb(mddev); +- } +- atomic_inc(&mddev->writes_pending); +- mddev_unlock(mddev); +- } else +- atomic_inc(&mddev->writes_pending); +-} +- +-void md_write_end(mddev_t *mddev) +-{ +- if (atomic_dec_and_test(&mddev->writes_pending)) { +- if (mddev->safemode == 2) +- md_wakeup_thread(mddev->thread); +- else +- mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); +- } +-} +- +-static inline void md_enter_safemode(mddev_t *mddev) +-{ +- mddev_lock_uninterruptible(mddev); +- if (mddev->safemode && !atomic_read(&mddev->writes_pending) && +- !mddev->in_sync && mddev->recovery_cp == MaxSector) { +- mddev->in_sync = 1; +- md_update_sb(mddev); +- } +- mddev_unlock(mddev); +- +- if (mddev->safemode == 1) +- mddev->safemode = 0; +-} +- +-void md_handle_safemode(mddev_t *mddev) +-{ +- if (signal_pending(current)) { +- printk(KERN_INFO "md: md%d in immediate safe mode\n", +- mdidx(mddev)); +- mddev->safemode = 2; +- flush_signals(current); +- } +- if (mddev->safemode) +- md_enter_safemode(mddev); +-} +- +- +-DECLARE_WAIT_QUEUE_HEAD(resync_wait); +- +-#define SYNC_MARKS 10 +-#define SYNC_MARK_STEP (3*HZ) +-static void md_do_sync(mddev_t *mddev) +-{ +- mddev_t *mddev2; +- unsigned int max_sectors, currspeed = 0, +- j, window; +- unsigned long mark[SYNC_MARKS]; +- unsigned long mark_cnt[SYNC_MARKS]; +- int last_mark,m; +- struct list_head *tmp; +- unsigned long last_check; +- +- /* just incase thread restarts... */ +- if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) +- return; +- +- /* we overload curr_resync somewhat here. 
+- * 0 == not engaged in resync at all +- * 2 == checking that there is no conflict with another sync +- * 1 == like 2, but have yielded to allow conflicting resync to +- * commense +- * other == active in resync - this many blocks +- */ +- do { +- mddev->curr_resync = 2; +- +- ITERATE_MDDEV(mddev2,tmp) { +- if (mddev2 == mddev) +- continue; +- if (mddev2->curr_resync && +- match_mddev_units(mddev,mddev2)) { +- printk(KERN_INFO "md: delaying resync of md%d" +- " until md%d has finished resync (they" +- " share one or more physical units)\n", +- mdidx(mddev), mdidx(mddev2)); +- if (mddev < mddev2) {/* arbitrarily yield */ +- mddev->curr_resync = 1; +- wake_up(&resync_wait); +- } +- if (wait_event_interruptible(resync_wait, +- mddev2->curr_resync < mddev->curr_resync)) { +- flush_signals(current); +- mddev_put(mddev2); +- goto skip; +- } +- } +- if (mddev->curr_resync == 1) { +- mddev_put(mddev2); +- break; +- } +- } +- } while (mddev->curr_resync < 2); +- +- max_sectors = mddev->size << 1; +- +- printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); +- printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" +- " %d KB/sec/disc.\n", sysctl_speed_limit_min); +- printk(KERN_INFO "md: using maximum available idle IO bandwith " +- "(but not more than %d KB/sec) for reconstruction.\n", +- sysctl_speed_limit_max); +- +- is_mddev_idle(mddev); /* this also initializes IO event counters */ +- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) +- j = mddev->recovery_cp; +- else +- j = 0; +- for (m = 0; m < SYNC_MARKS; m++) { +- mark[m] = jiffies; +- mark_cnt[m] = j; +- } +- last_mark = 0; +- mddev->resync_mark = mark[last_mark]; +- mddev->resync_mark_cnt = mark_cnt[last_mark]; +- +- /* +- * Tune reconstruction: +- */ +- window = 32*(PAGE_SIZE/512); +- printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", +- window/2,max_sectors/2); +- +- atomic_set(&mddev->recovery_active, 0); +- init_waitqueue_head(&mddev->recovery_wait); +- last_check = 0; +- +- if (j) +- printk(KERN_INFO +- "md: resuming recovery of md%d from checkpoint.\n", +- mdidx(mddev)); +- +- while (j < max_sectors) { +- int sectors; +- +- sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); +- if (sectors < 0) { +- set_bit(MD_RECOVERY_ERR, &mddev->recovery); +- goto out; +- } +- atomic_add(sectors, &mddev->recovery_active); +- j += sectors; +- if (j>1) mddev->curr_resync = j; +- +- if (last_check + window > j) +- continue; +- +- last_check = j; +- +- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || +- test_bit(MD_RECOVERY_ERR, &mddev->recovery)) +- break; +- +- blk_run_queues(); +- +- repeat: +- if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { +- /* step marks */ +- int next = (last_mark+1) % SYNC_MARKS; +- +- mddev->resync_mark = mark[next]; +- mddev->resync_mark_cnt = mark_cnt[next]; +- mark[next] = jiffies; +- mark_cnt[next] = j - atomic_read(&mddev->recovery_active); +- last_mark = next; +- } +- +- +- if (signal_pending(current)) { +- /* +- * got a signal, exit. +- */ +- printk(KERN_INFO +- "md: md_do_sync() got signal ... exiting\n"); +- flush_signals(current); +- set_bit(MD_RECOVERY_INTR, &mddev->recovery); +- goto out; +- } +- +- /* +- * this loop exits only if either when we are slower than +- * the 'hard' speed limit, or the system was IO-idle for +- * a jiffy. +- * the system might be non-idle CPU-wise, but we only care +- * about not overloading the IO subsystem. 
(things like an +- * e2fsck being done on the RAID array should execute fast) +- */ +- cond_resched(); +- +- currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; +- +- if (currspeed > sysctl_speed_limit_min) { +- if ((currspeed > sysctl_speed_limit_max) || +- !is_mddev_idle(mddev)) { +- current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(HZ/4); +- goto repeat; +- } +- } +- } +- printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); +- /* +- * this also signals 'finished resyncing' to md_stop +- */ +- out: +- wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); +- +- /* tell personality that we are finished */ +- mddev->pers->sync_request(mddev, max_sectors, 1); +- +- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && +- mddev->curr_resync > 2 && +- mddev->curr_resync > mddev->recovery_cp) { +- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { +- printk(KERN_INFO +- "md: checkpointing recovery of md%d.\n", +- mdidx(mddev)); +- mddev->recovery_cp = mddev->curr_resync; +- } else +- mddev->recovery_cp = MaxSector; +- } +- +- if (mddev->safemode) +- md_enter_safemode(mddev); +- skip: +- mddev->curr_resync = 0; +- set_bit(MD_RECOVERY_DONE, &mddev->recovery); +- md_wakeup_thread(mddev->thread); +-} +- +- +-/* +- * This routine is regularly called by all per-raid-array threads to +- * deal with generic issues like resync and super-block update. +- * Raid personalities that don't have a thread (linear/raid0) do not +- * need this as they never do any recovery or update the superblock. +- * +- * It does not do any resync itself, but rather "forks" off other threads +- * to do that as needed. +- * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in +- * "->recovery" and create a thread at ->sync_thread. +- * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) +- * and wakeups up this thread which will reap the thread and finish up. +- * This thread also removes any faulty devices (with nr_pending == 0). +- * +- * The overall approach is: +- * 1/ if the superblock needs updating, update it. +- * 2/ If a recovery thread is running, don't do anything else. +- * 3/ If recovery has finished, clean up, possibly marking spares active. +- * 4/ If there are any faulty devices, remove them. +- * 5/ If array is degraded, try to add spares devices +- * 6/ If array has spares or is not in-sync, start a resync thread. +- */ +-void md_check_recovery(mddev_t *mddev) +-{ +- mdk_rdev_t *rdev; +- struct list_head *rtmp; +- +- +- dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); +- +- if (mddev->ro) +- return; +- if ( ! ( +- mddev->sb_dirty || +- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || +- test_bit(MD_RECOVERY_DONE, &mddev->recovery) +- )) +- return; +- if (mddev_trylock(mddev)==0) { +- int spares =0; +- if (mddev->sb_dirty) +- md_update_sb(mddev); +- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && +- !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) +- /* resync/recovery still happening */ +- goto unlock; +- if (mddev->sync_thread) { +- /* resync has finished, collect result */ +- md_unregister_thread(mddev->sync_thread); +- mddev->sync_thread = NULL; +- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) { +- /* success...*/ +- /* activate any spares */ +- mddev->pers->spare_active(mddev); +- } +- md_update_sb(mddev); +- mddev->recovery = 0; +- wake_up(&resync_wait); +- goto unlock; +- } +- if (mddev->recovery) { +- /* that's odd.. 
*/ +- mddev->recovery = 0; +- wake_up(&resync_wait); +- } +- +- /* no recovery is running. +- * remove any failed drives, then +- * add spares if possible +- */ +- ITERATE_RDEV(mddev,rdev,rtmp) { +- if (rdev->raid_disk >= 0 && +- rdev->faulty && +- atomic_read(&rdev->nr_pending)==0) { +- mddev->pers->hot_remove_disk(mddev, rdev->raid_disk); +- rdev->raid_disk = -1; +- } +- if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync) +- spares++; +- } +- if (mddev->degraded) { +- ITERATE_RDEV(mddev,rdev,rtmp) +- if (rdev->raid_disk < 0 +- && !rdev->faulty) { +- if (mddev->pers->hot_add_disk(mddev,rdev)) +- spares++; +- else +- break; +- } +- } +- +- if (!spares && (mddev->recovery_cp == MaxSector )) { +- /* nothing we can do ... */ +- goto unlock; +- } +- if (mddev->pers->sync_request) { +- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); +- if (!spares) +- set_bit(MD_RECOVERY_SYNC, &mddev->recovery); +- mddev->sync_thread = md_register_thread(md_do_sync, +- mddev, +- "md%d_resync"); +- if (!mddev->sync_thread) { +- printk(KERN_ERR "md%d: could not start resync" +- " thread...\n", +- mdidx(mddev)); +- /* leave the spares where they are, it shouldn't hurt */ +- mddev->recovery = 0; +- } else { +- md_wakeup_thread(mddev->sync_thread); +- } +- } +- unlock: +- mddev_unlock(mddev); +- } +-} +- +-int md_notify_reboot(struct notifier_block *this, +- unsigned long code, void *x) +-{ +- struct list_head *tmp; +- mddev_t *mddev; +- +- if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { +- +- printk(KERN_INFO "md: stopping all md devices.\n"); +- +- ITERATE_MDDEV(mddev,tmp) +- if (mddev_trylock(mddev)==0) +- do_md_stop (mddev, 1); +- /* +- * certain more exotic SCSI devices are known to be +- * volatile wrt too early system reboots. While the +- * right place to handle this issue is the given +- * driver, we do want to have a safe RAID driver ... +- */ +- mdelay(1000*1); +- } +- return NOTIFY_DONE; +-} +- +-struct notifier_block md_notifier = { +- .notifier_call = md_notify_reboot, +- .next = NULL, +- .priority = INT_MAX, /* before any real devices */ +-}; +- +-static void md_geninit(void) +-{ +- struct proc_dir_entry *p; +- +- dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); +- +-#ifdef CONFIG_PROC_FS +- p = create_proc_entry("mdstat", S_IRUGO, NULL); +- if (p) +- p->proc_fops = &md_seq_fops; +-#endif +-} +- +-int __init md_init(void) +-{ +- int minor; +- +- printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," +- " MD_SB_DISKS=%d\n", +- MD_MAJOR_VERSION, MD_MINOR_VERSION, +- MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); +- +- if (register_blkdev(MAJOR_NR, "md")) +- return -1; +- +- devfs_mk_dir("md"); +- blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, +- md_probe, NULL, NULL); +- for (minor=0; minor < MAX_MD_DEVS; ++minor) { +- char name[16]; +- sprintf(name, "md/%d", minor); +- devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor, +- S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); +- } +- +- register_reboot_notifier(&md_notifier); +- raid_table_header = register_sysctl_table(raid_root_table, 1); +- +- md_geninit(); +- return (0); +-} +- +- +-#ifndef MODULE +- +-/* +- * Searches all registered partitions for autorun RAID arrays +- * at boot time. 
+- */ +-static dev_t detected_devices[128]; +-static int dev_cnt; +- +-void md_autodetect_dev(dev_t dev) +-{ +- if (dev_cnt >= 0 && dev_cnt < 127) +- detected_devices[dev_cnt++] = dev; +-} +- +- +-static void autostart_arrays(void) +-{ +- mdk_rdev_t *rdev; +- int i; +- +- printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); +- +- for (i = 0; i < dev_cnt; i++) { +- dev_t dev = detected_devices[i]; +- +- rdev = md_import_device(dev,0, 0); +- if (IS_ERR(rdev)) { +- printk(KERN_ALERT "md: could not import %s!\n", +- partition_name(dev)); +- continue; +- } +- if (rdev->faulty) { +- MD_BUG(); +- continue; +- } +- list_add(&rdev->same_set, &pending_raid_disks); +- } +- dev_cnt = 0; +- +- autorun_devices(); +-} +- +-#endif +- +-static __exit void md_exit(void) +-{ +- int i; +- blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); +- for (i=0; i < MAX_MD_DEVS; i++) +- devfs_remove("md/%d", i); +- devfs_remove("md"); +- +- unregister_blkdev(MAJOR_NR,"md"); +- unregister_reboot_notifier(&md_notifier); +- unregister_sysctl_table(raid_table_header); +-#ifdef CONFIG_PROC_FS +- remove_proc_entry("mdstat", NULL); +-#endif +- for (i = 0; i < MAX_MD_DEVS; i++) { +- struct gendisk *disk = disks[i]; +- mddev_t *mddev; +- if (!disks[i]) +- continue; +- mddev = disk->private_data; +- del_gendisk(disk); +- put_disk(disk); +- mddev_put(mddev); +- } +-} +- +-module_init(md_init) +-module_exit(md_exit) +- +-EXPORT_SYMBOL(register_md_personality); +-EXPORT_SYMBOL(unregister_md_personality); +-EXPORT_SYMBOL(md_error); +-EXPORT_SYMBOL(md_sync_acct); +-EXPORT_SYMBOL(md_done_sync); +-EXPORT_SYMBOL(md_write_start); +-EXPORT_SYMBOL(md_write_end); +-EXPORT_SYMBOL(md_handle_safemode); +-EXPORT_SYMBOL(md_register_thread); +-EXPORT_SYMBOL(md_unregister_thread); +-EXPORT_SYMBOL(md_wakeup_thread); +-EXPORT_SYMBOL(md_print_devices); +-EXPORT_SYMBOL(md_interrupt_thread); +-EXPORT_SYMBOL(md_check_recovery); +-MODULE_LICENSE("GPL"); diff --git a/tests/linux/md/lmerge b/tests/linux/md/lmerge new file mode 100644 index 0000000..6440da9 --- /dev/null +++ b/tests/linux/md/lmerge @@ -0,0 +1,3595 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher + - Devfs support by Richard Gooch + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown . + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include /* for invalidate_bdev */ +#include + +#include + +#ifdef CONFIG_KMOD +#include +#endif + +#define __KERNEL_SYSCALLS__ +#include + +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define DEVICE_NR(device) (minor(device)) + +#include + +#define DEBUG 0 +#define dprintk(x...) 
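/*
 * A minimal sketch of the collect-then-drain pattern md_autodetect_dev()
 * and autostart_arrays() implement above: partitions found at boot are
 * appended to a fixed-size buffer, and a single later pass imports them
 * all and resets the count. Types and names are illustrative stand-ins.
 */
#include <stdio.h>

static unsigned int detected[128];
static int dev_cnt;

static void autodetect(unsigned int dev)
{
        if (dev_cnt >= 0 && dev_cnt < 127)      /* bounded append */
                detected[dev_cnt++] = dev;
}

static void autostart(void)
{
        for (int i = 0; i < dev_cnt; i++)
                printf("importing device %#x\n", detected[i]);
        dev_cnt = 0;                            /* drain resets the buffer */
}

int main(void)
{
        autodetect(0x0801);
        autodetect(0x0811);
        autostart();
        return 0;
}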
((void)(DEBUG && printk(x))) + + +#ifndef MODULE +static void autostart_arrays (void); +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY]; +static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 1000 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 1000; +static int sysctl_speed_limit_max = 200000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + { + .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, + .procname = "speed_limit_min", + .data = &sysctl_speed_limit_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, + .procname = "speed_limit_max", + .data = &sysctl_speed_limit_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table raid_dir_table[] = { + { + .ctl_name = DEV_RAID, + .procname = "raid", + .maxlen = 0, + .mode = 0555, + .child = raid_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table raid_root_table[] = { + { + .ctl_name = CTL_DEV, + .procname = "dev", + .maxlen = 0, + .mode = 0555, + .child = raid_dir_table, + }, + { .ctl_name = 0 } +}; + +static struct block_device_operations md_fops; + +static struct gendisk *disks[MAX_MD_DEVS]; + +/* + * Enables to iterate over all existing md arrays + * all_mddevs_lock protects this list as well as mddev_map. + */ +static LIST_HEAD(all_mddevs); +static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; + + +/* + * iterates through all used mddevs in the system. + * We take care to grab the all_mddevs_lock whenever navigating + * the list, and to always hold a refcount when unlocked. + * Any code which breaks out of this loop while own + * a reference to the current mddev and must mddev_put it. 
+ */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (({ spin_lock(&all_mddevs_lock); \ + tmp = all_mddevs.next; \ + mddev = NULL;}); \ + ({ if (tmp != &all_mddevs) \ + mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ + spin_unlock(&all_mddevs_lock); \ + if (mddev) mddev_put(mddev); \ + mddev = list_entry(tmp, mddev_t, all_mddevs); \ + tmp != &all_mddevs;}); \ + ({ spin_lock(&all_mddevs_lock); \ + tmp = tmp->next;}) \ + ) + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio, bio->bi_size); + return 0; +} + +static inline mddev_t *mddev_get(mddev_t *mddev) +{ + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_put(mddev_t *mddev) +{ + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->raid_disks && list_empty(&mddev->disks)) { + list_del(&mddev->all_mddevs); + mddev_map[mdidx(mddev)] = NULL; + kfree(mddev); + MOD_DEC_USE_COUNT; + } + spin_unlock(&all_mddevs_lock); +} + +static mddev_t * mddev_find(int unit) +{ + mddev_t *mddev, *new = NULL; + + retry: + spin_lock(&all_mddevs_lock); + if (mddev_map[unit]) { + mddev = mddev_get(mddev_map[unit]); + spin_unlock(&all_mddevs_lock); + if (new) + kfree(new); + return mddev; + } + if (new) { + mddev_map[unit] = new; + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + MOD_INC_USE_COUNT; + return new; + } + spin_unlock(&all_mddevs_lock); + + new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + memset(new, 0, sizeof(*new)); + + new->__minor = unit; + init_MUTEX(&new->reconfig_sem); + INIT_LIST_HEAD(&new->disks); + INIT_LIST_HEAD(&new->all_mddevs); + init_timer(&new->safemode_timer); + atomic_set(&new->active, 1); + blk_queue_make_request(&new->queue, md_fail_request); + + goto retry; +} + +static inline int mddev_lock(mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +static inline void mddev_lock_uninterruptible(mddev_t * mddev) +{ + down(&mddev->reconfig_sem); +} + +static inline int mddev_trylock(mddev_t * mddev) +{ + return down_trylock(&mddev->reconfig_sem); +} + +static inline void mddev_unlock(mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->bdev->bd_dev == dev) + return rdev; + } + return NULL; +} + +inline static sector_t calc_dev_sboffset(struct block_device *bdev) +{ + sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + return MD_NEW_SIZE_BLOCKS(size); +} + +static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) +{ + sector_t size; + + size = rdev->sb_offset; + + if (chunk_size) + size &= ~((sector_t)chunk_size/1024 - 1); + return size; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(KERN_ALERT "md: out of memory.\n"); + return -EINVAL; + } + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb_loaded = 0; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } +} + + +static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + if 
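/*
 * A user-space sketch of the locking discipline the ITERATE_MDDEV macro
 * above encodes: take the list lock only to step, pin the current entry
 * with a reference so the loop body can run unlocked, and drop that
 * reference on the next step. pthread names are stand-ins for the
 * kernel primitives.
 */
#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *next;
        int refs;
        int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void walk(struct node *head)
{
        pthread_mutex_lock(&list_lock);
        for (struct node *n = head; n; ) {
                n->refs++;                      /* pin before unlocking */
                pthread_mutex_unlock(&list_lock);

                printf("visiting node %d\n", n->id);    /* unlocked body */

                pthread_mutex_lock(&list_lock);
                n->refs--;                      /* kernel would free at 0 */
                n = n->next;
        }
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        struct node b = { NULL, 0, 2 }, a = { &b, 0, 1 };
        walk(&a);
        return 0;
}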
(bio->bi_size) + return 1; + + complete((struct completion*)bio->bi_private); + return 0; +} + +static int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw) +{ + struct bio bio; + struct bio_vec vec; + struct completion event; + + bio_init(&bio); + bio.bi_io_vec = &vec; + vec.bv_page = page; + vec.bv_len = size; + vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = size; + bio.bi_bdev = bdev; + bio.bi_sector = sector; + init_completion(&event); + bio.bi_private = &event; + bio.bi_end_io = bi_complete; + submit_bio(rw, &bio); + blk_run_queues(); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio.bi_flags); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + + if (!rdev->sb_page) { + MD_BUG(); + return -EINVAL; + } + if (rdev->sb_loaded) + return 0; + + + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + goto fail; + rdev->sb_loaded = 1; + return 0; + +fail: + printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", + bdev_partition_name(rdev->bdev)); + return -EINVAL; +} + +static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + if ( (sb1->set_uuid0 == sb2->set_uuid0) && + (sb1->set_uuid1 == sb2->set_uuid1) && + (sb1->set_uuid2 == sb2->set_uuid2) && + (sb1->set_uuid3 == sb2->set_uuid3)) + + return 1; + + return 0; +} + + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Handle superblock details. + * We want to be able to handle multiple superblock formats + * so we have a common interface to them all, and an array of + * different handlers. + * We rely on user-space to write the initial superblock, and support + * reading and updating of superblocks. + * Interface methods are: + * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) + * loads and validates a superblock on dev. + * if refdev != NULL, compare superblocks on both devices + * Return: + * 0 - dev has a superblock that is compatible with refdev + * 1 - dev has a superblock that is compatible and newer than refdev + * so dev should be used as the refdev in future + * -EINVAL superblock incompatible or invalid + * -othererror e.g. -EIO + * + * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + * Verify that dev is acceptable into mddev. + * The first time, mddev->raid_disks will be 0, and data from + * dev should be merged in. Subsequent calls check that dev + * is new enough. Return 0 or -EINVAL + * + * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + * Update the superblock for rdev with data in mddev + * This does not write to disc. 
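/*
 * A minimal sketch of the calc_sb_csum() idiom above: checksum the whole
 * structure with its own checksum field temporarily zeroed, then put the
 * on-disk value back. A plain byte sum stands in for the kernel's
 * csum_partial(); the struct layout is purely illustrative.
 */
#include <stdio.h>
#include <string.h>

struct toy_sb {
        unsigned int data[4];
        unsigned int sb_csum;
};

static unsigned int sum_bytes(const void *p, size_t len)
{
        const unsigned char *c = p;
        unsigned int s = 0;

        while (len--)
                s += *c++;
        return s;
}

static unsigned int calc_csum(struct toy_sb *sb)
{
        unsigned int saved = sb->sb_csum, csum;

        sb->sb_csum = 0;                /* the field must not sum itself */
        csum = sum_bytes(sb, sizeof(*sb));
        sb->sb_csum = saved;            /* restore what was read from disk */
        return csum;
}

int main(void)
{
        struct toy_sb s = { { 1, 2, 3, 4 }, 0xdeadbeefu };

        printf("csum=%u\n", calc_csum(&s));
        return 0;
}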
+ * + */ + +struct super_type { + char *name; + struct module *owner; + int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); + int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); + void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); +}; + +/* + * load_super for 0.90.0 + */ +static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ + mdp_super_t *sb; + int ret; + sector_t sb_offset; + + /* + * Calculate the position of the superblock, + * it's at the end of the disk. + * + * It also happens to be a multiple of 4Kb. + */ + sb_offset = calc_dev_sboffset(rdev->bdev); + rdev->sb_offset = sb_offset; + + ret = read_disk_sb(rdev); + if (ret) return ret; + + ret = -EINVAL; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + if (sb->md_magic != MD_SB_MAGIC) { + printk(KERN_ERR "md: invalid raid superblock magic on %s\n", + bdev_partition_name(rdev->bdev)); + goto abort; + } + + if (sb->major_version != 0 || + sb->minor_version != 90) { + printk(KERN_WARNING "Bad version number %d.%d on %s\n", + sb->major_version, sb->minor_version, + bdev_partition_name(rdev->bdev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(KERN_ERR "md: %s: invalid raid minor (%x)\n", + bdev_partition_name(rdev->bdev), sb->md_minor); + goto abort; + } + if (sb->raid_disks <= 0) + goto abort; + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(KERN_WARNING "md: invalid superblock checksum on %s\n", + bdev_partition_name(rdev->bdev)); + goto abort; + } + + rdev->preferred_minor = sb->md_minor; + rdev->data_offset = 0; + + if (sb->level == MULTIPATH) + rdev->desc_nr = -1; + else + rdev->desc_nr = sb->this_disk.number; + + if (refdev == 0) + ret = 1; + else { + __u64 ev1, ev2; + mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + if (!uuid_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + if (!sb_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has same UUID" + " but different superblock to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + ev1 = md_event(sb); + ev2 = md_event(refsb); + if (ev1 > ev2) + ret = 1; + else + ret = 0; + } + rdev->size = calc_dev_size(rdev, sb->chunk_size); + + abort: + return ret; +} + +/* + * validate_super for 0.90.0 + */ +static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_disk_t *desc; + mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + + if (mddev->raid_disks == 0) { + mddev->major_version = 0; + mddev->minor_version = sb->minor_version; + mddev->patch_version = sb->patch_version; + mddev->persistent = ! 
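/*
 * A sketch of where super_90_load() expects to find the superblock: the
 * 0.90 format reserves the last 64K-aligned 64K slot of the device.
 * MD_NEW_SIZE_BLOCKS() comes from the kernel's md_p.h; its definition is
 * restated here from memory as an assumption, since that header is not
 * part of this patch.
 */
#include <stdio.h>

#define MD_RESERVED_BLOCKS 64           /* 64K expressed in 1K blocks */

static unsigned long long sb_offset_90(unsigned long long dev_blocks)
{
        /* round down to a 64K boundary, then step back one 64K slot */
        return (dev_blocks & ~(unsigned long long)(MD_RESERVED_BLOCKS - 1))
                - MD_RESERVED_BLOCKS;
}

int main(void)
{
        /* a 1000000K device keeps its superblock at block 999872 */
        printf("%llu\n", sb_offset_90(1000000ULL));
        return 0;
}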
sb->not_persistent; + mddev->chunk_size = sb->chunk_size; + mddev->ctime = sb->ctime; + mddev->utime = sb->utime; + mddev->level = sb->level; + mddev->layout = sb->layout; + mddev->raid_disks = sb->raid_disks; + mddev->size = sb->size; + mddev->events = md_event(sb); + + if (sb->state & (1<recovery_cp = MaxSector; + else { + if (sb->events_hi == sb->cp_events_hi && + sb->events_lo == sb->cp_events_lo) { + mddev->recovery_cp = sb->recovery_cp; + } else + mddev->recovery_cp = 0; + } + + memcpy(mddev->uuid+0, &sb->set_uuid0, 4); + memcpy(mddev->uuid+4, &sb->set_uuid1, 4); + memcpy(mddev->uuid+8, &sb->set_uuid2, 4); + memcpy(mddev->uuid+12,&sb->set_uuid3, 4); + + mddev->max_disks = MD_SB_DISKS; + } else { + __u64 ev1; + ev1 = md_event(sb); + ++ev1; + if (ev1 < mddev->events) + return -EINVAL; + } + if (mddev->level != LEVEL_MULTIPATH) { + rdev->raid_disk = -1; + rdev->in_sync = rdev->faulty = 0; + desc = sb->disks + rdev->desc_nr; + + if (desc->state & (1<faulty = 1; + else if (desc->state & (1<raid_disk < mddev->raid_disks) { + rdev->in_sync = 1; + rdev->raid_disk = desc->raid_disk; + } + } + return 0; +} + +/* + * sync_super for 0.90.0 + */ +static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_super_t *sb; + struct list_head *tmp; + mdk_rdev_t *rdev2; + int next_spare = mddev->raid_disks; + + /* make rdev->sb match mddev data.. + * + * 1/ zero out disks + * 2/ Add info for each disk, keeping track of highest desc_nr + * 3/ any empty disks < highest become removed + * + * disks[0] gets initialised to REMOVED because + * we cannot be sure from other fields if it has + * been initialised or not. + */ + int highest = 0; + int i; + int active=0, working=0,failed=0,spare=0,nr_disks=0; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + memset(sb, 0, sizeof(*sb)); + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = mddev->major_version; + sb->minor_version = mddev->minor_version; + sb->patch_version = mddev->patch_version; + sb->gvalid_words = 0; /* ignored */ + memcpy(&sb->set_uuid0, mddev->uuid+0, 4); + memcpy(&sb->set_uuid1, mddev->uuid+4, 4); + memcpy(&sb->set_uuid2, mddev->uuid+8, 4); + memcpy(&sb->set_uuid3, mddev->uuid+12,4); + + sb->ctime = mddev->ctime; + sb->level = mddev->level; + sb->size = mddev->size; + sb->raid_disks = mddev->raid_disks; + sb->md_minor = mddev->__minor; + sb->not_persistent = !mddev->persistent; + sb->utime = mddev->utime; + sb->state = 0; + sb->events_hi = (mddev->events>>32); + sb->events_lo = (u32)mddev->events; + + if (mddev->in_sync) + { + sb->recovery_cp = mddev->recovery_cp; + sb->cp_events_hi = (mddev->events>>32); + sb->cp_events_lo = (u32)mddev->events; + if (mddev->recovery_cp == MaxSector) + sb->state = (1<< MD_SB_CLEAN); + } else + sb->recovery_cp = 0; + + sb->layout = mddev->layout; + sb->chunk_size = mddev->chunk_size; + + sb->disks[0].state = (1<raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) + rdev2->desc_nr = rdev2->raid_disk; + else + rdev2->desc_nr = next_spare++; + d = &sb->disks[rdev2->desc_nr]; + nr_disks++; + d->number = rdev2->desc_nr; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty) + d->raid_disk = rdev2->raid_disk; + else + d->raid_disk = rdev2->desc_nr; /* compatibility */ + if (rdev2->faulty) { + d->state = (1<in_sync) { + d->state = (1<state |= (1<state = 0; + spare++; + working++; + } + if (rdev2->desc_nr > highest) + highest = rdev2->desc_nr; + } + + /* now set the "removed" bit on any non-trailing holes */ + for 
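/*
 * A compact sketch of the per-disk classification super_90_sync() does
 * above: every device counts as exactly one of faulty, active or spare,
 * and active and spare both count as working. The struct and names are
 * illustrative only.
 */
#include <stdio.h>

struct toy_disk { int faulty, in_sync; };

int main(void)
{
        struct toy_disk d[] = { {0, 1}, {0, 1}, {1, 0}, {0, 0} };
        int active = 0, working = 0, failed = 0, spare = 0;

        for (unsigned i = 0; i < sizeof(d) / sizeof(d[0]); i++) {
                if (d[i].faulty)
                        failed++;
                else if (d[i].in_sync) {
                        active++;
                        working++;
                } else {
                        spare++;
                        working++;
                }
        }
        printf("A:%d W:%d F:%d S:%d\n", active, working, failed, spare);
        return 0;
}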
(i=0; idisks[i]; + if (d->state == 0 && d->number == 0) { + d->number = i; + d->raid_disk = i; + d->state = (1<nr_disks = nr_disks; + sb->active_disks = active; + sb->working_disks = working; + sb->failed_disks = failed; + sb->spare_disks = spare; + + sb->this_disk = sb->disks[rdev->desc_nr]; + sb->sb_csum = calc_sb_csum(sb); +} + +/* + * version 1 superblock + */ + +static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +{ + unsigned int disk_csum, csum; + int size = 256 + sb->max_dev*2; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, size, 0); + sb->sb_csum = disk_csum; + return csum; +} + +static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ + struct mdp_superblock_1 *sb; + int ret; + sector_t sb_offset; + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depeding on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + */ + switch(minor_version) { + case 0: + sb_offset = rdev->bdev->bd_inode->i_size >> 9; + sb_offset -= 8*2; + sb_offset &= ~(4*2); + /* convert from sectors to K */ + sb_offset /= 2; + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4; + break; + default: + return -EINVAL; + } + rdev->sb_offset = sb_offset; + + ret = read_disk_sb(rdev); + if (ret) return ret; + + + sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || + sb->major_version != cpu_to_le32(1) || + le32_to_cpu(sb->max_dev) > (4096-256)/2 || + le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || + sb->feature_map != 0) + return -EINVAL; + + if (calc_sb_1_csum(sb) != sb->sb_csum) { + printk("md: invalid superblock checksum on %s\n", + bdev_partition_name(rdev->bdev)); + return -EINVAL; + } + rdev->preferred_minor = 0xffff; + rdev->data_offset = le64_to_cpu(sb->data_offset); + + if (refdev == 0) + return 1; + else { + __u64 ev1, ev2; + struct mdp_superblock_1 *refsb = + (struct mdp_superblock_1*)page_address(refdev->sb_page); + + if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || + sb->level != refsb->level || + sb->layout != refsb->layout || + sb->chunksize != refsb->chunksize) { + printk(KERN_WARNING "md: %s has strangely different" + " superblock to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + return -EINVAL; + } + ev1 = le64_to_cpu(sb->events); + ev2 = le64_to_cpu(refsb->events); + + if (ev1 > ev2) + return 1; + } + if (minor_version) + rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; + else + rdev->size = rdev->sb_offset; + if (rdev->size < le64_to_cpu(sb->data_size)/2) + return -EINVAL; + rdev->size = le64_to_cpu(sb->data_size)/2; + if (le32_to_cpu(sb->chunksize)) + rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); + return 0; +} + +static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + if (mddev->raid_disks == 0) { + mddev->major_version = 1; + mddev->minor_version = 0; + mddev->patch_version = 0; + mddev->persistent = 1; + mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; + mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); + mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->level = le32_to_cpu(sb->level); + mddev->layout = le32_to_cpu(sb->layout); + mddev->raid_disks = le32_to_cpu(sb->raid_disks); + 
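/*
 * A sketch of the minor_version==0 arithmetic in super_1_load() above:
 * the superblock lives at least 8K, but less than 12K, from the end of
 * the device, and the result is converted from 512-byte sectors into 1K
 * units. The mask is copied verbatim from the code above; the function
 * name is an illustrative stand-in.
 */
#include <stdio.h>

static unsigned long long sb1_offset_v0(unsigned long long dev_bytes)
{
        unsigned long long s = dev_bytes >> 9;  /* bytes -> sectors */

        s -= 8 * 2;             /* step back 8K (16 sectors) from the end */
        s &= ~(unsigned long long)(4 * 2);      /* mask as in the source */
        return s / 2;           /* sectors -> K */
}

int main(void)
{
        printf("%lluK\n", sb1_offset_v0(1048576ULL));   /* 1MB device */
        return 0;
}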
mddev->size = (u32)le64_to_cpu(sb->size); + mddev->events = le64_to_cpu(sb->events); + + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + memcpy(mddev->uuid, sb->set_uuid, 16); + + mddev->max_disks = (4096-256)/2; + } else { + __u64 ev1; + ev1 = le64_to_cpu(sb->events); + ++ev1; + if (ev1 < mddev->events) + return -EINVAL; + } + + if (mddev->level != LEVEL_MULTIPATH) { + int role; + rdev->desc_nr = le32_to_cpu(sb->dev_number); + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + switch(role) { + case 0xffff: /* spare */ + rdev->in_sync = 0; + rdev->faulty = 0; + rdev->raid_disk = -1; + break; + case 0xfffe: /* faulty */ + rdev->in_sync = 0; + rdev->faulty = 1; + rdev->raid_disk = -1; + break; + default: + rdev->in_sync = 1; + rdev->faulty = 0; + rdev->raid_disk = role; + break; + } + } + return 0; +} + +static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct mdp_superblock_1 *sb; + struct list_head *tmp; + mdk_rdev_t *rdev2; + int max_dev, i; + /* make rdev->sb match mddev and rdev data. */ + + sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + sb->feature_map = 0; + sb->pad0 = 0; + memset(sb->pad1, 0, sizeof(sb->pad1)); + memset(sb->pad2, 0, sizeof(sb->pad2)); + memset(sb->pad3, 0, sizeof(sb->pad3)); + + sb->utime = cpu_to_le64((__u64)mddev->utime); + sb->events = cpu_to_le64(mddev->events); + if (mddev->in_sync) + sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else + sb->resync_offset = cpu_to_le64(0); + + max_dev = 0; + ITERATE_RDEV(mddev,rdev2,tmp) + if (rdev2->desc_nr > max_dev) + max_dev = rdev2->desc_nr; + + sb->max_dev = max_dev; + for (i=0; idev_roles[max_dev] = cpu_to_le16(0xfffe); + + ITERATE_RDEV(mddev,rdev2,tmp) { + i = rdev2->desc_nr; + if (rdev2->faulty) + sb->dev_roles[i] = cpu_to_le16(0xfffe); + else if (rdev2->in_sync) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else + sb->dev_roles[i] = cpu_to_le16(0xffff); + } + + sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ +} + + +struct super_type super_types[] = { + [0] = { + .name = "0.90.0", + .owner = THIS_MODULE, + .load_super = super_90_load, + .validate_super = super_90_validate, + .sync_super = super_90_sync, + }, + [1] = { + .name = "md-1", + .owner = THIS_MODULE, + .load_super = super_1_load, + .validate_super = super_1_validate, + .sync_super = super_1_sync, + }, +}; + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (rdev->bdev->bd_contains == dev->bdev->bd_contains) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev)) + return 1; + + return 0; +} + +static LIST_HEAD(pending_raid_disks); + +static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return -EINVAL; + } + same_pdev = match_dev_unit(mddev, rdev); + if (same_pdev) + printk(KERN_WARNING + "md%d: WARNING: %s appears to be on the same physical" + " disk as %s. True\n protection against single-disk" + " failure might be compromised.\n", + mdidx(mddev), bdev_partition_name(rdev->bdev), + bdev_partition_name(same_pdev->bdev)); + + /* Verify rdev->desc_nr is unique. 
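/*
 * A one-function sketch of the version-1 role encoding used above by
 * both super_1_validate() and super_1_sync(): 0xfffe marks a faulty
 * device, 0xffff a spare, and any other value is the member's raid_disk
 * index. The helper name is illustrative.
 */
#include <stdio.h>

static unsigned short role_of(int faulty, int in_sync, int raid_disk)
{
        if (faulty)
                return 0xfffe;          /* faulty */
        if (!in_sync)
                return 0xffff;          /* spare */
        return (unsigned short)raid_disk;
}

int main(void)
{
        printf("%#x %#x %#x\n", role_of(1, 0, -1), role_of(0, 0, -1),
               role_of(0, 1, 3));
        return 0;
}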
+ * If it is -1, assign a free number, else + * check number is not in use + */ + if (rdev->desc_nr < 0) { + int choice = 0; + if (mddev->pers) choice = mddev->raid_disks; + while (find_rdev_nr(mddev, choice)) + choice++; + rdev->desc_nr = choice; + } else { + if (find_rdev_nr(mddev, rdev->desc_nr)) + return -EBUSY; + } + + list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev)); + return 0; +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +{ + int err = 0; + struct block_device *bdev; + + bdev = bdget(dev); + if (!bdev) + return -ENOMEM; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + if (err) + return err; + err = bd_claim(bdev, rdev); + if (err) { + blkdev_put(bdev, BDEV_RAW); + return err; + } + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + bd_release(bdev); + blkdev_put(bdev, BDEV_RAW); +} + +void md_autodetect_dev(dev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ + printk(KERN_INFO "md: export_rdev(%s)\n", + bdev_partition_name(rdev->bdev)); + if (rdev->mddev) + MD_BUG(); + free_disk_sb(rdev); + list_del_init(&rdev->same_set); +#ifndef MODULE + md_autodetect_dev(rdev->bdev->bd_dev); +#endif + unlock_rdev(rdev); + kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); + mddev->raid_disks = 0; + mddev->major_version = 0; +} + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO + "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", + sb->level, sb->size, sb->nr_disks, sb->raid_disks, + sb->md_minor, sb->layout, sb->chunk_size); + printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" + " FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + printk(KERN_INFO); + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || + desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(KERN_INFO "md: THIS: "); + print_desc(&sb->this_disk); + +} + +static 
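/*
 * A sketch of the desc_nr policy in bind_rdev_to_array() above: -1 means
 * "allocate the lowest free slot" (the kernel starts the search past
 * raid_disks once the array is running), while an explicit number must
 * not already be in use. The array below is a toy stand-in for
 * find_rdev_nr().
 */
#include <stdio.h>

static int slot_used[16];

static int assign_desc_nr(int requested)
{
        int choice = 0;

        if (requested < 0) {
                while (slot_used[choice])       /* first free number */
                        choice++;
        } else {
                if (slot_used[requested])
                        return -1;              /* -EBUSY in the kernel */
                choice = requested;
        }
        slot_used[choice] = 1;
        return choice;
}

int main(void)
{
        slot_used[0] = slot_used[1] = 1;
        printf("auto -> %d\n", assign_desc_nr(-1));     /* 2 */
        printf("want 1 -> %d\n", assign_desc_nr(1));    /* busy: -1 */
        return 0;
}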
void print_rdev(mdk_rdev_t *rdev) +{ + printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ", + bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size, + rdev->faulty, rdev->in_sync, rdev->desc_nr); + if (rdev->sb_loaded) { + printk(KERN_INFO "md: rdev superblock:\n"); + print_sb((mdp_super_t*)page_address(rdev->sb_page)); + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ + struct list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk("md: **********************************\n"); + printk("md: * *\n"); + printk("md: **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", bdev_partition_name(rdev->bdev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk("md: **********************************\n"); + printk("\n"); +} + + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + + if (!rdev->sb_loaded) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + + dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->sb_offset); + + if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) + return 0; + + printk("md: write_disk_sb failed for device %s\n", + bdev_partition_name(rdev->bdev)); + return 1; +} + +static void sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + super_types[mddev->major_version]. + sync_super(mddev, rdev); + rdev->sb_loaded = 1; + } +} + +static void md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct list_head *tmp; + mdk_rdev_t *rdev; + + mddev->sb_dirty = 0; +repeat: + mddev->utime = get_seconds(); + mddev->events ++; + + if (!mddev->events) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->events --; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (!mddev->persistent) + return; + + dprintk(KERN_INFO + "md: updating md%d RAID superblock on device (in sync %d)\n", + mdidx(mddev),mddev->in_sync); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + dprintk(KERN_INFO "md: "); + if (rdev->faulty) + dprintk("(skipping faulty "); + + dprintk("%s ", bdev_partition_name(rdev->bdev)); + if (!rdev->faulty) { + err += write_disk_sb(rdev); + } else + dprintk(")\n"); + if (!err && mddev->level == LEVEL_MULTIPATH) + /* only need to write one superblock... */ + break; + } + if (err) { + if (--count) { + printk(KERN_ERR "md: errors occurred during superblock" + " update, repeating\n"); + goto repeat; + } + printk(KERN_ERR \ + "md: excessive errors occurred during superblock update, exiting\n"); + } +} + +/* + * Import a device. If 'super_format' >= 0, then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. 
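/*
 * A sketch of the bounded-retry loop md_update_sb() runs above: failed
 * superblock writes are retried, but only within a fixed budget, so a
 * persistently failing device cannot wedge the caller. write_one() is a
 * hypothetical stand-in for one write_disk_sb() pass.
 */
#include <stdio.h>

static int write_one(int attempt)
{
        return attempt < 3;     /* pretend the first three passes fail */
}

static int update_all(void)
{
        int count = 100, attempt = 0;

        while (write_one(attempt++)) {
                if (--count == 0) {
                        fprintf(stderr, "giving up: excessive errors\n");
                        return -1;
                }
        }
        printf("superblocks written after %d passes\n", attempt);
        return 0;
}

int main(void)
{
        return update_all() ? 1 : 0;
}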
+ */ +static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +{ + int err; + mdk_rdev_t *rdev; + sector_t size; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", + partition_name(newdev)); + return ERR_PTR(-ENOMEM); + } + memset(rdev, 0, sizeof(*rdev)); + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + err = lock_rdev(rdev, newdev); + if (err) { + printk(KERN_ERR "md: could not lock %s.\n", + partition_name(newdev)); + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + rdev->in_sync = 0; + rdev->data_offset = 0; + atomic_set(&rdev->nr_pending, 0); + + size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + if (!size) { + printk(KERN_WARNING + "md: %s has zero or unknown size, marking faulty!\n", + bdev_partition_name(rdev->bdev)); + err = -EINVAL; + goto abort_free; + } + + if (super_format >= 0) { + err = super_types[super_format]. + load_super(rdev, NULL, super_minor); + if (err == -EINVAL) { + printk(KERN_WARNING + "md: %s has invalid sb, not importing!\n", + bdev_partition_name(rdev->bdev)); + goto abort_free; + } + if (err < 0) { + printk(KERN_WARNING + "md: could not read %s's sb, not importing!\n", + bdev_partition_name(rdev->bdev)); + goto abort_free; + } + } + INIT_LIST_HEAD(&rdev->same_set); + + return rdev; + +abort_free: + if (rdev->sb_page) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return ERR_PTR(err); +} + +/* + * Check a full RAID array for plausibility + */ + + +static int analyze_sbs(mddev_t * mddev) +{ + int i; + struct list_head *tmp; + mdk_rdev_t *rdev, *freshest; + + freshest = NULL; + ITERATE_RDEV(mddev,rdev,tmp) + switch (super_types[mddev->major_version]. + load_super(rdev, freshest, mddev->minor_version)) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + printk( KERN_ERR \ + "md: fatal superblock inconsistency in %s" + " -- removing from array\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + } + + + super_types[mddev->major_version]. + validate_super(mddev, freshest); + + i = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev != freshest) + if (super_types[mddev->major_version]. 
+ validate_super(mddev, rdev)) { + printk(KERN_WARNING "md: kicking non-fresh %s" + " from array!\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + continue; + } + if (mddev->level == LEVEL_MULTIPATH) { + rdev->desc_nr = i++; + rdev->raid_disk = rdev->desc_nr; + rdev->in_sync = 1; + } + } + + + /* + * Check if we can support this RAID array + */ + if (mddev->major_version != MD_MAJOR_VERSION || + mddev->minor_version > MD_MINOR_VERSION) { + printk(KERN_ALERT + "md: md%d: unsupported raid array version %d.%d.%d\n", + mdidx(mddev), mddev->major_version, + mddev->minor_version, mddev->patch_version); + goto abort; + } + + if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) || + (mddev->level == 4) || (mddev->level == 5))) + printk(KERN_ERR "md: md%d: raid array is not clean" + " -- starting background reconstruction\n", + mdidx(mddev)); + + return 0; +abort: + return 1; +} + +<<<<<<< +||||||| +#undef OLD_LEVEL + +======= +>>>>>>> +static struct gendisk *md_probe(dev_t dev, int *part, void *data) +{ + static DECLARE_MUTEX(disks_sem); + int unit = MINOR(dev); + mddev_t *mddev = mddev_find(unit); + struct gendisk *disk; + + if (!mddev) + return NULL; + + down(&disks_sem); + if (disks[unit]) { + up(&disks_sem); + mddev_put(mddev); + return NULL; + } + disk = alloc_disk(1); + if (!disk) { + up(&disks_sem); + mddev_put(mddev); + return NULL; + } + disk->major = MD_MAJOR; + disk->first_minor = mdidx(mddev); + sprintf(disk->disk_name, "md%d", mdidx(mddev)); + disk->fops = &md_fops; + disk->private_data = mddev; + disk->queue = &mddev->queue; + add_disk(disk); + disks[mdidx(mddev)] = disk; + up(&disks_sem); + return NULL; +} + +void md_wakeup_thread(mdk_thread_t *thread); + +static void md_safemode_timeout(unsigned long data) +{ + mddev_t *mddev = (mddev_t *) data; + + mddev->safemode = 1; + md_wakeup_thread(mddev->thread); +} + + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct list_head *tmp; + mdk_rdev_t *rdev; + struct gendisk *disk; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Analyze all RAID superblock(s) + */ + if (!mddev->raid_disks && analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->chunk_size; + pnum = level_to_pers(mddev->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We don't + * want to continue the bad practice. 
+ */ + printk(KERN_ERR + "no chunksize specified, see 'man raidtab'\n"); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(KERN_ERR "too big chunk_size: %d > %d\n", + chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(KERN_ERR "too small chunk_size: %d < %ld\n", + chunk_size, PAGE_SIZE); + return -EINVAL; + } + + /* devices must have minimum size of one chunk */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size < chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size:" + " %lluk < %dk\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->size, + chunk_size / 1024); + return -EINVAL; + } + } + } + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + +#ifdef CONFIG_KMOD + if (!pers[pnum]) + { + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + } +#endif + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsector size + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + sync_blockdev(rdev->bdev); + invalidate_bdev(rdev->bdev, 0); + } + + md_probe(mdidx(mddev), NULL, NULL); + disk = disks[mdidx(mddev)]; + if (!disk) + return -ENOMEM; + + spin_lock(&pers_lock); + if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { + spin_unlock(&pers_lock); + printk(KERN_ERR "md: personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + + mddev->pers = pers[pnum]; + spin_unlock(&pers_lock); + + blk_queue_make_request(&mddev->queue, mddev->pers->make_request); + printk("%s: setting max_sectors to %d, segment boundary to %d\n", + disk->disk_name, + chunk_size >> 9, + (chunk_size>>1)-1); + blk_queue_max_sectors(&mddev->queue, chunk_size >> 9); + blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1); + mddev->queue.queuedata = mddev; + + err = mddev->pers->run(mddev); + if (err) { + printk(KERN_ERR "md: pers->run() failed ...\n"); + module_put(mddev->pers->owner); + mddev->pers = NULL; + return -EINVAL; + } + atomic_set(&mddev->writes_pending,0); + mddev->safemode = 0; + mddev->safemode_timer.function = md_safemode_timeout; + mddev->safemode_timer.data = (unsigned long) mddev; + mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ + mddev->in_sync = 1; + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + set_capacity(disk, mddev->array_size<<1); + return 0; +} + +static int restart_array(mddev_t *mddev) +{ + struct gendisk *disk = disks[mdidx(mddev)]; + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->safemode = 0; + mddev->ro = 0; + set_disk_ro(disk, 0); + + printk(KERN_INFO "md: md%d switched to read-write mode.\n", + mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0; + struct gendisk *disk = disks[mdidx(mddev)]; + + if 
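/*
 * A portable restatement of the power-of-two test in do_md_run() above:
 * ffz(~x) is the index of the lowest set bit of x, so (1 << ffz(~x))
 * isolates that bit and equals x only when exactly one bit is set. The
 * two's-complement form below is equivalent for nonzero x.
 */
#include <stdio.h>

static int is_pow2(unsigned int x)
{
        return x != 0 && (x & -x) == x; /* x & -x == lowest set bit */
}

int main(void)
{
        printf("4096 -> %d, 12288 -> %d\n", is_pow2(4096), is_pow2(12288));
        return 0;
}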
(atomic_read(&mddev->active)>2) { + printk("md: md%d still in use.\n",mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + } + + del_timer_sync(&mddev->safemode_timer); + + invalidate_device(mk_kdev(disk->major, disk->first_minor), 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_disk_ro(disk, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_disk_ro(disk, 1); + goto out; + } + module_put(mddev->pers->owner); + mddev->pers = NULL; + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->raid_disks) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev); + } + if (ro) + set_disk_ro(disk, 1); + } + /* + * Free resources if final stop + */ + if (!ro) { + struct gendisk *disk; + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + + export_array(mddev); + + mddev->array_size = 0; + disk = disks[mdidx(mddev)]; + if (disk) + set_capacity(disk, 0); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", + mdidx(mddev)); + err = 0; +out: + return err; +} + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", bdev_partition_name(rdev->bdev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in pending_raid_disks) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(void) +{ + struct list_head candidates; + struct list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = list_entry(pending_raid_disks.next, + mdk_rdev_t, same_set); + + printk(KERN_INFO "md: considering %s ...\n", + bdev_partition_name(rdev0->bdev)); + INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) + if (super_90_load(rdev, rdev0, 0) >= 0) { + printk(KERN_INFO "md: adding %s ...\n", + bdev_partition_name(rdev->bdev)); + list_move(&rdev->same_set, &candidates); + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. 
+ */ + + mddev = mddev_find(rdev0->preferred_minor); + if (!mddev) { + printk(KERN_ERR + "md: cannot allocate memory for md drive.\n"); + break; + } + if (mddev_lock(mddev)) + printk(KERN_WARNING "md: md%d locked, cannot run\n", + mdidx(mddev)); + else if (mddev->raid_disks || mddev->major_version + || !list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: md%d already running, cannot run %s\n", + mdidx(mddev), bdev_partition_name(rdev0->bdev)); + mddev_unlock(mddev); + } else { + printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { + list_del_init(&rdev->same_set); + if (bind_rdev_to_array(rdev, mddev)) + export_rdev(rdev); + } + autorun_array(mddev); + mddev_unlock(mddev); + } + /* on success, candidates will be empty, on error + * it won't... + */ + ITERATE_RDEV_GENERIC(candidates,rdev,tmp) + export_rdev(rdev); + mddev_put(mddev); + } + printk(KERN_INFO "md: ... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. + */ + +static int autostart_array(dev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + start_rdev = md_import_device(startdev, 0, 0); + if (IS_ERR(start_rdev)) { + printk(KERN_WARNING "md: could not import %s!\n", + partition_name(startdev)); + return err; + } + + /* NOTE: this can only work for 0.90.0 superblocks */ + sb = (mdp_super_t*)page_address(start_rdev->sb_page); + if (sb->major_version != 0 || + sb->minor_version != 90 ) { + printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); + export_rdev(start_rdev); + return err; + } + + if (start_rdev->faulty) { + printk(KERN_WARNING + "md: can not autostart based on faulty %s!\n", + bdev_partition_name(start_rdev->bdev)); + export_rdev(start_rdev); + return err; + } + list_add(&start_rdev->same_set, &pending_raid_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + dev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (!dev) + continue; + if (dev == startdev) + continue; + rdev = md_import_device(dev, 0, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING "md: could not import %s," + " trying to run array nevertheless.\n", + partition_name(dev)); + continue; + } + list_add(&rdev->same_set, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +} + + +static int get_version(void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +static int get_array_info(mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + int nr,working,active,failed,spare; + mdk_rdev_t *rdev; + struct list_head *tmp; + + nr=working=active=failed=spare=0; + ITERATE_RDEV(mddev,rdev,tmp) { + nr++; + if (rdev->faulty) + failed++; + else { + working++; + if (rdev->in_sync) + active++; + else + spare++; + } + } + + info.major_version = mddev->major_version; + info.minor_version = mddev->minor_version; + info.patch_version = 1; + info.ctime = mddev->ctime; + info.level = mddev->level; + info.size = mddev->size; + info.nr_disks = nr; + info.raid_disks = mddev->raid_disks; + info.md_minor = mddev->__minor; + info.not_persistent= !mddev->persistent; + + info.utime = mddev->utime; + info.state = 0; + if (mddev->in_sync) + info.state = (1<layout; + info.chunk_size = mddev->chunk_size; + + if (copy_to_user(arg, &info, 
sizeof(info))) + return -EFAULT; + + return 0; +} + +static int get_disk_info(mddev_t * mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + mdk_rdev_t *rdev; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + + rdev = find_rdev_nr(mddev, nr); + if (rdev) { + info.major = MAJOR(rdev->bdev->bd_dev); + info.minor = MINOR(rdev->bdev->bd_dev); + info.raid_disk = rdev->raid_disk; + info.state = 0; + if (rdev->faulty) + info.state |= (1<in_sync) { + info.state |= (1<major,info->minor); + if (!mddev->raid_disks) { + int err; + /* expecting a device which has a superblock */ + rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: md_import_device returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + if (!list_empty(&mddev->disks)) { + mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, + mdk_rdev_t, same_set); + int err = super_types[mddev->major_version] + .load_super(rdev, rdev0, mddev->minor_version); + if (err < 0) { + printk(KERN_WARNING + "md: %s has different UUID to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(rdev0->bdev)); + export_rdev(rdev); + return -EINVAL; + } + } + err = bind_rdev_to_array(rdev, mddev); + if (err) + export_rdev(rdev); + return err; + } + + /* + * add_new_disk can be used once the array is assembled + * to add "hot spares". They must already have a superblock + * written + */ + if (mddev->pers) { + int err; + if (!mddev->pers->hot_add_disk) { + printk(KERN_WARNING + "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: md_import_device returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + rdev->in_sync = 0; /* just to be sure */ + rdev->raid_disk = -1; + err = bind_rdev_to_array(rdev, mddev); + if (err) + export_rdev(rdev); + if (mddev->thread) + md_wakeup_thread(mddev->thread); + return err; + } + + /* otherwise, add_new_disk is only allowed + * for major_version==0 superblocks + */ + if (mddev->major_version != 0) { + printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n", + mdidx(mddev)); + return -EINVAL; + } + + if (!(info->state & (1<desc_nr = info->number; + if (info->raid_disk < mddev->raid_disks) + rdev->raid_disk = info->raid_disk; + else + rdev->raid_disk = -1; + + rdev->faulty = 0; + if (rdev->raid_disk < mddev->raid_disks) + rdev->in_sync = (info->state & (1<in_sync = 0; + + err = bind_rdev_to_array(rdev, mddev); + if (err) { + export_rdev(rdev); + return err; + } + + if (!mddev->persistent) { + printk(KERN_INFO "md: nonpersistent superblock ...\n"); + rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + } else + rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + rdev->size = calc_dev_size(rdev, mddev->chunk_size); + + if (!mddev->size || (mddev->size > rdev->size)) + mddev->size = rdev->size; + } + + return 0; +} + +static int hot_generate_error(mddev_t * mddev, dev_t dev) +{ + struct request_queue *q; + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to generate %s error in md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + return -ENXIO; + } + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + if (!rdev->in_sync) + return -ENODEV; + + q = bdev_get_queue(rdev->bdev); + if (!q) { + MD_BUG(); + return -ENODEV; + } + printk(KERN_INFO "md: okay, generating error!\n"); +// q->oneshot_error = 1; // disabled for now + + return 0; +} + +static int hot_remove_disk(mddev_t * mddev, dev_t dev) +{ + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to remove %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->raid_disk >= 0) + goto busy; + + kick_rdev_from_array(rdev); + md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + bdev_partition_name(rdev->bdev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, dev_t dev) +{ + int err; + unsigned int size; + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (mddev->major_version != 0) { + printk(KERN_WARNING "md%d: HOT_ADD may only be used with" + " version-0 superblocks.\n", + mdidx(mddev)); + return -EINVAL; + } + if (!mddev->pers->hot_add_disk) { + printk(KERN_WARNING + "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = md_import_device (dev, -1, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: error, md_import_device() returned %ld\n", + PTR_ERR(rdev)); + return -EINVAL; + } + + rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + size = calc_dev_size(rdev, mddev->chunk_size); + rdev->size = size; + + if (size < mddev->size) { + printk(KERN_WARNING + "md%d: disk size %llu blocks < array size %llu\n", + mdidx(mddev), (unsigned long long)size, + (unsigned long long)mddev->size); + err = -ENOSPC; + goto abort_export; + } + + if (rdev->faulty) { + printk(KERN_WARNING + "md: can not hot-add faulty %s disk to md%d!\n", + bdev_partition_name(rdev->bdev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + rdev->in_sync = 0; + rdev->desc_nr = -1; + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + + if (rdev->desc_nr == mddev->max_disks) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + rdev->raid_disk = -1; + + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +/* + * set_array_info is used two different ways + * The original usage is when creating a new array. + * In this usage, raid_disks is > = and it together with + * level, size, not_persistent,layout,chunksize determine the + * shape of the array. + * This will always create an array with a type-0.90.0 superblock. + * The newer usage is when assembling an array. + * In this case raid_disks will be 0, and the major_version field is + * use to determine which style super-blocks are to be found on the devices. 
+ * The minor and patch _version numbers are also kept incase the + * super_block handler wishes to interpret them. + */ +static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +{ + + if (info->raid_disks == 0) { + /* just setting version number for superblock loading */ + if (info->major_version < 0 || + info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || + super_types[info->major_version].name == NULL) { + /* maybe try to auto-load a module? */ + printk(KERN_INFO + "md: superblock version %d not known\n", + info->major_version); + return -EINVAL; + } + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; + mddev->minor_version = MD_MINOR_VERSION; + mddev->patch_version = MD_PATCHLEVEL_VERSION; + mddev->ctime = get_seconds(); + + mddev->level = info->level; + mddev->size = info->size; + mddev->raid_disks = info->raid_disks; + /* don't set __minor, it is determined by which /dev/md* was + * openned + */ + if (info->state & (1<recovery_cp = MaxSector; + else + mddev->recovery_cp = 0; + mddev->persistent = ! info->not_persistent; + + mddev->layout = info->layout; + mddev->chunk_size = info->chunk_size; + + mddev->max_disks = MD_SB_DISKS; + + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(mddev->uuid, 16); + + return 0; +} + +static int set_disk_faulty(mddev_t *mddev, dev_t dev) +{ + mdk_rdev_t *rdev; + + rdev = find_rdev(mddev, dev); + if (!rdev) + return 0; + + md_error(mddev, rdev); + return 1; +} + +static int md_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + dev = inode->i_rdev; + minor = minor(dev); + if (minor >= MAX_MD_DEVS) { + MD_BUG(); + return -EINVAL; + } + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; + +#ifndef MODULE + case RAID_AUTORUN: + err = 0; + autostart_arrays(); + goto done; +#endif + default:; + } + + /* + * Commands creating/starting a new array: + */ + + mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) { + BUG(); + goto abort; + } + + + if (cmd == START_ARRAY) { + /* START_ARRAY doesn't need to lock the array as autostart_array + * does the locking, and it could even be a different array + */ + err = autostart_array(arg); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name(arg)); + goto abort; + } + goto done; + } + + err = mddev_lock(mddev); + if (err) { + printk(KERN_INFO + "md: ioctl lock interrupted, reason %d, cmd %d\n", + err, cmd); + goto abort; + } + + switch (cmd) + { + case SET_ARRAY_INFO: + + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array md%d already has disks!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array md%d already initialised!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, (void*)arg, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) 
{ + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */ + if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = put_user(get_capacity(disks[mdidx(mddev)])/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = put_user (get_start_sect(inode->i_bdev), + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... + * ->pers will not be set, so superblock will + * not be updated. + */ + if (err) + do_md_stop (mddev, 0); + goto done_unlock; + } + + default: + if (_IOC_TYPE(cmd) == MD_MAJOR) + printk(KERN_WARNING "md: %s(pid %d) used" + " obsolete MD ioctl, upgrade your" + " software to use new ioctls.\n", + current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + mddev_unlock(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ + /* + * Succeed if we can find or allocate a mddev structure. 
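+ * The mddev reference taken here (via mddev_get() below) is stored in
+ * bd_inode->u.generic_ip; it is what md_ioctl() above looks up for each
+ * command, and it is dropped again in md_release().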
+ */ + mddev_t *mddev = mddev_find(minor(inode->i_rdev)); + int err = -ENOMEM; + + if (!mddev) + goto out; + + if ((err = mddev_lock(mddev))) + goto put; + + err = 0; + mddev_unlock(mddev); + inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev); + put: + mddev_put(mddev); + out: + return err; +} + +static int md_release(struct inode *inode, struct file * file) +{ + mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) + BUG(); + mddev_put(mddev); + + return 0; +} + +static struct block_device_operations md_fops = +{ + .owner = THIS_MODULE, + .open = md_open, + .release = md_release, + .ioctl = md_ioctl, +}; + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + lock_kernel(); + + /* + * Detach thread + */ + + daemonize(thread->name, mdidx(thread->mddev)); + + current->exit_signal = SIGCHLD; + allow_signal(SIGKILL); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', its priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run)(mddev_t *); + + wait_event_interruptible(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags)); + if (current->flags & PF_FREEZE) + refrigerator(PF_IOTHREAD); + + clear_bit(THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->mddev); + blk_run_queues(); + } + if (signal_pending(current)) + flush_signals(current); + } + complete(thread->event); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + if (thread) { + dprintk("md: waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); + } +} + +mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, + const char *name) +{ + mdk_thread_t *thread; + int ret; + struct completion event; + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->mddev = mddev; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +void md_interrupt_thread(mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + md_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +{ + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev), + MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); + + if (!mddev) { + MD_BUG(); + return; + } + + if (!rdev || rdev->faulty) + return; + if 
(!mddev->pers->error_handler) + return; + mddev->pers->error_handler(mddev,rdev); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} + +/* seq_file implementation for /proc/mdstat */ + +static void status_unused(struct seq_file *seq) +{ + int i = 0; + mdk_rdev_t *rdev; + struct list_head *tmp; + + seq_printf(seq, "unused devices: "); + + ITERATE_RDEV_PENDING(rdev,tmp) { + i++; + seq_printf(seq, "%s ", + bdev_partition_name(rdev->bdev)); + } + if (!i) + seq_printf(seq, "<none>"); + + seq_printf(seq, "\n"); +} + + +static void status_resync(struct seq_file *seq, mddev_t * mddev) +{ + unsigned long max_blocks, resync, res, dt, db, rt; + + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + max_blocks = mddev->size; + + /* + * Should not happen. + */ + if (!max_blocks) { + MD_BUG(); + return; + } + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + seq_printf(seq, "["); + for (i = 0; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? + "resync" : "recovery"), + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/dt); +} + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + mddev_t *mddev; + + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + spin_lock(&all_mddevs_lock); + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, mddev_t, all_mddevs); + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + return mddev; + } + spin_unlock(&all_mddevs_lock); + return (void*)2;/* tail */ +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + mddev_t *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + spin_lock(&all_mddevs_lock); + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + spin_unlock(&all_mddevs_lock); + + if (v != (void*)1) + mddev_put(mddev); + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + mddev_t *mddev = v; + + if (mddev && v != (void*)1 && v != (void*)2) + mddev_put(mddev); +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + mddev_t *mddev = v; + sector_t size; + struct list_head *tmp2; + mdk_rdev_t *rdev; + int i; + + if (v == (void*)1) { + seq_printf(seq, "Personalities : "); + spin_lock(&pers_lock); + for (i = 0; i < MAX_PERSONALITY; i++) + if (pers[i]) + seq_printf(seq, "[%s] ", pers[i]->name); + + spin_unlock(&pers_lock); + seq_printf(seq, "\n"); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + if (mddev_lock(mddev)!=0) + return -EINTR; + if 
(mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? "" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + bdev_partition_name(rdev->bdev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %llu blocks", + (unsigned long long)mddev->array_size); + else + seq_printf(seq, "\n %llu blocks", + (unsigned long long)size); + } + + if (mddev->pers) { + mddev->pers->status (seq, mddev); + seq_printf(seq, "\n "); + if (mddev->curr_resync > 2) + status_resync (seq, mddev); + else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + seq_printf(seq, " resync=DELAYED"); + } + + seq_printf(seq, "\n"); + } + mddev_unlock(mddev); + + return 0; +} + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + spin_lock(&pers_lock); + if (pers[pnum]) { + spin_unlock(&pers_lock); + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + spin_unlock(&pers_lock); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + spin_lock(&pers_lock); + pers[pnum] = NULL; + spin_unlock(&pers_lock); + return 0; +} + +void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) +{ + rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; + curr_events = disk_stat_read(disk, read_sectors) + + disk_stat_read(disk, write_sectors) - + disk->sync_io; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + md_wakeup_thread(mddev->thread); + // stop recovery, signal do_sync .... 
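+		/*
+		 * Sketch of the expected caller (illustrative only, not new
+		 * driver code): a personality's resync completion path would
+		 * typically do
+		 *
+		 *	md_sync_acct(rdev, nr_sectors);
+		 *	md_done_sync(mddev, nr_sectors, uptodate);
+		 *
+		 * so that recovery_active above and the recovery_wait
+		 * sleepers stay consistent with the IO actually completed.
+		 */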
+ } +} + + +void md_write_start(mddev_t *mddev) +{ + if (!atomic_read(&mddev->writes_pending)) { + mddev_lock_uninterruptible(mddev); + if (mddev->in_sync) { + mddev->in_sync = 0; + del_timer(&mddev->safemode_timer); + md_update_sb(mddev); + } + atomic_inc(&mddev->writes_pending); + mddev_unlock(mddev); + } else + atomic_inc(&mddev->writes_pending); +} + +void md_write_end(mddev_t *mddev) +{ + if (atomic_dec_and_test(&mddev->writes_pending)) { + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else + mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); + } +} + +static inline void md_enter_safemode(mddev_t *mddev) +{ + mddev_lock_uninterruptible(mddev); + if (mddev->safemode && !atomic_read(&mddev->writes_pending) && + !mddev->in_sync && mddev->recovery_cp == MaxSector) { + mddev->in_sync = 1; + md_update_sb(mddev); + } + mddev_unlock(mddev); + + if (mddev->safemode == 1) + mddev->safemode = 0; +} + +void md_handle_safemode(mddev_t *mddev) +{ + if (signal_pending(current)) { + printk(KERN_INFO "md: md%d in immediate safe mode\n", + mdidx(mddev)); + mddev->safemode = 2; + flush_signals(current); + } + if (mddev->safemode) + md_enter_safemode(mddev); +} + + +DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +static void md_do_sync(mddev_t *mddev) +{ + mddev_t *mddev2; + unsigned int max_sectors, currspeed = 0, + j, window; + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct list_head *tmp; + unsigned long last_check; + + /* just in case the thread restarts... */ + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return; + + /* we overload curr_resync somewhat here. + * 0 == not engaged in resync at all + * 2 == checking that there is no conflict with another sync + * 1 == like 2, but has yielded to allow conflicting resync to + * commence + * other == active in resync - this many blocks + */ + do { + mddev->curr_resync = 2; + + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && + match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: delaying resync of md%d" + " until md%d has finished resync (they" + " share one or more physical units)\n", + mdidx(mddev), mdidx(mddev2)); + if (mddev < mddev2) {/* arbitrarily yield */ + mddev->curr_resync = 1; + wake_up(&resync_wait); + } + if (wait_event_interruptible(resync_wait, + mddev2->curr_resync < mddev->curr_resync)) { + flush_signals(current); + mddev_put(mddev2); + goto skip; + } + } + if (mddev->curr_resync == 1) { + mddev_put(mddev2); + break; + } + } + } while (mddev->curr_resync < 2); + + max_sectors = mddev->size << 1; + + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" + " %d KB/sec/disc.\n", sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwidth " + "(but not more than %d KB/sec) for reconstruction.\n", + sysctl_speed_limit_max); + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + j = mddev->recovery_cp; + else + j = 0; + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = j; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = 32*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", + window/2,max_sectors/2); + + 
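+	/*
+	 * A rough worked example of the mark-based speed estimate used
+	 * below (the figures are illustrative): with SYNC_MARK_STEP = 3*HZ
+	 * and SYNC_MARKS = 10 the active mark is at most ~30 seconds old,
+	 * so if 60000 sectors have been synced since a mark taken 9 seconds
+	 * ago, currspeed = (60000/2)/(9+1)+1 = 3001 KB/sec, which is then
+	 * compared against sysctl_speed_limit_min and _max.
+	 */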
atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); + last_check = 0; + + if (j) + printk(KERN_INFO + "md: resuming recovery of md%d from checkpoint.\n", + mdidx(mddev)); + + while (j < max_sectors) { + int sectors; + + sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); + if (sectors < 0) { + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + goto out; + } + atomic_add(sectors, &mddev->recovery_active); + j += sectors; + if (j>1) mddev->curr_resync = j; + + if (last_check + window > j) + continue; + + last_check = j; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || + test_bit(MD_RECOVERY_ERR, &mddev->recovery)) + break; + + blk_run_queues(); + + repeat: + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (signal_pending(current)) { + /* + * got a signal, exit. + */ + printk(KERN_INFO + "md: md_do_sync() got signal ... exiting\n"); + flush_signals(current); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + } + + /* + * this loop exits only when we are either slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. (things like an + * e2fsck being done on the RAID array should execute fast) + */ + cond_resched(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/4); + goto repeat; + } + } + } + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + /* + * this also signals 'finished resyncing' to md_stop + */ + out: + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + /* tell personality that we are finished */ + mddev->pers->sync_request(mddev, max_sectors, 1); + + if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && + mddev->curr_resync > 2 && + mddev->curr_resync > mddev->recovery_cp) { + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + printk(KERN_INFO + "md: checkpointing recovery of md%d.\n", + mdidx(mddev)); + mddev->recovery_cp = mddev->curr_resync; + } else + mddev->recovery_cp = MaxSector; + } + + if (mddev->safemode) + md_enter_safemode(mddev); + skip: + mddev->curr_resync = 0; + set_bit(MD_RECOVERY_DONE, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} + + +/* + * This routine is regularly called by all per-raid-array threads to + * deal with generic issues like resync and super-block update. + * Raid personalities that don't have a thread (linear/raid0) do not + * need this as they never do any recovery or update the superblock. + * + * It does not do any resync itself, but rather "forks" off other threads + * to do that as needed. + * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in + * "->recovery" and create a thread at ->sync_thread. + * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) + * and wakes up this thread, which will reap it and finish up. + * This thread also removes any faulty devices (with nr_pending == 0). 
+ * + * The overall approach is: + * 1/ if the superblock needs updating, update it. + * 2/ If a recovery thread is running, don't do anything else. + * 3/ If recovery has finished, clean up, possibly marking spares active. + * 4/ If there are any faulty devices, remove them. + * 5/ If array is degraded, try to add spare devices + * 6/ If array has spares or is not in-sync, start a resync thread. + */ +void md_check_recovery(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *rtmp; + + + dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); + + if (mddev->ro) + return; + if ( ! ( + mddev->sb_dirty || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery) + )) + return; + if (mddev_trylock(mddev)==0) { + int spares =0; + if (mddev->sb_dirty) + md_update_sb(mddev); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + /* resync/recovery still happening */ + goto unlock; + if (mddev->sync_thread) { + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + mddev->pers->spare_active(mddev); + } + md_update_sb(mddev); + mddev->recovery = 0; + wake_up(&resync_wait); + goto unlock; + } + if (mddev->recovery) { + /* that's odd.. */ + mddev->recovery = 0; + wake_up(&resync_wait); + } + + /* no recovery is running. + * remove any failed drives, then + * add spares if possible + */ + ITERATE_RDEV(mddev,rdev,rtmp) { + if (rdev->raid_disk >= 0 && + rdev->faulty && + atomic_read(&rdev->nr_pending)==0) { + mddev->pers->hot_remove_disk(mddev, rdev->raid_disk); + rdev->raid_disk = -1; + } + if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync) + spares++; + } + if (mddev->degraded) { + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk < 0 + && !rdev->faulty) { + if (mddev->pers->hot_add_disk(mddev,rdev)) + spares++; + else + break; + } + } + + if (!spares && (mddev->recovery_cp == MaxSector )) { + /* nothing we can do ... */ + goto unlock; + } + if (mddev->pers->sync_request) { + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + if (!spares) + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "md%d_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "md%d: could not start resync" + " thread...\n", + mdidx(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + mddev->recovery = 0; + } else { + md_wakeup_thread(mddev->sync_thread); + } + } + unlock: + mddev_unlock(mddev); + } +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct list_head *tmp; + mddev_t *mddev; + + if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + if (mddev_trylock(mddev)==0) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... 
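+ * (The mdelay(1000*1) below is exactly that margin of safety: it gives
+ * such devices roughly one second to settle after the arrays have been
+ * stopped.)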
+ */ + mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + .notifier_call = md_notify_reboot, + .next = NULL, + .priority = INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +int __init md_init(void) +{ + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," + " MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (register_blkdev(MAJOR_NR, "md")) + return -1; + + devfs_mk_dir("md"); + blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, + md_probe, NULL, NULL); + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char name[16]; + sprintf(name, "md/%d", minor); + devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + register_reboot_notifier(&md_notifier); + raid_table_header = register_sysctl_table(raid_root_table, 1); + + md_geninit(); + return (0); +} + + +#ifndef MODULE + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +static dev_t detected_devices[128]; +static int dev_cnt; + +void md_autodetect_dev(dev_t dev) +{ + if (dev_cnt >= 0 && dev_cnt < 127) + detected_devices[dev_cnt++] = dev; +} + + +static void autostart_arrays(void) +{ + mdk_rdev_t *rdev; + int i; + + printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + + for (i = 0; i < dev_cnt; i++) { + dev_t dev = detected_devices[i]; + + rdev = md_import_device(dev,0, 0); + if (IS_ERR(rdev)) { + printk(KERN_ALERT "md: could not import %s!\n", + partition_name(dev)); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + list_add(&rdev->same_set, &pending_raid_disks); + } + dev_cnt = 0; + + autorun_devices(); +} + +#endif + +static __exit void md_exit(void) +{ + int i; + blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); + for (i=0; i < MAX_MD_DEVS; i++) + devfs_remove("md/%d", i); + devfs_remove("md"); + + unregister_blkdev(MAJOR_NR,"md"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("mdstat", NULL); +#endif + for (i = 0; i < MAX_MD_DEVS; i++) { + struct gendisk *disk = disks[i]; + mddev_t *mddev; + if (!disks[i]) + continue; + mddev = disk->private_data; + del_gendisk(disk); + put_disk(disk); + mddev_put(mddev); + } +} + +module_init(md_init) +module_exit(md_exit) + +EXPORT_SYMBOL(register_md_personality); +EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL(md_error); +EXPORT_SYMBOL(md_sync_acct); +EXPORT_SYMBOL(md_done_sync); +EXPORT_SYMBOL(md_write_start); +EXPORT_SYMBOL(md_write_end); +EXPORT_SYMBOL(md_handle_safemode); +EXPORT_SYMBOL(md_register_thread); +EXPORT_SYMBOL(md_unregister_thread); +EXPORT_SYMBOL(md_wakeup_thread); +EXPORT_SYMBOL(md_print_devices); +EXPORT_SYMBOL(md_interrupt_thread); +EXPORT_SYMBOL(md_check_recovery); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/md/merge b/tests/linux/md/merge new file mode 100644 index 0000000..6440da9 --- /dev/null +++ b/tests/linux/md/merge @@ -0,0 +1,3595 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, 
Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher + - Devfs support by Richard Gooch + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/linkage.h> +#include <linux/raid/md.h> +#include <linux/sysctl.h> +#include <linux/bio.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/buffer_head.h> /* for invalidate_bdev */ +#include <linux/suspend.h> + +#include <linux/init.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> + +#include <asm/unaligned.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define DEVICE_NR(device) (minor(device)) + +#include <linux/blk.h> + +#define DEBUG 0 +#define dprintk(x...) ((void)(DEBUG && printk(x))) + + +#ifndef MODULE +static void autostart_arrays (void); +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY]; +static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 1000 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwidth if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 1000; +static int sysctl_speed_limit_max = 200000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + { + .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, + .procname = "speed_limit_min", + .data = &sysctl_speed_limit_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, + .procname = "speed_limit_max", + .data = &sysctl_speed_limit_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table raid_dir_table[] = { + { + .ctl_name = DEV_RAID, + .procname = "raid", + .maxlen = 0, + .mode = 0555, + .child = raid_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table raid_root_table[] = { + { + .ctl_name = CTL_DEV, + .procname = "dev", + .maxlen = 0, + .mode = 0555, + .child = raid_dir_table, + }, + { .ctl_name = 0 } +}; + +static struct block_device_operations md_fops; + +static struct gendisk *disks[MAX_MD_DEVS]; + +/* + * Enables iterating over all existing md arrays. + * all_mddevs_lock protects this list as well as mddev_map. + */ +static LIST_HEAD(all_mddevs); +static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; + + +/* + * iterates through all used mddevs in the system. + * We take care to grab the all_mddevs_lock whenever navigating + * the list, and to always hold a refcount when unlocked. + * Any code which breaks out of this loop still owns + * a reference to the current mddev and must mddev_put it. 
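+ *
+ * Typical use is then just (a sketch, with a hypothetical
+ * handle_mddev() helper):
+ *
+ *	ITERATE_MDDEV(mddev,tmp)
+ *		handle_mddev(mddev);
+ *
+ * with the locking and refcounting done entirely inside the macro.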
+ */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (({ spin_lock(&all_mddevs_lock); \ + tmp = all_mddevs.next; \ + mddev = NULL;}); \ + ({ if (tmp != &all_mddevs) \ + mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ + spin_unlock(&all_mddevs_lock); \ + if (mddev) mddev_put(mddev); \ + mddev = list_entry(tmp, mddev_t, all_mddevs); \ + tmp != &all_mddevs;}); \ + ({ spin_lock(&all_mddevs_lock); \ + tmp = tmp->next;}) \ + ) + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio, bio->bi_size); + return 0; +} + +static inline mddev_t *mddev_get(mddev_t *mddev) +{ + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_put(mddev_t *mddev) +{ + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->raid_disks && list_empty(&mddev->disks)) { + list_del(&mddev->all_mddevs); + mddev_map[mdidx(mddev)] = NULL; + kfree(mddev); + MOD_DEC_USE_COUNT; + } + spin_unlock(&all_mddevs_lock); +} + +static mddev_t * mddev_find(int unit) +{ + mddev_t *mddev, *new = NULL; + + retry: + spin_lock(&all_mddevs_lock); + if (mddev_map[unit]) { + mddev = mddev_get(mddev_map[unit]); + spin_unlock(&all_mddevs_lock); + if (new) + kfree(new); + return mddev; + } + if (new) { + mddev_map[unit] = new; + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + MOD_INC_USE_COUNT; + return new; + } + spin_unlock(&all_mddevs_lock); + + new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + memset(new, 0, sizeof(*new)); + + new->__minor = unit; + init_MUTEX(&new->reconfig_sem); + INIT_LIST_HEAD(&new->disks); + INIT_LIST_HEAD(&new->all_mddevs); + init_timer(&new->safemode_timer); + atomic_set(&new->active, 1); + blk_queue_make_request(&new->queue, md_fail_request); + + goto retry; +} + +static inline int mddev_lock(mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +static inline void mddev_lock_uninterruptible(mddev_t * mddev) +{ + down(&mddev->reconfig_sem); +} + +static inline int mddev_trylock(mddev_t * mddev) +{ + return down_trylock(&mddev->reconfig_sem); +} + +static inline void mddev_unlock(mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->bdev->bd_dev == dev) + return rdev; + } + return NULL; +} + +inline static sector_t calc_dev_sboffset(struct block_device *bdev) +{ + sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + return MD_NEW_SIZE_BLOCKS(size); +} + +static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) +{ + sector_t size; + + size = rdev->sb_offset; + + if (chunk_size) + size &= ~((sector_t)chunk_size/1024 - 1); + return size; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(KERN_ALERT "md: out of memory.\n"); + return -EINVAL; + } + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb_loaded = 0; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } +} + + +static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + if 
(bio->bi_size) + return 1; + + complete((struct completion*)bio->bi_private); + return 0; +} + +static int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw) +{ + struct bio bio; + struct bio_vec vec; + struct completion event; + + bio_init(&bio); + bio.bi_io_vec = &vec; + vec.bv_page = page; + vec.bv_len = size; + vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = size; + bio.bi_bdev = bdev; + bio.bi_sector = sector; + init_completion(&event); + bio.bi_private = &event; + bio.bi_end_io = bi_complete; + submit_bio(rw, &bio); + blk_run_queues(); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio.bi_flags); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + + if (!rdev->sb_page) { + MD_BUG(); + return -EINVAL; + } + if (rdev->sb_loaded) + return 0; + + + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + goto fail; + rdev->sb_loaded = 1; + return 0; + +fail: + printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", + bdev_partition_name(rdev->bdev)); + return -EINVAL; +} + +static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + if ( (sb1->set_uuid0 == sb2->set_uuid0) && + (sb1->set_uuid1 == sb2->set_uuid1) && + (sb1->set_uuid2 == sb2->set_uuid2) && + (sb1->set_uuid3 == sb2->set_uuid3)) + + return 1; + + return 0; +} + + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Handle superblock details. + * We want to be able to handle multiple superblock formats + * so we have a common interface to them all, and an array of + * different handlers. + * We rely on user-space to write the initial superblock, and support + * reading and updating of superblocks. + * Interface methods are: + * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) + * loads and validates a superblock on dev. + * if refdev != NULL, compare superblocks on both devices + * Return: + * 0 - dev has a superblock that is compatible with refdev + * 1 - dev has a superblock that is compatible and newer than refdev + * so dev should be used as the refdev in future + * -EINVAL superblock incompatible or invalid + * -othererror e.g. -EIO + * + * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + * Verify that dev is acceptable into mddev. + * The first time, mddev->raid_disks will be 0, and data from + * dev should be merged in. Subsequent calls check that dev + * is new enough. Return 0 or -EINVAL + * + * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + * Update the superblock for rdev with data in mddev + * This does not write to disc. 
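+ *
+ * A typical load sequence is therefore (a sketch, for format 0.90.0):
+ *
+ *	ret = super_types[0].load_super(rdev, refdev, 0);
+ *	if (ret >= 0)
+ *		err = super_types[0].validate_super(mddev, rdev);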
+ * + */ + +struct super_type { + char *name; + struct module *owner; + int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); + int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); + void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); +}; + +/* + * load_super for 0.90.0 + */ +static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ + mdp_super_t *sb; + int ret; + sector_t sb_offset; + + /* + * Calculate the position of the superblock, + * it's at the end of the disk. + * + * It also happens to be a multiple of 4Kb. + */ + sb_offset = calc_dev_sboffset(rdev->bdev); + rdev->sb_offset = sb_offset; + + ret = read_disk_sb(rdev); + if (ret) return ret; + + ret = -EINVAL; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + if (sb->md_magic != MD_SB_MAGIC) { + printk(KERN_ERR "md: invalid raid superblock magic on %s\n", + bdev_partition_name(rdev->bdev)); + goto abort; + } + + if (sb->major_version != 0 || + sb->minor_version != 90) { + printk(KERN_WARNING "Bad version number %d.%d on %s\n", + sb->major_version, sb->minor_version, + bdev_partition_name(rdev->bdev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(KERN_ERR "md: %s: invalid raid minor (%x)\n", + bdev_partition_name(rdev->bdev), sb->md_minor); + goto abort; + } + if (sb->raid_disks <= 0) + goto abort; + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(KERN_WARNING "md: invalid superblock checksum on %s\n", + bdev_partition_name(rdev->bdev)); + goto abort; + } + + rdev->preferred_minor = sb->md_minor; + rdev->data_offset = 0; + + if (sb->level == MULTIPATH) + rdev->desc_nr = -1; + else + rdev->desc_nr = sb->this_disk.number; + + if (refdev == 0) + ret = 1; + else { + __u64 ev1, ev2; + mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + if (!uuid_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + if (!sb_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has same UUID" + " but different superblock to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + ev1 = md_event(sb); + ev2 = md_event(refsb); + if (ev1 > ev2) + ret = 1; + else + ret = 0; + } + rdev->size = calc_dev_size(rdev, sb->chunk_size); + + abort: + return ret; +} + +/* + * validate_super for 0.90.0 + */ +static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_disk_t *desc; + mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + + if (mddev->raid_disks == 0) { + mddev->major_version = 0; + mddev->minor_version = sb->minor_version; + mddev->patch_version = sb->patch_version; + mddev->persistent = ! 
sb->not_persistent; + mddev->chunk_size = sb->chunk_size; + mddev->ctime = sb->ctime; + mddev->utime = sb->utime; + mddev->level = sb->level; + mddev->layout = sb->layout; + mddev->raid_disks = sb->raid_disks; + mddev->size = sb->size; + mddev->events = md_event(sb); + + if (sb->state & (1<<MD_SB_CLEAN)) + mddev->recovery_cp = MaxSector; + else { + if (sb->events_hi == sb->cp_events_hi && + sb->events_lo == sb->cp_events_lo) { + mddev->recovery_cp = sb->recovery_cp; + } else + mddev->recovery_cp = 0; + } + + memcpy(mddev->uuid+0, &sb->set_uuid0, 4); + memcpy(mddev->uuid+4, &sb->set_uuid1, 4); + memcpy(mddev->uuid+8, &sb->set_uuid2, 4); + memcpy(mddev->uuid+12,&sb->set_uuid3, 4); + + mddev->max_disks = MD_SB_DISKS; + } else { + __u64 ev1; + ev1 = md_event(sb); + ++ev1; + if (ev1 < mddev->events) + return -EINVAL; + } + if (mddev->level != LEVEL_MULTIPATH) { + rdev->raid_disk = -1; + rdev->in_sync = rdev->faulty = 0; + desc = sb->disks + rdev->desc_nr; + + if (desc->state & (1<<MD_DISK_FAULTY)) + rdev->faulty = 1; + else if (desc->state & (1<<MD_DISK_SYNC) && + desc->raid_disk < mddev->raid_disks) { + rdev->in_sync = 1; + rdev->raid_disk = desc->raid_disk; + } + } + return 0; +} + +/* + * sync_super for 0.90.0 + */ +static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_super_t *sb; + struct list_head *tmp; + mdk_rdev_t *rdev2; + int next_spare = mddev->raid_disks; + + /* make rdev->sb match mddev data.. + * + * 1/ zero out disks + * 2/ Add info for each disk, keeping track of highest desc_nr + * 3/ any empty disks < highest become removed + * + * disks[0] gets initialised to REMOVED because + * we cannot be sure from other fields if it has + * been initialised or not. + */ + int highest = 0; + int i; + int active=0, working=0,failed=0,spare=0,nr_disks=0; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + memset(sb, 0, sizeof(*sb)); + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = mddev->major_version; + sb->minor_version = mddev->minor_version; + sb->patch_version = mddev->patch_version; + sb->gvalid_words = 0; /* ignored */ + memcpy(&sb->set_uuid0, mddev->uuid+0, 4); + memcpy(&sb->set_uuid1, mddev->uuid+4, 4); + memcpy(&sb->set_uuid2, mddev->uuid+8, 4); + memcpy(&sb->set_uuid3, mddev->uuid+12,4); + + sb->ctime = mddev->ctime; + sb->level = mddev->level; + sb->size = mddev->size; + sb->raid_disks = mddev->raid_disks; + sb->md_minor = mddev->__minor; + sb->not_persistent = !mddev->persistent; + sb->utime = mddev->utime; + sb->state = 0; + sb->events_hi = (mddev->events>>32); + sb->events_lo = (u32)mddev->events; + + if (mddev->in_sync) + { + sb->recovery_cp = mddev->recovery_cp; + sb->cp_events_hi = (mddev->events>>32); + sb->cp_events_lo = (u32)mddev->events; + if (mddev->recovery_cp == MaxSector) + sb->state = (1<< MD_SB_CLEAN); + } else + sb->recovery_cp = 0; + + sb->layout = mddev->layout; + sb->chunk_size = mddev->chunk_size; + + sb->disks[0].state = (1<<MD_DISK_REMOVED); + + ITERATE_RDEV(mddev,rdev2,tmp) { + mdp_disk_t *d; + if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) + rdev2->desc_nr = rdev2->raid_disk; + else + rdev2->desc_nr = next_spare++; + d = &sb->disks[rdev2->desc_nr]; + nr_disks++; + d->number = rdev2->desc_nr; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty) + d->raid_disk = rdev2->raid_disk; + else + d->raid_disk = rdev2->desc_nr; /* compatibility */ + if (rdev2->faulty) { + d->state = (1<<MD_DISK_FAULTY); + failed++; + } else if (rdev2->in_sync) { + d->state = (1<<MD_DISK_ACTIVE); + d->state |= (1<<MD_DISK_SYNC); + active++; + working++; + } else { + d->state = 0; + spare++; + working++; + } + if (rdev2->desc_nr > highest) + highest = rdev2->desc_nr; + } + + /* now set the "removed" bit on any non-trailing holes */ + for (i=0; i<highest; i++) { + mdp_disk_t *d = &sb->disks[i]; + if (d->state == 0 && d->number == 0) { + d->number = i; + d->raid_disk = i; + d->state = (1<<MD_DISK_REMOVED); + } + } + sb->nr_disks = nr_disks; + sb->active_disks = active; + sb->working_disks = working; + sb->failed_disks = failed; + sb->spare_disks = spare; + + sb->this_disk = sb->disks[rdev->desc_nr]; + sb->sb_csum = calc_sb_csum(sb); +} + +/* + * version 1 superblock + */ + +static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +{ + unsigned int disk_csum, csum; + int size = 256 + sb->max_dev*2; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, size, 0); + sb->sb_csum = disk_csum; + return csum; +} + +static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ + struct mdp_superblock_1 *sb; + int ret; + sector_t sb_offset; + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + */ + switch(minor_version) { + case 0: + sb_offset = rdev->bdev->bd_inode->i_size >> 9; + sb_offset -= 8*2; + sb_offset &= ~(4*2); + /* convert from sectors to K */ + sb_offset /= 2; + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4; + break; + default: + return -EINVAL; + } + rdev->sb_offset = sb_offset; + + ret = read_disk_sb(rdev); + if (ret) return ret; + + + sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || + sb->major_version != cpu_to_le32(1) || + le32_to_cpu(sb->max_dev) > (4096-256)/2 || + le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || + sb->feature_map != 0) + return -EINVAL; + + if (calc_sb_1_csum(sb) != sb->sb_csum) { + printk("md: invalid superblock checksum on %s\n", + bdev_partition_name(rdev->bdev)); + return -EINVAL; + } + rdev->preferred_minor = 0xffff; + rdev->data_offset = le64_to_cpu(sb->data_offset); + + if (refdev == 0) + return 1; + else { + __u64 ev1, ev2; + struct mdp_superblock_1 *refsb = + (struct mdp_superblock_1*)page_address(refdev->sb_page); + + if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || + sb->level != refsb->level || + sb->layout != refsb->layout || + sb->chunksize != refsb->chunksize) { + printk(KERN_WARNING "md: %s has strangely different" + " superblock to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + return -EINVAL; + } + ev1 = le64_to_cpu(sb->events); + ev2 = le64_to_cpu(refsb->events); + + if (ev1 > ev2) + return 1; + } + if (minor_version) + rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; + else + rdev->size = rdev->sb_offset; + if (rdev->size < le64_to_cpu(sb->data_size)/2) + return -EINVAL; + rdev->size = le64_to_cpu(sb->data_size)/2; + if (le32_to_cpu(sb->chunksize)) + rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); + return 0; +} + +static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + if (mddev->raid_disks == 0) { + mddev->major_version = 1; + mddev->minor_version = 0; + mddev->patch_version = 0; + mddev->persistent = 1; + mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; + mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); + mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->level = le32_to_cpu(sb->level); + mddev->layout = le32_to_cpu(sb->layout); + mddev->raid_disks = le32_to_cpu(sb->raid_disks); + 
mddev->size = (u32)le64_to_cpu(sb->size); + mddev->events = le64_to_cpu(sb->events); + + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + memcpy(mddev->uuid, sb->set_uuid, 16); + + mddev->max_disks = (4096-256)/2; + } else { + __u64 ev1; + ev1 = le64_to_cpu(sb->events); + ++ev1; + if (ev1 < mddev->events) + return -EINVAL; + } + + if (mddev->level != LEVEL_MULTIPATH) { + int role; + rdev->desc_nr = le32_to_cpu(sb->dev_number); + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + switch(role) { + case 0xffff: /* spare */ + rdev->in_sync = 0; + rdev->faulty = 0; + rdev->raid_disk = -1; + break; + case 0xfffe: /* faulty */ + rdev->in_sync = 0; + rdev->faulty = 1; + rdev->raid_disk = -1; + break; + default: + rdev->in_sync = 1; + rdev->faulty = 0; + rdev->raid_disk = role; + break; + } + } + return 0; +} + +static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct mdp_superblock_1 *sb; + struct list_head *tmp; + mdk_rdev_t *rdev2; + int max_dev, i; + /* make rdev->sb match mddev and rdev data. */ + + sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + sb->feature_map = 0; + sb->pad0 = 0; + memset(sb->pad1, 0, sizeof(sb->pad1)); + memset(sb->pad2, 0, sizeof(sb->pad2)); + memset(sb->pad3, 0, sizeof(sb->pad3)); + + sb->utime = cpu_to_le64((__u64)mddev->utime); + sb->events = cpu_to_le64(mddev->events); + if (mddev->in_sync) + sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else + sb->resync_offset = cpu_to_le64(0); + + max_dev = 0; + ITERATE_RDEV(mddev,rdev2,tmp) + if (rdev2->desc_nr > max_dev) + max_dev = rdev2->desc_nr; + + sb->max_dev = max_dev; + for (i=0; i<max_dev; i++) + sb->dev_roles[max_dev] = cpu_to_le16(0xfffe); + + ITERATE_RDEV(mddev,rdev2,tmp) { + i = rdev2->desc_nr; + if (rdev2->faulty) + sb->dev_roles[i] = cpu_to_le16(0xfffe); + else if (rdev2->in_sync) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else + sb->dev_roles[i] = cpu_to_le16(0xffff); + } + + sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ +} + + +struct super_type super_types[] = { + [0] = { + .name = "0.90.0", + .owner = THIS_MODULE, + .load_super = super_90_load, + .validate_super = super_90_validate, + .sync_super = super_90_sync, + }, + [1] = { + .name = "md-1", + .owner = THIS_MODULE, + .load_super = super_1_load, + .validate_super = super_1_validate, + .sync_super = super_1_sync, + }, +}; + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (rdev->bdev->bd_contains == dev->bdev->bd_contains) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev)) + return 1; + + return 0; +} + +static LIST_HEAD(pending_raid_disks); + +static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return -EINVAL; + } + same_pdev = match_dev_unit(mddev, rdev); + if (same_pdev) + printk(KERN_WARNING + "md%d: WARNING: %s appears to be on the same physical" + " disk as %s. True\n protection against single-disk" + " failure might be compromised.\n", + mdidx(mddev), bdev_partition_name(rdev->bdev), + bdev_partition_name(same_pdev->bdev)); + + /* Verify rdev->desc_nr is unique. 
+ * If it is -1, assign a free number, else + * check number is not in use + */ + if (rdev->desc_nr < 0) { + int choice = 0; + if (mddev->pers) choice = mddev->raid_disks; + while (find_rdev_nr(mddev, choice)) + choice++; + rdev->desc_nr = choice; + } else { + if (find_rdev_nr(mddev, rdev->desc_nr)) + return -EBUSY; + } + + list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev)); + return 0; +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +{ + int err = 0; + struct block_device *bdev; + + bdev = bdget(dev); + if (!bdev) + return -ENOMEM; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + if (err) + return err; + err = bd_claim(bdev, rdev); + if (err) { + blkdev_put(bdev, BDEV_RAW); + return err; + } + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + bd_release(bdev); + blkdev_put(bdev, BDEV_RAW); +} + +void md_autodetect_dev(dev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ + printk(KERN_INFO "md: export_rdev(%s)\n", + bdev_partition_name(rdev->bdev)); + if (rdev->mddev) + MD_BUG(); + free_disk_sb(rdev); + list_del_init(&rdev->same_set); +#ifndef MODULE + md_autodetect_dev(rdev->bdev->bd_dev); +#endif + unlock_rdev(rdev); + kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); + mddev->raid_disks = 0; + mddev->major_version = 0; +} + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO + "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", + sb->level, sb->size, sb->nr_disks, sb->raid_disks, + sb->md_minor, sb->layout, sb->chunk_size); + printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" + " FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + printk(KERN_INFO); + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || + desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(KERN_INFO "md: THIS: "); + print_desc(&sb->this_disk); + +} + +static 
void print_rdev(mdk_rdev_t *rdev) +{ + printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ", + bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size, + rdev->faulty, rdev->in_sync, rdev->desc_nr); + if (rdev->sb_loaded) { + printk(KERN_INFO "md: rdev superblock:\n"); + print_sb((mdp_super_t*)page_address(rdev->sb_page)); + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ + struct list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk("md: **********************************\n"); + printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); + printk("md: **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", bdev_partition_name(rdev->bdev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk("md: **********************************\n"); + printk("\n"); +} + + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + + if (!rdev->sb_loaded) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + + dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->sb_offset); + + if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) + return 0; + + printk("md: write_disk_sb failed for device %s\n", + bdev_partition_name(rdev->bdev)); + return 1; +} + +static void sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + super_types[mddev->major_version]. + sync_super(mddev, rdev); + rdev->sb_loaded = 1; + } +} + +static void md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct list_head *tmp; + mdk_rdev_t *rdev; + + mddev->sb_dirty = 0; +repeat: + mddev->utime = get_seconds(); + mddev->events ++; + + if (!mddev->events) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->events --; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (!mddev->persistent) + return; + + dprintk(KERN_INFO + "md: updating md%d RAID superblock on device (in sync %d)\n", + mdidx(mddev),mddev->in_sync); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + dprintk(KERN_INFO "md: "); + if (rdev->faulty) + dprintk("(skipping faulty "); + + dprintk("%s ", bdev_partition_name(rdev->bdev)); + if (!rdev->faulty) { + err += write_disk_sb(rdev); + } else + dprintk(")\n"); + if (!err && mddev->level == LEVEL_MULTIPATH) + /* only need to write one superblock... */ + break; + } + if (err) { + if (--count) { + printk(KERN_ERR "md: errors occurred during superblock" + " update, repeating\n"); + goto repeat; + } + printk(KERN_ERR \ + "md: excessive errors occurred during superblock update, exiting\n"); + } +} + +/* + * Import a device. If 'super_format' >= 0, then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. 
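+ *
+ * Illustrative call, mirroring the autostart path elsewhere in this
+ * driver (a sketch, not new functionality):
+ *
+ *	rdev = md_import_device(dev, 0, 0);
+ *	if (!IS_ERR(rdev))
+ *		list_add(&rdev->same_set, &pending_raid_disks);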
+ */ +static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +{ + int err; + mdk_rdev_t *rdev; + sector_t size; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", + partition_name(newdev)); + return ERR_PTR(-ENOMEM); + } + memset(rdev, 0, sizeof(*rdev)); + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + err = lock_rdev(rdev, newdev); + if (err) { + printk(KERN_ERR "md: could not lock %s.\n", + partition_name(newdev)); + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + rdev->in_sync = 0; + rdev->data_offset = 0; + atomic_set(&rdev->nr_pending, 0); + + size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + if (!size) { + printk(KERN_WARNING + "md: %s has zero or unknown size, marking faulty!\n", + bdev_partition_name(rdev->bdev)); + err = -EINVAL; + goto abort_free; + } + + if (super_format >= 0) { + err = super_types[super_format]. + load_super(rdev, NULL, super_minor); + if (err == -EINVAL) { + printk(KERN_WARNING + "md: %s has invalid sb, not importing!\n", + bdev_partition_name(rdev->bdev)); + goto abort_free; + } + if (err < 0) { + printk(KERN_WARNING + "md: could not read %s's sb, not importing!\n", + bdev_partition_name(rdev->bdev)); + goto abort_free; + } + } + INIT_LIST_HEAD(&rdev->same_set); + + return rdev; + +abort_free: + if (rdev->sb_page) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return ERR_PTR(err); +} + +/* + * Check a full RAID array for plausibility + */ + + +static int analyze_sbs(mddev_t * mddev) +{ + int i; + struct list_head *tmp; + mdk_rdev_t *rdev, *freshest; + + freshest = NULL; + ITERATE_RDEV(mddev,rdev,tmp) + switch (super_types[mddev->major_version]. + load_super(rdev, freshest, mddev->minor_version)) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + printk( KERN_ERR \ + "md: fatal superblock inconsistency in %s" + " -- removing from array\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + } + + + super_types[mddev->major_version]. + validate_super(mddev, freshest); + + i = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev != freshest) + if (super_types[mddev->major_version]. 
+ validate_super(mddev, rdev)) { + printk(KERN_WARNING "md: kicking non-fresh %s" + " from array!\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + continue; + } + if (mddev->level == LEVEL_MULTIPATH) { + rdev->desc_nr = i++; + rdev->raid_disk = rdev->desc_nr; + rdev->in_sync = 1; + } + } + + + /* + * Check if we can support this RAID array + */ + if (mddev->major_version != MD_MAJOR_VERSION || + mddev->minor_version > MD_MINOR_VERSION) { + printk(KERN_ALERT + "md: md%d: unsupported raid array version %d.%d.%d\n", + mdidx(mddev), mddev->major_version, + mddev->minor_version, mddev->patch_version); + goto abort; + } + + if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) || + (mddev->level == 4) || (mddev->level == 5))) + printk(KERN_ERR "md: md%d: raid array is not clean" + " -- starting background reconstruction\n", + mdidx(mddev)); + + return 0; +abort: + return 1; +} + +<<<<<<< +||||||| +#undef OLD_LEVEL + +======= +>>>>>>> +static struct gendisk *md_probe(dev_t dev, int *part, void *data) +{ + static DECLARE_MUTEX(disks_sem); + int unit = MINOR(dev); + mddev_t *mddev = mddev_find(unit); + struct gendisk *disk; + + if (!mddev) + return NULL; + + down(&disks_sem); + if (disks[unit]) { + up(&disks_sem); + mddev_put(mddev); + return NULL; + } + disk = alloc_disk(1); + if (!disk) { + up(&disks_sem); + mddev_put(mddev); + return NULL; + } + disk->major = MD_MAJOR; + disk->first_minor = mdidx(mddev); + sprintf(disk->disk_name, "md%d", mdidx(mddev)); + disk->fops = &md_fops; + disk->private_data = mddev; + disk->queue = &mddev->queue; + add_disk(disk); + disks[mdidx(mddev)] = disk; + up(&disks_sem); + return NULL; +} + +void md_wakeup_thread(mdk_thread_t *thread); + +static void md_safemode_timeout(unsigned long data) +{ + mddev_t *mddev = (mddev_t *) data; + + mddev->safemode = 1; + md_wakeup_thread(mddev->thread); +} + + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct list_head *tmp; + mdk_rdev_t *rdev; + struct gendisk *disk; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Analyze all RAID superblock(s) + */ + if (!mddev->raid_disks && analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->chunk_size; + pnum = level_to_pers(mddev->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We don't + * want to continue the bad practice. 
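+			 * (A note on the checks that follow, with worked
+			 * values for illustration: chunk_size must be a
+			 * power of two between PAGE_SIZE and MAX_CHUNK_SIZE.
+			 * ffz(~chunk_size) is the index of the lowest set
+			 * bit, so for 64k == 0x10000 the power-of-two test
+			 * computes 1 << 16 == 0x10000 and passes, while
+			 * 96k == 0x18000 yields 1 << 15 != 0x18000 and is
+			 * rejected.)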
+ */ + printk(KERN_ERR + "no chunksize specified, see 'man raidtab'\n"); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(KERN_ERR "too big chunk_size: %d > %d\n", + chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(KERN_ERR "too small chunk_size: %d < %ld\n", + chunk_size, PAGE_SIZE); + return -EINVAL; + } + + /* devices must have minimum size of one chunk */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size < chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size:" + " %lluk < %dk\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->size, + chunk_size / 1024); + return -EINVAL; + } + } + } + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + +#ifdef CONFIG_KMOD + if (!pers[pnum]) + { + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + } +#endif + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsector size + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + sync_blockdev(rdev->bdev); + invalidate_bdev(rdev->bdev, 0); + } + + md_probe(mdidx(mddev), NULL, NULL); + disk = disks[mdidx(mddev)]; + if (!disk) + return -ENOMEM; + + spin_lock(&pers_lock); + if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { + spin_unlock(&pers_lock); + printk(KERN_ERR "md: personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + + mddev->pers = pers[pnum]; + spin_unlock(&pers_lock); + + blk_queue_make_request(&mddev->queue, mddev->pers->make_request); + printk("%s: setting max_sectors to %d, segment boundary to %d\n", + disk->disk_name, + chunk_size >> 9, + (chunk_size>>1)-1); + blk_queue_max_sectors(&mddev->queue, chunk_size >> 9); + blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1); + mddev->queue.queuedata = mddev; + + err = mddev->pers->run(mddev); + if (err) { + printk(KERN_ERR "md: pers->run() failed ...\n"); + module_put(mddev->pers->owner); + mddev->pers = NULL; + return -EINVAL; + } + atomic_set(&mddev->writes_pending,0); + mddev->safemode = 0; + mddev->safemode_timer.function = md_safemode_timeout; + mddev->safemode_timer.data = (unsigned long) mddev; + mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ + mddev->in_sync = 1; + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + set_capacity(disk, mddev->array_size<<1); + return 0; +} + +static int restart_array(mddev_t *mddev) +{ + struct gendisk *disk = disks[mdidx(mddev)]; + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->safemode = 0; + mddev->ro = 0; + set_disk_ro(disk, 0); + + printk(KERN_INFO "md: md%d switched to read-write mode.\n", + mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0; + struct gendisk *disk = disks[mdidx(mddev)]; + + if 
(atomic_read(&mddev->active)>2) { + printk("md: md%d still in use.\n",mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + } + + del_timer_sync(&mddev->safemode_timer); + + invalidate_device(mk_kdev(disk->major, disk->first_minor), 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_disk_ro(disk, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_disk_ro(disk, 1); + goto out; + } + module_put(mddev->pers->owner); + mddev->pers = NULL; + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->raid_disks) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev); + } + if (ro) + set_disk_ro(disk, 1); + } + /* + * Free resources if final stop + */ + if (!ro) { + struct gendisk *disk; + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + + export_array(mddev); + + mddev->array_size = 0; + disk = disks[mdidx(mddev)]; + if (disk) + set_capacity(disk, 0); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", + mdidx(mddev)); + err = 0; +out: + return err; +} + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", bdev_partition_name(rdev->bdev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in pending_raid_disks) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(void) +{ + struct list_head candidates; + struct list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = list_entry(pending_raid_disks.next, + mdk_rdev_t, same_set); + + printk(KERN_INFO "md: considering %s ...\n", + bdev_partition_name(rdev0->bdev)); + INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) + if (super_90_load(rdev, rdev0, 0) >= 0) { + printk(KERN_INFO "md: adding %s ...\n", + bdev_partition_name(rdev->bdev)); + list_move(&rdev->same_set, &candidates); + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. 
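+ *
+ * As a sketch (names are the real ones used below), each pass of the
+ * loop is roughly:
+ *
+ *	rdev0 = first entry of pending_raid_disks;
+ *	move each pending rdev whose superblock matches rdev0
+ *		(super_90_load(rdev, rdev0, 0) >= 0) onto 'candidates';
+ *	mddev = mddev_find(rdev0->preferred_minor);
+ *	bind_rdev_to_array() each candidate and autorun_array(mddev);
+ *
+ * with any candidate that cannot be bound handed to export_rdev().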
+	 */
+
+		mddev = mddev_find(rdev0->preferred_minor);
+		if (!mddev) {
+			printk(KERN_ERR
+				"md: cannot allocate memory for md drive.\n");
+			break;
+		}
+		if (mddev_lock(mddev))
+			printk(KERN_WARNING "md: md%d locked, cannot run\n",
+				mdidx(mddev));
+		else if (mddev->raid_disks || mddev->major_version
+			 || !list_empty(&mddev->disks)) {
+			printk(KERN_WARNING
+				"md: md%d already running, cannot run %s\n",
+				mdidx(mddev), bdev_partition_name(rdev0->bdev));
+			mddev_unlock(mddev);
+		} else {
+			printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+			ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+				list_del_init(&rdev->same_set);
+				if (bind_rdev_to_array(rdev, mddev))
+					export_rdev(rdev);
+			}
+			autorun_array(mddev);
+			mddev_unlock(mddev);
+		}
+		/* on success, candidates will be empty, on error
+		 * it won't...
+		 */
+		ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+			export_rdev(rdev);
+		mddev_put(mddev);
+	}
+	printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+static int autostart_array(dev_t startdev)
+{
+	int err = -EINVAL, i;
+	mdp_super_t *sb = NULL;
+	mdk_rdev_t *start_rdev = NULL, *rdev;
+
+	start_rdev = md_import_device(startdev, 0, 0);
+	if (IS_ERR(start_rdev)) {
+		printk(KERN_WARNING "md: could not import %s!\n",
+			partition_name(startdev));
+		return err;
+	}
+
+	/* NOTE: this can only work for 0.90.0 superblocks */
+	sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+	if (sb->major_version != 0 ||
+	    sb->minor_version != 90 ) {
+		printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+		export_rdev(start_rdev);
+		return err;
+	}
+
+	if (start_rdev->faulty) {
+		printk(KERN_WARNING
+			"md: can not autostart based on faulty %s!\n",
+			bdev_partition_name(start_rdev->bdev));
+		export_rdev(start_rdev);
+		return err;
+	}
+	list_add(&start_rdev->same_set, &pending_raid_disks);
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mdp_disk_t *desc;
+		dev_t dev;
+
+		desc = sb->disks + i;
+		dev = MKDEV(desc->major, desc->minor);
+
+		if (!dev)
+			continue;
+		if (dev == startdev)
+			continue;
+		rdev = md_import_device(dev, 0, 0);
+		if (IS_ERR(rdev)) {
+			printk(KERN_WARNING "md: could not import %s,"
+				" trying to run array nevertheless.\n",
+				partition_name(dev));
+			continue;
+		}
+		list_add(&rdev->same_set, &pending_raid_disks);
+	}
+
+	/*
+	 * possibly return codes
+	 */
+	autorun_devices();
+	return 0;
+
+}
+
+
+static int get_version(void * arg)
+{
+	mdu_version_t ver;
+
+	ver.major = MD_MAJOR_VERSION;
+	ver.minor = MD_MINOR_VERSION;
+	ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+	if (copy_to_user(arg, &ver, sizeof(ver)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+	mdu_array_info_t info;
+	int nr,working,active,failed,spare;
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+
+	nr=working=active=failed=spare=0;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		nr++;
+		if (rdev->faulty)
+			failed++;
+		else {
+			working++;
+			if (rdev->in_sync)
+				active++;
+			else
+				spare++;
+		}
+	}
+
+	info.major_version = mddev->major_version;
+	info.minor_version = mddev->minor_version;
+	info.patch_version = 1;
+	info.ctime         = mddev->ctime;
+	info.level         = mddev->level;
+	info.size          = mddev->size;
+	info.nr_disks      = nr;
+	info.raid_disks    = mddev->raid_disks;
+	info.md_minor      = mddev->__minor;
+	info.not_persistent= !mddev->persistent;
+
+	info.utime         = mddev->utime;
+	info.state         = 0;
+	if (mddev->in_sync)
+		info.state = (1<<MD_SB_CLEAN);
+
+	info.active_disks  = active;
+	info.working_disks = working;
+	info.failed_disks  = failed;
+	info.spare_disks   = spare;
+
+	info.layout        = mddev->layout;
+	info.chunk_size    = mddev->chunk_size;
+
+	if (copy_to_user(arg, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+	mdu_disk_info_t info;
+	unsigned int nr;
+	mdk_rdev_t *rdev;
+
+	if (copy_from_user(&info, arg, sizeof(info)))
+		return -EFAULT;
+
+	nr = info.number;
+
+	rdev = find_rdev_nr(mddev, nr);
+	if (rdev) {
+		info.major = MAJOR(rdev->bdev->bd_dev);
+		info.minor = MINOR(rdev->bdev->bd_dev);
+		info.raid_disk = rdev->raid_disk;
+		info.state = 0;
+		if (rdev->faulty)
+			info.state |= (1<<MD_DISK_FAULTY);
+		else if (rdev->in_sync) {
+			info.state |= (1<<MD_DISK_ACTIVE);
+			info.state |= (1<<MD_DISK_SYNC);
+		}
+	} else {
+		info.major = info.minor = 0;
+		info.raid_disk = -1;
+		info.state = (1<<MD_DISK_REMOVED);
+	}
+
+	if (copy_to_user(arg, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+	mdk_rdev_t *rdev;
+	dev_t dev;
+	dev = MKDEV(info->major,info->minor);
+	if (!mddev->raid_disks) {
+		int err;
+		/* expecting a device which has a superblock */
+		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+		if (IS_ERR(rdev)) {
+			printk(KERN_WARNING
+				"md: md_import_device returned %ld\n",
+				PTR_ERR(rdev));
+			return PTR_ERR(rdev);
+		}
+		if (!list_empty(&mddev->disks)) {
+			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+							mdk_rdev_t, same_set);
+			int err = super_types[mddev->major_version]
+				.load_super(rdev, rdev0, mddev->minor_version);
+			if (err < 0) {
+				printk(KERN_WARNING
+					"md: %s has different UUID to %s\n",
+					bdev_partition_name(rdev->bdev),
+					bdev_partition_name(rdev0->bdev));
+				export_rdev(rdev);
+				return -EINVAL;
+			}
+		}
+		err = bind_rdev_to_array(rdev, mddev);
+		if (err)
+			export_rdev(rdev);
+		return err;
+	}
+
+	/*
+	 * add_new_disk can be used once the array is assembled
+	 * to add "hot spares". They must already have a superblock
+	 * written
+	 */
+	if (mddev->pers) {
+		int err;
+		if (!mddev->pers->hot_add_disk) {
+			printk(KERN_WARNING
+				"md%d: personality does not support diskops!\n",
+				mdidx(mddev));
+			return -EINVAL;
+		}
+		rdev = md_import_device(dev, mddev->major_version,
+					mddev->minor_version);
+		if (IS_ERR(rdev)) {
+			printk(KERN_WARNING
+				"md: md_import_device returned %ld\n",
+				PTR_ERR(rdev));
+			return PTR_ERR(rdev);
+		}
+		rdev->in_sync = 0; /* just to be sure */
+		rdev->raid_disk = -1;
+		err = bind_rdev_to_array(rdev, mddev);
+		if (err)
+			export_rdev(rdev);
+		if (mddev->thread)
+			md_wakeup_thread(mddev->thread);
+		return err;
+	}
+
+	/* otherwise, add_new_disk is only allowed
+	 * for major_version==0 superblocks
+	 */
+	if (mddev->major_version != 0) {
+		printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
+			mdidx(mddev));
+		return -EINVAL;
+	}
+
+	if (!(info->state & (1<<MD_DISK_FAULTY))) {
+		int err;
+		rdev = md_import_device (dev, -1, 0);
+		if (IS_ERR(rdev)) {
+			printk(KERN_WARNING
+				"md: error, md_import_device() returned %ld\n",
+				PTR_ERR(rdev));
+			return PTR_ERR(rdev);
+		}
+		rdev->desc_nr = info->number;
+		if (info->raid_disk < mddev->raid_disks)
+			rdev->raid_disk = info->raid_disk;
+		else
+			rdev->raid_disk = -1;
+
+		rdev->faulty = 0;
+		if (rdev->raid_disk < mddev->raid_disks)
+			rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
+		else
+			rdev->in_sync = 0;
+
+		err = bind_rdev_to_array(rdev, mddev);
+		if (err) {
+			export_rdev(rdev);
+			return err;
+		}
+
+		if (!mddev->persistent) {
+			printk(KERN_INFO "md: nonpersistent superblock ...\n");
+			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+		} else
+			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+
+		if (!mddev->size || (mddev->size > rdev->size))
+			mddev->size = rdev->size;
+	}
+
+	return 0;
+}
+
+static int hot_generate_error(mddev_t * mddev, dev_t dev)
+{
+	struct request_queue *q;
+	mdk_rdev_t *rdev;
+
+	if (!mddev->pers)
+		return -ENODEV;
+
+	printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
\n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + return -ENXIO; + } + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + if (!rdev->in_sync) + return -ENODEV; + + q = bdev_get_queue(rdev->bdev); + if (!q) { + MD_BUG(); + return -ENODEV; + } + printk(KERN_INFO "md: okay, generating error!\n"); +// q->oneshot_error = 1; // disabled for now + + return 0; +} + +static int hot_remove_disk(mddev_t * mddev, dev_t dev) +{ + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to remove %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->raid_disk >= 0) + goto busy; + + kick_rdev_from_array(rdev); + md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + bdev_partition_name(rdev->bdev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, dev_t dev) +{ + int err; + unsigned int size; + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (mddev->major_version != 0) { + printk(KERN_WARNING "md%d: HOT_ADD may only be used with" + " version-0 superblocks.\n", + mdidx(mddev)); + return -EINVAL; + } + if (!mddev->pers->hot_add_disk) { + printk(KERN_WARNING + "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = md_import_device (dev, -1, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: error, md_import_device() returned %ld\n", + PTR_ERR(rdev)); + return -EINVAL; + } + + rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + size = calc_dev_size(rdev, mddev->chunk_size); + rdev->size = size; + + if (size < mddev->size) { + printk(KERN_WARNING + "md%d: disk size %llu blocks < array size %llu\n", + mdidx(mddev), (unsigned long long)size, + (unsigned long long)mddev->size); + err = -ENOSPC; + goto abort_export; + } + + if (rdev->faulty) { + printk(KERN_WARNING + "md: can not hot-add faulty %s disk to md%d!\n", + bdev_partition_name(rdev->bdev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + rdev->in_sync = 0; + rdev->desc_nr = -1; + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + + if (rdev->desc_nr == mddev->max_disks) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + rdev->raid_disk = -1; + + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +/* + * set_array_info is used two different ways + * The original usage is when creating a new array. + * In this usage, raid_disks is > = and it together with + * level, size, not_persistent,layout,chunksize determine the + * shape of the array. + * This will always create an array with a type-0.90.0 superblock. + * The newer usage is when assembling an array. + * In this case raid_disks will be 0, and the major_version field is + * use to determine which style super-blocks are to be found on the devices. 
+ * The minor and patch _version numbers are also kept incase the
+ * super_block handler wishes to interpret them.
+ */
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+	if (info->raid_disks == 0) {
+		/* just setting version number for superblock loading */
+		if (info->major_version < 0 ||
+		    info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+		    super_types[info->major_version].name == NULL) {
+			/* maybe try to auto-load a module? */
+			printk(KERN_INFO
+				"md: superblock version %d not known\n",
+				info->major_version);
+			return -EINVAL;
+		}
+		mddev->major_version = info->major_version;
+		mddev->minor_version = info->minor_version;
+		mddev->patch_version = info->patch_version;
+		return 0;
+	}
+	mddev->major_version = MD_MAJOR_VERSION;
+	mddev->minor_version = MD_MINOR_VERSION;
+	mddev->patch_version = MD_PATCHLEVEL_VERSION;
+	mddev->ctime         = get_seconds();
+
+	mddev->level         = info->level;
+	mddev->size          = info->size;
+	mddev->raid_disks    = info->raid_disks;
+	/* don't set __minor, it is determined by which /dev/md* was
+	 * openned
+	 */
+	if (info->state & (1<<MD_SB_CLEAN))
+		mddev->recovery_cp = MaxSector;
+	else
+		mddev->recovery_cp = 0;
+	mddev->persistent    = ! info->not_persistent;
+
+	mddev->layout        = info->layout;
+	mddev->chunk_size    = info->chunk_size;
+
+	mddev->max_disks     = MD_SB_DISKS;
+
+
+	/*
+	 * Generate a 128 bit UUID
+	 */
+	get_random_bytes(mddev->uuid, 16);
+
+	return 0;
+}
+
+static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+{
+	mdk_rdev_t *rdev;
+
+	rdev = find_rdev(mddev, dev);
+	if (!rdev)
+		return 0;
+
+	md_error(mddev, rdev);
+	return 1;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg)
+{
+	unsigned int minor;
+	int err = 0;
+	struct hd_geometry *loc = (struct hd_geometry *) arg;
+	mddev_t *mddev = NULL;
+	kdev_t dev;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	dev = inode->i_rdev;
+	minor = minor(dev);
+	if (minor >= MAX_MD_DEVS) {
+		MD_BUG();
+		return -EINVAL;
+	}
+
+	/*
+	 * Commands dealing with the RAID driver but not any
+	 * particular array:
+	 */
+	switch (cmd)
+	{
+		case RAID_VERSION:
+			err = get_version((void *)arg);
+			goto done;
+
+		case PRINT_RAID_DEBUG:
+			err = 0;
+			md_print_devices();
+			goto done;
+
+#ifndef MODULE
+		case RAID_AUTORUN:
+			err = 0;
+			autostart_arrays();
+			goto done;
+#endif
+		default:;
+	}
+
+	/*
+	 * Commands creating/starting a new array:
+	 */
+
+	mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+	if (!mddev) {
+		BUG();
+		goto abort;
+	}
+
+
+	if (cmd == START_ARRAY) {
+		/* START_ARRAY doesn't need to lock the array as autostart_array
+		 * does the locking, and it could even be a different array
+		 */
+		err = autostart_array(arg);
+		if (err) {
+			printk(KERN_WARNING "md: autostart %s failed!\n",
+				partition_name(arg));
+			goto abort;
+		}
+		goto done;
+	}
+
+	err = mddev_lock(mddev);
+	if (err) {
+		printk(KERN_INFO
+			"md: ioctl lock interrupted, reason %d, cmd %d\n",
+			err, cmd);
+		goto abort;
+	}
+
+	switch (cmd)
+	{
+		case SET_ARRAY_INFO:
+
+			if (!list_empty(&mddev->disks)) {
+				printk(KERN_WARNING
+					"md: array md%d already has disks!\n",
+					mdidx(mddev));
+				err = -EBUSY;
+				goto abort_unlock;
+			}
+			if (mddev->raid_disks) {
+				printk(KERN_WARNING
+					"md: array md%d already initialised!\n",
+					mdidx(mddev));
+				err = -EBUSY;
+				goto abort_unlock;
+			}
+			{
+				mdu_array_info_t info;
+				if (!arg)
+					memset(&info, 0, sizeof(info));
+				else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+					err = -EFAULT;
+					goto abort_unlock;
+				}
+				err = set_array_info(mddev, &info);
+				if (err)
{ + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = put_user(get_capacity(disks[mdidx(mddev)])/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = put_user (get_start_sect(inode->i_bdev), + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... + * ->pers will not be set, to superblock will + * not be updated. + */ + if (err) + do_md_stop (mddev, 0); + goto done_unlock; + } + + default: + if (_IOC_TYPE(cmd) == MD_MAJOR) + printk(KERN_WARNING "md: %s(pid %d) used" + " obsolete MD ioctl, upgrade your" + " software to use new ictls.\n", + current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + mddev_unlock(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ + /* + * Succeed if we can find or allocate a mddev structure. 
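+	 * The reference counting below pairs off as follows (a summary
+	 * of this open/release pair, not an extra rule): mddev_find()
+	 * takes one reference, mddev_get() takes a second that is
+	 * stashed in bd_inode->u.generic_ip, and the mddev_put() here
+	 * drops the first; md_release() later drops the stashed one.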
+ */ + mddev_t *mddev = mddev_find(minor(inode->i_rdev)); + int err = -ENOMEM; + + if (!mddev) + goto out; + + if ((err = mddev_lock(mddev))) + goto put; + + err = 0; + mddev_unlock(mddev); + inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev); + put: + mddev_put(mddev); + out: + return err; +} + +static int md_release(struct inode *inode, struct file * file) +{ + mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) + BUG(); + mddev_put(mddev); + + return 0; +} + +static struct block_device_operations md_fops = +{ + .owner = THIS_MODULE, + .open = md_open, + .release = md_release, + .ioctl = md_ioctl, +}; + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + lock_kernel(); + + /* + * Detach thread + */ + + daemonize(thread->name, mdidx(thread->mddev)); + + current->exit_signal = SIGCHLD; + allow_signal(SIGKILL); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run)(mddev_t *); + + wait_event_interruptible(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags)); + if (current->flags & PF_FREEZE) + refrigerator(PF_IOTHREAD); + + clear_bit(THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->mddev); + blk_run_queues(); + } + if (signal_pending(current)) + flush_signals(current); + } + complete(thread->event); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + if (thread) { + dprintk("md: waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); + } +} + +mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, + const char *name) +{ + mdk_thread_t *thread; + int ret; + struct completion event; + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->mddev = mddev; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +void md_interrupt_thread(mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + md_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +{ + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev), + MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); + + if (!mddev) { + MD_BUG(); + return; + } + + if (!rdev || rdev->faulty) + return; + if 
(!mddev->pers->error_handler)
+		return;
+	mddev->pers->error_handler(mddev,rdev);
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+}
+
+/* seq_file implementation /proc/mdstat */
+
+static void status_unused(struct seq_file *seq)
+{
+	int i = 0;
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+
+	seq_printf(seq, "unused devices: ");
+
+	ITERATE_RDEV_PENDING(rdev,tmp) {
+		i++;
+		seq_printf(seq, "%s ",
+			      bdev_partition_name(rdev->bdev));
+	}
+	if (!i)
+		seq_printf(seq, "<none>");
+
+	seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+	unsigned long max_blocks, resync, res, dt, db, rt;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+	max_blocks = mddev->size;
+
+	/*
+	 * Should not happen.
+	 */
+	if (!max_blocks) {
+		MD_BUG();
+		return;
+	}
+	res = (resync/1024)*1000/(max_blocks/1024 + 1);
+	{
+		int i, x = res/50, y = 20-x;
+		seq_printf(seq, "[");
+		for (i = 0; i < x; i++)
+			seq_printf(seq, "=");
+		seq_printf(seq, ">");
+		for (i = 0; i < y; i++)
+			seq_printf(seq, ".");
+		seq_printf(seq, "] ");
+	}
+	seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+		      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+		       "resync" : "recovery"),
+		      res/10, res % 10, resync, max_blocks);
+
+	/*
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time from mark until now
+	 * db: blocks written from mark until now
+	 * rt: remaining time
+	 */
+	dt = ((jiffies - mddev->resync_mark) / HZ);
+	if (!dt) dt++;
+	db = resync - (mddev->resync_mark_cnt/2);
+	rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+	seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+	seq_printf(seq, " speed=%ldK/sec", db/dt);
+}
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct list_head *tmp;
+	loff_t l = *pos;
+	mddev_t *mddev;
+
+	if (l > 0x10000)
+		return NULL;
+	if (!l--)
+		/* header */
+		return (void*)1;
+
+	spin_lock(&all_mddevs_lock);
+	list_for_each(tmp,&all_mddevs)
+		if (!l--) {
+			mddev = list_entry(tmp, mddev_t, all_mddevs);
+			mddev_get(mddev);
+			spin_unlock(&all_mddevs_lock);
+			return mddev;
+		}
+	spin_unlock(&all_mddevs_lock);
+	return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct list_head *tmp;
+	mddev_t *next_mddev, *mddev = v;
+
+	++*pos;
+	if (v == (void*)2)
+		return NULL;
+
+	spin_lock(&all_mddevs_lock);
+	if (v == (void*)1)
+		tmp = all_mddevs.next;
+	else
+		tmp = mddev->all_mddevs.next;
+	if (tmp != &all_mddevs)
+		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+	else {
+		next_mddev = (void*)2;
+		*pos = 0x10000;
+	}
+	spin_unlock(&all_mddevs_lock);
+
+	if (v != (void*)1)
+		mddev_put(mddev);
+	return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+	mddev_t *mddev = v;
+
+	if (mddev && v != (void*)1 && v != (void*)2)
+		mddev_put(mddev);
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+	mddev_t *mddev = v;
+	sector_t size;
+	struct list_head *tmp2;
+	mdk_rdev_t *rdev;
+	int i;
+
+	if (v == (void*)1) {
+		seq_printf(seq, "Personalities : ");
+		spin_lock(&pers_lock);
+		for (i = 0; i < MAX_PERSONALITY; i++)
+			if (pers[i])
+				seq_printf(seq, "[%s] ", pers[i]->name);
+
+		spin_unlock(&pers_lock);
+		seq_printf(seq, "\n");
+		return 0;
+	}
+	if (v == (void*)2) {
+		status_unused(seq);
+		return 0;
+	}
+
+	if (mddev_lock(mddev)!=0)
+		return -EINTR;
+	if
(mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? "" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + bdev_partition_name(rdev->bdev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %llu blocks", + (unsigned long long)mddev->array_size); + else + seq_printf(seq, "\n %llu blocks", + (unsigned long long)size); + } + + if (mddev->pers) { + mddev->pers->status (seq, mddev); + seq_printf(seq, "\n "); + if (mddev->curr_resync > 2) + status_resync (seq, mddev); + else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + seq_printf(seq, " resync=DELAYED"); + } + + seq_printf(seq, "\n"); + } + mddev_unlock(mddev); + + return 0; +} + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + spin_lock(&pers_lock); + if (pers[pnum]) { + spin_unlock(&pers_lock); + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + spin_unlock(&pers_lock); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + spin_lock(&pers_lock); + pers[pnum] = NULL; + spin_unlock(&pers_lock); + return 0; +} + +void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) +{ + rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; + curr_events = disk_stat_read(disk, read_sectors) + + disk_stat_read(disk, write_sectors) - + disk->sync_io; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + md_wakeup_thread(mddev->thread); + // stop recovery, signal do_sync .... 
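+		/* (A sketch of the calling contract, assumed rather than
+		 * shown in this file: a personality accounts each resync
+		 * bio with md_sync_acct() at submission and its end_io
+		 * handler then calls
+		 *	md_done_sync(mddev, nr_sectors, uptodate);
+		 * so ok == 0 here is how an I/O error aborts the resync.)
+		 */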
+ } +} + + +void md_write_start(mddev_t *mddev) +{ + if (!atomic_read(&mddev->writes_pending)) { + mddev_lock_uninterruptible(mddev); + if (mddev->in_sync) { + mddev->in_sync = 0; + del_timer(&mddev->safemode_timer); + md_update_sb(mddev); + } + atomic_inc(&mddev->writes_pending); + mddev_unlock(mddev); + } else + atomic_inc(&mddev->writes_pending); +} + +void md_write_end(mddev_t *mddev) +{ + if (atomic_dec_and_test(&mddev->writes_pending)) { + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else + mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); + } +} + +static inline void md_enter_safemode(mddev_t *mddev) +{ + mddev_lock_uninterruptible(mddev); + if (mddev->safemode && !atomic_read(&mddev->writes_pending) && + !mddev->in_sync && mddev->recovery_cp == MaxSector) { + mddev->in_sync = 1; + md_update_sb(mddev); + } + mddev_unlock(mddev); + + if (mddev->safemode == 1) + mddev->safemode = 0; +} + +void md_handle_safemode(mddev_t *mddev) +{ + if (signal_pending(current)) { + printk(KERN_INFO "md: md%d in immediate safe mode\n", + mdidx(mddev)); + mddev->safemode = 2; + flush_signals(current); + } + if (mddev->safemode) + md_enter_safemode(mddev); +} + + +DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +static void md_do_sync(mddev_t *mddev) +{ + mddev_t *mddev2; + unsigned int max_sectors, currspeed = 0, + j, window; + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct list_head *tmp; + unsigned long last_check; + + /* just incase thread restarts... */ + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return; + + /* we overload curr_resync somewhat here. + * 0 == not engaged in resync at all + * 2 == checking that there is no conflict with another sync + * 1 == like 2, but have yielded to allow conflicting resync to + * commense + * other == active in resync - this many blocks + */ + do { + mddev->curr_resync = 2; + + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && + match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: delaying resync of md%d" + " until md%d has finished resync (they" + " share one or more physical units)\n", + mdidx(mddev), mdidx(mddev2)); + if (mddev < mddev2) {/* arbitrarily yield */ + mddev->curr_resync = 1; + wake_up(&resync_wait); + } + if (wait_event_interruptible(resync_wait, + mddev2->curr_resync < mddev->curr_resync)) { + flush_signals(current); + mddev_put(mddev2); + goto skip; + } + } + if (mddev->curr_resync == 1) { + mddev_put(mddev2); + break; + } + } + } while (mddev->curr_resync < 2); + + max_sectors = mddev->size << 1; + + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" + " %d KB/sec/disc.\n", sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwith " + "(but not more than %d KB/sec) for reconstruction.\n", + sysctl_speed_limit_max); + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + j = mddev->recovery_cp; + else + j = 0; + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = j; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = 32*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", + window/2,max_sectors/2); + + 
atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); + last_check = 0; + + if (j) + printk(KERN_INFO + "md: resuming recovery of md%d from checkpoint.\n", + mdidx(mddev)); + + while (j < max_sectors) { + int sectors; + + sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); + if (sectors < 0) { + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + goto out; + } + atomic_add(sectors, &mddev->recovery_active); + j += sectors; + if (j>1) mddev->curr_resync = j; + + if (last_check + window > j) + continue; + + last_check = j; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || + test_bit(MD_RECOVERY_ERR, &mddev->recovery)) + break; + + blk_run_queues(); + + repeat: + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (signal_pending(current)) { + /* + * got a signal, exit. + */ + printk(KERN_INFO + "md: md_do_sync() got signal ... exiting\n"); + flush_signals(current); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + } + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. (things like an + * e2fsck being done on the RAID array should execute fast) + */ + cond_resched(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/4); + goto repeat; + } + } + } + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + /* + * this also signals 'finished resyncing' to md_stop + */ + out: + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + /* tell personality that we are finished */ + mddev->pers->sync_request(mddev, max_sectors, 1); + + if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && + mddev->curr_resync > 2 && + mddev->curr_resync > mddev->recovery_cp) { + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + printk(KERN_INFO + "md: checkpointing recovery of md%d.\n", + mdidx(mddev)); + mddev->recovery_cp = mddev->curr_resync; + } else + mddev->recovery_cp = MaxSector; + } + + if (mddev->safemode) + md_enter_safemode(mddev); + skip: + mddev->curr_resync = 0; + set_bit(MD_RECOVERY_DONE, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} + + +/* + * This routine is regularly called by all per-raid-array threads to + * deal with generic issues like resync and super-block update. + * Raid personalities that don't have a thread (linear/raid0) do not + * need this as they never do any recovery or update the superblock. + * + * It does not do any resync itself, but rather "forks" off other threads + * to do that as needed. + * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in + * "->recovery" and create a thread at ->sync_thread. + * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) + * and wakeups up this thread which will reap the thread and finish up. + * This thread also removes any faulty devices (with nr_pending == 0). 
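+ *
+ * Each personality's thread function is expected to call it on every
+ * pass, e.g. (a hypothetical sketch; raid1d is one such function):
+ *
+ *	void raid1d(mddev_t *mddev)
+ *	{
+ *		md_check_recovery(mddev);
+ *		...then service that personality's pending requests...
+ *	}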
+ * + * The overall approach is: + * 1/ if the superblock needs updating, update it. + * 2/ If a recovery thread is running, don't do anything else. + * 3/ If recovery has finished, clean up, possibly marking spares active. + * 4/ If there are any faulty devices, remove them. + * 5/ If array is degraded, try to add spares devices + * 6/ If array has spares or is not in-sync, start a resync thread. + */ +void md_check_recovery(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *rtmp; + + + dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); + + if (mddev->ro) + return; + if ( ! ( + mddev->sb_dirty || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery) + )) + return; + if (mddev_trylock(mddev)==0) { + int spares =0; + if (mddev->sb_dirty) + md_update_sb(mddev); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + /* resync/recovery still happening */ + goto unlock; + if (mddev->sync_thread) { + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + mddev->pers->spare_active(mddev); + } + md_update_sb(mddev); + mddev->recovery = 0; + wake_up(&resync_wait); + goto unlock; + } + if (mddev->recovery) { + /* that's odd.. */ + mddev->recovery = 0; + wake_up(&resync_wait); + } + + /* no recovery is running. + * remove any failed drives, then + * add spares if possible + */ + ITERATE_RDEV(mddev,rdev,rtmp) { + if (rdev->raid_disk >= 0 && + rdev->faulty && + atomic_read(&rdev->nr_pending)==0) { + mddev->pers->hot_remove_disk(mddev, rdev->raid_disk); + rdev->raid_disk = -1; + } + if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync) + spares++; + } + if (mddev->degraded) { + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk < 0 + && !rdev->faulty) { + if (mddev->pers->hot_add_disk(mddev,rdev)) + spares++; + else + break; + } + } + + if (!spares && (mddev->recovery_cp == MaxSector )) { + /* nothing we can do ... */ + goto unlock; + } + if (mddev->pers->sync_request) { + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + if (!spares) + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "md%d_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "md%d: could not start resync" + " thread...\n", + mdidx(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + mddev->recovery = 0; + } else { + md_wakeup_thread(mddev->sync_thread); + } + } + unlock: + mddev_unlock(mddev); + } +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct list_head *tmp; + mddev_t *mddev; + + if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + if (mddev_trylock(mddev)==0) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... 
+ */ + mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + .notifier_call = md_notify_reboot, + .next = NULL, + .priority = INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +int __init md_init(void) +{ + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," + " MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (register_blkdev(MAJOR_NR, "md")) + return -1; + + devfs_mk_dir("md"); + blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, + md_probe, NULL, NULL); + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char name[16]; + sprintf(name, "md/%d", minor); + devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + register_reboot_notifier(&md_notifier); + raid_table_header = register_sysctl_table(raid_root_table, 1); + + md_geninit(); + return (0); +} + + +#ifndef MODULE + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +static dev_t detected_devices[128]; +static int dev_cnt; + +void md_autodetect_dev(dev_t dev) +{ + if (dev_cnt >= 0 && dev_cnt < 127) + detected_devices[dev_cnt++] = dev; +} + + +static void autostart_arrays(void) +{ + mdk_rdev_t *rdev; + int i; + + printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + + for (i = 0; i < dev_cnt; i++) { + dev_t dev = detected_devices[i]; + + rdev = md_import_device(dev,0, 0); + if (IS_ERR(rdev)) { + printk(KERN_ALERT "md: could not import %s!\n", + partition_name(dev)); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + list_add(&rdev->same_set, &pending_raid_disks); + } + dev_cnt = 0; + + autorun_devices(); +} + +#endif + +static __exit void md_exit(void) +{ + int i; + blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); + for (i=0; i < MAX_MD_DEVS; i++) + devfs_remove("md/%d", i); + devfs_remove("md"); + + unregister_blkdev(MAJOR_NR,"md"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("mdstat", NULL); +#endif + for (i = 0; i < MAX_MD_DEVS; i++) { + struct gendisk *disk = disks[i]; + mddev_t *mddev; + if (!disks[i]) + continue; + mddev = disk->private_data; + del_gendisk(disk); + put_disk(disk); + mddev_put(mddev); + } +} + +module_init(md_init) +module_exit(md_exit) + +EXPORT_SYMBOL(register_md_personality); +EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL(md_error); +EXPORT_SYMBOL(md_sync_acct); +EXPORT_SYMBOL(md_done_sync); +EXPORT_SYMBOL(md_write_start); +EXPORT_SYMBOL(md_write_end); +EXPORT_SYMBOL(md_handle_safemode); +EXPORT_SYMBOL(md_register_thread); +EXPORT_SYMBOL(md_unregister_thread); +EXPORT_SYMBOL(md_wakeup_thread); +EXPORT_SYMBOL(md_print_devices); +EXPORT_SYMBOL(md_interrupt_thread); +EXPORT_SYMBOL(md_check_recovery); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/md/orig b/tests/linux/md/orig new file mode 100644 index 0000000..3f5b666 --- /dev/null +++ b/tests/linux/md/orig @@ -0,0 +1,3674 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi 
Oxman, Ingo Molnar
+   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+   - kmod support by: Cyrus Durgin <cider@speakeasy.org>
+   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+   - lots of fixes and improvements to the RAID1/RAID5 and generic
+     RAID code (such as request based resynchronization):
+
+     Neil Brown <neilb@cse.unsw.edu.au>.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   You should have received a copy of the GNU General Public License
+   (for example /usr/src/linux/COPYING); if not, write to the Free
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/bio.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/suspend.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define DEVICE_NR(device) (minor(device))
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#define dprintk(x...) ((void)(DEBUG && printk(x)))
+
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwith if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+	{
+		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
+		.procname	= "speed_limit_min",
+		.data		= &sysctl_speed_limit_min,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
+		.procname	= "speed_limit_max",
+		.data		= &sysctl_speed_limit_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table raid_dir_table[] = {
+	{
+		.ctl_name	= DEV_RAID,
+		.procname	= "raid",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= raid_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table raid_root_table[] = {
+	{
+		.ctl_name	= CTL_DEV,
+		.procname	= "dev",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= raid_dir_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct block_device_operations md_fops;
+
+static struct gendisk *disks[MAX_MD_DEVS];
+
+/*
+ * Enables to iterate over all existing md arrays
+ * all_mddevs_lock protects this list as well as mddev_map.
+ */
+static LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while own
+ * a reference to the current mddev and must mddev_put it.
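+ *
+ * Typical use, as elsewhere in this file:
+ *
+ *	ITERATE_MDDEV(mddev,tmp) {
+ *		...use mddev...
+ *	}
+ *
+ * where breaking out of the loop early leaves a reference that the
+ * breaker must drop with mddev_put().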
+ */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (({ spin_lock(&all_mddevs_lock); \ + tmp = all_mddevs.next; \ + mddev = NULL;}); \ + ({ if (tmp != &all_mddevs) \ + mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ + spin_unlock(&all_mddevs_lock); \ + if (mddev) mddev_put(mddev); \ + mddev = list_entry(tmp, mddev_t, all_mddevs); \ + tmp != &all_mddevs;}); \ + ({ spin_lock(&all_mddevs_lock); \ + tmp = tmp->next;}) \ + ) + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio, bio->bi_size); + return 0; +} + +static inline mddev_t *mddev_get(mddev_t *mddev) +{ + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_put(mddev_t *mddev) +{ + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->raid_disks && list_empty(&mddev->disks)) { + list_del(&mddev->all_mddevs); + mddev_map[mdidx(mddev)] = NULL; + kfree(mddev); + MOD_DEC_USE_COUNT; + } + spin_unlock(&all_mddevs_lock); +} + +static mddev_t * mddev_find(int unit) +{ + mddev_t *mddev, *new = NULL; + + retry: + spin_lock(&all_mddevs_lock); + if (mddev_map[unit]) { + mddev = mddev_get(mddev_map[unit]); + spin_unlock(&all_mddevs_lock); + if (new) + kfree(new); + return mddev; + } + if (new) { + mddev_map[unit] = new; + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + MOD_INC_USE_COUNT; + return new; + } + spin_unlock(&all_mddevs_lock); + + new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + memset(new, 0, sizeof(*new)); + + new->__minor = unit; + init_MUTEX(&new->reconfig_sem); + INIT_LIST_HEAD(&new->disks); + INIT_LIST_HEAD(&new->all_mddevs); + init_timer(&new->safemode_timer); + atomic_set(&new->active, 1); + blk_queue_make_request(&new->queue, md_fail_request); + + goto retry; +} + +static inline int mddev_lock(mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +static inline void mddev_lock_uninterruptible(mddev_t * mddev) +{ + down(&mddev->reconfig_sem); +} + +static inline int mddev_trylock(mddev_t * mddev) +{ + return down_trylock(&mddev->reconfig_sem); +} + +static inline void mddev_unlock(mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->bdev->bd_dev == dev) + return rdev; + } + return NULL; +} + +inline static sector_t calc_dev_sboffset(struct block_device *bdev) +{ + sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + return MD_NEW_SIZE_BLOCKS(size); +} + +static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) +{ + sector_t size; + + size = rdev->sb_offset; + + if (chunk_size) + size &= ~((sector_t)chunk_size/1024 - 1); + return size; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(KERN_ALERT "md: out of memory.\n"); + return -EINVAL; + } + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb_loaded = 0; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } +} + + +static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + if 
(bio->bi_size) + return 1; + + complete((struct completion*)bio->bi_private); + return 0; +} + +static int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw) +{ + struct bio bio; + struct bio_vec vec; + struct completion event; + + bio_init(&bio); + bio.bi_io_vec = &vec; + vec.bv_page = page; + vec.bv_len = size; + vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = size; + bio.bi_bdev = bdev; + bio.bi_sector = sector; + init_completion(&event); + bio.bi_private = &event; + bio.bi_end_io = bi_complete; + submit_bio(rw, &bio); + blk_run_queues(); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio.bi_flags); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + + if (!rdev->sb_page) { + MD_BUG(); + return -EINVAL; + } + if (rdev->sb_loaded) + return 0; + + + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + goto fail; + rdev->sb_loaded = 1; + return 0; + +fail: + printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", + bdev_partition_name(rdev->bdev)); + return -EINVAL; +} + +static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + if ( (sb1->set_uuid0 == sb2->set_uuid0) && + (sb1->set_uuid1 == sb2->set_uuid1) && + (sb1->set_uuid2 == sb2->set_uuid2) && + (sb1->set_uuid3 == sb2->set_uuid3)) + + return 1; + + return 0; +} + + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Handle superblock details. + * We want to be able to handle multiple superblock formats + * so we have a common interface to them all, and an array of + * different handlers. + * We rely on user-space to write the initial superblock, and support + * reading and updating of superblocks. + * Interface methods are: + * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) + * loads and validates a superblock on dev. + * if refdev != NULL, compare superblocks on both devices + * Return: + * 0 - dev has a superblock that is compatible with refdev + * 1 - dev has a superblock that is compatible and newer than refdev + * so dev should be used as the refdev in future + * -EINVAL superblock incompatible or invalid + * -othererror e.g. -EIO + * + * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + * Verify that dev is acceptable into mddev. + * The first time, mddev->raid_disks will be 0, and data from + * dev should be merged in. Subsequent calls check that dev + * is new enough. Return 0 or -EINVAL + * + * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + * Update the superblock for rdev with data in mddev + * This does not write to disc. 
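+ *
+ * A worked example of the freshness rule above (a sketch): if dev
+ * carries an event count of 42 and refdev one of 41, load_super()
+ * returns 1 and dev becomes the refdev for the remaining devices.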
+ * + */ + +struct super_type { + char *name; + struct module *owner; + int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); + int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); + void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); +}; + +/* + * load_super for 0.90.0 + */ +static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ + mdp_super_t *sb; + int ret; + sector_t sb_offset; + + /* + * Calculate the position of the superblock, + * it's at the end of the disk. + * + * It also happens to be a multiple of 4Kb. + */ + sb_offset = calc_dev_sboffset(rdev->bdev); + rdev->sb_offset = sb_offset; + + ret = read_disk_sb(rdev); + if (ret) return ret; + + ret = -EINVAL; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + if (sb->md_magic != MD_SB_MAGIC) { + printk(KERN_ERR "md: invalid raid superblock magic on %s\n", + bdev_partition_name(rdev->bdev)); + goto abort; + } + + if (sb->major_version != 0 || + sb->minor_version != 90) { + printk(KERN_WARNING "Bad version number %d.%d on %s\n", + sb->major_version, sb->minor_version, + bdev_partition_name(rdev->bdev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(KERN_ERR "md: %s: invalid raid minor (%x)\n", + bdev_partition_name(rdev->bdev), sb->md_minor); + goto abort; + } + if (sb->raid_disks <= 0) + goto abort; + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(KERN_WARNING "md: invalid superblock checksum on %s\n", + bdev_partition_name(rdev->bdev)); + goto abort; + } + + rdev->preferred_minor = sb->md_minor; + rdev->data_offset = 0; + + if (sb->level == MULTIPATH) + rdev->desc_nr = -1; + else + rdev->desc_nr = sb->this_disk.number; + + if (refdev == 0) + ret = 1; + else { + __u64 ev1, ev2; + mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + if (!uuid_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + if (!sb_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has same UUID" + " but different superblock to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + ev1 = md_event(sb); + ev2 = md_event(refsb); + if (ev1 > ev2) + ret = 1; + else + ret = 0; + } + rdev->size = calc_dev_size(rdev, sb->chunk_size); + + abort: + return ret; +} + +/* + * validate_super for 0.90.0 + */ +static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_disk_t *desc; + mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + + if (mddev->raid_disks == 0) { + mddev->major_version = 0; + mddev->minor_version = sb->minor_version; + mddev->patch_version = sb->patch_version; + mddev->persistent = ! 
sb->not_persistent;
+ mddev->chunk_size = sb->chunk_size;
+ mddev->ctime = sb->ctime;
+ mddev->utime = sb->utime;
+ mddev->level = sb->level;
+ mddev->layout = sb->layout;
+ mddev->raid_disks = sb->raid_disks;
+ mddev->size = sb->size;
+ mddev->events = md_event(sb);
+
+ if (sb->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else {
+ if (sb->events_hi == sb->cp_events_hi &&
+ sb->events_lo == sb->cp_events_lo) {
+ mddev->recovery_cp = sb->recovery_cp;
+ } else
+ mddev->recovery_cp = 0;
+ }
+
+ memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+ memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+ memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+ memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+ mddev->max_disks = MD_SB_DISKS;
+ } else {
+ __u64 ev1;
+ ev1 = md_event(sb);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+ if (mddev->level != LEVEL_MULTIPATH) {
+ rdev->raid_disk = -1;
+ rdev->in_sync = rdev->faulty = 0;
+ desc = sb->disks + rdev->desc_nr;
+
+ if (desc->state & (1<<MD_DISK_FAULTY))
+ rdev->faulty = 1;
+ else if (desc->state & (1<<MD_DISK_SYNC) &&
+ desc->raid_disk < mddev->raid_disks) {
+ rdev->in_sync = 1;
+ rdev->raid_disk = desc->raid_disk;
+ }
+ }
+ return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ */
+static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_super_t *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int next_spare = mddev->raid_disks;
+
+ /* make rdev->sb match mddev data..
+ *
+ * 1/ zero out disks
+ * 2/ Add info for each disk, keeping track of highest desc_nr
+ * 3/ any empty disks < highest become removed
+ *
+ * disks[0] gets initialised to REMOVED because
+ * we cannot be sure from other fields if it has
+ * been initialised or not.
+ */
+ int highest = 0;
+ int i;
+ int active=0, working=0,failed=0,spare=0,nr_disks=0;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ memset(sb, 0, sizeof(*sb));
+
+ sb->md_magic = MD_SB_MAGIC;
+ sb->major_version = mddev->major_version;
+ sb->minor_version = mddev->minor_version;
+ sb->patch_version = mddev->patch_version;
+ sb->gvalid_words = 0; /* ignored */
+ memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+ memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+ memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+ memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+
+ sb->ctime = mddev->ctime;
+ sb->level = mddev->level;
+ sb->size = mddev->size;
+ sb->raid_disks = mddev->raid_disks;
+ sb->md_minor = mddev->__minor;
+ sb->not_persistent = !mddev->persistent;
+ sb->utime = mddev->utime;
+ sb->state = 0;
+ sb->events_hi = (mddev->events>>32);
+ sb->events_lo = (u32)mddev->events;
+
+ if (mddev->in_sync)
+ {
+ sb->recovery_cp = mddev->recovery_cp;
+ sb->cp_events_hi = (mddev->events>>32);
+ sb->cp_events_lo = (u32)mddev->events;
+ if (mddev->recovery_cp == MaxSector)
+ sb->state = (1<< MD_SB_CLEAN);
+ } else
+ sb->recovery_cp = 0;
+
+ sb->layout = mddev->layout;
+ sb->chunk_size = mddev->chunk_size;
+
+ sb->disks[0].state = (1<<MD_DISK_REMOVED);
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ mdp_disk_t *d;
+ if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ rdev2->desc_nr = rdev2->raid_disk;
+ else
+ rdev2->desc_nr = next_spare++;
+ d = &sb->disks[rdev2->desc_nr];
+ nr_disks++;
+ d->number = rdev2->desc_nr;
+ d->major = MAJOR(rdev2->bdev->bd_dev);
+ d->minor = MINOR(rdev2->bdev->bd_dev);
+ if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty)
+ d->raid_disk = rdev2->raid_disk;
+ else
+ d->raid_disk = rdev2->desc_nr; /* compatibility */
+ if (rdev2->faulty) {
+ d->state = (1<<MD_DISK_FAULTY);
+ failed++;
+ } else if (rdev2->in_sync) {
+ d->state = (1<<MD_DISK_ACTIVE);
+ d->state |= (1<<MD_DISK_SYNC);
+ active++;
+ working++;
+ } else {
+ d->state = 0;
+ spare++;
+ working++;
+ }
+ if (rdev2->desc_nr > highest)
+ highest = rdev2->desc_nr;
+ }
+
+ /* now set the "removed" bit on any non-trailing holes */
+ for (i=0; i<=highest; i++) {
+ mdp_disk_t *d = &sb->disks[i];
+ if (d->state == 0 && d->number == 0) {
+ d->number = i;
+ d->raid_disk = i;
+ d->state = (1<<MD_DISK_REMOVED);
+ }
+ }
+ sb->nr_disks = nr_disks;
+ sb->active_disks = active;
+ sb->working_disks = working;
+ sb->failed_disks = failed;
+ sb->spare_disks = spare;
+
+ sb->this_disk = sb->disks[rdev->desc_nr];
+ sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * version 1 superblock
+ */
+
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+ unsigned int disk_csum, csum;
+ int size = 256 + sb->max_dev*2;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, size, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ struct mdp_superblock_1 *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+ * depeding on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ */
+ switch(minor_version) {
+ case 0:
+ sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+ sb_offset -= 8*2;
+ sb_offset &= ~(4*2);
+ /* convert from sectors to K */
+ sb_offset /= 2;
+ break;
+ case 1:
+ sb_offset = 0;
+ break;
+ case 2:
+ sb_offset = 4;
+ break;
+ default:
+ return -EINVAL;
+ }
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+ sb->major_version != cpu_to_le32(1) ||
+ le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+ le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+ sb->feature_map != 0)
+ return -EINVAL;
+
+ if (calc_sb_1_csum(sb) != sb->sb_csum) {
+ printk("md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+ }
+ rdev->preferred_minor = 0xffff;
+ rdev->data_offset = le64_to_cpu(sb->data_offset);
+
+ if (refdev == 0)
+ return 1;
+ else {
+ __u64 ev1, ev2;
+ struct mdp_superblock_1 *refsb =
+ (struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+ sb->level != refsb->level ||
+ sb->layout != refsb->layout ||
+ sb->chunksize != refsb->chunksize) {
+ printk(KERN_WARNING "md: %s has strangely different"
+ " superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ return -EINVAL;
+ }
+ ev1 = le64_to_cpu(sb->events);
+ ev2 = le64_to_cpu(refsb->events);
+
+ if (ev1 > ev2)
+ return 1;
+ }
+ if (minor_version)
+ rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+ else
+ rdev->size = rdev->sb_offset;
+ if (rdev->size < le64_to_cpu(sb->data_size)/2)
+ return -EINVAL;
+ rdev->size = le64_to_cpu(sb->data_size)/2;
+ if (le32_to_cpu(sb->chunksize))
+ rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+ return 0;
+}
+
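+/*
+ * Worked example of the placement rules above (a sketch, assuming a
+ * hypothetical 10000-sector device; not called from anywhere):
+ */
+#if 0
+static sector_t example_sb_offset_v1(sector_t dev_sectors, int minor_version)
+{
+	sector_t sb_offset = 0;
+
+	switch (minor_version) {
+	case 0:		/* near the end: 10000 -> 9984 sectors -> 4992K */
+		sb_offset = dev_sectors;
+		sb_offset -= 8*2;
+		sb_offset &= ~(4*2);
+		sb_offset /= 2;	/* sectors to K */
+		break;
+	case 1:		/* at the very start of the device */
+		sb_offset = 0;
+		break;
+	case 2:		/* 4K from the start */
+		sb_offset = 4;
+		break;
+	}
+	return sb_offset;
+}
+#endif
+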
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 1;
+ mddev->minor_version = 0;
+ mddev->patch_version = 0;
+ mddev->persistent = 1;
+ mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+ mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+ mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+ mddev->level = le32_to_cpu(sb->level);
+ mddev->layout = le32_to_cpu(sb->layout);
+ mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+ mddev->size = (u32)le64_to_cpu(sb->size);
+ mddev->events = le64_to_cpu(sb->events);
+
+ mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ memcpy(mddev->uuid, sb->set_uuid, 16);
+
+ mddev->max_disks = (4096-256)/2;
+ } else {
+ __u64 ev1;
+ ev1 = le64_to_cpu(sb->events);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+
+ if (mddev->level != LEVEL_MULTIPATH) {
+ int role;
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ switch(role) {
+ case 0xffff: /* spare */
+ rdev->in_sync = 0;
+ rdev->faulty = 0;
+ rdev->raid_disk = -1;
+ break;
+ case 0xfffe: /* faulty */
+ rdev->in_sync = 0;
+ rdev->faulty = 1;
+ rdev->raid_disk = -1;
+ break;
+ default:
+ rdev->in_sync = 1;
+ rdev->faulty = 0;
+ rdev->raid_disk = role;
+ break;
+ }
+ }
+ return 0;
+}
+
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int max_dev, i;
+ /* make rdev->sb match mddev and rdev data. */
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ sb->feature_map = 0;
+ sb->pad0 = 0;
+ memset(sb->pad1, 0, sizeof(sb->pad1));
+ memset(sb->pad2, 0, sizeof(sb->pad2));
+ memset(sb->pad3, 0, sizeof(sb->pad3));
+
+ sb->utime = cpu_to_le64((__u64)mddev->utime);
+ sb->events = cpu_to_le64(mddev->events);
+ if (mddev->in_sync)
+ sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else
+ sb->resync_offset = cpu_to_le64(0);
+
+ max_dev = 0;
+ ITERATE_RDEV(mddev,rdev2,tmp)
+ if (rdev2->desc_nr > max_dev)
+ max_dev = rdev2->desc_nr;
+
+ sb->max_dev = max_dev;
+ for (i=0; i<max_dev; i++)
+ sb->dev_roles[max_dev] = cpu_to_le16(0xfffe);
+
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ i = rdev2->desc_nr;
+ if (rdev2->faulty)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ else if (rdev2->in_sync)
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else
+ sb->dev_roles[i] = cpu_to_le16(0xffff);
+ }
+
+ sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+}
+
+
+struct super_type super_types[] = {
+ [0] = {
+ .name = "0.90.0",
+ .owner = THIS_MODULE,
+ .load_super = super_90_load,
+ .validate_super = super_90_validate,
+ .sync_super = super_90_sync,
+ },
+ [1] = {
+ .name = "md-1",
+ .owner = THIS_MODULE,
+ .load_super = super_1_load,
+ .validate_super = super_1_validate,
+ .sync_super = super_1_sync,
+ },
+};
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev))
+ return 1;
+
+ return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+
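+/*
+ * Dispatch through super_types[] (a sketch; analyze_sbs() below does
+ * the same thing for real): the array index is mddev->major_version,
+ * so 0.90.0 arrays use the super_90_* handlers and "md-1" arrays the
+ * super_1_* ones.
+ */
+#if 0
+static int example_load_one(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	return super_types[mddev->major_version].
+		load_super(rdev, NULL, mddev->minor_version);
+}
+#endif
+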
+static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ same_pdev = match_dev_unit(mddev, rdev);
+ if (same_pdev)
+ printk(KERN_WARNING
+ "md%d: WARNING: %s appears to be on the same physical"
+ " disk as %s. True\n protection against single-disk"
+ " failure might be compromised.\n",
+ mdidx(mddev), bdev_partition_name(rdev->bdev),
+ bdev_partition_name(same_pdev->bdev));
+
+ /* Verify rdev->desc_nr is unique.
+ * If it is -1, assign a free number, else
+ * check number is not in use
+ */
+ if (rdev->desc_nr < 0) {
+ int choice = 0;
+ if (mddev->pers) choice = mddev->raid_disks;
+ while (find_rdev_nr(mddev, choice))
+ choice++;
+ rdev->desc_nr = choice;
+ } else {
+ if (find_rdev_nr(mddev, rdev->desc_nr))
+ return -EBUSY;
+ }
+
+ list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
+ return 0;
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (err)
+ return err;
+ err = bd_claim(bdev, rdev);
+ if (err) {
+ blkdev_put(bdev, BDEV_RAW);
+ return err;
+ }
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ bd_release(bdev);
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",
+ bdev_partition_name(rdev->bdev));
+ if (rdev->mddev)
+ MD_BUG();
+ free_disk_sb(rdev);
+ list_del_init(&rdev->same_set);
+#ifndef MODULE
+ md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+ unlock_rdev(rdev);
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+ mddev->raid_disks = 0;
+ mddev->major_version = 0;
+}
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO
+ "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
+ sb->level, sb->size, sb->nr_disks, sb->raid_disks,
+ sb->md_minor, sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
+ " FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ",
+ bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size,
+ rdev->faulty, rdev->in_sync, rdev->desc_nr);
+ if (rdev->sb_loaded) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb((mdp_super_t*)page_address(rdev->sb_page));
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_loaded) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+
+ dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->sb_offset);
+
+ if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
+ return 0;
+
+ printk("md: write_disk_sb failed for device %s\n",
+ bdev_partition_name(rdev->bdev));
+ return 1;
+}
+
+static void sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ super_types[mddev->major_version].
+ sync_super(mddev, rdev);
+ rdev->sb_loaded = 1;
+ }
+}
+
+static void md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->utime = get_seconds();
+ mddev->events ++;
+
+ if (!mddev->events) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->events --;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (!mddev->persistent)
+ return;
+
+ dprintk(KERN_INFO
+ "md: updating md%d RAID superblock on device (in sync %d)\n",
+ mdidx(mddev),mddev->in_sync);
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ dprintk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ dprintk("(skipping faulty ");
+
+ dprintk("%s ", bdev_partition_name(rdev->bdev));
+ if (!rdev->faulty) {
+ err += write_disk_sb(rdev);
+ } else
+ dprintk(")\n");
+ if (!err && mddev->level == LEVEL_MULTIPATH)
+ /* only need to write one superblock... */
+ break;
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock"
+ " update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR \
+ "md: excessive errors occurred during superblock update, exiting\n");
+ }
+}
+
+/*
+ * Import a device. If 'super_format' >= 0, then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
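+ *
+ * Typical use (a sketch, mirroring autostart_array() below): call
+ *	rdev = md_import_device(dev, 0, 0);
+ * then check IS_ERR(rdev); a successful import is later released
+ * with export_rdev(rdev) or bound with bind_rdev_to_array().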
+ */ +static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +{ + int err; + mdk_rdev_t *rdev; + sector_t size; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", + partition_name(newdev)); + return ERR_PTR(-ENOMEM); + } + memset(rdev, 0, sizeof(*rdev)); + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + err = lock_rdev(rdev, newdev); + if (err) { + printk(KERN_ERR "md: could not lock %s.\n", + partition_name(newdev)); + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + rdev->in_sync = 0; + rdev->data_offset = 0; + atomic_set(&rdev->nr_pending, 0); + + size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + if (!size) { + printk(KERN_WARNING + "md: %s has zero or unknown size, marking faulty!\n", + bdev_partition_name(rdev->bdev)); + err = -EINVAL; + goto abort_free; + } + + if (super_format >= 0) { + err = super_types[super_format]. + load_super(rdev, NULL, super_minor); + if (err == -EINVAL) { + printk(KERN_WARNING + "md: %s has invalid sb, not importing!\n", + bdev_partition_name(rdev->bdev)); + goto abort_free; + } + if (err < 0) { + printk(KERN_WARNING + "md: could not read %s's sb, not importing!\n", + bdev_partition_name(rdev->bdev)); + goto abort_free; + } + } + INIT_LIST_HEAD(&rdev->same_set); + + return rdev; + +abort_free: + if (rdev->sb_page) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return ERR_PTR(err); +} + +/* + * Check a full RAID array for plausibility + */ + + +static int analyze_sbs(mddev_t * mddev) +{ + int i; + struct list_head *tmp; + mdk_rdev_t *rdev, *freshest; + + freshest = NULL; + ITERATE_RDEV(mddev,rdev,tmp) + switch (super_types[mddev->major_version]. + load_super(rdev, freshest, mddev->minor_version)) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + printk( KERN_ERR \ + "md: fatal superblock inconsistency in %s" + " -- removing from array\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + } + + + super_types[mddev->major_version]. + validate_super(mddev, freshest); + + i = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev != freshest) + if (super_types[mddev->major_version]. + validate_super(mddev, rdev)) { + printk(KERN_WARNING "md: kicking non-fresh %s" + " from array!\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + continue; + } + if (mddev->level == LEVEL_MULTIPATH) { + rdev->desc_nr = i++; + rdev->raid_disk = rdev->desc_nr; + rdev->in_sync = 1; + } + } + + + /* + * Check if we can support this RAID array + */ + if (mddev->major_version != MD_MAJOR_VERSION || + mddev->minor_version > MD_MINOR_VERSION) { + printk(KERN_ALERT + "md: md%d: unsupported raid array version %d.%d.%d\n", + mdidx(mddev), mddev->major_version, + mddev->minor_version, mddev->patch_version); + goto abort; + } + + if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) || + (mddev->level == 4) || (mddev->level == 5))) + printk(KERN_ERR "md: md%d: raid array is not clean" + " -- starting background reconstruction\n", + mdidx(mddev)); + + return 0; +abort: + return 1; +} + +static int device_size_calculation(mddev_t * mddev) +{ + int data_disks = 0; + unsigned int readahead; + struct list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. 
+ * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size < mddev->chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size:" + " %lluk < %dk\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->size, + mddev->chunk_size / 1024); + return -EINVAL; + } + } + + switch (mddev->level) { + case LEVEL_MULTIPATH: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case LEVEL_LINEAR: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = mddev->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = mddev->raid_disks-1; + break; + default: + printk(KERN_ERR "md: md%d: unsupported raid level %d\n", + mdidx(mddev), mddev->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = mddev->size * data_disks; + + readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) { + readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks; + if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) + readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; + } else { + // (no multipath branch - it uses the default setting) + if (mddev->level == -3) + readahead = 0; + } + + printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", + mdidx(mddev), readahead*(PAGE_SIZE/1024)); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %ldk\n", + mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); + return 0; +abort: + return 1; +} + +static struct gendisk *md_probe(dev_t dev, int *part, void *data) +{ + static DECLARE_MUTEX(disks_sem); + int unit = MINOR(dev); + mddev_t *mddev = mddev_find(unit); + struct gendisk *disk; + + if (!mddev) + return NULL; + + down(&disks_sem); + if (disks[unit]) { + up(&disks_sem); + mddev_put(mddev); + return NULL; + } + disk = alloc_disk(1); + if (!disk) { + up(&disks_sem); + mddev_put(mddev); + return NULL; + } + disk->major = MD_MAJOR; + disk->first_minor = mdidx(mddev); + sprintf(disk->disk_name, "md%d", mdidx(mddev)); + disk->fops = &md_fops; + disk->private_data = mddev; + disk->queue = &mddev->queue; + add_disk(disk); + disks[mdidx(mddev)] = disk; + up(&disks_sem); + return NULL; +} + +void md_wakeup_thread(mdk_thread_t *thread); + +static void md_safemode_timeout(unsigned long data) +{ + mddev_t *mddev = (mddev_t *) data; + + mddev->safemode = 1; + md_wakeup_thread(mddev->thread); +} + + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct list_head *tmp; + mdk_rdev_t *rdev; + struct gendisk *disk; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Analyze all RAID superblock(s) + */ + if (!mddev->raid_disks && analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->chunk_size; + pnum = level_to_pers(mddev->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We don't + * want to continue the bad practice. 
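+ *
+ * The checks below therefore insist on a power of two between
+ * PAGE_SIZE and MAX_CHUNK_SIZE: 64k passes, because
+ * (1 << ffz(~chunk_size)) == chunk_size holds only when a single
+ * bit is set, while 96k (bits 15 and 16 set) fails that test.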
+ */ + printk(KERN_ERR + "no chunksize specified, see 'man raidtab'\n"); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(KERN_ERR "too big chunk_size: %d > %d\n", + chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(KERN_ERR "too small chunk_size: %d < %ld\n", + chunk_size, PAGE_SIZE); + return -EINVAL; + } + + /* devices must have minimum size of one chunk */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size < chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size:" + " %lluk < %dk\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->size, + chunk_size / 1024); + return -EINVAL; + } + } + } + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + +#ifdef CONFIG_KMOD + if (!pers[pnum]) + { + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + } +#endif + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsector size + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + sync_blockdev(rdev->bdev); + invalidate_bdev(rdev->bdev, 0); + } + + md_probe(mdidx(mddev), NULL, NULL); + disk = disks[mdidx(mddev)]; + if (!disk) + return -ENOMEM; + + spin_lock(&pers_lock); + if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { + spin_unlock(&pers_lock); + printk(KERN_ERR "md: personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + + mddev->pers = pers[pnum]; + spin_unlock(&pers_lock); + + blk_queue_make_request(&mddev->queue, mddev->pers->make_request); + printk("%s: setting max_sectors to %d, segment boundary to %d\n", + disk->disk_name, + chunk_size >> 9, + (chunk_size>>1)-1); + blk_queue_max_sectors(&mddev->queue, chunk_size >> 9); + blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1); + mddev->queue.queuedata = mddev; + + err = mddev->pers->run(mddev); + if (err) { + printk(KERN_ERR "md: pers->run() failed ...\n"); + module_put(mddev->pers->owner); + mddev->pers = NULL; + return -EINVAL; + } + atomic_set(&mddev->writes_pending,0); + mddev->safemode = 0; + mddev->safemode_timer.function = md_safemode_timeout; + mddev->safemode_timer.data = (unsigned long) mddev; + mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ + mddev->in_sync = 1; + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + set_capacity(disk, mddev->array_size<<1); + return 0; +} + +static int restart_array(mddev_t *mddev) +{ + struct gendisk *disk = disks[mdidx(mddev)]; + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->safemode = 0; + mddev->ro = 0; + set_disk_ro(disk, 0); + + printk(KERN_INFO "md: md%d switched to read-write mode.\n", + mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0; + 
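+ /*
+ * The active-count test just below tolerates two references,
+ * presumably the base reference taken in mddev_find() plus the
+ * opener issuing this ioctl; anything above 2 means some other
+ * process still holds /dev/mdX open.
+ */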
struct gendisk *disk = disks[mdidx(mddev)]; + + if (atomic_read(&mddev->active)>2) { + printk("md: md%d still in use.\n",mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + } + + del_timer_sync(&mddev->safemode_timer); + + invalidate_device(mk_kdev(disk->major, disk->first_minor), 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_disk_ro(disk, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_disk_ro(disk, 1); + goto out; + } + module_put(mddev->pers->owner); + mddev->pers = NULL; + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->raid_disks) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev); + } + if (ro) + set_disk_ro(disk, 1); + } + /* + * Free resources if final stop + */ + if (!ro) { + struct gendisk *disk; + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + + export_array(mddev); + + mddev->array_size = 0; + disk = disks[mdidx(mddev)]; + if (disk) + set_capacity(disk, 0); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", + mdidx(mddev)); + err = 0; +out: + return err; +} + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", bdev_partition_name(rdev->bdev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in pending_raid_disks) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(void) +{ + struct list_head candidates; + struct list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = list_entry(pending_raid_disks.next, + mdk_rdev_t, same_set); + + printk(KERN_INFO "md: considering %s ...\n", + bdev_partition_name(rdev0->bdev)); + INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) + if (super_90_load(rdev, rdev0, 0) >= 0) { + printk(KERN_INFO "md: adding %s ...\n", + bdev_partition_name(rdev->bdev)); + list_move(&rdev->same_set, &candidates); + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. 
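+ *
+ * For example, three partitions whose 0.90.0 superblocks share one
+ * UUID and preferred_minor 0 all land on the same candidate list
+ * and, if md0 is still unused, are bound to it and started.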
+ */
+
+ mddev = mddev_find(rdev0->preferred_minor);
+ if (!mddev) {
+ printk(KERN_ERR
+ "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: md%d locked, cannot run\n",
+ mdidx(mddev));
+ else if (mddev->raid_disks || mddev->major_version
+ || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), bdev_partition_name(rdev0->bdev));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+ list_del_init(&rdev->same_set);
+ if (bind_rdev_to_array(rdev, mddev))
+ export_rdev(rdev);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+ /* on success, candidates will be empty, on error
+ * it won't...
+ */
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+ export_rdev(rdev);
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+static int autostart_array(dev_t startdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ start_rdev = md_import_device(startdev, 0, 0);
+ if (IS_ERR(start_rdev)) {
+ printk(KERN_WARNING "md: could not import %s!\n",
+ partition_name(startdev));
+ return err;
+ }
+
+ /* NOTE: this can only work for 0.90.0 superblocks */
+ sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90 ) {
+ printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+ export_rdev(start_rdev);
+ return err;
+ }
+
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not autostart based on faulty %s!\n",
+ bdev_partition_name(start_rdev->bdev));
+ export_rdev(start_rdev);
+ return err;
+ }
+ list_add(&start_rdev->same_set, &pending_raid_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ dev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (!dev)
+ continue;
+ if (dev == startdev)
+ continue;
+ rdev = md_import_device(dev, 0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING "md: could not import %s,"
+ " trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices();
+ return 0;
+
+}
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+ int nr,working,active,failed,spare;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ nr=working=active=failed=spare=0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ nr++;
+ if (rdev->faulty)
+ failed++;
+ else {
+ working++;
+ if (rdev->in_sync)
+ active++;
+ else
+ spare++;
+ }
+ }
+
+ info.major_version = mddev->major_version;
+ info.minor_version = mddev->minor_version;
+ info.patch_version = 1;
+ info.ctime = mddev->ctime;
+ info.level = mddev->level;
+ info.size = mddev->size;
+ info.nr_disks = nr;
+ info.raid_disks = mddev->raid_disks;
+ info.md_minor = mddev->__minor;
+ info.not_persistent= !mddev->persistent;
+
+ info.utime = mddev->utime;
+ info.state = 0;
+ if (mddev->in_sync)
+ info.state = (1<<MD_SB_CLEAN);
+ info.layout = mddev->layout;
+ info.chunk_size = mddev->chunk_size;
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
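+
+/*
+ * Sketch of the matching user-space side (not part of md.c): the
+ * GET_ARRAY_INFO handler above fills an mdu_array_info_t, so a
+ * hypothetical caller could do something like this.
+ */
+#if 0
+#include <sys/ioctl.h>
+#include <linux/raid/md_u.h>
+static int example_query_array(int md_fd)
+{
+	mdu_array_info_t info;
+	if (ioctl(md_fd, GET_ARRAY_INFO, &info) < 0)
+		return -1;
+	/* info.raid_disks, info.level etc. are now valid */
+	return info.nr_disks;
+}
+#endif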
+
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+ mdk_rdev_t *rdev;
+
+ if (copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+
+ rdev = find_rdev_nr(mddev, nr);
+ if (rdev) {
+ info.major = MAJOR(rdev->bdev->bd_dev);
+ info.minor = MINOR(rdev->bdev->bd_dev);
+ info.raid_disk = rdev->raid_disk;
+ info.state = 0;
+ if (rdev->faulty)
+ info.state |= (1<<MD_DISK_FAULTY);
+ else if (rdev->in_sync) {
+ info.state |= (1<<MD_DISK_ACTIVE);
+ info.state |= (1<<MD_DISK_SYNC);
+ }
+ } else {
+ info.major = info.minor = 0;
+ info.raid_disk = -1;
+ info.state = (1<<MD_DISK_REMOVED);
+ }
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ int err;
+ dev_t dev;
+ mdk_rdev_t *rdev;
+
+ dev = MKDEV(info->major,info->minor);
+ if (!mddev->raid_disks) {
+ int err;
+ /* expecting a device which has a superblock */
+ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ int err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(rdev0->bdev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ return err;
+ }
+
+ /*
+ * add_new_disk can be used once the array is assembled
+ * to add "hot spares". They must already have a superblock
+ * written
+ */
+ if (mddev->pers) {
+ int err;
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->in_sync = 0; /* just to be sure */
+ rdev->raid_disk = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ if (mddev->thread)
+ md_wakeup_thread(mddev->thread);
+ return err;
+ }
+
+ /* otherwise, add_new_disk is only allowed
+ * for major_version==0 superblocks
+ */
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ if (!(info->state & (1<<MD_DISK_FAULTY))) {
+ int err;
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->desc_nr = info->number;
+ if (info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+
+ rdev->faulty = 0;
+ if (rdev->raid_disk < mddev->raid_disks)
+ rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
+ else
+ rdev->in_sync = 0;
+
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+
+ if (!mddev->persistent) {
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+ rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ } else
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+
+ if (!mddev->size || (mddev->size > rdev->size))
+ mddev->size = rdev->size;
+ }
+
+ return 0;
+}
+
+static int hot_generate_error(mddev_t * mddev, dev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + return -ENXIO; + } + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + if (!rdev->in_sync) + return -ENODEV; + + q = bdev_get_queue(rdev->bdev); + if (!q) { + MD_BUG(); + return -ENODEV; + } + printk(KERN_INFO "md: okay, generating error!\n"); +// q->oneshot_error = 1; // disabled for now + + return 0; +} + +static int hot_remove_disk(mddev_t * mddev, dev_t dev) +{ + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to remove %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->raid_disk >= 0) + goto busy; + + kick_rdev_from_array(rdev); + md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + bdev_partition_name(rdev->bdev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, dev_t dev) +{ + int err; + unsigned int size; + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (mddev->major_version != 0) { + printk(KERN_WARNING "md%d: HOT_ADD may only be used with" + " version-0 superblocks.\n", + mdidx(mddev)); + return -EINVAL; + } + if (!mddev->pers->hot_add_disk) { + printk(KERN_WARNING + "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = md_import_device (dev, -1, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: error, md_import_device() returned %ld\n", + PTR_ERR(rdev)); + return -EINVAL; + } + + rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + size = calc_dev_size(rdev, mddev->chunk_size); + rdev->size = size; + + if (size < mddev->size) { + printk(KERN_WARNING + "md%d: disk size %llu blocks < array size %llu\n", + mdidx(mddev), (unsigned long long)size, + (unsigned long long)mddev->size); + err = -ENOSPC; + goto abort_export; + } + + if (rdev->faulty) { + printk(KERN_WARNING + "md: can not hot-add faulty %s disk to md%d!\n", + bdev_partition_name(rdev->bdev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + rdev->in_sync = 0; + rdev->desc_nr = -1; + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + + if (rdev->desc_nr == mddev->max_disks) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + rdev->raid_disk = -1; + + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +/* + * set_array_info is used two different ways + * The original usage is when creating a new array. + * In this usage, raid_disks is > = and it together with + * level, size, not_persistent,layout,chunksize determine the + * shape of the array. + * This will always create an array with a type-0.90.0 superblock. + * The newer usage is when assembling an array. + * In this case raid_disks will be 0, and the major_version field is + * use to determine which style super-blocks are to be found on the devices. 
+ * The minor and patch _version numbers are also kept incase the
+ * super_block handler wishes to interpret them.
+ */
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (info->raid_disks == 0) {
+ /* just setting version number for superblock loading */
+ if (info->major_version < 0 ||
+ info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+ super_types[info->major_version].name == NULL) {
+ /* maybe try to auto-load a module? */
+ printk(KERN_INFO
+ "md: superblock version %d not known\n",
+ info->major_version);
+ return -EINVAL;
+ }
+ mddev->major_version = info->major_version;
+ mddev->minor_version = info->minor_version;
+ mddev->patch_version = info->patch_version;
+ return 0;
+ }
+ mddev->major_version = MD_MAJOR_VERSION;
+ mddev->minor_version = MD_MINOR_VERSION;
+ mddev->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->ctime = get_seconds();
+
+ mddev->level = info->level;
+ mddev->size = info->size;
+ mddev->raid_disks = info->raid_disks;
+ /* don't set __minor, it is determined by which /dev/md* was
+ * openned
+ */
+ if (info->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else
+ mddev->recovery_cp = 0;
+ mddev->persistent = ! info->not_persistent;
+
+ mddev->layout = info->layout;
+ mddev->chunk_size = info->chunk_size;
+
+ mddev->max_disks = MD_SB_DISKS;
+
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(mddev->uuid, 16);
+
+ return 0;
+}
+
+static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return 0;
+
+ md_error(mddev, rdev);
+ return 1;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = minor(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev) {
+ BUG();
+ goto abort;
+ }
+
+
+ if (cmd == START_ARRAY) {
+ /* START_ARRAY doesn't need to lock the array as autostart_array
+ * does the locking, and it could even be a different array
+ */
+ err = autostart_array(arg);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name(arg));
+ goto abort;
+ }
+ goto done;
+ }
+
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO
+ "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array md%d already has disks!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array md%d already initialised!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ {
+ mdu_array_info_t info;
+ if (!arg)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err)
{ + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = put_user(get_capacity(disks[mdidx(mddev)])/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = put_user (get_start_sect(inode->i_bdev), + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... + * ->pers will not be set, to superblock will + * not be updated. + */ + if (err) + do_md_stop (mddev, 0); + goto done_unlock; + } + + default: + if (_IOC_TYPE(cmd) == MD_MAJOR) + printk(KERN_WARNING "md: %s(pid %d) used" + " obsolete MD ioctl, upgrade your" + " software to use new ictls.\n", + current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + mddev_unlock(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ + /* + * Succeed if we can find or allocate a mddev structure. 
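+ * Reference accounting (a sketch of the flow below): mddev_find()
+ * returns with ->active raised, mddev_get() stores a second reference
+ * in bd_inode->u.generic_ip for md_release() to drop, and the final
+ * mddev_put() below releases the lookup reference again.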
+ */ + mddev_t *mddev = mddev_find(minor(inode->i_rdev)); + int err = -ENOMEM; + + if (!mddev) + goto out; + + if ((err = mddev_lock(mddev))) + goto put; + + err = 0; + mddev_unlock(mddev); + inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev); + put: + mddev_put(mddev); + out: + return err; +} + +static int md_release(struct inode *inode, struct file * file) +{ + mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) + BUG(); + mddev_put(mddev); + + return 0; +} + +static struct block_device_operations md_fops = +{ + .owner = THIS_MODULE, + .open = md_open, + .release = md_release, + .ioctl = md_ioctl, +}; + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + lock_kernel(); + + /* + * Detach thread + */ + + daemonize(thread->name, mdidx(thread->mddev)); + + current->exit_signal = SIGCHLD; + allow_signal(SIGKILL); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run)(mddev_t *); + + wait_event_interruptible(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags)); + if (current->flags & PF_FREEZE) + refrigerator(PF_IOTHREAD); + + clear_bit(THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->mddev); + blk_run_queues(); + } + if (signal_pending(current)) + flush_signals(current); + } + complete(thread->event); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + if (thread) { + dprintk("md: waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); + } +} + +mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, + const char *name) +{ + mdk_thread_t *thread; + int ret; + struct completion event; + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->mddev = mddev; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +void md_interrupt_thread(mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + md_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +{ + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev), + MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); + + if (!mddev) { + MD_BUG(); + return; + } + + if (!rdev || rdev->faulty) + return; + if 
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +{ + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev), + MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); + + if (!mddev) { + MD_BUG(); + return; + } + + if (!rdev || rdev->faulty) + return; + if (!mddev->pers->error_handler) + return; + mddev->pers->error_handler(mddev,rdev); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} + +/* seq_file implementation /proc/mdstat */ + +static void status_unused(struct seq_file *seq) +{ + int i = 0; + mdk_rdev_t *rdev; + struct list_head *tmp; + + seq_printf(seq, "unused devices: "); + + ITERATE_RDEV_PENDING(rdev,tmp) { + i++; + seq_printf(seq, "%s ", + bdev_partition_name(rdev->bdev)); + } + if (!i) + seq_printf(seq, "<none>"); + + seq_printf(seq, "\n"); +} + + +static void status_resync(struct seq_file *seq, mddev_t * mddev) +{ + unsigned long max_blocks, resync, res, dt, db, rt; + + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + max_blocks = mddev->size; + + /* + * Should not happen. + */ + if (!max_blocks) { + MD_BUG(); + return; + } + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + seq_printf(seq, "["); + for (i = 0; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? + "resync" : "recovery"), + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/dt); +} + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + mddev_t *mddev; + + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + spin_lock(&all_mddevs_lock); + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, mddev_t, all_mddevs); + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + return mddev; + } + spin_unlock(&all_mddevs_lock); + return (void*)2;/* tail */ +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + mddev_t *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + spin_lock(&all_mddevs_lock); + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + spin_unlock(&all_mddevs_lock); + + if (v != (void*)1) + mddev_put(mddev); + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + mddev_t *mddev = v; + + if (mddev && v != (void*)1 && v != (void*)2) + mddev_put(mddev); +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + mddev_t *mddev = v; + sector_t size; + struct list_head *tmp2; + mdk_rdev_t *rdev; + int i; + + if (v == (void*)1) { + seq_printf(seq, "Personalities : "); + spin_lock(&pers_lock); + for (i = 0; i < MAX_PERSONALITY; i++) + if (pers[i]) + seq_printf(seq, "[%s] ", pers[i]->name); + + spin_unlock(&pers_lock); + seq_printf(seq, "\n"); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + if (mddev_lock(mddev)!=0) + return -EINTR; + if
(mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? "" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + bdev_partition_name(rdev->bdev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %llu blocks", + (unsigned long long)mddev->array_size); + else + seq_printf(seq, "\n %llu blocks", + (unsigned long long)size); + } + + if (mddev->pers) { + mddev->pers->status (seq, mddev); + seq_printf(seq, "\n "); + if (mddev->curr_resync > 2) + status_resync (seq, mddev); + else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + seq_printf(seq, " resync=DELAYED"); + } + + seq_printf(seq, "\n"); + } + mddev_unlock(mddev); + + return 0; +} + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + spin_lock(&pers_lock); + if (pers[pnum]) { + spin_unlock(&pers_lock); + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + spin_unlock(&pers_lock); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + spin_lock(&pers_lock); + pers[pnum] = NULL; + spin_unlock(&pers_lock); + return 0; +} + +void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) +{ + rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; + curr_events = disk_stat_read(disk, read_sectors) + + disk_stat_read(disk, write_sectors) - + disk->sync_io; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + md_wakeup_thread(mddev->thread); + // stop recovery, signal do_sync .... 
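The status_resync() arithmetic above can be tried standalone: res is progress in tenths of a percent (the operand order keeps the intermediate products inside an unsigned long and the +1 guards against division by zero), the bar has 20 cells of 5% each, and the ETA is blocks-left divided by the recent rate. A self-contained sketch under those assumptions; show_progress is an illustrative name, not md.c code:

#include <stdio.h>

/* Reproduce the /proc/mdstat progress-line arithmetic from
 * status_resync() for given block counts and one timing sample.
 */
static void show_progress(unsigned long resync, unsigned long max_blocks,
			  unsigned long dt /* seconds since mark */,
			  unsigned long db /* blocks done since mark */)
{
	/* tenths of a percent; operand order avoids overflow, +1 avoids /0 */
	unsigned long res = (resync/1024)*1000/(max_blocks/1024 + 1);
	int i, x = res/50;			/* filled cells out of 20 */
	unsigned long rt;

	printf("[");
	for (i = 0; i < x; i++)   printf("=");
	printf(">");
	for (i = 0; i < 20-x; i++) printf(".");
	printf("] %lu.%lu%% (%lu/%lu)", res/10, res%10, resync, max_blocks);

	if (!dt) dt++;
	rt = (dt * ((max_blocks - resync) / (db/100 + 1))) / 100;
	printf(" finish=%lu.%lumin speed=%luK/sec\n",
	       rt / 60, (rt % 60)/6, db/dt);
}

int main(void)
{
	show_progress(300000, 1000000, 60, 60000);
	return 0;
}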
+ } +} + + +void md_write_start(mddev_t *mddev) +{ + if (!atomic_read(&mddev->writes_pending)) { + mddev_lock_uninterruptible(mddev); + if (mddev->in_sync) { + mddev->in_sync = 0; + del_timer(&mddev->safemode_timer); + md_update_sb(mddev); + } + atomic_inc(&mddev->writes_pending); + mddev_unlock(mddev); + } else + atomic_inc(&mddev->writes_pending); +} + +void md_write_end(mddev_t *mddev) +{ + if (atomic_dec_and_test(&mddev->writes_pending)) { + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else + mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); + } +} + +static inline void md_enter_safemode(mddev_t *mddev) +{ + mddev_lock_uninterruptible(mddev); + if (mddev->safemode && !atomic_read(&mddev->writes_pending) && + !mddev->in_sync && mddev->recovery_cp == MaxSector) { + mddev->in_sync = 1; + md_update_sb(mddev); + } + mddev_unlock(mddev); + + if (mddev->safemode == 1) + mddev->safemode = 0; +} + +void md_handle_safemode(mddev_t *mddev) +{ + if (signal_pending(current)) { + printk(KERN_INFO "md: md%d in immediate safe mode\n", + mdidx(mddev)); + mddev->safemode = 2; + flush_signals(current); + } + if (mddev->safemode) + md_enter_safemode(mddev); +} + + +DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +static void md_do_sync(mddev_t *mddev) +{ + mddev_t *mddev2; + unsigned int max_sectors, currspeed = 0, + j, window; + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct list_head *tmp; + unsigned long last_check; + + /* just in case thread restarts... */ + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return; + + /* we overload curr_resync somewhat here. + * 0 == not engaged in resync at all + * 2 == checking that there is no conflict with another sync + * 1 == like 2, but have yielded to allow conflicting resync to + * commence + * other == active in resync - this many blocks + */ + do { + mddev->curr_resync = 2; + + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && + match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: delaying resync of md%d" + " until md%d has finished resync (they" + " share one or more physical units)\n", + mdidx(mddev), mdidx(mddev2)); + if (mddev < mddev2) {/* arbitrarily yield */ + mddev->curr_resync = 1; + wake_up(&resync_wait); + } + if (wait_event_interruptible(resync_wait, + mddev2->curr_resync < mddev->curr_resync)) { + flush_signals(current); + mddev_put(mddev2); + goto skip; + } + } + if (mddev->curr_resync == 1) { + mddev_put(mddev2); + break; + } + } + } while (mddev->curr_resync < 2); + + max_sectors = mddev->size << 1; + + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" + " %d KB/sec/disc.\n", sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwidth " + "(but not more than %d KB/sec) for reconstruction.\n", + sysctl_speed_limit_max); + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + j = mddev->recovery_cp; + else + j = 0; + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = j; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = 32*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", + window/2,max_sectors/2); +
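The mark[]/mark_cnt[] ring just initialised above is what makes currspeed a rate over roughly the last SYNC_MARKS*SYNC_MARK_STEP (about 30 seconds) rather than the whole run: at each step the oldest sample becomes the reference point resync_mark/resync_mark_cnt. A small userspace sketch of that sliding-window estimator; struct rate_window and the window_* helpers are illustrative names, not md.c code:

#include <stdio.h>

#define SYNC_MARKS 10

/* Ring of (time, count) samples; the oldest surviving sample is the
 * reference point for the current speed, as in md_do_sync().
 */
struct rate_window {
	unsigned long t[SYNC_MARKS];	/* sample times, in seconds */
	unsigned long c[SYNC_MARKS];	/* sectors done at that time */
	int last;			/* slot holding the newest sample */
};

static void window_init(struct rate_window *w, unsigned long now,
			unsigned long count)
{
	int m;
	for (m = 0; m < SYNC_MARKS; m++) {
		w->t[m] = now;
		w->c[m] = count;
	}
	w->last = 0;
}

/* overwrite the oldest slot (the one after 'last') with a new sample */
static void window_step(struct rate_window *w, unsigned long now,
			unsigned long count)
{
	int next = (w->last + 1) % SYNC_MARKS;
	w->t[next] = now;
	w->c[next] = count;
	w->last = next;
}

static unsigned long window_speed(struct rate_window *w, unsigned long now,
				  unsigned long count)
{
	int oldest = (w->last + 1) % SYNC_MARKS;
	/* +1 guards against division by zero, as in the kernel code */
	return (count - w->c[oldest]) / (now - w->t[oldest] + 1) + 1;
}

int main(void)
{
	struct rate_window w;
	unsigned long t, done = 0;

	window_init(&w, 0, 0);
	for (t = 3; t <= 60; t += 3) {	/* one mark every 3 seconds */
		done += 3000;		/* 1000 sectors/sec workload */
		window_step(&w, t, done);
		printf("t=%2lus speed=%lu sectors/sec\n",
		       t, window_speed(&w, t, done));
	}
	return 0;
}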
atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); + last_check = 0; + + if (j) + printk(KERN_INFO + "md: resuming recovery of md%d from checkpoint.\n", + mdidx(mddev)); + + while (j < max_sectors) { + int sectors; + + sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); + if (sectors < 0) { + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + goto out; + } + atomic_add(sectors, &mddev->recovery_active); + j += sectors; + if (j>1) mddev->curr_resync = j; + + if (last_check + window > j) + continue; + + last_check = j; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || + test_bit(MD_RECOVERY_ERR, &mddev->recovery)) + break; + + blk_run_queues(); + + repeat: + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (signal_pending(current)) { + /* + * got a signal, exit. + */ + printk(KERN_INFO + "md: md_do_sync() got signal ... exiting\n"); + flush_signals(current); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + } + + /* + * this loop exits only when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. (things like an + * e2fsck being done on the RAID array should execute fast) + */ + cond_resched(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/4); + goto repeat; + } + } + } + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + /* + * this also signals 'finished resyncing' to md_stop + */ + out: + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + /* tell personality that we are finished */ + mddev->pers->sync_request(mddev, max_sectors, 1); + + if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && + mddev->curr_resync > 2 && + mddev->curr_resync > mddev->recovery_cp) { + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + printk(KERN_INFO + "md: checkpointing recovery of md%d.\n", + mdidx(mddev)); + mddev->recovery_cp = mddev->curr_resync; + } else + mddev->recovery_cp = MaxSector; + } + + if (mddev->safemode) + md_enter_safemode(mddev); + skip: + mddev->curr_resync = 0; + set_bit(MD_RECOVERY_DONE, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} + + +/* + * This routine is regularly called by all per-raid-array threads to + * deal with generic issues like resync and super-block update. + * Raid personalities that don't have a thread (linear/raid0) do not + * need this as they never do any recovery or update the superblock. + * + * It does not do any resync itself, but rather "forks" off other threads + * to do that as needed. + * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in + * "->recovery" and create a thread at ->sync_thread. + * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) + * and wakes up this thread which will reap the thread and finish up. + * This thread also removes any faulty devices (with nr_pending == 0).
+ * + * The overall approach is: + * 1/ if the superblock needs updating, update it. + * 2/ If a recovery thread is running, don't do anything else. + * 3/ If recovery has finished, clean up, possibly marking spares active. + * 4/ If there are any faulty devices, remove them. + * 5/ If array is degraded, try to add spare devices + * 6/ If array has spares or is not in-sync, start a resync thread. + */ +void md_check_recovery(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *rtmp; + + + dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); + + if (mddev->ro) + return; + if ( ! ( + mddev->sb_dirty || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery) + )) + return; + if (mddev_trylock(mddev)==0) { + int spares =0; + if (mddev->sb_dirty) + md_update_sb(mddev); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + /* resync/recovery still happening */ + goto unlock; + if (mddev->sync_thread) { + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + mddev->pers->spare_active(mddev); + } + md_update_sb(mddev); + mddev->recovery = 0; + wake_up(&resync_wait); + goto unlock; + } + if (mddev->recovery) { + /* that's odd.. */ + mddev->recovery = 0; + wake_up(&resync_wait); + } + + /* no recovery is running. + * remove any failed drives, then + * add spares if possible + */ + ITERATE_RDEV(mddev,rdev,rtmp) { + if (rdev->raid_disk >= 0 && + rdev->faulty && + atomic_read(&rdev->nr_pending)==0) { + mddev->pers->hot_remove_disk(mddev, rdev->raid_disk); + rdev->raid_disk = -1; + } + if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync) + spares++; + } + if (mddev->degraded) { + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk < 0 + && !rdev->faulty) { + if (mddev->pers->hot_add_disk(mddev,rdev)) + spares++; + else + break; + } + } + + if (!spares && (mddev->recovery_cp == MaxSector )) { + /* nothing we can do ... */ + goto unlock; + } + if (mddev->pers->sync_request) { + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + if (!spares) + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "md%d_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "md%d: could not start resync" + " thread...\n", + mdidx(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + mddev->recovery = 0; + } else { + md_wakeup_thread(mddev->sync_thread); + } + } + unlock: + mddev_unlock(mddev); + } +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct list_head *tmp; + mddev_t *mddev; + + if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + if (mddev_trylock(mddev)==0) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ...
+ */ + mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + .notifier_call = md_notify_reboot, + .next = NULL, + .priority = INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +int __init md_init(void) +{ + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," + " MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (register_blkdev(MAJOR_NR, "md")) + return -1; + + devfs_mk_dir("md"); + blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, + md_probe, NULL, NULL); + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char name[16]; + sprintf(name, "md/%d", minor); + devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + register_reboot_notifier(&md_notifier); + raid_table_header = register_sysctl_table(raid_root_table, 1); + + md_geninit(); + return (0); +} + + +#ifndef MODULE + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +static dev_t detected_devices[128]; +static int dev_cnt; + +void md_autodetect_dev(dev_t dev) +{ + if (dev_cnt >= 0 && dev_cnt < 127) + detected_devices[dev_cnt++] = dev; +} + + +static void autostart_arrays(void) +{ + mdk_rdev_t *rdev; + int i; + + printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + + for (i = 0; i < dev_cnt; i++) { + dev_t dev = detected_devices[i]; + + rdev = md_import_device(dev,0, 0); + if (IS_ERR(rdev)) { + printk(KERN_ALERT "md: could not import %s!\n", + partition_name(dev)); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + list_add(&rdev->same_set, &pending_raid_disks); + } + dev_cnt = 0; + + autorun_devices(); +} + +#endif + +static __exit void md_exit(void) +{ + int i; + blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); + for (i=0; i < MAX_MD_DEVS; i++) + devfs_remove("md/%d", i); + devfs_remove("md"); + + unregister_blkdev(MAJOR_NR,"md"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("mdstat", NULL); +#endif + for (i = 0; i < MAX_MD_DEVS; i++) { + struct gendisk *disk = disks[i]; + mddev_t *mddev; + if (!disks[i]) + continue; + mddev = disk->private_data; + del_gendisk(disk); + put_disk(disk); + mddev_put(mddev); + } +} + +module_init(md_init) +module_exit(md_exit) + +EXPORT_SYMBOL(register_md_personality); +EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL(md_error); +EXPORT_SYMBOL(md_sync_acct); +EXPORT_SYMBOL(md_done_sync); +EXPORT_SYMBOL(md_write_start); +EXPORT_SYMBOL(md_write_end); +EXPORT_SYMBOL(md_handle_safemode); +EXPORT_SYMBOL(md_register_thread); +EXPORT_SYMBOL(md_unregister_thread); +EXPORT_SYMBOL(md_wakeup_thread); +EXPORT_SYMBOL(md_print_devices); +EXPORT_SYMBOL(md_interrupt_thread); +EXPORT_SYMBOL(md_check_recovery); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/md/patch b/tests/linux/md/patch new file mode 100644 index 0000000..1370009 --- /dev/null +++ b/tests/linux/md/patch @@ -0,0 +1,117 @@ +*************** +*** 1453,1542 **** + return 1; + } + +- #undef OLD_LEVEL +- +- static int device_size_calculation(mddev_t * mddev) +- { +- int data_disks = 0; +- unsigned int readahead; +- struct list_head *tmp; +- mdk_rdev_t 
*rdev; +- +- /* +- * Do device size calculation. Bail out if too small. +- * (we have to do this after having validated chunk_size, +- * because device size has to be modulo chunk_size) +- */ +- +- ITERATE_RDEV(mddev,rdev,tmp) { +- if (rdev->faulty) +- continue; +- if (rdev->size < mddev->chunk_size / 1024) { +- printk(KERN_WARNING +- "md: Dev %s smaller than chunk_size:" +- " %lluk < %dk\n", +- bdev_partition_name(rdev->bdev), +- (unsigned long long)rdev->size, +- mddev->chunk_size / 1024); +- return -EINVAL; +- } +- } +- +- switch (mddev->level) { +- case LEVEL_MULTIPATH: +- data_disks = 1; +- break; +- case -3: +- data_disks = 1; +- break; +- case -2: +- data_disks = 1; +- break; +- case LEVEL_LINEAR: +- zoned_raid_size(mddev); +- data_disks = 1; +- break; +- case 0: +- zoned_raid_size(mddev); +- data_disks = mddev->raid_disks; +- break; +- case 1: +- data_disks = 1; +- break; +- case 4: +- case 5: +- data_disks = mddev->raid_disks-1; +- break; +- default: +- printk(KERN_ERR "md: md%d: unsupported raid level %d\n", +- mdidx(mddev), mddev->level); +- goto abort; +- } +- if (!md_size[mdidx(mddev)]) +- md_size[mdidx(mddev)] = mddev->size * data_disks; +- +- readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; +- if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) { +- readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks; +- if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) +- readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; +- } else { +- // (no multipath branch - it uses the default setting) +- if (mddev->level == -3) +- readahead = 0; +- } +- +- printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", +- mdidx(mddev), readahead*(PAGE_SIZE/1024)); +- +- printk(KERN_INFO +- "md%d: %d data-disks, max readahead per data-disk: %ldk\n", +- mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); +- return 0; +- abort: +- return 1; +- } +- + static struct gendisk *md_probe(dev_t dev, int *part, void *data) + { + static DECLARE_MUTEX(disks_sem); +--- 1436,1441 ---- + return 1; + } + + static struct gendisk *md_probe(dev_t dev, int *part, void *data) + { + static DECLARE_MUTEX(disks_sem); +*************** +*** 1664,1672 **** + } + } + +- if (device_size_calculation(mddev)) +- return -EINVAL; +- + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md +--- 1571,1576 ---- + } + } + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md diff --git a/tests/linux/md/rediff b/tests/linux/md/rediff new file mode 100644 index 0000000..fc27949 --- /dev/null +++ b/tests/linux/md/rediff @@ -0,0 +1,101 @@ +@@ -1453,90 +1436,6 @@ + return 1; + } + +-#undef OLD_LEVEL +- +-static int device_size_calculation(mddev_t * mddev) +-{ +- int data_disks = 0; +- unsigned int readahead; +- struct list_head *tmp; +- mdk_rdev_t *rdev; +- +- /* +- * Do device size calculation. Bail out if too small. 
+- * (we have to do this after having validated chunk_size, +- * because device size has to be modulo chunk_size) +- */ +- +- ITERATE_RDEV(mddev,rdev,tmp) { +- if (rdev->faulty) +- continue; +- if (rdev->size < mddev->chunk_size / 1024) { +- printk(KERN_WARNING +- "md: Dev %s smaller than chunk_size:" +- " %lluk < %dk\n", +- bdev_partition_name(rdev->bdev), +- (unsigned long long)rdev->size, +- mddev->chunk_size / 1024); +- return -EINVAL; +- } +- } +- +- switch (mddev->level) { +- case LEVEL_MULTIPATH: +- data_disks = 1; +- break; +- case -3: +- data_disks = 1; +- break; +- case -2: +- data_disks = 1; +- break; +- case LEVEL_LINEAR: +- zoned_raid_size(mddev); +- data_disks = 1; +- break; +- case 0: +- zoned_raid_size(mddev); +- data_disks = mddev->raid_disks; +- break; +- case 1: +- data_disks = 1; +- break; +- case 4: +- case 5: +- data_disks = mddev->raid_disks-1; +- break; +- default: +- printk(KERN_ERR "md: md%d: unsupported raid level %d\n", +- mdidx(mddev), mddev->level); +- goto abort; +- } +- if (!md_size[mdidx(mddev)]) +- md_size[mdidx(mddev)] = mddev->size * data_disks; +- +- readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; +- if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) { +- readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks; +- if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) +- readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2; +- } else { +- // (no multipath branch - it uses the default setting) +- if (mddev->level == -3) +- readahead = 0; +- } +- +- printk(KERN_INFO "md%d: max total readahead window set to %ldk\n", +- mdidx(mddev), readahead*(PAGE_SIZE/1024)); +- +- printk(KERN_INFO +- "md%d: %d data-disks, max readahead per data-disk: %ldk\n", +- mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024)); +- return 0; +-abort: +- return 1; +-} +- + static struct gendisk *md_probe(dev_t dev, int *part, void *data) + { + static DECLARE_MUTEX(disks_sem); +@@ -1664,9 +1571,6 @@ + } + } + +- if (device_size_calculation(mddev)) +- return -EINVAL; +- + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md diff --git a/tests/linux/md/replace b/tests/linux/md/replace new file mode 100644 index 0000000..e69de29 diff --git a/tests/linux/md/wmerge b/tests/linux/md/wmerge new file mode 100644 index 0000000..6aadb61 --- /dev/null +++ b/tests/linux/md/wmerge @@ -0,0 +1,3591 @@ +/* + md.c : Multiple Devices driver for Linux + Copyright (C) 1998, 1999, 2000 Ingo Molnar + + completely rewritten, based on the MD driver code from Marc Zyngier + + Changes: + + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher + - Devfs support by Richard Gooch + + - lots of fixes and improvements to the RAID1/RAID5 and generic + RAID code (such as request based resynchronization): + + Neil Brown . + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/linkage.h> +#include <linux/raid/md.h> +#include <linux/sysctl.h> +#include <linux/bio.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/buffer_head.h> /* for invalidate_bdev */ +#include <linux/suspend.h> + +#include <linux/init.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> + +#include <asm/unaligned.h> + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define DEVICE_NR(device) (minor(device)) + +#include <linux/blk.h> + +#define DEBUG 0 +#define dprintk(x...) ((void)(DEBUG && printk(x))) + + +#ifndef MODULE +static void autostart_arrays (void); +#endif + +static mdk_personality_t *pers[MAX_PERSONALITY]; +static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 1000 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwidth if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 1000; +static int sysctl_speed_limit_max = 200000; + +static struct ctl_table_header *raid_table_header; + +static ctl_table raid_table[] = { + { + .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, + .procname = "speed_limit_min", + .data = &sysctl_speed_limit_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, + .procname = "speed_limit_max", + .data = &sysctl_speed_limit_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table raid_dir_table[] = { + { + .ctl_name = DEV_RAID, + .procname = "raid", + .maxlen = 0, + .mode = 0555, + .child = raid_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table raid_root_table[] = { + { + .ctl_name = CTL_DEV, + .procname = "dev", + .maxlen = 0, + .mode = 0555, + .child = raid_dir_table, + }, + { .ctl_name = 0 } +}; + +static struct block_device_operations md_fops; + +static struct gendisk *disks[MAX_MD_DEVS]; + +/* + * Enables iteration over all existing md arrays + * all_mddevs_lock protects this list as well as mddev_map. + */ +static LIST_HEAD(all_mddevs); +static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED; + + +/* + * iterates through all used mddevs in the system. + * We take care to grab the all_mddevs_lock whenever navigating + * the list, and to always hold a refcount when unlocked. + * Any code which breaks out of this loop while owning + * a reference to the current mddev must mddev_put it.
+ */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (({ spin_lock(&all_mddevs_lock); \ + tmp = all_mddevs.next; \ + mddev = NULL;}); \ + ({ if (tmp != &all_mddevs) \ + mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ + spin_unlock(&all_mddevs_lock); \ + if (mddev) mddev_put(mddev); \ + mddev = list_entry(tmp, mddev_t, all_mddevs); \ + tmp != &all_mddevs;}); \ + ({ spin_lock(&all_mddevs_lock); \ + tmp = tmp->next;}) \ + ) + +static mddev_t *mddev_map[MAX_MD_DEVS]; + +static int md_fail_request (request_queue_t *q, struct bio *bio) +{ + bio_io_error(bio, bio->bi_size); + return 0; +} + +static inline mddev_t *mddev_get(mddev_t *mddev) +{ + atomic_inc(&mddev->active); + return mddev; +} + +static void mddev_put(mddev_t *mddev) +{ + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) + return; + if (!mddev->raid_disks && list_empty(&mddev->disks)) { + list_del(&mddev->all_mddevs); + mddev_map[mdidx(mddev)] = NULL; + kfree(mddev); + MOD_DEC_USE_COUNT; + } + spin_unlock(&all_mddevs_lock); +} + +static mddev_t * mddev_find(int unit) +{ + mddev_t *mddev, *new = NULL; + + retry: + spin_lock(&all_mddevs_lock); + if (mddev_map[unit]) { + mddev = mddev_get(mddev_map[unit]); + spin_unlock(&all_mddevs_lock); + if (new) + kfree(new); + return mddev; + } + if (new) { + mddev_map[unit] = new; + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + MOD_INC_USE_COUNT; + return new; + } + spin_unlock(&all_mddevs_lock); + + new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + memset(new, 0, sizeof(*new)); + + new->__minor = unit; + init_MUTEX(&new->reconfig_sem); + INIT_LIST_HEAD(&new->disks); + INIT_LIST_HEAD(&new->all_mddevs); + init_timer(&new->safemode_timer); + atomic_set(&new->active, 1); + blk_queue_make_request(&new->queue, md_fail_request); + + goto retry; +} + +static inline int mddev_lock(mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +static inline void mddev_lock_uninterruptible(mddev_t * mddev) +{ + down(&mddev->reconfig_sem); +} + +static inline int mddev_trylock(mddev_t * mddev) +{ + return down_trylock(&mddev->reconfig_sem); +} + +static inline void mddev_unlock(mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + +static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->bdev->bd_dev == dev) + return rdev; + } + return NULL; +} + +inline static sector_t calc_dev_sboffset(struct block_device *bdev) +{ + sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + return MD_NEW_SIZE_BLOCKS(size); +} + +static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) +{ + sector_t size; + + size = rdev->sb_offset; + + if (chunk_size) + size &= ~((sector_t)chunk_size/1024 - 1); + return size; +} + +static int alloc_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) + MD_BUG(); + + rdev->sb_page = alloc_page(GFP_KERNEL); + if (!rdev->sb_page) { + printk(KERN_ALERT "md: out of memory.\n"); + return -EINVAL; + } + + return 0; +} + +static void free_disk_sb(mdk_rdev_t * rdev) +{ + if (rdev->sb_page) { + page_cache_release(rdev->sb_page); + rdev->sb_loaded = 0; + rdev->sb_page = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } +} + + +static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + if 
(bio->bi_size) + return 1; + + complete((struct completion*)bio->bi_private); + return 0; +} + +static int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw) +{ + struct bio bio; + struct bio_vec vec; + struct completion event; + + bio_init(&bio); + bio.bi_io_vec = &vec; + vec.bv_page = page; + vec.bv_len = size; + vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = size; + bio.bi_bdev = bdev; + bio.bi_sector = sector; + init_completion(&event); + bio.bi_private = &event; + bio.bi_end_io = bi_complete; + submit_bio(rw, &bio); + blk_run_queues(); + wait_for_completion(&event); + + return test_bit(BIO_UPTODATE, &bio.bi_flags); +} + +static int read_disk_sb(mdk_rdev_t * rdev) +{ + + if (!rdev->sb_page) { + MD_BUG(); + return -EINVAL; + } + if (rdev->sb_loaded) + return 0; + + + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + goto fail; + rdev->sb_loaded = 1; + return 0; + +fail: + printk(KERN_ERR "md: disabled device %s, could not read superblock.\n", + bdev_partition_name(rdev->bdev)); + return -EINVAL; +} + +static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + if ( (sb1->set_uuid0 == sb2->set_uuid0) && + (sb1->set_uuid1 == sb2->set_uuid1) && + (sb1->set_uuid2 == sb2->set_uuid2) && + (sb1->set_uuid3 == sb2->set_uuid3)) + + return 1; + + return 0; +} + + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static unsigned int calc_sb_csum(mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Handle superblock details. + * We want to be able to handle multiple superblock formats + * so we have a common interface to them all, and an array of + * different handlers. + * We rely on user-space to write the initial superblock, and support + * reading and updating of superblocks. + * Interface methods are: + * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) + * loads and validates a superblock on dev. + * if refdev != NULL, compare superblocks on both devices + * Return: + * 0 - dev has a superblock that is compatible with refdev + * 1 - dev has a superblock that is compatible and newer than refdev + * so dev should be used as the refdev in future + * -EINVAL superblock incompatible or invalid + * -othererror e.g. -EIO + * + * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + * Verify that dev is acceptable into mddev. + * The first time, mddev->raid_disks will be 0, and data from + * dev should be merged in. Subsequent calls check that dev + * is new enough. Return 0 or -EINVAL + * + * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + * Update the superblock for rdev with data in mddev + * This does not write to disc. 
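The interface described in the comment above is dispatched through a small method table, one entry per on-disk superblock format, indexed by mddev->major_version (struct super_type, defined just below). A toy sketch of the same dispatch pattern with stand-in types; toy_super_ops and the load_v*/sync_v* functions are made up for illustration and are not the md structures:

#include <stdio.h>

/* Per-format function pointers selected by an integer version,
 * mirroring the super_types[] table below.
 */
struct toy_super_ops {
	const char *name;
	int  (*load)(int dev);
	void (*sync)(int dev);
};

static int  load_v0(int dev) { printf("load 0.90.0 sb from dev %d\n", dev); return 0; }
static void sync_v0(int dev) { printf("sync 0.90.0 sb to dev %d\n", dev); }
static int  load_v1(int dev) { printf("load v1 sb from dev %d\n", dev); return 0; }
static void sync_v1(int dev) { printf("sync v1 sb to dev %d\n", dev); }

static struct toy_super_ops toy_super_types[] = {
	[0] = { .name = "0.90.0", .load = load_v0, .sync = sync_v0 },
	[1] = { .name = "md-1",   .load = load_v1, .sync = sync_v1 },
};

int main(void)
{
	int version = 1;	/* would come from mddev->major_version */
	struct toy_super_ops *ops = &toy_super_types[version];
	printf("using handler %s\n", ops->name);
	ops->load(3);
	ops->sync(3);
	return 0;
}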
+ * + */ + +struct super_type { + char *name; + struct module *owner; + int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); + int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); + void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); +}; + +/* + * load_super for 0.90.0 + */ +static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ + mdp_super_t *sb; + int ret; + sector_t sb_offset; + + /* + * Calculate the position of the superblock, + * it's at the end of the disk. + * + * It also happens to be a multiple of 4Kb. + */ + sb_offset = calc_dev_sboffset(rdev->bdev); + rdev->sb_offset = sb_offset; + + ret = read_disk_sb(rdev); + if (ret) return ret; + + ret = -EINVAL; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + if (sb->md_magic != MD_SB_MAGIC) { + printk(KERN_ERR "md: invalid raid superblock magic on %s\n", + bdev_partition_name(rdev->bdev)); + goto abort; + } + + if (sb->major_version != 0 || + sb->minor_version != 90) { + printk(KERN_WARNING "Bad version number %d.%d on %s\n", + sb->major_version, sb->minor_version, + bdev_partition_name(rdev->bdev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk(KERN_ERR "md: %s: invalid raid minor (%x)\n", + bdev_partition_name(rdev->bdev), sb->md_minor); + goto abort; + } + if (sb->raid_disks <= 0) + goto abort; + + if (calc_sb_csum(sb) != sb->sb_csum) { + printk(KERN_WARNING "md: invalid superblock checksum on %s\n", + bdev_partition_name(rdev->bdev)); + goto abort; + } + + rdev->preferred_minor = sb->md_minor; + rdev->data_offset = 0; + + if (sb->level == MULTIPATH) + rdev->desc_nr = -1; + else + rdev->desc_nr = sb->this_disk.number; + + if (refdev == 0) + ret = 1; + else { + __u64 ev1, ev2; + mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + if (!uuid_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + if (!sb_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has same UUID" + " but different superblock to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + ev1 = md_event(sb); + ev2 = md_event(refsb); + if (ev1 > ev2) + ret = 1; + else + ret = 0; + } + rdev->size = calc_dev_size(rdev, sb->chunk_size); + + abort: + return ret; +} + +/* + * validate_super for 0.90.0 + */ +static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_disk_t *desc; + mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + + if (mddev->raid_disks == 0) { + mddev->major_version = 0; + mddev->minor_version = sb->minor_version; + mddev->patch_version = sb->patch_version; + mddev->persistent = ! 
sb->not_persistent; + mddev->chunk_size = sb->chunk_size; + mddev->ctime = sb->ctime; + mddev->utime = sb->utime; + mddev->level = sb->level; + mddev->layout = sb->layout; + mddev->raid_disks = sb->raid_disks; + mddev->size = sb->size; + mddev->events = md_event(sb); + + if (sb->state & (1<<MD_SB_CLEAN)) + mddev->recovery_cp = MaxSector; + else { + if (sb->events_hi == sb->cp_events_hi && + sb->events_lo == sb->cp_events_lo) { + mddev->recovery_cp = sb->recovery_cp; + } else + mddev->recovery_cp = 0; + } + + memcpy(mddev->uuid+0, &sb->set_uuid0, 4); + memcpy(mddev->uuid+4, &sb->set_uuid1, 4); + memcpy(mddev->uuid+8, &sb->set_uuid2, 4); + memcpy(mddev->uuid+12,&sb->set_uuid3, 4); + + mddev->max_disks = MD_SB_DISKS; + } else { + __u64 ev1; + ev1 = md_event(sb); + ++ev1; + if (ev1 < mddev->events) + return -EINVAL; + } + if (mddev->level != LEVEL_MULTIPATH) { + rdev->raid_disk = -1; + rdev->in_sync = rdev->faulty = 0; + desc = sb->disks + rdev->desc_nr; + + if (desc->state & (1<<MD_DISK_FAULTY)) + rdev->faulty = 1; + else if (desc->state & (1<<MD_DISK_SYNC) && + desc->raid_disk < mddev->raid_disks) { + rdev->in_sync = 1; + rdev->raid_disk = desc->raid_disk; + } + } + return 0; +} + +/* + * sync_super for 0.90.0 + */ +static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_super_t *sb; + struct list_head *tmp; + mdk_rdev_t *rdev2; + int next_spare = mddev->raid_disks; + + /* make rdev->sb match mddev data.. + * + * 1/ zero out disks + * 2/ Add info for each disk, keeping track of highest desc_nr + * 3/ any empty disks < highest become removed + * + * disks[0] gets initialised to REMOVED because + * we cannot be sure from other fields if it has + * been initialised or not. + */ + int highest = 0; + int i; + int active=0, working=0,failed=0,spare=0,nr_disks=0; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + memset(sb, 0, sizeof(*sb)); + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = mddev->major_version; + sb->minor_version = mddev->minor_version; + sb->patch_version = mddev->patch_version; + sb->gvalid_words = 0; /* ignored */ + memcpy(&sb->set_uuid0, mddev->uuid+0, 4); + memcpy(&sb->set_uuid1, mddev->uuid+4, 4); + memcpy(&sb->set_uuid2, mddev->uuid+8, 4); + memcpy(&sb->set_uuid3, mddev->uuid+12,4); + + sb->ctime = mddev->ctime; + sb->level = mddev->level; + sb->size = mddev->size; + sb->raid_disks = mddev->raid_disks; + sb->md_minor = mddev->__minor; + sb->not_persistent = !mddev->persistent; + sb->utime = mddev->utime; + sb->state = 0; + sb->events_hi = (mddev->events>>32); + sb->events_lo = (u32)mddev->events; + + if (mddev->in_sync) + { + sb->recovery_cp = mddev->recovery_cp; + sb->cp_events_hi = (mddev->events>>32); + sb->cp_events_lo = (u32)mddev->events; + if (mddev->recovery_cp == MaxSector) + sb->state = (1<< MD_SB_CLEAN); + } else + sb->recovery_cp = 0; + + sb->layout = mddev->layout; + sb->chunk_size = mddev->chunk_size; + + sb->disks[0].state = (1<<MD_DISK_REMOVED); + ITERATE_RDEV(mddev,rdev2,tmp) { + mdp_disk_t *d; + if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) + rdev2->desc_nr = rdev2->raid_disk; + else + rdev2->desc_nr = next_spare++; + d = &sb->disks[rdev2->desc_nr]; + nr_disks++; + d->number = rdev2->desc_nr; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty) + d->raid_disk = rdev2->raid_disk; + else + d->raid_disk = rdev2->desc_nr; /* compatibility */ + if (rdev2->faulty) { + d->state = (1<<MD_DISK_FAULTY); + failed++; + } else if (rdev2->in_sync) { + d->state = (1<<MD_DISK_ACTIVE); + d->state |= (1<<MD_DISK_SYNC); + active++; + working++; + } else { + d->state = 0; + spare++; + working++; + } + if (rdev2->desc_nr > highest) + highest = rdev2->desc_nr; + } + + /* now set the "removed" bit on any non-trailing holes */
+ for (i=0; i<highest+1; i++) { + mdp_disk_t *d = &sb->disks[i]; + if (d->state == 0 && d->number == 0) { + d->number = i; + d->raid_disk = i; + d->state = (1<<MD_DISK_REMOVED); + } + } + sb->nr_disks = nr_disks; + sb->active_disks = active; + sb->working_disks = working; + sb->failed_disks = failed; + sb->spare_disks = spare; + + sb->this_disk = sb->disks[rdev->desc_nr]; + sb->sb_csum = calc_sb_csum(sb); +} + +/* + * version 1 superblock + */ + +static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +{ + unsigned int disk_csum, csum; + int size = 256 + sb->max_dev*2; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, size, 0); + sb->sb_csum = disk_csum; + return csum; +} + +static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +{ + struct mdp_superblock_1 *sb; + int ret; + sector_t sb_offset; + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + */ + switch(minor_version) { + case 0: + sb_offset = rdev->bdev->bd_inode->i_size >> 9; + sb_offset -= 8*2; + sb_offset &= ~(4*2); + /* convert from sectors to K */ + sb_offset /= 2; + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4; + break; + default: + return -EINVAL; + } + rdev->sb_offset = sb_offset; + + ret = read_disk_sb(rdev); + if (ret) return ret; + + + sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || + sb->major_version != cpu_to_le32(1) || + le32_to_cpu(sb->max_dev) > (4096-256)/2 || + le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || + sb->feature_map != 0) + return -EINVAL; + + if (calc_sb_1_csum(sb) != sb->sb_csum) { + printk("md: invalid superblock checksum on %s\n", + bdev_partition_name(rdev->bdev)); + return -EINVAL; + } + rdev->preferred_minor = 0xffff; + rdev->data_offset = le64_to_cpu(sb->data_offset); + + if (refdev == 0) + return 1; + else { + __u64 ev1, ev2; + struct mdp_superblock_1 *refsb = + (struct mdp_superblock_1*)page_address(refdev->sb_page); + + if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || + sb->level != refsb->level || + sb->layout != refsb->layout || + sb->chunksize != refsb->chunksize) { + printk(KERN_WARNING "md: %s has strangely different" + " superblock to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + return -EINVAL; + } + ev1 = le64_to_cpu(sb->events); + ev2 = le64_to_cpu(refsb->events); + + if (ev1 > ev2) + return 1; + } + if (minor_version) + rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; + else + rdev->size = rdev->sb_offset; + if (rdev->size < le64_to_cpu(sb->data_size)/2) + return -EINVAL; + rdev->size = le64_to_cpu(sb->data_size)/2; + if (le32_to_cpu(sb->chunksize)) + rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); + return 0; +} +
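Both calc_sb_csum() and calc_sb_1_csum() above use the same trick: save and zero the superblock's own checksum field, checksum the whole block, then restore the saved value so the in-memory copy is unchanged. A userspace sketch of that pattern, with a plain byte sum standing in for the kernel's csum_partial(); toy_sb and calc_toy_csum are illustrative names:

#include <stdio.h>

struct toy_sb {
	unsigned char payload[16];
	unsigned int  sb_csum;
};

static unsigned int byte_sum(const void *p, unsigned long n)
{
	const unsigned char *b = p;
	unsigned int s = 0;
	while (n--)
		s += *b++;
	return s;
}

static unsigned int calc_toy_csum(struct toy_sb *sb)
{
	unsigned int disk_csum = sb->sb_csum;	/* remember stored value */
	unsigned int csum;

	sb->sb_csum = 0;			/* exclude field from sum */
	csum = byte_sum(sb, sizeof(*sb));
	sb->sb_csum = disk_csum;		/* restore */
	return csum;
}

int main(void)
{
	struct toy_sb sb = { .payload = "md superblock!" };
	sb.sb_csum = calc_toy_csum(&sb);
	printf("stored csum %u, verify %s\n", sb.sb_csum,
	       calc_toy_csum(&sb) == sb.sb_csum ? "ok" : "FAILED");
	return 0;
}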
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + if (mddev->raid_disks == 0) { + mddev->major_version = 1; + mddev->minor_version = 0; + mddev->patch_version = 0; + mddev->persistent = 1; + mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; + mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); + mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->level = le32_to_cpu(sb->level); + mddev->layout = le32_to_cpu(sb->layout); + mddev->raid_disks = le32_to_cpu(sb->raid_disks); + mddev->size = (u32)le64_to_cpu(sb->size); + mddev->events = le64_to_cpu(sb->events); + + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + memcpy(mddev->uuid, sb->set_uuid, 16); + + mddev->max_disks = (4096-256)/2; + } else { + __u64 ev1; + ev1 = le64_to_cpu(sb->events); + ++ev1; + if (ev1 < mddev->events) + return -EINVAL; + } + + if (mddev->level != LEVEL_MULTIPATH) { + int role; + rdev->desc_nr = le32_to_cpu(sb->dev_number); + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + switch(role) { + case 0xffff: /* spare */ + rdev->in_sync = 0; + rdev->faulty = 0; + rdev->raid_disk = -1; + break; + case 0xfffe: /* faulty */ + rdev->in_sync = 0; + rdev->faulty = 1; + rdev->raid_disk = -1; + break; + default: + rdev->in_sync = 1; + rdev->faulty = 0; + rdev->raid_disk = role; + break; + } + } + return 0; +} + +static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct mdp_superblock_1 *sb; + struct list_head *tmp; + mdk_rdev_t *rdev2; + int max_dev, i; + /* make rdev->sb match mddev and rdev data. */ + + sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + + sb->feature_map = 0; + sb->pad0 = 0; + memset(sb->pad1, 0, sizeof(sb->pad1)); + memset(sb->pad2, 0, sizeof(sb->pad2)); + memset(sb->pad3, 0, sizeof(sb->pad3)); + + sb->utime = cpu_to_le64((__u64)mddev->utime); + sb->events = cpu_to_le64(mddev->events); + if (mddev->in_sync) + sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else + sb->resync_offset = cpu_to_le64(0); + + max_dev = 0; + ITERATE_RDEV(mddev,rdev2,tmp) + if (rdev2->desc_nr > max_dev) + max_dev = rdev2->desc_nr; + + sb->max_dev = max_dev; + for (i=0; i<max_dev; i++) + sb->dev_roles[max_dev] = cpu_to_le16(0xfffe); + + ITERATE_RDEV(mddev,rdev2,tmp) { + i = rdev2->desc_nr; + if (rdev2->faulty) + sb->dev_roles[i] = cpu_to_le16(0xfffe); + else if (rdev2->in_sync) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else + sb->dev_roles[i] = cpu_to_le16(0xffff); + } + + sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ +} + + +struct super_type super_types[] = { + [0] = { + .name = "0.90.0", + .owner = THIS_MODULE, + .load_super = super_90_load, + .validate_super = super_90_validate, + .sync_super = super_90_sync, + }, + [1] = { + .name = "md-1", + .owner = THIS_MODULE, + .load_super = super_1_load, + .validate_super = super_1_validate, + .sync_super = super_1_sync, + }, +}; + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (rdev->bdev->bd_contains == dev->bdev->bd_contains) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev)) + return 1; + + return 0; +} + +static LIST_HEAD(pending_raid_disks); + +static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return -EINVAL; + } + same_pdev = match_dev_unit(mddev, rdev); + if (same_pdev) + printk(KERN_WARNING + "md%d: WARNING: %s appears to be on the same physical" + " disk as %s. True\n protection against single-disk" + " failure might be compromised.\n", + mdidx(mddev), bdev_partition_name(rdev->bdev), + bdev_partition_name(same_pdev->bdev)); +
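In the version-1 format handled by super_1_validate()/super_1_sync() above, dev_roles[] maps a device number to a raid slot, with 0xffff meaning spare and 0xfffe meaning faulty. A tiny decode sketch; describe_role is an illustrative name, not md.c code:

#include <stdio.h>

/* 0xffff = spare, 0xfffe = faulty, anything else = active raid slot */
static const char *describe_role(unsigned short role)
{
	static char buf[32];
	switch (role) {
	case 0xffff: return "spare";
	case 0xfffe: return "faulty";
	default:
		snprintf(buf, sizeof(buf), "active in slot %u", role);
		return buf;
	}
}

int main(void)
{
	unsigned short roles[] = { 0, 1, 0xffff, 0xfffe };
	unsigned i;
	for (i = 0; i < sizeof(roles)/sizeof(roles[0]); i++)
		printf("dev %u: %s\n", i, describe_role(roles[i]));
	return 0;
}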
+ /* Verify rdev->desc_nr is unique. + * If it is -1, assign a free number, else + * check number is not in use + */ + if (rdev->desc_nr < 0) { + int choice = 0; + if (mddev->pers) choice = mddev->raid_disks; + while (find_rdev_nr(mddev, choice)) + choice++; + rdev->desc_nr = choice; + } else { + if (find_rdev_nr(mddev, rdev->desc_nr)) + return -EBUSY; + } + + list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev)); + return 0; +} + +static void unbind_rdev_from_array(mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del_init(&rdev->same_set); + printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev)); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +{ + int err = 0; + struct block_device *bdev; + + bdev = bdget(dev); + if (!bdev) + return -ENOMEM; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW); + if (err) + return err; + err = bd_claim(bdev, rdev); + if (err) { + blkdev_put(bdev, BDEV_RAW); + return err; + } + rdev->bdev = bdev; + return err; +} + +static void unlock_rdev(mdk_rdev_t *rdev) +{ + struct block_device *bdev = rdev->bdev; + rdev->bdev = NULL; + if (!bdev) + MD_BUG(); + bd_release(bdev); + blkdev_put(bdev, BDEV_RAW); +} + +void md_autodetect_dev(dev_t dev); + +static void export_rdev(mdk_rdev_t * rdev) +{ + printk(KERN_INFO "md: export_rdev(%s)\n", + bdev_partition_name(rdev->bdev)); + if (rdev->mddev) + MD_BUG(); + free_disk_sb(rdev); + list_del_init(&rdev->same_set); +#ifndef MODULE + md_autodetect_dev(rdev->bdev->bd_dev); +#endif + unlock_rdev(rdev); + kfree(rdev); +} + +static void kick_rdev_from_array(mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array(mddev_t *mddev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (!list_empty(&mddev->disks)) + MD_BUG(); + mddev->raid_disks = 0; + mddev->major_version = 0; +} + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(KERN_INFO + "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", + sb->level, sb->size, sb->nr_disks, sb->raid_disks, + sb->md_minor, sb->layout, sb->chunk_size); + printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" + " FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)sb->events_lo); + + printk(KERN_INFO); + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || + desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(KERN_INFO "md: THIS: "); + print_desc(&sb->this_disk); + +} +
+static void print_rdev(mdk_rdev_t *rdev) +{ + printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ", + bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size, + rdev->faulty, rdev->in_sync, rdev->desc_nr); + if (rdev->sb_loaded) { + printk(KERN_INFO "md: rdev superblock:\n"); + print_sb((mdp_super_t*)page_address(rdev->sb_page)); + } else + printk(KERN_INFO "md: no rdev superblock!\n"); +} + +void md_print_devices(void) +{ + struct list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk("md: **********************************\n"); + printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); + printk("md: **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", bdev_partition_name(rdev->bdev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk("md: **********************************\n"); + printk("\n"); +} + + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + + if (!rdev->sb_loaded) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + + dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->sb_offset); + + if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) + return 0; + + printk("md: write_disk_sb failed for device %s\n", + bdev_partition_name(rdev->bdev)); + return 1; +} + +static void sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + super_types[mddev->major_version]. + sync_super(mddev, rdev); + rdev->sb_loaded = 1; + } +} + +static void md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct list_head *tmp; + mdk_rdev_t *rdev; + + mddev->sb_dirty = 0; +repeat: + mddev->utime = get_seconds(); + mddev->events ++; + + if (!mddev->events) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->events --; + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (!mddev->persistent) + return; + + dprintk(KERN_INFO + "md: updating md%d RAID superblock on device (in sync %d)\n", + mdidx(mddev),mddev->in_sync); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + dprintk(KERN_INFO "md: "); + if (rdev->faulty) + dprintk("(skipping faulty "); + + dprintk("%s ", bdev_partition_name(rdev->bdev)); + if (!rdev->faulty) { + err += write_disk_sb(rdev); + } else + dprintk(")\n"); + if (!err && mddev->level == LEVEL_MULTIPATH) + /* only need to write one superblock... */ + break; + } + if (err) { + if (--count) { + printk(KERN_ERR "md: errors occurred during superblock" + " update, repeating\n"); + goto repeat; + } + printk(KERN_ERR \ + "md: excessive errors occurred during superblock update, exiting\n"); + } +} + +/* + * Import a device. If 'super_format' >= 0, then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set.
+ */ +static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +{ + int err; + mdk_rdev_t *rdev; + sector_t size; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk(KERN_ERR "md: could not alloc mem for %s!\n", + partition_name(newdev)); + return ERR_PTR(-ENOMEM); + } + memset(rdev, 0, sizeof(*rdev)); + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + err = lock_rdev(rdev, newdev); + if (err) { + printk(KERN_ERR "md: could not lock %s.\n", + partition_name(newdev)); + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + rdev->in_sync = 0; + rdev->data_offset = 0; + atomic_set(&rdev->nr_pending, 0); + + size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + if (!size) { + printk(KERN_WARNING + "md: %s has zero or unknown size, marking faulty!\n", + bdev_partition_name(rdev->bdev)); + err = -EINVAL; + goto abort_free; + } + + if (super_format >= 0) { + err = super_types[super_format]. + load_super(rdev, NULL, super_minor); + if (err == -EINVAL) { + printk(KERN_WARNING + "md: %s has invalid sb, not importing!\n", + bdev_partition_name(rdev->bdev)); + goto abort_free; + } + if (err < 0) { + printk(KERN_WARNING + "md: could not read %s's sb, not importing!\n", + bdev_partition_name(rdev->bdev)); + goto abort_free; + } + } + INIT_LIST_HEAD(&rdev->same_set); + + return rdev; + +abort_free: + if (rdev->sb_page) { + if (rdev->bdev) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return ERR_PTR(err); +} + +/* + * Check a full RAID array for plausibility + */ + + +static int analyze_sbs(mddev_t * mddev) +{ + int i; + struct list_head *tmp; + mdk_rdev_t *rdev, *freshest; + + freshest = NULL; + ITERATE_RDEV(mddev,rdev,tmp) + switch (super_types[mddev->major_version]. + load_super(rdev, freshest, mddev->minor_version)) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + printk( KERN_ERR \ + "md: fatal superblock inconsistency in %s" + " -- removing from array\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + } + + + super_types[mddev->major_version]. + validate_super(mddev, freshest); + + i = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev != freshest) + if (super_types[mddev->major_version]. 
+ validate_super(mddev, rdev)) { + printk(KERN_WARNING "md: kicking non-fresh %s" + " from array!\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + continue; + } + if (mddev->level == LEVEL_MULTIPATH) { + rdev->desc_nr = i++; + rdev->raid_disk = rdev->desc_nr; + rdev->in_sync = 1; + } + } + + + /* + * Check if we can support this RAID array + */ + if (mddev->major_version != MD_MAJOR_VERSION || + mddev->minor_version > MD_MINOR_VERSION) { + printk(KERN_ALERT + "md: md%d: unsupported raid array version %d.%d.%d\n", + mdidx(mddev), mddev->major_version, + mddev->minor_version, mddev->patch_version); + goto abort; + } + + if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) || + (mddev->level == 4) || (mddev->level == 5))) + printk(KERN_ERR "md: md%d: raid array is not clean" + " -- starting background reconstruction\n", + mdidx(mddev)); + + return 0; +abort: + return 1; +} + +<<<---|||#undef OLD_LEVEL + +===--->>>static struct gendisk *md_probe(dev_t dev, int *part, void *data) +{ + static DECLARE_MUTEX(disks_sem); + int unit = MINOR(dev); + mddev_t *mddev = mddev_find(unit); + struct gendisk *disk; + + if (!mddev) + return NULL; + + down(&disks_sem); + if (disks[unit]) { + up(&disks_sem); + mddev_put(mddev); + return NULL; + } + disk = alloc_disk(1); + if (!disk) { + up(&disks_sem); + mddev_put(mddev); + return NULL; + } + disk->major = MD_MAJOR; + disk->first_minor = mdidx(mddev); + sprintf(disk->disk_name, "md%d", mdidx(mddev)); + disk->fops = &md_fops; + disk->private_data = mddev; + disk->queue = &mddev->queue; + add_disk(disk); + disks[mdidx(mddev)] = disk; + up(&disks_sem); + return NULL; +} + +void md_wakeup_thread(mdk_thread_t *thread); + +static void md_safemode_timeout(unsigned long data) +{ + mddev_t *mddev = (mddev_t *) data; + + mddev->safemode = 1; + md_wakeup_thread(mddev->thread); +} + + +static int do_md_run(mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct list_head *tmp; + mdk_rdev_t *rdev; + struct gendisk *disk; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Analyze all RAID superblock(s) + */ + if (!mddev->raid_disks && analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->chunk_size; + pnum = level_to_pers(mddev->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We don't + * want to continue the bad practice. 
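+ * (for example, with 4k pages: 32k and 64k chunk sizes pass every + * check below, while 0, 3k and 96k are all rejected -- 96k because + * (1 << ffz(~chunk_size)) picks out only the lowest set bit)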
+ */ + printk(KERN_ERR + "no chunksize specified, see 'man raidtab'\n"); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(KERN_ERR "too big chunk_size: %d > %d\n", + chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(KERN_ERR "too small chunk_size: %d < %ld\n", + chunk_size, PAGE_SIZE); + return -EINVAL; + } + + /* devices must have minimum size of one chunk */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size < chunk_size / 1024) { + printk(KERN_WARNING + "md: Dev %s smaller than chunk_size:" + " %lluk < %dk\n", + bdev_partition_name(rdev->bdev), + (unsigned long long)rdev->size, + chunk_size / 1024); + return -EINVAL; + } + } + } + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + +#ifdef CONFIG_KMOD + if (!pers[pnum]) + { + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + } +#endif + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + * Also find largest hardsector size + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + sync_blockdev(rdev->bdev); + invalidate_bdev(rdev->bdev, 0); + } + + md_probe(mdidx(mddev), NULL, NULL); + disk = disks[mdidx(mddev)]; + if (!disk) + return -ENOMEM; + + spin_lock(&pers_lock); + if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { + spin_unlock(&pers_lock); + printk(KERN_ERR "md: personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + + mddev->pers = pers[pnum]; + spin_unlock(&pers_lock); + + blk_queue_make_request(&mddev->queue, mddev->pers->make_request); + printk("%s: setting max_sectors to %d, segment boundary to %d\n", + disk->disk_name, + chunk_size >> 9, + (chunk_size>>1)-1); + blk_queue_max_sectors(&mddev->queue, chunk_size >> 9); + blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1); + mddev->queue.queuedata = mddev; + + err = mddev->pers->run(mddev); + if (err) { + printk(KERN_ERR "md: pers->run() failed ...\n"); + module_put(mddev->pers->owner); + mddev->pers = NULL; + return -EINVAL; + } + atomic_set(&mddev->writes_pending,0); + mddev->safemode = 0; + mddev->safemode_timer.function = md_safemode_timeout; + mddev->safemode_timer.data = (unsigned long) mddev; + mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ + mddev->in_sync = 1; + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + set_capacity(disk, mddev->array_size<<1); + return 0; +} + +static int restart_array(mddev_t *mddev) +{ + struct gendisk *disk = disks[mdidx(mddev)]; + int err; + + /* + * Complain if it has no devices + */ + err = -ENXIO; + if (list_empty(&mddev->disks)) + goto out; + + if (mddev->pers) { + err = -EBUSY; + if (!mddev->ro) + goto out; + + mddev->safemode = 0; + mddev->ro = 0; + set_disk_ro(disk, 0); + + printk(KERN_INFO "md: md%d switched to read-write mode.\n", + mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + err = 0; + } else { + printk(KERN_ERR "md: md%d has no personality assigned.\n", + mdidx(mddev)); + err = -EINVAL; + } + +out: + return err; +} + +static int do_md_stop(mddev_t * mddev, int ro) +{ + int err = 0; + struct gendisk *disk = disks[mdidx(mddev)]; + + if 
(atomic_read(&mddev->active)>2) { + printk("md: md%d still in use.\n",mdidx(mddev)); + err = -EBUSY; + goto out; + } + + if (mddev->pers) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + } + + del_timer_sync(&mddev->safemode_timer); + + invalidate_device(mk_kdev(disk->major, disk->first_minor), 1); + + if (ro) { + err = -ENXIO; + if (mddev->ro) + goto out; + mddev->ro = 1; + } else { + if (mddev->ro) + set_disk_ro(disk, 0); + if (mddev->pers->stop(mddev)) { + err = -EBUSY; + if (mddev->ro) + set_disk_ro(disk, 1); + goto out; + } + module_put(mddev->pers->owner); + mddev->pers = NULL; + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->raid_disks) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev); + } + if (ro) + set_disk_ro(disk, 1); + } + /* + * Free resources if final stop + */ + if (!ro) { + struct gendisk *disk; + printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev)); + + export_array(mddev); + + mddev->array_size = 0; + disk = disks[mdidx(mddev)]; + if (disk) + set_capacity(disk, 0); + } else + printk(KERN_INFO "md: md%d switched to read-only mode.\n", + mdidx(mddev)); + err = 0; +out: + return err; +} + +static void autorun_array(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + int err; + + if (list_empty(&mddev->disks)) { + MD_BUG(); + return; + } + + printk(KERN_INFO "md: running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", bdev_partition_name(rdev->bdev)); + } + printk("\n"); + + err = do_md_run (mddev); + if (err) { + printk(KERN_WARNING "md :do_md_run() returned %d\n", err); + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in pending_raid_disks) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + * + * If "unit" is allocated, then bump its reference count + */ +static void autorun_devices(void) +{ + struct list_head candidates; + struct list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + + printk(KERN_INFO "md: autorun ...\n"); + while (!list_empty(&pending_raid_disks)) { + rdev0 = list_entry(pending_raid_disks.next, + mdk_rdev_t, same_set); + + printk(KERN_INFO "md: considering %s ...\n", + bdev_partition_name(rdev0->bdev)); + INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) + if (super_90_load(rdev, rdev0, 0) >= 0) { + printk(KERN_INFO "md: adding %s ...\n", + bdev_partition_name(rdev->bdev)); + list_move(&rdev->same_set, &candidates); + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. 
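+ * (schematically: mddev_find() -> mddev_lock() -> bind_rdev_to_array() + * for each candidate -> autorun_array() -> mddev_unlock(), as below)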
+ */ + + mddev = mddev_find(rdev0->preferred_minor); + if (!mddev) { + printk(KERN_ERR + "md: cannot allocate memory for md drive.\n"); + break; + } + if (mddev_lock(mddev)) + printk(KERN_WARNING "md: md%d locked, cannot run\n", + mdidx(mddev)); + else if (mddev->raid_disks || mddev->major_version + || !list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: md%d already running, cannot run %s\n", + mdidx(mddev), bdev_partition_name(rdev0->bdev)); + mddev_unlock(mddev); + } else { + printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { + list_del_init(&rdev->same_set); + if (bind_rdev_to_array(rdev, mddev)) + export_rdev(rdev); + } + autorun_array(mddev); + mddev_unlock(mddev); + } + /* on success, candidates will be empty, on error + * it won't... + */ + ITERATE_RDEV_GENERIC(candidates,rdev,tmp) + export_rdev(rdev); + mddev_put(mddev); + } + printk(KERN_INFO "md: ... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. + */ + +static int autostart_array(dev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + start_rdev = md_import_device(startdev, 0, 0); + if (IS_ERR(start_rdev)) { + printk(KERN_WARNING "md: could not import %s!\n", + partition_name(startdev)); + return err; + } + + /* NOTE: this can only work for 0.90.0 superblocks */ + sb = (mdp_super_t*)page_address(start_rdev->sb_page); + if (sb->major_version != 0 || + sb->minor_version != 90 ) { + printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); + export_rdev(start_rdev); + return err; + } + + if (start_rdev->faulty) { + printk(KERN_WARNING + "md: can not autostart based on faulty %s!\n", + bdev_partition_name(start_rdev->bdev)); + export_rdev(start_rdev); + return err; + } + list_add(&start_rdev->same_set, &pending_raid_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + dev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (!dev) + continue; + if (dev == startdev) + continue; + rdev = md_import_device(dev, 0, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING "md: could not import %s," + " trying to run array nevertheless.\n", + partition_name(dev)); + continue; + } + list_add(&rdev->same_set, &pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +} + + +static int get_version(void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +static int get_array_info(mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + int nr,working,active,failed,spare; + mdk_rdev_t *rdev; + struct list_head *tmp; + + nr=working=active=failed=spare=0; + ITERATE_RDEV(mddev,rdev,tmp) { + nr++; + if (rdev->faulty) + failed++; + else { + working++; + if (rdev->in_sync) + active++; + else + spare++; + } + } + + info.major_version = mddev->major_version; + info.minor_version = mddev->minor_version; + info.patch_version = 1; + info.ctime = mddev->ctime; + info.level = mddev->level; + info.size = mddev->size; + info.nr_disks = nr; + info.raid_disks = mddev->raid_disks; + info.md_minor = mddev->__minor; + info.not_persistent= !mddev->persistent; + + info.utime = mddev->utime; + info.state = 0; + if (mddev->in_sync) + info.state = (1<<MD_SB_CLEAN); + + info.layout = mddev->layout; + info.chunk_size = mddev->chunk_size; + + if (copy_to_user(arg, &info,
sizeof(info))) + return -EFAULT; + + return 0; +} + +static int get_disk_info(mddev_t * mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + mdk_rdev_t *rdev; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + + rdev = find_rdev_nr(mddev, nr); + if (rdev) { + info.major = MAJOR(rdev->bdev->bd_dev); + info.minor = MINOR(rdev->bdev->bd_dev); + info.raid_disk = rdev->raid_disk; + info.state = 0; + if (rdev->faulty) + info.state |= (1<<MD_DISK_FAULTY); + else if (rdev->in_sync) { + info.state |= (1<<MD_DISK_ACTIVE); + info.state |= (1<<MD_DISK_SYNC); + } + } else { + info.major = info.minor = 0; + info.raid_disk = -1; + info.state = (1<<MD_DISK_REMOVED); + } + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) +{ + mdk_rdev_t *rdev; + dev_t dev; + + dev = MKDEV(info->major,info->minor); + if (!mddev->raid_disks) { + int err; + /* expecting a device which has a superblock */ + rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: md_import_device returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + if (!list_empty(&mddev->disks)) { + mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, + mdk_rdev_t, same_set); + int err = super_types[mddev->major_version] + .load_super(rdev, rdev0, mddev->minor_version); + if (err < 0) { + printk(KERN_WARNING + "md: %s has different UUID to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(rdev0->bdev)); + export_rdev(rdev); + return -EINVAL; + } + } + err = bind_rdev_to_array(rdev, mddev); + if (err) + export_rdev(rdev); + return err; + } + + /* + * add_new_disk can be used once the array is assembled + * to add "hot spares". They must already have a superblock + * written + */ + if (mddev->pers) { + int err; + if (!mddev->pers->hot_add_disk) { + printk(KERN_WARNING + "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: md_import_device returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + rdev->in_sync = 0; /* just to be sure */ + rdev->raid_disk = -1; + err = bind_rdev_to_array(rdev, mddev); + if (err) + export_rdev(rdev); + if (mddev->thread) + md_wakeup_thread(mddev->thread); + return err; + } + + /* otherwise, add_new_disk is only allowed + * for major_version==0 superblocks + */ + if (mddev->major_version != 0) { + printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n", + mdidx(mddev)); + return -EINVAL; + } + + if (!(info->state & (1<<MD_DISK_FAULTY))) { + int err; + rdev = md_import_device (dev, -1, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: error, md_import_device() returned %ld\n", + PTR_ERR(rdev)); + return PTR_ERR(rdev); + } + rdev->desc_nr = info->number; + if (info->raid_disk < mddev->raid_disks) + rdev->raid_disk = info->raid_disk; + else + rdev->raid_disk = -1; + + rdev->faulty = 0; + if (rdev->raid_disk < mddev->raid_disks) + rdev->in_sync = (info->state & (1<<MD_DISK_SYNC)); + else + rdev->in_sync = 0; + + err = bind_rdev_to_array(rdev, mddev); + if (err) { + export_rdev(rdev); + return err; + } + + if (!mddev->persistent) { + printk(KERN_INFO "md: nonpersistent superblock ...\n"); + rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + } else + rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + rdev->size = calc_dev_size(rdev, mddev->chunk_size); + + if (!mddev->size || (mddev->size > rdev->size)) + mddev->size = rdev->size; + } + + return 0; +} + +static int hot_generate_error(mddev_t * mddev, dev_t dev) +{ + struct request_queue *q; + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to generate %s error in md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + return -ENXIO; + } + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + if (!rdev->in_sync) + return -ENODEV; + + q = bdev_get_queue(rdev->bdev); + if (!q) { + MD_BUG(); + return -ENODEV; + } + printk(KERN_INFO "md: okay, generating error!\n"); +// q->oneshot_error = 1; // disabled for now + + return 0; +} + +static int hot_remove_disk(mddev_t * mddev, dev_t dev) +{ + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to remove %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->raid_disk >= 0) + goto busy; + + kick_rdev_from_array(rdev); + md_update_sb(mddev); + + return 0; +busy: + printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n", + bdev_partition_name(rdev->bdev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk(mddev_t * mddev, dev_t dev) +{ + int err; + unsigned int size; + mdk_rdev_t *rdev; + + if (!mddev->pers) + return -ENODEV; + + printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (mddev->major_version != 0) { + printk(KERN_WARNING "md%d: HOT_ADD may only be used with" + " version-0 superblocks.\n", + mdidx(mddev)); + return -EINVAL; + } + if (!mddev->pers->hot_add_disk) { + printk(KERN_WARNING + "md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + rdev = md_import_device (dev, -1, 0); + if (IS_ERR(rdev)) { + printk(KERN_WARNING + "md: error, md_import_device() returned %ld\n", + PTR_ERR(rdev)); + return -EINVAL; + } + + rdev->sb_offset = calc_dev_sboffset(rdev->bdev); + size = calc_dev_size(rdev, mddev->chunk_size); + rdev->size = size; + + if (size < mddev->size) { + printk(KERN_WARNING + "md%d: disk size %llu blocks < array size %llu\n", + mdidx(mddev), (unsigned long long)size, + (unsigned long long)mddev->size); + err = -ENOSPC; + goto abort_export; + } + + if (rdev->faulty) { + printk(KERN_WARNING + "md: can not hot-add faulty %s disk to md%d!\n", + bdev_partition_name(rdev->bdev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; + } + rdev->in_sync = 0; + rdev->desc_nr = -1; + bind_rdev_to_array(rdev, mddev); + + /* + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... + */ + + if (rdev->desc_nr == mddev->max_disks) { + printk(KERN_WARNING "md%d: can not hot-add to full array!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } + + rdev->raid_disk = -1; + + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. + */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; +} + +/* + * set_array_info is used two different ways + * The original usage is when creating a new array. + * In this usage, raid_disks is > 0 and it together with + * level, size, not_persistent,layout,chunksize determine the + * shape of the array. + * This will always create an array with a type-0.90.0 superblock. + * The newer usage is when assembling an array. + * In this case raid_disks will be 0, and the major_version field is + * used to determine which style super-blocks are to be found on the devices.
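+ * (e.g. a request with raid_disks==0 and major_version==1 merely + * records the version numbers and selects the matching super_types[] + * entry for later superblock loading, then returns)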
+ * The minor and patch _version numbers are also kept in case the + * super_block handler wishes to interpret them. + */ +static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +{ + + if (info->raid_disks == 0) { + /* just setting version number for superblock loading */ + if (info->major_version < 0 || + info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || + super_types[info->major_version].name == NULL) { + /* maybe try to auto-load a module? */ + printk(KERN_INFO + "md: superblock version %d not known\n", + info->major_version); + return -EINVAL; + } + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; + mddev->minor_version = MD_MINOR_VERSION; + mddev->patch_version = MD_PATCHLEVEL_VERSION; + mddev->ctime = get_seconds(); + + mddev->level = info->level; + mddev->size = info->size; + mddev->raid_disks = info->raid_disks; + /* don't set __minor, it is determined by which /dev/md* was + * opened + */ + if (info->state & (1<<MD_SB_CLEAN)) + mddev->recovery_cp = MaxSector; + else + mddev->recovery_cp = 0; + mddev->persistent = ! info->not_persistent; + + mddev->layout = info->layout; + mddev->chunk_size = info->chunk_size; + + mddev->max_disks = MD_SB_DISKS; + + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(mddev->uuid, 16); + + return 0; +} + +static int set_disk_faulty(mddev_t *mddev, dev_t dev) +{ + mdk_rdev_t *rdev; + + rdev = find_rdev(mddev, dev); + if (!rdev) + return 0; + + md_error(mddev, rdev); + return 1; +} + +static int md_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + dev = inode->i_rdev; + minor = minor(dev); + if (minor >= MAX_MD_DEVS) { + MD_BUG(); + return -EINVAL; + } + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; + +#ifndef MODULE + case RAID_AUTORUN: + err = 0; + autostart_arrays(); + goto done; +#endif + default:; + } + + /* + * Commands creating/starting a new array: + */ + + mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) { + BUG(); + goto abort; + } + + + if (cmd == START_ARRAY) { + /* START_ARRAY doesn't need to lock the array as autostart_array + * does the locking, and it could even be a different array + */ + err = autostart_array(arg); + if (err) { + printk(KERN_WARNING "md: autostart %s failed!\n", + partition_name(arg)); + goto abort; + } + goto done; + } + + err = mddev_lock(mddev); + if (err) { + printk(KERN_INFO + "md: ioctl lock interrupted, reason %d, cmd %d\n", + err, cmd); + goto abort; + } + + switch (cmd) + { + case SET_ARRAY_INFO: + + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array md%d already has disks!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array md%d already initialised!\n", + mdidx(mddev)); + err = -EBUSY; + goto abort_unlock; + } + { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, (void*)arg, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err)
{ + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + } + goto done_unlock; + + default:; + } + + /* + * Commands querying/configuring an existing array: + */ + /* if we are not initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ + if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... ;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = put_user(get_capacity(disks[mdidx(mddev)])/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = put_user (get_start_sect(inode->i_bdev), + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case ADD_NEW_DISK: + { + mdu_disk_info_t info; + if (copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + case HOT_GENERATE_ERROR: + err = hot_generate_error(mddev, arg); + goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, arg); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, arg); + goto done_unlock; + + case RUN_ARRAY: + { + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... + * ->pers will not be set, so superblock will + * not be updated. + */ + if (err) + do_md_stop (mddev, 0); + goto done_unlock; + } + + default: + if (_IOC_TYPE(cmd) == MD_MAJOR) + printk(KERN_WARNING "md: %s(pid %d) used" + " obsolete MD ioctl, upgrade your" + " software to use new ioctls.\n", + current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + mddev_unlock(mddev); + + return err; +done: + if (err) + MD_BUG(); +abort: + return err; +} + +static int md_open(struct inode *inode, struct file *file) +{ + /* + * Succeed if we can find or allocate a mddev structure.
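+ * (mddev_find() allocates the structure on first open; the extra + * reference taken via mddev_get() below is dropped again in + * md_release())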
+ */ + mddev_t *mddev = mddev_find(minor(inode->i_rdev)); + int err = -ENOMEM; + + if (!mddev) + goto out; + + if ((err = mddev_lock(mddev))) + goto put; + + err = 0; + mddev_unlock(mddev); + inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev); + put: + mddev_put(mddev); + out: + return err; +} + +static int md_release(struct inode *inode, struct file * file) +{ + mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip; + + if (!mddev) + BUG(); + mddev_put(mddev); + + return 0; +} + +static struct block_device_operations md_fops = +{ + .owner = THIS_MODULE, + .open = md_open, + .release = md_release, + .ioctl = md_ioctl, +}; + +int md_thread(void * arg) +{ + mdk_thread_t *thread = arg; + + lock_kernel(); + + /* + * Detach thread + */ + + daemonize(thread->name, mdidx(thread->mddev)); + + current->exit_signal = SIGCHLD; + allow_signal(SIGKILL); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. + */ + unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run)(mddev_t *); + + wait_event_interruptible(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags)); + if (current->flags & PF_FREEZE) + refrigerator(PF_IOTHREAD); + + clear_bit(THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->mddev); + blk_run_queues(); + } + if (signal_pending(current)) + flush_signals(current); + } + complete(thread->event); + return 0; +} + +void md_wakeup_thread(mdk_thread_t *thread) +{ + if (thread) { + dprintk("md: waking up MD thread %p.\n", thread); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); + } +} + +mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, + const char *name) +{ + mdk_thread_t *thread; + int ret; + struct completion event; + + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; + + memset(thread, 0, sizeof(mdk_thread_t)); + init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->mddev = mddev; + thread->name = name; + ret = kernel_thread(md_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +void md_interrupt_thread(mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread(mdk_thread_t *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + md_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +{ + dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", + MD_MAJOR,mdidx(mddev), + MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); + + if (!mddev) { + MD_BUG(); + return; + } + + if (!rdev || rdev->faulty) + return; + if 
(!mddev->pers->error_handler) + return; + mddev->pers->error_handler(mddev,rdev); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} + +/* seq_file implementation /proc/mdstat */ + +static void status_unused(struct seq_file *seq) +{ + int i = 0; + mdk_rdev_t *rdev; + struct list_head *tmp; + + seq_printf(seq, "unused devices: "); + + ITERATE_RDEV_PENDING(rdev,tmp) { + i++; + seq_printf(seq, "%s ", + bdev_partition_name(rdev->bdev)); + } + if (!i) + seq_printf(seq, "<none>"); + + seq_printf(seq, "\n"); +} + + +static void status_resync(struct seq_file *seq, mddev_t * mddev) +{ + unsigned long max_blocks, resync, res, dt, db, rt; + + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + max_blocks = mddev->size; + + /* + * Should not happen. + */ + if (!max_blocks) { + MD_BUG(); + return; + } + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + seq_printf(seq, "["); + for (i = 0; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? + "resync" : "recovery"), + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + seq_printf(seq, " speed=%ldK/sec", db/dt); +} + +static void *md_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct list_head *tmp; + loff_t l = *pos; + mddev_t *mddev; + + if (l > 0x10000) + return NULL; + if (!l--) + /* header */ + return (void*)1; + + spin_lock(&all_mddevs_lock); + list_for_each(tmp,&all_mddevs) + if (!l--) { + mddev = list_entry(tmp, mddev_t, all_mddevs); + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + return mddev; + } + spin_unlock(&all_mddevs_lock); + return (void*)2;/* tail */ +} + +static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *tmp; + mddev_t *next_mddev, *mddev = v; + + ++*pos; + if (v == (void*)2) + return NULL; + + spin_lock(&all_mddevs_lock); + if (v == (void*)1) + tmp = all_mddevs.next; + else + tmp = mddev->all_mddevs.next; + if (tmp != &all_mddevs) + next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); + else { + next_mddev = (void*)2; + *pos = 0x10000; + } + spin_unlock(&all_mddevs_lock); + + if (v != (void*)1) + mddev_put(mddev); + return next_mddev; + +} + +static void md_seq_stop(struct seq_file *seq, void *v) +{ + mddev_t *mddev = v; + + if (mddev && v != (void*)1 && v != (void*)2) + mddev_put(mddev); +} + +static int md_seq_show(struct seq_file *seq, void *v) +{ + mddev_t *mddev = v; + sector_t size; + struct list_head *tmp2; + mdk_rdev_t *rdev; + int i; + + if (v == (void*)1) { + seq_printf(seq, "Personalities : "); + spin_lock(&pers_lock); + for (i = 0; i < MAX_PERSONALITY; i++) + if (pers[i]) + seq_printf(seq, "[%s] ", pers[i]->name); + + spin_unlock(&pers_lock); + seq_printf(seq, "\n"); + return 0; + } + if (v == (void*)2) { + status_unused(seq); + return 0; + } + + if (mddev_lock(mddev)!=0) + return -EINTR; + if
(mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { + seq_printf(seq, "md%d : %sactive", mdidx(mddev), + mddev->pers ? "" : "in"); + if (mddev->pers) { + if (mddev->ro) + seq_printf(seq, " (read-only)"); + seq_printf(seq, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + seq_printf(seq, " %s[%d]", + bdev_partition_name(rdev->bdev), rdev->desc_nr); + if (rdev->faulty) { + seq_printf(seq, "(F)"); + continue; + } + size += rdev->size; + } + + if (!list_empty(&mddev->disks)) { + if (mddev->pers) + seq_printf(seq, "\n %llu blocks", + (unsigned long long)mddev->array_size); + else + seq_printf(seq, "\n %llu blocks", + (unsigned long long)size); + } + + if (mddev->pers) { + mddev->pers->status (seq, mddev); + seq_printf(seq, "\n "); + if (mddev->curr_resync > 2) + status_resync (seq, mddev); + else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + seq_printf(seq, " resync=DELAYED"); + } + + seq_printf(seq, "\n"); + } + mddev_unlock(mddev); + + return 0; +} + +static struct seq_operations md_seq_ops = { + .start = md_seq_start, + .next = md_seq_next, + .stop = md_seq_stop, + .show = md_seq_show, +}; + +static int md_seq_open(struct inode *inode, struct file *file) +{ + int error; + + error = seq_open(file, &md_seq_ops); + return error; +} + +static struct file_operations md_seq_fops = { + .open = md_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int register_md_personality(int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + spin_lock(&pers_lock); + if (pers[pnum]) { + spin_unlock(&pers_lock); + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + spin_unlock(&pers_lock); + return 0; +} + +int unregister_md_personality(int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + spin_lock(&pers_lock); + pers[pnum] = NULL; + spin_unlock(&pers_lock); + return 0; +} + +void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors) +{ + rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; + curr_events = disk_stat_read(disk, read_sectors) + + disk_stat_read(disk, write_sectors) - + disk->sync_io; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +void md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + md_wakeup_thread(mddev->thread); + // stop recovery, signal do_sync .... 
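+ // (the sector accounting above has already run either way; the + // error path merely flags MD_RECOVERY_ERR for md_check_recovery)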
+ } +} + + +void md_write_start(mddev_t *mddev) +{ + if (!atomic_read(&mddev->writes_pending)) { + mddev_lock_uninterruptible(mddev); + if (mddev->in_sync) { + mddev->in_sync = 0; + del_timer(&mddev->safemode_timer); + md_update_sb(mddev); + } + atomic_inc(&mddev->writes_pending); + mddev_unlock(mddev); + } else + atomic_inc(&mddev->writes_pending); +} + +void md_write_end(mddev_t *mddev) +{ + if (atomic_dec_and_test(&mddev->writes_pending)) { + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else + mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); + } +} + +static inline void md_enter_safemode(mddev_t *mddev) +{ + mddev_lock_uninterruptible(mddev); + if (mddev->safemode && !atomic_read(&mddev->writes_pending) && + !mddev->in_sync && mddev->recovery_cp == MaxSector) { + mddev->in_sync = 1; + md_update_sb(mddev); + } + mddev_unlock(mddev); + + if (mddev->safemode == 1) + mddev->safemode = 0; +} + +void md_handle_safemode(mddev_t *mddev) +{ + if (signal_pending(current)) { + printk(KERN_INFO "md: md%d in immediate safe mode\n", + mdidx(mddev)); + mddev->safemode = 2; + flush_signals(current); + } + if (mddev->safemode) + md_enter_safemode(mddev); +} + + +DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +static void md_do_sync(mddev_t *mddev) +{ + mddev_t *mddev2; + unsigned int max_sectors, currspeed = 0, + j, window; + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct list_head *tmp; + unsigned long last_check; + + /* just incase thread restarts... */ + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + return; + + /* we overload curr_resync somewhat here. + * 0 == not engaged in resync at all + * 2 == checking that there is no conflict with another sync + * 1 == like 2, but have yielded to allow conflicting resync to + * commense + * other == active in resync - this many blocks + */ + do { + mddev->curr_resync = 2; + + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && + match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: delaying resync of md%d" + " until md%d has finished resync (they" + " share one or more physical units)\n", + mdidx(mddev), mdidx(mddev2)); + if (mddev < mddev2) {/* arbitrarily yield */ + mddev->curr_resync = 1; + wake_up(&resync_wait); + } + if (wait_event_interruptible(resync_wait, + mddev2->curr_resync < mddev->curr_resync)) { + flush_signals(current); + mddev_put(mddev2); + goto skip; + } + } + if (mddev->curr_resync == 1) { + mddev_put(mddev2); + break; + } + } + } while (mddev->curr_resync < 2); + + max_sectors = mddev->size << 1; + + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" + " %d KB/sec/disc.\n", sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwith " + "(but not more than %d KB/sec) for reconstruction.\n", + sysctl_speed_limit_max); + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + j = mddev->recovery_cp; + else + j = 0; + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = j; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = 32*(PAGE_SIZE/512); + printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", + window/2,max_sectors/2); + + 
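/* + * e.g. with 4k pages: window = 32*(4096/512) = 256 sectors, which + * is the 128k figure printed above; the speed checks below run at + * most once per window + */ + 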
atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); + last_check = 0; + + if (j) + printk(KERN_INFO + "md: resuming recovery of md%d from checkpoint.\n", + mdidx(mddev)); + + while (j < max_sectors) { + int sectors; + + sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); + if (sectors < 0) { + set_bit(MD_RECOVERY_ERR, &mddev->recovery); + goto out; + } + atomic_add(sectors, &mddev->recovery_active); + j += sectors; + if (j>1) mddev->curr_resync = j; + + if (last_check + window > j) + continue; + + last_check = j; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || + test_bit(MD_RECOVERY_ERR, &mddev->recovery)) + break; + + blk_run_queues(); + + repeat: + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (signal_pending(current)) { + /* + * got a signal, exit. + */ + printk(KERN_INFO + "md: md_do_sync() got signal ... exiting\n"); + flush_signals(current); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + } + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. (things like an + * e2fsck being done on the RAID array should execute fast) + */ + cond_resched(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/4); + goto repeat; + } + } + } + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + /* + * this also signals 'finished resyncing' to md_stop + */ + out: + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + /* tell personality that we are finished */ + mddev->pers->sync_request(mddev, max_sectors, 1); + + if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && + mddev->curr_resync > 2 && + mddev->curr_resync > mddev->recovery_cp) { + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + printk(KERN_INFO + "md: checkpointing recovery of md%d.\n", + mdidx(mddev)); + mddev->recovery_cp = mddev->curr_resync; + } else + mddev->recovery_cp = MaxSector; + } + + if (mddev->safemode) + md_enter_safemode(mddev); + skip: + mddev->curr_resync = 0; + set_bit(MD_RECOVERY_DONE, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} + + +/* + * This routine is regularly called by all per-raid-array threads to + * deal with generic issues like resync and super-block update. + * Raid personalities that don't have a thread (linear/raid0) do not + * need this as they never do any recovery or update the superblock. + * + * It does not do any resync itself, but rather "forks" off other threads + * to do that as needed. + * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in + * "->recovery" and create a thread at ->sync_thread. + * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) + * and wakeups up this thread which will reap the thread and finish up. + * This thread also removes any faulty devices (with nr_pending == 0). 
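+ * (nr_pending == 0 meaning no outstanding IO still references the + * device, so it is safe to unhook)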
+ * + * The overall approach is: + * 1/ if the superblock needs updating, update it. + * 2/ If a recovery thread is running, don't do anything else. + * 3/ If recovery has finished, clean up, possibly marking spares active. + * 4/ If there are any faulty devices, remove them. + * 5/ If array is degraded, try to add spares devices + * 6/ If array has spares or is not in-sync, start a resync thread. + */ +void md_check_recovery(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *rtmp; + + + dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); + + if (mddev->ro) + return; + if ( ! ( + mddev->sb_dirty || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_DONE, &mddev->recovery) + )) + return; + if (mddev_trylock(mddev)==0) { + int spares =0; + if (mddev->sb_dirty) + md_update_sb(mddev); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + /* resync/recovery still happening */ + goto unlock; + if (mddev->sync_thread) { + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + mddev->pers->spare_active(mddev); + } + md_update_sb(mddev); + mddev->recovery = 0; + wake_up(&resync_wait); + goto unlock; + } + if (mddev->recovery) { + /* that's odd.. */ + mddev->recovery = 0; + wake_up(&resync_wait); + } + + /* no recovery is running. + * remove any failed drives, then + * add spares if possible + */ + ITERATE_RDEV(mddev,rdev,rtmp) { + if (rdev->raid_disk >= 0 && + rdev->faulty && + atomic_read(&rdev->nr_pending)==0) { + mddev->pers->hot_remove_disk(mddev, rdev->raid_disk); + rdev->raid_disk = -1; + } + if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync) + spares++; + } + if (mddev->degraded) { + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk < 0 + && !rdev->faulty) { + if (mddev->pers->hot_add_disk(mddev,rdev)) + spares++; + else + break; + } + } + + if (!spares && (mddev->recovery_cp == MaxSector )) { + /* nothing we can do ... */ + goto unlock; + } + if (mddev->pers->sync_request) { + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + if (!spares) + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "md%d_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "md%d: could not start resync" + " thread...\n", + mdidx(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + mddev->recovery = 0; + } else { + md_wakeup_thread(mddev->sync_thread); + } + } + unlock: + mddev_unlock(mddev); + } +} + +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct list_head *tmp; + mddev_t *mddev; + + if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { + + printk(KERN_INFO "md: stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + if (mddev_trylock(mddev)==0) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... 
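+ * (hence the one second mdelay() below before the reboot proceeds)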
+ */ + mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + .notifier_call = md_notify_reboot, + .next = NULL, + .priority = INT_MAX, /* before any real devices */ +}; + +static void md_geninit(void) +{ + struct proc_dir_entry *p; + + dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + +#ifdef CONFIG_PROC_FS + p = create_proc_entry("mdstat", S_IRUGO, NULL); + if (p) + p->proc_fops = &md_seq_fops; +#endif +} + +int __init md_init(void) +{ + int minor; + + printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," + " MD_SB_DISKS=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + + if (register_blkdev(MAJOR_NR, "md")) + return -1; + + devfs_mk_dir("md"); + blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, + md_probe, NULL, NULL); + for (minor=0; minor < MAX_MD_DEVS; ++minor) { + char name[16]; + sprintf(name, "md/%d", minor); + devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor, + S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); + } + + register_reboot_notifier(&md_notifier); + raid_table_header = register_sysctl_table(raid_root_table, 1); + + md_geninit(); + return (0); +} + + +#ifndef MODULE + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +static dev_t detected_devices[128]; +static int dev_cnt; + +void md_autodetect_dev(dev_t dev) +{ + if (dev_cnt >= 0 && dev_cnt < 127) + detected_devices[dev_cnt++] = dev; +} + + +static void autostart_arrays(void) +{ + mdk_rdev_t *rdev; + int i; + + printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + + for (i = 0; i < dev_cnt; i++) { + dev_t dev = detected_devices[i]; + + rdev = md_import_device(dev,0, 0); + if (IS_ERR(rdev)) { + printk(KERN_ALERT "md: could not import %s!\n", + partition_name(dev)); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + list_add(&rdev->same_set, &pending_raid_disks); + } + dev_cnt = 0; + + autorun_devices(); +} + +#endif + +static __exit void md_exit(void) +{ + int i; + blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); + for (i=0; i < MAX_MD_DEVS; i++) + devfs_remove("md/%d", i); + devfs_remove("md"); + + unregister_blkdev(MAJOR_NR,"md"); + unregister_reboot_notifier(&md_notifier); + unregister_sysctl_table(raid_table_header); +#ifdef CONFIG_PROC_FS + remove_proc_entry("mdstat", NULL); +#endif + for (i = 0; i < MAX_MD_DEVS; i++) { + struct gendisk *disk = disks[i]; + mddev_t *mddev; + if (!disks[i]) + continue; + mddev = disk->private_data; + del_gendisk(disk); + put_disk(disk); + mddev_put(mddev); + } +} + +module_init(md_init) +module_exit(md_exit) + +EXPORT_SYMBOL(register_md_personality); +EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL(md_error); +EXPORT_SYMBOL(md_sync_acct); +EXPORT_SYMBOL(md_done_sync); +EXPORT_SYMBOL(md_write_start); +EXPORT_SYMBOL(md_write_end); +EXPORT_SYMBOL(md_handle_safemode); +EXPORT_SYMBOL(md_register_thread); +EXPORT_SYMBOL(md_unregister_thread); +EXPORT_SYMBOL(md_wakeup_thread); +EXPORT_SYMBOL(md_print_devices); +EXPORT_SYMBOL(md_interrupt_thread); +EXPORT_SYMBOL(md_check_recovery); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/nfsd-defines/merge b/tests/linux/nfsd-defines/merge new file mode 100644 index 0000000..379b771 --- /dev/null +++ b/tests/linux/nfsd-defines/merge @@ -0,0 +1,270 @@ +/* + * linux/include/linux/nfsd/nfsd.h + * + * Hodge-podge collection of knfsd-related stuff. + * I will sort this out later. 
+ * + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include <linux/types.h> +#include <linux/unistd.h> +#include <linux/dirent.h> +#include <linux/fs.h> +#include <linux/posix_types.h> + +#include <linux/nfsd/debug.h> +#include <linux/nfsd/nfsfh.h> +#include <linux/nfsd/export.h> +#include <linux/nfsd/auth.h> +#include <linux/nfsd/stats.h> +#include <linux/nfsd/interface.h> +/* + * nfsd version + */ +#define NFSD_VERSION "0.5" +#define NFSD_SUPPORTED_MINOR_VERSION 0 + +#ifdef __KERNEL__ +/* + * Special flags for nfsd_permission. These must be different from MAY_READ, + * MAY_WRITE, and MAY_EXEC. + */ +#define MAY_NOP 0 +#define MAY_SATTR 8 +#define MAY_TRUNC 16 +#define MAY_LOCK 32 +#define MAY_OWNER_OVERRIDE 64 +#define MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC) +# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_LOCAL_ACCESS or MAY_OWNER_OVERRIDE." +#endif +#define MAY_CREATE (MAY_EXEC|MAY_WRITE) +#define MAY_REMOVE (MAY_EXEC|MAY_WRITE|MAY_TRUNC) + +/* + * Callback function for readdir + */ +struct readdir_cd { + int err; /* 0, nfserr, or nfserr_eof */ +}; +typedef int (*encode_dent_fn)(struct readdir_cd *, const char *, + int, loff_t, ino_t, unsigned int); +typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); + +extern struct svc_program nfsd_program; +extern struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; + +/* + * Function prototypes. + */ +int nfsd_svc(unsigned short port, int nrservs); +int nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp); + +/* nfsd/vfs.c */ +int fh_lock_parent(struct svc_fh *, struct dentry *); +int nfsd_racache_init(int); +void nfsd_racache_shutdown(void); +int nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, int, struct svc_fh *); +int nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); +int nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +int nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +int nfsd_create_v3(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, int *truncp); +int nfsd_commit(struct svc_rqst *, struct svc_fh *, + off_t, unsigned long); +#endif /* CONFIG_NFSD_V3 */ +int nfsd_open(struct svc_rqst *, struct svc_fh *, int, + int, struct file *); +void nfsd_close(struct file *); +int nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct iovec *,int, unsigned long *); +int nfsd_write(struct svc_rqst *, struct svc_fh *, + loff_t, struct iovec *,int, unsigned long, int *); +int nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +int nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, int plen, + struct svc_fh *res, struct iattr *); +int nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +int nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +int nfsd_remove(struct svc_rqst *, + struct svc_fh *, char *, int); +int nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +int nfsd_truncate(struct svc_rqst *, struct svc_fh *, + unsigned long size); +int nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, encode_dent_fn); +int nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct statfs *); + +int nfsd_notify_change(struct inode *, struct iattr *); +int 
nfsd_permission(struct svc_export *, struct dentry *, int); + + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +void nfs4_state_init(void); +void nfs4_state_shutdown(void); +#else +void static inline nfs4_state_init(void){} +void static inline nfs4_state_shutdown(void){} +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. + */ +#define nfs_ok __constant_htonl(NFS_OK) +#define nfserr_perm __constant_htonl(NFSERR_PERM) +#define nfserr_noent __constant_htonl(NFSERR_NOENT) +#define nfserr_io __constant_htonl(NFSERR_IO) +#define nfserr_nxio __constant_htonl(NFSERR_NXIO) +#define nfserr_eagain __constant_htonl(NFSERR_EAGAIN) +#define nfserr_acces __constant_htonl(NFSERR_ACCES) +#define nfserr_exist __constant_htonl(NFSERR_EXIST) +#define nfserr_xdev __constant_htonl(NFSERR_XDEV) +#define nfserr_nodev __constant_htonl(NFSERR_NODEV) +#define nfserr_notdir __constant_htonl(NFSERR_NOTDIR) +#define nfserr_isdir __constant_htonl(NFSERR_ISDIR) +#define nfserr_inval __constant_htonl(NFSERR_INVAL) +#define nfserr_fbig __constant_htonl(NFSERR_FBIG) +#define nfserr_nospc __constant_htonl(NFSERR_NOSPC) +#define nfserr_rofs __constant_htonl(NFSERR_ROFS) +#define nfserr_mlink __constant_htonl(NFSERR_MLINK) +#define nfserr_opnotsupp __constant_htonl(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong __constant_htonl(NFSERR_NAMETOOLONG) +#define nfserr_notempty __constant_htonl(NFSERR_NOTEMPTY) +#define nfserr_dquot __constant_htonl(NFSERR_DQUOT) +#define nfserr_stale __constant_htonl(NFSERR_STALE) +#define nfserr_remote __constant_htonl(NFSERR_REMOTE) +#define nfserr_wflush __constant_htonl(NFSERR_WFLUSH) +#define nfserr_badhandle __constant_htonl(NFSERR_BADHANDLE) +#define nfserr_notsync __constant_htonl(NFSERR_NOT_SYNC) +#define nfserr_badcookie __constant_htonl(NFSERR_BAD_COOKIE) +#define nfserr_notsupp __constant_htonl(NFSERR_NOTSUPP) +#define nfserr_toosmall __constant_htonl(NFSERR_TOOSMALL) +#define nfserr_serverfault __constant_htonl(NFSERR_SERVERFAULT) +#define nfserr_badtype __constant_htonl(NFSERR_BADTYPE) +#define nfserr_jukebox __constant_htonl(NFSERR_JUKEBOX) +#define nfserr_bad_cookie __constant_htonl(NFSERR_BAD_COOKIE) +#define nfserr_same __constant_htonl(NFSERR_SAME) +#define nfserr_clid_inuse __constant_htonl(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid __constant_htonl(NFSERR_STALE_CLIENTID) +#define nfserr_resource __constant_htonl(NFSERR_RESOURCE) +#define nfserr_nofilehandle __constant_htonl(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch __constant_htonl(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_symlink __constant_htonl(NFSERR_SYMLINK) +#define nfserr_not_same __constant_htonl(NFSERR_NOT_SAME) +#define nfserr_readdir_nospc __constant_htonl(NFSERR_READDIR_NOSPC) +#define nfserr_bad_xdr __constant_htonl(NFSERR_BAD_XDR) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit __constant_htonl(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof __constant_htonl(30001) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +/* + * Time of server startup + */ +extern struct timeval nfssvc_boot; + + +#ifdef CONFIG_NFSD_V4 + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. 
otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum amount of buffer space + * needed to encode an "ordinary" _successful_ operation. (GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum amount of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. + */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +#define NFSD_LEASE_TIME 60 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ACL (will be supported in a forthcoming patch) + * ARCHIVE (deprecated anyway) + * FS_LOCATIONS (will be supported eventually) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE) + +#define NFSD_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ +(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ +#define NFSD_WRITEABLE_ATTRS_WORD0 FATTR4_WORD0_SIZE +#define NFSD_WRITEABLE_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY_SET) + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* __KERNEL__ */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/tests/linux/nfsd-defines/orig b/tests/linux/nfsd-defines/orig new file mode 100644 index 0000000..f4b2784 --- /dev/null +++ b/tests/linux/nfsd-defines/orig @@ -0,0 +1,270 @@ +/* + * linux/include/linux/nfsd/nfsd.h + * + * Hodge-podge collection of knfsd-related stuff. 
+ * I will sort this out later. + * + * Copyright (C) 1995-1997 Olaf Kirch + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +/* + * nfsd version + */ +#define NFSD_VERSION "0.5" +#define NFSD_SUPPORTED_MINOR_VERSION 0 + +#ifdef __KERNEL__ +/* + * Special flags for nfsd_permission. These must be different from MAY_READ, + * MAY_WRITE, and MAY_EXEC. + */ +#define MAY_NOP 0 +#define MAY_SATTR 8 +#define MAY_TRUNC 16 +#define MAY_LOCK 32 +#define MAY_OWNER_OVERRIDE 64 +#define MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC) +# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_OWNER_OVERRIDE." +#endif +#define MAY_CREATE (MAY_EXEC|MAY_WRITE) +#define MAY_REMOVE (MAY_EXEC|MAY_WRITE|MAY_TRUNC) + +/* + * Callback function for readdir + */ +struct readdir_cd { + int err; /* 0, nfserr, or nfserr_eof */ +}; +typedef int (*encode_dent_fn)(struct readdir_cd *, const char *, + int, loff_t, ino_t, unsigned int); +typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); + +extern struct svc_program nfsd_program; +extern struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; + +/* + * Function prototypes. + */ +int nfsd_svc(unsigned short port, int nrservs); +int nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp); + +/* nfsd/vfs.c */ +int fh_lock_parent(struct svc_fh *, struct dentry *); +int nfsd_racache_init(int); +void nfsd_racache_shutdown(void); +int nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, int, struct svc_fh *); +int nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); +int nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +int nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +int nfsd_create_v3(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, int *truncp); +int nfsd_commit(struct svc_rqst *, struct svc_fh *, + off_t, unsigned long); +#endif /* CONFIG_NFSD_V3 */ +int nfsd_open(struct svc_rqst *, struct svc_fh *, int, + int, struct file *); +void nfsd_close(struct file *); +int nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct iovec *,int, unsigned long *); +int nfsd_write(struct svc_rqst *, struct svc_fh *, + loff_t, struct iovec *,int, unsigned long, int *); +int nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +int nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, int plen, + struct svc_fh *res, struct iattr *); +int nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +int nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +int nfsd_remove(struct svc_rqst *, + struct svc_fh *, char *, int); +int nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +int nfsd_truncate(struct svc_rqst *, struct svc_fh *, + unsigned long size); +int nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, encode_dent_fn); +int nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct statfs *); + +int nfsd_notify_change(struct inode *, struct iattr *); 
+int nfsd_permission(struct svc_export *, struct dentry *, int); + + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +void nfs4_state_init(void); +void nfs4_state_shutdown(void); +#else +void static inline nfs4_state_init(void){} +void static inline nfs4_state_shutdown(void){} +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. + */ +#define nfs_ok __constant_htonl(NFS_OK) +#define nfserr_perm __constant_htonl(NFSERR_PERM) +#define nfserr_noent __constant_htonl(NFSERR_NOENT) +#define nfserr_io __constant_htonl(NFSERR_IO) +#define nfserr_nxio __constant_htonl(NFSERR_NXIO) +#define nfserr_eagain __constant_htonl(NFSERR_EAGAIN) +#define nfserr_acces __constant_htonl(NFSERR_ACCES) +#define nfserr_exist __constant_htonl(NFSERR_EXIST) +#define nfserr_xdev __constant_htonl(NFSERR_XDEV) +#define nfserr_nodev __constant_htonl(NFSERR_NODEV) +#define nfserr_notdir __constant_htonl(NFSERR_NOTDIR) +#define nfserr_isdir __constant_htonl(NFSERR_ISDIR) +#define nfserr_inval __constant_htonl(NFSERR_INVAL) +#define nfserr_fbig __constant_htonl(NFSERR_FBIG) +#define nfserr_nospc __constant_htonl(NFSERR_NOSPC) +#define nfserr_rofs __constant_htonl(NFSERR_ROFS) +#define nfserr_mlink __constant_htonl(NFSERR_MLINK) +#define nfserr_opnotsupp __constant_htonl(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong __constant_htonl(NFSERR_NAMETOOLONG) +#define nfserr_notempty __constant_htonl(NFSERR_NOTEMPTY) +#define nfserr_dquot __constant_htonl(NFSERR_DQUOT) +#define nfserr_stale __constant_htonl(NFSERR_STALE) +#define nfserr_remote __constant_htonl(NFSERR_REMOTE) +#define nfserr_wflush __constant_htonl(NFSERR_WFLUSH) +#define nfserr_badhandle __constant_htonl(NFSERR_BADHANDLE) +#define nfserr_notsync __constant_htonl(NFSERR_NOT_SYNC) +#define nfserr_badcookie __constant_htonl(NFSERR_BAD_COOKIE) +#define nfserr_notsupp __constant_htonl(NFSERR_NOTSUPP) +#define nfserr_toosmall __constant_htonl(NFSERR_TOOSMALL) +#define nfserr_serverfault __constant_htonl(NFSERR_SERVERFAULT) +#define nfserr_badtype __constant_htonl(NFSERR_BADTYPE) +#define nfserr_jukebox __constant_htonl(NFSERR_JUKEBOX) +#define nfserr_bad_cookie __constant_htonl(NFSERR_BAD_COOKIE) +#define nfserr_same __constant_htonl(NFSERR_SAME) +#define nfserr_clid_inuse __constant_htonl(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid __constant_htonl(NFSERR_STALE_CLIENTID) +#define nfserr_resource __constant_htonl(NFSERR_RESOURCE) +#define nfserr_nofilehandle __constant_htonl(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch __constant_htonl(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_symlink __constant_htonl(NFSERR_SYMLINK) +#define nfserr_not_same __constant_htonl(NFSERR_NOT_SAME) +#define nfserr_readdir_nospc __constant_htonl(NFSERR_READDIR_NOSPC) +#define nfserr_bad_xdr __constant_htonl(NFSERR_BAD_XDR) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit __constant_htonl(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof __constant_htonl(30001) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +/* + * Time of server startup + */ +extern struct timeval nfssvc_boot; + + +#ifdef CONFIG_NFSD_V4 + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. 
otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum amount of buffer space + * needed to encode an "ordinary" _successful_ operation. (GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum amount of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. + */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +#define NFSD_LEASE_TIME 60 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ACL (will be supported in a forthcoming patch) + * ARCHIVE (deprecated anyway) + * FS_LOCATIONS (will be supported eventually) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE) + +#define NFSD_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ +(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ +#define NFSD_WRITEABLE_ATTRS_WORD0 FATTR4_WORD0_SIZE +#define NFSD_WRITEABLE_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY_SET) + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* __KERNEL__ */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/tests/linux/nfsd-defines/patch b/tests/linux/nfsd-defines/patch new file mode 100644 index 0000000..506a370 --- /dev/null +++ b/tests/linux/nfsd-defines/patch @@ -0,0 +1,24 @@ +Status: trivial + +Fix errors with MAY_* value checking + +Typos and sillyness... 
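The broken guard is worth a line of explanation: the #if ORs all of the nfsd-private MAY_* bits together and masks them against the generic MAY_READ/MAY_WRITE/MAY_EXEC bits, so any accidental overlap becomes a build failure instead of a silent permission bug. The old line spelt one flag MAX_OWNER_OVERRIDE and listed MAY_OWNER_OVERRIDE on the wrong side of the mask. A minimal standalone sketch of the same compile-time pattern (the BASE_*/PRIV_* names and values below are illustrative only, not taken from the kernel header):

/* Compile-time check that private flag bits stay disjoint from a
 * base permission set; any overlap aborts the build with #error.
 */
#define BASE_READ   1
#define BASE_WRITE  2
#define BASE_EXEC   4

#define PRIV_SATTR  8   /* private bits must avoid 1|2|4 */
#define PRIV_TRUNC 16

#if (PRIV_SATTR | PRIV_TRUNC) & (BASE_READ | BASE_WRITE | BASE_EXEC)
# error "private flag bits collide with base permission bits"
#endif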
+ + ----------- Diffstat output ------------ + ./include/linux/nfsd/nfsd.h | 4 ++-- + 1 files changed, 2 insertions(+), 2 deletions(-) + +diff ./include/linux/nfsd/nfsd.h~current~ ./include/linux/nfsd/nfsd.h +--- ./include/linux/nfsd/nfsd.h~current~ 2003-04-17 10:31:15.000000000 +1000 ++++ ./include/linux/nfsd/nfsd.h 2003-04-17 10:31:08.000000000 +1000 +@@ -39,8 +39,8 @@ + #define MAY_LOCK 32 + #define MAY_OWNER_OVERRIDE 64 + #define MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +-#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAX_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_OWNER_OVERRIDE) +-# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_OWNER_OVERRIDE." ++#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC) ++# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_LOCAL_ACCESS or MAY_OWNER_OVERRIDE." + #endif + #define MAY_CREATE (MAY_EXEC|MAY_WRITE) + #define MAY_REMOVE (MAY_EXEC|MAY_WRITE|MAY_TRUNC) diff --git a/tests/linux/raid5/orig b/tests/linux/raid5/orig new file mode 100644 index 0000000..40204c9 --- /dev/null +++ b/tests/linux/raid5/orig @@ -0,0 +1,2079 @@ +/* + * raid5.c : Multiple Devices driver for Linux + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1999, 2000 Ingo Molnar + * + * RAID-5 management functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include +#include +#include +#include +#include +#include +#include + +/* + * Stripe cache + */ + +#define NR_STRIPES 256 +#define STRIPE_SIZE PAGE_SIZE +#define STRIPE_SECTORS (STRIPE_SIZE>>9) +#define IO_THRESHOLD 1 +#define HASH_PAGES 1 +#define HASH_PAGES_ORDER 0 +#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) +#define HASH_MASK (NR_HASH - 1) +#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / STRIPE_SECTORS) & HASH_MASK]) + +/* + * The following can be used to debug the driver + */ +#define RAID5_DEBUG 0 +#define RAID5_PARANOIA 1 +#if RAID5_PARANOIA && CONFIG_SMP +# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG() +#else +# define CHECK_DEVLOCK() +#endif + +#if RAID5_DEBUG +#define PRINTK(x...) printk(x) +#define inline +#define __inline__ +#else +#define PRINTK(x...) 
do { } while (0) +#endif + +static void print_raid5_conf (raid5_conf_t *conf); + +static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) +{ + if (atomic_dec_and_test(&sh->count)) { + if (!list_empty(&sh->lru)) + BUG(); + if (atomic_read(&conf->active_stripes)==0) + BUG(); + if (test_bit(STRIPE_HANDLE, &sh->state)) { + if (test_bit(STRIPE_DELAYED, &sh->state)) + list_add_tail(&sh->lru, &conf->delayed_list); + else + list_add_tail(&sh->lru, &conf->handle_list); + md_wakeup_thread(conf->thread); + } else { + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) + md_wakeup_thread(conf->thread); + } + list_add_tail(&sh->lru, &conf->inactive_list); + atomic_dec(&conf->active_stripes); + if (!conf->inactive_blocked || + atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4)) + wake_up(&conf->wait_for_stripe); + } + } +} +static void release_stripe(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + __release_stripe(conf, sh); + spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static void remove_hash(struct stripe_head *sh) +{ + PRINTK("remove_hash(), stripe %lu\n", sh->sector); + + if (sh->hash_pprev) { + if (sh->hash_next) + sh->hash_next->hash_pprev = sh->hash_pprev; + *sh->hash_pprev = sh->hash_next; + sh->hash_pprev = NULL; + } +} + +static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) +{ + struct stripe_head **shp = &stripe_hash(conf, sh->sector); + + PRINTK("insert_hash(), stripe %lu\n",sh->sector); + + CHECK_DEVLOCK(); + if ((sh->hash_next = *shp) != NULL) + (*shp)->hash_pprev = &sh->hash_next; + *shp = sh; + sh->hash_pprev = shp; +} + + +/* find an idle stripe, make sure it is unhashed, and return it. 
*/ +static struct stripe_head *get_free_stripe(raid5_conf_t *conf) +{ + struct stripe_head *sh = NULL; + struct list_head *first; + + CHECK_DEVLOCK(); + if (list_empty(&conf->inactive_list)) + goto out; + first = conf->inactive_list.next; + sh = list_entry(first, struct stripe_head, lru); + list_del_init(first); + remove_hash(sh); + atomic_inc(&conf->active_stripes); +out: + return sh; +} + +static void shrink_buffers(struct stripe_head *sh, int num) +{ + struct buffer_head *bh; + int i; + + for (i=0; i<num ; i++) { + bh = sh->bh_cache[i]; + if (!bh) + return; + sh->bh_cache[i] = NULL; + free_page((unsigned long) bh->b_data); + kfree(bh); + } +} + +static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority) +{ + struct buffer_head *bh; + int i; + + for (i=0; i<num; i++) { + struct page *page; + bh = kmalloc(sizeof(struct buffer_head), priority); + if (!bh) + return 1; + memset(bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); + if ((page = alloc_page(priority))) + bh->b_data = page_address(page); + else { + kfree(bh); + return 1; + } + atomic_set(&bh->b_count, 0); + bh->b_page = page; + sh->bh_cache[i] = bh; + + } + return 0; +} + +static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i); + +static inline void init_stripe(struct stripe_head *sh, unsigned long sector) +{ + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + + if (atomic_read(&sh->count) != 0) + BUG(); + if (test_bit(STRIPE_HANDLE, &sh->state)) + BUG(); + + CHECK_DEVLOCK(); + PRINTK("init_stripe called, stripe %lu\n", sh->sector); + + remove_hash(sh); + + sh->sector = sector; + sh->size = conf->buffer_size; + sh->state = 0; + + for (i=disks; i--; ) { + if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] || + buffer_locked(sh->bh_cache[i])) { + printk("sector=%lx i=%d %p %p %p %d\n", + sh->sector, i, sh->bh_read[i], + sh->bh_write[i], sh->bh_written[i], + buffer_locked(sh->bh_cache[i])); + BUG(); + } + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state); + raid5_build_block(sh, i); + } + insert_hash(conf, sh); +} + +/* the buffer size has changed, so unhash all stripes + * as active stripes complete, they will go onto inactive list + */ +static void shrink_stripe_cache(raid5_conf_t *conf) +{ + int i; + CHECK_DEVLOCK(); + if (atomic_read(&conf->active_stripes)) + BUG(); + for (i=0; i < NR_HASH; i++) { + struct stripe_head *sh; + while ((sh = conf->stripe_hashtbl[i])) + remove_hash(sh); + } +} + +static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector) +{ + struct stripe_head *sh; + + CHECK_DEVLOCK(); + PRINTK("__find_stripe, sector %lu\n", sector); + for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) + if (sh->sector == sector) + return sh; + PRINTK("__stripe %lu not in cache\n", sector); + return NULL; +} + +static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, + int pd_idx, int noblock) +{ + struct stripe_head *sh; + + PRINTK("get_stripe, sector %lu\n", sector); + + spin_lock_irq(&conf->device_lock); + + do { + sh = __find_stripe(conf, sector); + if (!sh) { + if (!conf->inactive_blocked) + sh = get_free_stripe(conf); + if (noblock && sh == NULL) + break; + if (!sh) { + conf->inactive_blocked = 1; + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list) && + (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) + || !conf->inactive_blocked), + conf->device_lock); + conf->inactive_blocked = 0; + } else + init_stripe(sh, sector, pd_idx); + } else { + if (atomic_read(&sh->count)) { + if (!list_empty(&sh->lru)) + BUG(); + } else { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + if (list_empty(&sh->lru)) + BUG(); +
list_del_init(&sh->lru); + } + } + } while (sh == NULL); + + if (sh) + atomic_inc(&sh->count); + + spin_unlock_irq(&conf->device_lock); + return sh; +} + +static int grow_stripes(raid5_conf_t *conf, int num) +{ + struct stripe_head *sh; + kmem_cache_t *sc; + int devs = conf->raid_disks; + + sprintf(conf->cache_name, "md/raid5-%d", conf->mddev->__minor); + + sc = kmem_cache_create(conf->cache_name, + sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), + 0, 0, NULL, NULL); + if (!sc) + return 1; + conf->slab_cache = sc; + while (num--) { + sh = kmem_cache_alloc(sc, GFP_KERNEL); + if (!sh) + return 1; + memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); + sh->raid_conf = conf; + sh->lock = SPIN_LOCK_UNLOCKED; + + if (grow_buffers(sh, conf->raid_disks)) { + shrink_buffers(sh, conf->raid_disks); + kmem_cache_free(sc, sh); + return 1; + } + /* we just created an active stripe so... */ + atomic_set(&sh->count, 1); + atomic_inc(&conf->active_stripes); + INIT_LIST_HEAD(&sh->lru); + release_stripe(sh); + } + return 0; +} + +static void shrink_stripes(raid5_conf_t *conf) +{ + struct stripe_head *sh; + + while (1) { + spin_lock_irq(&conf->device_lock); + sh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); + if (!sh) + break; + if (atomic_read(&sh->count)) + BUG(); + shrink_buffers(sh, conf->raid_disks); + kmem_cache_free(conf->slab_cache, sh); + atomic_dec(&conf->active_stripes); + } + kmem_cache_destroy(conf->slab_cache); + conf->slab_cache = NULL; +} + +static void raid5_end_read_request (struct bio * bi) +{ + struct stripe_head *sh = bi->bi_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + + for (i=0 ; i<disks; i++) + if (bi == &sh->dev[i].req) + break; + + PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate); + if (i == disks) { + BUG(); + return; + } + + if (uptodate) { +#if 0 + struct bio *bio; + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + /* we can return a buffer if we bypassed the cache or + * if the top buffer is not in highmem. If there are + * multiple buffers, leave the extra work to + * handle_stripe + */ + buffer = sh->bh_read[i]; + if (buffer && + (!PageHighMem(buffer->b_page) + || buffer->b_page == bh->b_page ) + ) { + sh->bh_read[i] = buffer->b_reqnext; + buffer->b_reqnext = NULL; + } else + buffer = NULL; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (sh->bh_page[i]==NULL) + set_bit(BH_Uptodate, &bh->b_state); + if (buffer) { + if (buffer->b_page != bh->b_page) + memcpy(buffer->b_data, bh->b_data, bh->b_size); + buffer->b_end_io(buffer, 1); + } + } else { + md_error(conf->mddev, bh->b_dev); + clear_bit(BH_Uptodate, &bh->b_state); + } + /* must restore b_page before unlocking buffer...
*/ + if (sh->bh_page[i]) { + bh->b_page = sh->bh_page[i]; + bh->b_data = page_address(bh->b_page); + sh->bh_page[i] = NULL; + clear_bit(BH_Uptodate, &bh->b_state); + } + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void raid5_end_write_request (struct buffer_head *bh, int uptodate) +{ + struct stripe_head *sh = bh->b_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + unsigned long flags; + + for (i=0 ; i<disks; i++) + if (bh == sh->bh_cache[i]) + break; + + PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate); + if (i == disks) { + BUG(); + return; + } + + spin_lock_irqsave(&conf->device_lock, flags); + if (!uptodate) + md_error(conf->mddev, bh->b_dev); + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); + spin_unlock_irqrestore(&conf->device_lock, flags); +} + + + +static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i) +{ + raid5_conf_t *conf = sh->raid_conf; + struct buffer_head *bh = sh->bh_cache[i]; + unsigned long block = sh->sector / (sh->size >> 9); + + init_buffer(bh, raid5_end_read_request, sh); + bh->b_dev = conf->disks[i].dev; + bh->b_blocknr = block; + + bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); + bh->b_size = sh->size; + bh->b_list = BUF_LOCKED; + return bh; +} + +static int error (mddev_t *mddev, kdev_t dev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; + struct disk_info *disk; + int i; + + PRINTK("raid5: error called\n"); + + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) { + if (disk->dev == dev) { + if (disk->operational) { + disk->operational = 0; + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + conf->working_disks--; + conf->failed_disks++; + md_wakeup_thread(conf->thread); + printk (KERN_ALERT + "raid5: Disk failure on %s, disabling device." + " Operation continuing on %d devices\n", + partition_name (dev), conf->working_disks); + } + return 0; + } + } + /* + * handle errors in spares (during reconstruction) + */ + if (conf->spare) { + disk = conf->spare; + if (disk->dev == dev) { + printk (KERN_ALERT + "raid5: Disk failure on spare %s\n", + partition_name (dev)); + if (!conf->spare->operational) { + /* probably a SET_DISK_FAULTY ioctl */ + return -EIO; + } + disk->operational = 0; + disk->write_only = 0; + conf->spare = NULL; + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + + return 0; + } + } + MD_BUG(); + return -EIO; +} + +/* + * Input: a 'big' sector number, + * Output: index of the data and parity disk, and the sector # in them.
+ */ +static unsigned long raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, + unsigned int data_disks, unsigned int * dd_idx, + unsigned int * pd_idx, raid5_conf_t *conf) +{ + sector_t stripe; + unsigned long chunk_number; + unsigned int chunk_offset; + sector_t new_sector; + int sectors_per_chunk = conf->chunk_size >> 9; + + /* First compute the information on this sector */ + + /* + * Compute the chunk number and the sector offset inside the chunk + */ + chunk_number = r_sector / sectors_per_chunk; + chunk_offset = r_sector % sectors_per_chunk; + + /* + * Compute the stripe number + */ + stripe = chunk_number / data_disks; + + /* + * Compute the data disk and parity disk indexes inside the stripe + */ + *dd_idx = chunk_number % data_disks; + + /* + * Select the parity disk based on the user selected algorithm. + */ + if (conf->level == 4) + *pd_idx = data_disks; + else switch (conf->algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + *pd_idx = data_disks - stripe % raid_disks; + if (*dd_idx >= *pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + *pd_idx = stripe % raid_disks; + if (*dd_idx >= *pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_LEFT_SYMMETRIC: + *pd_idx = data_disks - stripe % raid_disks; + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + *pd_idx = stripe % raid_disks; + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + break; + default: + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); + } + + /* + * Finally, compute the new sector number + */ + new_sector = stripe * sectors_per_chunk + chunk_offset; + return new_sector; +} + + +static sector_t compute_blocknr(struct stripe_head *sh, int i) +{ + raid5_conf_t *conf = sh->raid_conf; + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; + sector_t new_sector = sh->sector, check; + int sectors_per_chunk = conf->chunk_size >> 9; + sector_t stripe = new_sector / sectors_per_chunk; + int chunk_offset = new_sector % sectors_per_chunk; + int chunk_number, dummy1, dummy2, dd_idx = i; + sector_t r_sector; + + switch (conf->algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC: + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 1); + break; + default: + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); + } + + chunk_number = stripe * data_disks + i; + r_sector = chunk_number * sectors_per_chunk + chunk_offset; + + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); + if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { + printk("compute_blocknr: map not correct\n"); + return 0; + } + return r_sector; +} + + + +/* + * Copy data between a page in the stripe cache, and one or more bion + * The page could align with the middle of the bio, or there could be + * several bion, each with several bio_vecs, which cover part of the page + * Multiple bion are linked together on bi_next. There may be extras + * at the end of this list. We ignore them. 
+ */ +static void copy_data(int frombio, struct bio *bio, + struct page *page, + sector_t sector) +{ + char *pa = page_address(page); + struct bio_vec *bvl; + int i; + + for (;bio && bio->bi_sector < sector+STRIPE_SECTORS; + bio = bio->bi_next) { + int page_offset; + if (bio->bi_sector >= sector) + page_offset = (signed)(bio->bi_sector - sector) * 512; + else + page_offset = (signed)(sector - bio->bi_sector) * -512; + bio_for_each_segment(bvl, bio, i) { + char *ba = __bio_kmap(bio, i); + int len = bio_iovec_idx(bio,i)->bv_len; + int clen; + int b_offset = 0; + + if (page_offset < 0) { + b_offset = -page_offset; + page_offset += b_offset; + len -= b_offset; + } + + if (len > 0 && page_offset + len > STRIPE_SIZE) + clen = STRIPE_SIZE - page_offset; + else clen = len; + + if (len > 0) { + if (frombio) + memcpy(pa+page_offset, ba+b_offset, clen); + else + memcpy(ba+b_offset, pa+page_offset, clen); + } + __bio_kunmap(bio, i); + page_offset += len; + } + } +} + +#define check_xor() do { \ + if (count == MAX_XOR_BLOCKS) { \ + xor_block(count, STRIPE_SIZE, ptr); \ + count = 1; \ + } \ + } while(0) + + +static void compute_block(struct stripe_head *sh, int dd_idx) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh; + + PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); + + + memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_cache[dd_idx]; + count = 1; + for (i = disks ; i--; ) { + if (i == dd_idx) + continue; + bh = sh->bh_cache[i]; + if (buffer_uptodate(bh)) + bh_ptr[count++] = bh; + else + printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); + + check_xor(); + } + if (count != 1) + xor_block(count, bh_ptr); + set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state); +} + +static void compute_parity(struct stripe_head *sh, int method) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + struct buffer_head *chosen[MD_SB_DISKS]; + + PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); + memset(chosen, 0, sizeof(chosen)); + + count = 1; + bh_ptr[0] = sh->bh_cache[pd_idx]; + switch(method) { + case READ_MODIFY_WRITE: + if (!buffer_uptodate(sh->bh_cache[pd_idx])) + BUG(); + for (i=disks ; i-- ;) { + if (i==pd_idx) + continue; + if (sh->bh_write[i] && + buffer_uptodate(sh->bh_cache[i])) { + bh_ptr[count++] = sh->bh_cache[i]; + chosen[i] = sh->bh_write[i]; + sh->bh_write[i] = sh->bh_write[i]->b_reqnext; + chosen[i]->b_reqnext = sh->bh_written[i]; + sh->bh_written[i] = chosen[i]; + check_xor(); + } + } + break; + case RECONSTRUCT_WRITE: + memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size); + for (i= disks; i-- ;) + if (i!=pd_idx && sh->bh_write[i]) { + chosen[i] = sh->bh_write[i]; + sh->bh_write[i] = sh->bh_write[i]->b_reqnext; + chosen[i]->b_reqnext = sh->bh_written[i]; + sh->bh_written[i] = chosen[i]; + } + break; + case CHECK_PARITY: + break; + } + if (count>1) { + xor_block(count, bh_ptr); + count = 1; + } + + for (i = disks; i--;) + if (chosen[i]) { + struct buffer_head *bh = sh->bh_cache[i]; + char *bdata; + bdata = bh_kmap(chosen[i]); + memcpy(bh->b_data, + bdata,sh->size); + bh_kunmap(chosen[i]); + set_bit(BH_Lock, &bh->b_state); + mark_buffer_uptodate(bh, 1); + } + + switch(method) { + case RECONSTRUCT_WRITE: + case CHECK_PARITY: + for (i=disks; i--;) + if (i != pd_idx) { + bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); + } + break; + 
case READ_MODIFY_WRITE: + for (i = disks; i--;) + if (chosen[i]) { + bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); + } + } + if (count != 1) + xor_block(count, bh_ptr); + + if (method != CHECK_PARITY) { + mark_buffer_uptodate(sh->bh_cache[pd_idx], 1); + set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state); + } else + mark_buffer_uptodate(sh->bh_cache[pd_idx], 0); +} + +static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) +{ + struct buffer_head **bhp; + raid5_conf_t *conf = sh->raid_conf; + + PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector); + + + spin_lock(&sh->lock); + spin_lock_irq(&conf->device_lock); + bh->b_reqnext = NULL; + if (rw == READ) + bhp = &sh->bh_read[dd_idx]; + else + bhp = &sh->bh_write[dd_idx]; + while (*bhp) { + printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector); + bhp = & (*bhp)->b_reqnext; + } + *bhp = bh; + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + + PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx); +} + + + + + +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe and then examine the state of various bits + * to see what needs to be done. + * Possible results: + * return some read request which now have data + * return some write requests which are safely on disc + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + * + * Parity calculations are done inside the stripe lock + * buffers are taken off read_list or write_list, and bh_cache buffers + * get BH_Lock set before the stripe lock is released. + * + */ + +static void handle_stripe(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; + struct bio *return_bi= NULL; + struct bio *bi; + int action[MD_SB_DISKS]; + int i; + int syncing; + int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; + int failed_num=0; + struct r5dev *dev; + + PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx); + memset(action, 0, sizeof(action)); + + spin_lock(&sh->lock); + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + + syncing = test_bit(STRIPE_SYNCING, &sh->state); + /* Now to look around and see what can be done */ + + for (i=disks; i--; ) { + dev = &sh->dev[i]; + PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, + dev->flags, dev->toread, dev->towrite, dev->written); + /* maybe we can reply to a read */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { + struct bio *rbi, *rbi2; + PRINTK("Return read for disc %d\n", i); + spin_lock_irq(&conf->device_lock); + rbi = dev->toread; + dev->toread = NULL; + spin_unlock_irq(&conf->device_lock); + while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { + copy_data(0, rbi, dev->page, dev->sector); + rbi2 = rbi->bi_next; + spin_lock_irq(&conf->device_lock); + if (--rbi->bi_phys_segments == 0) { + rbi->bi_next = return_bi; + return_bi = rbi; + } + spin_unlock_irq(&conf->device_lock); + rbi = rbi2; + } + } + + /* now count some things */ + if (test_bit(R5_LOCKED, &dev->flags)) locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + + + if (dev->toread) to_read++; + if (dev->towrite) to_write++; + if (dev->written) written++; + if (!conf->disks[i].operational) { + failed++; + failed_num = i; + } + } + PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d 
failed_num=%d\n", + locked, uptodate, to_read, to_write, failed, failed_num); + /* check if the array has lost two devices and, if so, some requests might + * need to be failed + */ + if (failed > 1 && to_read+to_write) { + spin_lock_irq(&conf->device_lock); + for (i=disks; i--; ) { + /* fail all writes first */ + bi = sh->dev[i].towrite; + sh->dev[i].towrite = NULL; + if (bi) to_write--; + + while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ + struct bio *nextbi = bi->bi_next; + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (--bi->bi_phys_segments == 0) { + bi->bi_next = return_bi; + return_bi = bi; + } + bi = nextbi; + } + /* fail any reads if this device is non-operational */ + if (!conf->disks[i].operational) { + bi = sh->dev[i].toread; + sh->dev[i].toread = NULL; + if (bi) to_read--; + while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ + struct bio *nextbi = bi->bi_next; + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (--bi->bi_phys_segments == 0) { + bi->bi_next = return_bi; + return_bi = bi; + } + bi = nextbi; + } + } + } + spin_unlock_irq(&conf->device_lock); + } + if (failed > 1 && syncing) { + md_done_sync(conf->mddev, STRIPE_SECTORS,0); + clear_bit(STRIPE_SYNCING, &sh->state); + syncing = 0; + } + + /* might be able to return some write requests if the parity block + * is safe, or on a failed drive + */ + dev = &sh->dev[sh->pd_idx]; + if ( written && + ( (conf->disks[sh->pd_idx].operational && !test_bit(R5_LOCKED, &dev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) + || (failed == 1 && failed_num == sh->pd_idx)) + ) { + /* any written block on an uptodate or failed drive can be returned */ + for (i=disks; i--; ) + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!conf->disks[sh->pd_idx].operational || + (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) ) { + /* maybe we can return some write requests */ + struct bio *wbi, *wbi2; + PRINTK("Return write for disc %d\n", i); + wbi = dev->written; + dev->written = NULL; + while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + wbi2 = wbi->bi_next; + if (--wbi->bi_phys_segments == 0) { + wbi->bi_next = return_bi; + return_bi = wbi; + } + wbi = wbi2; + } + } + } + } + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + */ + if (to_read || (syncing && (uptodate+failed < disks))) { + for (i=disks; i--;) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || syncing || (failed && sh->dev[failed_num].toread))) { + /* we would like to get this block, possibly + * by computing it, but we might not be able to + */ + if (uptodate == disks-1) { + PRINTK("Computing block %d\n", i); + compute_block(sh, i); + uptodate++; + } else if (conf->disks[i].operational) { + set_bit(BH_Lock, &bh->b_state); + action[i] = READ+1; + /* if I am just reading this block and we don't have + a failed drive, or any pending writes then sidestep the cache */ + if (sh->bh_page[i]) BUG(); + if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && + ! 
syncing && !failed && !to_write) { + sh->bh_page[i] = sh->bh_cache[i]->b_page; + sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; + sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; + } + locked++; + PRINTK("Reading block %d (sync=%d)\n", i, syncing); + if (syncing) + md_sync_acct(conf->disks[i].dev, bh->b_size>>9); + } + } + } + set_bit(STRIPE_HANDLE, &sh->state); + } + + /* now to consider writing and what else, if anything should be read */ + if (to_write) { + int rmw=0, rcw=0; + for (i=disks ; i--;) { + /* would I have to read this buffer for read_modify_write */ + dev = &sh->dev[i]; + if ((dev->towrite || i == sh->pd_idx) && + (!test_bit(R5_LOCKED, &dev->flags) +#if 0 +|| sh->bh_page[i]!=bh->b_page +#endif + ) && + !test_bit(R5_UPTODATE, &dev->flags)) { + if (conf->disks[i].operational +/* && !(conf->resync_parity && i == sh->pd_idx) */ + ) + rmw++; + else rmw += 2*disks; /* cannot read it */ + } + /* Would I have to read this buffer for reconstruct_write */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && + (!test_bit(R5_LOCKED, &dev->flags) +#if 0 +|| sh->bh_page[i] != bh->b_page +#endif + ) && + !test_bit(R5_UPTODATE, &dev->flags)) { + if (conf->disks[i].operational) rcw++; + else rcw += 2*disks; + } + } + PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw); + set_bit(STRIPE_HANDLE, &sh->state); + if (rmw < rcw && rmw > 0) + /* prefer read-modify-write, but need to get some data */ + for (i=disks; i--;) { + bh = sh->bh_cache[i]; + if ((sh->bh_write[i] || i == sh->pd_idx) && + !buffer_locked(bh) && !buffer_uptodate(bh) && + conf->disks[i].operational) { + if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + { + PRINTK("Read_old block %d for r-m-w\n", i); + set_bit(BH_Lock, &bh->b_state); + action[i] = READ+1; + locked++; + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + if (rcw <= rmw && rcw > 0) + /* want reconstruct write, but need to get some data */ + for (i=disks; i--;) { + bh = sh->bh_cache[i]; + if (!sh->bh_write[i] && i != sh->pd_idx && + !buffer_locked(bh) && !buffer_uptodate(bh) && + conf->disks[i].operational) { + if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + { + PRINTK("Read_old block %d for Reconstruct\n", i); + set_bit(BH_Lock, &bh->b_state); + action[i] = READ+1; + locked++; + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + /* now if nothing is locked, and if we have enough data, we can start a write request */ + if (locked == 0 && (rcw == 0 ||rmw == 0)) { + PRINTK("Computing parity...\n"); + compute_parity(sh, rcw==0 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); + /* now every locked buffer is ready to be written */ + for (i=disks; i--;) + if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { + PRINTK("Writing block %d\n", i); + locked++; + action[i] = WRITE+1; + if (!conf->disks[i].operational + || (i==sh->pd_idx && failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); + } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) + md_wakeup_thread(conf->thread); + } + } + } + + /* maybe we need to check and possibly fix the parity for this stripe + * Any reads will already have been scheduled, so we just see if enough data + * is available + */ + if (syncing && locked == 0 && + !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) { + set_bit(STRIPE_HANDLE, &sh->state); + if (failed == 0) { + char *pagea; + if (uptodate != disks) + BUG(); + compute_parity(sh, CHECK_PARITY); + uptodate--; + pagea = page_address(sh->dev[sh->pd_idx].page); + if ((*(u32*)pagea) == 0 && + !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { + /* parity is correct (on disc, not in buffer any more) */ + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (!test_bit(STRIPE_INSYNC, &sh->state)) { + struct disk_info *spare; + if (failed==0) + failed_num = sh->pd_idx; + /* should be able to compute the missing block and write it to spare */ + if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) { + if (uptodate+1 != disks) + BUG(); + compute_block(sh, failed_num); + uptodate++; + } + if (uptodate != disks) + BUG(); + bh = sh->bh_cache[failed_num]; + set_bit(BH_Lock, &bh->b_state); + action[failed_num] = WRITE+1; + locked++; + set_bit(STRIPE_INSYNC, &sh->state); + if (conf->disks[failed_num].operational) + md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9); + else if ((spare=conf->spare)) + md_sync_acct(spare->dev, bh->b_size>>9); + + } + } + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { + md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1); + clear_bit(STRIPE_SYNCING, &sh->state); + } + + + spin_unlock(&sh->lock); + + while ((bh=return_ok)) { + return_ok = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, 1); + } + while ((bh=return_fail)) { + return_fail = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, 0); + } + for (i=disks; i-- ;) + if (action[i]) { + struct buffer_head *bh = sh->bh_cache[i]; + struct disk_info *spare = conf->spare; + int skip = 0; + if (action[i] == READ+1) + bh->b_end_io = raid5_end_read_request; + else + bh->b_end_io = raid5_end_write_request; + if (conf->disks[i].operational) + bh->b_dev = conf->disks[i].dev; + else if (spare && action[i] == WRITE+1) + bh->b_dev = spare->dev; + else skip=1; + if (!skip) { + PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i); + atomic_inc(&sh->count); + bh->b_rdev = bh->b_dev; + bh->b_rsector = bh->b_blocknr * (bh->b_size>>9); + generic_make_request(action[i]-1, bh); + } else { + PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector); + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +} + +static inline void raid5_activate_delayed(raid5_conf_t *conf) +{ + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { + while (!list_empty(&conf->delayed_list)) { + struct list_head *l = conf->delayed_list.next; + struct stripe_head *sh; + sh = list_entry(l, struct stripe_head, lru); + list_del_init(l); + clear_bit(STRIPE_DELAYED, &sh->state); + if 
(!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + list_add_tail(&sh->lru, &conf->handle_list); + } + } +} +static void raid5_unplug_device(void *data) +{ + raid5_conf_t *conf = (raid5_conf_t *)data; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + raid5_activate_delayed(conf); + + conf->plugged = 0; + md_wakeup_thread(conf->thread); + + spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static inline void raid5_plug_device(raid5_conf_t *conf) +{ + spin_lock_irq(&conf->device_lock); + if (list_empty(&conf->delayed_list)) + if (!conf->plugged) { + conf->plugged = 1; + queue_task(&conf->plug_tq, &tq_disk); + } + spin_unlock_irq(&conf->device_lock); +} + +static int make_request (mddev_t *mddev, int rw, struct bio * bi) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + const unsigned int raid_disks = conf->raid_disks; + const unsigned int data_disks = raid_disks - 1; + unsigned int dd_idx, pd_idx; + sector_t new_sector; + sector_t logical_sector, last_sector; + int read_ahead = 0; + + struct stripe_head *sh; + + if (rw == READA) { + rw = READ; + read_ahead=1; + } + + new_sector = raid5_compute_sector(bh->b_rsector, + raid_disks, data_disks, &dd_idx, &pd_idx, conf); + + PRINTK("raid5: make_request, sector %lu\n", new_sector); + sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead); + if (sh) { + sh->pd_idx = pd_idx; + + add_stripe_bh(sh, bh, dd_idx, rw); + + raid5_plug_device(conf); + handle_stripe(sh); + release_stripe(sh); + } else + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); + return 0; +} + +/* + * Determine correct block size for this device. + */ +unsigned int device_bsize (kdev_t dev) +{ + unsigned int i, correct_size; + + correct_size = BLOCK_SIZE; + if (blksize_size[MAJOR(dev)]) { + i = blksize_size[MAJOR(dev)][MINOR(dev)]; + if (i) + correct_size = i; + } + + return correct_size; +} + +static int sync_request (mddev_t *mddev, unsigned long sector_nr) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct stripe_head *sh; + int sectors_per_chunk = conf->chunk_size >> 9; + unsigned long stripe = sector_nr/sectors_per_chunk; + int chunk_offset = sector_nr % sectors_per_chunk; + int dd_idx, pd_idx; + unsigned long first_sector; + int raid_disks = conf->raid_disks; + int data_disks = raid_disks-1; + + first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk + + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); + sh = get_active_stripe(conf, sector_nr, pd_idx, 0); + spin_lock(&sh->lock); + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + spin_unlock(&sh->lock); + + handle_stripe(sh); + release_stripe(sh); + + return STRIPE_SECTORS; +} + +/* + * This is our raid5 kernel thread. + * + * We scan the hash table for stripes which can be handled now. + * During the scan, completed stripes are saved for us by the interrupt + * handler, so that they will not have to wait for our next wakeup. 
+ */ +static void raid5d (void *data) +{ + struct stripe_head *sh; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + int handled; + + PRINTK("+++ raid5d active\n"); + + handled = 0; + + if (mddev->sb_dirty) + md_update_sb(mddev); + spin_lock_irq(&conf->device_lock); + while (1) { + struct list_head *first; + + if (list_empty(&conf->handle_list) && + atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && + !conf->plugged && + !list_empty(&conf->delayed_list)) + raid5_activate_delayed(conf); + + if (list_empty(&conf->handle_list)) + break; + + first = conf->handle_list.next; + sh = list_entry(first, struct stripe_head, lru); + + list_del_init(first); + atomic_inc(&sh->count); + if (atomic_read(&sh->count)!= 1) + BUG(); + spin_unlock_irq(&conf->device_lock); + + handled++; + handle_stripe(sh); + release_stripe(sh); + + spin_lock_irq(&conf->device_lock); + } + PRINTK("%d stripes handled\n", handled); + + spin_unlock_irq(&conf->device_lock); + + PRINTK("--- raid5d inactive\n"); +} + +/* + * Private kernel thread for parity reconstruction after an unclean + * shutdown. Reconstruction on spare drives in case of a failed drive + * is done by the generic mdsyncd. + */ +static void raid5syncd (void *data) +{ + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_parity) + return; + if (conf->resync_parity == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev,NULL)) { + up(&mddev->recovery_sem); + printk("raid5: resync aborted!\n"); + return; + } + conf->resync_parity = 0; + up(&mddev->recovery_sem); + printk("raid5: resync finished.\n"); +} + +static int run (mddev_t *mddev) +{ + raid5_conf_t *conf; + int i, j, raid_disk, memory; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *desc; + mdk_rdev_t *rdev; + struct disk_info *disk; + struct list_head *tmp; + int start_recovery = 0; + + MOD_INC_USE_COUNT; + + if (sb->level != 5 && sb->level != 4) { + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level); + MOD_DEC_USE_COUNT; + return -EIO; + } + + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL); + if ((conf = mddev->private) == NULL) + goto abort; + memset (conf, 0, sizeof (*conf)); + conf->mddev = mddev; + + if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + goto abort; + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + + conf->device_lock = SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_for_stripe); + INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->inactive_list); + atomic_set(&conf->active_stripes, 0); + atomic_set(&conf->preread_active_stripes, 0); + + conf->plugged = 0; + conf->plug_tq.sync = 0; + conf->plug_tq.routine = &raid5_unplug_device; + conf->plug_tq.data = conf; + + PRINTK("raid5: run(md%d) called.\n", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * This is important -- we are using the descriptor on + * the disk only to get a pointer to the descriptor on + * the main superblock, which might be more recent. 
+ */ + desc = sb->disks + rdev->desc_nr; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc)) { + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev)); + if (!rdev->faulty) { + MD_BUG(); + goto abort; + } + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->bdev = rdev->bdev; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + continue; + } + if (disk_active(desc)) { + if (!disk_sync(desc)) { + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev)); + MD_BUG(); + goto abort; + } + if (raid_disk > sb->raid_disks) { + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev)); + continue; + } + if (disk->operational) { + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk); + continue; + } + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk); + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->bdev = rdev->bdev; + disk->operational = 1; + disk->used_slot = 1; + + conf->working_disks++; + } else { + /* + * Must be a spare disk .. + */ + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev)); + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->bdev = rdev->bdev; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = sb->disks + i; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) && + !conf->disks[raid_disk].used_slot) { + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + conf->raid_disks = sb->raid_disks; + /* + * 0 for a fully functional array, 1 for a degraded array. 
+ */ + conf->failed_disks = conf->raid_disks - conf->working_disks; + conf->mddev = mddev; + conf->chunk_size = sb->chunk_size; + conf->level = sb->level; + conf->algorithm = sb->layout; + conf->max_nr_stripes = NR_STRIPES; + +#if 0 + for (i = 0; i < conf->raid_disks; i++) { + if (!conf->disks[i].used_slot) { + MD_BUG(); + goto abort; + } + } +#endif + if (!conf->chunk_size || conf->chunk_size % 4) { + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev)); + goto abort; + } + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev)); + goto abort; + } + if (conf->failed_disks > 1) { + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks); + goto abort; + } + + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; + } + + { + const char * name = "raid5d"; + + conf->thread = md_register_thread(raid5d, conf, name); + if (!conf->thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } + } + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes)) { + printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); + shrink_stripes(conf); + goto abort; + } else + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev)); + + /* + * Regenerate the "device is in sync with the raid set" bit for + * each device. + */ + for (i = 0; i < MD_SB_DISKS ; i++) { + mark_disk_nonsync(sb->disks + i); + for (j = 0; j < sb->raid_disks; j++) { + if (!conf->disks[j].operational) + continue; + if (sb->disks[i].number == conf->disks[j].number) + mark_disk_sync(sb->disks + i); + } + } + sb->active_disks = conf->working_disks; + + if (sb->active_disks == sb->raid_disks) + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); + else + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char * name = "raid5syncd"; + + conf->resync_thread = md_register_thread(raid5syncd, conf,name); + if (!conf->resync_thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } + + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev)); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); + } + + print_raid5_conf(conf); + if (start_recovery) + md_recover_arrays(); + print_raid5_conf(conf); + + /* Ok, everything is just fine now */ + return (0); +abort: + if (conf) { + print_raid5_conf(conf); + if (conf->stripe_hashtbl) + free_pages((unsigned long) conf->stripe_hashtbl, + HASH_PAGES_ORDER); + kfree(conf); + } + mddev->private = NULL; + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev)); + MOD_DEC_USE_COUNT; + return -EIO; +} + +static int stop_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + mdk_thread_t *thread = conf->resync_thread; + + if (thread) { + if 
(conf->resync_parity) { + conf->resync_parity = 2; + md_interrupt_thread(thread); + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int restart_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_parity) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + printk("raid5: waking up raid5resync.\n"); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } else + printk("raid5: no restart-resync needed.\n"); + return 0; +} + + +static int stop (mddev_t *mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + md_unregister_thread(conf->thread); + shrink_stripes(conf); + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf); + mddev->private = NULL; + MOD_DEC_USE_COUNT; + return 0; +} + +#if RAID5_DEBUG +static void print_sh (struct stripe_head *sh) +{ + int i; + + printk("sh %lu, pd_idx %d, state %ld.\n", sh->sector, sh->pd_idx, sh->state); + printk("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count)); + printk("sh %lu, ", sh->sector); + for (i = 0; i < sh->raid_conf->raid_disks; i++) { + printk("(cache%d: %p %ld) ", i, sh->dev[i].page, sh->dev[i].flags); + } + printk("\n"); +} + +static void printall (raid5_conf_t *conf) +{ + struct stripe_head *sh; + int i; + + spin_lock_irq(&conf->device_lock); + for (i = 0; i < NR_HASH; i++) { + sh = conf->stripe_hashtbl[i]; + for (; sh; sh = sh->hash_next) { + if (sh->raid_conf != conf) + continue; + print_sh(sh); + } + } + spin_unlock_irq(&conf->device_lock); + + PRINTK("--- raid5d inactive\n"); +} +#endif + +static void status (struct seq_file *seq, mddev_t *mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; + int i; + + seq_printf (seq, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout); + seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + seq_printf (seq, "%s", conf->disks[i].operational ? "U" : "_"); + seq_printf (seq, "]"); +#if RAID5_DEBUG +#define D(x) \ + seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) + printall(conf); +#endif + +} + +static void print_raid5_conf (raid5_conf_t *conf) +{ + int i; + struct disk_info *tmp; + + printk("RAID5 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, + conf->working_disks, conf->failed_disks); + +#if RAID5_DEBUG + for (i = 0; i < MD_SB_DISKS; i++) { +#else + for (i = 0; i < conf->working_disks+conf->failed_disks; i++) { +#endif + tmp = conf->disks + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static int diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid5_conf_t *conf = mddev->private; + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + mdk_rdev_t *spare_rdev, *failed_rdev; + + print_raid5_conf(conf); + spin_lock_irq(&conf->device_lock); + /* + * find the disk ... 
+ */ + switch (state) { + + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID5 configuration ... + * (this can only be in the first conf->raid_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->disks + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + if (conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + conf->spare = sdisk; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->disks + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + /* + * Was the spare being resynced? + */ + if (conf->spare == sdisk) + conf->spare = NULL; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->raid_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + if (!conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + fdisk = conf->disks + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + spare_rdev = find_rdev_nr(mddev, spare_desc->number); + failed_rdev = find_rdev_nr(mddev, failed_desc->number); + + /* There must be a spare_rdev, but there may not be a + * failed_rdev. That slot might be empty... 
+ */ + spare_rdev->desc_nr = failed_desc->number; + if (failed_rdev) + failed_rdev->desc_nr = spare_desc->number; + + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. + */ + conf->failed_disks--; + conf->working_disks++; + conf->spare = NULL; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->disks + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->disks + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + spin_unlock_irq(&conf->device_lock); + print_raid5_conf(conf); + return err; +} + +static mdk_personality_t raid5_personality= +{ + name: "raid5", + make_request: make_request, + run: run, + stop: stop, + status: status, + error_handler: error, + diskop: diskop, + stop_resync: stop_resync, + restart_resync: restart_resync, + sync_request: sync_request +}; + +static int __init raid5_init (void) +{ + return register_md_personality (RAID5, &raid5_personality); +} + +static void raid5_exit (void) +{ + unregister_md_personality (RAID5); +} + +module_init(raid5_init); +module_exit(raid5_exit); +MODULE_LICENSE("GPL"); diff --git a/tests/linux/raid5/patch b/tests/linux/raid5/patch new file mode 100644 index 0000000..d149229 --- /dev/null +++ b/tests/linux/raid5/patch @@ -0,0 +1,962 @@ +*************** +*** 142,188 **** + + static void shrink_buffers(struct stripe_head *sh, int num) + { +- struct buffer_head *bh; + int i; + + for (i=0; i<num ; i++) { +- bh = sh->bh_cache[i]; +- if (!bh) +- return; +- sh->bh_cache[i] = NULL; +- free_page((unsigned long) bh->b_data); +- kfree(bh); + } + } + +- static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority) + { +- struct buffer_head *bh; + int i; + + for (i=0; i<num ; i++) { +- struct page *page; +- bh = kmalloc(sizeof(struct buffer_head), priority); +- if (!bh) +- return 1; +- memset(bh, 0, sizeof (struct buffer_head)); +- init_waitqueue_head(&bh->b_wait); +- if ((page = alloc_page(priority))) +- bh->b_data = page_address(page); +- else { +- kfree(bh); + return 1; + } +- atomic_set(&bh->b_count, 0); +- bh->b_page = page; +- sh->bh_cache[i] = bh; +- + } + return 0; + } + +- static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i); + +- static inline void init_stripe(struct stripe_head *sh, unsigned long sector) + { + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; +--- 142,177 ---- + + static void shrink_buffers(struct stripe_head *sh, int num) + { ++ struct page *p; + int i; + + for (i=0; i<num ; i++) { ++ p = sh->dev[i].page; ++ if (!p) ++ continue; ++ sh->dev[i].page = NULL; ++ page_cache_release(p); + } + } +
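The old and new shrink_buffers() above show the direction of the whole conversion: the old code paired a kmalloc'ed buffer_head with a data page in every cache slot, while the new code keeps nothing but the bare page, so setup and teardown collapse to an alloc/release pair per slot. A minimal userspace sketch of that shape follows; the names, the fixed 4096-byte size, and the malloc/free stand-ins for alloc_page()/page_cache_release() are all illustrative, not kernel API:

    #include <stdlib.h>

    /* Allocate one bare page-sized buffer per slot. On failure, undo the
     * partial allocation here; the kernel code instead leaves cleanup to
     * its caller, which runs the shrink pass. */
    static int grow(void *pages[], int num)
    {
        int i;
        for (i = 0; i < num; i++) {
            pages[i] = malloc(4096);
            if (!pages[i]) {
                while (i--) {
                    free(pages[i]);
                    pages[i] = NULL;
                }
                return 1;
            }
        }
        return 0;
    }

    /* Counterpart of the new shrink_buffers(): free(NULL) is a no-op,
     * which covers the empty-slot check in the original. */
    static void shrink(void *pages[], int num)
    {
        int i;
        for (i = 0; i < num; i++) {
            free(pages[i]);
            pages[i] = NULL;
        }
    }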
++ static int grow_buffers(struct stripe_head *sh, int num) + { + int i; + + for (i=0; i<num ; i++) { ++ struct page *page; ++ if (!(page = alloc_page(GFP_KERNEL))) { + return 1; + } ++ sh->dev[i].page = page; + } + return 0; + } + ++ static void raid5_build_block (struct stripe_head *sh, int i); + ++ static inline void init_stripe(struct stripe_head *sh, unsigned long sector, int pd_idx) + { + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; +*************** +*** 198,237 **** + remove_hash(sh); + + sh->sector = sector; +- sh->size = conf->buffer_size; + sh->state = 0; + + for (i=disks; i--; ) { +- if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] || +- buffer_locked(sh->bh_cache[i])) { + printk("sector=%lx i=%d %p %p %p %d\n", +- sh->sector, i, sh->bh_read[i], +- sh->bh_write[i], sh->bh_written[i], +- buffer_locked(sh->bh_cache[i])); + BUG(); + } +- clear_buffer_uptodate(sh->bh_cache[i]); + raid5_build_block(sh, i); + } + insert_hash(conf, sh); + } + +- /* the buffer size has changed, so unhash all stripes +- * as active stripes complete, they will go onto inactive list +- */ +- static void shrink_stripe_cache(raid5_conf_t *conf) +- { +- int i; +- CHECK_DEVLOCK(); +- if (atomic_read(&conf->active_stripes)) +- BUG(); +- for (i=0; i < NR_HASH; i++) { +- struct stripe_head *sh; +- while ((sh = conf->stripe_hashtbl[i])) +- remove_hash(sh); +- } +- } +- + static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector) + { + struct stripe_head *sh; +--- 187,212 ---- + remove_hash(sh); + + sh->sector = sector; ++ sh->pd_idx = pd_idx; + sh->state = 0; + + for (i=disks; i--; ) { ++ struct r5dev *dev = &sh->dev[i]; ++ ++ if (dev->toread || dev->towrite || dev->written || ++ test_bit(R5_LOCKED, &dev->flags)) { + printk("sector=%lx i=%d %p %p %p %d\n", ++ sh->sector, i, dev->toread, ++ dev->towrite, dev->written, ++ test_bit(R5_LOCKED, &dev->flags)); + BUG(); + } ++ dev->flags = 0; + raid5_build_block(sh, i); + } + insert_hash(conf, sh); + } + + static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector) + { + struct stripe_head *sh;
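init_stripe() in the hunk above re-keys a recycled stripe_head by sector and re-inserts it into the hash table that __find_stripe() searches: a plain chained hash indexed by sector. A toy model of the lookup side, with invented names and an invented bucket count:

    #include <stddef.h>

    #define NR_BUCKETS 256          /* stands in for NR_HASH */

    struct stripe {                 /* toy stand-in for struct stripe_head */
        unsigned long sector;
        struct stripe *hash_next;
    };

    /* Same shape as __find_stripe(): pick a bucket from the sector, then
     * walk the singly linked chain looking for an exact match. */
    static struct stripe *find_stripe(struct stripe *tbl[], unsigned long sector)
    {
        struct stripe *sh;

        for (sh = tbl[sector % NR_BUCKETS]; sh != NULL; sh = sh->hash_next)
            if (sh->sector == sector)
                return sh;
        return NULL;
    }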
+*************** +*** 410,447 **** + } else + buffer = NULL; + spin_unlock_irqrestore(&conf->device_lock, flags); +- if (sh->bh_page[i]==NULL) + set_buffer_uptodate(bh); + if (buffer) { + if (buffer->b_page != bh->b_page) + memcpy(buffer->b_data, bh->b_data, bh->b_size); + buffer->b_end_io(buffer, 1); + } + } else { +- md_error(conf->mddev, bh->b_bdev); +- clear_buffer_uptodate(bh); + } + /* must restore b_page before unlocking buffer... */ +- if (sh->bh_page[i]) { + bh->b_page = sh->bh_page[i]; + bh->b_data = page_address(bh->b_page); +- sh->bh_page[i] = NULL; + clear_buffer_uptodate(bh); + } +- clear_buffer_locked(bh); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + } + +- static void raid5_end_write_request (struct buffer_head *bh, int uptodate) + { +- struct stripe_head *sh = bh->b_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + unsigned long flags; + + for (i=0 ; i<disks; i++) +- if (bh == sh->bh_cache[i]) + break; + + PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate); +--- 361,403 ---- + } else + buffer = NULL; + spin_unlock_irqrestore(&conf->device_lock, flags); ++ if (sh->bh_page[i]==bh->b_page) + set_buffer_uptodate(bh); + if (buffer) { + if (buffer->b_page != bh->b_page) + memcpy(buffer->b_data, bh->b_data, bh->b_size); + buffer->b_end_io(buffer, 1); + } ++ #else ++ set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ #endif + } else { ++ md_error(conf->mddev, bi->bi_bdev); ++ clear_bit(R5_UPTODATE, &sh->dev[i].flags); + } ++ #if 0 + /* must restore b_page before unlocking buffer... */ ++ if (sh->bh_page[i] != bh->b_page) { + bh->b_page = sh->bh_page[i]; + bh->b_data = page_address(bh->b_page); + clear_buffer_uptodate(bh); + } ++ #endif ++ clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + } + ++ static void raid5_end_write_request (struct bio *bi) + { ++ struct stripe_head *sh = bi->bi_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + unsigned long flags; ++ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + + for (i=0 ; i<disks; i++) ++ if (bi == &sh->dev[i].req) + break; + + PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate); +*************** +*** 452,480 **** + + spin_lock_irqsave(&conf->device_lock, flags); + if (!uptodate) +- md_error(conf->mddev, bh->b_bdev); +- clear_buffer_locked(bh); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); + spin_unlock_irqrestore(&conf->device_lock, flags); + } +- + + +- static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i) + { + raid5_conf_t *conf = sh->raid_conf; +- struct buffer_head *bh = sh->bh_cache[i]; +- unsigned long block = sh->sector / (sh->size >> 9); + +- init_buffer(bh, raid5_end_read_request, sh); +- bh->b_dev = conf->disks[i].dev; +- /* FIXME - later we will need bdev here */ +- bh->b_blocknr = block; +- +- bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); +- bh->b_size = sh->size; +- return bh; + } + + static int error (mddev_t *mddev, kdev_t dev) +--- 408,443 ---- + + spin_lock_irqsave(&conf->device_lock, flags); + if (!uptodate) ++ md_error(conf->mddev, bi->bi_bdev); ++ ++ clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); + spin_unlock_irqrestore(&conf->device_lock, flags); + } + + ++ static unsigned long compute_blocknr(struct stripe_head *sh, int i); ++ ++ static void raid5_build_block (struct stripe_head *sh, int i) + { + raid5_conf_t *conf = sh->raid_conf; ++ struct r5dev *dev = &sh->dev[i]; + ++ bio_init(&dev->req); ++ dev->req.bi_io_vec = &dev->vec; ++ dev->req.bi_vcnt++; ++ dev->vec.bv_page = dev->page; ++ dev->vec.bv_len = STRIPE_SIZE; ++ dev->vec.bv_offset = 0; + ++ dev->req.bi_bdev = conf->disks[i].bdev; ++ dev->req.bi_sector = sh->sector; ++ dev->req.bi_private = sh; ++ ++ dev->flags = 0; ++ if (i != sh->pd_idx) ++ dev->sector = compute_blocknr(sh, i); + } +
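The raid5_build_block() rewrite just above is the core trick of the bh-to-bio conversion: every per-disk slot carries its own single-segment bio (req) and bio_vec (vec), initialised once when the stripe is set up and then reused for each transfer, so the I/O path never allocates a request. A rough userspace analogue of that layout; all types and names here are invented, not kernel API:

    /* One reusable single-segment request embedded in each disk slot. */
    struct vec { void *page; unsigned int len, offset; };
    struct req { struct vec *io_vec; int vcnt; unsigned long sector; void *owner; };

    struct slot {
        struct req req;         /* plays the role of r5dev.req */
        struct vec vec;         /* plays the role of r5dev.vec */
        void *page;
    };

    static void build_slot(struct slot *d, unsigned long sector, void *owner)
    {
        d->vec.page = d->page;  /* the one data segment */
        d->vec.len = 4096;      /* STRIPE_SIZE in the original */
        d->vec.offset = 0;

        d->req.io_vec = &d->vec;
        d->req.vcnt = 1;
        d->req.sector = sector;
        d->req.owner = owner;   /* lets completion find the stripe */
    }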
+ static int error (mddev_t *mddev, kdev_t dev) +*************** +*** 661,748 **** + { + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; +- struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh; + + PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); + +- +- memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size); +- bh_ptr[0] = sh->bh_cache[dd_idx]; + count = 1; + for (i = disks ; i--; ) { + if (i == dd_idx) + continue; +- bh = sh->bh_cache[i]; +- if (buffer_uptodate(bh)) +- bh_ptr[count++] = bh; + else + printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); + + check_xor(); + } + if (count != 1) +- xor_block(count, bh_ptr); +- set_buffer_uptodate(sh->bh_cache[dd_idx]); + } + + static void compute_parity(struct stripe_head *sh, int method) + { + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; +- struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; +- struct buffer_head *chosen[MD_SB_DISKS]; + + PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); + memset(chosen, 0, sizeof(chosen)); + + count = 1; +- bh_ptr[0] = sh->bh_cache[pd_idx]; + switch(method) { + case READ_MODIFY_WRITE: +- if (!buffer_uptodate(sh->bh_cache[pd_idx])) + BUG(); + for (i=disks ; i-- ;) { + if (i==pd_idx) + continue; +- if (sh->bh_write[i] && +- buffer_uptodate(sh->bh_cache[i])) { +- bh_ptr[count++] = sh->bh_cache[i]; +- chosen[i] = sh->bh_write[i]; +- sh->bh_write[i] = sh->bh_write[i]->b_reqnext; +- chosen[i]->b_reqnext = sh->bh_written[i]; +- sh->bh_written[i] = chosen[i]; + check_xor(); + } + } + break; + case RECONSTRUCT_WRITE: +- memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size); + for (i= disks; i-- ;) +- if (i!=pd_idx && sh->bh_write[i]) { +- chosen[i] = sh->bh_write[i]; +- sh->bh_write[i] = sh->bh_write[i]->b_reqnext; +- chosen[i]->b_reqnext = sh->bh_written[i]; +- sh->bh_written[i] = chosen[i]; + } + break; + case CHECK_PARITY: + break; + } + if (count>1) { +- xor_block(count, bh_ptr); + count = 1; + } + + for (i = disks; i--;) + if (chosen[i]) { +- struct buffer_head *bh = sh->bh_cache[i]; +- char *bdata; +- bdata = bh_kmap(chosen[i]); +- memcpy(bh->b_data, +- bdata,sh->size); +- bh_kunmap(chosen[i]); +- set_buffer_locked(bh); +- set_buffer_uptodate(bh); + } + + switch(method) { +--- 674,757 ---- + { + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; ++ void *ptr[MAX_XOR_BLOCKS], *p; + + PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); + ++ ptr[0] = page_address(sh->dev[dd_idx].page); ++ memset(ptr[0], 0, STRIPE_SIZE); + count = 1; + for (i = disks ; i--; ) { + if (i == dd_idx) + continue; ++ p = page_address(sh->dev[i].page); ++ if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) ++ ptr[count++] = p; + else + printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); + + check_xor(); + } + if (count != 1) ++ xor_block(count, STRIPE_SIZE, ptr); ++ set_bit(R5_UPTODATE, &sh->dev[i].flags); + } + + static void compute_parity(struct stripe_head *sh, int method) + { + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; ++ void *ptr[MAX_XOR_BLOCKS]; ++ struct bio *chosen[MD_SB_DISKS]; + + PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); + memset(chosen, 0, sizeof(chosen)); + + count = 1; ++ ptr[0] = page_address(sh->dev[pd_idx].page); + switch(method) { + case READ_MODIFY_WRITE: ++ if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)) + BUG(); + for (i=disks ; i-- 
;) { + if (i==pd_idx) + continue; ++ if (sh->dev[i].towrite && ++ test_bit(R5_UPTODATE, &sh->dev[i].flags)) { ++ ptr[count++] = page_address(sh->dev[i].page); ++ chosen[i] = sh->dev[i].towrite; ++ sh->dev[i].towrite = NULL; ++ if (sh->dev[i].written) BUG(); ++ sh->dev[i].written = chosen[i]; + check_xor(); + } + } + break; + case RECONSTRUCT_WRITE: ++ memset(ptr[0], 0, STRIPE_SIZE); + for (i= disks; i-- ;) ++ if (i!=pd_idx && sh->dev[i].towrite) { ++ chosen[i] = sh->dev[i].towrite; ++ sh->dev[i].towrite = NULL; ++ if (sh->dev[i].written) BUG(); ++ sh->dev[i].written = chosen[i]; + } + break; + case CHECK_PARITY: + break; + } + if (count>1) { ++ xor_block(count, STRIPE_SIZE, ptr); + count = 1; + } + + for (i = disks; i--;) + if (chosen[i]) { ++ sector_t sector = sh->dev[i].sector; ++ copy_data(1, chosen[i], sh->dev[i].page, sector); ++ ++ set_bit(R5_LOCKED, &sh->dev[i].flags); ++ set_bit(R5_UPTODATE, &sh->dev[i].flags); + } + + switch(method) { +*************** +*** 750,804 **** + case CHECK_PARITY: + for (i=disks; i--;) + if (i != pd_idx) { +- bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); + } + break; + case READ_MODIFY_WRITE: + for (i = disks; i--;) + if (chosen[i]) { +- bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); + } + } + if (count != 1) +- xor_block(count, bh_ptr); + + if (method != CHECK_PARITY) { +- set_buffer_uptodate(sh->bh_cache[pd_idx]); +- set_buffer_locked(sh->bh_cache[pd_idx]); + } else +- clear_buffer_uptodate(sh->bh_cache[pd_idx]); + } + +- static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) + { +- struct buffer_head **bhp; + raid5_conf_t *conf = sh->raid_conf; + +- PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector); + + + spin_lock(&sh->lock); + spin_lock_irq(&conf->device_lock); +- bh->b_reqnext = NULL; +- if (rw == READ) +- bhp = &sh->bh_read[dd_idx]; + else +- bhp = &sh->bh_write[dd_idx]; +- while (*bhp) { +- printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector); +- bhp = & (*bhp)->b_reqnext; +- } +- *bhp = bh; + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + +- PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx); +- } +- +- + + + + /* +--- 759,832 ---- + case CHECK_PARITY: + for (i=disks; i--;) + if (i != pd_idx) { ++ ptr[count++] = page_address(sh->dev[i].page); + check_xor(); + } + break; + case READ_MODIFY_WRITE: + for (i = disks; i--;) + if (chosen[i]) { ++ ptr[count++] = page_address(sh->dev[i].page); + check_xor(); + } + } + if (count != 1) ++ xor_block(count, STRIPE_SIZE, ptr); + + if (method != CHECK_PARITY) { ++ set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); ++ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + } else ++ clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + } + ++ /* ++ * Each stripe/dev can have one or more bion attached. ++ * toread/towrite point to the first in a chain. ++ * The bi_next chain must be in order. 
++ */ ++ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) + { ++ struct bio **bip; + raid5_conf_t *conf = sh->raid_conf; + ++ PRINTK("adding bh b#%lu to stripe s#%lu\n", bi->bi_sector, sh->sector); + + + spin_lock(&sh->lock); + spin_lock_irq(&conf->device_lock); ++ if (forwrite) ++ bip = &sh->dev[dd_idx].towrite; + else ++ bip = &sh->dev[dd_idx].toread; ++ while (*bip && (*bip)->bi_sector < bi->bi_sector) ++ bip = & (*bip)->bi_next; ++ /* FIXME do I need to worry about overlapping bion */ ++ if (*bip && bi->bi_next && (*bip) != bi->bi_next) ++ BUG(); ++ if (*bip) ++ bi->bi_next = *bip; ++ *bip = bi; ++ bi->bi_phys_segments ++; + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + ++ if (forwrite) { ++ /* check if page is coverred */ ++ sector_t sector = sh->dev[dd_idx].sector; ++ for (bi=sh->dev[dd_idx].towrite; ++ sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && ++ bi && bi->bi_sector <= sector; ++ bi = bi->bi_next) { ++ if (bi->bi_sector + (bi->bi_size>>9) >= sector) ++ sector = bi->bi_sector + (bi->bi_size>>9); ++ } ++ if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) ++ set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); ++ } + ++ PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx); ++ } + + + /* +*************** +*** 955,975 **** + compute_block(sh, i); + uptodate++; + } else if (conf->disks[i].operational) { +- set_buffer_locked(bh); + action[i] = READ+1; + /* if I am just reading this block and we don't have + a failed drive, or any pending writes then sidestep the cache */ +- if (sh->bh_page[i]) BUG(); + if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && + ! syncing && !failed && !to_write) { +- sh->bh_page[i] = sh->bh_cache[i]->b_page; + sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; + sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; + } + locked++; + PRINTK("Reading block %d (sync=%d)\n", i, syncing); + if (syncing) +- md_sync_acct(conf->disks[i].dev, bh->b_size>>9); + } + } + } +--- 1002,1022 ---- + compute_block(sh, i); + uptodate++; + } else if (conf->disks[i].operational) { ++ set_bit(R5_LOCKED, &dev->flags); + action[i] = READ+1; ++ #if 0 + /* if I am just reading this block and we don't have + a failed drive, or any pending writes then sidestep the cache */ + if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && + ! 
syncing && !failed && !to_write) { + sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; + sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; + } ++ #endif + locked++; + PRINTK("Reading block %d (sync=%d)\n", i, syncing); + if (syncing) ++ md_sync_acct(conf->disks[i].dev, STRIPE_SECTORS); + } + } + } +*************** +*** 1004,1017 **** + if (rmw < rcw && rmw > 0) + /* prefer read-modify-write, but need to get some data */ + for (i=disks; i--;) { +- bh = sh->bh_cache[i]; +- if ((sh->bh_write[i] || i == sh->pd_idx) && +- !buffer_locked(bh) && !buffer_uptodate(bh) && + conf->disks[i].operational) { + if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + { + PRINTK("Read_old block %d for r-m-w\n", i); +- set_buffer_locked(bh); + action[i] = READ+1; + locked++; + } else { +--- 1059,1072 ---- + if (rmw < rcw && rmw > 0) + /* prefer read-modify-write, but need to get some data */ + for (i=disks; i--;) { ++ dev = &sh->dev[i]; ++ if ((dev->towrite || i == sh->pd_idx) && ++ !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && + conf->disks[i].operational) { + if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + { + PRINTK("Read_old block %d for r-m-w\n", i); ++ set_bit(R5_LOCKED, &dev->flags); + action[i] = READ+1; + locked++; + } else { +*************** +*** 1023,1036 **** + if (rcw <= rmw && rcw > 0) + /* want reconstruct write, but need to get some data */ + for (i=disks; i--;) { +- bh = sh->bh_cache[i]; +- if (!sh->bh_write[i] && i != sh->pd_idx && +- !buffer_locked(bh) && !buffer_uptodate(bh) && + conf->disks[i].operational) { + if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + { + PRINTK("Read_old block %d for Reconstruct\n", i); +- set_buffer_locked(bh); + action[i] = READ+1; + locked++; + } else { +--- 1078,1091 ---- + if (rcw <= rmw && rcw > 0) + /* want reconstruct write, but need to get some data */ + for (i=disks; i--;) { ++ dev = &sh->dev[i]; ++ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && ++ !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && + conf->disks[i].operational) { + if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + { + PRINTK("Read_old block %d for Reconstruct\n", i); ++ set_bit(R5_LOCKED, &dev->flags); + action[i] = READ+1; + locked++; + } else { +*************** +*** 1093,1152 **** + } + if (uptodate != disks) + BUG(); +- bh = sh->bh_cache[failed_num]; +- set_buffer_locked(bh); + action[failed_num] = WRITE+1; + locked++; + set_bit(STRIPE_INSYNC, &sh->state); + if (conf->disks[failed_num].operational) +- md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9); + else if ((spare=conf->spare)) +- md_sync_acct(spare->dev, bh->b_size>>9); + + } + } + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { +- md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1); + clear_bit(STRIPE_SYNCING, &sh->state); + } + +- + spin_unlock(&sh->lock); + +- while ((bh=return_ok)) { +- return_ok = bh->b_reqnext; +- bh->b_reqnext = NULL; +- bh->b_end_io(bh, 1); +- } +- while ((bh=return_fail)) { +- return_fail = bh->b_reqnext; +- bh->b_reqnext = NULL; +- bh->b_end_io(bh, 0); + } + for (i=disks; i-- ;) + if (action[i]) { +- struct buffer_head *bh = sh->bh_cache[i]; + struct disk_info *spare = conf->spare; + int skip = 0; + if (action[i] == READ+1) +- bh->b_end_io = raid5_end_read_request; + else +- bh->b_end_io = raid5_end_write_request; + if (conf->disks[i].operational) +- bh->b_dev = conf->disks[i].dev; + else if (spare && action[i] == WRITE+1) +- bh->b_dev = spare->dev; + else skip=1; +- /* FIXME - later we will 
need bdev here */ + if (!skip) { + PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i); + atomic_inc(&sh->count); +- bh->b_rdev = bh->b_dev; +- bh->b_rsector = bh->b_blocknr * (bh->b_size>>9); +- generic_make_request(action[i]-1, bh); + } else { + PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector); +- clear_buffer_locked(bh); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +--- 1149,1210 ---- + } + if (uptodate != disks) + BUG(); ++ dev = &sh->dev[failed_num]; ++ set_bit(R5_LOCKED, &dev->flags); + action[failed_num] = WRITE+1; + locked++; + set_bit(STRIPE_INSYNC, &sh->state); + if (conf->disks[failed_num].operational) ++ md_sync_acct(conf->disks[failed_num].dev, STRIPE_SECTORS); + else if ((spare=conf->spare)) ++ md_sync_acct(spare->dev, STRIPE_SECTORS); + + } + } + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { ++ md_done_sync(conf->mddev, STRIPE_SECTORS,1); + clear_bit(STRIPE_SYNCING, &sh->state); + } + + spin_unlock(&sh->lock); + ++ while ((bi=return_bi)) { ++ return_bi = bi->bi_next; ++ bi->bi_next = NULL; ++ bi->bi_end_io(bi); + } + for (i=disks; i-- ;) + if (action[i]) { ++ struct bio *bi = &sh->dev[i].req; + struct disk_info *spare = conf->spare; + int skip = 0; + if (action[i] == READ+1) ++ bi->bi_end_io = raid5_end_read_request; + else ++ bi->bi_end_io = raid5_end_write_request; + if (conf->disks[i].operational) ++ bi->bi_bdev = conf->disks[i].bdev; + else if (spare && action[i] == WRITE+1) ++ bi->bi_bdev = spare->bdev; + else skip=1; + if (!skip) { + PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i); + atomic_inc(&sh->count); ++ bi->bi_sector = sh->sector; ++ if (action[i] == READ+1) ++ bi->bi_rw = 0; ++ else ++ bi->bi_rw = 1; ++ bi->bi_flags = 0; ++ bi->bi_vcnt = 1; ++ bi->bi_idx = 0; ++ bi->bi_io_vec = &sh->dev[i].vec; ++ bi->bi_size = STRIPE_SIZE; ++ bi->bi_next = NULL; ++ generic_make_request(bi); + } else { + PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector); ++ clear_bit(R5_LOCKED, &dev->flags); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +*************** +*** 1208,1232 **** + read_ahead=1; + } + +- new_sector = raid5_compute_sector(bh->b_rsector, +- raid_disks, data_disks, &dd_idx, &pd_idx, conf); + +- PRINTK("raid5: make_request, sector %lu\n", new_sector); +- sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead); +- if (sh) { +- sh->pd_idx = pd_idx; + +- add_stripe_bh(sh, bh, dd_idx, rw); + +- raid5_plug_device(conf); +- handle_stripe(sh); +- release_stripe(sh); +- } else +- bh->b_end_io(bh, buffer_uptodate(bh)); + return 0; + } + +- static int sync_request (mddev_t *mddev, unsigned long sector_nr) + { + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct stripe_head *sh; +--- 1267,1305 ---- + read_ahead=1; + } + ++ logical_sector = bi->bi_sector & ~(STRIPE_SECTORS-1); ++ last_sector = bi->bi_sector + (bi->bi_size>>9); + ++ bi->bi_next = NULL; ++ set_bit(BIO_UPTODATE, &bi->bi_flags); /* will be cleared if error detected */ ++ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ ++ for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { ++ ++ new_sector = raid5_compute_sector(logical_sector, ++ raid_disks, data_disks, &dd_idx, &pd_idx, conf); + ++ PRINTK("raid5: make_request, sector %ul logical %ul\n", ++ new_sector, logical_sector); + ++ sh = get_active_stripe(conf, new_sector, pd_idx, read_ahead); ++ if (sh) { ++ ++ add_stripe_bio(sh, bi, dd_idx, rw); ++ ++ raid5_plug_device(conf); ++ 
handle_stripe(sh); ++ release_stripe(sh); ++ } ++ } ++ spin_lock_irq(&conf->device_lock); ++ if (--bi->bi_phys_segments == 0) ++ bi->bi_end_io(bi); ++ spin_unlock_irq(&conf->device_lock); + return 0; + } + ++ /* FIXME go_faster isn't used */ ++ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) + { + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct stripe_head *sh; +*************** +*** 1476,1481 **** + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = NODEV; + + disk->operational = 0; + disk->write_only = 0; +--- 1545,1551 ---- + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = NODEV; ++ disk->bdev = NULL; + + disk->operational = 0; + disk->write_only = 0; +*************** +*** 1963,1968 **** + goto abort; + } + rdisk->dev = NODEV; + rdisk->used_slot = 0; + + break; +--- 2032,2038 ---- + goto abort; + } + rdisk->dev = NODEV; ++ rdisk->bdev = NULL; + rdisk->used_slot = 0; + + break; +*************** +*** 1980,1985 **** + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = mk_kdev(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; +--- 2050,2057 ---- + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = mk_kdev(added_desc->major,added_desc->minor); ++ /* it will be held open by rdev */ ++ adisk->bdev = bdget(kdev_t_to_nr(adisk->dev)); + + adisk->operational = 0; + adisk->write_only = 0; diff --git a/tests/linux/raid5build/merge b/tests/linux/raid5build/merge new file mode 100644 index 0000000..4a2ce92 --- /dev/null +++ b/tests/linux/raid5build/merge @@ -0,0 +1,30 @@ +static void raid5_build_block (struct stripe_head *sh, int i) +{ + raid5_conf_t *conf = sh->raid_conf; + struct r5dev *dev = &sh->dev[i]; + + bio_init(&dev->req); + dev->req.bi_io_vec = &dev->vec; + dev->req.bi_vcnt++; + dev->vec.bv_page = dev->page; + dev->vec.bv_len = STRIPE_SIZE; + dev->vec.bv_offset = 0; + + dev->req.bi_bdev = conf->disks[i].bdev; +<<<<<<< +||||||| + /* FIXME - later we will need bdev here */ +======= + dev->req.bi_sector = sh->sector; +>>>>>>> + dev->req.bi_private = sh; + + dev->flags = 0; + if (i != sh->pd_idx) +<<<<<<< + bh->b_list = BUF_LOCKED; +||||||| +======= + dev->sector = compute_blocknr(sh, i); +>>>>>>> +} diff --git a/tests/linux/raid5build/orig b/tests/linux/raid5build/orig new file mode 100644 index 0000000..3738f06 --- /dev/null +++ b/tests/linux/raid5build/orig @@ -0,0 +1,15 @@ +static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i) +{ + raid5_conf_t *conf = sh->raid_conf; + struct buffer_head *bh = sh->bh_cache[i]; + unsigned long block = sh->sector / (sh->size >> 9); + + init_buffer(bh, raid5_end_read_request, sh); + bh->b_dev = conf->disks[i].dev; + bh->b_blocknr = block; + + bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); + bh->b_size = sh->size; + bh->b_list = BUF_LOCKED; + return bh; +} diff --git a/tests/linux/raid5build/patch b/tests/linux/raid5build/patch new file mode 100644 index 0000000..69cb527 --- /dev/null +++ b/tests/linux/raid5build/patch @@ -0,0 +1,31 @@ +@@ -1,15 +1,20 @@@ +-static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i) ++static void raid5_build_block (struct stripe_head *sh, int i) + { + raid5_conf_t *conf = sh->raid_conf; +- struct buffer_head *bh = sh->bh_cache[i]; +- unsigned long block = sh->sector / (sh->size >> 9); ++ struct r5dev *dev = &sh->dev[i]; + +- init_buffer(bh, raid5_end_read_request, sh); +- 
bh->b_dev = conf->disks[i].dev; +- /* FIXME - later we will need bdev here */ +- bh->b_blocknr = block; ++ bio_init(&dev->req); ++ dev->req.bi_io_vec = &dev->vec; ++ dev->req.bi_vcnt++; ++ dev->vec.bv_page = dev->page; ++ dev->vec.bv_len = STRIPE_SIZE; ++ dev->vec.bv_offset = 0; + +- bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); +- bh->b_size = sh->size; +- return bh; ++ dev->req.bi_bdev = conf->disks[i].bdev; ++ dev->req.bi_sector = sh->sector; ++ dev->req.bi_private = sh; ++ ++ dev->flags = 0; ++ if (i != sh->pd_idx) ++ dev->sector = compute_blocknr(sh, i); + } diff --git a/tests/linux/raid5line/lmerge b/tests/linux/raid5line/lmerge new file mode 100644 index 0000000..e6ffa40 --- /dev/null +++ b/tests/linux/raid5line/lmerge @@ -0,0 +1,7 @@ +<<<<<<< + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state); +||||||| + clear_buffer_uptodate(sh->bh_cache[i]); +======= + dev->flags = 0; +>>>>>>> diff --git a/tests/linux/raid5line/merge b/tests/linux/raid5line/merge new file mode 100644 index 0000000..e6ffa40 --- /dev/null +++ b/tests/linux/raid5line/merge @@ -0,0 +1,7 @@ +<<<<<<< + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state); +||||||| + clear_buffer_uptodate(sh->bh_cache[i]); +======= + dev->flags = 0; +>>>>>>> diff --git a/tests/linux/raid5line/orig b/tests/linux/raid5line/orig new file mode 100644 index 0000000..8b28be2 --- /dev/null +++ b/tests/linux/raid5line/orig @@ -0,0 +1 @@ + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state); diff --git a/tests/linux/raid5line/patch b/tests/linux/raid5line/patch new file mode 100644 index 0000000..c0ebfeb --- /dev/null +++ b/tests/linux/raid5line/patch @@ -0,0 +1,3 @@ +@@ -1,1 +1,1 @@ +- clear_buffer_uptodate(sh->bh_cache[i]); ++ dev->flags = 0; diff --git a/tests/linux/raid5line/wmerge b/tests/linux/raid5line/wmerge new file mode 100644 index 0000000..e2c8104 --- /dev/null +++ b/tests/linux/raid5line/wmerge @@ -0,0 +1 @@ +<<<--- clear_bit(BH_Uptodate, &|||clear_buffer_uptodate(===dev--->>>-><<<---->b_state|||===flags = 0--->>>; diff --git a/tests/linux/rpc_tcp_nonagle/merge b/tests/linux/rpc_tcp_nonagle/merge new file mode 100644 index 0000000..969115a --- /dev/null +++ b/tests/linux/rpc_tcp_nonagle/merge @@ -0,0 +1,1518 @@ +/* + * linux/net/sunrpc/svcsock.c + * + * These are the RPC server socket internals. + * + * The server scheduling algorithm does not always distribute the load + * evenly when servicing a single client. May need to modify the + * svc_sock_enqueue procedure... + * + * TCP support is largely untested and may be a little slow. The problem + * is that we currently do two separate recvfrom's, one for the 4-byte + * record length, and the second for the actual record. This could possibly + * be improved by always reading a minimum size of around 100 bytes and + * tucking any superfluous bytes away in a temporary store. Still, that + * leaves write requests out in the rain. An alternative may be to peek at + * the first skb in the queue, and if it matches the next TCP sequence + * number, to extract the record marker. Yuck. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/unistd.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/checksum.h> +#include <net/ip.h> +#include <asm/uaccess.h> +#include <asm/ioctls.h> + +#include <linux/sunrpc/types.h> +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/stats.h> + +/* SMP locking strategy: + * + * svc_serv->sv_lock protects most stuff for that service. + * + * Some flags can be set to certain values at any time + * providing that certain rules are followed: + * + * SK_BUSY can be set to 0 at any time. + * svc_sock_enqueue must be called afterwards + * SK_CONN, SK_DATA, can be set or cleared at any time. + * after a set, svc_sock_enqueue must be called. + * after a clear, the socket must be read/accepted + * if this succeeds, it must be set again. + * SK_CLOSE can set at any time. It is never cleared. + * + */ +
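The rules above boil down to a claim/release protocol on SK_BUSY: any path that raises SK_CONN or SK_DATA calls svc_sock_enqueue(), which hands the socket to at most one daemon, and the daemon re-runs the enqueue check when it clears SK_BUSY again. A condensed sketch of that protocol, using C11 atomics in place of the kernel's bit operations under serv->sv_lock; names and structure are illustrative only:

    #include <stdatomic.h>
    #include <stdbool.h>

    struct sockstate {
        atomic_bool busy;       /* like SK_BUSY: one daemon owns the socket */
        atomic_bool data;       /* like SK_DATA: work is pending */
    };

    /* Run after every flag change: try to claim the socket for one daemon. */
    static bool try_enqueue(struct sockstate *s)
    {
        bool expected = false;

        if (!atomic_load(&s->data))
            return false;       /* nothing pending */
        /* Only one caller can flip busy from 0 to 1; losers rely on the
         * winner repeating this check when it drops the claim. */
        return atomic_compare_exchange_strong(&s->busy, &expected, true);
    }

    /* Mirror of svc_sock_received(): drop the claim, then re-check. */
    static void received(struct sockstate *s)
    {
        atomic_store(&s->busy, false);
        try_enqueue(s);
    }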
+#define RPCDBG_FACILITY RPCDBG_SVCSOCK + + +static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, + int *errp, int pmap_reg); +static void svc_udp_data_ready(struct sock *, int); +static int svc_udp_recvfrom(struct svc_rqst *); +static int svc_udp_sendto(struct svc_rqst *); + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req *req); + +/* + * Queue up an idle server thread. Must have serv->sv_lock held. + * Note: this is really a stack rather than a queue, so that we only + * use as many different threads as we need, and the rest don't polute + * the cache. + */ +static inline void +svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + list_add(&rqstp->rq_list, &serv->sv_threads); +} + +/* + * Dequeue an nfsd thread. Must have serv->sv_lock held. + */ +static inline void +svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + list_del(&rqstp->rq_list); +} + +/* + * Release an skbuff after use + */ +static inline void +svc_release_skb(struct svc_rqst *rqstp) +{ + struct sk_buff *skb = rqstp->rq_skbuff; + struct svc_deferred_req *dr = rqstp->rq_deferred; + + if (skb) { + rqstp->rq_skbuff = NULL; + + dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + skb_free_datagram(rqstp->rq_sock->sk_sk, skb); + } + if (dr) { + rqstp->rq_deferred = NULL; + kfree(dr); + } +} + +/* + * Queue up a socket with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +static void +svc_sock_enqueue(struct svc_sock *svsk) +{ + struct svc_serv *serv = svsk->sk_server; + struct svc_rqst *rqstp; + + if (!(svsk->sk_flags & + ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) + return; + + spin_lock_bh(&serv->sv_lock); + + if (!list_empty(&serv->sv_threads) && + !list_empty(&serv->sv_sockets)) + printk(KERN_ERR + "svc_sock_enqueue: threads and sockets both waiting??\n"); + + if (test_bit(SK_DEAD, &svsk->sk_flags)) { + /* Don't enqueue dead sockets */ + dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); + goto out_unlock; + } + + if (test_bit(SK_BUSY, &svsk->sk_flags)) { + /* Don't enqueue socket while daemon is receiving */ + dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); + goto out_unlock; + } + + if (((svsk->sk_reserved + serv->sv_bufsz)*2 + > sock_wspace(svsk->sk_sk)) + && !test_bit(SK_CLOSE, &svsk->sk_flags) + && !test_bit(SK_CONN, &svsk->sk_flags)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", + svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz, + sock_wspace(svsk->sk_sk)); + goto out_unlock; + } + + /* Mark socket as busy. It will remain in this state until the + * server has processed all pending data and put the socket back + * on the idle list.
+ */ + set_bit(SK_BUSY, &svsk->sk_flags); + + if (!list_empty(&serv->sv_threads)) { + rqstp = list_entry(serv->sv_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: socket %p served by daemon %p\n", + svsk->sk_sk, rqstp); + svc_serv_dequeue(serv, rqstp); + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_sock_enqueue: server %p, rq_sock=%p!\n", + rqstp, rqstp->rq_sock); + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + rqstp->rq_reserved = serv->sv_bufsz; + svsk->sk_reserved += rqstp->rq_reserved; + wake_up(&rqstp->rq_wait); + } else { + dprintk("svc: socket %p put into queue\n", svsk->sk_sk); + list_add_tail(&svsk->sk_ready, &serv->sv_sockets); + } + +out_unlock: + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Dequeue the first socket. Must be called with the serv->sv_lock held. + */ +static inline struct svc_sock * +svc_sock_dequeue(struct svc_serv *serv) +{ + struct svc_sock *svsk; + + if (list_empty(&serv->sv_sockets)) + return NULL; + + svsk = list_entry(serv->sv_sockets.next, + struct svc_sock, sk_ready); + list_del_init(&svsk->sk_ready); + + dprintk("svc: socket %p dequeued, inuse=%d\n", + svsk->sk_sk, svsk->sk_inuse); + + return svsk; +} + +/* + * Having read something from a socket, check whether it + * needs to be re-enqueued. + * Note: SK_DATA only gets cleared when a read-attempt finds + * no (or insufficient) data. + */ +static inline void +svc_sock_received(struct svc_sock *svsk) +{ + clear_bit(SK_BUSY, &svsk->sk_flags); + svc_sock_enqueue(svsk); +} + + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the socket + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_sock *svsk = rqstp->rq_sock; + spin_lock_bh(&svsk->sk_server->sv_lock); + svsk->sk_reserved -= (rqstp->rq_reserved - space); + rqstp->rq_reserved = space; + spin_unlock_bh(&svsk->sk_server->sv_lock); + + svc_sock_enqueue(svsk); + } +} + +/* + * Release a socket after use. + */ +static inline void +svc_sock_put(struct svc_sock *svsk) +{ + struct svc_serv *serv = svsk->sk_server; + + spin_lock_bh(&serv->sv_lock); + if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { + spin_unlock_bh(&serv->sv_lock); + dprintk("svc: releasing dead socket\n"); + sock_release(svsk->sk_sock); + kfree(svsk); + } + else + spin_unlock_bh(&serv->sv_lock); +} + +static void +svc_sock_release(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + + svc_release_skb(rqstp); + + svc_free_allpages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + + /* Reset response buffer and release + * the reservation. + * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! 
+ */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_sock = NULL; + + svc_sock_put(svsk); +} + +/* + * External function to wake up a server waiting for data + */ +void +svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_threads)) { + rqstp = list_entry(serv->sv_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_serv_dequeue(serv, rqstp); + rqstp->rq_sock = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Generic sendto routine + */ +static int +svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct socket *sock = svsk->sk_sock; + int slen; + int len = 0; + int result; + int size; + struct page **ppage = xdr->pages; + size_t base = xdr->page_base; + unsigned int pglen = xdr->page_len; + unsigned int flags = MSG_MORE; + + slen = xdr->len; + + /* Grab svsk->sk_sem to serialize outgoing data. */ + down(&svsk->sk_sem); + + if (rqstp->rq_prot == IPPROTO_UDP) { + /* set the destination */ + struct msghdr msg; + msg.msg_name = &rqstp->rq_addr; + msg.msg_namelen = sizeof(rqstp->rq_addr); + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_MORE; + + if (sock_sendmsg(sock, &msg, 0) < 0) + goto out; + } + + /* send head */ + if (slen == xdr->head[0].iov_len) + flags = 0; + len = sock->ops->sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags); + if (len != xdr->head[0].iov_len) + goto out; + slen -= xdr->head[0].iov_len; + if (slen == 0) + goto out; + + /* send page data */ + size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; + while (pglen > 0) { + if (slen == size) + flags = 0; + result = sock->ops->sendpage(sock, *ppage, base, size, flags); + if (result > 0) + len += result; + if (result != size) + goto out; + slen -= size; + pglen -= size; + size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; + base = 0; + ppage++; + } + /* send tail */ + if (xdr->tail[0].iov_len) { + /* The tail *will* be in respages[0]; */ + result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage], + ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1), + xdr->tail[0].iov_len, 0); + + if (result > 0) + len += result; + } +out: + up(&svsk->sk_sem); + + dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n", + rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, + rqstp->rq_addr.sin_addr.s_addr); + + return len; +} + +/* + * Check input queue length + */ +static int +svc_recv_available(struct svc_sock *svsk) +{ + mm_segment_t oldfs; + struct socket *sock = svsk->sk_sock; + int avail, err; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail); + set_fs(oldfs); + + return (err >= 0)? avail : err; +} + +/* + * Generic recvfrom routine. 
+ */ +static int +svc_recvfrom(struct svc_rqst *rqstp, struct iovec *iov, int nr, int buflen) +{ + mm_segment_t oldfs; + struct msghdr msg; + struct socket *sock; + int len, alen; + + rqstp->rq_addrlen = sizeof(rqstp->rq_addr); + sock = rqstp->rq_sock->sk_sock; + + msg.msg_name = &rqstp->rq_addr; + msg.msg_namelen = sizeof(rqstp->rq_addr); + msg.msg_iov = iov; + msg.msg_iovlen = nr; + msg.msg_control = NULL; + msg.msg_controllen = 0; + + msg.msg_flags = MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + len = sock_recvmsg(sock, &msg, buflen, MSG_DONTWAIT); + set_fs(oldfs); + + /* sock_recvmsg doesn't fill in the name/namelen, so we must.. + * possibly we should cache this in the svc_sock structure + * at accept time. FIXME + */ + alen = sizeof(rqstp->rq_addr); + sock->ops->getname(sock, (struct sockaddr *)&rqstp->rq_addr, &alen, 1); + + dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", + rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len); + + return len; +} + +/* + * Set socket snd and rcv buffer lengths + */ +static inline void +svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) +{ +#if 0 + mm_segment_t oldfs; + oldfs = get_fs(); set_fs(KERNEL_DS); + sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char*)&snd, sizeof(snd)); + sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char*)&rcv, sizeof(rcv)); +#else + /* sock_setsockopt limits use to sysctl_?mem_max, + * which isn't acceptable. Until that is made conditional + * on not having CAP_SYS_RESOURCE or similar, we go direct... + * DaveM said I could! + */ + lock_sock(sock->sk); + sock->sk->sndbuf = snd * 2; + sock->sk->rcvbuf = rcv * 2; + sock->sk->userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; + release_sock(sock->sk); +#endif +} +/* + * INET callback when data has been received on the socket. + */ +static void +svc_udp_data_ready(struct sock *sk, int count) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->user_data); + + if (!svsk) + goto out; + dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", + svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); + set_bit(SK_DATA, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); +} + +/* + * INET callback when space is newly available on the socket. + */ +static void +svc_write_space(struct sock *sk) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->user_data); + + if (svsk) { + dprintk("svc: socket %p(inet %p), write_space busy=%d\n", + svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); + svc_sock_enqueue(svsk); + } + + if (sk->sleep && waitqueue_active(sk->sleep)) { + printk(KERN_WARNING "RPC svc_write_space: some sleeping on %p\n", + svsk); + wake_up_interruptible(sk->sleep); + } +} + +/* + * Receive a datagram from a UDP socket. + */ +extern int +csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); + +static int +svc_udp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sk_server; + struct sk_buff *skb; + int err, len; + + if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + /* udp sockets need large rcvbuf as all pending + * requests are still in that buffer. sndbuf must + * also be large enough that there is enough space + * for one reply per thread. 
+ */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_bufsz, + (serv->sv_nrthreads+3) * serv->sv_bufsz); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) + return svc_deferred_recv(rqstp); + + clear_bit(SK_DATA, &svsk->sk_flags); + while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) { + svc_sock_received(svsk); + if (err == -EAGAIN) + return err; + /* possibly an icmp error */ + dprintk("svc: recvfrom returned error %d\n", -err); + } + set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ + + len = skb->len - sizeof(struct udphdr); + rqstp->rq_arg.len = len; + + rqstp->rq_prot = IPPROTO_UDP; + + /* Get sender address */ + rqstp->rq_addr.sin_family = AF_INET; + rqstp->rq_addr.sin_port = skb->h.uh->source; + rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr; + + if (skb_is_nonlinear(skb)) { + /* we have to copy */ + local_bh_disable(); + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { + local_bh_enable(); + /* checksum error */ + skb_free_datagram(svsk->sk_sk, skb); + svc_sock_received(svsk); + return 0; + } + local_bh_enable(); + skb_free_datagram(svsk->sk_sk, skb); + } else { + /* we can use it in-place */ + rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); + rqstp->rq_arg.head[0].iov_len = len; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + skb_free_datagram(svsk->sk_sk, skb); + svc_sock_received(svsk); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + rqstp->rq_skbuff = skb; + } + + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; + } + + if (serv->sv_stats) + serv->sv_stats->netudpcnt++; + + /* One down, maybe more to go... */ + svsk->sk_sk->stamp = skb->stamp; + svc_sock_received(svsk); + + return len; +} + +static int +svc_udp_sendto(struct svc_rqst *rqstp) +{ + int error; + + error = svc_sendto(rqstp, &rqstp->rq_res); + if (error == -ECONNREFUSED) + /* ICMP error on earlier request. */ + error = svc_sendto(rqstp, &rqstp->rq_res); + + return error; +} + +static void +svc_udp_init(struct svc_sock *svsk) +{ + svsk->sk_sk->data_ready = svc_udp_data_ready; + svsk->sk_sk->write_space = svc_write_space; + svsk->sk_recvfrom = svc_udp_recvfrom; + svsk->sk_sendto = svc_udp_sendto; + + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_udp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_server->sv_bufsz, + 3 * svsk->sk_server->sv_bufsz); + + set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ + set_bit(SK_CHNGBUF, &svsk->sk_flags); +} + +/* + * A data_ready event on a listening socket means there's a connection + * pending. Do not use state_change as a substitute for it. + */ +static void +svc_tcp_listen_data_ready(struct sock *sk, int count_unused) +{ + struct svc_sock *svsk; + + dprintk("svc: socket %p TCP (listen) state change %d\n", + sk, sk->state); + + if (sk->state != TCP_ESTABLISHED) { + /* Aborted connection, SYN_RECV or whatever... 
*/ + goto out; + } + if (!(svsk = (struct svc_sock *) sk->user_data)) { + printk("svc: socket %p: no user data\n", sk); + goto out; + } + set_bit(SK_CONN, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible_all(sk->sleep); +} + +/* + * A state change on a connected socket means it's dying or dead. + */ +static void +svc_tcp_state_change(struct sock *sk) +{ + struct svc_sock *svsk; + + dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", + sk, sk->state, sk->user_data); + + if (!(svsk = (struct svc_sock *) sk->user_data)) { + printk("svc: socket %p: no user data\n", sk); + goto out; + } + set_bit(SK_CLOSE, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible_all(sk->sleep); +} + +static void +svc_tcp_data_ready(struct sock *sk, int count) +{ + struct svc_sock * svsk; + + dprintk("svc: socket %p TCP data ready (svsk %p)\n", + sk, sk->user_data); + if (!(svsk = (struct svc_sock *)(sk->user_data))) + goto out; + set_bit(SK_DATA, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); +} + +/* + * Accept a TCP connection + */ +static void +svc_tcp_accept(struct svc_sock *svsk) +{ + struct sockaddr_in sin; + struct svc_serv *serv = svsk->sk_server; + struct socket *sock = svsk->sk_sock; + struct socket *newsock; + struct proto_ops *ops; + struct svc_sock *newsvsk; + int err, slen; + + dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); + if (!sock) + return; + + if (!(newsock = sock_alloc())) { + printk(KERN_WARNING "%s: no more sockets!\n", serv->sv_name); + return; + } + dprintk("svc: tcp_accept %p allocated\n", newsock); + + newsock->type = sock->type; + newsock->ops = ops = sock->ops; + + clear_bit(SK_CONN, &svsk->sk_flags); + if ((err = ops->accept(sock, newsock, O_NONBLOCK)) < 0) { + if (err != -EAGAIN && net_ratelimit()) + printk(KERN_WARNING "%s: accept failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + set_bit(SK_CONN, &svsk->sk_flags); + svc_sock_enqueue(svsk); + + slen = sizeof(sin); + err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1); + if (err < 0) { + if (net_ratelimit()) + printk(KERN_WARNING "%s: peername failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + + /* Ideally, we would want to reject connections from unauthorized + * hosts here, but when we get encription, the IP of the host won't + * tell us anything. For now just warn about unpriv connections. + */ + if (ntohs(sin.sin_port) >= 1024) { + dprintk(KERN_WARNING + "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n", + serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + } + + dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + + /* make sure that a write doesn't block forever when + * low on memory + */ + newsock->sk->sndtimeo = HZ*30; + + if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0))) + goto failed; + + + /* make sure that we don't have too many active connections. + * If we have, something must be dropped. + * We randomly choose between newest and oldest (in terms + * of recent activity) and drop it. 
+ */ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*5) { + struct svc_sock *svsk = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_random()&1) + svsk = list_entry(serv->sv_tempsocks.prev, + struct svc_sock, + sk_list); + else + svsk = list_entry(serv->sv_tempsocks.next, + struct svc_sock, + sk_list); + set_bit(SK_CLOSE, &svsk->sk_flags); + svsk->sk_inuse ++; + } + spin_unlock_bh(&serv->sv_lock); + + if (svsk) { + svc_sock_enqueue(svsk); + svc_sock_put(svsk); + } + + } + + if (serv->sv_stats) + serv->sv_stats->nettcpconn++; + + return; + +failed: + sock_release(newsock); + return; +} + +/* + * Receive data from a TCP socket. + */ +static int +svc_tcp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sk_server; + int len; + struct iovec vec[RPCSVC_MAXPAGES]; + int pnum, vlen; + + dprintk("svc: tcp_recv %p data %d conn %d close %d\n", + svsk, test_bit(SK_DATA, &svsk->sk_flags), + test_bit(SK_CONN, &svsk->sk_flags), + test_bit(SK_CLOSE, &svsk->sk_flags)); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) + return svc_deferred_recv(rqstp); + + if (test_bit(SK_CLOSE, &svsk->sk_flags)) { + svc_delete_socket(svsk); + return 0; + } + + if (test_bit(SK_CONN, &svsk->sk_flags)) { + svc_tcp_accept(svsk); + svc_sock_received(svsk); + return 0; + } + + if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + /* sndbuf needs to have room for one request + * per thread, otherwise we can stall even when the + * network isn't a bottleneck. + * rcvbuf just needs to be able to hold a few requests. + * Normally they will be removed from the queue + * as soon a a complete request arrives. + */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_bufsz, + 3 * serv->sv_bufsz); + + clear_bit(SK_DATA, &svsk->sk_flags); + + /* Receive data. If we haven't got the record length yet, get + * the next four bytes. Otherwise try to gobble up as much as + * possible up to the complete record length. + */ + if (svsk->sk_tcplen < 4) { + unsigned long want = 4 - svsk->sk_tcplen; + struct iovec iov; + + iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_len = want; + if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) + goto error; + svsk->sk_tcplen += len; + + if (len < want) { + dprintk("svc: short recvfrom while reading record length (%d of %d)\n", + len, want); + svc_sock_received(svsk); + return -EAGAIN; /* record header not complete */ + } + + svsk->sk_reclen = ntohl(svsk->sk_reclen); + if (!(svsk->sk_reclen & 0x80000000)) { + /* FIXME: technically, a record can be fragmented, + * and non-terminal fragments will not have the top + * bit set in the fragment length header. + * But apparently no known nfs clients send fragmented + * records. 
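The four bytes parsed here are the standard RPC-over-TCP record mark (RFC 1831): the top bit flags the final fragment, the low 31 bits carry the fragment length, and, as the FIXME notes, this server simply rejects non-terminal fragments. An encode/decode sketch:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    #define LAST_FRAG 0x80000000u

    /* Encode an RPC-over-TCP record mark: high bit = last fragment,
     * low 31 bits = fragment length (RFC 1831 record marking). */
    static uint32_t encode_marker(uint32_t len)
    {
        return htonl(LAST_FRAG | len);
    }

    static void decode_marker(uint32_t wire)
    {
        uint32_t host = ntohl(wire);

        if (!(host & LAST_FRAG))
            printf("non-terminal fragment (rejected by svcsock.c)\n");
        else
            printf("last fragment, %u bytes\n", host & 0x7fffffffu);
    }

    int main(void)
    {
        decode_marker(encode_marker(1316));
        return 0;
    }

This mirrors svc_tcp_sendto further down, which prepends htonl(0x80000000 | (len - 4)) to every reply.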
*/ + printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n", + (unsigned long) svsk->sk_reclen); + goto err_delete; + } + svsk->sk_reclen &= 0x7fffffff; + dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); + if (svsk->sk_reclen > serv->sv_bufsz) { + printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n", + (unsigned long) svsk->sk_reclen); + goto err_delete; + } + } + + /* Check whether enough data is available */ + len = svc_recv_available(svsk); + if (len < 0) + goto error; + + if (len < svsk->sk_reclen) { + dprintk("svc: incomplete TCP record (%d of %d)\n", + len, svsk->sk_reclen); + svc_sock_received(svsk); + return -EAGAIN; /* record not complete */ + } + len = svsk->sk_reclen; + set_bit(SK_DATA, &svsk->sk_flags); + + vec[0] = rqstp->rq_arg.head[0]; + vlen = PAGE_SIZE; + pnum = 1; + while (vlen < len) { + vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]); + vec[pnum].iov_len = PAGE_SIZE; + pnum++; + vlen += PAGE_SIZE; + } + + /* Now receive data */ + len = svc_recvfrom(rqstp, vec, pnum, len); + if (len < 0) + goto error; + + dprintk("svc: TCP complete record (%d bytes)\n", len); + rqstp->rq_arg.len = len; + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + } + + rqstp->rq_skbuff = 0; + rqstp->rq_prot = IPPROTO_TCP; + + /* Reset TCP read info */ + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + + svc_sock_received(svsk); + if (serv->sv_stats) + serv->sv_stats->nettcpcnt++; + + return len; + + err_delete: + svc_delete_socket(svsk); + return -EAGAIN; + + error: + if (len == -EAGAIN) { + dprintk("RPC: TCP recvfrom got EAGAIN\n"); + svc_sock_received(svsk); + } else { + printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", + svsk->sk_server->sv_name, -len); + svc_sock_received(svsk); + } + + return len; +} + +/* + * Send out data on TCP socket. + */ +static int +svc_tcp_sendto(struct svc_rqst *rqstp) +{ + struct xdr_buf *xbufp = &rqstp->rq_res; + int sent; + u32 reclen; + + /* Set up the first element of the reply iovec. + * Any other iovecs that may be in use have been taken + * care of by the server implementation itself. + */ + reclen = htonl(0x80000000|((xbufp->len ) - 4)); + memcpy(xbufp->head[0].iov_base, &reclen, 4); + + sent = svc_sendto(rqstp, &rqstp->rq_res); + if (sent != xbufp->len) { + printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + rqstp->rq_sock->sk_server->sv_name, + (sent<0)?"got error":"sent only", + sent, xbufp->len); + svc_delete_socket(rqstp->rq_sock); + sent = -EAGAIN; + } + return sent; +} + +static void +svc_tcp_init(struct svc_sock *svsk) +{ + struct sock *sk = svsk->sk_sk; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + svsk->sk_recvfrom = svc_tcp_recvfrom; + svsk->sk_sendto = svc_tcp_sendto; + + if (sk->state == TCP_LISTEN) { + dprintk("setting up TCP socket for listening\n"); + sk->data_ready = svc_tcp_listen_data_ready; + set_bit(SK_CONN, &svsk->sk_flags); + } else { + dprintk("setting up TCP socket for reading\n"); + sk->state_change = svc_tcp_state_change; + sk->data_ready = svc_tcp_data_ready; + sk->write_space = svc_write_space; + + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + +<<<<<<< + /* initialise setting must have enough space to +||||||| +======= + tp->nonagle = 1; /* disable Nagle's algorithm */ + +>>>>>>> + * receive and respond to one request. 
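The unresolved conflict a few lines up is the whole point of this test case: the patch under test inserts tp->nonagle = 1 so small RPC replies are not held back by Nagle's algorithm. From user space the equivalent knob is TCP_NODELAY; a minimal sketch (error handling mostly elided, and the fd here is never connected):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int one = 1;

        /* User-space analogue of the kernel-internal tp->nonagle = 1:
         * disable Nagle so small replies go out immediately. */
        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0)
            perror("setsockopt(TCP_NODELAY)");
        else
            printf("Nagle disabled on fd %d\n", fd);
        return 0;
    }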
+ * svc_tcp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_server->sv_bufsz, + 3 * svsk->sk_server->sv_bufsz); + + set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(SK_DATA, &svsk->sk_flags); + } +} + +void +svc_sock_update_bufs(struct svc_serv *serv) +{ + /* + * The number of server threads has changed. Update + * rcvbuf and sndbuf accordingly on all sockets + */ + struct list_head *le; + + spin_lock_bh(&serv->sv_lock); + list_for_each(le, &serv->sv_permsocks) { + struct svc_sock *svsk = + list_entry(le, struct svc_sock, sk_list); + set_bit(SK_CHNGBUF, &svsk->sk_flags); + } + list_for_each(le, &serv->sv_tempsocks) { + struct svc_sock *svsk = + list_entry(le, struct svc_sock, sk_list); + set_bit(SK_CHNGBUF, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Receive the next request on any socket. + */ +int +svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) +{ + struct svc_sock *svsk =NULL; + int len; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_recv: service %p, socket not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + /* Initialize the buffers */ + /* first reclaim pages that were moved to response list */ + svc_pushback_allpages(rqstp); + + /* now allocate needed pages. If we get a failure, sleep briefly */ + pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; + while (rqstp->rq_arghi < pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ/2); + current->state = TASK_RUNNING; + continue; + } + rqstp->rq_argpages[rqstp->rq_arghi++] = p; + } + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]); + arg->head[0].iov_len = PAGE_SIZE; + rqstp->rq_argused = 1; + arg->pages = rqstp->rq_argpages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + if (signalled()) + return -EINTR; + + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + svsk = list_entry(serv->sv_tempsocks.next, + struct svc_sock, sk_list); + /* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ + if (get_seconds() - svsk->sk_lastrecv < 6*60 + || test_bit(SK_BUSY, &svsk->sk_flags)) + svsk = NULL; + } + if (svsk) { + set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(SK_CLOSE, &svsk->sk_flags); + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + } else if ((svsk = svc_sock_dequeue(serv)) != NULL) { + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + rqstp->rq_reserved = serv->sv_bufsz; + svsk->sk_reserved += rqstp->rq_reserved; + } else { + /* No data pending. Go to sleep */ + svc_serv_enqueue(serv, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... 
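svc_recv above sizes its receive buffer as one head page, one spare page for the response, plus enough pages for sv_bufsz, sleeping half a second and retrying whenever alloc_page fails. A user-space sketch of that arithmetic and retry loop (malloc stands in for alloc_page):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define PAGE_SIZE 4096

    /* Retry allocation with a short sleep, as svc_recv does with
     * schedule_timeout(HZ/2) when alloc_page returns NULL. */
    static void *alloc_page_retry(void)
    {
        void *p;

        while ((p = malloc(PAGE_SIZE)) == NULL) {
            struct timespec ts = { 0, 500 * 1000 * 1000 };  /* ~HZ/2 */
            nanosleep(&ts, NULL);
        }
        return p;
    }

    int main(void)
    {
        int bufsz = 16 * 1024;
        int pages = 2 + (bufsz + PAGE_SIZE - 1) / PAGE_SIZE;
        void *pg;

        printf("need %d pages for bufsz=%d\n", pages, bufsz);
        pg = alloc_page_retry();
        free(pg);
        return 0;
    }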
+ */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&serv->sv_lock); + + schedule_timeout(timeout); + + spin_lock_bh(&serv->sv_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + + if (!(svsk = rqstp->rq_sock)) { + svc_serv_dequeue(serv, rqstp); + spin_unlock_bh(&serv->sv_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? -EINTR : -EAGAIN; + } + } + spin_unlock_bh(&serv->sv_lock); + + dprintk("svc: server %p, socket %p, inuse=%d\n", + rqstp, svsk, svsk->sk_inuse); + len = svsk->sk_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) { + svc_sock_release(rqstp); + return -EAGAIN; + } + svsk->sk_lastrecv = get_seconds(); + if (test_bit(SK_TEMP, &svsk->sk_flags)) { + /* push active sockets to end of list */ + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&svsk->sk_list)) + list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); + spin_unlock_bh(&serv->sv_lock); + } + + rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; + rqstp->rq_userset = 0; + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +} + +/* + * Drop request + */ +void +svc_drop(struct svc_rqst *rqstp) +{ + dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); + svc_sock_release(rqstp); +} + +/* + * Return reply to client. + */ +int +svc_send(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk; + int len; + struct xdr_buf *xb; + + if ((svsk = rqstp->rq_sock) == NULL) { + printk(KERN_WARNING "NULL socket pointer in %s:%d\n", + __FILE__, __LINE__); + return -EFAULT; + } + + /* release the receive skb before sending the reply */ + svc_release_skb(rqstp); + + /* calculate over-all length */ + xb = & rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + len = svsk->sk_sendto(rqstp); + svc_sock_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Initialize socket for RPC use and create svc_sock struct + * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
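svc_send above computes the reply length as the plain sum of head, page data and tail before handing off to sk_sendto. A sketch with a simplified stand-in for xdr_buf (the real struct carries iovecs and page pointers, not bare lengths):

    #include <stdio.h>
    #include <stddef.h>

    /* Simplified stand-in for the kernel's xdr_buf: a reply is a head
     * iovec, a run of page data, and a tail iovec, and svc_send's total
     * is just the sum of the three. */
    struct xdr_buf_sketch {
        size_t head_len;
        size_t page_len;
        size_t tail_len;
    };

    static size_t xdr_total(const struct xdr_buf_sketch *xb)
    {
        return xb->head_len + xb->page_len + xb->tail_len;
    }

    int main(void)
    {
        struct xdr_buf_sketch res = { 128, 8192, 12 };

        printf("reply length = %zu bytes\n", xdr_total(&res));
        return 0;
    }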
+ */ +static struct svc_sock * +svc_setup_socket(struct svc_serv *serv, struct socket *sock, + int *errp, int pmap_register) +{ + struct svc_sock *svsk; + struct sock *inet; + + dprintk("svc: svc_setup_socket %p\n", sock); + if (!(svsk = kmalloc(sizeof(*svsk), GFP_KERNEL))) { + *errp = -ENOMEM; + return NULL; + } + memset(svsk, 0, sizeof(*svsk)); + + inet = sock->sk; + + /* Register socket with portmapper */ + if (*errp >= 0 && pmap_register) + *errp = svc_register(serv, inet->protocol, + ntohs(inet_sk(inet)->sport)); + + if (*errp < 0) { + kfree(svsk); + return NULL; + } + + set_bit(SK_BUSY, &svsk->sk_flags); + inet->user_data = svsk; + svsk->sk_sock = sock; + svsk->sk_sk = inet; + svsk->sk_ostate = inet->state_change; + svsk->sk_odata = inet->data_ready; + svsk->sk_owspace = inet->write_space; + svsk->sk_server = serv; + svsk->sk_lastrecv = get_seconds(); + INIT_LIST_HEAD(&svsk->sk_deferred); + INIT_LIST_HEAD(&svsk->sk_ready); + sema_init(&svsk->sk_sem, 1); + + /* Initialize the socket */ + if (sock->type == SOCK_DGRAM) + svc_udp_init(svsk); + else + svc_tcp_init(svsk); + + spin_lock_bh(&serv->sv_lock); + if (!pmap_register) { + set_bit(SK_TEMP, &svsk->sk_flags); + list_add(&svsk->sk_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + } else { + clear_bit(SK_TEMP, &svsk->sk_flags); + list_add(&svsk->sk_list, &serv->sv_permsocks); + } + spin_unlock_bh(&serv->sv_lock); + + dprintk("svc: svc_setup_socket created %p (inet %p)\n", + svsk, svsk->sk_sk); + + clear_bit(SK_BUSY, &svsk->sk_flags); + svc_sock_enqueue(svsk); + return svsk; +} + +/* + * Create socket for RPC service. + */ +static int +svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) +{ + struct svc_sock *svsk; + struct socket *sock; + int error; + int type; + + dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n", + serv->sv_program->pg_name, protocol, + NIPQUAD(sin->sin_addr.s_addr), + ntohs(sin->sin_port)); + + if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { + printk(KERN_WARNING "svc: only UDP and TCP " + "sockets supported\n"); + return -EINVAL; + } + type = (protocol == IPPROTO_UDP)? 
SOCK_DGRAM : SOCK_STREAM; + + if ((error = sock_create(PF_INET, type, protocol, &sock)) < 0) + return error; + + if (sin != NULL) { + sock->sk->reuse = 1; /* allow address reuse */ + error = sock->ops->bind(sock, (struct sockaddr *) sin, + sizeof(*sin)); + if (error < 0) + goto bummer; + } + + if (protocol == IPPROTO_TCP) { + if ((error = sock->ops->listen(sock, 64)) < 0) + goto bummer; + } + + if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) + return 0; + +bummer: + dprintk("svc: svc_create_socket error = %d\n", -error); + sock_release(sock); + return error; +} + +/* + * Remove a dead socket + */ +void +svc_delete_socket(struct svc_sock *svsk) +{ + struct svc_serv *serv; + struct sock *sk; + + dprintk("svc: svc_delete_socket(%p)\n", svsk); + + serv = svsk->sk_server; + sk = svsk->sk_sk; + + sk->state_change = svsk->sk_ostate; + sk->data_ready = svsk->sk_odata; + sk->write_space = svsk->sk_owspace; + + spin_lock_bh(&serv->sv_lock); + + list_del_init(&svsk->sk_list); + list_del_init(&svsk->sk_ready); + if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) + if (test_bit(SK_TEMP, &svsk->sk_flags)) + serv->sv_tmpcnt--; + + if (!svsk->sk_inuse) { + spin_unlock_bh(&serv->sv_lock); + sock_release(svsk->sk_sock); + kfree(svsk); + } else { + spin_unlock_bh(&serv->sv_lock); + dprintk(KERN_NOTICE "svc: server socket destroy delayed\n"); + /* svsk->sk_server = NULL; */ + } +} + +/* + * Make a socket for nfsd and lockd + */ +int +svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) +{ + struct sockaddr_in sin; + + dprintk("svc: creating socket proto = %d\n", protocol); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = htons(port); + return svc_create_socket(serv, protocol, &sin); +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); + struct svc_serv *serv = dr->serv; + struct svc_sock *svsk; + + if (too_many) { + svc_sock_put(dr->svsk); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + svsk = dr->svsk; + dr->svsk = NULL; + spin_lock(&serv->sv_lock); + list_add(&dr->handle.recent, &svsk->sk_deferred); + spin_unlock(&serv->sv_lock); + set_bit(SK_DEFERRED, &svsk->sk_flags); + svc_sock_enqueue(svsk); + svc_sock_put(svsk); +} + +static struct cache_deferred_req * +svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + /* FIXME maybe discard if size too large */ + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->serv = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + dr->addr = rqstp->rq_addr; + dr->argslen = rqstp->rq_arg.len >> 2; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); + } + spin_lock(&rqstp->rq_server->sv_lock); + rqstp->rq_sock->sk_inuse++; + dr->svsk = rqstp->rq_sock; + spin_unlock(&rqstp->rq_server->sv_lock); + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req 
*dr = rqstp->rq_deferred; + + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + rqstp->rq_addr = dr->addr; + return dr->argslen<<2; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +{ + struct svc_deferred_req *dr = NULL; + struct svc_serv *serv = svsk->sk_server; + + if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) + return NULL; + spin_lock(&serv->sv_lock); + clear_bit(SK_DEFERRED, &svsk->sk_flags); + if (!list_empty(&svsk->sk_deferred)) { + dr = list_entry(svsk->sk_deferred.next, + struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + set_bit(SK_DEFERRED, &svsk->sk_flags); + } + spin_unlock(&serv->sv_lock); + svc_sock_received(svsk); + return dr; +} diff --git a/tests/linux/rpc_tcp_nonagle/orig b/tests/linux/rpc_tcp_nonagle/orig new file mode 100644 index 0000000..983322e --- /dev/null +++ b/tests/linux/rpc_tcp_nonagle/orig @@ -0,0 +1,1511 @@ +/* + * linux/net/sunrpc/svcsock.c + * + * These are the RPC server socket internals. + * + * The server scheduling algorithm does not always distribute the load + * evenly when servicing a single client. May need to modify the + * svc_sock_enqueue procedure... + * + * TCP support is largely untested and may be a little slow. The problem + * is that we currently do two separate recvfrom's, one for the 4-byte + * record length, and the second for the actual record. This could possibly + * be improved by always reading a minimum size of around 100 bytes and + * tucking any superfluous bytes away in a temporary store. Still, that + * leaves write requests out in the rain. An alternative may be to peek at + * the first skb in the queue, and if it matches the next TCP sequence + * number, to extract the record marker. Yuck. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* SMP locking strategy: + * + * svc_serv->sv_lock protects most stuff for that service. + * + * Some flags can be set to certain values at any time + * providing that certain rules are followed: + * + * SK_BUSY can be set to 0 at any time. + * svc_sock_enqueue must be called afterwards + * SK_CONN, SK_DATA, can be set or cleared at any time. + * after a set, svc_sock_enqueue must be called. + * after a clear, the socket must be read/accepted + * if this succeeds, it must be set again. + * SK_CLOSE can set at any time. It is never cleared. + * + */ + +#define RPCDBG_FACILITY RPCDBG_SVCSOCK + + +static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, + int *errp, int pmap_reg); +static void svc_udp_data_ready(struct sock *, int); +static int svc_udp_recvfrom(struct svc_rqst *); +static int svc_udp_sendto(struct svc_rqst *); + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req *req); + +/* + * Queue up an idle server thread. Must have serv->sv_lock held. + * Note: this is really a stack rather than a queue, so that we only + * use as many different threads as we need, and the rest don't polute + * the cache. 
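The note above explains why sv_threads is pushed and popped at the head: idle threads form a LIFO stack, so a lightly loaded server keeps re-waking the same few threads and their caches stay warm. A toy sketch of the effect:

    #include <stdio.h>

    /* Sketch of the idle-thread "queue" in svcsock.c: threads are pushed
     * and popped at the head (LIFO), so a lightly loaded server keeps
     * reusing the same few threads while the rest stay cold. */
    #define MAXT 4

    static int stack[MAXT], top;

    static void thread_idle(int id)  { stack[top++] = id; }
    static int  thread_wake(void)    { return stack[--top]; }

    int main(void)
    {
        thread_idle(1);
        thread_idle(2);
        thread_idle(3);

        /* Two requests in a row wake 3, then 3 again - thread 1 is
         * never touched. */
        printf("woken: %d\n", thread_wake());
        thread_idle(3);
        printf("woken: %d\n", thread_wake());
        return 0;
    }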
+ */ +static inline void +svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + list_add(&rqstp->rq_list, &serv->sv_threads); +} + +/* + * Dequeue an nfsd thread. Must have serv->sv_lock held. + */ +static inline void +svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + list_del(&rqstp->rq_list); +} + +/* + * Release an skbuff after use + */ +static inline void +svc_release_skb(struct svc_rqst *rqstp) +{ + struct sk_buff *skb = rqstp->rq_skbuff; + struct svc_deferred_req *dr = rqstp->rq_deferred; + + if (skb) { + rqstp->rq_skbuff = NULL; + + dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + skb_free_datagram(rqstp->rq_sock->sk_sk, skb); + } + if (dr) { + rqstp->rq_deferred = NULL; + kfree(dr); + } +} + +/* + * Queue up a socket with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +static void +svc_sock_enqueue(struct svc_sock *svsk) +{ + struct svc_serv *serv = svsk->sk_server; + struct svc_rqst *rqstp; + + if (!(svsk->sk_flags & + ( (1<sv_lock); + + if (!list_empty(&serv->sv_threads) && + !list_empty(&serv->sv_sockets)) + printk(KERN_ERR + "svc_sock_enqueue: threads and sockets both waiting??\n"); + + if (test_bit(SK_DEAD, &svsk->sk_flags)) { + /* Don't enqueue dead sockets */ + dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); + goto out_unlock; + } + + if (test_bit(SK_BUSY, &svsk->sk_flags)) { + /* Don't enqueue socket while daemon is receiving */ + dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); + goto out_unlock; + } + + if (((svsk->sk_reserved + serv->sv_bufsz)*2 + > sock_wspace(svsk->sk_sk)) + && !test_bit(SK_CLOSE, &svsk->sk_flags) + && !test_bit(SK_CONN, &svsk->sk_flags)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", + svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz, + sock_wspace(svsk->sk_sk)); + goto out_unlock; + } + + /* Mark socket as busy. It will remain in this state until the + * server has processed all pending data and put the socket back + * on the idle list. + */ + set_bit(SK_BUSY, &svsk->sk_flags); + + if (!list_empty(&serv->sv_threads)) { + rqstp = list_entry(serv->sv_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: socket %p served by daemon %p\n", + svsk->sk_sk, rqstp); + svc_serv_dequeue(serv, rqstp); + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_sock_enqueue: server %p, rq_sock=%p!\n", + rqstp, rqstp->rq_sock); + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + rqstp->rq_reserved = serv->sv_bufsz; + svsk->sk_reserved += rqstp->rq_reserved; + wake_up(&rqstp->rq_wait); + } else { + dprintk("svc: socket %p put into queue\n", svsk->sk_sk); + list_add_tail(&svsk->sk_ready, &serv->sv_sockets); + } + +out_unlock: + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Dequeue the first socket. Must be called with the serv->sv_lock held. + */ +static inline struct svc_sock * +svc_sock_dequeue(struct svc_serv *serv) +{ + struct svc_sock *svsk; + + if (list_empty(&serv->sv_sockets)) + return NULL; + + svsk = list_entry(serv->sv_sockets.next, + struct svc_sock, sk_ready); + list_del_init(&svsk->sk_ready); + + dprintk("svc: socket %p dequeued, inuse=%d\n", + svsk->sk_sk, svsk->sk_inuse); + + return svsk; +} + +/* + * Having read something from a socket, check whether it + * needs to be re-enqueued. + * Note: SK_DATA only gets cleared when a read-attempt finds + * no (or insufficient) data. 
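svc_sock_enqueue above relies on SK_BUSY as a hand-off bit: whoever sets it owns the socket, and svc_sock_received clears it and re-runs enqueue in case more data arrived in the meantime. A sketch of that handshake using a C11 atomic flag in place of the kernel's bitops:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Sketch of the SK_BUSY handshake: enqueue claims the socket with a
     * test-and-set so only one thread services it; svc_sock_received
     * clears the bit and re-runs enqueue in case more work arrived. */
    static atomic_flag busy = ATOMIC_FLAG_INIT;

    static void enqueue(void)
    {
        if (atomic_flag_test_and_set(&busy)) {
            printf("socket busy, not enqueued\n");
            return;
        }
        printf("socket handed to a thread\n");
    }

    static void received(void)
    {
        atomic_flag_clear(&busy);   /* clear SK_BUSY ... */
        enqueue();                  /* ... then always re-enqueue */
    }

    int main(void)
    {
        enqueue();    /* first caller wins */
        enqueue();    /* second sees busy */
        received();   /* done: clear and retry */
        return 0;
    }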
+ */ +static inline void +svc_sock_received(struct svc_sock *svsk) +{ + clear_bit(SK_BUSY, &svsk->sk_flags); + svc_sock_enqueue(svsk); +} + + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the socket + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_sock *svsk = rqstp->rq_sock; + spin_lock_bh(&svsk->sk_server->sv_lock); + svsk->sk_reserved -= (rqstp->rq_reserved - space); + rqstp->rq_reserved = space; + spin_unlock_bh(&svsk->sk_server->sv_lock); + + svc_sock_enqueue(svsk); + } +} + +/* + * Release a socket after use. + */ +static inline void +svc_sock_put(struct svc_sock *svsk) +{ + struct svc_serv *serv = svsk->sk_server; + + spin_lock_bh(&serv->sv_lock); + if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { + spin_unlock_bh(&serv->sv_lock); + dprintk("svc: releasing dead socket\n"); + sock_release(svsk->sk_sock); + kfree(svsk); + } + else + spin_unlock_bh(&serv->sv_lock); +} + +static void +svc_sock_release(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + + svc_release_skb(rqstp); + + svc_free_allpages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + + /* Reset response buffer and release + * the reservation. + * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! + */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_sock = NULL; + + svc_sock_put(svsk); +} + +/* + * External function to wake up a server waiting for data + */ +void +svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_threads)) { + rqstp = list_entry(serv->sv_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_serv_dequeue(serv, rqstp); + rqstp->rq_sock = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Generic sendto routine + */ +static int +svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct socket *sock = svsk->sk_sock; + int slen; + int len = 0; + int result; + int size; + struct page **ppage = xdr->pages; + size_t base = xdr->page_base; + unsigned int pglen = xdr->page_len; + unsigned int flags = MSG_MORE; + + slen = xdr->len; + + /* Grab svsk->sk_sem to serialize outgoing data. 
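The page walk in svc_sendto just below sends the head, then whole pages, then the tail, clamping the first chunk by the offset into the first page (page_base) and the last by the remaining page_len. The chunk arithmetic in isolation:

    #include <stdio.h>

    #define PAGE_SIZE 4096

    /* Sketch of svc_sendto's page walk: the first chunk is limited by
     * page_base, later chunks by PAGE_SIZE, the last by what remains. */
    int main(void)
    {
        unsigned int base = 100, pglen = 9000;
        unsigned int size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;

        while (pglen > 0) {
            printf("sendpage(base=%u, size=%u)\n", base, size);
            pglen -= size;
            size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
            base = 0;
        }
        return 0;
    }

For base=100 and pglen=9000 this emits chunks of 3996, 4096 and 908 bytes, exactly covering the page data.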
*/ + down(&svsk->sk_sem); + + if (rqstp->rq_prot == IPPROTO_UDP) { + /* set the destination */ + struct msghdr msg; + msg.msg_name = &rqstp->rq_addr; + msg.msg_namelen = sizeof(rqstp->rq_addr); + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_MORE; + + if (sock_sendmsg(sock, &msg, 0) < 0) + goto out; + } + + /* send head */ + if (slen == xdr->head[0].iov_len) + flags = 0; + len = sock->ops->sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags); + if (len != xdr->head[0].iov_len) + goto out; + slen -= xdr->head[0].iov_len; + if (slen == 0) + goto out; + + /* send page data */ + size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; + while (pglen > 0) { + if (slen == size) + flags = 0; + result = sock->ops->sendpage(sock, *ppage, base, size, flags); + if (result > 0) + len += result; + if (result != size) + goto out; + slen -= size; + pglen -= size; + size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; + base = 0; + ppage++; + } + /* send tail */ + if (xdr->tail[0].iov_len) { + /* The tail *will* be in respages[0]; */ + result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage], + ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1), + xdr->tail[0].iov_len, 0); + + if (result > 0) + len += result; + } +out: + up(&svsk->sk_sem); + + dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n", + rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, + rqstp->rq_addr.sin_addr.s_addr); + + return len; +} + +/* + * Check input queue length + */ +static int +svc_recv_available(struct svc_sock *svsk) +{ + mm_segment_t oldfs; + struct socket *sock = svsk->sk_sock; + int avail, err; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail); + set_fs(oldfs); + + return (err >= 0)? avail : err; +} + +/* + * Generic recvfrom routine. + */ +static int +svc_recvfrom(struct svc_rqst *rqstp, struct iovec *iov, int nr, int buflen) +{ + mm_segment_t oldfs; + struct msghdr msg; + struct socket *sock; + int len, alen; + + rqstp->rq_addrlen = sizeof(rqstp->rq_addr); + sock = rqstp->rq_sock->sk_sock; + + msg.msg_name = &rqstp->rq_addr; + msg.msg_namelen = sizeof(rqstp->rq_addr); + msg.msg_iov = iov; + msg.msg_iovlen = nr; + msg.msg_control = NULL; + msg.msg_controllen = 0; + + msg.msg_flags = MSG_DONTWAIT; + + oldfs = get_fs(); set_fs(KERNEL_DS); + len = sock_recvmsg(sock, &msg, buflen, MSG_DONTWAIT); + set_fs(oldfs); + + /* sock_recvmsg doesn't fill in the name/namelen, so we must.. + * possibly we should cache this in the svc_sock structure + * at accept time. FIXME + */ + alen = sizeof(rqstp->rq_addr); + sock->ops->getname(sock, (struct sockaddr *)&rqstp->rq_addr, &alen, 1); + + dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", + rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len); + + return len; +} + +/* + * Set socket snd and rcv buffer lengths + */ +static inline void +svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) +{ +#if 0 + mm_segment_t oldfs; + oldfs = get_fs(); set_fs(KERNEL_DS); + sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char*)&snd, sizeof(snd)); + sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char*)&rcv, sizeof(rcv)); +#else + /* sock_setsockopt limits use to sysctl_?mem_max, + * which isn't acceptable. Until that is made conditional + * on not having CAP_SYS_RESOURCE or similar, we go direct... + * DaveM said I could! 
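The #else branch below writes sk->sndbuf directly precisely because sock_setsockopt clamps to the wmem_max sysctl; note that it also stores snd * 2, matching the kernel's own convention of doubling the requested value to cover bookkeeping overhead. That doubling is visible from user space:

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    /* On Linux, SO_SNDBUF is both clamped by wmem_max and doubled by
     * the kernel for bookkeeping overhead - which is why svcsock.c
     * bypasses sock_setsockopt and writes sk->sndbuf = snd * 2 itself. */
    int main(void)
    {
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int snd = 64 * 1024, out;
        socklen_t len = sizeof(out);

        setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd));
        getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &out, &len);
        printf("asked for %d, kernel stored %d\n", snd, out);
        return 0;
    }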
+ */ + lock_sock(sock->sk); + sock->sk->sndbuf = snd * 2; + sock->sk->rcvbuf = rcv * 2; + sock->sk->userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; + release_sock(sock->sk); +#endif +} +/* + * INET callback when data has been received on the socket. + */ +static void +svc_udp_data_ready(struct sock *sk, int count) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->user_data); + + if (!svsk) + goto out; + dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", + svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); + set_bit(SK_DATA, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); +} + +/* + * INET callback when space is newly available on the socket. + */ +static void +svc_write_space(struct sock *sk) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->user_data); + + if (svsk) { + dprintk("svc: socket %p(inet %p), write_space busy=%d\n", + svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); + svc_sock_enqueue(svsk); + } + + if (sk->sleep && waitqueue_active(sk->sleep)) { + printk(KERN_WARNING "RPC svc_write_space: some sleeping on %p\n", + svsk); + wake_up_interruptible(sk->sleep); + } +} + +/* + * Receive a datagram from a UDP socket. + */ +extern int +csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); + +static int +svc_udp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sk_server; + struct sk_buff *skb; + int err, len; + + if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + /* udp sockets need large rcvbuf as all pending + * requests are still in that buffer. sndbuf must + * also be large enough that there is enough space + * for one reply per thread. + */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_bufsz, + (serv->sv_nrthreads+3) * serv->sv_bufsz); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) + return svc_deferred_recv(rqstp); + + clear_bit(SK_DATA, &svsk->sk_flags); + while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) { + svc_sock_received(svsk); + if (err == -EAGAIN) + return err; + /* possibly an icmp error */ + dprintk("svc: recvfrom returned error %d\n", -err); + } + set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... 
*/ + + len = skb->len - sizeof(struct udphdr); + rqstp->rq_arg.len = len; + + rqstp->rq_prot = IPPROTO_UDP; + + /* Get sender address */ + rqstp->rq_addr.sin_family = AF_INET; + rqstp->rq_addr.sin_port = skb->h.uh->source; + rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr; + + if (skb_is_nonlinear(skb)) { + /* we have to copy */ + local_bh_disable(); + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { + local_bh_enable(); + /* checksum error */ + skb_free_datagram(svsk->sk_sk, skb); + svc_sock_received(svsk); + return 0; + } + local_bh_enable(); + skb_free_datagram(svsk->sk_sk, skb); + } else { + /* we can use it in-place */ + rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); + rqstp->rq_arg.head[0].iov_len = len; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + skb_free_datagram(svsk->sk_sk, skb); + svc_sock_received(svsk); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + rqstp->rq_skbuff = skb; + } + + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; + } + + if (serv->sv_stats) + serv->sv_stats->netudpcnt++; + + /* One down, maybe more to go... */ + svsk->sk_sk->stamp = skb->stamp; + svc_sock_received(svsk); + + return len; +} + +static int +svc_udp_sendto(struct svc_rqst *rqstp) +{ + int error; + + error = svc_sendto(rqstp, &rqstp->rq_res); + if (error == -ECONNREFUSED) + /* ICMP error on earlier request. */ + error = svc_sendto(rqstp, &rqstp->rq_res); + + return error; +} + +static void +svc_udp_init(struct svc_sock *svsk) +{ + svsk->sk_sk->data_ready = svc_udp_data_ready; + svsk->sk_sk->write_space = svc_write_space; + svsk->sk_recvfrom = svc_udp_recvfrom; + svsk->sk_sendto = svc_udp_sendto; + + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_udp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_server->sv_bufsz, + 3 * svsk->sk_server->sv_bufsz); + + set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ + set_bit(SK_CHNGBUF, &svsk->sk_flags); +} + +/* + * A data_ready event on a listening socket means there's a connection + * pending. Do not use state_change as a substitute for it. + */ +static void +svc_tcp_listen_data_ready(struct sock *sk, int count_unused) +{ + struct svc_sock *svsk; + + dprintk("svc: socket %p TCP (listen) state change %d\n", + sk, sk->state); + + if (sk->state != TCP_ESTABLISHED) { + /* Aborted connection, SYN_RECV or whatever... */ + goto out; + } + if (!(svsk = (struct svc_sock *) sk->user_data)) { + printk("svc: socket %p: no user data\n", sk); + goto out; + } + set_bit(SK_CONN, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible_all(sk->sleep); +} + +/* + * A state change on a connected socket means it's dying or dead. 
+ */ +static void +svc_tcp_state_change(struct sock *sk) +{ + struct svc_sock *svsk; + + dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", + sk, sk->state, sk->user_data); + + if (!(svsk = (struct svc_sock *) sk->user_data)) { + printk("svc: socket %p: no user data\n", sk); + goto out; + } + set_bit(SK_CLOSE, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible_all(sk->sleep); +} + +static void +svc_tcp_data_ready(struct sock *sk, int count) +{ + struct svc_sock * svsk; + + dprintk("svc: socket %p TCP data ready (svsk %p)\n", + sk, sk->user_data); + if (!(svsk = (struct svc_sock *)(sk->user_data))) + goto out; + set_bit(SK_DATA, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); +} + +/* + * Accept a TCP connection + */ +static void +svc_tcp_accept(struct svc_sock *svsk) +{ + struct sockaddr_in sin; + struct svc_serv *serv = svsk->sk_server; + struct socket *sock = svsk->sk_sock; + struct socket *newsock; + struct proto_ops *ops; + struct svc_sock *newsvsk; + int err, slen; + + dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); + if (!sock) + return; + + if (!(newsock = sock_alloc())) { + printk(KERN_WARNING "%s: no more sockets!\n", serv->sv_name); + return; + } + dprintk("svc: tcp_accept %p allocated\n", newsock); + + newsock->type = sock->type; + newsock->ops = ops = sock->ops; + + clear_bit(SK_CONN, &svsk->sk_flags); + if ((err = ops->accept(sock, newsock, O_NONBLOCK)) < 0) { + if (err != -EAGAIN && net_ratelimit()) + printk(KERN_WARNING "%s: accept failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + set_bit(SK_CONN, &svsk->sk_flags); + svc_sock_enqueue(svsk); + + slen = sizeof(sin); + err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1); + if (err < 0) { + if (net_ratelimit()) + printk(KERN_WARNING "%s: peername failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + + /* Ideally, we would want to reject connections from unauthorized + * hosts here, but when we get encription, the IP of the host won't + * tell us anything. For now just warn about unpriv connections. + */ + if (ntohs(sin.sin_port) >= 1024) { + dprintk(KERN_WARNING + "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n", + serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + } + + dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + + /* make sure that a write doesn't block forever when + * low on memory + */ + newsock->sk->sndtimeo = HZ*30; + + if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0))) + goto failed; + + + /* make sure that we don't have too many active connections. + * If we have, something must be dropped. + * We randomly choose between newest and oldest (in terms + * of recent activity) and drop it. 
+ */ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*5) { + struct svc_sock *svsk = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_random()&1) + svsk = list_entry(serv->sv_tempsocks.prev, + struct svc_sock, + sk_list); + else + svsk = list_entry(serv->sv_tempsocks.next, + struct svc_sock, + sk_list); + set_bit(SK_CLOSE, &svsk->sk_flags); + svsk->sk_inuse ++; + } + spin_unlock_bh(&serv->sv_lock); + + if (svsk) { + svc_sock_enqueue(svsk); + svc_sock_put(svsk); + } + + } + + if (serv->sv_stats) + serv->sv_stats->nettcpconn++; + + return; + +failed: + sock_release(newsock); + return; +} + +/* + * Receive data from a TCP socket. + */ +static int +svc_tcp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sk_server; + int len; + struct iovec vec[RPCSVC_MAXPAGES]; + int pnum, vlen; + + dprintk("svc: tcp_recv %p data %d conn %d close %d\n", + svsk, test_bit(SK_DATA, &svsk->sk_flags), + test_bit(SK_CONN, &svsk->sk_flags), + test_bit(SK_CLOSE, &svsk->sk_flags)); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) + return svc_deferred_recv(rqstp); + + if (test_bit(SK_CLOSE, &svsk->sk_flags)) { + svc_delete_socket(svsk); + return 0; + } + + if (test_bit(SK_CONN, &svsk->sk_flags)) { + svc_tcp_accept(svsk); + svc_sock_received(svsk); + return 0; + } + + if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + /* sndbuf needs to have room for one request + * per thread, otherwise we can stall even when the + * network isn't a bottleneck. + * rcvbuf just needs to be able to hold a few requests. + * Normally they will be removed from the queue + * as soon a a complete request arrives. + */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_bufsz, + 3 * serv->sv_bufsz); + + clear_bit(SK_DATA, &svsk->sk_flags); + + /* Receive data. If we haven't got the record length yet, get + * the next four bytes. Otherwise try to gobble up as much as + * possible up to the complete record length. + */ + if (svsk->sk_tcplen < 4) { + unsigned long want = 4 - svsk->sk_tcplen; + struct iovec iov; + + iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_len = want; + if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) + goto error; + svsk->sk_tcplen += len; + + if (len < want) { + dprintk("svc: short recvfrom while reading record length (%d of %d)\n", + len, want); + svc_sock_received(svsk); + return -EAGAIN; /* record header not complete */ + } + + svsk->sk_reclen = ntohl(svsk->sk_reclen); + if (!(svsk->sk_reclen & 0x80000000)) { + /* FIXME: technically, a record can be fragmented, + * and non-terminal fragments will not have the top + * bit set in the fragment length header. + * But apparently no known nfs clients send fragmented + * records. 
*/ + printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n", + (unsigned long) svsk->sk_reclen); + goto err_delete; + } + svsk->sk_reclen &= 0x7fffffff; + dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); + if (svsk->sk_reclen > serv->sv_bufsz) { + printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n", + (unsigned long) svsk->sk_reclen); + goto err_delete; + } + } + + /* Check whether enough data is available */ + len = svc_recv_available(svsk); + if (len < 0) + goto error; + + if (len < svsk->sk_reclen) { + dprintk("svc: incomplete TCP record (%d of %d)\n", + len, svsk->sk_reclen); + svc_sock_received(svsk); + return -EAGAIN; /* record not complete */ + } + len = svsk->sk_reclen; + set_bit(SK_DATA, &svsk->sk_flags); + + vec[0] = rqstp->rq_arg.head[0]; + vlen = PAGE_SIZE; + pnum = 1; + while (vlen < len) { + vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]); + vec[pnum].iov_len = PAGE_SIZE; + pnum++; + vlen += PAGE_SIZE; + } + + /* Now receive data */ + len = svc_recvfrom(rqstp, vec, pnum, len); + if (len < 0) + goto error; + + dprintk("svc: TCP complete record (%d bytes)\n", len); + rqstp->rq_arg.len = len; + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + } + + rqstp->rq_skbuff = 0; + rqstp->rq_prot = IPPROTO_TCP; + + /* Reset TCP read info */ + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + + svc_sock_received(svsk); + if (serv->sv_stats) + serv->sv_stats->nettcpcnt++; + + return len; + + err_delete: + svc_delete_socket(svsk); + return -EAGAIN; + + error: + if (len == -EAGAIN) { + dprintk("RPC: TCP recvfrom got EAGAIN\n"); + svc_sock_received(svsk); + } else { + printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", + svsk->sk_server->sv_name, -len); + svc_sock_received(svsk); + } + + return len; +} + +/* + * Send out data on TCP socket. + */ +static int +svc_tcp_sendto(struct svc_rqst *rqstp) +{ + struct xdr_buf *xbufp = &rqstp->rq_res; + int sent; + u32 reclen; + + /* Set up the first element of the reply iovec. + * Any other iovecs that may be in use have been taken + * care of by the server implementation itself. + */ + reclen = htonl(0x80000000|((xbufp->len ) - 4)); + memcpy(xbufp->head[0].iov_base, &reclen, 4); + + sent = svc_sendto(rqstp, &rqstp->rq_res); + if (sent != xbufp->len) { + printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + rqstp->rq_sock->sk_server->sv_name, + (sent<0)?"got error":"sent only", + sent, xbufp->len); + svc_delete_socket(rqstp->rq_sock); + sent = -EAGAIN; + } + return sent; +} + +static void +svc_tcp_init(struct svc_sock *svsk) +{ + struct sock *sk = svsk->sk_sk; + + svsk->sk_recvfrom = svc_tcp_recvfrom; + svsk->sk_sendto = svc_tcp_sendto; + + if (sk->state == TCP_LISTEN) { + dprintk("setting up TCP socket for listening\n"); + sk->data_ready = svc_tcp_listen_data_ready; + set_bit(SK_CONN, &svsk->sk_flags); + } else { + dprintk("setting up TCP socket for reading\n"); + sk->state_change = svc_tcp_state_change; + sk->data_ready = svc_tcp_data_ready; + sk->write_space = svc_write_space; + + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + + /* initialise setting must have enough space to + * receive and respond to one request. 
+ * svc_tcp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_server->sv_bufsz, + 3 * svsk->sk_server->sv_bufsz); + + set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(SK_DATA, &svsk->sk_flags); + } +} + +void +svc_sock_update_bufs(struct svc_serv *serv) +{ + /* + * The number of server threads has changed. Update + * rcvbuf and sndbuf accordingly on all sockets + */ + struct list_head *le; + + spin_lock_bh(&serv->sv_lock); + list_for_each(le, &serv->sv_permsocks) { + struct svc_sock *svsk = + list_entry(le, struct svc_sock, sk_list); + set_bit(SK_CHNGBUF, &svsk->sk_flags); + } + list_for_each(le, &serv->sv_tempsocks) { + struct svc_sock *svsk = + list_entry(le, struct svc_sock, sk_list); + set_bit(SK_CHNGBUF, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Receive the next request on any socket. + */ +int +svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) +{ + struct svc_sock *svsk =NULL; + int len; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_recv: service %p, socket not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + /* Initialize the buffers */ + /* first reclaim pages that were moved to response list */ + svc_pushback_allpages(rqstp); + + /* now allocate needed pages. If we get a failure, sleep briefly */ + pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; + while (rqstp->rq_arghi < pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ/2); + current->state = TASK_RUNNING; + continue; + } + rqstp->rq_argpages[rqstp->rq_arghi++] = p; + } + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]); + arg->head[0].iov_len = PAGE_SIZE; + rqstp->rq_argused = 1; + arg->pages = rqstp->rq_argpages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + if (signalled()) + return -EINTR; + + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + svsk = list_entry(serv->sv_tempsocks.next, + struct svc_sock, sk_list); + /* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ + if (get_seconds() - svsk->sk_lastrecv < 6*60 + || test_bit(SK_BUSY, &svsk->sk_flags)) + svsk = NULL; + } + if (svsk) { + set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(SK_CLOSE, &svsk->sk_flags); + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + } else if ((svsk = svc_sock_dequeue(serv)) != NULL) { + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + rqstp->rq_reserved = serv->sv_bufsz; + svsk->sk_reserved += rqstp->rq_reserved; + } else { + /* No data pending. Go to sleep */ + svc_serv_enqueue(serv, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... 
+ */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&serv->sv_lock); + + schedule_timeout(timeout); + + spin_lock_bh(&serv->sv_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + + if (!(svsk = rqstp->rq_sock)) { + svc_serv_dequeue(serv, rqstp); + spin_unlock_bh(&serv->sv_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? -EINTR : -EAGAIN; + } + } + spin_unlock_bh(&serv->sv_lock); + + dprintk("svc: server %p, socket %p, inuse=%d\n", + rqstp, svsk, svsk->sk_inuse); + len = svsk->sk_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) { + svc_sock_release(rqstp); + return -EAGAIN; + } + svsk->sk_lastrecv = get_seconds(); + if (test_bit(SK_TEMP, &svsk->sk_flags)) { + /* push active sockets to end of list */ + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&svsk->sk_list)) + list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); + spin_unlock_bh(&serv->sv_lock); + } + + rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; + rqstp->rq_userset = 0; + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +} + +/* + * Drop request + */ +void +svc_drop(struct svc_rqst *rqstp) +{ + dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); + svc_sock_release(rqstp); +} + +/* + * Return reply to client. + */ +int +svc_send(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk; + int len; + struct xdr_buf *xb; + + if ((svsk = rqstp->rq_sock) == NULL) { + printk(KERN_WARNING "NULL socket pointer in %s:%d\n", + __FILE__, __LINE__); + return -EFAULT; + } + + /* release the receive skb before sending the reply */ + svc_release_skb(rqstp); + + /* calculate over-all length */ + xb = & rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + len = svsk->sk_sendto(rqstp); + svc_sock_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Initialize socket for RPC use and create svc_sock struct + * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
+ */ +static struct svc_sock * +svc_setup_socket(struct svc_serv *serv, struct socket *sock, + int *errp, int pmap_register) +{ + struct svc_sock *svsk; + struct sock *inet; + + dprintk("svc: svc_setup_socket %p\n", sock); + if (!(svsk = kmalloc(sizeof(*svsk), GFP_KERNEL))) { + *errp = -ENOMEM; + return NULL; + } + memset(svsk, 0, sizeof(*svsk)); + + inet = sock->sk; + + /* Register socket with portmapper */ + if (*errp >= 0 && pmap_register) + *errp = svc_register(serv, inet->protocol, + ntohs(inet_sk(inet)->sport)); + + if (*errp < 0) { + kfree(svsk); + return NULL; + } + + set_bit(SK_BUSY, &svsk->sk_flags); + inet->user_data = svsk; + svsk->sk_sock = sock; + svsk->sk_sk = inet; + svsk->sk_ostate = inet->state_change; + svsk->sk_odata = inet->data_ready; + svsk->sk_owspace = inet->write_space; + svsk->sk_server = serv; + svsk->sk_lastrecv = get_seconds(); + INIT_LIST_HEAD(&svsk->sk_deferred); + INIT_LIST_HEAD(&svsk->sk_ready); + sema_init(&svsk->sk_sem, 1); + + /* Initialize the socket */ + if (sock->type == SOCK_DGRAM) + svc_udp_init(svsk); + else + svc_tcp_init(svsk); + + spin_lock_bh(&serv->sv_lock); + if (!pmap_register) { + set_bit(SK_TEMP, &svsk->sk_flags); + list_add(&svsk->sk_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + } else { + clear_bit(SK_TEMP, &svsk->sk_flags); + list_add(&svsk->sk_list, &serv->sv_permsocks); + } + spin_unlock_bh(&serv->sv_lock); + + dprintk("svc: svc_setup_socket created %p (inet %p)\n", + svsk, svsk->sk_sk); + + clear_bit(SK_BUSY, &svsk->sk_flags); + svc_sock_enqueue(svsk); + return svsk; +} + +/* + * Create socket for RPC service. + */ +static int +svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) +{ + struct svc_sock *svsk; + struct socket *sock; + int error; + int type; + + dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n", + serv->sv_program->pg_name, protocol, + NIPQUAD(sin->sin_addr.s_addr), + ntohs(sin->sin_port)); + + if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { + printk(KERN_WARNING "svc: only UDP and TCP " + "sockets supported\n"); + return -EINVAL; + } + type = (protocol == IPPROTO_UDP)? 
SOCK_DGRAM : SOCK_STREAM; + + if ((error = sock_create(PF_INET, type, protocol, &sock)) < 0) + return error; + + if (sin != NULL) { + sock->sk->reuse = 1; /* allow address reuse */ + error = sock->ops->bind(sock, (struct sockaddr *) sin, + sizeof(*sin)); + if (error < 0) + goto bummer; + } + + if (protocol == IPPROTO_TCP) { + if ((error = sock->ops->listen(sock, 64)) < 0) + goto bummer; + } + + if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) + return 0; + +bummer: + dprintk("svc: svc_create_socket error = %d\n", -error); + sock_release(sock); + return error; +} + +/* + * Remove a dead socket + */ +void +svc_delete_socket(struct svc_sock *svsk) +{ + struct svc_serv *serv; + struct sock *sk; + + dprintk("svc: svc_delete_socket(%p)\n", svsk); + + serv = svsk->sk_server; + sk = svsk->sk_sk; + + sk->state_change = svsk->sk_ostate; + sk->data_ready = svsk->sk_odata; + sk->write_space = svsk->sk_owspace; + + spin_lock_bh(&serv->sv_lock); + + list_del_init(&svsk->sk_list); + list_del_init(&svsk->sk_ready); + if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) + if (test_bit(SK_TEMP, &svsk->sk_flags)) + serv->sv_tmpcnt--; + + if (!svsk->sk_inuse) { + spin_unlock_bh(&serv->sv_lock); + sock_release(svsk->sk_sock); + kfree(svsk); + } else { + spin_unlock_bh(&serv->sv_lock); + dprintk(KERN_NOTICE "svc: server socket destroy delayed\n"); + /* svsk->sk_server = NULL; */ + } +} + +/* + * Make a socket for nfsd and lockd + */ +int +svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) +{ + struct sockaddr_in sin; + + dprintk("svc: creating socket proto = %d\n", protocol); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = htons(port); + return svc_create_socket(serv, protocol, &sin); +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); + struct svc_serv *serv = dr->serv; + struct svc_sock *svsk; + + if (too_many) { + svc_sock_put(dr->svsk); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + svsk = dr->svsk; + dr->svsk = NULL; + spin_lock(&serv->sv_lock); + list_add(&dr->handle.recent, &svsk->sk_deferred); + spin_unlock(&serv->sv_lock); + set_bit(SK_DEFERRED, &svsk->sk_flags); + svc_sock_enqueue(svsk); + svc_sock_put(svsk); +} + +static struct cache_deferred_req * +svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + /* FIXME maybe discard if size too large */ + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->serv = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + dr->addr = rqstp->rq_addr; + dr->argslen = rqstp->rq_arg.len >> 2; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); + } + spin_lock(&rqstp->rq_server->sv_lock); + rqstp->rq_sock->sk_inuse++; + dr->svsk = rqstp->rq_sock; + spin_unlock(&rqstp->rq_server->sv_lock); + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req 
*dr = rqstp->rq_deferred; + + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + rqstp->rq_addr = dr->addr; + return dr->argslen<<2; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +{ + struct svc_deferred_req *dr = NULL; + struct svc_serv *serv = svsk->sk_server; + + if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) + return NULL; + spin_lock(&serv->sv_lock); + clear_bit(SK_DEFERRED, &svsk->sk_flags); + if (!list_empty(&svsk->sk_deferred)) { + dr = list_entry(svsk->sk_deferred.next, + struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + set_bit(SK_DEFERRED, &svsk->sk_flags); + } + spin_unlock(&serv->sv_lock); + svc_sock_received(svsk); + return dr; +} diff --git a/tests/linux/rpc_tcp_nonagle/patch b/tests/linux/rpc_tcp_nonagle/patch new file mode 100644 index 0000000..bafda29 --- /dev/null +++ b/tests/linux/rpc_tcp_nonagle/patch @@ -0,0 +1,33 @@ +*************** +*** 932,937 **** + svc_tcp_init(struct svc_sock *svsk) + { + struct sock *sk = svsk->sk_sk; + + svsk->sk_recvfrom = svc_tcp_recvfrom; + svsk->sk_sendto = svc_tcp_sendto; +--- 932,938 ---- + svc_tcp_init(struct svc_sock *svsk) + { + struct sock *sk = svsk->sk_sk; ++ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + svsk->sk_recvfrom = svc_tcp_recvfrom; + svsk->sk_sendto = svc_tcp_sendto; +*************** +*** 948,953 **** + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_tcp_recvfrom will re-adjust if necessary +--- 949,956 ---- + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + ++ tp->nonagle = 1; /* disable Nagle's algorithm */ ++ + /* initialise setting must have enough space to + * receive and respond to one request. 
+ * svc_tcp_recvfrom will re-adjust if necessary diff --git a/tests/simple/all-different-2/lmerge b/tests/simple/all-different-2/lmerge new file mode 100644 index 0000000..65606f9 --- /dev/null +++ b/tests/simple/all-different-2/lmerge @@ -0,0 +1,34 @@ +<<<<<<< +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 +||||||| +a +b +c +d +e +f +g +h +i +j +======= +A +B +C +D +E +F +G +H +I +J +>>>>>>> diff --git a/tests/simple/all-different-2/merge b/tests/simple/all-different-2/merge new file mode 100644 index 0000000..65606f9 --- /dev/null +++ b/tests/simple/all-different-2/merge @@ -0,0 +1,34 @@ +<<<<<<< +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 +||||||| +a +b +c +d +e +f +g +h +i +j +======= +A +B +C +D +E +F +G +H +I +J +>>>>>>> diff --git a/tests/simple/all-different-2/new b/tests/simple/all-different-2/new new file mode 100644 index 0000000..92dfa21 --- /dev/null +++ b/tests/simple/all-different-2/new @@ -0,0 +1,10 @@ +a +b +c +d +e +f +g +h +i +j diff --git a/tests/simple/all-different-2/new2 b/tests/simple/all-different-2/new2 new file mode 100644 index 0000000..719a59f --- /dev/null +++ b/tests/simple/all-different-2/new2 @@ -0,0 +1,10 @@ +A +B +C +D +E +F +G +H +I +J diff --git a/tests/simple/all-different-2/orig b/tests/simple/all-different-2/orig new file mode 100644 index 0000000..e53eaa1 --- /dev/null +++ b/tests/simple/all-different-2/orig @@ -0,0 +1,10 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 diff --git a/tests/simple/all-different-2/wmerge b/tests/simple/all-different-2/wmerge new file mode 100644 index 0000000..d84a11f --- /dev/null +++ b/tests/simple/all-different-2/wmerge @@ -0,0 +1,10 @@ +<<<---1|||a===A--->>> +<<<---2|||b===B--->>> +<<<---3|||c===C--->>> +<<<---4|||d===D--->>> +<<<---5|||e===E--->>> +<<<---6|||f===F--->>> +<<<---7|||g===G--->>> +<<<---8|||h===H--->>> +<<<---9|||i===I--->>> +<<<---0|||j===J--->>> diff --git a/tests/simple/all-different/lmerge b/tests/simple/all-different/lmerge new file mode 100644 index 0000000..ab83c87 --- /dev/null +++ b/tests/simple/all-different/lmerge @@ -0,0 +1,35 @@ +<<<<<<< +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 +||||||| +a +b +c +d +e +f +g +h +i +j +======= +A +B +C +D +E +F +G +H +I +J +>>>>>>> +yes diff --git a/tests/simple/all-different/merge b/tests/simple/all-different/merge new file mode 100644 index 0000000..ab83c87 --- /dev/null +++ b/tests/simple/all-different/merge @@ -0,0 +1,35 @@ +<<<<<<< +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 +||||||| +a +b +c +d +e +f +g +h +i +j +======= +A +B +C +D +E +F +G +H +I +J +>>>>>>> +yes diff --git a/tests/simple/all-different/new b/tests/simple/all-different/new new file mode 100644 index 0000000..2e93219 --- /dev/null +++ b/tests/simple/all-different/new @@ -0,0 +1,11 @@ +a +b +c +d +e +f +g +h +i +j +yes diff --git a/tests/simple/all-different/new2 b/tests/simple/all-different/new2 new file mode 100644 index 0000000..6186f49 --- /dev/null +++ b/tests/simple/all-different/new2 @@ -0,0 +1,11 @@ +A +B +C +D +E +F +G +H +I +J +yes diff --git a/tests/simple/all-different/orig b/tests/simple/all-different/orig new file mode 100644 index 0000000..9db162b --- /dev/null +++ b/tests/simple/all-different/orig @@ -0,0 +1,11 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 +yes diff --git a/tests/simple/all-different/wmerge b/tests/simple/all-different/wmerge new file mode 100644 index 0000000..ac32368 --- /dev/null +++ b/tests/simple/all-different/wmerge @@ -0,0 +1,11 @@ +<<<---1|||a===A--->>> +<<<---2|||b===B--->>> +<<<---3|||c===C--->>> +<<<---4|||d===D--->>> +<<<---5|||e===E--->>> +<<<---6|||f===F--->>> +<<<---7|||g===G--->>> +<<<---8|||h===H--->>> 
+<<<---9|||i===I--->>> +<<<---0|||j===J--->>> +yes diff --git a/tests/simple/already-applied/merge b/tests/simple/already-applied/merge new file mode 100644 index 0000000..5532005 --- /dev/null +++ b/tests/simple/already-applied/merge @@ -0,0 +1,3 @@ +This is the +current version of the file +which has already had the word 'current' updated. diff --git a/tests/simple/already-applied/new b/tests/simple/already-applied/new new file mode 100644 index 0000000..cfd09a2 --- /dev/null +++ b/tests/simple/already-applied/new @@ -0,0 +1,2 @@ +This is the +old version of the files diff --git a/tests/simple/already-applied/new2 b/tests/simple/already-applied/new2 new file mode 100644 index 0000000..1680c7e --- /dev/null +++ b/tests/simple/already-applied/new2 @@ -0,0 +1,2 @@ +This is the +current version of the file diff --git a/tests/simple/already-applied/orig b/tests/simple/already-applied/orig new file mode 100644 index 0000000..3702563 --- /dev/null +++ b/tests/simple/already-applied/orig @@ -0,0 +1,3 @@ +This is the +current version of the files +which has already had the word 'current' updated. diff --git a/tests/simple/base/diff b/tests/simple/base/diff new file mode 100644 index 0000000..ed409b8 --- /dev/null +++ b/tests/simple/base/diff @@ -0,0 +1,23 @@ +@@ -1,20 +1,21 @@ +- + This is a base file + some changes are going to happen to it + but it has ++had + several lines + so that alll + the changes + don't h... +|I don't know <<<--waht-->>><<<++what++>>> I am saying. +|This <<<--lion-->>><<<++line++>>> will have some changes made. + but this one wont + stuf stuf stuff + thing thing + xxxxx + that is all + except + for + this + last + bit ++x diff --git a/tests/simple/base/ldiff b/tests/simple/base/ldiff new file mode 100644 index 0000000..2b0ef69 --- /dev/null +++ b/tests/simple/base/ldiff @@ -0,0 +1,25 @@ +@@ -1,20 +1,21 @@ +- + This is a base file + some changes are going to happen to it + but it has ++had + several lines + so that alll + the changes + don't h... +-I don't know waht I am saying. +-This lion will have some changes made. ++I don't know what I am saying. ++This line will have some changes made. + but this one wont + stuf stuf stuff + thing thing + xxxxx + that is all + except + for + this + last + bit ++x diff --git a/tests/simple/base/merge b/tests/simple/base/merge new file mode 100644 index 0000000..fdd9823 --- /dev/null +++ b/tests/simple/base/merge @@ -0,0 +1,20 @@ + +This is a base file +some changes are going to happen to it +but it has +several lines +so that alll +the changes +don't h... +I don't know waht I am saying. +This lion will have some modifications made. +but this one wont +stuf stuf stuff +thing thing +xxxxx +that is all +except +for +this +last +bit diff --git a/tests/simple/base/new b/tests/simple/base/new new file mode 100644 index 0000000..0ea0d92 --- /dev/null +++ b/tests/simple/base/new @@ -0,0 +1,21 @@ +This is a base file +some changes are going to happen to it +but it has +had +several lines +so that alll +the changes +don't h... +I don't know what I am saying. +This line will have some changes made. +but this one wont +stuf stuf stuff +thing thing +xxxxx +that is all +except +for +this +last +bit +x diff --git a/tests/simple/base/new2 b/tests/simple/base/new2 new file mode 100644 index 0000000..cf8f75c --- /dev/null +++ b/tests/simple/base/new2 @@ -0,0 +1,21 @@ +This is a base file +some changes are going to happen to it +but it has +had +several lines +so that alll +the changes +don't h... +I don't know what I am saying. 
+This line will have some modifications made. +but this one wont +stuf stuf stuff +thing thing +xxxxx +that is all +except +for +this +last +bit +x diff --git a/tests/simple/base/orig b/tests/simple/base/orig new file mode 100644 index 0000000..46c9ab9 --- /dev/null +++ b/tests/simple/base/orig @@ -0,0 +1,20 @@ + +This is a base file +some changes are going to happen to it +but it has +several lines +so that alll +the changes +don't h... +I don't know waht I am saying. +This lion will have some changes made. +but this one wont +stuf stuf stuff +thing thing +xxxxx +that is all +except +for +this +last +bit diff --git a/tests/simple/brokenlines/diff b/tests/simple/brokenlines/diff new file mode 100644 index 0000000..e04a44d --- /dev/null +++ b/tests/simple/brokenlines/diff @@ -0,0 +1,7 @@ +@@ -1,5 +1,3 @@ +|This is a long line that <<<--might-->>><<<++has++>>> <<<--be -->>><<<++been +|++>>>broken +|and this is<<<-- +|-->>><<<++ ++>>>a broken line<<<-- +|-->>><<<++ ++>>>that <<<--might-->>><<<++will++>>> be<<<-- +|-->>><<<++ ++>>>joined diff --git a/tests/simple/brokenlines/merge b/tests/simple/brokenlines/merge new file mode 100644 index 0000000..ae3d3e3 --- /dev/null +++ b/tests/simple/brokenlines/merge @@ -0,0 +1,5 @@ +This is a longish line that might be split up +and this is +a broken line +that might be +catenated diff --git a/tests/simple/brokenlines/new b/tests/simple/brokenlines/new new file mode 100644 index 0000000..9ce96e0 --- /dev/null +++ b/tests/simple/brokenlines/new @@ -0,0 +1,3 @@ +This is a long line that has been +broken +and this is a broken line that will be joined diff --git a/tests/simple/brokenlines/new2 b/tests/simple/brokenlines/new2 new file mode 100644 index 0000000..1548622 --- /dev/null +++ b/tests/simple/brokenlines/new2 @@ -0,0 +1,3 @@ +This is a longish line that has been +split up +and this is a broken line that will be catenated diff --git a/tests/simple/brokenlines/orig b/tests/simple/brokenlines/orig new file mode 100644 index 0000000..9a2e13a --- /dev/null +++ b/tests/simple/brokenlines/orig @@ -0,0 +1,5 @@ +This is a long line that might be broken +and this is +a broken line +that might be +joined diff --git a/tests/simple/changeafteradd/merge b/tests/simple/changeafteradd/merge new file mode 100644 index 0000000..88b2138 --- /dev/null +++ b/tests/simple/changeafteradd/merge @@ -0,0 +1,5 @@ +here +is +the +inaugural +file diff --git a/tests/simple/changeafteradd/new b/tests/simple/changeafteradd/new new file mode 100644 index 0000000..a5eefce --- /dev/null +++ b/tests/simple/changeafteradd/new @@ -0,0 +1,6 @@ +here +is +the +new version of the +original +file diff --git a/tests/simple/changeafteradd/new2 b/tests/simple/changeafteradd/new2 new file mode 100644 index 0000000..39e2ee8 --- /dev/null +++ b/tests/simple/changeafteradd/new2 @@ -0,0 +1,6 @@ +here +is +the +new version of the +inaugural +file diff --git a/tests/simple/changeafteradd/orig b/tests/simple/changeafteradd/orig new file mode 100644 index 0000000..c37acc3 --- /dev/null +++ b/tests/simple/changeafteradd/orig @@ -0,0 +1,5 @@ +here +is +the +original +file diff --git a/tests/simple/conflict/diff b/tests/simple/conflict/diff new file mode 100644 index 0000000..8ecf042 --- /dev/null +++ b/tests/simple/conflict/diff @@ -0,0 +1,5 @@ +@@ -1,4 +1,4 @@ + this is a file + with the word +|<<<--two-->>><<<++to++>>> which is + misspelt diff --git a/tests/simple/conflict/ldiff b/tests/simple/conflict/ldiff new file mode 100644 index 0000000..4772aae --- /dev/null +++ b/tests/simple/conflict/ldiff @@ 
-0,0 +1,6 @@ +@@ -1,4 +1,4 @@ + this is a file + with the word +-two which is ++to which is + misspelt diff --git a/tests/simple/conflict/merge b/tests/simple/conflict/merge new file mode 100644 index 0000000..4afaeff --- /dev/null +++ b/tests/simple/conflict/merge @@ -0,0 +1,10 @@ +this is a file +with the word +<<<<<<< +two which is +||||||| +to which is +======= +too which is +>>>>>>> +misspelt diff --git a/tests/simple/conflict/new b/tests/simple/conflict/new new file mode 100644 index 0000000..5c346ba --- /dev/null +++ b/tests/simple/conflict/new @@ -0,0 +1,4 @@ +this is a file +with the word +to which is +misspelt diff --git a/tests/simple/conflict/new2 b/tests/simple/conflict/new2 new file mode 100644 index 0000000..cb8ea09 --- /dev/null +++ b/tests/simple/conflict/new2 @@ -0,0 +1,4 @@ +this is a file +with the word +too which is +misspelt diff --git a/tests/simple/conflict/orig b/tests/simple/conflict/orig new file mode 100644 index 0000000..bc856ca --- /dev/null +++ b/tests/simple/conflict/orig @@ -0,0 +1,4 @@ +this is a file +with the word +two which is +misspelt diff --git a/tests/simple/conflict/wmerge b/tests/simple/conflict/wmerge new file mode 100644 index 0000000..6af56bc --- /dev/null +++ b/tests/simple/conflict/wmerge @@ -0,0 +1,4 @@ +this is a file +with the word +<<<---two|||to===too--->>> which is +misspelt diff --git a/tests/simple/conflictmixed/diff b/tests/simple/conflictmixed/diff new file mode 100644 index 0000000..8ecf042 --- /dev/null +++ b/tests/simple/conflictmixed/diff @@ -0,0 +1,5 @@ +@@ -1,4 +1,4 @@ + this is a file + with the word +|<<<--two-->>><<<++to++>>> which is + misspelt diff --git a/tests/simple/conflictmixed/ldiff b/tests/simple/conflictmixed/ldiff new file mode 100644 index 0000000..4772aae --- /dev/null +++ b/tests/simple/conflictmixed/ldiff @@ -0,0 +1,6 @@ +@@ -1,4 +1,4 @@ + this is a file + with the word +-two which is ++to which is + misspelt diff --git a/tests/simple/conflictmixed/lmerge b/tests/simple/conflictmixed/lmerge new file mode 100644 index 0000000..bb4d03c --- /dev/null +++ b/tests/simple/conflictmixed/lmerge @@ -0,0 +1,10 @@ +this is a file +with the word +<<<<<<< +two which is +||||||| +to which is +======= +too which was +>>>>>>> +misspelt diff --git a/tests/simple/conflictmixed/merge b/tests/simple/conflictmixed/merge new file mode 100644 index 0000000..bb4d03c --- /dev/null +++ b/tests/simple/conflictmixed/merge @@ -0,0 +1,10 @@ +this is a file +with the word +<<<<<<< +two which is +||||||| +to which is +======= +too which was +>>>>>>> +misspelt diff --git a/tests/simple/conflictmixed/new b/tests/simple/conflictmixed/new new file mode 100644 index 0000000..5c346ba --- /dev/null +++ b/tests/simple/conflictmixed/new @@ -0,0 +1,4 @@ +this is a file +with the word +to which is +misspelt diff --git a/tests/simple/conflictmixed/new2 b/tests/simple/conflictmixed/new2 new file mode 100644 index 0000000..24e7c78 --- /dev/null +++ b/tests/simple/conflictmixed/new2 @@ -0,0 +1,4 @@ +this is a file +with the word +too which was +misspelt diff --git a/tests/simple/conflictmixed/orig b/tests/simple/conflictmixed/orig new file mode 100644 index 0000000..bc856ca --- /dev/null +++ b/tests/simple/conflictmixed/orig @@ -0,0 +1,4 @@ +this is a file +with the word +two which is +misspelt diff --git a/tests/simple/conflictmixed/wmerge b/tests/simple/conflictmixed/wmerge new file mode 100644 index 0000000..d10fc02 --- /dev/null +++ b/tests/simple/conflictmixed/wmerge @@ -0,0 +1,4 @@ +this is a file +with the word +<<<---two|||to===too--->>> which was 
+misspelt diff --git a/tests/simple/multideletes/lmerge b/tests/simple/multideletes/lmerge new file mode 100644 index 0000000..d1849fe --- /dev/null +++ b/tests/simple/multideletes/lmerge @@ -0,0 +1,2 @@ +First line +last line diff --git a/tests/simple/multideletes/merge b/tests/simple/multideletes/merge new file mode 100644 index 0000000..d1849fe --- /dev/null +++ b/tests/simple/multideletes/merge @@ -0,0 +1,2 @@ +First line +last line diff --git a/tests/simple/multideletes/new b/tests/simple/multideletes/new new file mode 100644 index 0000000..66ddf08 --- /dev/null +++ b/tests/simple/multideletes/new @@ -0,0 +1,8 @@ +Some padding +this line will go +Some more padding +this one too +This stuff is padding too +and this +Guess what you find here? +last line diff --git a/tests/simple/multideletes/new2 b/tests/simple/multideletes/new2 new file mode 100644 index 0000000..ead2f24 --- /dev/null +++ b/tests/simple/multideletes/new2 @@ -0,0 +1,5 @@ +Some padding +Some more padding +This stuff is padding too +Guess what you find here? +last line diff --git a/tests/simple/multideletes/orig b/tests/simple/multideletes/orig new file mode 100644 index 0000000..084d8d8 --- /dev/null +++ b/tests/simple/multideletes/orig @@ -0,0 +1,5 @@ +First line +this line will go +this one too +and this +last line diff --git a/tests/simple/multiple-add/lmerge b/tests/simple/multiple-add/lmerge new file mode 100644 index 0000000..f2a4151 --- /dev/null +++ b/tests/simple/multiple-add/lmerge @@ -0,0 +1,15 @@ +This +is +the +current +version +of +the +<<<<<<< +file. +||||||| +file +======= +file that has changed +>>>>>>> + diff --git a/tests/simple/multiple-add/merge b/tests/simple/multiple-add/merge new file mode 100644 index 0000000..f2a4151 --- /dev/null +++ b/tests/simple/multiple-add/merge @@ -0,0 +1,15 @@ +This +is +the +current +version +of +the +<<<<<<< +file. +||||||| +file +======= +file that has changed +>>>>>>> + diff --git a/tests/simple/multiple-add/new b/tests/simple/multiple-add/new new file mode 100644 index 0000000..f34b7b2 --- /dev/null +++ b/tests/simple/multiple-add/new @@ -0,0 +1,9 @@ +This +is +the +old +version +of +the +file + diff --git a/tests/simple/multiple-add/new2 b/tests/simple/multiple-add/new2 new file mode 100644 index 0000000..234da11 --- /dev/null +++ b/tests/simple/multiple-add/new2 @@ -0,0 +1,9 @@ +This +is +the +old +version +of +the +file that has changed + diff --git a/tests/simple/multiple-add/orig b/tests/simple/multiple-add/orig new file mode 100644 index 0000000..c6ed59c --- /dev/null +++ b/tests/simple/multiple-add/orig @@ -0,0 +1,9 @@ +This +is +the +current +version +of +the +file. + diff --git a/tests/simple/multiple-add/wmerge b/tests/simple/multiple-add/wmerge new file mode 100644 index 0000000..27f6ce8 --- /dev/null +++ b/tests/simple/multiple-add/wmerge @@ -0,0 +1,9 @@ +This +is +the +current +version +of +the +file<<<---.|||=== that has changed--->>> + diff --git a/version b/version new file mode 100644 index 0000000..5a2a580 --- /dev/null +++ b/version @@ -0,0 +1 @@ +0.6 diff --git a/wiggle.1 b/wiggle.1 new file mode 100644 index 0000000..7a973b3 --- /dev/null +++ b/wiggle.1 @@ -0,0 +1,439 @@ +.\" -*- nroff -*- +.\" wiggle - apply rejected patches +.\" +.\" Copyright (C) 2003 Neil Brown +.\" +.\" +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. 
+.\" +.\" This program is distributed in the hope that it will be useful, +.\" but WITHOUT ANY WARRANTY; without even the implied warranty of +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +.\" GNU General Public License for more details. +.\" +.\" You should have received a copy of the GNU General Public License +.\" along with this program; if not, write to the Free Software +.\" Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +.\" +.\" Author: Neil Brown +.\" Email: +.\" Paper: Neil Brown +.\" School of Computer Science and Engineering +.\" The University of New South Wales +.\" Sydney, 2052 +.\" Australia +.\" +.TH WIGGLE 1 "" v0.6 +.SH NAME +wiggle \- apply rejected patches and perform word-wise diffs + +.SH SYNOPSIS + +.BI wiggle " [function] [options] file [files]" + +.SH DESCRIPTION +The main function of +.I wiggle +is to apply a patch to a file in a similar manner to the +.BR patch (1) +program. + +The distinctive difference of +.I wiggle +is that it will attempt to apply a patch even if the "before" part of +the patch doesn't match the target file perfectly. +This is achieved by breaking the file and patch into words and finding +the best alignment of words in the file with words in the patch. +Once this alignment has been found, any differences (word-wise) in the +patch are applied to the file as well as possible. + +Also, +.I wiggle +will (in some cases) detect changes that have already been applied, +and will ignore them. + +.I wiggle +ensures that every change in the patch is applied to the target +file somehow. If a particular change cannot be made in the file, the +file is annotated to show where the change should be made in a similar +way to the +.BR merge (1) +program. +Each annotation contains 3 components: a portion of the original file +where the change should be applied, a portion of the patch that +couldn't be matched precisely in the file, and the text that should +replace that portion of the patch. These are separated by lines +containing precisely 7 identical characters, either '<', '|', '=', or '>', so +.in +5 +.nf +.ft CW +<<<<<<< +Some portion of the original file +||||||| +text to replace +======= +text to replace it with +>>>>>>> +.ft +.fi +.in -5 + +indicates that "text to replace" should be replaced by "text to +replace it with" somewhere in the portion of the original file. +However +.I wiggle +was not able to find a place to make this change. + +.I wiggle +can also produce conflict reports showing only the words that are +involved rather than showing whole lines. +In this case the output looks like: +.ft CW +.ti +5 +<<<---original|||old===new--->>> +.ft + +A typical usage of +.I wiggle +is to run +.I patch +to apply some patch, and to collect a list of rejects by monitoring +the error messages from patch. Then for each file for which a +reject was found, run +.ti +5 +wiggle --replace originalfile originalfile.rej + +Finally, each file must be examined to resolve any unresolved +conflicts, and to make sure the applied patch is semantically correct. + +.SS OPTIONS +The following options are understood by +.IR wiggle . +Some of these are explained in more detail in the following sections +on MERGE, DIFF, and EXTRACT. + +.TP +.BR -m ", " --merge +Select the "merge" function. This is the default function. + +.TP +.BR -d ", " --diff +Select the "diff" function. This displays the differences between files. + +.TP +.BR -x ", " --extract +Select the "extract" function. This extracts one branch of a patch or +merge file.
+ +.TP +.BR -w ", " --words +Request that all operations and display be word based. This is the +default for the "diff" function. + +.TP +.BR -l ", " --lines +Request that all operations and display be line based. + +.TP +.BR -p ", " --patch +Treat the last named file as a patch instead of a file (with --diff) +or a merge (--extract). + +.TP +.BR -r ", " --replace +Normally the merged output is written to standard-output. With +--replace, the original file is replaced with the merge output. + +.TP +.BR -R ", " --reverse +When used with the "diff" function, swap the files before calculating +the differences. +When used with the "merge" function, +.I wiggle +attempts to revert changes rather than apply them. + +.TP +.BR -h ", " --help +Print a simple help message. If given after one of the function +selectors (--merge, --diff, --extract), help specific to that function +is displayed. + +.TP +.BR -V ", " --version +Display the version number of +.IR wiggle . + +.TP +.BR -v ", " --verbose +Enable verbose mode. Currently this makes no difference. + +.TP +.BR -q ", " --quiet +Enable quiet mode. This suppresses the message from the merge +function when there are unresolvable conflicts. + +.SS WORDS +.I wiggle +can divide a text into lines or words when performing its tasks. +A line is simply a string of characters terminated by a newline. +A word is either a maximal contiguous string of alphanumerics +(including underscore), a maximal contiguous string of space or tab +characters, or any other single character. + +.SS MERGE +The merge function modifies a given text by finding all changes between +two other texts and imposing those changes on the given text. + +Normally +.I wiggle +considers words which have changed so as to maximise the possibility +of finding a good match in the given text for the context of a given +change. However, it can instead consider only whole lines. + +.I wiggle +extracts the three texts that it needs from files listed on the +command line. Either 1, 2, or 3 files may be listed, and any one of +them may be a lone hyphen signifying standard-input. + +If one file is given, it is treated as a +.B merge +file, i.e. the output of "merge -A" or "wiggle". Such a file +implicitly contains three streams and these are extracted and +compared. + +If two files are given, then the first simply contains the primary +text, and the second is treated as a patch file (the output of "diff\ -u" +or "diff\ -c", or a ".rej" file from +.IR patch ) +and the two other texts +are extracted from that. + +Finally, if three files are listed, they are taken to contain the given +text and the two other texts, in order. + +Normally the result of the merge is written to standard-output. +However, if the "-r" flag is given, the output is written to a file +which replaces the original given file. In this case the original file +is renamed to have a +.B .porig +suffix (for "patched original", which makes sense if you first use +.I patch +to apply a patch, and then use +.I wiggle +to wiggle the rejects in). + +If no errors occur (such as file access errors) +.I wiggle +will exit with a status of 0 if all changes were successfully merged, +and with an exit status of 1 and a brief message if any changes could +not be fully merged and were instead inserted as annotations. + +The merge function can operate in three different modes with respect +to lines or words. + +With the +.B --lines +option, whole lines are compared and any conflicts +are reported as whole lines that need to be replaced.
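As an aside, the word-splitting rule described under WORDS is compact enough to sketch in C. The fragment below is illustrative only and is not wiggle's actual split_stream() implementation; the helper name word_len is hypothetical.

#include <ctype.h>

/* Length of the word starting at s, following the WORDS rule above:
 * a maximal run of alphanumerics/underscore, a maximal run of spaces
 * and tabs, or any other single character standing alone. */
static int word_len(const char *s)
{
	int n = 0;
	if (isalnum((unsigned char)s[0]) || s[0] == '_') {
		while (isalnum((unsigned char)s[n]) || s[n] == '_')
			n++;
		return n;
	}
	if (s[0] == ' ' || s[0] == '\t') {
		while (s[n] == ' ' || s[n] == '\t')
			n++;
		return n;
	}
	return 1;	/* any other character is a word by itself */
}

Under this rule, "foo_bar(x)" splits into the words "foo_bar", "(", "x" and ")", which is what lets wiggle merge two changes that touch different parts of the same line.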
+ +With the +.B --words +option, individual words are compared and any +conflicts are reported just covering the words affected. This uses +the \f(CW <<<|||===>>> \fP conflict format. + +Without either of these options, a hybrid approach is taken. +Individual words are compared and merged, but when a conflict is found +the whole surrounding line is reported as being in conflict. + +.I wiggle +will ensure that every change between the two other texts is reflected +in the result of the merge somehow. There are four different ways +that a change can be reflected. +.IP 1 +If a change converts +.B A +to +.B B +and +.B A +is found at a suitable place in the original file, it is +replaced with +.BR B . +This includes the possibility that +.B B +is empty, but +not that +.B A +is empty. + +.IP 2 +If a change is found which simply adds +.B B +and the text immediately preceding and following the insertion are +found adjacent in the original file in a suitable place, then +.B B +is inserted between those adjacent texts. + +.IP 3 +If a change is found which changes +.B A +to +.B B +and this appears (based on context) to align with +.B B +in the original, then it is assumed that this change has already been +applied, and the change is ignored. When this happens, a message +reporting the number of ignored changes is printed by +.IR wiggle . + +.IP 4 +If a change is found that does not fit any of the above possibilities, +then a conflict is reported as described earlier. + +.SS DIFF + +The diff function is provided primarily to allow inspection of the +alignments that +.I wiggle +calculates between texts and that it uses for performing a merge. + +The output of the diff function is similar to the unified output of +diff. However, while diff does not output long stretches of common text, +.IR wiggle 's +diff mode outputs everything. + +When calculating a word-based alignment (the default), +.I wiggle +may need to show word-based differences. This is done using an +extension to the unified-diff format. If a line starts with a +vertical bar, then it may contain sections surrounded by special +multi-character brackets. The brackets "<<<++" and "++>>>" surround +added text while "<<<--" and "-->>>" surround removed text. + +.I wiggle +can be given the two texts to compare in one of three ways. + +If only one file is given, then it is treated as a patch and the two +branches of that diff are compared. This effectively allows a patch +to be refined from a line-based patch to a word-based patch. + +If two files are given, then they are normally assumed to be simple +texts to be compared. + +If two files are given along with the --patch option, then the second +file is assumed to be a patch and either the first (with -1) or the +second (with -2) branch is extracted and compared with text found in +the first file. + +This last option causes +.I wiggle +to apply a "best-fit" algorithm for aligning patch hunks with the +file before computing the differences. This algorithm is used when +merging a patch with a file, and its value can be seen by comparing +the difference produced this way with the difference produced by first +extracting one branch of a patch into a file, and then computing the +difference of that file with the main file. + + +.SS EXTRACT + +The extract function of +.I wiggle +simply exposes the internal functionality for extracting one branch of +a patch or a merge file.
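In terms of the code added by this patch, extraction is a thin wrapper around the split_patch() and split_merge() routines declared in wiggle.h. A minimal sketch, assuming those declarations (the wrapper name extract_branch is hypothetical, and error reporting is reduced to a return code):

#include <stdio.h>
#include "wiggle.h"	/* declares struct stream, load_file, split_patch, split_merge */

/* Write branch 'which' (1, 2 or 3) of a patch or merge file to stdout.
 * This mirrors the --extract path of main() in wiggle.c. */
static int extract_branch(char *name, int ispatch, int which)
{
	struct stream f, parts[3];

	parts[0].body = parts[1].body = parts[2].body = NULL;
	f = load_file(name);
	if (f.body == NULL)
		return 2;			/* cannot load the file */
	if (ispatch)
		split_patch(f, &parts[0], &parts[1]);	/* a patch has only two branches */
	else if (!split_merge(f, &parts[0], &parts[1], &parts[2]))
		return 2;			/* malformed merge file */
	if (parts[which-1].body == NULL)
		return 2;			/* requested branch not present */
	fwrite(parts[which-1].body, 1, parts[which-1].len, stdout);
	return 0;
}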
+ +Precisely one file should be given, and it will be assumed to be a +merge file unless +.B --patch +is given, in which case a patch is assumed. + +The choice of branch is made by providing one of +.BR -1 , +.BR -2 , +or +.B -3 +with obvious meanings. + +.SH WARNING + +Caution should always be exercised when applying a rejected patch with +.IR wiggle . +When +.I patch +rejects a patch, it does so for a good reason. Even though +.I wiggle +may be able to find a believable place to apply each textual change, +there is no guarantee that the result is correct in any semantic +sense. The result should always be inspected to make sure it is +correct. + +.SH EXAMPLES + +.B " wiggle --replace file file.rej" +.br +This is the normal usage of +.I wiggle +and will take any changes in +.B file.rej +that +.I patch +could not apply, and merge them into +.BR file . + +.B " wiggle -dp1 file file.rej" +.br +This will perform a word-wise comparison between the +.B file +and the +.I before +branch of the diff in +.B file.rej +and display the differences. This allows you to see where a given +patch would apply. + +.B " wiggle --merge --help" +.br +Get help about the merge function of +.IR wiggle . + +.SH QUOTE +The name of wiggle was inspired by the following quote, even though +wiggle does not (yet) have a graphical interface. + +.nf +The problem I find is that I often want to take + (file1+patch) -> file2, +when I don't have file1. But merge tools want to take + (file1|file2) -> file3. +I haven't seen a graphical tool which helps you to wiggle a patch +into a file. + +-- Andrew Morton - 2002 +.fi + +.SH SHORTCOMINGS +.IP - +.I wiggle +cannot read the extended unified-diff output that it produces for +--diff --words. + +.IP - +.I wiggle +cannot read the word-based merge format that it produces for --merge +--words. + +.SH AUTHOR + +Neil Brown of Computer Science and Engineering at +The University of New South Wales, Sydney, Australia + +.SH SEE ALSO +.IR patch (1), +.IR diff (1), +.IR merge (1), +.IR wdiff (1), +.IR diff3 (1). diff --git a/wiggle.c b/wiggle.c new file mode 100644 index 0000000..2bbb90f --- /dev/null +++ b/wiggle.c @@ -0,0 +1,643 @@ +/* + * wiggle - apply rejected patches + * + * Copyright (C) 2003 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * School of Computer Science and Engineering + * The University of New South Wales + * Sydney, 2052 + * Australia + */ + +/* + * Wiggle is a tool for working with patches that don't quite apply properly. + * It provides functionality similar to 'diff' and 'merge' but can + * work at the level of individual words, thus allowing the merging of + * two changes that affect the same line, but not the same parts of that line. + * + * Wiggle can also read patch and merge files.
Unlike 'merge' it does not + * need to be given three separate files, but can be given a file and a patch + * and it will extract the pieces of the two other files that it needs from + * the patch. + * + * Wiggle performs one of three core functions: + * --extract -x extract part of a patch or merge file + * --diff -d report differences between two files + * --merge -m merge the changes between two files into a third file + * + * To perform these, wiggle requires 1, 2, or 3 input streams respectively. + * It can get these from individual files, from a diff (unified or context) or + * from a merge file. + * + * For merge: + * If one file is given, it is a merge file (output of 'merge'). + * If two files are given, the second is assumed to be a patch, the first is a normal file. + * If three files are given, they are taken to be normal files. + * + * For diff: + * If one file is given, it is a patch. + * If two files are given, they are normal files. + * + * For extract: + * Only one file can be given. -p indicates it is a patch, otherwise it is a merge. + * One of the flags -1, -2 or -3 must also be given to indicate which + * part of the patch or merge to extract. + * + * Difference calculation and merging are performed on lines (-l) or words (-w). + * In the case of -w, an initial diff is computed based on non-trivial words, + * i.e. spaces are ignored. + * This diff is computed from the ends of the file and is used to find a suitable + * starting point and range. Then a more precise diff is computed over that + * restricted range. + * + * Other options available are: + * --replace -r replace first file with result of merge. + * --help -h provide help + * --version -V version + * + * Defaults are --merge --words + * + */ + +#include "wiggle.h" +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +void die() +{ + fprintf(stderr,"wiggle: fatal error\n"); + abort(); + exit(3); +} + +void printword(FILE *f, struct elmnt e) +{ + if (e.start[0]) + fprintf(f, "%.*s", e.len, e.start); + else { + int a,b,c; + sscanf(e.start+1, "%d %d %d", &a, &b, &c); + fprintf(f, "*** %d,%d **** %d\n", b,c,a); + } +} + +static void printsep(struct elmnt e1, struct elmnt e2) +{ + int a,b,c,d,e,f; + sscanf(e1.start+1, "%d %d %d", &a, &b, &c); + sscanf(e2.start+1, "%d %d %d", &d, &e, &f); + printf("@@ -%d,%d +%d,%d @@\n", b,c,e,f); +} + + +/* Remove any entries from the common-sublist that are + * just spaces, tabs, or newlines + */ +void cleanlist(struct file a, struct file b, struct csl *list) +{ + struct csl *new = list; + + while (list->len) { + int i; + int ap; + for( ap = list->a; ap< list->a+list->len; ap++) { + for (i=0; i<a.list[ap].len; i++) + if (!isspace(a.list[ap].start[i])) + break; + if (i != a.list[ap].len) + break; + } + if (ap == list->a+list->len) + list++; + else + *new++ = *list++; + } + *new = *list; +} + +int main(int argc, char *argv[]) +{ + int opt; + int option_index; + int mode = 0; + int obj = 0; + int replace = 0; + char *replacename=NULL, *orignew=NULL; + int which = 0; + int ispatch = 0; + int reverse = 0; + int verbose=0, quiet=0; + int i; + int chunks1=0, chunks2=0, chunks3=0; + int exit_status = 0; + FILE *outfile = stdout; + char *helpmsg; + + struct stream f, flist[3]; + struct file fl[3]; + struct csl *csl1, *csl2; + + while ((opt = getopt_long(argc, argv, + short_options, long_options, + &option_index)) != -1) + switch(opt) { + case 'h': + helpmsg = Help; + switch(mode) { + case 'x': helpmsg = HelpExtract; break; + case 'd': helpmsg = HelpDiff; break; + case 'm': helpmsg = HelpMerge; break; + } + fputs(helpmsg, stderr); + exit(0); + + case 'V': + fputs(Version, stderr); + exit(0); + case ':':
+ case '?': + default: + fputs(Usage, stderr); + exit(2); + + case 'x': + case 'd': + case 'm': + if (mode ==0){ + mode = opt; + continue; + } + fprintf(stderr, "wiggle: mode is '%c' - cannot set to '%c'\n", + mode, opt); + exit(2); + + case 'w': + case 'l': + if (obj == 0 || obj == opt) { + obj = opt; + continue; + } + fprintf(stderr, "wiggle: cannot select both words and lines.\n"); + exit(2); + + case 'r': + replace = 1; + continue; + case 'R': + reverse = 1; + continue; + + case '1': + case '2': + case '3': + if (which == 0 || which == opt) { + which = opt; + continue; + } + fprintf(stderr, "wiggle: can only select one of -1, -2, -3\n"); + exit(2); + + case 'p': + ispatch = 1; + continue; + + case 'v': verbose++; continue; + case 'q': quiet=1 ; continue; + } + if (!mode) + mode = 'm'; + + if (obj && mode == 'x') { + fprintf(stderr,"wiggle: cannot specify --line or --word with --extract\n"); + exit(2); + } + if (mode != 'm' && !obj) obj = 'w'; + if (replace && mode != 'm') { + fprintf(stderr, "wiggle: --replace only allowed with --merge\n"); + exit(2); + } + if (mode == 'x' && !which) { + fprintf(stderr, "wiggle: must specify -1, -2 or -3 with --extract\n"); + exit(2); + } + if (mode != 'x' && mode != 'd' && which) { + fprintf(stderr, "wiggle: -1, -2 or -3 only allowed with --extract or --diff\n"); + exit(2); + } + if (ispatch && (mode != 'x' && mode != 'd')) { + fprintf(stderr, "wiggle: --patch only allowed with --extract or --diff\n"); + exit(2); + } + if (ispatch && which == '3') { + fprintf(stderr, "wiggle: cannot extract -3 from a patch.\n"); + exit(2); + } + + switch(mode) { + case 'x': + /* extract a branch of a diff or diff3 or merge output + * We need one file + */ + if (optind == argc) { + fprintf(stderr, "wiggle: no file given for --extract\n"); + exit(2); + } + if (optind < argc-1) { + fprintf(stderr, "wiggle: only give one file for --extract\n"); + exit(2); + } + f = load_file(argv[optind]); + if (f.body==NULL) { + fprintf(stderr, "wiggle: cannot load file '%s' - %s\n", + argv[optind], strerror(errno)); + exit(2); + } + if (ispatch) + chunks1 = chunks2 = split_patch(f, &flist[0], &flist[1]); + else { + if (!split_merge(f, &flist[0], &flist[1], &flist[2])) { + fprintf(stderr, "wiggle: merge file %s looks bad.\n", + argv[optind]); + exit(2); + } + } + if (flist[which-'1'].body == NULL) { + fprintf(stderr, "wiggle: %s has no -%c component.\n", + argv[optind], which); + exit(2); + } else { + write(1, flist[which-'1'].body, flist[which-'1'].len); + } + + break; + case 'd': + /* create a diff (line or char) of two streams */ + switch (argc-optind) { + case 0: + fprintf(stderr, "wiggle: no file given for --diff\n"); + exit(2); + case 1: + f = load_file(argv[optind]); + if (f.body == NULL) { + fprintf(stderr, "wiggle: cannot load file '%s' - %s\n", + argv[optind], strerror(errno)); + exit(2); + } + chunks1 = chunks2 = split_patch(f, &flist[0], &flist[1]); + if (!flist[0].body || !flist[1].body) { + fprintf(stderr, "wiggle: couldn't parse patch %s\n", + argv[optind]); + exit(2); + } + break; + case 2: + flist[0] = load_file(argv[optind]); + if (flist[0].body == NULL) { + fprintf(stderr, "wiggle: cannot load file '%s' - %s\n", + argv[optind], strerror(errno)); + exit(2); + } + if (ispatch) { + f = load_file(argv[optind+1]); + if (f.body == NULL) { + fprintf(stderr, "wiggle: cannot load patch '%s' - %s\n", + argv[optind], strerror(errno)); + exit(2); + } + if (which == '2') + chunks2 = chunks3 = split_patch(f, &flist[2], &flist[1]); + else + chunks2 = chunks3 = split_patch(f, 
&flist[1], &flist[2]); + + } else + flist[1] = load_file(argv[optind+1]); + if (flist[1].body == NULL) { + fprintf(stderr, "wiggle: cannot load file '%s' - %s\n", + argv[optind+1], strerror(errno)); + exit(2); + } + break; + default: + fprintf(stderr, "wiggle: too many files given for --diff\n"); + exit(2); + } + if (reverse) { + f=flist[1]; + flist[1] = flist[2]; + flist[2]= f; + } + if (obj == 'l') { + int a,b; + fl[0] = split_stream(flist[0], ByLine, 0); + fl[1] = split_stream(flist[1], ByLine, 0); + if (chunks2 && ! chunks1) + csl1 = pdiff(fl[0], fl[1], chunks2); + else + csl1 = diff(fl[0], fl[1]); + + if (!chunks1) + printf("@@ -1,%d +1,%d @@\n", fl[0].elcnt, fl[1].elcnt); + a = b = 0; + while (a<fl[0].elcnt || b<fl[1].elcnt) { + if (a < csl1->a) { + if (fl[0].list[a].start[0]) { + printf("-"); + printword(stdout, fl[0].list[a]); + } + a++; + exit_status++; + } else if (b < csl1->b) { + if (fl[1].list[b].start[0]) { + printf("+"); + printword(stdout, fl[1].list[b]); + } + b++; + exit_status++; + } else { + if (fl[0].list[a].start[0] == '\0') + printsep(fl[0].list[a], fl[1].list[b]); + else { + printf(" "); + printword(stdout, fl[0].list[a]); + } + a++; + b++; + if (a >= csl1->a+csl1->len) + csl1++; + } + } + } else { + int a,b; + int sol = 1; /* start of line */ + fl[0] = split_stream(flist[0], ByWord, 0); + fl[1] = split_stream(flist[1], ByWord, 0); + if (chunks2 && !chunks1) + csl1 = pdiff(fl[0], fl[1], chunks2); + else + csl1 = diff(fl[0], fl[1]); + + if (!chunks1) { + /* count lines in each file */ + int l1, l2, i; + l1=l2=0; + for (i=0;i<fl[0].elcnt;i++) + if (ends_line(fl[0].list[i])) + l1++; + for (i=0;i<fl[1].elcnt;i++) + if (ends_line(fl[1].list[i])) + l2++; + printf("@@ -1,%d +1,%d @@\n", l1, l2); + } + a = b = 0; + while (a<fl[0].elcnt || b<fl[1].elcnt) { + if (a < csl1->a) { + exit_status++; + if (sol) { + int a1; + /* If we remove a whole line, output -line, + * else clear sol and retry */ + sol = 0; + for (a1=a; a1<csl1->a;a1++) + if (ends_line(fl[0].list[a1])) { + sol=1; + break; + } + if (sol) { + printf("-"); + for (; a<csl1->a; a++) { + printword(stdout, fl[0].list[a]); + if (ends_line(fl[0].list[a])) { + a++; + break; + } + } + } else printf("|"); + } + if (!sol) { + printf("<<<--"); + do { + if (sol) printf("|"); + printword(stdout, fl[0].list[a]); + sol = ends_line(fl[0].list[a]); + a++; + } while (a < csl1->a); + printf("%s-->>>", sol?"|":""); + sol=0; + } + } else if (b < csl1->b) { + exit_status++; + if (sol) { + int b1; + sol = 0; + for (b1=b; b1<csl1->b;b1++) + if(ends_line(fl[1].list[b1])) { + sol=1; + break; + } + if (sol) { + printf("+"); + for(; b<csl1->b ; b++) { + printword(stdout, fl[1].list[b]); + if(ends_line(fl[1].list[b])) { + b++; + break; + } + } + } else printf("|"); + } + if (!sol) { + printf("<<<++"); + do { + if (sol) printf("|"); + printword(stdout, fl[1].list[b]); + sol = ends_line(fl[1].list[b]); + b++; + } while (b < csl1->b); + printf("%s++>>>",sol?"|":""); + sol=0; + } + } else { + if (sol) { + int a1; + sol = 0; + for (a1=a; a1<csl1->a+csl1->len; a1++) + if (ends_line(fl[0].list[a1])) + sol=1; + if (sol) { + if (fl[0].list[a].start[0]) { + printf(" "); + for(; a<csl1->a+csl1->len; a++,b++) { + printword(stdout, fl[0].list[a]); + if (ends_line(fl[0].list[a])) { + a++,b++; + break; + } + } + } else { + printsep(fl[0].list[a], fl[1].list[b]); + a++; b++; + } + } + else printf("|"); + } + if (!sol) { + printword(stdout, fl[0].list[a]); + if (ends_line(fl[0].list[a])) + sol=1; + a++; + b++; + } + if (a >= csl1->a+csl1->len) + csl1++; + } + } + + } + break; + case 'm': + /* merge three files, A B C, so changes between B and C get made to A + */ + switch (argc-optind) { + case 0: + fprintf(stderr, "wiggle: no files given for --merge\n"); + exit(2); + case 3: + case 2: + case 1: + for (i=0; i< argc-optind; i++) { + flist[i] = load_file(argv[optind+i]); + if (flist[i].body
== NULL) { + fprintf(stderr, "wiggle: cannot load file '%s' - %s\n", + argv[optind+i], strerror(errno)); + exit(2); + } + } + break; + default: + fprintf(stderr, "wiggle: too many files given for --merge\n"); + exit(2); + } + switch(argc-optind) { + case 1: /* a merge file */ + f = flist[0]; + if (!split_merge(f, &flist[0], &flist[1], &flist[2])) { + fprintf(stderr,"wiggle: merge file %s looks bad.\n", + argv[optind]); + exit(2); + } + break; + case 2: /* a file and a patch */ + f = flist[1]; + chunks2 = chunks3 = split_patch(f, &flist[1], &flist[2]); + break; + case 3: /* three separate files */ + break; + } + if (reverse) { + f=flist[1]; + flist[1] = flist[2]; + flist[2]= f; + } + + for (i=0; i<3; i++) { + if (flist[i].body==NULL) { + fprintf(stderr, "wiggle: file %d missing\n", i); + exit(2); + } + } + if (replace) { + int fd; + replacename = malloc(strlen(argv[optind])+ 20); + if (!replacename) die(); + orignew = malloc(strlen(argv[optind])+20); + if (!orignew) die(); + strcpy(replacename, argv[optind]); + strcpy(orignew, argv[optind]); + strcat(orignew, ".porig"); + if (open(orignew, O_RDONLY) >= 0 || + errno != ENOENT) { + fprintf(stderr,"wiggle: %s already exists\n", + orignew); + exit(2); + } + strcat(replacename,"XXXXXX"); + fd = mkstemp(replacename); + if (fd == -1) { + fprintf(stderr,"wiggle: could not create temporary file for %s\n", + replacename); + exit(2); + } + outfile = fdopen(fd, "w"); + + } + + if (obj == 'l') { + fl[0] = split_stream(flist[0], ByLine, 0); + fl[1] = split_stream(flist[1], ByLine, 0); + fl[2] = split_stream(flist[2], ByLine, 0); + } else { + fl[0] = split_stream(flist[0], ByWord, 0); + fl[1] = split_stream(flist[1], ByWord, 0); + fl[2] = split_stream(flist[2], ByWord, 0); + } + if (chunks2 && !chunks1) + csl1 = pdiff(fl[0], fl[1], chunks2); + else + csl1 = diff(fl[0], fl[1]); + csl2 = diff(fl[1], fl[2]); + +#if 0 + cleanlist(fl[0],fl[1],csl1); + cleanlist(fl[1],fl[2],csl2); +#endif + + { + struct ci ci; + + ci = print_merge(outfile, &fl[0], &fl[1], &fl[2], + csl1, csl2, obj=='w'); + if (!quiet && ci.conflicts) + fprintf(stderr, "%d unresolved conflict%s found\n", ci.conflicts, ci.conflicts==1?"":"s"); + if (!quiet && ci.ignored) + fprintf(stderr, "%d already-applied change%s ignored\n", ci.ignored, ci.ignored==1?"":"s"); + exit_status = (ci.conflicts > 0); + } + if (replace) { + fclose(outfile); + if (rename(argv[optind], orignew) ==0 && + rename(replacename, argv[optind]) ==0) + /* all ok */; + else { + fprintf(stderr, "wiggle: failed to move new file into place.\n"); + exit(2); + } + } + break; + + } + exit(exit_status); +} diff --git a/wiggle.h b/wiggle.h new file mode 100644 index 0000000..124be29 --- /dev/null +++ b/wiggle.h @@ -0,0 +1,100 @@ +/* + * wiggle - apply rejected patches + * + * Copyright (C) 2003 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * School of Computer Science and Engineering + * The University of New South Wales + * Sydney, 2052 + * Australia + */ + +#include <stdio.h> +#include <string.h> +#include <getopt.h> + +struct stream { + char *body; + int len; +}; + + +struct elmnt { + int hash; + char *start; + int len; +}; + +static inline int match(struct elmnt *a, struct elmnt *b) +{ + return + a->hash == b->hash && + a->len == b->len && + strncmp(a->start, b->start, a->len)==0; +} + +static inline int ends_line(struct elmnt e) +{ + return e.len && e.start[e.len-1] == '\n'; +} + +struct csl { + int a,b; + int len; +}; + +struct file { + struct elmnt *list; + int elcnt; +}; + +extern struct stream load_file(char *name); +extern int split_patch(struct stream, struct stream*, struct stream*); +extern int split_merge(struct stream, struct stream*, struct stream*, struct stream*); +extern struct file split_stream(struct stream s, int type, int reverse); +extern struct csl *pdiff(struct file a, struct file b, int chunks); +extern struct csl *diff(struct file a, struct file b); +extern struct csl *diff_partial(struct file a, struct file b, + int alo, int ahi, int blo, int bhi); +extern struct csl *worddiff(struct stream f1, struct stream f2, + struct file *fl1p, struct file *fl2p); + +struct ci { int conflicts; int ignored; }; +extern struct ci print_merge(FILE *out, struct file *a, struct file *b, struct file *c, + struct csl *c1, struct csl *c2, + int words); +extern void printword(FILE *f, struct elmnt e); + +extern void die(void); + +extern char Version[]; +extern char short_options[]; +extern struct option long_options[]; +extern char Usage[]; +extern char Help[]; +extern char HelpExtract[]; +extern char HelpDiff[]; +extern char HelpMerge[]; + + +#define ByLine 0 +#define ByWord 1 +#define ApproxWord 2 -- 2.39.5
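For orientation, the declarations in wiggle.h compose into a merge roughly as follows. This is a hedged sketch only, using just the API above with error handling elided; the driver name merge_three is hypothetical, and the real logic lives in main() in wiggle.c.

#include <stdio.h>
#include "wiggle.h"

/* Word-based merge of three files: apply the changes between
 * 'before' and 'after' to 'orig' and print the merged result. */
static int merge_three(char *orig, char *before, char *after)
{
	struct stream s[3];
	struct file w[3];
	struct csl *csl1, *csl2;
	struct ci ci;
	int i;

	s[0] = load_file(orig);
	s[1] = load_file(before);
	s[2] = load_file(after);
	for (i = 0; i < 3; i++)
		w[i] = split_stream(s[i], ByWord, 0);	/* split each text into words */
	csl1 = diff(w[0], w[1]);	/* align the original with "before" */
	csl2 = diff(w[1], w[2]);	/* the change to be imposed */
	ci = print_merge(stdout, &w[0], &w[1], &w[2], csl1, csl2, 1);
	return ci.conflicts ? 1 : 0;	/* match wiggle's exit convention */
}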