Sync from bytedesk-private: update

2026-05-16 04:07:51 +00:00 · 2024-12-14 10:43:18 +08:00
parent 476eebb101
commit 5e082909e4
3421 changed files with 812709 additions and 0 deletions
--- a/modules/python/vendors/AsrTools/LICENSE
+++ b/modules/python/vendors/AsrTools/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
--- a/modules/python/vendors/AsrTools/README.md
+++ b/modules/python/vendors/AsrTools/README.md
@@ -0,0 +1,97 @@
+# 🎤 AsrTools
+
+🎙️✨ **AsrTools**：智能语音转字幕文本工具
+
+对比 GitHub 上目前多数的音频转文字项目（通过调用 Whisper 等模型），本项目最大的区别和优势就是无需 GPU 和繁琐的本地配置。
+
+欢迎为项目给上一个 Star。
+
+
+## 🌟 **特色功能**
+
+- 🚀 **无需复杂配置**：无需 GPU 和繁琐的本地配置，小白也能轻松使用。
+- 🖥️ **高颜值界面**：基于 **PyQt5** 和 **qfluentwidgets**，界面美观且用户友好。
+- ⚡ **效率超人**：多线程并发 + 批量处理，文字转换快如闪电。
+- 📄 **多格式支持**：支持生成 `.srt` 和 `.txt` 字幕文件，满足不同需求。
+
+
+## 🌟 未来计划（TODO）
+
+- 🎥 视频直接处理：支持输入视频文件自动转换为音频文件，无需用户手动转换为mp3等音频格式。
+- 📄 多样化输出：增加输出格式选择，提供更多字幕格式选项，满足不同用户需求。
+- 🔀 一键字幕视频：增加视频自动加字幕功能，一键完成从视频到带字幕视频的全流程。
+- 🔗 API 集成：提供 API 接口，允许开发者将 AsrTools 集成到自己的工作流程中。
+- ✏️ 字幕编辑器：集成一个简单的字幕编辑界面，允许用户直接修改、调整时间轴和校正识别错误。
+
+
+*主界面截图示例*
+
+<img src="resources/main_window.png" width="80%" alt="主界面">
+
+
+### 🖥️ **快速上手**
+
+1. **启动应用**：运行下载的可执行文件或通过命令行启动 GUI 界面。
+2. **选择 ASR 引擎**：在下拉菜单中选择你需要使用的 ASR 引擎。
+3. **添加文件**：点击“选择文件”按钮或将文件/文件夹拖拽到指定区域。
+4. **开始处理**：点击“开始处理”按钮，程序将自动开始转换，并在完成后在原音频目录生成 `.srt` 或 `.txt` 字幕文件。（默认保持 3 个线程运行）
+
+## 🛠️ **安装指南**
+
+###  **1. 从发布版本安装**
+
+我为 Windows 用户提供了打包好的[Release](https://github.com/WEIFENG2333/AsrTools/releases)版本，下载后解压即可直接使用，无需配置环境。
+
+或者从网盘下载： [https://wwwm.lanzoue.com/iUJYZ2clk7xg](https://wwwm.lanzoue.com/iUJYZ2clk7xg)
+
+运行解压后的 `AsrTools.exe`，即可启动 GUI 界面。
+
+
+###  **2. 从源码安装（开发者）**
+
+项目的依赖仅仅为 `requests`。
+
+如果您需要 GUI 界面，请额外安装 `PyQt5`, `qfluentwidgets`。
+
+如果您想从源码运行，请按照以下步骤操作：
+
+1. **克隆仓库并进入项目目录**
+
+    ```bash
+    git clone https://github.com/WEIFENG2333/AsrTools.git
+    cd AsrTools
+    ```
+
+2. **安装依赖并运行**
+
+    - **启动 GUI 界面**
+
+        ```bash
+        pip install -r requirements.txt
+        python asr_gui.py
+        ```
+
+    - **纯代码调用示例**
+
+        ```bash
+        pip install requests
+        python example.py
+        ```
+
+
+---
+
+## 📬 **联系与支持**
+
+- **Issues**：[提交问题](https://github.com/WEIFENG2333/AsrTools/issues)
+
+感谢您使用 **AsrTools**！🎉  
+
+目前项目的相关调用和GUI页面的功能仍在不断完善中...
+
+希望这款工具能为您带来便利。😊
+
+---
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=WEIFENG2333/AsrTools&type=Date)](https://star-history.com/#WEIFENG2333/AsrTools&Date)
--- a/modules/python/vendors/AsrTools/asr_gui.py
+++ b/modules/python/vendors/AsrTools/asr_gui.py
@@ -0,0 +1,472 @@
+import logging
+import os
+import platform
+import subprocess
+import sys
+import webbrowser
+
+from PyQt5.QtCore import Qt, QRunnable, QThreadPool, QObject, pyqtSignal as Signal, pyqtSlot as Slot, QSize, QThread, \
+    pyqtSignal
+from PyQt5.QtGui import QCursor, QColor, QFont
+from PyQt5.QtWidgets import (QApplication, QWidget, QVBoxLayout, QHBoxLayout, QFileDialog,
+                             QTableWidgetItem, QHeaderView, QSizePolicy)
+from qfluentwidgets import (ComboBox, PushButton, LineEdit, TableWidget, FluentIcon as FIF,
+                            Action, RoundMenu, InfoBar, InfoBarPosition,
+                            FluentWindow, BodyLabel, MessageBox)
+
+from bk_asr.BcutASR import BcutASR
+from bk_asr.JianYingASR import JianYingASR
+from bk_asr.KuaiShouASR import KuaiShouASR
+
+# Configure root logging: INFO level, each record formatted as
+# "<timestamp> - <level name> - <message>".
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+
+
+class WorkerSignals(QObject):
+    """Signals used by ``ASRWorker`` to report its outcome; both carry two strings."""
+    # Emitted on success with (file_path, srt_text).
+    finished = Signal(str, str)
+    # Emitted on failure with (file_path, error_message).
+    errno = Signal(str, str)
+
+
+class ASRWorker(QRunnable):
+    """ASR处理工作线程"""
+    def __init__(self, file_path, asr_engine):
+        super().__init__()
+        self.file_path = file_path
+        self.asr_engine = asr_engine
+        self.signals = WorkerSignals()
+
+    @Slot()
+    def run(self):
+        try:
+            use_cache = True
+            # 根据选择的 ASR 引擎实例化相应的类
+            if self.asr_engine == 'B 接口':
+                asr = BcutASR(self.file_path, use_cache=use_cache)
+            elif self.asr_engine == 'J 接口':
+                asr = JianYingASR(self.file_path, use_cache=use_cache)
+            elif self.asr_engine == 'K 接口':
+                asr = KuaiShouASR(self.file_path, use_cache=use_cache)
+            elif self.asr_engine == 'Whisper':
+                # from bk_asr.WhisperASR import WhisperASR
+                # asr = WhisperASR(self.file_path, use_cache=use_cache)
+                raise NotImplementedError("WhisperASR 暂未实现")
+            else:
+                raise ValueError(f"未知的 ASR 引擎: {self.asr_engine}")
+
+            logging.info(f"开始处理文件: {self.file_path} 使用引擎: {self.asr_engine}")
+            result = asr.run()
+            result_text = result.to_srt()
+            logging.info(f"完成处理文件: {self.file_path} 使用引擎: {self.asr_engine}")
+            save_path = self.file_path.rsplit(".", 1)[0] + ".srt"
+            with open(save_path, "w", encoding="utf-8") as f:
+                f.write(result_text)
+            self.signals.finished.emit(self.file_path, result_text)
+        except Exception as e:
+            logging.error(f"处理文件 {self.file_path} 时出错: {str(e)}")
+            self.signals.errno.emit(self.file_path, f"处理时出错: {str(e)}")
+
+class UpdateCheckerThread(QThread):
+    msg = pyqtSignal(str, str, str)  # 用于发送消息的信号
+
+    def __init__(self, parent=None):
+        super().__init__(parent)
+
+    def run(self):
+        try:
+            from check_update import check_update, check_internet_connection
+            # 检查互联网连接
+            if not check_internet_connection():
+                self.msg.emit("错误", "无法连接到互联网，请检查网络连接。", "")
+                return
+            # 检查更新
+            config = check_update(self)
+            if config:
+                if config['fource']:
+                    self.msg.emit("更新", "检测到新版本，请下载最新版本。", config['update_download_url'])
+                else:
+                    self.msg.emit("可更新", "检测到新版本，请下载最新版本。", config['update_download_url'])
+        except Exception as e:
+            pass
+
+
+class ASRWidget(QWidget):
+    """ASR处理界面"""
+
+    def __init__(self):
+        super().__init__()
+        self.init_ui()
+        self.max_threads = 3  # 设置最大线程数
+        self.thread_pool = QThreadPool()
+        self.thread_pool.setMaxThreadCount(self.max_threads)
+        self.processing_queue = []
+        self.workers = {}  # 维护文件路径到worker的映射
+
+
+    def init_ui(self):
+        layout = QVBoxLayout(self)
+
+        # ASR引擎选择下拉框
+        self.combo_box = ComboBox(self)
+        self.combo_box.addItems(['B 接口', 'J 接口', 'K 接口', 'Whisper'])
+        layout.addWidget(self.combo_box)
+
+        # 文件选择区域
+        file_layout = QHBoxLayout()
+        self.file_input = LineEdit(self)
+        self.file_input.setPlaceholderText("拖拽文件或文件夹到这里")
+        self.file_input.setReadOnly(True)
+        self.file_button = PushButton("选择文件", self)
+        self.file_button.clicked.connect(self.select_file)
+        file_layout.addWidget(self.file_input)
+        file_layout.addWidget(self.file_button)
+        layout.addLayout(file_layout)
+
+        # 文件列表表格
+        self.table = TableWidget(self)
+        self.table.setColumnCount(2)
+        self.table.setHorizontalHeaderLabels(['文件名', '状态'])
+        self.table.setContextMenuPolicy(Qt.CustomContextMenu)
+        self.table.customContextMenuRequested.connect(self.show_context_menu)
+        layout.addWidget(self.table)
+
+        # 设置表格列的拉伸模式
+        header = self.table.horizontalHeader()
+        header.setSectionResizeMode(0, QHeaderView.Stretch)
+        header.setSectionResizeMode(1, QHeaderView.Fixed)
+        self.table.setColumnWidth(1, 100)
+        self.table.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding)
+
+        # 处理按钮
+        self.process_button = PushButton("开始处理", self)
+        self.process_button.clicked.connect(self.process_files)
+        self.process_button.setEnabled(False)  # 初始禁用
+        layout.addWidget(self.process_button)
+
+        self.setAcceptDrops(True)
+
+    def select_file(self):
+        """选择文件对话框"""
+        files, _ = QFileDialog.getOpenFileNames(self, "选择音频或视频文件", "",
+                                                "Media Files (*.mp3 *.wav *.ogg *.mp4 *.avi *.mov *.ts)")
+        for file in files:
+            self.add_file_to_table(file)
+        self.update_start_button_state()
+
+    def add_file_to_table(self, file_path):
+        """将文件添加到表格中"""
+        if self.find_row_by_file_path(file_path) != -1:
+            InfoBar.warning(
+                title='文件已存在',
+                content=f"文件 {os.path.basename(file_path)} 已经添加到列表中。",
+                orient=Qt.Horizontal,
+                isClosable=True,
+                position=InfoBarPosition.TOP,
+                duration=2000,
+                parent=self
+            )
+            return
+
+        row_count = self.table.rowCount()
+        self.table.insertRow(row_count)
+        item_filename = self.create_non_editable_item(os.path.basename(file_path))
+        item_status = self.create_non_editable_item("未处理")
+        item_status.setForeground(QColor("gray"))
+        self.table.setItem(row_count, 0, item_filename)
+        self.table.setItem(row_count, 1, item_status)
+        item_filename.setData(Qt.UserRole, file_path)
+
+    def create_non_editable_item(self, text):
+        """创建不可编辑的表格项"""
+        item = QTableWidgetItem(text)
+        item.setFlags(item.flags() & ~Qt.ItemIsEditable)
+        return item
+
+    def show_context_menu(self, pos):
+        """显示右键菜单"""
+        current_row = self.table.rowAt(pos.y())
+        if current_row < 0:
+            return
+
+        self.table.selectRow(current_row)
+
+        menu = RoundMenu(self)
+        reprocess_action = Action(FIF.SYNC, "重新处理")
+        delete_action = Action(FIF.DELETE, "删除任务")
+        open_dir_action = Action(FIF.FOLDER, "打开文件目录")
+        menu.addActions([reprocess_action, delete_action, open_dir_action])
+
+        delete_action.triggered.connect(self.delete_selected_row)
+        open_dir_action.triggered.connect(self.open_file_directory)
+        reprocess_action.triggered.connect(self.reprocess_selected_file)
+
+        menu.exec(QCursor.pos())
+
+    def delete_selected_row(self):
+        """删除选中的行"""
+        current_row = self.table.currentRow()
+        if current_row >= 0:
+            file_path = self.table.item(current_row, 0).data(Qt.UserRole)
+            if file_path in self.workers:
+                worker = self.workers[file_path]
+                worker.signals.finished.disconnect(self.update_table)
+                worker.signals.errno.disconnect(self.handle_error)
+                # QThreadPool 不支持直接终止线程，通常需要设计任务可中断
+                # 这里仅移除引用
+                self.workers.pop(file_path, None)
+            self.table.removeRow(current_row)
+            self.update_start_button_state()
+
+    def open_file_directory(self):
+        """打开文件所在目录"""
+        current_row = self.table.currentRow()
+        if current_row >= 0:
+            current_item = self.table.item(current_row, 0)
+            if current_item:
+                file_path = current_item.data(Qt.UserRole)
+                directory = os.path.dirname(file_path)
+                try:
+                    if platform.system() == "Windows":
+                        os.startfile(directory)
+                    elif platform.system() == "Darwin":
+                        subprocess.Popen(["open", directory])
+                    else:
+                        subprocess.Popen(["xdg-open", directory])
+                except Exception as e:
+                    InfoBar.error(
+                        title='无法打开目录',
+                        content=str(e),
+                        orient=Qt.Horizontal,
+                        isClosable=True,
+                        position=InfoBarPosition.TOP,
+                        duration=3000,
+                        parent=self
+                    )
+
+    def reprocess_selected_file(self):
+        """重新处理选中的文件"""
+        current_row = self.table.currentRow()
+        if current_row >= 0:
+            file_path = self.table.item(current_row, 0).data(Qt.UserRole)
+            status = self.table.item(current_row, 1).text()
+            if status == "处理中":
+                InfoBar.warning(
+                    title='当前文件正在处理中',
+                    content="请等待当前文件处理完成后再重新处理。",
+                    orient=Qt.Horizontal,
+                    isClosable=True,
+                    position=InfoBarPosition.TOP,
+                    duration=3000,
+                    parent=self
+                )
+                return
+            self.add_to_queue(file_path)
+
+    def add_to_queue(self, file_path):
+        """将文件添加到处理队列并更新状态"""
+        self.processing_queue.append(file_path)
+        self.process_next_in_queue()
+
+    def process_files(self):
+        """处理所有未处理的文件"""
+        for row in range(self.table.rowCount()):
+            if self.table.item(row, 1).text() == "未处理":
+                file_path = self.table.item(row, 0).data(Qt.UserRole)
+                self.processing_queue.append(file_path)
+        self.process_next_in_queue()
+
+    def process_next_in_queue(self):
+        """处理队列中的下一个文件"""
+        while self.thread_pool.activeThreadCount() < self.max_threads and self.processing_queue:
+            file_path = self.processing_queue.pop(0)
+            if file_path not in self.workers:
+                self.process_file(file_path)
+
+    def process_file(self, file_path):
+        """处理单个文件"""
+        selected_engine = self.combo_box.currentText()
+        worker = ASRWorker(file_path, selected_engine)
+        worker.signals.finished.connect(self.update_table)
+        worker.signals.errno.connect(self.handle_error)
+        self.thread_pool.start(worker)
+        self.workers[file_path] = worker
+
+        row = self.find_row_by_file_path(file_path)
+        if row != -1:
+            status_item = self.create_non_editable_item("处理中")
+            status_item.setForeground(QColor("orange"))
+            self.table.setItem(row, 1, status_item)
+            self.update_start_button_state()
+
+    def update_table(self, file_path, result):
+        """更新表格中文件的处理状态"""
+        row = self.find_row_by_file_path(file_path)
+        if row != -1:
+            item_status = self.create_non_editable_item("已处理")
+            item_status.setForeground(QColor("green"))
+            self.table.setItem(row, 1, item_status)
+
+            InfoBar.success(
+                title='处理完成',
+                content=f"文件 {self.table.item(row, 0).text()} 已处理完成",
+                orient=Qt.Horizontal,
+                isClosable=True,
+                position=InfoBarPosition.TOP,
+                duration=1500,
+                parent=self
+            )
+
+        self.workers.pop(file_path, None)
+        self.process_next_in_queue()
+        self.update_start_button_state()
+
+    def handle_error(self, file_path, error_message):
+        """处理错误信息"""
+        row = self.find_row_by_file_path(file_path)
+        if row != -1:
+            item_status = self.create_non_editable_item("错误")
+            item_status.setForeground(QColor("red"))
+            self.table.setItem(row, 1, item_status)
+
+            InfoBar.error(
+                title='处理出错',
+                content=error_message,
+                orient=Qt.Horizontal,
+                isClosable=True,
+                position=InfoBarPosition.TOP,
+                duration=3000,
+                parent=self
+            )
+
+        self.workers.pop(file_path, None)
+        self.process_next_in_queue()
+        self.update_start_button_state()
+
+    def find_row_by_file_path(self, file_path):
+        """根据文件路径查找表格中的行号"""
+        for row in range(self.table.rowCount()):
+            item = self.table.item(row, 0)
+            if item.data(Qt.UserRole) == file_path:
+                return row
+        return -1
+
+    def update_start_button_state(self):
+        """根据文件列表更新开始处理按钮的状态"""
+        has_unprocessed = any(
+            self.table.item(row, 1).text() == "未处理"
+            for row in range(self.table.rowCount())
+        )
+        self.process_button.setEnabled(has_unprocessed)
+
+    def dragEnterEvent(self, event):
+        """拖拽进入事件"""
+        if event.mimeData().hasUrls():
+            event.accept()
+        else:
+            event.ignore()
+
+    def dropEvent(self, event):
+        """拖拽释放事件"""
+        supported_formats = ('.mp3', '.wav', '.ogg', '.mp4', '.avi', '.mov', '.ts')
+        files = [u.toLocalFile() for u in event.mimeData().urls()]
+        for file in files:
+            if os.path.isdir(file):
+                for root, dirs, files_in_dir in os.walk(file):
+                    for f in files_in_dir:
+                        if f.lower().endswith(supported_formats):
+                            self.add_file_to_table(os.path.join(root, f))
+            elif file.lower().endswith(supported_formats):
+                self.add_file_to_table(file)
+        self.update_start_button_state()
+
+
+class InfoWidget(QWidget):
+    """个人信息界面"""
+
+    def __init__(self):
+        super().__init__()
+        self.init_ui()
+
+    def init_ui(self):
+        # GitHub URL 和仓库描述
+        GITHUB_URL = "https://github.com/WEIFENG2333/AsrTools"
+        REPO_DESCRIPTION = """
+    🚀 无需复杂配置：无需 GPU 和繁琐的本地配置，小白也能轻松使用。
+    🖥️ 高颜值界面：基于 PyQt5 和 qfluentwidgets，界面美观且用户友好。
+    ⚡ 效率超人：多线程并发 + 批量处理，文字转换快如闪电。
+    📄 多格式支持：支持生成 .srt 和 .txt 字幕文件，满足不同需求。
+        """
+        
+        main_layout = QVBoxLayout(self)
+        main_layout.setAlignment(Qt.AlignTop)
+        # main_layout.setSpacing(50)
+
+        # 标题
+        title_label = BodyLabel("  ASRTools", self)
+        title_label.setFont(QFont("Segoe UI", 30, QFont.Bold))
+        title_label.setAlignment(Qt.AlignCenter)
+        main_layout.addWidget(title_label)
+
+        # 仓库描述区域
+        desc_label = BodyLabel(REPO_DESCRIPTION, self)
+        desc_label.setFont(QFont("Segoe UI", 12))
+        main_layout.addWidget(desc_label)
+
+        github_button = PushButton("GitHub 仓库", self)
+        github_button.setIcon(FIF.GITHUB)
+        github_button.setIconSize(QSize(20, 20))
+        github_button.setMinimumHeight(42)
+        github_button.clicked.connect(lambda _: webbrowser.open(GITHUB_URL))
+        main_layout.addWidget(github_button)
+
+
+class MainWindow(FluentWindow):
+    """主窗口"""
+    def __init__(self):
+        super().__init__()
+        self.setWindowTitle('ASR Processing Tool')
+
+        # ASR 处理界面
+        self.asr_widget = ASRWidget()
+        self.asr_widget.setObjectName("main")
+        self.addSubInterface(self.asr_widget, FIF.ALBUM, 'ASR Processing')
+
+        # 个人信息界面
+        self.info_widget = InfoWidget()
+        self.info_widget.setObjectName("info")  # 设置对象名称
+        self.addSubInterface(self.info_widget, FIF.GITHUB, 'About')
+
+        self.navigationInterface.setExpandWidth(200)
+        self.resize(800, 600)
+
+        self.update_checker = UpdateCheckerThread(self)
+        self.update_checker.msg.connect(self.show_msg)
+        self.update_checker.start()
+
+    def show_msg(self, title, content, update_download_url):
+        w = MessageBox(title, content, self)
+        if w.exec() and update_download_url:
+            webbrowser.open(update_download_url)
+        if title == "更新":
+            sys.exit(0)
+
+
+
+def start():
+    # enable dpi scale
+    QApplication.setHighDpiScaleFactorRoundingPolicy(
+        Qt.HighDpiScaleFactorRoundingPolicy.PassThrough)
+    QApplication.setAttribute(Qt.AA_EnableHighDpiScaling)
+    QApplication.setAttribute(Qt.AA_UseHighDpiPixmaps)
+
+    app = QApplication(sys.argv)
+    # setTheme(Theme.DARK)  # 如果需要深色主题，取消注释此行
+    window = MainWindow()
+    window.show()
+    sys.exit(app.exec())
+
+
+if __name__ == '__main__':
+    start()
--- a/modules/python/vendors/AsrTools/bk_asr/ASRData.py
+++ b/modules/python/vendors/AsrTools/bk_asr/ASRData.py
@@ -0,0 +1,98 @@
+from typing import List
+
+
+class ASRDataSeg:
+    def __init__(self, text, start_time, end_time):
+        self.text = text
+        self.start_time = start_time
+        self.end_time = end_time
+
+    def to_srt_ts(self) -> str:
+        """Convert to SRT timestamp format"""
+        return f"{self._ms_to_srt_time(self.start_time)} --> {self._ms_to_srt_time(self.end_time)}"
+
+    @staticmethod
+    def _ms_to_srt_time(ms) -> str:
+        """Convert milliseconds to SRT time format (HH:MM:SS,mmm)"""
+        total_seconds, milliseconds = divmod(ms, 1000)
+        minutes, seconds = divmod(total_seconds, 60)
+        hours, minutes = divmod(minutes, 60)
+        return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{int(milliseconds):03}"
+
+    def to_lrc_ts(self) -> str:
+        """Convert to LRC timestamp format"""
+        return f"[{self._ms_to_lrc_time(self.start_time)}]"
+
+    def _ms_to_lrc_time(self, ms) -> str:
+        seconds = ms / 1000
+        minutes, seconds = divmod(seconds, 60)
+        return f"{int(minutes):02}:{seconds:.2f}"
+
+    @property
+    def transcript(self) -> str:
+        """Return segment text"""
+        return self.text
+
+    def __str__(self) -> str:
+        return f"ASRDataSeg({self.text}, {self.start_time}, {self.end_time})"
+
+
+
+class ASRData:
+    def __init__(self, segments: List[ASRDataSeg]):
+        self.segments = segments
+
+    def __iter__(self):
+        return iter(self.segments)
+
+    def has_data(self) -> bool:
+        """Check if there are any utterances"""
+        return len(self.segments) > 0
+
+    def to_txt(self) -> str:
+        """Convert to plain text subtitle format (without timestamps)"""
+        return "\n".join(seg.transcript for seg in self.segments)
+
+    def to_srt(self, save_path=None) -> str:
+        """Convert to SRT subtitle format"""
+        srt_text = "\n".join(
+            f"{n}\n{seg.to_srt_ts()}\n{seg.transcript}\n"
+            for n, seg in enumerate(self.segments, 1))
+        if save_path:
+            with open(save_path, 'w', encoding='utf-8') as f:
+                f.write(srt_text)
+        return srt_text
+
+    def to_lrc(self) -> str:
+        """Convert to LRC subtitle format"""
+        return "\n".join(
+            f"{seg.to_lrc_ts()}{seg.transcript}" for seg in self.segments
+        )
+
+    def to_ass(self) -> str:
+        """Convert to ASS subtitle format"""
+        raise NotImplementedError("ASS format conversion not implemented yet")
+
+    def to_json(self) -> dict:
+        result_json = {}
+        for i, segment in enumerate(self.segments):
+            result_json[i] = segment.text
+        return result_json
+
+    def __str__(self):
+        return self.to_txt()
+
+
+if __name__ == '__main__':
+    pass
+    # Manual smoke test - uncomment to try the different output formats:
+    # asr_data = ASRData([ASRDataSeg("hello", 0, 1000)])
+    # print(asr_data.to_srt())
+    # print(asr_data.to_lrc())
+    # print(asr_data.to_txt())
+    # print(asr_data.to_json())
+
+
+
+
--- a/modules/python/vendors/AsrTools/bk_asr/BaseASR.py
+++ b/modules/python/vendors/AsrTools/bk_asr/BaseASR.py
@@ -0,0 +1,89 @@
+import json
+import logging
+import os
+import zlib
+import tempfile
+import threading
+
+from .ASRData import ASRDataSeg, ASRData
+
+
+class BaseASR:
+    SUPPORTED_SOUND_FORMAT = ["flac", "m4a", "mp3", "wav"]
+    CACHE_FILE = os.path.join(tempfile.gettempdir(), "bk_asr", "asr_cache.json")
+    _lock = threading.Lock()
+
+    def __init__(self, audio_path: [str, bytes], use_cache: bool = False):
+        self.audio_path = audio_path
+        self.file_binary = None
+
+        self.crc32_hex = None
+        self.use_cache = use_cache
+
+        self._set_data()
+
+        self.cache = self._load_cache()
+
+    def _load_cache(self):
+        if not self.use_cache:
+            return {}
+        os.makedirs(os.path.dirname(self.CACHE_FILE), exist_ok=True)
+        with self._lock:
+            if os.path.exists(self.CACHE_FILE):
+                try:
+                    with open(self.CACHE_FILE, 'r', encoding='utf-8') as f:
+                        cache = json.load(f)
+                        if isinstance(cache, dict):
+                            return cache
+                except (json.JSONDecodeError, IOError):
+                    return {}
+            return {}
+
+    def _save_cache(self):
+        if not self.use_cache:
+            return
+        with self._lock:
+            try:
+                with open(self.CACHE_FILE, 'w', encoding='utf-8') as f:
+                    json.dump(self.cache, f, ensure_ascii=False, indent=2)
+                if os.path.exists(self.CACHE_FILE) and os.path.getsize(self.CACHE_FILE) > 10 * 1024 * 1024:
+                    os.remove(self.CACHE_FILE)
+            except IOError as e:
+                logging.error(f"Failed to save cache: {e}")
+
+    def _set_data(self):
+        if isinstance(self.audio_path, bytes):
+            self.file_binary = self.audio_path
+        else:
+            ext = self.audio_path.split(".")[-1].lower()
+            assert ext in self.SUPPORTED_SOUND_FORMAT, f"Unsupported sound format: {ext}"
+            assert os.path.exists(self.audio_path), f"File not found: {self.audio_path}"
+            with open(self.audio_path, "rb") as f:
+                self.file_binary = f.read()
+        crc32_value = zlib.crc32(self.file_binary) & 0xFFFFFFFF
+        self.crc32_hex = format(crc32_value, '08x')
+
+    def _get_key(self):
+        return f"{self.__class__.__name__}-{self.crc32_hex}"
+
+    def run(self):
+        k = self._get_key()
+        if k in self.cache and self.use_cache:
+            resp_data = self.cache[k]
+        else:
+            resp_data = self._run()
+            # Cache the result
+            self.cache[k] = resp_data
+            self._save_cache()
+        segments = self._make_segments(resp_data)
+        return ASRData(segments)
+
+    def _make_segments(self, resp_data: dict) -> list[ASRDataSeg]:
+        raise NotImplementedError("_make_segments method must be implemented in subclass")
+
+    def _run(self) -> dict:
+        """ Run the ASR service and return the response data. """
+        raise NotImplementedError("_run method must be implemented in subclass")
+
+
+
--- a/modules/python/vendors/AsrTools/bk_asr/BcutASR.py
+++ b/modules/python/vendors/AsrTools/bk_asr/BcutASR.py
@@ -0,0 +1,164 @@
+import json
+import logging
+import time
+from os import PathLike
+from typing import Optional
+
+import requests
+
+from .ASRData import ASRData, ASRDataSeg
+from .BaseASR import BaseASR
+
+
+__version__ = "0.0.3"
+
+# Common prefix of every Bcut endpoint used below.
+API_BASE_URL = "https://member.bilibili.com/x/bcut/rubick-interface"
+
+# Request an upload slot
+API_REQ_UPLOAD = API_BASE_URL + "/resource/create"
+
+# Commit the upload
+API_COMMIT_UPLOAD = API_BASE_URL + "/resource/create/complete"
+
+# Create a task
+API_CREATE_TASK = API_BASE_URL + "/task"
+
+# Query the result
+API_QUERY_RESULT = API_BASE_URL + "/task/result"
+
+
+class BcutASR(BaseASR):
+    """必剪 语音识别接口"""
+    headers = {
+        'User-Agent': 'Bilibili/1.0.0 (https://www.bilibili.com)',
+        'Content-Type': 'application/json'
+    }
+
+    def __init__(self, audio_path: [str, bytes], use_cache: bool = False):
+        super().__init__(audio_path, use_cache=use_cache)
+        self.session = requests.Session()
+        self.task_id = None
+        self.__etags = []
+
+        self.__in_boss_key: Optional[str, None] = None
+        self.__resource_id: Optional[str, None] = None
+        self.__upload_id: Optional[str, None] = None
+        self.__upload_urls: Optional[list[str]] = []
+        self.__per_size: Optional[int, None] = None
+        self.__clips: Optional[int, None] = None
+
+        self.__etags: Optional[list[str]] = []
+        self.__download_url: Optional[str, None] = None
+        self.task_id: Optional[str, None] = None
+
+
+    def upload(self) -> None:
+        """申请上传"""
+        if not self.file_binary:
+            raise ValueError("none set data")
+        payload = json.dumps({
+            "type": 2,
+            "name": "audio.mp3",
+            "size": len(self.file_binary),
+            "ResourceFileType": "mp3",
+            "model_id": "8",
+        })
+
+        resp = requests.post(
+            API_REQ_UPLOAD,
+            data=payload,
+            headers=self.headers
+        )
+        resp.raise_for_status()
+        resp = resp.json()
+        resp_data = resp["data"]
+
+        self.__in_boss_key = resp_data["in_boss_key"]
+        self.__resource_id = resp_data["resource_id"]
+        self.__upload_id = resp_data["upload_id"]
+        self.__upload_urls = resp_data["upload_urls"]
+        self.__per_size = resp_data["per_size"]
+        self.__clips = len(resp_data["upload_urls"])
+
+        logging.info(
+            f"申请上传成功, 总计大小{resp_data['size'] // 1024}KB, {self.__clips}分片, 分片大小{resp_data['per_size'] // 1024}KB: {self.__in_boss_key}"
+        )
+        self.__upload_part()
+        self.__commit_upload()
+
+    def __upload_part(self) -> None:
+        """上传音频数据"""
+        for clip in range(self.__clips):
+            start_range = clip * self.__per_size
+            end_range = (clip + 1) * self.__per_size
+            logging.info(f"开始上传分片{clip}: {start_range}-{end_range}")
+            resp = requests.put(
+                self.__upload_urls[clip],
+                data=self.file_binary[start_range:end_range],
+                headers=self.headers
+            )
+            resp.raise_for_status()
+            etag = resp.headers.get("Etag")
+            self.__etags.append(etag)
+            logging.info(f"分片{clip}上传成功: {etag}")
+
+    def __commit_upload(self) -> None:
+        """提交上传数据"""
+        data = json.dumps({
+            "InBossKey": self.__in_boss_key,
+            "ResourceId": self.__resource_id,
+            "Etags": ",".join(self.__etags),
+            "UploadId": self.__upload_id,
+            "model_id": "8",
+        })
+        resp = requests.post(
+            API_COMMIT_UPLOAD,
+            data=data,
+            headers=self.headers
+        )
+        resp.raise_for_status()
+        resp = resp.json()
+        self.__download_url = resp["data"]["download_url"]
+        logging.info(f"提交成功")
+
+    def create_task(self) -> str:
+        """开始创建转换任务"""
+        resp = requests.post(
+            API_CREATE_TASK, json={"resource": self.__download_url, "model_id": "8"}, headers=self.headers
+        )
+        resp.raise_for_status()
+        resp = resp.json()
+        self.task_id = resp["data"]["task_id"]
+        logging.info(f"任务已创建: {self.task_id}")
+        return self.task_id
+
+    def result(self, task_id: Optional[str] = None):
+        """查询转换结果"""
+        resp = requests.get(API_QUERY_RESULT, params={"model_id": 7, "task_id": task_id or self.task_id}, headers=self.headers)
+        resp.raise_for_status()
+        resp = resp.json()
+        return resp["data"]
+
+    def _run(self):
+        self.upload()
+        self.create_task()
+        # 轮询检查任务状态
+        for _ in range(500):
+            task_resp = self.result()
+            if task_resp["state"] == 4:
+                break
+            time.sleep(1)
+        logging.info(f"转换成功")
+        return json.loads(task_resp["result"])
+
+    def _make_segments(self, resp_data: dict) -> list[ASRDataSeg]:
+        return [ASRDataSeg(u['transcript'], u['start_time'], u['end_time']) for u in resp_data['utterances']]
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+    # Example usage
+    audio_file = r"test.mp3"
+    asr = BcutASR(audio_file)
+    asr_data = asr.run()
+    print(asr_data)
--- a/modules/python/vendors/AsrTools/bk_asr/JianYingASR.py
+++ b/modules/python/vendors/AsrTools/bk_asr/JianYingASR.py
@@ -0,0 +1,251 @@
+import datetime
+import hashlib
+import hmac
+import json
+import logging
+import os
+import time
+from typing import Dict, Tuple
+
+import requests
+
+from .ASRData import ASRDataSeg
+from .BaseASR import BaseASR
+
+
+class JianYingASR(BaseASR):
+    def __init__(self, audio_path: [str, bytes], use_cache: bool = False, start_time: float = 0, end_time: float = 6000):
+        super().__init__(audio_path, use_cache)
+        self.audio_path = audio_path
+        self.end_time = end_time
+        self.start_time = start_time
+
+        # AWS credentials
+        self.session_token = None
+        self.secret_key = None
+        self.access_key = None
+
+        # Upload details
+        self.store_uri = None
+        self.auth = None
+        self.upload_id = None
+        self.session_key = None
+        self.upload_hosts = None
+
+
+    def submit(self) -> str:
+        """Submit the task"""
+        url = "https://lv-pc-api-sinfonlinec.ulikecam.com/lv/v1/audio_subtitle/submit"
+        payload = {
+            "adjust_endtime": 200,
+            "audio": self.store_uri,
+            "caption_type": 2,
+            "client_request_id": "45faf98c-160f-4fae-a649-6d89b0fe35be",
+            "max_lines": 1,
+            "songs_info": [{"end_time": self.end_time, "id": "", "start_time": self.start_time}],
+            "words_per_line": 16
+        }
+
+        sign, device_time = self._generate_sign_parameters(url='/lv/v1/audio_subtitle/submit', pf='4', appvr='4.0.0',
+                                                           tdid='3943278516897751')
+        headers = self._build_headers(device_time, sign)
+        response = requests.post(url, json=payload, headers=headers)
+        query_id = response.json()['data']['id']
+        return query_id
+
+    def upload(self):
+        """Upload the file"""
+        self._upload_sign()
+        self._upload_auth()
+        self._upload_file()
+        self._upload_check()
+        uri = self._upload_commit()
+        return uri
+
+    def query(self, query_id: str):
+        """Query the task"""
+        url = "https://lv-pc-api-sinfonlinec.ulikecam.com/lv/v1/audio_subtitle/query"
+        payload = {
+            "id": query_id,
+            "pack_options": {"need_attribute": True}
+        }
+        sign, device_time = self._generate_sign_parameters(url='/lv/v1/audio_subtitle/query', pf='4', appvr='4.0.0',
+                                                           tdid='3943278516897751')
+        headers = self._build_headers(device_time, sign)
+        response = requests.post(url, json=payload, headers=headers)
+        return response.json()
+
+    def _run(self):
+        logging.info("正在上传文件...")
+        self.upload()
+        query_id = self.submit()
+        logging.info(f"任务提交成功, 查询ID: {query_id}")
+        resp_data = self.query(query_id)
+        return resp_data
+
+    def _make_segments(self, resp_data: dict) -> list[ASRDataSeg]:
+        return [ASRDataSeg(u['text'], u['start_time'], u['end_time']) for u in resp_data['data']['utterances']]
+
+    @staticmethod
+    def _generate_sign_parameters(url: str, pf: str = '4', appvr: str = '4.0.0', tdid: str = '3943278516897751') -> \
+    Tuple[str, str]:
+        """Generate signature and timestamp via an HTTP request"""
+        current_time = str(int(time.time()))
+        data = {
+            'url': url,
+            'current_time': current_time,
+            'pf': pf,
+            'appvr': appvr,
+            'tdid': tdid
+        }
+        # Replace with your actual endpoint URL
+        get_sign_url = 'https://asrtools-update.bkfeng.top/sign'
+        try:
+            response = requests.post(get_sign_url, json=data)
+            response.raise_for_status()
+            response_data = response.json()
+            sign = response_data.get('sign')
+            if not sign:
+                raise ValueError("No 'sign' in response")
+        except requests.exceptions.RequestException as e:
+            raise SystemExit(f"HTTP Request failed: {e}")
+        except ValueError as ve:
+            raise SystemExit(f"Invalid response: {ve}")
+        return sign.lower(), current_time
+
+
+
+    def _build_headers(self, device_time: str, sign: str) -> Dict[str, str]:
+        """Build headers for requests"""
+        return {
+            'User-Agent': "Cronet/TTNetVersion:01594da2 2023-03-14 QuicVersion:46688bb4 2022-11-28",
+            'appvr': "4.0.0",
+            'device-time': str(device_time),
+            'pf': "4",
+            'sign': sign,
+            'sign-ver': "1",
+            'tdid': "3943278516897751",
+        }
+
+    def _uplosd_headers(self):
+        headers = {
+            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 Thea/1.0.1",
+            'Authorization': self.auth,
+            'Content-CRC32': self.crc32_hex,
+        }
+        return headers
+
+    def _upload_sign(self):
+        """Get upload sign"""
+        url = "https://lv-pc-api-sinfonlinec.ulikecam.com/lv/v1/upload_sign"
+        payload = json.dumps({"biz": "pc-recognition"})
+        sign, device_time = self._generate_sign_parameters(url='/lv/v1/upload_sign', pf='4', appvr='4.0.0',
+                                                           tdid='3943278516897751')
+        headers = self._build_headers(device_time, sign)
+        response = requests.post(url, data=payload, headers=headers)
+        response.raise_for_status()
+        login_data = response.json()
+        self.access_key = login_data['data']['access_key_id']
+        self.secret_key = login_data['data']['secret_access_key']
+        self.session_token = login_data['data']['session_token']
+        return self.access_key, self.secret_key, self.session_token
+
+    def _upload_auth(self):
+        """Get upload authorization"""
+        if isinstance(self.audio_path, bytes):
+            file_size = len(self.audio_path)
+        else:
+            file_size = os.path.getsize(self.audio_path)
+        request_parameters = f'Action=ApplyUploadInner&FileSize={file_size}&FileType=object&IsInner=1&SpaceName=lv-mac-recognition&Version=2020-11-19&s=5y0udbjapi'
+
+        t = datetime.datetime.utcnow()
+        amz_date = t.strftime('%Y%m%dT%H%M%SZ')
+        datestamp = t.strftime('%Y%m%d')
+        headers = {
+            "x-amz-date": amz_date,
+            "x-amz-security-token": self.session_token
+        }
+        signature = aws_signature(self.secret_key, request_parameters, headers, region="cn", service="vod")
+        authorization = f"AWS4-HMAC-SHA256 Credential={self.access_key}/{datestamp}/cn/vod/aws4_request, SignedHeaders=x-amz-date;x-amz-security-token, Signature={signature}"
+        headers["authorization"] = authorization
+        response = requests.get(f"https://vod.bytedanceapi.com/?{request_parameters}", headers=headers)
+        store_infos = response.json()
+
+        self.store_uri = store_infos['Result']['UploadAddress']['StoreInfos'][0]['StoreUri']
+        self.auth = store_infos['Result']['UploadAddress']['StoreInfos'][0]['Auth']
+        self.upload_id = store_infos['Result']['UploadAddress']['StoreInfos'][0]['UploadID']
+        self.session_key = store_infos['Result']['UploadAddress']['SessionKey']
+        self.upload_hosts = store_infos['Result']['UploadAddress']['UploadHosts'][0]
+        self.store_uri = store_infos['Result']['UploadAddress']['StoreInfos'][0]['StoreUri']
+        return store_infos
+
+    def _upload_file(self):
+        """Upload the file"""
+        url = f"https://{self.upload_hosts}/{self.store_uri}?partNumber=1&uploadID={self.upload_id}"
+        headers = self._uplosd_headers()
+        response = requests.put(url, data=self.file_binary, headers=headers)
+        resp_data = response.json()
+        assert resp_data['success'] == 0, f"File upload failed: {response.text}"
+        return resp_data
+
+    def _upload_check(self):
+        """Check upload result"""
+        url = f"https://{self.upload_hosts}/{self.store_uri}?uploadID={self.upload_id}"
+        payload = f"1:{self.crc32_hex}"
+        headers = self._uplosd_headers()
+        response = requests.post(url, data=payload, headers=headers)
+        resp_data = response.json()
+        return resp_data
+
+    def _upload_commit(self):
+        """Commit the uploaded file"""
+        url = f"https://{self.upload_hosts}/{self.store_uri}?uploadID={self.upload_id}&partNumber=1&x-amz-security-token={self.session_token}"
+        headers = self._uplosd_headers()
+        response = requests.put(url, data=self.file_binary, headers=headers)
+        return self.store_uri
+
+
+def sign(key: bytes, msg: str) -> bytes:
+    """使用HMAC-SHA256生成签名"""
+    return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()
+
+
+def get_signature_key(secret_key: str, date_stamp: str, region_name: str, service_name: str) -> bytes:
+    """生成用于AWS签名的密钥"""
+    k_date = sign(('AWS4' + secret_key).encode('utf-8'), date_stamp)
+    k_region = sign(k_date, region_name)
+    k_service = sign(k_region, service_name)
+    k_signing = sign(k_service, 'aws4_request')
+    return k_signing
+
+
+def aws_signature(secret_key: str, request_parameters: str, headers: Dict[str, str],
+                  method: str = "GET", payload: str = '', region: str = "cn", service: str = "vod") -> str:
+    """生成AWS签名"""
+    canonical_uri = '/'
+    canonical_querystring = request_parameters
+    canonical_headers = '\n'.join([f"{key}:{value}" for key, value in headers.items()]) + '\n'
+    signed_headers = ';'.join(headers.keys())
+    payload_hash = hashlib.sha256(payload.encode('utf-8')).hexdigest()
+    canonical_request = f"{method}\n{canonical_uri}\n{canonical_querystring}\n{canonical_headers}\n{signed_headers}\n{payload_hash}"
+
+    amzdate = headers["x-amz-date"]
+    datestamp = amzdate.split('T')[0]
+
+    algorithm = 'AWS4-HMAC-SHA256'
+    credential_scope = f"{datestamp}/{region}/{service}/aws4_request"
+    string_to_sign = f"{algorithm}\n{amzdate}\n{credential_scope}\n{hashlib.sha256(canonical_request.encode('utf-8')).hexdigest()}"
+
+    signing_key = get_signature_key(secret_key, datestamp, region, service)
+    signature = hmac.new(signing_key, string_to_sign.encode('utf-8'), hashlib.sha256).hexdigest()
+    return signature
+
+
+
+
+if __name__ == '__main__':
+    # Example usage
+    audio_file = r"test.mp3"
+    asr = JianYingASR(audio_file)
+    asr_data = asr.run()
+    print(asr_data)
--- a/modules/python/vendors/AsrTools/bk_asr/KuaiShouASR.py
+++ b/modules/python/vendors/AsrTools/bk_asr/KuaiShouASR.py
@@ -0,0 +1,32 @@
+import requests
+
+from .ASRData import ASRDataSeg
+from .BaseASR import BaseASR
+
+
+class KuaiShouASR(BaseASR):
+    def __init__(self, audio_path: [str, bytes], use_cache: bool = False):
+        super().__init__(audio_path, use_cache)
+
+    def _run(self) -> dict:
+        return self._submit()
+
+    def _make_segments(self, resp_data: dict) -> list[ASRDataSeg]:
+        return [ASRDataSeg(u['text'], u['start_time'], u['end_time']) for u in resp_data['data']['text']]
+
+    def _submit(self) -> dict:
+        payload = {
+            "typeId": "1"
+        }
+        files = [('file', ('test.mp3', self.file_binary, 'audio/mpeg'))]
+        result = requests.post("https://ai.kuaishou.com/api/effects/subtitle_generate", data=payload, files=files)
+        return result.json()
+
+
+if __name__ == '__main__':
+    # Example usage
+    # Example usage
+    audio_file = r"test.mp3"
+    asr = KuaiShouASR(audio_file)
+    asr_data = asr.run()
+    print(asr_data)
--- a/modules/python/vendors/AsrTools/bk_asr/WhisperASR.py
+++ b/modules/python/vendors/AsrTools/bk_asr/WhisperASR.py
@@ -0,0 +1,47 @@
+import os
+
+from openai import OpenAI
+
+from .ASRData import ASRDataSeg
+from .BaseASR import BaseASR
+
+
+
+class WhisperASR(BaseASR):
+    def __init__(self, audio_path: [str, bytes], model: str = MODEL, use_cache: bool = False):
+        super().__init__(audio_path, use_cache)
+        self.base_url = os.getenv('OPENAI_BASE_URL')
+        self.api_key = os.getenv('OPENAI_API_KEY')
+        if not self.base_url or not self.api_key:
+            raise ValueError("环境变量 OPENAI_BASE_URL 和 OPENAI_API_KEY 必须设置")
+        self.model = model
+        self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)
+
+    def _run(self) -> dict:
+        return self._submit()
+
+    def _make_segments(self, resp_data: dict) -> list[ASRDataSeg]:
+        return [ASRDataSeg(u['text'], u['start'], u['end']) for u in resp_data['segments']]
+
+    def _get_key(self) -> str:
+        return f"{self.__class__.__name__}-{self.model}-{self.crc32_hex}-{self.model}"
+
+    def _submit(self) -> dict:
+        completion = self.client.audio.transcriptions.create(
+            model=self.model,
+            temperature=0,
+            response_format="verbose_json",
+            file=("test.mp3", self.file_binary, "audio/mp3"),
+            prompt="",
+            language="zh"
+        )
+        return completion.to_dict()
+
+if __name__ == '__main__':
+    # Example usage
+    audio_file = r"test.mp3"
+    asr = WhisperASR(audio_file)
+    asr_data = asr.run()
+    print(asr_data)
+
+
--- a/modules/python/vendors/AsrTools/bk_asr/init.py
+++ b/modules/python/vendors/AsrTools/bk_asr/init.py
@@ -0,0 +1,12 @@
+from .BcutASR import BcutASR
+from .JianYingASR import JianYingASR
+from .KuaiShouASR import KuaiShouASR
+# from .WhisperASR import WhisperASR
+
+__all__ = ["BcutASR", "JianYingASR", "KuaiShouASR"]
+
+
+def transcribe(audio_file, platform):
+    assert platform in __all__
+    asr = globals()[platform](audio_file)
+    return asr.run()
--- a/modules/python/vendors/AsrTools/example.py
+++ b/modules/python/vendors/AsrTools/example.py
@@ -0,0 +1,9 @@
+from bk_asr import BcutASR, JianYingASR, KuaiShouASR
+
+
+if __name__ == '__main__':
+    audio_file = "resources/test.mp3"
+    asr = JianYingASR(audio_file)
+    result = asr.run()
+    result.to_srt()
+    print(result.to_srt())
--- a/modules/python/vendors/AsrTools/requirements.txt
+++ b/modules/python/vendors/AsrTools/requirements.txt
--- a/modules/python/vendors/AsrTools/resources/main_window.png
+++ b/modules/python/vendors/AsrTools/resources/main_window.png
--- a/modules/python/vendors/AsrTools/resources/test.mp3
+++ b/modules/python/vendors/AsrTools/resources/test.mp3
--- a/modules/python/vendors/FunASR/.github/ISSUE_TEMPLATE/ask_questions.md
+++ b/modules/python/vendors/FunASR/.github/ISSUE_TEMPLATE/ask_questions.md
@@ -0,0 +1,38 @@
+---
+name: ❓ Questions/Help
+about: If you have questions, please first search existing issues and docs
+labels: 'question, needs triage'
+---
+
+Notice: To help us resolve issues more efficiently, please follow this template when raising an issue.
+（注意：为了更加高效率解决您遇到的问题，请按照模板提问，补充细节）
+
+## ❓ Questions and Help
+
+
+### Before asking:
+1. search the issues.
+2. search the docs.
+
+<!-- If you still can't find what you need: -->
+
+#### What is your question?
+
+#### Code
+
+<!-- Please paste a code snippet if your question requires it! -->
+
+#### What have you tried?
+
+#### What's your environment?
+
+ - OS (e.g., Linux):
+ - FunASR Version (e.g., 1.0.0):
+ - ModelScope Version (e.g., 1.11.0):
+ - PyTorch Version (e.g., 2.0.0):
+ - How you installed funasr (`pip`, source):
+ - Python version:
+ - GPU (e.g., V100M32)
+ - CUDA/cuDNN version (e.g., cuda11.7):
+ - Docker version (e.g., funasr-runtime-sdk-cpu-0.4.1)
+ - Any other relevant information:
--- a/modules/python/vendors/FunASR/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/modules/python/vendors/FunASR/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,47 @@
+---
+name: 🐛 Bug Report
+about: Submit a bug report to help us improve
+labels: 'bug, needs triage'
+---
+
+Notice: To help us resolve issues more efficiently, please follow this template when raising an issue.
+（注意：为了更加高效率解决您遇到的问题，请按照模板提问，补充细节）
+
+## 🐛 Bug
+
+<!-- A clear and concise description of what the bug is. -->
+
+### To Reproduce
+
+Steps to reproduce the behavior (**always include the command you ran**):
+
+1. Run cmd '....'
+2. See error
+
+<!-- If you have a code sample, error messages, stack traces, please provide it here as well -->
+
+
+#### Code sample
+<!-- Ideally attach a minimal code sample to reproduce the described issue.
+Minimal means having the shortest code but still preserving the bug. -->
+
+### Expected behavior
+
+<!-- A clear and concise description of what you expected to happen. -->
+
+### Environment
+
+ - OS (e.g., Linux):
+ - FunASR Version (e.g., 1.0.0):
+ - ModelScope Version (e.g., 1.11.0):
+ - PyTorch Version (e.g., 2.0.0):
+ - How you installed funasr (`pip`, source):
+ - Python version:
+ - GPU (e.g., V100M32)
+ - CUDA/cuDNN version (e.g., cuda11.7):
+ - Docker version (e.g., funasr-runtime-sdk-cpu-0.4.1)
+ - Any other relevant information:
+
+### Additional context
+
+<!-- Add any other context about the problem here. -->
--- a/modules/python/vendors/FunASR/.github/ISSUE_TEMPLATE/config.yaml
+++ b/modules/python/vendors/FunASR/.github/ISSUE_TEMPLATE/config.yaml
@@ -0,0 +1 @@
+blank_issues_enabled: false
--- a/modules/python/vendors/FunASR/.github/ISSUE_TEMPLATE/error_docs.md
+++ b/modules/python/vendors/FunASR/.github/ISSUE_TEMPLATE/error_docs.md
@@ -0,0 +1,15 @@
+---
+name: 📚 Documentation/Typos
+about: Report an issue related to documentation or a typo
+labels: 'documentation, needs triage'
+---
+
+## 📚 Documentation
+
+For typos and doc fixes, please go ahead and:
+
+1. Create an issue.
+2. Fix the typo.
+3. Submit a PR.
+
+Thanks!
--- a/modules/python/vendors/FunASR/.pre-commit-config.yaml
+++ b/modules/python/vendors/FunASR/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 24.4.0
+    hooks:
+      - id: black
+        args: ['--line-length=100']  # example argument; black uses 4-space indentation by default
--- a/modules/python/vendors/FunASR/Acknowledge.md
+++ b/modules/python/vendors/FunASR/Acknowledge.md
@@ -0,0 +1,10 @@
+## Acknowledge
+
+1. We borrowed a lot of code from [Kaldi](http://kaldi-asr.org/) for data preparation.
+2. We borrowed a lot of code from [ESPnet](https://github.com/espnet/espnet). FunASR follows the training and fine-tuning pipelines of ESPnet.
+3. We referred to [Wenet](https://github.com/wenet-e2e/wenet) when building the dataloader for large-scale data training.
+4. We acknowledge [ChinaTelecom](https://github.com/zhuzizyf/damo-fsmn-vad-infer-httpserver) for contributing the VAD runtime.
+5. We acknowledge [RapidAI](https://github.com/RapidAI) for contributing the Paraformer and CT_Transformer-punc runtime.
+6. We acknowledge [AiHealthx](http://www.aihealthx.com/) for contributing the websocket service and html5.
+7. We acknowledge [XVERSE](http://www.xverse.cn/index.html) for contributing the grpc service.
+8. We acknowledge [blt](https://github.com/bltcn) for developing and deploying the website.
--- a/modules/python/vendors/FunASR/MODEL_LICENSE
+++ b/modules/python/vendors/FunASR/MODEL_LICENSE
@@ -0,0 +1,74 @@
+FunASR Model Open Source License
+Version 1.0
+
+Copyright (C) [2023-2028] Alibaba Group. All rights reserved.
+
+Thank you for choosing the FunASR open source models. The FunASR open source models contain a series of open-source models that allow everyone to use, modify, share, and learn from it.
+
+To ensure better community collaboration, we have developed the following agreement and hope that you carefully read and abide by it.
+
+1 Definitions
+In this agreement, [FunASR software] refers to the FunASR open source model, and its derivatives, including fine-tuned models. [You] refer to individuals or organizations who use, modify, share, and learn from [FunASR software].
+
+2 License and Restrictions
+
+2.1 License
+You are free to use, copy, modify, and share [FunASR software] under the conditions of this agreement.
+
+2.2 Restrictions
+You should indicate the code and model source and author information when using, copying, modifying and sharing [FunASR software]. You should keep the relevant names of models in [FunASR software].
+
+3 Responsibility and Risk
+[FunASR software] is for reference and learning purposes only and is not responsible for any direct or indirect losses caused by your use or modification of [FunASR software]. You should take responsibility and risks for your use and modification of [FunASR software].
+
+4 Termination
+If you violate any terms of this agreement, your license will be automatically terminated, and you must stop using, copying, modifying, and sharing [FunASR software].
+
+5 Revision
+This agreement may be updated and revised from time to time. The revised agreement will be published in the FunASR official repository and automatically take effect. If you continue to use, copy, modify, and share [FunASR software], it means you agree to the revised agreement.
+
+6 Other Provisions
+This agreement is subject to the laws of [Country/Region]. If any provisions are found to be illegal, invalid, or unenforceable, they shall be deemed deleted from this agreement, and the remaining provisions shall remain valid and binding.
+
+If you have any questions or comments about this agreement, please contact us.
+
+Copyright (c) [2023-2028] Alibaba Group. All rights reserved.
+
+FunASR 模型开源协议
+
+版本号：1.0
+
+版权所有 (C) [2023-2028] [阿里巴巴集团]。保留所有权利。
+
+感谢您选择 FunASR 开源模型。FunASR 开源模型包含一系列免费且开源的工业模型，让大家可以使用、修改、分享和学习该模型。
+
+为了保证更好的社区合作，我们制定了以下协议，希望您仔细阅读并遵守本协议。
+
+1 定义
+本协议中，[FunASR 软件]指 FunASR 开源模型权重及其衍生品，包括 Finetune 后的模型；[您]指使用、修改、分享和学习[FunASR 软件]的个人或组织。
+
+2 许可和限制
+2.1 许可
+
+您可以在遵守本协议的前提下，自由地使用、复制、修改和分享[FunASR 软件]。
+
+2.2 限制
+
+您在使用、复制、修改和分享[FunASR 软件]时，必须注明出处以及作者信息，并保留[FunASR 软件]中相关模型名称。
+
+3 责任和风险承担
+[FunASR 软件]仅作为参考和学习使用，不对您使用或修改[FunASR 软件]造成的任何直接或间接损失承担任何责任。您对[FunASR 软件]的使用和修改应该自行承担风险。
+
+4 终止
+如果您违反本协议的任何条款，您的许可将自动终止，您必须停止使用、复制、修改和分享[FunASR 软件]。
+
+5 修订
+本协议可能会不时更新和修订。修订后的协议将在[FunASR 软件]官方仓库发布，并自动生效。如果您继续使用、复制、修改和分享[FunASR 软件]，即表示您同意修订后的协议。
+
+6 其他规定
+本协议受到[国家/地区] 的法律管辖。如果任何条款被裁定为不合法、无效或无法执行，则该条款应被视为从本协议中删除，而其余条款应继续有效并具有约束力。
+
+如果您对本协议有任何问题或意见，请联系我们。
+
+版权所有© [2023-2028] [阿里巴巴集团]。保留所有权利。
+
--- a/modules/python/vendors/FunASR/README.md
+++ b/modules/python/vendors/FunASR/README.md
@@ -0,0 +1,385 @@
+[//]: # (<div align="left"><img src="docs/images/funasr_logo.jpg" width="400"/></div>)
+
+([简体中文](./README_zh.md)|English)
+
+[//]: # (# FunASR: A Fundamental End-to-End Speech Recognition Toolkit)
+
+[![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=FunASR🤠&text2=💖%20A%20Fundamental%20End-to-End%20Speech%20Recognition%20Toolkit&width=800&height=210)](https://github.com/Akshay090/svg-banners)
+
+[![PyPI](https://img.shields.io/pypi/v/funasr)](https://pypi.org/project/funasr/)
+
+
+<strong>FunASR</strong> hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model, researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun！
+
+[**Highlights**](#highlights)
+| [**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new) 
+| [**Installation**](#installation)
+| [**Quick Start**](#quick-start)
+| [**Tutorial**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/tutorial/README.md)
+| [**Runtime**](./runtime/readme.md)
+| [**Model Zoo**](#model-zoo)
+| [**Contact**](#contact)
+
+
+<a name="highlights"></a>
+## Highlights
+- FunASR is a fundamental speech recognition toolkit that offers a variety of features, including speech recognition (ASR), Voice Activity Detection (VAD), Punctuation Restoration, Language Models, Speaker Verification, Speaker Diarization and multi-talker ASR. FunASR provides convenient scripts and tutorials, supporting inference and fine-tuning of pre-trained models.
+- We have released a vast collection of academic and industrial pretrained models on the [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) and [huggingface](https://huggingface.co/FunASR), which can be accessed through our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md). The representative [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), a non-autoregressive end-to-end speech recognition model, has the advantages of high accuracy, high efficiency, and convenient deployment, supporting the rapid construction of speech recognition services. For more details on service deployment, please refer to the [service deployment document](runtime/readme_cn.md). 
+
+
+<a name="whats-new"></a>
+## What's new:
+- 2024/10/10：Added support for the Whisper-large-v3-turbo model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from the [modelscope](examples/industrial_data_pretraining/whisper/demo.py), and [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py).
+- 2024/09/26: Offline File Transcription Service 4.6, Offline File Transcription Service of English 1.7，Real-time Transcription Service 1.11 released，fix memory leak & Support the SensevoiceSmall onnx model；File Transcription Service 2.0 GPU released, Fix GPU memory leak; ([docs](runtime/readme.md));
+- 2024/09/25: Keyword spotting models are now supported, with fine-tuning and inference for four models: [fsmn_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [fsmn_kws_mt](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [sanm_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-offline), [sanm_kws_streaming](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online).
+- 2024/07/04：[SenseVoice](https://github.com/FunAudioLLM/SenseVoice) is a speech foundation model with multiple speech understanding capabilities, including ASR, LID, SER, and AED.
+- 2024/07/01: Offline File Transcription Service GPU 1.1 released, optimize BladeDISC model compatibility issues; ref to ([docs](runtime/readme.md))
+- 2024/06/27: Offline File Transcription Service GPU 1.0 released, supporting dynamic batch processing and multi-threading concurrency. In the long audio test set, the single-thread RTF is 0.0076, and multi-threads' speedup is 1200+ (compared to 330+ on CPU); ref to ([docs](runtime/readme.md))
+- 2024/05/15: Emotion recognition models are now supported: [emotion2vec+large](https://modelscope.cn/models/iic/emotion2vec_plus_large/summary), [emotion2vec+base](https://modelscope.cn/models/iic/emotion2vec_plus_base/summary), [emotion2vec+seed](https://modelscope.cn/models/iic/emotion2vec_plus_seed/summary). They currently support the following categories: 0: angry, 1: happy, 2: neutral, 3: sad, 4: unknown.
+- 2024/05/15: Offline File Transcription Service 4.5, Offline File Transcription Service of English 1.6，Real-time Transcription Service 1.10 released，adapting to FunASR 1.0 model structure；([docs](runtime/readme.md))
+
+<details><summary>Full Changelog</summary>
+
+- 2024/03/05：Added the Qwen-Audio and Qwen-Audio-Chat large-scale audio-text multimodal models, which have topped multiple audio domain leaderboards. These models support speech dialogue, [usage](examples/industrial_data_pretraining/qwen_audio).
+- 2024/03/05: Added support for the Whisper-large-v3 model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from [modelscope](examples/industrial_data_pretraining/whisper/demo.py) or [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py).
+- 2024/03/05: Offline File Transcription Service 4.4, Offline File Transcription Service of English 1.5，Real-time Transcription Service 1.9 released，docker image supports ARM64 platform, update modelscope；([docs](runtime/readme.md))
+- 2024/01/30：funasr-1.0 has been released ([docs](https://github.com/alibaba-damo-academy/FunASR/discussions/1319))
+- 2024/01/30: Emotion recognition models are now supported: [model link](https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary), modified from [repo](https://github.com/ddlBoJack/emotion2vec).
+- 2024/01/25: Offline File Transcription Service 4.2 and Offline File Transcription Service of English 1.3 released, with an optimized VAD (Voice Activity Detection) data processing method that significantly reduces peak memory usage, plus memory leak optimization; Real-time Transcription Service 1.7 released, with an optimized client side; ([docs](runtime/readme.md))
+- 2024/01/09: The Funasr SDK for Windows version 2.0 has been released, featuring support for The offline file transcription service (CPU) of Mandarin 4.1, The offline file transcription service (CPU) of English 1.2, The real-time transcription service (CPU) of Mandarin 1.6. For more details, please refer to the official documentation or release notes([FunASR-Runtime-Windows](https://www.modelscope.cn/models/damo/funasr-runtime-win-cpu-x64/summary))
+- 2024/01/03: File Transcription Service 4.0 released, Added support for 8k models, optimized timestamp mismatch issues and added sentence-level timestamps, improved the effectiveness of English word FST hotwords, supported automated configuration of thread parameters, and fixed known crash issues as well as memory leak problems, refer to ([docs](runtime/readme.md#file-transcription-service-mandarin-cpu)).
+- 2024/01/03: Real-time Transcription Service 1.6 released，The 2pass-offline mode supports Ngram language model decoding and WFST hotwords, while also addressing known crash issues and memory leak problems, ([docs](runtime/readme.md#the-real-time-transcription-service-mandarin-cpu))
+- 2024/01/03: Fixed known crash issues as well as memory leak problems, ([docs](runtime/readme.md#file-transcription-service-english-cpu)).
+- 2023/12/04: The Funasr SDK for Windows version 1.0 has been released, featuring support for The offline file transcription service (CPU) of Mandarin, The offline file transcription service (CPU) of English, The real-time transcription service (CPU) of Mandarin. For more details, please refer to the official documentation or release notes([FunASR-Runtime-Windows](https://www.modelscope.cn/models/damo/funasr-runtime-win-cpu-x64/summary))
+- 2023/11/08: The offline file transcription service 3.0 (CPU) of Mandarin has been released, adding punctuation large model, Ngram language model, and wfst hot words. For detailed information, please refer to [docs](runtime#file-transcription-service-mandarin-cpu). 
+- 2023/10/17: The offline file transcription service (CPU) of English has been released. For more details, please refer to ([docs](runtime#file-transcription-service-english-cpu)).
+- 2023/10/13: [SlideSpeech](https://slidespeech.github.io/): A large scale multi-modal audio-visual corpus with a significant amount of real-time synchronized slides.
+- 2023/10/10: The ASR-SpeakersDiarization combined pipeline [Paraformer-VAD-SPK](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr_vad_spk/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/demo.py) is now released. Experience the model to get recognition results with speaker information.
+- 2023/10/07: [FunCodec](https://github.com/alibaba-damo-academy/FunCodec): A Fundamental, Reproducible and Integrable Open-source Toolkit for Neural Speech Codec.
+- 2023/09/01: The offline file transcription service 2.0 (CPU) of Mandarin has been released, with added support for ffmpeg, timestamp, and hotword models. For more details, please refer to ([docs](runtime#file-transcription-service-mandarin-cpu)).
+- 2023/08/07: The real-time transcription service (CPU) of Mandarin has been released. For more details, please refer to ([docs](runtime#the-real-time-transcription-service-mandarin-cpu)).
+- 2023/07/17: BAT is released, which is a low-latency and low-memory-consumption RNN-T model. For more details, please refer to ([BAT](egs/aishell/bat)).
+- 2023/06/26: ASRU2023 Multi-Channel Multi-Party Meeting Transcription Challenge 2.0 completed the competition and announced the results. For more details, please refer to ([M2MeT2.0](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)).
+
+</details>
+
+<a name="Installation"></a>
+## Installation
+
+- Requirements
+```text
+python>=3.8
+torch>=1.13
+torchaudio
+```
+
+- Install from PyPI
+```shell
+pip3 install -U funasr
+```
+- Or install from source code
+``` sh
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+pip3 install -e ./
+```
+- Install modelscope or huggingface_hub for the pretrained models (Optional)
+
+```shell
+pip3 install -U modelscope huggingface_hub
+```
+
+## Model Zoo
+FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](./MODEL_LICENSE). Below are some representative models, for more models please refer to the [Model Zoo](./model_zoo).
+
+(Note: ⭐ represents the ModelScope model zoo, 🤗 represents the Huggingface model zoo, 🍀 represents the OpenAI model zoo)
+
+
+|                                                                                                         Model Name                                                                                                         |                                   Task Details                                   |          Training Data           | Parameters |
+|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------:|:--------------------------------:|:----------:|
+|                                        SenseVoiceSmall <br> ([⭐](https://www.modelscope.cn/models/iic/SenseVoiceSmall)  [🤗](https://huggingface.co/FunAudioLLM/SenseVoiceSmall) )                                         | multiple speech understanding capabilities, including ASR, ITN, LID, SER, and AED, support languages such as zh, yue, en, ja, ko   |           300000 hours           |    234M    |
+|          paraformer-zh <br> ([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [🤗](https://huggingface.co/funasr/paraformer-zh) )           |                speech recognition, with timestamps, non-streaming                |      60000 hours, Mandarin       |    220M    |
+| <nobr>paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗](https://huggingface.co/funasr/paraformer-zh-streaming) )</nobr> |                          speech recognition, streaming                           |      60000 hours, Mandarin       |    220M    |
+|               paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗](https://huggingface.co/funasr/paraformer-en) )                |              speech recognition, without timestamps, non-streaming               |       50000 hours, English       |    220M    |
+|                            conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗](https://huggingface.co/funasr/conformer-en) )                             |                        speech recognition, non-streaming                         |       50000 hours, English       |    220M    |
+|                               ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗](https://huggingface.co/funasr/ct-punc) )                               |                             punctuation restoration                              |    100M, Mandarin and English    |    290M    | 
+|                                   fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗](https://huggingface.co/funasr/fsmn-vad) )                                   |                             voice activity detection                             | 5000 hours, Mandarin and English |    0.4M    | 
+|                                                              fsmn-kws <br> ( [⭐](https://modelscope.cn/models/iic/speech_charctc_kws_phone-xiaoyun/summary) )                                                              |     keyword spotting，streaming      |  5000 hours, Mandarin  |    0.7M    | 
+|                                     fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗](https://huggingface.co/funasr/fa-zh) )                                     |                               timestamp prediction                               |       5000 hours, Mandarin       |    38M     | 
+|                                       cam++ <br> ( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗](https://huggingface.co/funasr/campplus) )                                        |                         speaker verification/diarization                         |            5000 hours            |    7.2M    | 
+|                                            Whisper-large-v3 <br> ([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)  [🍀](https://github.com/openai/whisper) )                                            |                speech recognition, with timestamps, non-streaming                |           multilingual           |   1550 M   |
+|                                      Whisper-large-v3-turbo <br> ([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3-turbo/summary)  [🍀](https://github.com/openai/whisper) )                                      |                speech recognition, with timestamps, non-streaming                |           multilingual           |   809 M    |
+|                                               Qwen-Audio <br> ([⭐](examples/industrial_data_pretraining/qwen_audio/demo.py)  [🤗](https://huggingface.co/Qwen/Qwen-Audio) )                                                |                    audio-text multimodal models (pretraining)                    |           multilingual           |     8B     |
+|                                        Qwen-Audio-Chat <br> ([⭐](examples/industrial_data_pretraining/qwen_audio/demo_chat.py)  [🤗](https://huggingface.co/Qwen/Qwen-Audio-Chat) )                                        |                       audio-text multimodal models (chat)                        |           multilingual           |     8B     |
+|                              emotion2vec+large <br> ([⭐](https://modelscope.cn/models/iic/emotion2vec_plus_large/summary)  [🤗](https://huggingface.co/emotion2vec/emotion2vec_plus_large) )                               |                           speech emotion recognition                             |           40000 hours            |    300M    |
+
+
+
+
+[//]: # ()
+[//]: # (FunASR supports pre-trained or further fine-tuned models for deployment as a service. The CPU version of the Chinese offline file conversion service has been released, details can be found in [docs]&#40;funasr/runtime/docs/SDK_tutorial.md&#41;. More detailed information about service deployment can be found in the [deployment roadmap]&#40;funasr/runtime/readme_cn.md&#41;.)
+
+
+<a name="quick-start"></a>
+## Quick Start
+
+Below is a quick start tutorial. Test audio files ([Mandarin](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav), [English](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav)).
+
+### Command-line usage
+
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
+```
+
+Note: The command supports recognition of a single audio file, as well as a file list in Kaldi-style wav.scp format: `wav_id wav_path`
+
+### Speech Recognition (Non-streaming)
+#### SenseVoice
+```python
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+model_dir = "iic/SenseVoiceSmall"
+
+model = AutoModel(
+    model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cuda:0",
+)
+
+# en
+res = model.generate(
+    input=f"{model.model_path}/example/en.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+```
+Parameter Description:
+- `model_dir`: The name of the model, or the path to the model on the local disk.
+- `vad_model`: This indicates the activation of VAD (Voice Activity Detection). The purpose of VAD is to split long audio into shorter clips. In this case, the inference time includes both VAD and SenseVoice total consumption, and represents the end-to-end latency. If you wish to test the SenseVoice model's inference time separately, the VAD model can be disabled.
+- `vad_kwargs`: Specifies the configurations for the VAD model. `max_single_segment_time`: denotes the maximum duration for audio segmentation by the `vad_model`, with the unit being milliseconds (ms).
+- `use_itn`: Whether the output result includes punctuation and inverse text normalization.
+- `batch_size_s`: Indicates the use of dynamic batching, where the total duration of audio in the batch is measured in seconds (s).
+- `merge_vad`: Whether to merge short audio fragments segmented by the VAD model, with the merged length being `merge_length_s`, in seconds (s).
+- `ban_emo_unk`: Whether to ban the output of the `emo_unk` token.
+
+#### Paraformer
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh",  vad_model="fsmn-vad",  punc_model="ct-punc", 
+                  # spk_model="cam++", 
+                  )
+res = model.generate(input=f"{model.model_path}/example/asr_example.wav", 
+                     batch_size_s=300, 
+                     hotword='魔搭')
+print(res)
+```
+Note: `hub`: represents the model repository, `ms` stands for selecting ModelScope download, `hf` stands for selecting Huggingface download.
+
+### Speech Recognition (Streaming)
+```python
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=9600`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
+
+<details><summary>More Examples</summary>
+
+### Voice Activity Detection (Non-Streaming)
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad")
+wav_file = f"{model.model_path}/example/vad_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+Note: The output format of the VAD model is: `[[beg1, end1], [beg2, end2], ..., [begN, endN]]`, where `begN/endN` indicates the starting/ending point of the `N-th` valid audio segment, measured in milliseconds.
+
+### Voice Activity Detection (Streaming)
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+Note: The output format for the streaming VAD model can be one of four scenarios:
+- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`：The same as the offline VAD output result mentioned above.
+- `[[beg, -1]]`：Indicates that only a starting point has been detected.
+- `[[-1, end]]`：Indicates that only an ending point has been detected.
+- `[]`：Indicates that neither a starting point nor an ending point has been detected. 
+
+The output is measured in milliseconds and represents the absolute time from the starting point.
+### Punctuation Restoration
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc")
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+### Timestamp Prediction
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh")
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+
+
+### Speech Emotion Recognition
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="emotion2vec_plus_large")
+
+wav_file = f"{model.model_path}/example/test.wav"
+
+res = model.generate(wav_file, output_dir="./outputs", granularity="utterance", extract_embedding=False)
+print(res)
+```
+
+For more usage, refer to the [docs](docs/tutorial/README.md);
+for more examples, refer to the [demo](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining).
+
+</details>
+
+## Export ONNX
+
+### Command-line usage
+```shell
+funasr-export ++model=paraformer ++quantize=false ++device=cpu
+```
+
+### Python
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer", device="cpu")
+
+res = model.export(quantize=False)
+```
+
+### Test ONNX
+```python
+# pip3 install -U funasr-onnx
+from funasr_onnx import Paraformer
+model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1, quantize=True)
+
+wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+result = model(wav_path)
+print(result)
+```
+
+For more examples, refer to the [demo](runtime/python/onnxruntime).
+
+## Deployment Service
+FunASR supports deploying pre-trained or further fine-tuned models for service. Currently, it supports the following types of service deployment:
+- File transcription service, Mandarin, CPU version, done
+- The real-time transcription service, Mandarin (CPU), done
+- File transcription service, English, CPU version, done
+- File transcription service, Mandarin, GPU version, done
+- and more.
+
+For more detailed information, please refer to the [service deployment documentation](runtime/readme.md).
+
+
+<a name="contact"></a>
+## Community Communication
+If you encounter problems in use, you can directly raise Issues on the github page.
+
+You can also scan the following DingTalk group to join the community group for communication and discussion.
+
+|                           DingTalk group                            |
+|:-------------------------------------------------------------------:|
+| <div align="left"><img src="docs/images/dingding.png" width="250"/></div> |
+
+## Contributors
+
+| <div align="left"><img src="docs/images/alibaba.png" width="260"/></div> | <div align="left"><img src="docs/images/nwpu.png" width="260"/></div> | <img src="docs/images/China_Telecom.png" width="200"/> | <img src="docs/images/RapidAI.png" width="200"/> | <img src="docs/images/aihealthx.png" width="200"/> | <img src="docs/images/XVERSE.png" width="250"/> |
+|:------------------------------------------------------------------:|:---------------------------------------------------------------:|:--------------------------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------------------:|:------------------------------------------------------:|
+
+The contributors can be found in [contributors list](./Acknowledge.md)
+
+## License
+This project is licensed under [The MIT License](https://opensource.org/licenses/MIT). FunASR also contains various third-party components and some code modified from other repos under other open source licenses.
+The use of the pretrained models is subject to the [model license](./MODEL_LICENSE).
+
+
+## Citations
+``` bibtex
+@inproceedings{gao2023funasr,
+  author={Zhifu Gao and Zerui Li and Jiaming Wang and Haoneng Luo and Xian Shi and Mengzhe Chen and Yabin Li and Lingyun Zuo and Zhihao Du and Zhangyu Xiao and Shiliang Zhang},
+  title={FunASR: A Fundamental End-to-End Speech Recognition Toolkit},
+  year={2023},
+  booktitle={INTERSPEECH},
+}
+@inproceedings{An2023bat,
+  author={Keyu An and Xian Shi and Shiliang Zhang},
+  title={BAT: Boundary aware transducer for memory-efficient and low-latency ASR},
+  year={2023},
+  booktitle={INTERSPEECH},
+}
+@inproceedings{gao22b_interspeech,
+  author={Zhifu Gao and ShiLiang Zhang and Ian McLoughlin and Zhijie Yan},
+  title={Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition},
+  year=2022,
+  booktitle={Proc. Interspeech 2022},
+  pages={2063--2067},
+  doi={10.21437/Interspeech.2022-9996}
+}
+@inproceedings{shi2023seaco,
+  author={Xian Shi and Yexin Yang and Zerui Li and Yanni Chen and Zhifu Gao and Shiliang Zhang},
+  title={SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and Effective Hotword Customization Ability},
+  year={2023},
+  booktitle={ICASSP2024}
+}
+```
--- a/modules/python/vendors/FunASR/README_zh.md
+++ b/modules/python/vendors/FunASR/README_zh.md
@@ -0,0 +1,392 @@
+[//]: # (<div align="left"><img src="docs/images/funasr_logo.jpg" width="400"/></div>)
+
+(简体中文|[English](./README.md))
+
+
+
+[![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=FunASR🤠&text2=💖%20A%20Fundamental%20End-to-End%20Speech%20Recognition%20Toolkit&width=800&height=210)](https://github.com/Akshay090/svg-banners)
+
+[//]: # (# FunASR: A Fundamental End-to-End Speech Recognition Toolkit)
+
+[![PyPI](https://img.shields.io/pypi/v/funasr)](https://pypi.org/project/funasr/)
+
+
+FunASR希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调，研究人员和开发人员可以更方便地进行语音识别模型的研究和生产，并推动语音识别生态的发展。让语音识别更有趣！
+
+<div align="center">  
+<h4>
+ <a href="#核心功能"> 核心功能 </a>   
+｜<a href="#最新动态"> 最新动态 </a>
+｜<a href="#安装教程"> 安装 </a>
+｜<a href="#快速开始"> 快速开始 </a>
+｜<a href="https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/tutorial/README_zh.md"> 教程文档 </a>
+｜<a href="#模型仓库"> 模型仓库 </a>
+｜<a href="#服务部署"> 服务部署 </a>
+｜<a href="#联系我们"> 联系我们 </a>
+</h4>
+</div>
+
+<a name="核心功能"></a>
+## 核心功能
+- FunASR是一个基础语音识别工具包，提供多种功能，包括语音识别（ASR）、语音端点检测（VAD）、标点恢复、语言模型、说话人验证、说话人分离和多人对话语音识别等。FunASR提供了便捷的脚本和教程，支持预训练好的模型的推理与微调。
+- 我们在[ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)与[huggingface](https://huggingface.co/FunASR)上发布了大量开源数据集或者海量工业数据训练的模型，可以通过我们的[模型仓库](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)了解模型的详细信息。代表性的[Paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)非自回归端到端语音识别模型具有高精度、高效率、便捷部署的优点，支持快速构建语音识别服务，详细信息可以阅读([服务部署文档](runtime/readme_cn.md))。
+
+<a name="最新动态"></a>
+## 最新动态
+- 2024/10/10：新增加Whisper-large-v3-turbo模型支持，多语言语音识别/翻译/语种识别，支持从 [modelscope](examples/industrial_data_pretraining/whisper/demo.py)仓库下载，也支持从 [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py)仓库下载模型。
+- 2024/09/26: 中文离线文件转写服务 4.6、英文离线文件转写服务 1.7、中文实时语音听写服务 1.11 发布，修复ONNX内存泄漏、支持SensevoiceSmall onnx模型；中文离线文件转写服务GPU 2.0 发布，修复显存泄漏; 详细信息参阅([部署文档](runtime/readme_cn.md))
+- 2024/09/25：新增语音唤醒模型，支持[fsmn_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [fsmn_kws_mt](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [sanm_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-offline), [sanm_kws_streaming](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online) 4个模型的微调和推理。
+- 2024/07/04：[SenseVoice](https://github.com/FunAudioLLM/SenseVoice) 是一个基础语音理解模型，具备多种语音理解能力，涵盖了自动语音识别（ASR）、语言识别（LID）、情感识别（SER）以及音频事件检测（AED）。
+- 2024/07/01：中文离线文件转写服务GPU版本 1.1发布，优化bladedisc模型兼容性问题；详细信息参阅([部署文档](runtime/readme_cn.md))
+- 2024/06/27：中文离线文件转写服务GPU版本 1.0发布，支持动态batch，支持多路并发，在长音频测试集上单线RTF为0.0076，多线加速比为1200+（CPU为330+）；详细信息参阅([部署文档](runtime/readme_cn.md))
+- 2024/05/15：新增加情感识别模型，[emotion2vec+large](https://modelscope.cn/models/iic/emotion2vec_plus_large/summary)，[emotion2vec+base](https://modelscope.cn/models/iic/emotion2vec_plus_base/summary)，[emotion2vec+seed](https://modelscope.cn/models/iic/emotion2vec_plus_seed/summary)，输出情感类别为：生气/angry，开心/happy，中立/neutral，难过/sad。
+- 2024/05/15: 中文离线文件转写服务 4.5、英文离线文件转写服务 1.6、中文实时语音听写服务 1.10 发布，适配FunASR 1.0模型结构；详细信息参阅([部署文档](runtime/readme_cn.md))
+- 2024/03/05：新增加Qwen-Audio与Qwen-Audio-Chat音频文本模态大模型，在多个音频领域测试榜单刷榜，并支持语音对话，详细用法见 [示例](examples/industrial_data_pretraining/qwen_audio)。
+- 2024/03/05：新增加Whisper-large-v3模型支持，多语言语音识别/翻译/语种识别，支持从 [modelscope](examples/industrial_data_pretraining/whisper/demo.py)仓库下载，也支持从 [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py)仓库下载模型。
+- 2024/03/05: 中文离线文件转写服务 4.4、英文离线文件转写服务 1.5、中文实时语音听写服务 1.9 发布，docker镜像支持arm64平台，升级modelscope版本；详细信息参阅([部署文档](runtime/readme_cn.md))
+- 2024/01/30：funasr-1.0发布，更新说明[文档](https://github.com/alibaba-damo-academy/FunASR/discussions/1319)
+
+<details><summary>展开日志</summary>
+
+- 2024/01/30：新增加情感识别 [模型链接](https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary)，原始模型 [repo](https://github.com/ddlBoJack/emotion2vec).
+- 2024/01/25: 中文离线文件转写服务 4.2、英文离线文件转写服务 1.3，优化vad数据处理方式，大幅降低峰值内存占用，内存泄漏优化；中文实时语音听写服务 1.7 发布，客户端优化；详细信息参阅([部署文档](runtime/readme_cn.md))
+- 2024/01/09: funasr社区软件包windows 2.0版本发布，支持软件包中文离线文件转写4.1、英文离线文件转写1.2、中文实时听写服务1.6的最新功能，详细信息参阅([FunASR社区软件包windows版本](https://www.modelscope.cn/models/damo/funasr-runtime-win-cpu-x64/summary))
+- 2024/01/03: 中文离线文件转写服务 4.0 发布，新增支持8k模型、优化时间戳不匹配问题及增加句子级别时间戳、优化英文单词fst热词效果、支持自动化配置线程参数，同时修复已知的crash问题及内存泄漏问题，详细信息参阅([部署文档](runtime/readme_cn.md#中文离线文件转写服务cpu版本))
+- 2024/01/03: 中文实时语音听写服务 1.6 发布，2pass-offline模式支持Ngram语言模型解码、wfst热词，同时修复已知的crash问题及内存泄漏问题，详细信息参阅([部署文档](runtime/readme_cn.md#中文实时语音听写服务cpu版本))
+- 2024/01/03: 英文离线文件转写服务 1.2 发布，修复已知的crash问题及内存泄漏问题，详细信息参阅([部署文档](runtime/readme_cn.md#英文离线文件转写服务cpu版本))
+- 2023/12/04: funasr社区软件包windows 1.0版本发布，支持中文离线文件转写、英文离线文件转写、中文实时听写服务，详细信息参阅([FunASR社区软件包windows版本](https://www.modelscope.cn/models/damo/funasr-runtime-win-cpu-x64/summary))
+- 2023/11/08：中文离线文件转写服务3.0 CPU版本发布，新增标点大模型、Ngram语言模型与wfst热词，详细信息参阅([部署文档](runtime/readme_cn.md#中文离线文件转写服务cpu版本))
+- 2023/10/17: 英文离线文件转写服务一键部署的CPU版本发布，详细信息参阅([部署文档](runtime/readme_cn.md#英文离线文件转写服务cpu版本))
+- 2023/10/13: [SlideSpeech](https://slidespeech.github.io/): 一个大规模的多模态音视频语料库，主要是在线会议或者在线课程场景，包含了大量与发言人讲话实时同步的幻灯片。
+- 2023.10.10: [Paraformer-long-Spk](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr_vad_spk/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/demo.py)模型发布，支持在长语音识别的基础上获取每句话的说话人标签。
+- 2023.10.07: [FunCodec](https://github.com/alibaba-damo-academy/FunCodec): FunCodec提供开源模型和训练工具，可以用于音频离散编码，以及基于离散编码的语音识别、语音合成等任务。
+- 2023.09.01: 中文离线文件转写服务2.0 CPU版本发布，新增ffmpeg、时间戳与热词模型支持，详细信息参阅([部署文档](runtime/readme_cn.md#中文离线文件转写服务cpu版本))
+- 2023.08.07: 中文实时语音听写服务一键部署的CPU版本发布，详细信息参阅([部署文档](runtime/readme_cn.md#中文实时语音听写服务cpu版本))
+- 2023.07.17: BAT一种低延迟低内存消耗的RNN-T模型发布，详细信息参阅（[BAT](egs/aishell/bat)）
+- 2023.06.26: ASRU2023 多通道多方会议转录挑战赛2.0完成竞赛结果公布，详细信息参阅（[M2MeT2.0](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)）
+
+</details>
+
+<a name="安装教程"></a>
+## 安装教程
+
+- 安装funasr之前，确保已经安装了下面依赖环境:
+```text
+python>=3.8
+torch>=1.13
+torchaudio
+```
+
+- pip安装
+```shell
+pip3 install -U funasr
+```
+
+- 或者从源代码安装
+``` sh
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+pip3 install -e ./
+```
+
+如果需要使用工业预训练模型，安装modelscope与huggingface_hub（可选）
+
+```shell
+pip3 install -U modelscope huggingface huggingface_hub
+```
+
+## 模型仓库
+
+FunASR开源了大量在工业数据上预训练模型，您可以在[模型许可协议](./MODEL_LICENSE)下自由使用、复制、修改和分享FunASR模型，下面列举代表性的模型，更多模型请参考 [模型仓库](./model_zoo)。
+
+（注：⭐ 表示ModelScope模型仓库，🤗 表示Huggingface模型仓库，🍀表示OpenAI模型仓库）
+
+
+|                                                                                                     模型名字                                                                                                      |        任务详情        |      训练数据      |  参数量   | 
+|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------:|:--------------:|:------:|
+|                                  SenseVoiceSmall <br> ([⭐](https://www.modelscope.cn/models/iic/SenseVoiceSmall)  [🤗](https://huggingface.co/FunAudioLLM/SenseVoiceSmall) )                                  |  多种语音理解能力，涵盖了自动语音识别（ASR）、语言识别（LID）、情感识别（SER）以及音频事件检测（AED）   |  400000小时，中文   |  330M  |
+|    paraformer-zh <br> ([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [🤗](https://huggingface.co/funasr/paraformer-zh) )    |  语音识别，带时间戳输出，非实时   |   60000小时，中文   |  220M  |
+| paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗](https://huggingface.co/funasr/paraformer-zh-streaming) ) |      语音识别，实时       |   60000小时，中文   |  220M  |
+|         paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗](https://huggingface.co/funasr/paraformer-en) )         |      语音识别，非实时      |   50000小时，英文   |  220M  |
+|                      conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗](https://huggingface.co/funasr/conformer-en) )                      |      语音识别，非实时      |   50000小时，英文   |  220M  |
+|                        ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗](https://huggingface.co/funasr/ct-punc) )                         |        标点恢复        |   100M，中文与英文   |  290M  | 
+|                            fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗](https://huggingface.co/funasr/fsmn-vad) )                             |     语音端点检测，实时      |  5000小时，中文与英文  |  0.4M  | 
+|                                                       fsmn-kws <br> ( [⭐](https://modelscope.cn/models/iic/speech_charctc_kws_phone-xiaoyun/summary) )                                                        |     语音唤醒，实时      |  5000小时，中文  |  0.7M  | 
+|                              fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗](https://huggingface.co/funasr/fa-zh) )                               |      字级别时间戳预测      |   50000小时，中文   |  38M   |
+|                                 cam++ <br> ( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗](https://huggingface.co/funasr/campplus) )                                 |      说话人确认/分割      |     5000小时     |  7.2M  | 
+|                                     Whisper-large-v3 <br> ([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)  [🍀](https://github.com/openai/whisper) )                                      |  语音识别，带时间戳输出，非实时   |      多语言       | 1550 M |
+|                               Whisper-large-v3-turbo <br> ([⭐](https://www.modelscope.cn/models/iic/Whisper-large-v3-turbo/summary)  [🍀](https://github.com/openai/whisper) )                                |  语音识别，带时间戳输出，非实时   |      多语言       | 809 M |
+|                                         Qwen-Audio <br> ([⭐](examples/industrial_data_pretraining/qwen_audio/demo.py)  [🤗](https://huggingface.co/Qwen/Qwen-Audio) )                                         |  音频文本多模态大模型（预训练）   |      多语言       |   8B   |
+|                                 Qwen-Audio-Chat <br> ([⭐](examples/industrial_data_pretraining/qwen_audio/demo_chat.py)  [🤗](https://huggingface.co/Qwen/Qwen-Audio-Chat) )                                  | 音频文本多模态大模型（chat版本） |      多语言       |   8B   |
+|                        emotion2vec+large <br> ([⭐](https://modelscope.cn/models/iic/emotion2vec_plus_large/summary)  [🤗](https://huggingface.co/emotion2vec/emotion2vec_plus_large) )                        |    情感识别模型          | 40000小时，4种情感类别 |  300M  |
+
+<a name="快速开始"></a>
+## 快速开始
+
+下面为快速上手教程，测试音频（[中文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav)，[英文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav)）
+
+### 可执行命令行
+
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
+```
+
+注：支持单条音频文件识别，也支持文件列表，列表为kaldi风格wav.scp：`wav_id   wav_path`
+
+### 非实时语音识别
+#### SenseVoice
+```python
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+model_dir = "iic/SenseVoiceSmall"
+
+model = AutoModel(
+    model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cuda:0",
+)
+
+# en
+res = model.generate(
+    input=f"{model.model_path}/example/en.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+```
+参数说明：
+- `model_dir`：模型名称，或本地磁盘中的模型路径。
+- `vad_model`：表示开启VAD，VAD的作用是将长音频切割成短音频，此时推理耗时包括了VAD与SenseVoice总耗时，为链路耗时，如果需要单独测试SenseVoice模型耗时，可以关闭VAD模型。
+- `vad_kwargs`：表示VAD模型配置,`max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms。
+- `use_itn`：输出结果中是否包含标点与逆文本正则化。
+- `batch_size_s`：表示采用动态batch，batch中总音频时长，单位为秒s。
+- `merge_vad`：是否将 vad 模型切割的短音频碎片合成，合并后长度为`merge_length_s`，单位为秒s。
+- `ban_emo_unk`：禁用emo_unk标签，禁用后所有的句子都会被赋予情感标签。
+
+#### Paraformer
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh",  vad_model="fsmn-vad", punc_model="ct-punc", 
+                  # spk_model="cam++"
+                  )
+res = model.generate(input=f"{model.model_path}/example/asr_example.wav", 
+            batch_size_s=300, 
+            hotword='魔搭')
+print(res)
+```
+注：`hub`：表示模型仓库，`ms`为选择modelscope下载，`hf`为选择huggingface下载。
+
+### 实时语音识别
+
+```python
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+
+注：`chunk_size`为流式延时配置，`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`，未来信息为`5*60=300ms`。每次推理输入为`600ms`（采样点数为`16000*0.6=9600`），输出为对应文字，最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
+
+<details><summary>更多例子</summary>
+
+### 语音端点检测（非实时）
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad")
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+注：VAD模型输出格式为：`[[beg1, end1], [beg2, end2], .., [begN, endN]]`，其中`begN/endN`表示第`N`个有效音频片段的起始点/结束点，
+单位为毫秒。
+
+### 语音端点检测（实时）
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+注：流式VAD模型输出格式为4种情况：
+- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`：同上离线VAD输出结果。
+- `[[beg, -1]]`：表示只检测到起始点。
+- `[[-1, end]]`：表示只检测到结束点。
+- `[]`：表示既没有检测到起始点，也没有检测到结束点
+输出结果单位为毫秒，从起始点开始的绝对时间。
+
+### 标点恢复
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc")
+
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+
+### 时间戳预测
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+
+### 情感识别
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="emotion2vec_plus_large")
+
+wav_file = f"{model.model_path}/example/test.wav"
+
+res = model.generate(wav_file, output_dir="./outputs", granularity="utterance", extract_embedding=False)
+print(res)
+```
+
+更详细（[教程文档](docs/tutorial/README_zh.md)），
+更多（[模型示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）
+
+</details>
+
+## 导出ONNX
+### 从命令行导出
+```shell
+funasr-export ++model=paraformer ++quantize=false
+```
+
+### 从Python导出
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer")
+
+res = model.export(quantize=False)
+```
+
+### 测试ONNX
+```python
+# pip3 install -U funasr-onnx
+from funasr_onnx import Paraformer
+model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1, quantize=True)
+
+wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+result = model(wav_path)
+print(result)
+```
+
+更多例子请参考 [样例](runtime/python/onnxruntime)
+
+<a name="服务部署"></a>
+## 服务部署
+FunASR支持预训练或者进一步微调的模型进行服务部署。目前支持以下几种服务部署：
+
+- 中文离线文件转写服务（CPU版本），已完成
+- 中文流式语音识别服务（CPU版本），已完成
+- 英文离线文件转写服务（CPU版本），已完成
+- 中文离线文件转写服务（GPU版本），已完成
+- 更多支持中
+
+详细信息可以参阅([服务部署文档](runtime/readme_cn.md))。
+
+
+<a name="社区交流"></a>
+## 联系我们
+
+如果您在使用中遇到问题，可以直接在github页面提Issues。欢迎语音兴趣爱好者扫描以下的钉钉群二维码加入社区群，进行交流和讨论。
+
+|                                 钉钉群                                 |
+|:-------------------------------------------------------------------:|
+| <div align="left"><img src="docs/images/dingding.png" width="250"/></div> |
+
+## 社区贡献者
+
+| <div align="left"><img src="docs/images/alibaba.png" width="260"/></div> | <div align="left"><img src="docs/images/nwpu.png" width="260"/></div> | <img src="docs/images/China_Telecom.png" width="200"/> | <img src="docs/images/RapidAI.png" width="200"/> | <img src="docs/images/aihealthx.png" width="200"/> | <img src="docs/images/XVERSE.png" width="250"/> |
+|:------------------------------------------------------------------:|:---------------------------------------------------------------:|:--------------------------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------------------:|:------------------------------------------------------:|
+
+贡献者名单请参考（[致谢名单](./Acknowledge.md)）
+
+
+## 许可协议
+项目遵循[The MIT License](https://opensource.org/licenses/MIT)开源协议，模型许可协议请参考（[模型协议](./MODEL_LICENSE)）
+
+
+## 论文引用
+
+``` bibtex
+@inproceedings{gao2023funasr,
+  author={Zhifu Gao and Zerui Li and Jiaming Wang and Haoneng Luo and Xian Shi and Mengzhe Chen and Yabin Li and Lingyun Zuo and Zhihao Du and Zhangyu Xiao and Shiliang Zhang},
+  title={FunASR: A Fundamental End-to-End Speech Recognition Toolkit},
+  year={2023},
+  booktitle={INTERSPEECH},
+}
+@inproceedings{An2023bat,
+  author={Keyu An and Xian Shi and Shiliang Zhang},
+  title={BAT: Boundary aware transducer for memory-efficient and low-latency ASR},
+  year={2023},
+  booktitle={INTERSPEECH},
+}
+@inproceedings{gao22b_interspeech,
+  author={Zhifu Gao and ShiLiang Zhang and Ian McLoughlin and Zhijie Yan},
+  title={{Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition}},
+  year=2022,
+  booktitle={Proc. Interspeech 2022},
+  pages={2063--2067},
+  doi={10.21437/Interspeech.2022-9996}
+}
+@article{shi2023seaco,
+  author={Xian Shi and Yexin Yang and Zerui Li and Yanni Chen and Zhifu Gao and Shiliang Zhang},
+  title={{SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and Effective Hotword Customization Ability}},
+  year=2023,
+  journal={arXiv preprint arXiv:2308.03266(accepted by ICASSP2024)},
+}
+```
--- a/modules/python/vendors/FunASR/benchmarks/benchmark_pipeline_cer.md
+++ b/modules/python/vendors/FunASR/benchmarks/benchmark_pipeline_cer.md
@@ -0,0 +1,216 @@
+# Leaderboard IO
+
+
+## Configuration
+### Data set:
+[Aishell1](https://www.openslr.org/33/): dev, test
+
+[Aishell2](https://www.aishelltech.com/aishell_2): dev_ios, test_ios, test_android, test_mic
+
+[WenetSpeech](https://github.com/wenet-e2e/WenetSpeech): dev, test_meeting, test_net
+
+
+### Tools
+#### [Install Requirements](https://alibaba-damo-academy.github.io/FunASR/en/installation/installation.html#installation)
+
+Install ModelScope and FunASR from pip
+```shell
+pip install -U modelscope funasr
+# For the users in China, you could install with the command:
+#pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+Or install FunASR from source code
+```shell
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+pip install -e ./
+# For the users in China, you could install with the command:
+# pip install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+
+#### Recipe
+
+
+##### [Test CER](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/asr_pipeline.html#inference-with-multi-thread-cpus-or-multi-gpus)
+Set the `model`, `data_dir` and `output_dir` in `infer.sh`.
+```shell
+cd egs_modelscope/asr/TEMPLATE
+bash infer.sh
+```
+
+## Benchmark CER
+
+
+### Chinese Dataset
+
+
+<table border="1">
+    <tr align="center">
+        <td style="border: 1px solid">Model</td>
+        <td style="border: 1px solid">Offline/Online</td>
+        <td colspan="2" style="border: 1px solid">Aishell1</td>
+        <td colspan="4" style="border: 1px solid">Aishell2</td>
+        <td colspan="3" style="border: 1px solid">WenetSpeech</td>
+    </tr>
+    <tr align="center">
+        <td style="border: 1px solid"></td>
+        <td style="border: 1px solid"></td>
+        <td style="border: 1px solid">dev</td> 
+        <td style="border: 1px solid">test</td>
+        <td style="border: 1px solid">dev_ios</td>
+        <td style="border: 1px solid">test_ios</td>
+        <td style="border: 1px solid">test_android</td>
+        <td style="border: 1px solid">test_mic</td>
+        <td style="border: 1px solid">dev</td>
+        <td style="border: 1px solid">test_meeting</td>
+        <td style="border: 1px solid">test_net</td>
+    </tr>
+    <tr align="center">
+        <td style="border: 1px solid"> <a href="https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary">Paraformer-large</a> </td>
+        <td style="border: 1px solid">Offline</td>
+        <td style="border: 1px solid">1.76</td>
+        <td style="border: 1px solid">1.94</td>
+        <td style="border: 1px solid">2.79</td>
+        <td style="border: 1px solid">2.84</td>
+        <td style="border: 1px solid">3.08</td>
+        <td style="border: 1px solid">3.03</td>
+        <td style="border: 1px solid">3.43</td>
+        <td style="border: 1px solid">7.01</td>
+        <td style="border: 1px solid">6.66</td>
+    </tr>
+    <tr align="center">
+        <td style="border: 1px solid"> <a href="https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary">Paraformer-large-long</a> </td> 
+        <td style="border: 1px solid">Offline</td>      
+        <td style="border: 1px solid">1.80</td>
+        <td style="border: 1px solid">2.10</td>
+        <td style="border: 1px solid">2.78</td>
+        <td style="border: 1px solid">2.87</td>
+        <td style="border: 1px solid">3.12</td>
+        <td style="border: 1px solid">3.11</td>
+        <td style="border: 1px solid">3.44</td>
+        <td style="border: 1px solid">13.28</td>
+        <td style="border: 1px solid">7.08</td>
+    </tr>
+    <tr align="center">
+        <td style="border: 1px solid"> <a href="https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary">Paraformer-large-contextual</a> </td>
+        <td style="border: 1px solid">Offline</td>
+        <td style="border: 1px solid">1.76</td>
+        <td style="border: 1px solid">2.02</td>
+        <td style="border: 1px solid">2.73</td>
+        <td style="border: 1px solid">2.85</td>
+        <td style="border: 1px solid">2.98</td>
+        <td style="border: 1px solid">2.95</td>
+        <td style="border: 1px solid">3.42</td>
+        <td style="border: 1px solid">7.16</td>
+        <td style="border: 1px solid">6.72</td>
+    </tr>
+    <tr align="center">
+        <td style="border: 1px solid"> <a href="https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary">Paraformer-large-online</a> </td>
+        <td style="border: 1px solid">Online</td>
+        <td style="border: 1px solid">2.37</td>
+        <td style="border: 1px solid">3.34</td>
+        <td style="border: 1px solid">4.04</td>
+        <td style="border: 1px solid">3.86</td>
+        <td style="border: 1px solid">4.38</td>
+        <td style="border: 1px solid">4.21</td>
+        <td style="border: 1px solid">4.55</td>
+        <td style="border: 1px solid">10.64</td>
+        <td style="border: 1px solid">7.78</td>
+    </tr>
+    <tr align="center">
+        <td style="border: 1px solid"> <a href="https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary">Paraformer</a> </td> 
+        <td style="border: 1px solid">Offline</td>
+        <td style="border: 1px solid">3.24</td>
+        <td style="border: 1px solid">3.69</td>
+        <td style="border: 1px solid">4.58</td>
+        <td style="border: 1px solid">4.63</td>
+        <td style="border: 1px solid">4.83</td>
+        <td style="border: 1px solid">4.71</td>
+        <td style="border: 1px solid">4.19</td>
+        <td style="border: 1px solid">8.32</td>
+        <td style="border: 1px solid">9.19</td>
+    </tr>
+   <tr align="center">
+        <td style="border: 1px solid"> <a href="https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary">UniASR</a> </td> 
+        <td style="border: 1px solid">Online</td>
+        <td style="border: 1px solid">3.34</td>
+        <td style="border: 1px solid">3.99</td>
+        <td style="border: 1px solid">4.62</td>
+        <td style="border: 1px solid">4.52</td>
+        <td style="border: 1px solid">4.77</td>
+        <td style="border: 1px solid">4.73</td>
+        <td style="border: 1px solid">4.51</td>
+        <td style="border: 1px solid">10.63</td>
+        <td style="border: 1px solid">9.70</td>
+    </tr>
+   <tr align="center">
+        <td style="border: 1px solid"> <a href="https://modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary">UniASR-large</a> </td> 
+        <td style="border: 1px solid">Offline</td>      
+        <td style="border: 1px solid">2.93</td>
+        <td style="border: 1px solid">3.48</td>
+        <td style="border: 1px solid">3.95</td>
+        <td style="border: 1px solid">3.87</td>
+        <td style="border: 1px solid">4.11</td>
+        <td style="border: 1px solid">4.11</td>
+        <td style="border: 1px solid">4.16</td>
+        <td style="border: 1px solid">10.09</td>
+        <td style="border: 1px solid">8.69</td>
+    </tr>
+    <tr align="center">
+        <td style="border: 1px solid"> <a href="https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary">Paraformer-aishell</a> </td>
+        <td style="border: 1px solid">Offline</td>
+        <td style="border: 1px solid">4.88</td>
+        <td style="border: 1px solid">5.43</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+    </tr>
+   <tr align="center">
+        <td style="border: 1px solid"> <a href="https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary">ParaformerBert-aishell</a> </td>
+        <td style="border: 1px solid">Offline</td>
+        <td style="border: 1px solid">6.14</td>
+        <td style="border: 1px solid">7.01</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+    </tr>
+   <tr align="center">
+        <td style="border: 1px solid"> <a href="https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary">Paraformer-aishell2</a> </td> 
+        <td style="border: 1px solid">Offline</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">5.82</td>
+        <td style="border: 1px solid">6.30</td>
+        <td style="border: 1px solid">6.60</td>
+        <td style="border: 1px solid">5.83</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+    </tr>
+   <tr align="center">
+        <td style="border: 1px solid"> <a href="https://www.modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary">ParaformerBert-aishell2</a> </td> 
+        <td style="border: 1px solid">Offline</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">4.95</td>
+        <td style="border: 1px solid">5.45</td>
+        <td style="border: 1px solid">5.59</td>
+        <td style="border: 1px solid">5.83</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+        <td style="border: 1px solid">-</td>
+    </tr>
+</table>
+
+
+### English Dataset
+
--- a/modules/python/vendors/FunASR/data/list/train.jsonl
+++ b/modules/python/vendors/FunASR/data/list/train.jsonl
@@ -0,0 +1,4 @@
+{"key": "BAC009S0764W0121", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav", "source_len": 90, "target": "甚至出现交易几乎停滞的情况", "target_len": 13}
+{"key": "BAC009S0916W0489", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav", "source_len": 90, "target": "湖北一公司以员工名义贷款数十员工负债千万", "target_len": 20}
+{"key": "asr_example_cn_en", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav", "source_len": 91, "target": "所有只要处理 data 不管你是做 machine learning 做 deep learning 做 data analytics 做 data science 也好 scientist 也好通通都要都做的基本功啊那 again 先先对有一些也许对", "target_len": 19}
+{"key": "ID0012W0014", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav", "source_len": 88, "target": "he tried to think how it could be", "target_len": 8}
--- a/modules/python/vendors/FunASR/data/list/train_emo.txt
+++ b/modules/python/vendors/FunASR/data/list/train_emo.txt
@@ -0,0 +1,4 @@
+BAC009S0764W0121 <|NEUTRAL|>
+BAC009S0916W0489 <|NEUTRAL|>
+asr_example_cn_en <|NEUTRAL|>
+ID0012W0014 <|NEUTRAL|>
--- a/modules/python/vendors/FunASR/data/list/train_event.txt
+++ b/modules/python/vendors/FunASR/data/list/train_event.txt
@@ -0,0 +1,4 @@
+BAC009S0764W0121 <|Speech|>
+BAC009S0916W0489 <|Speech|>
+asr_example_cn_en <|Speech|>
+ID0012W0014 <|Speech|>
--- a/modules/python/vendors/FunASR/data/list/train_text.txt
+++ b/modules/python/vendors/FunASR/data/list/train_text.txt
@@ -0,0 +1,4 @@
+BAC009S0764W0121 甚至出现交易几乎停滞的情况
+BAC009S0916W0489 湖北一公司以员工名义贷款数十员工负债千万
+asr_example_cn_en 所有只要处理 data 不管你是做 machine learning 做 deep learning 做 data analytics 做 data science 也好 scientist 也好通通都要都做的基本功啊那 again 先先对有一些也许对
+ID0012W0014 he tried to think how it could be
--- a/modules/python/vendors/FunASR/data/list/train_text_language.txt
+++ b/modules/python/vendors/FunASR/data/list/train_text_language.txt
@@ -0,0 +1,4 @@
+BAC009S0764W0121 <|zh|>
+BAC009S0916W0489 <|zh|>
+asr_example_cn_en <|zh|>
+ID0012W0014 <|en|>
--- a/modules/python/vendors/FunASR/data/list/train_wav.scp
+++ b/modules/python/vendors/FunASR/data/list/train_wav.scp
@@ -0,0 +1,4 @@
+BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
+BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
+asr_example_cn_en https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
+ID0012W0014 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav
--- a/modules/python/vendors/FunASR/data/list/val.jsonl
+++ b/modules/python/vendors/FunASR/data/list/val.jsonl
@@ -0,0 +1,2 @@
+{"key": "ID0012W0013", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", "source_len": 88, "target": "欢迎大家来体验达摩院推出的语音识别模型", "target_len": 19}
+{"key": "ID0012W0014", "source": "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav", "source_len": 88, "target": "he tried to think how it could be", "target_len": 8}
--- a/modules/python/vendors/FunASR/data/list/val_text.txt
+++ b/modules/python/vendors/FunASR/data/list/val_text.txt
@@ -0,0 +1,2 @@
+ID0012W0013 欢迎大家来体验达摩院推出的语音识别模型
+ID0012W0014 he tried to think how it could be
--- a/modules/python/vendors/FunASR/data/list/val_wav.scp
+++ b/modules/python/vendors/FunASR/data/list/val_wav.scp
@@ -0,0 +1,2 @@
+ID0012W0013 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav
+ID0012W0014 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav
--- a/modules/python/vendors/FunASR/docs/Makefile
+++ b/modules/python/vendors/FunASR/docs/Makefile
@@ -0,0 +1,21 @@
+# Minimal makefile for Sphinx documentation
+# (`?=` keeps values already supplied by the environment; plain `=` would override them)
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SPHINXPROJ    = FunASR
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/modules/python/vendors/FunASR/docs/README.md
+++ b/modules/python/vendors/FunASR/docs/README.md
@@ -0,0 +1,19 @@
+# FunASR document generation
+
+## Generate HTML
+For convenience, we provide users with the ability to generate local HTML manually.
+
+First, install the following packages, which are required for building the HTML:
+
+```sh
+pip3 install -U "funasr[docs]"
+```
+
+Then you can generate HTML manually.
+
+```sh
+cd docs
+make html
+```
+
+The generated files are all contained in the "FunASR/docs/_build" directory. You can access the FunASR documentation by simply opening the "html/index.html" file in your browser from this directory.
--- a/modules/python/vendors/FunASR/docs/conf.py
+++ b/modules/python/vendors/FunASR/docs/conf.py
@@ -0,0 +1,67 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = "FunASR"
+copyright = "2022, Speech Lab, Alibaba Group"
+author = "Speech Lab, Alibaba Group"
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "nbsphinx",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.todo",
+    # "sphinxarg.ext",
+    "sphinx_markdown_tables",
+    "recommonmark",
+    "sphinx_rtd_theme",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+source_suffix = [".rst", ".md"]  # both reStructuredText and Markdown files are documentation sources
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+# "sphinx_rtd_theme" is also listed in `extensions` above.
+
+html_theme = "sphinx_rtd_theme"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ['_static']
--- a/modules/python/vendors/FunASR/docs/images/China_Telecom.png
+++ b/modules/python/vendors/FunASR/docs/images/China_Telecom.png
--- a/modules/python/vendors/FunASR/docs/images/DeepScience.png
+++ b/modules/python/vendors/FunASR/docs/images/DeepScience.png
--- a/modules/python/vendors/FunASR/docs/images/RapidAI.png
+++ b/modules/python/vendors/FunASR/docs/images/RapidAI.png
--- a/modules/python/vendors/FunASR/docs/images/XVERSE.png
+++ b/modules/python/vendors/FunASR/docs/images/XVERSE.png
--- a/modules/python/vendors/FunASR/docs/images/aihealthx.png
+++ b/modules/python/vendors/FunASR/docs/images/aihealthx.png
--- a/modules/python/vendors/FunASR/docs/images/alibaba.png
+++ b/modules/python/vendors/FunASR/docs/images/alibaba.png
--- a/modules/python/vendors/FunASR/docs/images/damo.png
+++ b/modules/python/vendors/FunASR/docs/images/damo.png
--- a/modules/python/vendors/FunASR/docs/images/dingding.png
+++ b/modules/python/vendors/FunASR/docs/images/dingding.png
--- a/modules/python/vendors/FunASR/docs/images/funasr_logo.jpg
+++ b/modules/python/vendors/FunASR/docs/images/funasr_logo.jpg
--- a/modules/python/vendors/FunASR/docs/images/funasr_overview.png
+++ b/modules/python/vendors/FunASR/docs/images/funasr_overview.png
--- a/modules/python/vendors/FunASR/docs/images/logo.png
+++ b/modules/python/vendors/FunASR/docs/images/logo.png
--- a/modules/python/vendors/FunASR/docs/images/nwpu.png
+++ b/modules/python/vendors/FunASR/docs/images/nwpu.png
--- a/modules/python/vendors/FunASR/docs/images/wechat.png
+++ b/modules/python/vendors/FunASR/docs/images/wechat.png
--- a/modules/python/vendors/FunASR/docs/index.rst
+++ b/modules/python/vendors/FunASR/docs/index.rst
@@ -0,0 +1,121 @@
+.. Funasr documentation master file, created by
+   sphinx-quickstart on Tues Dec 6 19:05:00 2022.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+FunASR: A Fundamental End-to-End Speech Recognition Toolkit
+============================================================
+.. image:: ./images/funasr_logo.jpg
+
+FunASR aims to build a bridge between academic research and industrial applications of speech recognition. By supporting the training and fine-tuning of the industrial-grade speech recognition models released on `ModelScope <https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition>`_, it lets researchers and developers conduct research on and production of speech recognition models more conveniently, and promotes the development of the speech recognition ecosystem. ASR for Fun!
+
+Overview
+============================================================
+.. image:: ./images/funasr_overview.png
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Installation
+
+   ./installation/installation.md
+   ./installation/docker.md
+
+.. toctree::
+   :maxdepth: 5
+   :caption: Quick Start
+
+   ./funasr/quick_start.md
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Academic Egs
+
+   ./academic_recipe/asr_recipe.md
+   ./academic_recipe/punc_recipe.md
+   ./academic_recipe/vad_recipe.md
+   ./academic_recipe/sv_recipe.md
+   ./academic_recipe/sd_recipe.md
+
+
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: ModelScope Egs
+
+   ./modelscope_pipeline/quick_start.md
+   ./egs_modelscope/asr/TEMPLATE/README.md
+   ./egs_modelscope/vad/TEMPLATE/README.md
+   ./egs_modelscope/punctuation/TEMPLATE/README.md
+   ./egs_modelscope/tp/TEMPLATE/README.md
+   ./modelscope_pipeline/sv_pipeline.md
+   ./modelscope_pipeline/sd_pipeline.md
+   ./modelscope_pipeline/itn_pipeline.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Huggingface Egs
+
+   Undo
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Model Zoo
+
+   ./model_zoo/modelscope_models.md
+   ./model_zoo/huggingface_models.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Runtime and Service
+
+   ./runtime/readme.md
+   ./runtime/docs/SDK_tutorial_online.md
+   ./runtime/docs/SDK_tutorial.md
+   ./runtime/html5/readme.md
+
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Benchmark and Leaderboard
+
+   ./benchmark/benchmark_onnx.md
+   ./benchmark/benchmark_onnx_cpp.md
+   ./benchmark/benchmark_libtorch.md
+   ./benchmark/benchmark_pipeline_cer.md
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Funasr Library
+
+   ./reference/build_task.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Papers
+
+   ./reference/papers.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Application
+
+   ./reference/application.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: FAQ
+
+   ./reference/FQA.md
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
--- a/modules/python/vendors/FunASR/docs/installation/docker.md
+++ b/modules/python/vendors/FunASR/docs/installation/docker.md
@@ -0,0 +1,72 @@
+([简体中文](./docker_zh.md)|English)
+
+# Docker
+
+## Install Docker
+
+### Ubuntu
+```shell
+curl -fsSL https://get.docker.com -o get-docker.sh
+sudo sh get-docker.sh
+```
+### Debian
+```shell
+ curl -fsSL https://get.docker.com -o get-docker.sh
+ sudo sh get-docker.sh
+```
+
+### CentOS
+```shell
+curl -fsSL https://get.docker.com | bash -s docker --mirror Aliyun
+```
+
+### MacOS
+```shell
+brew install --cask --appdir=/Applications docker
+```
+
+### Windows
+Refer to the [docs](https://docs.docker.com/desktop/install/windows-install/)
+
+## Start Docker
+```shell
+sudo systemctl start docker
+```
+## Download image
+
+### Image Hub
+
+#### CPU
+`registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-cpu-0.4.1`
+
+#### GPU
+
+`registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch1.11.0-tf1.15.5-1.8.1`
+
+### Pull Image
+```shell
+sudo docker pull <image-name>:<tag>
+```
+
+### Check Image 
+```shell
+sudo docker images
+```
+
+## Run Docker
+```shell
+# cpu
+sudo docker run -itd --name funasr -v <local_dir:dir_in_docker> <image-name>:<tag> /bin/bash
+# gpu
+sudo docker run -itd --gpus all --name funasr -v <local_dir:dir_in_docker> <image-name>:<tag> /bin/bash
+
+sudo docker exec -it funasr /bin/bash
+```
+
+## Stop Docker
+```shell
+exit
+sudo docker ps
+sudo docker stop funasr
+```
+
--- a/modules/python/vendors/FunASR/docs/installation/docker_zh.md
+++ b/modules/python/vendors/FunASR/docs/installation/docker_zh.md
@@ -0,0 +1,72 @@
+(简体中文|[English](./docker.md))
+
+# Docker
+
+## 安装Docker
+
+### Ubuntu
+```shell
+curl -fsSL https://get.docker.com -o get-docker.sh
+sudo sh get-docker.sh
+```
+### Debian
+```shell
+ curl -fsSL https://get.docker.com -o get-docker.sh
+ sudo sh get-docker.sh
+```
+
+### CentOS
+```shell
+curl -fsSL https://get.docker.com | bash -s docker --mirror Aliyun
+```
+
+### MacOS
+```shell
+brew install --cask --appdir=/Applications docker
+```
+
+### Windows
+请参考[文档](https://docs.docker.com/desktop/install/windows-install/)
+
+## 启动Docker
+```shell
+sudo systemctl start docker
+```
+## 下载Docker镜像
+
+### 镜像仓库
+
+#### CPU
+`registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-cpu-0.4.1`
+
+#### GPU
+
+`registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch1.11.0-tf1.15.5-1.8.1`
+
+### 拉取镜像
+```shell
+sudo docker pull <image-name>:<tag>
+```
+
+### 查看镜像
+```shell
+sudo docker images
+```
+
+## 运行Docker
+```shell
+# cpu
+sudo docker run -itd --name funasr -v <local_dir:dir_in_docker> <image-name>:<tag> /bin/bash
+# gpu
+sudo docker run -itd --gpus all --name funasr -v <local_dir:dir_in_docker> <image-name>:<tag> /bin/bash
+
+sudo docker exec -it funasr /bin/bash
+```
+
+## 停止Docker
+```shell
+exit
+sudo docker ps
+sudo docker stop funasr
+```
+
--- a/modules/python/vendors/FunASR/docs/installation/installation.md
+++ b/modules/python/vendors/FunASR/docs/installation/installation.md
@@ -0,0 +1,74 @@
+([简体中文](./installation_zh.md)|English)
+
+<p align="left">
+    <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-brightgreen.svg"></a>
+    <a href=""><img src="https://img.shields.io/badge/Python->=3.7,<=3.10-aff.svg"></a>
+    <a href=""><img src="https://img.shields.io/badge/Pytorch-%3E%3D1.11-blue"></a>
+</p>
+
+## Installation
+
+### Install Conda (Optional):
+
+#### Linux
+```sh
+wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+sh Miniconda3-latest-Linux-x86_64.sh
+source ~/.bashrc
+conda create -n funasr python=3.8
+conda activate funasr
+```
+#### Mac
+```sh
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+# For M1 chip
+# wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh
+sh Miniconda3-latest-MacOSX*
+source ~/.zshrc
+conda create -n funasr python=3.8
+conda activate funasr
+```
+#### Windows
+Refer to the [docs](https://docs.conda.io/en/latest/miniconda.html#windows-installers)
+
+### Install Pytorch (version >= 1.11.0):
+
+```sh
+pip3 install torch torchaudio
+```
+If CUDA is available in your environment, install the PyTorch version that matches your CUDA version. The compatibility list can be found in the [docs](https://pytorch.org/get-started/previous-versions/).
+### Install funasr
+
+#### Install from pip
+
+```shell
+pip3 install -U funasr
+# For the users in China, you could install with the command:
+# pip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+#### Or install from source code
+
+``` sh
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+pip3 install -e ./
+# For the users in China, you could install with the command:
+# pip3 install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+### Install modelscope (Optional)
+If you want to use the pretrained models in ModelScope, you should install the modelscope:
+
+```shell
+pip3 install -U modelscope
+# For the users in China, you could install with the command:
+# pip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+### FAQ
+- When installing on a Mac with an M1 chip, the following error may occur:
+- - _cffi_backend.cpython-38-darwin.so' (mach-o file, but is an incompatible architecture (have (x86_64), need (arm64e)))
+    ```shell
+    pip uninstall cffi pycparser
+    ARCHFLAGS="-arch arm64" pip install cffi pycparser --compile --no-cache-dir
+    ```
--- a/modules/python/vendors/FunASR/docs/installation/installation_zh.md
+++ b/modules/python/vendors/FunASR/docs/installation/installation_zh.md
@@ -0,0 +1,75 @@
+(简体中文|[English](./installation.md))
+
+<p align="left">
+    <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-brightgreen.svg"></a>
+    <a href=""><img src="https://img.shields.io/badge/Python->=3.7,<=3.10-aff.svg"></a>
+    <a href=""><img src="https://img.shields.io/badge/Pytorch-%3E%3D1.11-blue"></a>
+</p>
+
+## 安装
+
+### 安装Conda（可选）：
+
+#### Linux
+```sh
+wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+sh Miniconda3-latest-Linux-x86_64.sh
+source ~/.bashrc
+conda create -n funasr python=3.8
+conda activate funasr
+```
+#### Mac
+```sh
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+# For M1 chip
+# wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh
+sh Miniconda3-latest-MacOSX*
+source ~/.zshrc
+conda create -n funasr python=3.8
+conda activate funasr
+```
+#### Windows
+请参考[文档](https://docs.conda.io/en/latest/miniconda.html#windows-installers)
+
+### 安装Pytorch（版本 >= 1.11.0）：
+
+```sh
+pip3 install torch torchaudio
+```
+如果您的环境中有CUDA，则应安装与CUDA版本匹配的pytorch，匹配列表可在[文档](https://pytorch.org/get-started/previous-versions/)中找到。
+### 安装funasr
+
+#### 从pip安装
+
+```shell
+pip3 install -U funasr
+# 对于中国大陆用户，可以使用以下命令进行安装：
+# pip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+#### 或者从源代码安装
+
+``` sh
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+pip3 install -e ./
+# 对于中国大陆用户，可以使用以下命令进行安装：
+# pip3 install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+### 安装modelscope（可选）
+
+如果您想要使用ModelScope中的预训练模型，则应安装modelscope:
+
+```shell
+pip3 install -U modelscope
+# 对于中国大陆用户，可以使用以下命令进行安装：
+# pip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+### 常见问题解答
+- 在MAC M1芯片上安装时，可能会出现以下错误：
+- - _cffi_backend.cpython-38-darwin.so' (mach-o file, but is an incompatible architecture (have (x86_64), need (arm64e)))
+    ```shell
+    pip uninstall cffi pycparser
+    ARCHFLAGS="-arch arm64" pip install cffi pycparser --compile --no-cache-dir
+    ```
--- a/modules/python/vendors/FunASR/docs/m2met2/Baseline.md
+++ b/modules/python/vendors/FunASR/docs/m2met2/Baseline.md
@@ -0,0 +1,38 @@
+# Baseline
+## Overview
+We will release an E2E SA-ASR baseline built on [FunASR](https://github.com/alibaba-damo-academy/FunASR) according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also used to extract the speaker embeddings in the speaker profile.
+
+![model architecture](images/sa_asr_arch.png)
+
+## Quick start
+To run the baseline, first you need to install FunASR and ModelScope. ([installation](https://github.com/alibaba-damo-academy/FunASR#installation))  
+There are two startup scripts, `run.sh` for training and evaluating on the old eval and test sets, and `run_m2met_2023_infer.sh` for inference on the new test set of the Multi-Channel Multi-Party Meeting Transcription 2.0 ([M2MeT2.0](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)) Challenge.  
+Before running `run.sh`, you must manually download and unpack the [AliMeeting](http://www.openslr.org/119/) corpus and place it in the `./dataset` directory:
+```shell
+dataset
+|—— Eval_Ali_far
+|—— Eval_Ali_near
+|—— Test_Ali_far
+|—— Test_Ali_near
+|—— Train_Ali_far
+|—— Train_Ali_near
+```
+Before running `run_m2met_2023_infer.sh`, you need to place the new test set `Test_2023_Ali_far` (to be released after the challenge starts) in the `./dataset` directory, which contains only raw audios. Then put the given `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk` and `spk2utt` in the `./data/Test_2023_Ali_far` directory.  
+```shell
+data/Test_2023_Ali_far
+|—— wav.scp
+|—— wav_raw.scp
+|—— segments
+|—— utt2spk
+|—— spk2utt
+```
+For more details you can see [here](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md)
+
+## Baseline results
+The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker labels during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on the Eval and Test sets are also provided to show the impact of speaker profile accuracy.
+
+
+|                |SI-CER(%)     |cpCER(%)  |
+|:---------------|:------------:|----------:|
+|oracle profile  |32.72         |42.92      |
+|cluster  profile|32.73         |49.37      |
--- a/modules/python/vendors/FunASR/docs/m2met2/Challenge_result.md
+++ b/modules/python/vendors/FunASR/docs/m2met2/Challenge_result.md
@@ -0,0 +1,14 @@
+# Challenge Result
+The following table shows the final results of the competition, where Sub-track1 represents the sub-track under the fixed training condition and Sub-track2 represents the sub-track under the open training condition. All results in this table are cp-CER (%). The rankings in the table are the combined rankings of the two sub-tracks, as all teams' submissions met the requirements of the sub-track under the fixed training condition.
+| Rank &nbsp; &nbsp; | Team Name &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;  | Sub-track1 &nbsp; &nbsp; | Sub-track2 &nbsp; &nbsp; | paper |
+|------|----------------------|------------|------------|------------------------|
+| 1    | Ximalaya Speech Team | 11.27      | 11.27      |                        |
+| 2    | 小马达                | 18.64      | 18.64      |                        |
+| 3    | AIzyzx               | 22.83      | 22.83      |                        |
+| 4    | AsrSpeeder           | /          | 23.51      |                        |
+| 5    | zyxlhz               | 24.82      | 24.82      |                        |
+| 6    | CMCAI                | 26.11      | /          |                        |
+| 7    | Volcspeech           | 34.21      | 34.21      |                        |
+| 8    | 鉴往知来              | 40.14      | 40.14      |                        |
+| 9    | baseline             | 41.55      | 41.55      |                        |
+| 10   | DAICT                | 41.64      |            |                        |
--- a/modules/python/vendors/FunASR/docs/m2met2/Contact.md
+++ b/modules/python/vendors/FunASR/docs/m2met2/Contact.md
@@ -0,0 +1,4 @@
+# Contact
+If you have any questions about M2MeT2.0 challenge, please contact us by
+
+- email: [m2met.alimeeting@gmail.com](mailto:m2met.alimeeting@gmail.com)
--- a/modules/python/vendors/FunASR/docs/m2met2/Dataset.md
+++ b/modules/python/vendors/FunASR/docs/m2met2/Dataset.md
@@ -0,0 +1,26 @@
+# Datasets
+## Overview of training data
+In the fixed training condition, the training dataset is restricted to three publicly available corpora, namely, AliMeeting, AISHELL-4, and CN-Celeb. To evaluate the performance of the models trained on these datasets, we will release a new Test set called Test-2023 for scoring and ranking. We will describe the AliMeeting dataset and the Test-2023 set in detail.
+## Detail of AliMeeting corpus
+AliMeeting contains 118.75 hours of speech data in total. The dataset is divided into 104.75 hours for training (Train), 4 hours for evaluation (Eval) and 10 hours as test set (Test) for scoring and ranking. Specifically, the Train, Eval and Test sets contain 212, 8 and 20 sessions, respectively. Each session consists of a 15 to 30-minute discussion by a group of participants. The total number of participants in Train, Eval and Test sets is 456, 25 and 60, respectively, with balanced gender coverage.
+
+The dataset is collected in 13 meeting venues, which are categorized into three types: small, medium, and large rooms with sizes ranging from 8 m$^{2}$ to 55 m$^{2}$. Different rooms give us a variety of acoustic properties and layouts. The detailed parameters of each meeting venue will be released together with the Train data. The type of wall material of the meeting venues covers cement, glass, etc. Other furnishings in meeting venues include sofa, TV, blackboard, fan, air conditioner, plants, etc. During recording, the participants of the meeting sit around the microphone array which is placed on the table and conduct a natural conversation. The microphone-speaker distance ranges from 0.3 m to 5.0 m. All participants are native Chinese speakers speaking Mandarin without strong accents. During the meeting, various kinds of indoor noise including but not limited to clicking, keyboard, door opening/closing, fan, bubble noise, etc., are made naturally. For both Train and Eval sets, the participants are required to remain in the same position during recording. There is no speaker overlap between the Train and Eval set. An example of the recording venue from the Train set is shown in Fig 1.
+
+![meeting room](images/meeting_room.png)
+
+The number of participants within one meeting session ranges from 2 to 4. To ensure the coverage of different overlap ratios, we select various meeting topics during recording, including medical treatment, education, business, organization management, industrial production and other daily routine meetings. The average speech overlap ratios of the Train, Eval and Test sets are 42.27\%, 34.76\% and 42.8\%, respectively. More details of AliMeeting are shown in Table 1. A detailed overlap ratio distribution of meeting sessions with different numbers of speakers in the Train, Eval and Test sets is shown in Table 2.
+
+![dataset detail](images/dataset_details.png)
+
+The Test-2023 set consists of 20 sessions that were recorded in an identical acoustic setting to that of the AliMeeting corpus. Each meeting session in the Test-2023 dataset comprises between 2 and 4 participants, thereby sharing a similar configuration with the AliMeeting test set.
+
+We also record the near-field signal of each participant using a headset microphone and ensure that only the participant's own speech is recorded and transcribed. It is worth noting that the far-field audio recorded by the microphone array and the near-field audio recorded by the headset microphone will be synchronized to a common timeline range.
+
+All transcriptions of the speech data are prepared in TextGrid format for each session, which contains the information of the session duration, speaker information (number of speakers, speaker-id, gender, etc.), the total number of segments of each speaker, the timestamp and transcription of each segment, etc.
+## Get the data
+The three training datasets mentioned above can be downloaded from [OpenSLR](https://openslr.org/resources.php). Participants can download them via the following links. In particular, the baseline provides convenient data preparation scripts for the AliMeeting corpus.
+- [AliMeeting](https://openslr.org/119/)
+- [AISHELL-4](https://openslr.org/111/)
+- [CN-Celeb](https://openslr.org/82/)
+
+Now, the new test set is available [here](https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/AliMeeting/openlr/Test_2023_Ali.tar.gz)
--- a/modules/python/vendors/FunASR/docs/m2met2/Introduction.md
+++ b/modules/python/vendors/FunASR/docs/m2met2/Introduction.md
@@ -0,0 +1,28 @@
+# Introduction
+## Call for participation
+Automatic speech recognition (ASR) and speaker diarization have made significant strides in recent years, resulting in a surge of speech technology applications across various domains. However, meetings present unique challenges to speech technologies due to their complex acoustic conditions and diverse speaking styles, including overlapping speech, variable numbers of speakers, far-field signals in large conference rooms, and environmental noise and reverberation. 
+
+Over the years, several challenges have been organized to advance the development of meeting transcription, including the Rich Transcription evaluation and Computational Hearing in Multisource Environments (CHIME) challenges. The latest iteration of the CHIME challenge has a particular focus on distant automatic speech recognition and developing systems that can generalize across various array topologies and application scenarios. However, while progress has been made in English meeting transcription, language differences remain a significant barrier to achieving comparable results in non-English languages, such as Mandarin. The Multimodal Information Based Speech Processing (MISP) and Multi-Channel Multi-Party Meeting Transcription (M2MeT) challenges have been instrumental in advancing Mandarin meeting transcription. The MISP challenge seeks to address the problem of audio-visual distant multi-microphone signal processing in everyday home environments, while the M2MeT challenge focuses on tackling the speech overlap issue in offline meeting rooms.
+
+The ICASSP2022 M2MeT challenge focuses on meeting scenarios, and it comprises two main tasks: speaker diarization and multi-speaker automatic speech recognition. The former involves identifying who spoke when in the meeting, while the latter aims to transcribe speech from multiple speakers simultaneously, which poses significant technical difficulties due to overlapping speech and acoustic interferences.
+
+Building on the success of the previous M2MeT challenge, we are excited to propose the M2MeT2.0 challenge as an ASRU 2023 challenge special session. In the original M2MeT challenge, the evaluation metric was speaker-independent, which meant that the transcription could be determined, but not the corresponding speaker. To address this limitation and further advance the current multi-talker ASR system towards practicality, the M2MeT2.0 challenge proposes the speaker-attributed ASR task with two sub-tracks: fixed and open training conditions. The speaker-attributed automatic speech recognition (ASR) task aims to tackle the practical and challenging problem of identifying "who spoke what and when". To facilitate reproducible research in this field, we offer a comprehensive overview of the dataset, rules, evaluation metrics, and baseline systems. Furthermore, we will release a carefully curated test set, comprising approximately 10 hours of audio, according to the timeline. The new test set is designed to enable researchers to validate and compare their models' performance and advance the state of the art in this area.
+
+## Timeline(AOE Time)
+- $ April~29, 2023: $ Challenge and registration open.
+- $ May~11, 2023: $ Baseline release.
+- $ May~22, 2023: $ Registration deadline, the due date for participants to join the Challenge.
+- $ June~16, 2023: $ Test data release and leaderboard open.
+- $ June~20, 2023: $ Final submission deadline and leaderboard close.
+- $ June~26, 2023: $ Evaluation result and ranking release.
+- $ July~3, 2023: $ Deadline for paper submission.
+- $ July~10, 2023: $ Deadline for final paper submission.
+- $ December~12\ to\ 16, 2023: $ ASRU Workshop and Challenge Session.
+
+## Guidelines
+
+Interested participants, whether from academia or industry, must register for the challenge by completing the Google form below. The deadline for registration is May 22, 2023. 
+
+[M2MeT2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
+
+Within three working days, the challenge organizer will send email invitations to eligible teams to participate in the challenge. All qualified teams are required to adhere to the challenge rules, which will be published on the challenge page. Prior to the ranking release time, each participant must submit a system description document detailing their approach and methods. The organizer will select the top ranking submissions to be included in the ASRU2023 Proceedings. 
--- a/modules/python/vendors/FunASR/docs/m2met2/Makefile
+++ b/modules/python/vendors/FunASR/docs/m2met2/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/modules/python/vendors/FunASR/docs/m2met2/Organizers.md
+++ b/modules/python/vendors/FunASR/docs/m2met2/Organizers.md
@@ -0,0 +1,48 @@
+# Organizers
+***Lei Xie, Professor, AISHELL foundation, China***
+
+Email: [lxie@nwpu.edu.cn](mailto:lxie@nwpu.edu.cn)
+
+<img src="images/lxie.jpeg" alt="lxie" width="20%">
+
+
+***Kong Aik Lee, Senior Scientist at Institute for Infocomm Research, A\*Star, Singapore***
+
+Email: [kongaik.lee@ieee.org](mailto:kongaik.lee@ieee.org)
+
+<img src="images/kong.png" alt="kong" width="20%">
+
+
+***Zhijie Yan, Principal Engineer at Alibaba, China***
+Email: [zhijie.yzj@alibaba-inc.com](mailto:zhijie.yzj@alibaba-inc.com)
+
+<img src="images/zhijie.jpg" alt="zhijie" width="20%">
+
+***Shiliang Zhang, Senior Engineer at Alibaba, China***
+Email: [sly.zsl@alibaba-inc.com](mailto:sly.zsl@alibaba-inc.com)
+
+<img src="images/zsl.JPG" alt="zsl" width="20%">
+
+***Yanmin Qian, Professor, Shanghai Jiao Tong University, China***
+
+Email: [yanminqian@sjtu.edu.cn](mailto:yanminqian@sjtu.edu.cn)
+
+<img src="images/qian.jpeg" alt="qian" width="20%">
+
+***Zhuo Chen, Applied Scientist in Microsoft, USA***
+
+Email: [zhuc@microsoft.com](mailto:zhuc@microsoft.com)
+
+<img src="images/chenzhuo.jpg" alt="chenzhuo" width="20%">
+
+***Jian Wu, Applied Scientist in Microsoft, USA***
+
+Email: [wujian@microsoft.com](mailto:wujian@microsoft.com)
+
+<img src="images/wujian.jpg" alt="wujian" width="20%">
+
+***Hui Bu, CEO, AISHELL foundation, China***
+
+Email: [buhui@aishelldata.com](mailto:buhui@aishelldata.com)
+
+<img src="images/buhui.jpeg" alt="buhui" width="20%">
--- a/modules/python/vendors/FunASR/docs/m2met2/Rules.md
+++ b/modules/python/vendors/FunASR/docs/m2met2/Rules.md
@@ -0,0 +1,14 @@
+# Rules
+All participants should adhere to the following rules to be eligible for the challenge.
+
+- Data augmentation is allowed on the original training dataset, including, but not limited to, adding noise or reverberation, speed perturbation and tone change.
+
+- Participants are permitted to use the Eval set for model training, but they are not allowed to use the Test set for this purpose. Instead, the Test set should only be utilized for parameter tuning and model selection. Any use of the Test-2023 dataset that violates these rules is strictly prohibited, including but not limited to the use of the Test set for fine-tuning or training the model.
+  
+- If two systems achieve the same cpCER on the Test dataset, the system with the lower computational complexity will be judged the superior one.
+  
+- If the forced alignment is used to obtain the frame-level classification label, the forced alignment model must be trained on the basis of the data allowed by the corresponding sub-track.
+  
+- Shallow fusion is allowed for end-to-end approaches, e.g., LAS, RNNT and Transformer, but the training data of the shallow fusion language model can only come from the transcripts of the allowed training dataset.
+  
+- The right of final interpretation belongs to the organizer. In case of special circumstances, the organizer will coordinate the interpretation.
--- a/modules/python/vendors/FunASR/docs/m2met2/Track_setting_and_evaluation.md
+++ b/modules/python/vendors/FunASR/docs/m2met2/Track_setting_and_evaluation.md
@@ -0,0 +1,17 @@
+# Track & Evaluation 
+## Speaker-Attributed ASR
+The speaker-attributed ASR task poses a unique challenge of transcribing speech from multiple speakers and assigning a speaker label to the transcription. Figure 2 illustrates the difference between the speaker-attributed ASR task and the multi-speaker ASR task. This track allows for the use of the AliMeeting, Aishell-4, and CN-Celeb datasets as constrained data sources during both training and evaluation. The AliMeeting dataset, which was used in the M2MeT challenge, includes Train, Eval, and Test sets. Additionally, a new Test-2023 set, consisting of approximately 10 hours of meeting data recorded in an identical acoustic setting as the AliMeeting corpus, will be released soon for challenge scoring and ranking. It's worth noting that the organizers will not provide the near-field audio, transcriptions, or oracle timestamps of the Test-2023 set. Instead, segments containing multiple speakers will be provided, which can be obtained using a simple voice activity detection (VAD) model.
+
+![task difference](images/task_diff.png)
+
+## Evaluation metric
+The accuracy of a speaker-attributed ASR system is evaluated using the concatenated minimum permutation character error rate (cpCER) metric. The calculation of cpCER involves three steps. Firstly, the reference and hypothesis transcriptions from each speaker in a session are concatenated in chronological order. Secondly, the character error rate (CER) is calculated between the concatenated reference and hypothesis transcriptions, and this process is repeated for all possible speaker permutations. Finally, the permutation with the lowest CER is selected as the cpCER for that session. The CER is obtained by dividing the total number of insertions (Ins), substitutions (Sub), and deletions (Del) of characters required to transform the ASR output into the reference transcript by the total number of characters in the reference transcript. Specifically, CER is calculated by:
+
+$$\text{CER} = \frac {\mathcal N_{\text{Ins}} + \mathcal N_{\text{Sub}} + \mathcal N_{\text{Del}} }{\mathcal N_{\text{Total}}} \times 100\%,$$
+
+where $\mathcal N_{\text{Ins}}$, $\mathcal N_{\text{Sub}}$, and $\mathcal N_{\text{Del}}$ are the numbers of inserted, substituted, and deleted characters, respectively, and $\mathcal N_{\text{Total}}$ is the total number of characters in the reference transcript.
+## Sub-track arrangement
+### Sub-track I (Fixed Training Condition):
+Participants are required to use only the fixed-constrained data (i.e., AliMeeting, Aishell-4, and CN-Celeb) for system development. The usage of any additional data is strictly prohibited. However, participants can use open-source pre-trained models from third-party sources, such as [Hugging Face](https://huggingface.co/models) and [ModelScope](https://www.modelscope.cn/models), provided that the list of utilized models is clearly stated in the final system description paper.
+### Sub-track II (Open Training Condition):
+Participants are allowed to use any publicly available data set, privately recorded data, and manual simulation to build their systems in addition to the fixed-constrained data. They can also utilize any open-source pre-trained models, but it is mandatory to provide a clear list of the data and models used in the final system description paper. If manually simulated data is used, a detailed description of the data simulation scheme must be provided.
--- a/modules/python/vendors/FunASR/docs/m2met2/conf.py
+++ b/modules/python/vendors/FunASR/docs/m2met2/conf.py
@@ -0,0 +1,44 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = "MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0"
+copyright = "2023, Speech Lab, Alibaba Group; ASLP Group, Northwestern Polytechnical University"
+author = "Speech Lab, Alibaba Group; Audio, Speech and Language Processing Group, Northwestern Polytechnical University"
+
+
+extensions = [
+    "nbsphinx",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.todo",
+    # "sphinxarg.ext",
+    "sphinx_markdown_tables",
+    # 'recommonmark',
+    "sphinx_rtd_theme",
+    "myst_parser",
+]
+
+myst_enable_extensions = [
+    "colon_fence",
+    "deflist",
+    "dollarmath",
+    "html_image",
+]
+
+myst_heading_anchors = 2
+myst_highlight_code_blocks = True
+myst_update_mathjax = False
+
+templates_path = ["_templates"]
+source_suffix = [".rst", ".md"]
+
+pygments_style = "sphinx"
+
+html_theme = "sphinx_rtd_theme"
--- a/modules/python/vendors/FunASR/docs/m2met2/images/buhui.jpeg
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/buhui.jpeg
--- a/modules/python/vendors/FunASR/docs/m2met2/images/chenzhuo.jpg
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/chenzhuo.jpg
--- a/modules/python/vendors/FunASR/docs/m2met2/images/dataset_details.png
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/dataset_details.png
--- a/modules/python/vendors/FunASR/docs/m2met2/images/kong.png
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/kong.png
--- a/modules/python/vendors/FunASR/docs/m2met2/images/lxie.jpeg
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/lxie.jpeg
--- a/modules/python/vendors/FunASR/docs/m2met2/images/meeting_room.png
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/meeting_room.png
--- a/modules/python/vendors/FunASR/docs/m2met2/images/qian.jpeg
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/qian.jpeg
--- a/modules/python/vendors/FunASR/docs/m2met2/images/sa_asr_arch.png
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/sa_asr_arch.png
--- a/modules/python/vendors/FunASR/docs/m2met2/images/task_diff.png
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/task_diff.png
--- a/modules/python/vendors/FunASR/docs/m2met2/images/wujian.jpg
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/wujian.jpg
--- a/modules/python/vendors/FunASR/docs/m2met2/images/zhijie.jpg
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/zhijie.jpg
--- a/modules/python/vendors/FunASR/docs/m2met2/images/zsl.JPG
+++ b/modules/python/vendors/FunASR/docs/m2met2/images/zsl.JPG
--- a/modules/python/vendors/FunASR/docs/m2met2/index.rst
+++ b/modules/python/vendors/FunASR/docs/m2met2/index.rst
@@ -0,0 +1,26 @@
+.. m2met2 documentation master file, created by
+   sphinx-quickstart on Tue Apr 11 14:18:55 2023.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)
+==================================================================================
+Building on the success of the M2MeT challenge, we are delighted to propose the M2MeT2.0 challenge as a special session at ASRU2023.
+To advance the current state-of-the-art in multi-talker automatic speech recognition, the M2MeT2.0 challenge proposes a speaker-attributed ASR task, comprising two sub-tracks: fixed and open training conditions.
+To facilitate reproducible research, we provide a comprehensive overview of the dataset, challenge rules, evaluation metrics, and baseline systems. 
+
+The new test set, containing about 10 hours of audio, is now available. You can download it from `here <https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/AliMeeting/openlr/Test_2023_Ali.tar.gz>`_.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents:
+   
+   ./Introduction
+   ./Dataset
+   ./Track_setting_and_evaluation
+   ./Baseline
+   ./Rules
+   ./Challenge_result
+   ./Organizers
+   ./Contact
+
--- a/modules/python/vendors/FunASR/docs/m2met2/make.bat
+++ b/modules/python/vendors/FunASR/docs/m2met2/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/modules/python/vendors/FunASR/docs/make.bat
+++ b/modules/python/vendors/FunASR/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/modules/python/vendors/FunASR/docs/reference/FQA.md
+++ b/modules/python/vendors/FunASR/docs/reference/FQA.md
@@ -0,0 +1,22 @@
+# FAQ
+
+## How to use VAD model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
+
+## How to use Punctuation model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/238)
+
+## How to use Paraformer model for streaming by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+## How to use vad, asr and punc model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/278)
+
+## How to combine vad, asr, punc and nnlm models inside modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/134)
+
+## How to combine timestamp prediction model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/246)
+
+## How to switch decoding mode between online and offline for UniASR model
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
--- a/modules/python/vendors/FunASR/docs/reference/application.md
+++ b/modules/python/vendors/FunASR/docs/reference/application.md
@@ -0,0 +1,5 @@
+## Audio Cut
+
+## Realtime Speech Recognition
+
+## Audio Chat
--- a/modules/python/vendors/FunASR/docs/reference/build_task.md
+++ b/modules/python/vendors/FunASR/docs/reference/build_task.md
@@ -0,0 +1,125 @@
+# Build custom tasks
+FunASR is similar to ESPNet, which applies `Task` as the general interface to achieve the training and inference of models. Each `Task` is a class inherited from `AbsTask` and its corresponding code can be seen in `funasr/tasks/abs_task.py`. The main functions of `AbsTask` are shown as follows:
+```python
+class AbsTask(ABC):
+    @classmethod
+    def add_task_arguments(cls, parser: argparse.ArgumentParser):
+        pass
+    
+    @classmethod
+    def build_preprocess_fn(cls, args, train):
+        (...)
+    
+    @classmethod
+    def build_collate_fn(cls, args: argparse.Namespace):
+        (...)
+
+    @classmethod
+    def build_model(cls, args):
+        (...)
+    
+    @classmethod
+    def main(cls, args):
+        (...)
+```
+- add_task_arguments：Add parameters required by a specified `Task`
+- build_preprocess_fn：define how to preprocess samples
+- build_collate_fn：define how to combine multiple samples into a `batch`
+- build_model：define the model
+- main：training interface, starting training through `Task.main()`
+
+Next, we take the speech recognition as an example to introduce how to define a new `Task`. For the corresponding code, please see `ASRTask` in `funasr/tasks/asr.py`. The procedure of defining a new `Task` is actually the procedure of redefining the above functions according to the requirements of the specified `Task`.
+
+- add_task_arguments
+```python
+@classmethod
+def add_task_arguments(cls, parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(description="Task related")
+    group.add_argument(
+        "--token_list",
+        type=str_or_none,
+        default=None,
+        help="A text mapping int-id to token",
+    )
+    (...)
+```
+For speech recognition tasks, specific parameters required include `token_list`, etc. According to the specific requirements of different tasks, users can define corresponding parameters in this function.
+
+- build_preprocess_fn
+```python
+@classmethod
+def build_preprocess_fn(cls, args, train):
+    if args.use_preprocessor:
+        retval = CommonPreprocessor(
+                    train=train,
+                    token_type=args.token_type,
+                    token_list=args.token_list,
+                    bpemodel=args.bpemodel,
+                    non_linguistic_symbols=args.non_linguistic_symbols,
+                    text_cleaner=args.cleaner,
+                    ...
+                )
+    else:
+        retval = None
+    return retval
+```
+This function defines how to preprocess samples. Specifically, the input of speech recognition tasks includes speech and text. For speech, functions such as (optional) adding noise and reverberation to the speech are supported. For text, functions such as (optional) processing text according to bpe and mapping text to `tokenid` are supported. Users can choose the preprocessing operation that needs to be performed on the sample. For the detailed implementation, please refer to `CommonPreprocessor`.
+
+- build_collate_fn
+```python
+@classmethod
+def build_collate_fn(cls, args, train):
+    return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
+```
+This function defines how to combine multiple samples into a `batch`. For speech recognition tasks, `padding` is employed to obtain equal-length data from different speech and text. Specifically, we set `0.0` as the default padding value for speech and `-1` as the default padding value for text. Users can define different `batch` operations here. For the detailed implementation, please refer to `CommonCollateFn`.
+
+- build_model
+```python
+@classmethod
+def build_model(cls, args, train):
+    with open(args.token_list, encoding="utf-8") as f:
+        token_list = [line.rstrip() for line in f]
+        vocab_size = len(token_list)
+        frontend = frontend_class(**args.frontend_conf)
+        specaug = specaug_class(**args.specaug_conf)
+        normalize = normalize_class(**args.normalize_conf)
+        preencoder = preencoder_class(**args.preencoder_conf)
+        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+        postencoder = postencoder_class(input_size=encoder_output_size, **args.postencoder_conf)
+        decoder = decoder_class(vocab_size=vocab_size, encoder_output_size=encoder_output_size,  **args.decoder_conf)
+        ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **args.ctc_conf)
+        model = model_class(
+            vocab_size=vocab_size,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            preencoder=preencoder,
+            encoder=encoder,
+            postencoder=postencoder,
+            decoder=decoder,
+            ctc=ctc,
+            token_list=token_list,
+            **args.model_conf,
+        )
+    return model
+```
+This function defines the details of the model. For different speech recognition models, the same speech recognition `Task` can usually be shared, and the only remaining work is to define a specific model in this function. For example, a speech recognition model with a standard encoder-decoder structure has been shown above. Specifically, it first defines each module of the model, including encoder, decoder, etc., and then combines these modules together to generate a complete model. In FunASR, the model needs to inherit `FunASRModel` and the corresponding code can be seen in `funasr/train/abs_espnet_model.py`. The main function needed to be implemented is the `forward` function.
+
+Next, we take `SANMEncoder` as an example to introduce how to use a custom encoder as a part of the model when defining the specified model and the corresponding code can be seen in `funasr/models/encoder/sanm_encoder.py`. For a custom encoder, in addition to inheriting the common encoder class `AbsEncoder`, it is also necessary to define the `forward` function to achieve the forward computation of the `encoder`. After defining the `encoder`, it should also be registered in the `Task`. The corresponding code example can be seen as below:
+```python
+encoder_choices = ClassChoices(
+    "encoder",
+    classes=dict(
+        conformer=ConformerEncoder,
+        transformer=TransformerEncoder,
+        rnn=RNNEncoder,
+        sanm=SANMEncoder,
+        sanm_chunk_opt=SANMEncoderChunkOpt,
+        data2vec_encoder=Data2VecEncoder,
+        mfcca_enc=MFCCAEncoder,
+    ),
+    type_check=AbsEncoder,
+    default="rnn",
+)
+```
+In this code, `sanm=SANMEncoder` takes the newly defined `SANMEncoder` as an optional choice of the `encoder`. Once the user specifies the `encoder` as `sanm` in the configuration file, the `SANMEncoder` will be correspondingly employed as the `encoder` module of the model.
--- a/modules/python/vendors/FunASR/docs/reference/papers.md
+++ b/modules/python/vendors/FunASR/docs/reference/papers.md
@@ -0,0 +1,38 @@
+# Papers
+
+FunASR has implemented the code of the following papers:
+
+### Speech Recognition
+- [FunASR: A Fundamental End-to-End Speech Recognition Toolkit](https://arxiv.org/abs/2305.11013), INTERSPEECH 2023
+- [BAT: Boundary aware transducer for memory-efficient and low-latency ASR](https://arxiv.org/abs/2305.11571), INTERSPEECH 2023
+- [Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition](https://arxiv.org/abs/2206.08317), INTERSPEECH 2022
+- [E-branchformer: Branchformer with enhanced merging for speech recognition](https://arxiv.org/abs/2210.00077), SLT 2022
+- [Branchformer: Parallel mlp-attention architectures to capture local and global context for speech recognition and understanding](https://proceedings.mlr.press/v162/peng22a.html?ref=https://githubhelp.com), ICML 2022
+- [Universal ASR: Unifying Streaming and Non-Streaming ASR Using a Single Encoder-Decoder Model](https://arxiv.org/abs/2010.14099), arXiv preprint arXiv:2010.14099, 2020
+- [San-m: Memory equipped self-attention for end-to-end speech recognition](https://arxiv.org/pdf/2006.01713), INTERSPEECH 2020
+- [Streaming Chunk-Aware Multihead Attention for Online End-to-End Speech Recognition](https://arxiv.org/abs/2006.01712), INTERSPEECH 2020
+- [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100),  INTERSPEECH 2020
+- [Sequence-to-sequence learning with Transducers](https://arxiv.org/pdf/1211.3711.pdf), NIPS 2016
+
+
+### Multi-talker Speech Recognition
+- [MFCCA: Multi-Frame Cross-Channel attention for multi-speaker ASR in Multi-party meeting scenario](https://arxiv.org/abs/2210.05265), ICASSP 2022
+
+### Voice Activity Detection
+- [Deep-FSMN for Large Vocabulary Continuous Speech Recognition](https://arxiv.org/abs/1803.05030), ICASSP 2018
+
+### Punctuation Restoration
+- [CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection](https://arxiv.org/pdf/2003.01309.pdf), ICASSP 2020
+
+### Language Models
+- [Attention Is All You Need](https://arxiv.org/abs/1706.03762), NEURIPS 2017
+
+### Speaker Verification
+- [X-VECTORS: ROBUST DNN EMBEDDINGS FOR SPEAKER RECOGNITION](https://www.danielpovey.com/files/2018_icassp_xvectors.pdf), ICASSP 2018
+
+### Speaker diarization
+- [Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis](https://arxiv.org/abs/2211.10243), EMNLP 2022
+- [TOLD: A Novel Two-Stage Overlap-Aware Framework for Speaker Diarization](https://arxiv.org/abs/2303.05397), ICASSP 2023
+
+### Timestamp Prediction
+- [Achieving Timestamp Prediction While Recognizing with Non-Autoregressive End-to-End ASR Model](https://arxiv.org/abs/2301.12343), arXiv:2301.12343
--- a/modules/python/vendors/FunASR/docs/tutorial/README.md
+++ b/modules/python/vendors/FunASR/docs/tutorial/README.md
@@ -0,0 +1,424 @@
+([简体中文](./README_zh.md)|English)
+
+FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE). Below, we list some representative models. For a comprehensive list, please refer to our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo).
+
+<div align="center">  
+<h4>
+ <a href="#Inference"> Model Inference </a>   
+｜<a href="#Training"> Model Training and Testing </a>
+｜<a href="#Export"> Model Export and Testing </a>
+</h4>
+</div>
+
+<a name="Inference"></a>
+## Model Inference
+
+### Quick Start
+
+For command-line invocation:
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
+```
+
+For python code invocation (recommended): 
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer-zh")
+
+res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
+print(res)
+```
+
+### API Description 
+#### AutoModel Definition
+```python
+model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
+```
+- `model`(str): model name in the [Model Repository](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo), or a model path on local disk.
+- `device`(str): `cuda:0` (default gpu0) for using GPU for inference, specify `cpu` for using CPU.
+- `ncpu`(int): `4` (default), sets the number of threads for CPU internal operations.
+- `output_dir`(str): `None` (default), set this to specify the output path for the results.
+- `batch_size`(int): `1` (default), the number of samples per batch during decoding.
+- `hub`(str): `ms` (default) to download models from ModelScope. Use `hf` to download models from Hugging Face.
+- `**kwargs`(dict): Any parameters found in config.yaml can be directly specified here, for instance, the maximum segmentation length in the vad model max_single_segment_time=6000 (milliseconds).
+
+#### AutoModel Inference
+```python
+res = model.generate(input=[str], output_dir=[str])
+```
+- `input`: The input to be decoded, which could be:
+  - A wav file path, e.g., asr_example.wav
+  - A pcm file path, e.g., asr_example.pcm, in this case, specify the audio sampling rate fs (default is 16000)
+  - An audio byte stream, e.g., byte data from a microphone
+  - A wav.scp, a Kaldi-style wav list (wav_id \t wav_path), for example:
+  ```text
+  asr_example1  ./audios/asr_example1.wav
+  asr_example2  ./audios/asr_example2.wav
+  ```
+  When using wav.scp as input, you must set output_dir to save the output results.
+  - Audio samples, e.g., `audio, rate = soundfile.read("asr_example_zh.wav")`; the data type is numpy.ndarray. Batch input is supported as a list:
+  ```[audio_sample1, audio_sample2, ..., audio_sampleN]```
+  - fbank input, supports batch grouping. Shape is [batch, frames, dim], type is torch.Tensor.
+- `output_dir`: None (default), if set, specifies the output path for the results.
+- `**kwargs`(dict): Inference parameters related to the model, for example, `beam_size=10`, `decoding_ctc_weight=0.1`.
+
+
+### More Usage Introduction
+
+
+#### Speech Recognition (Non-streaming)
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh",  
+                  vad_model="fsmn-vad", 
+                  vad_kwargs={"max_single_segment_time": 60000},
+                  punc_model="ct-punc", 
+                  # spk_model="cam++"
+                  )
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
+print(res)
+```
+Notes:
+- Typically, the input duration for models is limited to under 30 seconds. However, when combined with `vad_model`, support for audio input of any length is enabled, not limited to the paraformer model—any audio input model can be used.
+- Parameters related to model can be directly specified in the definition of AutoModel; parameters related to `vad_model` can be set through `vad_kwargs`, which is a dict; similar parameters include `punc_kwargs` and `spk_kwargs`.
+- `max_single_segment_time`: Denotes the maximum audio segmentation length for `vad_model`, measured in milliseconds (ms).
+- `batch_size_s` represents the use of dynamic batching, where the total audio duration within a batch is measured in seconds (s).
+- `batch_size_threshold_s`: Indicates that when the duration of an audio segment post-VAD segmentation exceeds the batch_size_threshold_s threshold, the batch size is set to 1, measured in seconds (s).
+
+Recommendations: 
+
+When you input long audio and encounter Out Of Memory (OOM) issues, since memory usage tends to increase quadratically with audio length, consider the following three scenarios:
+
+- a) At the beginning of inference, memory usage primarily depends on `batch_size_s`. Appropriately reducing this value can decrease memory usage.
+- b) During the middle of inference, if you encounter long audio segments cut by VAD whose total token count is less than `batch_size_s` and still face OOM, you can appropriately reduce `batch_size_threshold_s`. If the threshold is exceeded, the batch size is forced to 1.
+- c) Towards the end of inference, if long audio segments cut by VAD have a total token count less than `batch_size_s` and exceed the threshold `batch_size_threshold_s`, so that the batch size is forced to 1 but OOM still occurs, you may reduce `max_single_segment_time` to shorten the VAD audio segment length.
+
+#### Speech Recognition (Streaming)
+```python
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=9600`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
+
+#### Voice Activity Detection (Non-Streaming)
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad")
+wav_file = f"{model.model_path}/example/vad_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+Note: The output format of the VAD model is: `[[beg1, end1], [beg2, end2], ..., [begN, endN]]`, where `begN/endN` indicates the starting/ending point of the `N-th` valid audio segment, measured in milliseconds.
+
+#### Voice Activity Detection (Streaming)
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+Note: The output format for the streaming VAD model can be one of four scenarios:
+- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`：The same as the offline VAD output result mentioned above.
+- `[[beg, -1]]`：Indicates that only a starting point has been detected.
+- `[[-1, end]]`：Indicates that only an ending point has been detected.
+- `[]`：Indicates that neither a starting point nor an ending point has been detected. 
+
+The output is measured in milliseconds and represents the absolute time from the starting point.
+#### Punctuation Restoration
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc")
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+#### Timestamp Prediction
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh")
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+
+For more examples, refer to the [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining).
+
+<a name="Training"></a>
+## Model Training and Testing
+
+### Quick Start
+
+Execute via command line (for quick testing, not recommended):
+```shell
+funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
+```
+
+Execute with Python code (supports multi-node and multi-GPU, recommended):
+
+```shell
+cd examples/industrial_data_pretraining/paraformer
+bash finetune.sh
+# "log_file: ./outputs/log.txt"
+```
+For the full script, refer to [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh).
+
+### Detailed Parameter Description:
+
+```shell
+funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
+```
+
+- `model`（str）: The name of the model (the ID in the model repository), at which point the script will automatically download the model to local storage; alternatively, the path to a model already downloaded locally.
+- `train_data_set_list`（str）: The path to the training data, typically in jsonl format, for specific details refer to [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
+- `valid_data_set_list`（str）：The path to the validation data, also generally in jsonl format, for specific details refer to [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
+- `dataset_conf.batch_type`（str）：example (default), the type of batch. example means batches are formed with a fixed number of batch_size samples; length or token means dynamic batching, with total length or number of tokens of the batch equalling batch_size.
+- `dataset_conf.batch_size`（int）：Used in conjunction with batch_type. When batch_type=example, it represents the number of samples; when batch_type=length, it represents the length of the samples, measured in fbank frames (1 frame = 10 ms) or the number of text tokens.
+- `train_conf.max_epoch`（int）：The total number of epochs for training.
+- `train_conf.log_interval`（int）：The number of steps between logging.
+- `train_conf.resume`（bool）：Whether to enable checkpoint resuming for training.
+- `train_conf.validate_interval`（int）：The interval in steps to run validation tests during training.
+- `train_conf.save_checkpoint_interval`（int）：The interval in steps for saving the model during training.
+- `train_conf.keep_nbest_models`（int）：The maximum number of model parameters to retain, sorted by validation set accuracy, from highest to lowest.
+- `train_conf.avg_nbest_model`（int）：Average over the top n models with the highest accuracy.
+- `optim_conf.lr`（float）：The learning rate.
+- `output_dir`（str）：The path for saving the model.
+- `**kwargs`(dict): Any parameters in config.yaml can be specified directly here, for example, to filter out audio longer than 20s: dataset_conf.max_token_length=2000, measured in fbank frames (1 frame = 10 ms) or the number of text tokens.
+
+#### Multi-GPU Training
+##### Single-Machine Multi-GPU Training
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
+../../../funasr/bin/train.py ${train_args}
+```
+--nnodes represents the total number of participating nodes, while --nproc_per_node indicates the number of processes running on each node.
+
+##### Multi-Machine Multi-GPU Training
+
+On the master node, assuming the IP is 192.168.1.1 and the port is 12345, and you're using 2 GPUs, you would run the following command:
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr=192.168.1.1 --master_port=12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+On the worker node (assuming the IP is 192.168.1.2), you need to ensure that the master address and master port (`--master_addr` / `--master_port`, i.e. MASTER_ADDR and MASTER_PORT) match those of the master node, and then run the same command with `--node_rank` set to 1:
+
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr=192.168.1.1 --master_port=12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+
+--nnodes indicates the total number of nodes participating in the training, --node_rank represents the ID of the current node, and --nproc_per_node specifies the number of processes running on each node (usually corresponds to the number of GPUs).
+
+#### Data prepare
+
+For the `jsonl` format, refer to the [demo](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
+The `scp2jsonl` command can be used to generate the jsonl file from wav.scp and text.txt. The preparation process for wav.scp and text.txt is as follows:
+
+`train_text.txt`
+
+```bash
+ID0012W0013 当客户风险承受能力评估依据发生变化时
+ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
+ID0012W0015 he tried to think how it could be
+```
+
+
+`train_wav.scp`
+
+
+```bash
+BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
+BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
+ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
+```
+
+`Command`
+
+```shell
+# generate train.jsonl and val.jsonl from wav.scp and text.txt
+scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
+```
+
+(Optional, not required) If you need to parse from jsonl back to wav.scp and text.txt, you can use the following command:
+
+```shell
+# generate wav.scp and text.txt from train.jsonl and val.jsonl
+jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
+```
+
+#### Training log
+
+##### log.txt
+```shell
+tail log.txt
+[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
+[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
+```
+
+
+- `rank`: the GPU ID.
+- `epoch`,`step`,`total step`：the current epoch, step, and total steps.
+- `loss_avg_rank`：the average loss across all GPUs for the current step.
+- `loss/ppl/acc_avg_epoch`：the overall average loss/perplexity/accuracy for the current epoch, up to the current step count. The last step of the epoch when it ends represents the total average loss/perplexity/accuracy for that epoch; it is recommended to use the accuracy metric.
+- `lr`：the learning rate for the current step.
+- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`：the specific data for the current GPU ID.
+- `total_time`：the total time taken for a single step.
+- `GPU, memory`：the model-used/peak memory and the model+cache-used/peak memory.
+
+##### tensorboard
+```bash
+tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
+```
+http://localhost:6006/
+
+### Model Testing After Training
+
+
+#### With `configuration.json` file
+
+Assuming the training model path is: ./model_dir, if a configuration.json file has been generated in this directory, you only need to change the model name to the model path in the above model inference method. 
+
+For example, for shell inference:
+```shell
+python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
+```
+
+Python inference
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="./model_dir")
+
+res = model.generate(input=wav_file)
+print(res)
+```
+
+#### Without `configuration.json` file
+
+If there is no configuration.json in the model path, you need to manually specify the exact configuration file path and the model path.
+
+```shell
+python -m funasr.bin.inference \
+--config-path "${local_path}" \
+--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
+```
+
+Parameter Introduction
+- `config-path`：This is the path to the config.yaml saved during the experiment, which can be found in the experiment's output directory.
+- `config-name`：The name of the configuration file, usually config.yaml. It supports both YAML and JSON formats, for example config.json.
+- `init_param`：The model parameters that need to be tested, usually model.pt. You can choose a specific model file as needed.
+- `tokenizer_conf.token_list`：The path to the vocabulary file, which is normally specified in config.yaml. There is no need to manually specify it again unless the path in config.yaml is incorrect, in which case the correct path must be manually specified here.
+- `frontend_conf.cmvn_file`：The CMVN (Cepstral Mean and Variance Normalization) file used when extracting fbank features from WAV files, which is usually specified in config.yaml. There is no need to manually specify it again unless the path in config.yaml is incorrect, in which case the correct path must be manually specified here.
+
+Other parameters are the same as mentioned above. A complete [example](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh) can be found here.
+
+<a name="Export"></a>
+## Export ONNX
+
+### Command-line usage
+```shell
+funasr-export ++model=paraformer ++quantize=false ++device=cpu
+```
+
+### Python
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer", device="cpu")
+
+res = model.export(quantize=False)
+```
+
+### Test ONNX
+```python
+# pip3 install -U funasr-onnx
+from funasr_onnx import Paraformer
+model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1, quantize=True)
+
+wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+result = model(wav_path)
+print(result)
+```
+
+For more examples, refer to the [demo](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime).
--- a/modules/python/vendors/FunASR/docs/tutorial/README_zh.md
+++ b/modules/python/vendors/FunASR/docs/tutorial/README_zh.md
@@ -0,0 +1,436 @@
+(简体中文|[English](./README.md))
+
+FunASR开源了大量在工业数据上预训练模型，您可以在 [模型许可协议](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)下自由使用、复制、修改和分享FunASR模型，下面列举代表性的模型，更多模型请参考 [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)。
+
+<div align="center">  
+<h4>
+ <a href="#模型推理"> 模型推理 </a>   
+｜<a href="#模型训练与测试"> 模型训练与测试 </a>
+｜<a href="#模型导出与测试"> 模型导出与测试 </a>
+</h4>
+</div>
+
+<a name="模型推理"></a>
+## 模型推理
+
+### 快速使用
+
+命令行方式调用：
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
+```
+
+python代码调用（推荐）
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer-zh")
+
+res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
+print(res)
+```
+
+### 接口说明
+
+#### AutoModel 定义
+```python
+model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
+```
+- `model`(str): [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) 中的模型名称，或本地磁盘中的模型路径
+- `device`(str): `cuda:0`（默认gpu0），指定使用 GPU 进行推理。如果为`cpu`，则使用 CPU 进行推理
+- `ncpu`(int): `4` （默认），设置用于 CPU 内部操作并行性的线程数
+- `output_dir`(str): `None` （默认），如果设置，输出结果的输出路径
+- `batch_size`(int): `1` （默认），解码时的批处理，样本个数
+- `hub`(str)：`ms`（默认），从modelscope下载模型。如果为`hf`，从huggingface下载模型。
+- `**kwargs`(dict): 所有在`config.yaml`中参数，均可以直接在此处指定，例如，vad模型中最大切割长度 `max_single_segment_time=6000` （毫秒）。
+
+#### AutoModel 推理
+```python
+res = model.generate(input=[str], output_dir=[str])
+```
+- `input`: 要解码的输入，可以是：
+  - wav文件路径, 例如: asr_example.wav
+  - pcm文件路径, 例如: asr_example.pcm，此时需要指定音频采样率fs（默认为16000）
+  - 音频字节数流，例如：麦克风的字节数数据
+  - wav.scp，kaldi 样式的 wav 列表 (`wav_id \t wav_path`), 例如:
+  ```text
+  asr_example1  ./audios/asr_example1.wav
+  asr_example2  ./audios/asr_example2.wav
+  ```
+  在这种输入 `wav.scp` 的情况下，必须设置 `output_dir` 以保存输出结果
+  - 音频采样点，例如：`audio, rate = soundfile.read("asr_example_zh.wav")`, 数据类型为 numpy.ndarray。支持batch输入，类型为list：
+  ```[audio_sample1, audio_sample2, ..., audio_sampleN]```
+  - fbank输入，支持组batch。shape为[batch, frames, dim]，类型为torch.Tensor
+- `output_dir`: None （默认），如果设置，输出结果的输出路径
+- `**kwargs`(dict): 与模型相关的推理参数，例如，`beam_size=10`，`decoding_ctc_weight=0.1`。
+
+
+### 更多用法介绍
+
+
+#### 非实时语音识别
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh",  
+                  vad_model="fsmn-vad", 
+                  vad_kwargs={"max_single_segment_time": 60000},
+                  punc_model="ct-punc", 
+                  # spk_model="cam++"
+                  )
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
+print(res)
+```
+注意：
+- 通常模型输入限制时长30s以下，组合`vad_model`后，支持任意时长音频输入，不局限于paraformer模型，所有音频输入模型均可以。
+- `model`相关的参数可以直接在`AutoModel`定义中直接指定；与`vad_model`相关参数可以通过`vad_kwargs`来指定，类型为dict；类似的有`punc_kwargs`，`spk_kwargs`；
+- `max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms.
+- `batch_size_s` 表示采用动态batch，batch中总音频时长，单位为秒s。
+- `batch_size_threshold_s`: 表示`vad_model`切割后音频片段时长超过 `batch_size_threshold_s`阈值时，将batch_size数设置为1, 单位为秒s.
+
+建议：当您输入为长音频，遇到OOM问题时，因为显存占用与音频时长呈平方关系增加，分为3种情况：
+- a)推理起始阶段，显存主要取决于`batch_size_s`，适当减小该值，可以减少显存占用；
+- b)推理中间阶段，遇到VAD切割的长音频片段，总token数小于`batch_size_s`，仍然出现OOM，可以适当减小`batch_size_threshold_s`，超过阈值，强制batch为1; 
+- c)推理快结束阶段，遇到VAD切割的长音频片段，总token数小于`batch_size_s`，且超过阈值`batch_size_threshold_s`，强制batch为1，仍然出现OOM，可以适当减小`max_single_segment_time`，使得VAD切割音频时长变短。
+
+#### 实时语音识别
+
+```python
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+
+注：`chunk_size`为流式延时配置，`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`，未来信息为`5*60=300ms`。每次推理输入为`600ms`（采样点数为`16000*0.6=9600`），输出为对应文字，最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
+
+#### 语音端点检测（非实时）
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad")
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+注：VAD模型输出格式为：`[[beg1, end1], [beg2, end2], .., [begN, endN]]`，其中`begN/endN`表示第`N`个有效音频片段的起始点/结束点，
+单位为毫秒。
+
+#### 语音端点检测（实时）
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+注：流式VAD模型输出格式为4种情况：
+- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`：同上离线VAD输出结果。
+- `[[beg, -1]]`：表示只检测到起始点。
+- `[[-1, end]]`：表示只检测到结束点。
+- `[]`：表示既没有检测到起始点，也没有检测到结束点
+输出结果单位为毫秒，从起始点开始的绝对时间。
+
+#### 标点恢复
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc")
+
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+
+#### 时间戳预测
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+更多（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）
+
+<a name="模型训练与测试"></a>
+## 模型训练与测试
+
+### 快速开始
+
+命令行执行（用于快速测试，不推荐）：
+```shell
+funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
+```
+
+python代码执行（可以多机多卡，推荐）
+
+```shell
+cd examples/industrial_data_pretraining/paraformer
+bash finetune.sh
+# "log_file: ./outputs/log.txt"
+```
+详细完整的脚本参考 [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
+
+### 详细参数介绍
+
+```shell
+funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
+```
+
+- `model`（str）：模型名字（模型仓库中的ID），此时脚本会自动下载模型到本地；或者本地已经下载好的模型路径。
+- `train_data_set_list`（str）：训练数据路径，默认为jsonl格式，具体参考（[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list)）。
+- `valid_data_set_list`（str）：验证数据路径，默认为jsonl格式，具体参考（[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list)）。
+- `dataset_conf.batch_type`（str）：`example`（默认），batch的类型。`example`表示按照固定数目batch_size个样本组batch；`length` or `token` 表示动态组batch，batch总长度或者token数为batch_size。
+- `dataset_conf.batch_size`（int）：与 `batch_type` 搭配使用，当 `batch_type=example` 时，表示样本个数；当 `batch_type=length` 时，表示样本中长度，单位为fbank帧数（1帧10ms）或者文字token个数。
+- `train_conf.max_epoch`（int）：`100`（默认），训练总epoch数。
+- `train_conf.log_interval`（int）：`50`（默认），打印日志间隔step数。
+- `train_conf.resume`（bool）：`True`（默认），是否开启断点重训。
+- `train_conf.validate_interval`（int）：`5000`（默认），训练中做验证测试的间隔step数。
+- `train_conf.save_checkpoint_interval`（int）：`5000`（默认），训练中模型保存间隔step数。
+- `train_conf.avg_keep_nbest_models_type`（str）：`acc`（默认），保留nbest的标准为acc（越大越好）。`loss`表示，保留nbest的标准为loss（越小越好）。
+- `train_conf.keep_nbest_models`（int）：`500`（默认），保留最大多少个模型参数，配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 保留最佳的n个模型，其他删除，节约存储空间。
+- `train_conf.avg_nbest_model`（int）：`10`（默认），参与平均的模型个数，配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 对最佳的n个模型参数取平均。
+- `train_conf.accum_grad`（int）：`1`（默认），梯度累积功能。
+- `train_conf.grad_clip`（float）：`10.0`（默认），梯度截断功能。
+- `train_conf.use_fp16`（bool）：`False`（默认），开启fp16训练，加快训练速度。
+- `optim_conf.lr`（float）：学习率。
+- `output_dir`（str）：模型保存路径。
+- `**kwargs`(dict): 所有在`config.yaml`中参数，均可以直接在此处指定，例如，过滤20s以上长音频：`dataset_conf.max_token_length=2000`，单位为音频fbank帧数（1帧10ms）或者文字token个数。
+
+#### 多gpu训练
+##### 单机多gpu训练
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
+../../../funasr/bin/train.py ${train_args}
+```
+--nnodes 表示参与的节点总数，--nproc_per_node 表示每个节点上运行的进程数
+
+##### 多机多gpu训练
+
+在主节点上，假设IP为192.168.1.1，端口为12345，使用的是2个GPU，则运行如下命令：
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+在从节点上（假设IP为192.168.1.2），你需要确保主节点地址与端口（`--master_addr` / `--master_port`，即MASTER_ADDR和MASTER_PORT）与主节点设置的一致，并将 `--node_rank` 设置为1后运行同样的命令：
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+
+--nnodes 表示参与的节点总数，--node_rank 表示当前节点id，--nproc_per_node 表示每个节点上运行的进程数（通常为gpu个数）
+
+#### 准备数据
+
+`jsonl`格式可以参考（[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list)）。
+可以用指令 `scp2jsonl` 从wav.scp与text.txt生成。wav.scp与text.txt准备过程如下：
+
+`train_text.txt`
+
+左边为数据唯一ID，需与`train_wav.scp`中的`ID`一一对应
+右边为音频文件标注文本，格式如下：
+
+```bash
+ID0012W0013 当客户风险承受能力评估依据发生变化时
+ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
+ID0012W0015 he tried to think how it could be
+```
+
+
+`train_wav.scp`
+
+左边为数据唯一ID，需与`train_text.txt`中的`ID`一一对应
+右边为音频文件的路径，格式如下
+
+```bash
+BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
+BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
+ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
+```
+
+`生成指令`
+
+```shell
+# generate train.jsonl and val.jsonl from wav.scp and text.txt
+scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
+```
+
+（可选，非必需）如果需要从jsonl解析成wav.scp与text.txt，可以使用指令：
+
+```shell
+# generate wav.scp and text.txt from train.jsonl and val.jsonl
+jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
+```
+
+#### 查看训练日志
+
+##### 查看实验log
+```shell
+tail log.txt
+[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
+[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
+```
+指标解释：
+- `rank`：表示gpu id。
+- `epoch`,`step`,`total step`：表示当前epoch，step，总step。
+- `loss_avg_rank`：表示当前step，所有gpu平均loss。
+- `loss/ppl/acc_avg_epoch`：表示当前epoch周期，截止当前step数时，总平均loss/ppl/acc。epoch结束时的最后一个step表示epoch总平均loss/ppl/acc，推荐使用acc指标。
+- `lr`：当前step的学习率。
+- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`：表示当前gpu id的具体数据。
+- `total_time`：表示单个step总耗时。
+- `GPU, memory`：分别表示，模型使用/峰值显存，模型+缓存使用/峰值显存。
+
+##### tensorboard可视化
+```bash
+tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
+```
+浏览器中打开：http://localhost:6006/
+
+### 训练后模型测试
+
+
+#### 有configuration.json
+
+假定，训练模型路径为：./model_dir，如果该目录下有生成configuration.json，只需要将 [上述模型推理方法](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/README_zh.md#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) 中模型名字修改为模型路径即可
+
+例如：
+
+从shell推理
+```shell
+python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
+```
+从python推理
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="./model_dir")
+
+res = model.generate(input=wav_file)
+print(res)
+```
+
+#### 无configuration.json时
+
+如果模型路径中无configuration.json时，需要手动指定具体配置文件路径与模型路径
+
+```shell
+python -m funasr.bin.inference \
+--config-path "${local_path}" \
+--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
+```
+
+参数介绍
+- `config-path`：为实验中保存的 `config.yaml` 所在的目录，可以从实验输出目录中查找。
+- `config-name`：配置文件名，一般为 `config.yaml`，支持yaml格式与json格式，例如 `config.json`
+- `init_param`：需要测试的模型参数，一般为`model.pt`，可以自己选择具体的模型文件
+- `tokenizer_conf.token_list`：词表文件路径，一般在 `config.yaml` 有指定，无需再手动指定，当 `config.yaml` 中路径不正确时，需要在此处手动指定。
+- `frontend_conf.cmvn_file`：wav提取fbank中用到的cmvn文件，一般在 `config.yaml` 有指定，无需再手动指定，当 `config.yaml` 中路径不正确时，需要在此处手动指定。
+
+其他参数同上，完整 [示例](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh)
+
+
+<a name="模型导出与测试"></a>
+## 模型导出与测试
+### 从命令行导出
+```shell
+funasr-export ++model=paraformer ++quantize=false
+```
+
+### 从Python导出
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer")
+
+res = model.export(quantize=False)
+```
+
+### 测试ONNX
+```python
+# pip3 install -U funasr-onnx
+from funasr_onnx import Paraformer
+model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1, quantize=True)
+
+wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+result = model(wav_path)
+print(result)
+```
+
+更多例子请参考 [样例](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)
--- a/modules/python/vendors/FunASR/examples/README.md
+++ b/modules/python/vendors/FunASR/examples/README.md
@@ -0,0 +1,461 @@
+([简体中文](./README_zh.md)|English)
+
+FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE). Below, we list some representative models. For a comprehensive list, please refer to our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo).
+
+<div align="center">  
+<h4>
+ <a href="#Inference"> Model Inference </a>   
+｜<a href="#Training"> Model Training and Testing </a>
+｜<a href="#Export"> Model Export and Testing </a>
+</h4>
+</div>
+
+<a name="Inference"></a>
+## Model Inference
+
+### Quick Start
+
+For command-line invocation:
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
+```
+
+For python code invocation (recommended): 
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer-zh")
+
+res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
+print(res)
+```
+
+### API Description 
+#### AutoModel Definition
+```python
+model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
+```
+- `model`(str): model name in the [Model Repository](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo), or a model path on local disk.
+- `device`(str): `cuda:0` (default gpu0) for using GPU for inference, specify `cpu` for using CPU.
+- `ncpu`(int): `4` (default), sets the number of threads for CPU internal operations.
+- `output_dir`(str): `None` (default), set this to specify the output path for the results.
+- `batch_size`(int): `1` (default), the number of samples per batch during decoding.
+- `hub`(str)：`ms` (default) to download models from ModelScope. Use `hf` to download models from Hugging Face.
+- `**kwargs`(dict): Any parameters found in config.yaml can be directly specified here, for instance, the maximum segmentation length in the vad model max_single_segment_time=6000 (milliseconds).
+
+#### AutoModel Inference
+```python
+res = model.generate(input=[str], output_dir=[str])
+```
+- `input`: The input to be decoded, which could be:
+  - A wav file path, e.g., asr_example.wav
+  - A pcm file path, e.g., asr_example.pcm, in this case, specify the audio sampling rate fs (default is 16000)
+  - An audio byte stream, e.g., byte data from a microphone
+  - A wav.scp, a Kaldi-style wav list (wav_id \t wav_path), for example:
+  ```text
+  asr_example1  ./audios/asr_example1.wav
+  asr_example2  ./audios/asr_example2.wav
+  ```
+  When using wav.scp as input, you must set output_dir to save the output results.
+  - Audio samples, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, data type is numpy.ndarray. Supports batch inputs, type is list：
+  ```[audio_sample1, audio_sample2, ..., audio_sampleN]```
+  - fbank input, supports batch grouping. Shape is [batch, frames, dim], type is torch.Tensor.
+- `output_dir`: None (default), if set, specifies the output path for the results.
+- `**kwargs`(dict): Inference parameters related to the model, for example,`beam_size=10`，`decoding_ctc_weight=0.1`.
+
+
+### More Usage Introduction
+
+
+#### Speech Recognition (Non-streaming)
+##### SenseVoice
+```python
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+model_dir = "iic/SenseVoiceSmall"
+
+model = AutoModel(
+    model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cuda:0",
+)
+
+# en
+res = model.generate(
+    input=f"{model.model_path}/example/en.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+```
+Notes:
+- `model_dir`: The name of the model, or the path to the model on the local disk.
+- `vad_model`: This indicates the activation of VAD (Voice Activity Detection). The purpose of VAD is to split long audio into shorter clips. In this case, the inference time includes both VAD and SenseVoice total consumption, and represents the end-to-end latency. If you wish to test the SenseVoice model's inference time separately, the VAD model can be disabled.
+- `vad_kwargs`: Specifies the configurations for the VAD model. `max_single_segment_time`: denotes the maximum duration for audio segmentation by the `vad_model`, with the unit being milliseconds (ms).
+- `use_itn`: Whether the output result includes punctuation and inverse text normalization.
+- `batch_size_s`: Indicates the use of dynamic batching, where the total duration of audio in the batch is measured in seconds (s).
+- `merge_vad`: Whether to merge short audio fragments segmented by the VAD model, with the merged length being `merge_length_s`, in seconds (s).
+- `ban_emo_unk`: Whether to ban the output of the `emo_unk` token.
+
+##### Paraformer
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh",  
+                  vad_model="fsmn-vad", 
+                  vad_kwargs={"max_single_segment_time": 60000},
+                  punc_model="ct-punc", 
+                  # spk_model="cam++"
+                  )
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
+print(res)
+```
+Notes:
+- Typically, the input duration for models is limited to under 30 seconds. However, when combined with `vad_model`, support for audio input of any length is enabled, not limited to the paraformer model—any audio input model can be used.
+- Parameters related to model can be directly specified in the definition of AutoModel; parameters related to `vad_model` can be set through `vad_kwargs`, which is a dict; similar parameters include `punc_kwargs` and `spk_kwargs`.
+- `max_single_segment_time`: Denotes the maximum audio segmentation length for `vad_model`, measured in milliseconds (ms).
+- `batch_size_s` represents the use of dynamic batching, where the total audio duration within a batch is measured in seconds (s).
+- `batch_size_threshold_s`: Indicates that when the duration of an audio segment post-VAD segmentation exceeds the batch_size_threshold_s threshold, the batch size is set to 1, measured in seconds (s).
+
+Recommendations: 
+
+When you input long audio and encounter Out Of Memory (OOM) issues, since memory usage tends to increase quadratically with audio length, consider the following three scenarios:
+
+a) At the beginning of inference, memory usage primarily depends on `batch_size_s`. Appropriately reducing this value can decrease memory usage.
+b) During the middle of inference, when encountering long audio segments cut by VAD and the total token count is less than `batch_size_s`, yet still facing OOM, you can appropriately reduce `batch_size_threshold_s`. If the threshold is exceeded, the batch size is forced to 1.
+c) Towards the end of inference, if long audio segments cut by VAD have a total token count less than `batch_size_s` and exceed the threshold `batch_size_threshold_s`, so that the batch size is already forced to 1 and you still face OOM, you may reduce `max_single_segment_time` to shorten the VAD audio segment length.
+
+#### Speech Recognition (Streaming)
+```python
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=9600`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
+
+#### Voice Activity Detection (Non-Streaming)
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad")
+wav_file = f"{model.model_path}/example/vad_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+Note: The output format of the VAD model is: `[[beg1, end1], [beg2, end2], ..., [begN, endN]]`, where `begN/endN` indicates the starting/ending point of the `N-th` valid audio segment, measured in milliseconds.
+
+#### Voice Activity Detection (Streaming)
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+Note: The output format for the streaming VAD model can be one of four scenarios:
+- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`：The same as the offline VAD output result mentioned above.
+- `[[beg, -1]]`：Indicates that only a starting point has been detected.
+- `[[-1, end]]`：Indicates that only an ending point has been detected.
+- `[]`：Indicates that neither a starting point nor an ending point has been detected. 
+
+The output is measured in milliseconds and represents the absolute time from the starting point.
+#### Punctuation Restoration
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc")
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+#### Timestamp Prediction
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh")
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+
+More examples ref to [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)
+
+<a name="Training"></a>
+## Model Training and Testing
+
+### Quick Start
+
+Execute via command line (for quick testing, not recommended):
+```shell
+funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
+```
+
+Execute with Python code (supports multi-node and multi-GPU, recommended):
+
+```shell
+cd examples/industrial_data_pretraining/paraformer
+bash finetune.sh
+# "log_file: ./outputs/log.txt"
+```
+Full code ref to [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
+
+### Detailed Parameter Description:
+
+```shell
+funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
+```
+
+- `model`（str）: The name of the model (the ID in the model repository), at which point the script will automatically download the model to local storage; alternatively, the path to a model already downloaded locally.
+- `train_data_set_list`（str）: The path to the training data, typically in jsonl format, for specific details refer to [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
+- `valid_data_set_list`（str）：The path to the validation data, also generally in jsonl format, for specific details refer to [examples](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list).
+- `dataset_conf.batch_type`（str）：example (default), the type of batch. example means batches are formed with a fixed number of batch_size samples; length or token means dynamic batching, with total length or number of tokens of the batch equalling batch_size.
+- `dataset_conf.batch_size`（int）：Used in conjunction with batch_type. When batch_type=example, it represents the number of samples; when batch_type=length, it represents the length of the samples, measured in fbank frames (1 frame = 10 ms) or the number of text tokens.
+- `train_conf.max_epoch`（int）：The total number of epochs for training.
+- `train_conf.log_interval`（int）：The number of steps between logging.
+- `train_conf.resume`（bool）：Whether to enable checkpoint resuming for training.
+- `train_conf.validate_interval`（int）：The interval in steps to run validation tests during training.
+- `train_conf.save_checkpoint_interval`（int）：The interval in steps for saving the model during training.
+- `train_conf.keep_nbest_models`（int）：The maximum number of model parameters to retain, sorted by validation set accuracy, from highest to lowest.
+- `train_conf.avg_nbest_model`（int）：Average over the top n models with the highest accuracy.
+- `optim_conf.lr`（float）：The learning rate.
+- `output_dir`（str）：The path for saving the model.
+- `**kwargs`(dict): Any parameters in config.yaml can be specified directly here, for example, to filter out audio longer than 20s: dataset_conf.max_token_length=2000, measured in fbank frames (1 frame = 10 ms) or the number of text tokens.
+
+#### Multi-GPU Training
+##### Single-Machine Multi-GPU Training
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 1 --nproc_per_node ${gpu_num} --master_port 12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+--nnodes represents the total number of participating nodes, while --nproc_per_node indicates the number of processes running on each node. --master_port sets the master port (12345 in this example).
+
+##### Multi-Machine Multi-GPU Training
+
+On the master node, assuming the IP is 192.168.1.1 and the port is 12345, and you're using 2 GPUs, you would run the following command:
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+On the worker node (assuming the IP is 192.168.1.2), you need to ensure that `--master_addr` and `--master_port` match those of the master node, and then run the same command with `--node_rank` changed to 1:
+
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+
+--nnodes indicates the total number of nodes participating in the training, --node_rank represents the ID of the current node, and --nproc_per_node specifies the number of processes running on each node (usually corresponds to the number of GPUs).
+
+#### Data prepare
+
+`jsonl` ref to（[demo](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list)）.
+The instruction scp2jsonl can be used to generate from wav.scp and text.txt. The preparation process for wav.scp and text.txt is as follows:
+
+`train_text.txt`
+
+```bash
+ID0012W0013 当客户风险承受能力评估依据发生变化时
+ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
+ID0012W0015 he tried to think how it could be
+```
+
+
+`train_wav.scp`
+
+
+```bash
+BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
+BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
+ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
+```
+
+`Command`
+
+```shell
+# generate train.jsonl and val.jsonl from wav.scp and text.txt
+scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
+```
+
+(Optional, not required) If you need to parse from jsonl back to wav.scp and text.txt, you can use the following command:
+
+```shell
+# generate wav.scp and text.txt from train.jsonl and val.jsonl
+jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
+```
+
+#### Training log
+
+##### log.txt
+```shell
+tail log.txt
+[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
+[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
+```
+
+
+- `rank`：gpu id。
+- `epoch`,`step`,`total step`：the current epoch, step, and total steps.
+- `loss_avg_rank`：the average loss across all GPUs for the current step.
+- `loss/ppl/acc_avg_epoch`：the overall average loss/perplexity/accuracy for the current epoch, up to the current step count. The last step of the epoch when it ends represents the total average loss/perplexity/accuracy for that epoch; it is recommended to use the accuracy metric.
+- `lr`：the learning rate for the current step.
+- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`：the specific data for the current GPU ID.
+- `total_time`：the total time taken for a single step.
+- `GPU, memory`：the model-used/peak memory and the model+cache-used/peak memory.
+
+##### tensorboard
+```bash
+tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
+```
+http://localhost:6006/
+
+### Model Testing After Training
+
+
+#### With `configuration.json` file
+
+Assuming the training model path is: ./model_dir, if a configuration.json file has been generated in this directory, you only need to change the model name to the model path in the above model inference method. 
+
+For example, for shell inference:
+```shell
+python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
+```
+
+Python inference
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="./model_dir")
+
+res = model.generate(input=wav_file)
+print(res)
+```
+
+#### Without `configuration.json` file
+
+If there is no configuration.json in the model path, you need to manually specify the exact configuration file path and the model path.
+
+```shell
+python -m funasr.bin.inference \
+--config-path "${local_path}" \
+--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
+```
+
+Parameter Introduction
+- `config-path`：This is the path to the directory containing the config.yaml saved during the experiment, which can be found in the experiment's output directory.
+- `config-name`：The name of the configuration file, usually config.yaml. It supports both YAML and JSON formats, for example config.json.
+- `init_param`：The model parameters that need to be tested, usually model.pt. You can choose a specific model file as needed.
+- `tokenizer_conf.token_list`：The path to the vocabulary file, which is normally specified in config.yaml. There is no need to manually specify it again unless the path in config.yaml is incorrect, in which case the correct path must be manually specified here.
+- `frontend_conf.cmvn_file`：The CMVN (Cepstral Mean and Variance Normalization) file used when extracting fbank features from WAV files, which is usually specified in config.yaml. There is no need to manually specify it again unless the path in config.yaml is incorrect, in which case the correct path must be manually specified here.
+
+Other parameters are the same as mentioned above. A complete [example](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh) can be found here.
+
+<a name="Export"></a>
+## Export ONNX
+
+### Command-line usage
+```shell
+funasr-export ++model=paraformer ++quantize=false ++device=cpu
+```
+
+### Python
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer", device="cpu")
+
+res = model.export(quantize=False)
+```
+
+### Test ONNX
+```python
+# pip3 install -U funasr-onnx
+from funasr_onnx import Paraformer
+model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1, quantize=True)
+
+wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+result = model(wav_path)
+print(result)
+```
+
+More examples ref to [demo](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)
--- a/modules/python/vendors/FunASR/examples/README_zh.md
+++ b/modules/python/vendors/FunASR/examples/README_zh.md
@@ -0,0 +1,473 @@
+(简体中文|[English](./README.md))
+
+FunASR开源了大量在工业数据上预训练模型，您可以在 [模型许可协议](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)下自由使用、复制、修改和分享FunASR模型，下面列举代表性的模型，更多模型请参考 [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)。
+
+<div align="center">  
+<h4>
+ <a href="#模型推理"> 模型推理 </a>   
+｜<a href="#模型训练与测试"> 模型训练与测试 </a>
+｜<a href="#模型导出与测试"> 模型导出与测试 </a>
+</h4>
+</div>
+
+<a name="模型推理"></a>
+## 模型推理
+
+### 快速使用
+
+命令行方式调用：
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
+```
+
+python代码调用（推荐）
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer-zh")
+
+res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
+print(res)
+```
+
+### 接口说明
+
+#### AutoModel 定义
+```python
+model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
+```
+- `model`(str): [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) 中的模型名称，或本地磁盘中的模型路径
+- `device`(str): `cuda:0`（默认，即 gpu0），指定使用 GPU 进行推理。如果为`cpu`，则使用 CPU 进行推理
+- `ncpu`(int): `4` （默认），设置用于 CPU 内部操作并行性的线程数
+- `output_dir`(str): `None` （默认），如果设置，输出结果的输出路径
+- `batch_size`(int): `1` （默认），解码时的批处理，样本个数
+- `hub`(str)：`ms`（默认），从modelscope下载模型。如果为`hf`，从huggingface下载模型。
+- `**kwargs`(dict): 所有在`config.yaml`中参数，均可以直接在此处指定，例如，vad模型中最大切割长度 `max_single_segment_time=6000` （毫秒）。
+
+#### AutoModel 推理
+```python
+res = model.generate(input=[str], output_dir=[str])
+```
+- `input`: 要解码的输入，可以是：
+  - wav文件路径, 例如: asr_example.wav
+  - pcm文件路径, 例如: asr_example.pcm，此时需要指定音频采样率fs（默认为16000）
+  - 音频字节数流，例如：麦克风的字节数数据
+  - wav.scp，kaldi 样式的 wav 列表 (`wav_id \t wav_path`), 例如:
+  ```text
+  asr_example1  ./audios/asr_example1.wav
+  asr_example2  ./audios/asr_example2.wav
+  ```
+  在这种输入 `wav.scp` 的情况下，必须设置 `output_dir` 以保存输出结果
+  - 音频采样点，例如：`audio, rate = soundfile.read("asr_example_zh.wav")`, 数据类型为 numpy.ndarray。支持batch输入，类型为list：
+  ```[audio_sample1, audio_sample2, ..., audio_sampleN]```
+  - fbank输入，支持组batch。shape为[batch, frames, dim]，类型为torch.Tensor
+- `output_dir`: None （默认），如果设置，输出结果的输出路径
+- `**kwargs`(dict): 与模型相关的推理参数，例如，`beam_size=10`，`decoding_ctc_weight=0.1`。
+
+
+### 更多用法介绍
+
+
+#### 非实时语音识别
+##### SenseVoice
+```python
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+# Model name, or a path to a model on local disk.
+model_dir = "iic/SenseVoiceSmall"
+
+# Passing vad_model enables VAD, which cuts long audio into short segments;
+# max_single_segment_time is the longest segment the VAD may produce, in milliseconds.
+model = AutoModel(
+    model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cuda:0",
+)
+
+# en
+res = model.generate(
+    input=f"{model.model_path}/example/en.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,  # include punctuation and inverse text normalization in the output
+    batch_size_s=60,  # dynamic batching: total audio duration per batch, in seconds
+    merge_vad=True,  # merge the short fragments cut by the VAD model
+    merge_length_s=15,  # length of a merged fragment, in seconds
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+```
+参数说明：
+- `model_dir`：模型名称，或本地磁盘中的模型路径。
+- `vad_model`：表示开启VAD，VAD的作用是将长音频切割成短音频，此时推理耗时包括了VAD与SenseVoice总耗时，为链路耗时，如果需要单独测试SenseVoice模型耗时，可以关闭VAD模型。
+- `vad_kwargs`：表示VAD模型配置,`max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms。
+- `use_itn`：输出结果中是否包含标点与逆文本正则化。
+- `batch_size_s` 表示采用动态batch，batch中总音频时长，单位为秒s。
+- `merge_vad`：是否将 vad 模型切割的短音频碎片合成，合并后长度为`merge_length_s`，单位为秒s。
+- `ban_emo_unk`：禁用emo_unk标签，禁用后所有的句子都会被赋与情感标签。
+
+##### Paraformer
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh",  
+                  vad_model="fsmn-vad", 
+                  vad_kwargs={"max_single_segment_time": 60000},
+                  punc_model="ct-punc", 
+                  # spk_model="cam++"
+                  )
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
+print(res)
+```
+注意：
+- 通常模型输入限制时长30s以下，组合`vad_model`后，支持任意时长音频输入，不局限于paraformer模型，所有音频输入模型均可以。
+- `model`相关的参数可以直接在`AutoModel`定义中直接指定；与`vad_model`相关参数可以通过`vad_kwargs`来指定，类型为dict；类似的有`punc_kwargs`，`spk_kwargs`；
+- `max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms.
+- `batch_size_s` 表示采用动态batch，batch中总音频时长，单位为秒s。
+- `batch_size_threshold_s`: 表示`vad_model`切割后音频片段时长超过 `batch_size_threshold_s`阈值时，将batch_size数设置为1, 单位为秒s.
+
+建议：当您输入为长音频，遇到OOM问题时，因为显存占用与音频时长呈平方关系增加，分为3种情况：
+- a)推理起始阶段，显存主要取决于`batch_size_s`，适当减小该值，可以减少显存占用；
+- b)推理中间阶段，遇到VAD切割的长音频片段，总token数小于`batch_size_s`，仍然出现OOM，可以适当减小`batch_size_threshold_s`，超过阈值，强制batch为1; 
+- c)推理快结束阶段，遇到VAD切割的长音频片段，总token数小于`batch_size_s`，且超过阈值`batch_size_threshold_s`，强制batch为1，仍然出现OOM，可以适当减小`max_single_segment_time`，使得VAD切割音频时长变短。
+
+#### 实时语音识别
+
+```python
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int(len((speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+
+注：`chunk_size`为流式延时配置，`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`，未来信息为`5*60=300ms`。每次推理输入为`600ms`（采样点数为`16000*0.6=9600`），输出为对应文字，最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
+
+#### 语音端点检测（非实时）
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad")
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+注：VAD模型输出格式为：`[[beg1, end1], [beg2, end2], .., [begN, endN]]`，其中`begN/endN`表示第`N`个有效音频片段的起始点/结束点，
+单位为毫秒。
+
+#### 语音端点检测（实时）
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int(len((speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+注：流式VAD模型输出格式为4种情况：
+- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`：同上离线VAD输出结果。
+- `[[beg, -1]]`：表示只检测到起始点。
+- `[[-1, end]]`：表示只检测到结束点。
+- `[]`：表示既没有检测到起始点，也没有检测到结束点
+输出结果单位为毫秒，从起始点开始的绝对时间。
+
+#### 标点恢复
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc")
+
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+
+#### 时间戳预测
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+更多（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）
+
+<a name="模型训练与测试"></a>
+## 模型训练与测试
+
+### 快速开始
+
+命令行执行（用于快速测试，不推荐）：
+```shell
+funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
+```
+
+python代码执行（可以多机多卡，推荐）
+
+```shell
+cd examples/industrial_data_pretraining/paraformer
+bash finetune.sh
+# "log_file: ./outputs/log.txt"
+```
+详细完整的脚本参考 [finetune.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/finetune.sh)
+
+### 详细参数介绍
+
+```shell
+funasr/bin/train.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=20000 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=false \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
+```
+
+- `model`（str）：模型名字（模型仓库中的ID），此时脚本会自动下载模型到本地；或者本地已经下载好的模型路径。
+- `train_data_set_list`（str）：训练数据路径，默认为jsonl格式，具体参考（[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list)）。
+- `valid_data_set_list`（str）：验证数据路径，默认为jsonl格式，具体参考（[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list)）。
+- `dataset_conf.batch_type`（str）：`example`（默认），batch的类型。`example`表示按照固定数目batch_size个样本组batch；`length` or `token` 表示动态组batch，batch总长度或者token数为batch_size。
+- `dataset_conf.batch_size`（int）：与 `batch_type` 搭配使用，当 `batch_type=example` 时，表示样本个数；当 `batch_type=length` 或 `token` 时，表示一个batch内样本的总长度，单位为fbank帧数（1帧10ms）或者文字token个数。
+- `train_conf.max_epoch`（int）：`100`（默认），训练总epoch数。
+- `train_conf.log_interval`（int）：`50`（默认），打印日志间隔step数。
+- `train_conf.resume`（bool）：`True`（默认），是否开启断点重训。
+- `train_conf.validate_interval`（int）：`5000`（默认），训练中做验证测试的间隔step数。
+- `train_conf.save_checkpoint_interval`（int）：`5000`（默认），训练中模型保存间隔step数。
+- `train_conf.avg_keep_nbest_models_type`（str）：`acc`（默认），保留nbest的标准为acc（越大越好）。`loss`表示，保留nbest的标准为loss（越小越好）。
+- `train_conf.keep_nbest_models`（int）：`500`（默认），保留最大多少个模型参数，配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 保留最佳的n个模型，其他删除，节约存储空间。
+- `train_conf.avg_nbest_model`（int）：`10`（默认），参与平均的模型个数，配合 `avg_keep_nbest_models_type` 按照验证集 acc/loss 对最佳的n个模型做参数平均。
+- `train_conf.accum_grad`（int）：`1`（默认），梯度累积功能。
+- `train_conf.grad_clip`（float）：`10.0`（默认），梯度截断功能。
+- `train_conf.use_fp16`（bool）：`False`（默认），开启fp16训练，加快训练速度。
+- `optim_conf.lr`（float）：学习率。
+- `output_dir`（str）：模型保存路径。
+- `**kwargs`(dict): 所有在`config.yaml`中参数，均可以直接在此处指定，例如，过滤20s以上长音频：`dataset_conf.max_token_length=2000`，单位为音频fbank帧数（1帧10ms）或者文字token个数。
+
+#### 多gpu训练
+##### 单机多gpu训练
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 1 --nproc_per_node ${gpu_num} --master_port 12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+--nnodes 表示参与的节点总数，--nproc_per_node 表示每个节点上运行的进程数，--master_port 表示端口号
+
+##### 多机多gpu训练
+
+在主节点上，假设IP为192.168.1.1，端口为12345，使用的是2个GPU，则运行如下命令：
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+在从节点上（假设IP为192.168.1.2），你需要确保MASTER_ADDR和MASTER_PORT环境变量与主节点设置的一致，并运行同样的命令：
+```shell
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+
+torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
+../../../funasr/bin/train.py ${train_args}
+```
+
+--nnodes 表示参与的节点总数，--node_rank 表示当前节点id，--nproc_per_node 表示每个节点上运行的进程数（通常为gpu个数），--master_port 表示端口号
+
+#### 准备数据
+
+`jsonl`格式可以参考（[例子](https://github.com/alibaba-damo-academy/FunASR/blob/main/data/list)）。
+可以用指令 `scp2jsonl` 从wav.scp与text.txt生成。wav.scp与text.txt准备过程如下：
+
+`train_text.txt`
+
+左边为数据唯一ID，需与`train_wav.scp`中的`ID`一一对应
+右边为音频文件标注文本，格式如下：
+
+```bash
+ID0012W0013 当客户风险承受能力评估依据发生变化时
+ID0012W0014 所有只要处理 data 不管你是做 machine learning 做 deep learning
+ID0012W0015 he tried to think how it could be
+```
+
+
+`train_wav.scp`
+
+左边为数据唯一ID，需与`train_text.txt`中的`ID`一一对应
+右边为音频文件的路径，格式如下
+
+```bash
+BAC009S0764W0121 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0764W0121.wav
+BAC009S0916W0489 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/BAC009S0916W0489.wav
+ID0012W0015 https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cn_en.wav
+```
+
+`生成指令`
+
+```shell
+# generate train.jsonl and val.jsonl from wav.scp and text.txt
+scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="../../../data/list/train.jsonl"
+```
+
+（可选，非必需）如果需要从jsonl解析成wav.scp与text.txt，可以使用指令：
+
+```shell
+# generate wav.scp and text.txt from train.jsonl and val.jsonl
+jsonl2scp \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in="../../../data/list/train.jsonl"
+```
+
+#### 查看训练日志
+
+##### 查看实验log
+```shell
+tail log.txt
+[2024-03-21 15:55:52,137][root][INFO] - train, rank: 3, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.327), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)], {'data_load': '0.000', 'forward_time': '0.315', 'backward_time': '0.555', 'optim_time': '0.076', 'total_time': '0.947'}, GPU, memory: usage: 3.830 GB, peak: 18.357 GB, cache: 20.910 GB, cache_peak: 20.910 GB
+[2024-03-21 15:55:52,139][root][INFO] - train, rank: 1, epoch: 0/50, step: 6990/1, total step: 6990, (loss_avg_rank: 0.334), (loss_avg_epoch: 0.409), (ppl_avg_epoch: 1.506), (acc_avg_epoch: 0.795), (lr: 1.165e-04), [('loss_att', 0.285), ('acc', 0.823), ('loss_pre', 0.046), ('loss', 0.331), ('batch_size', 36)], {'data_load': '0.000', 'forward_time': '0.334', 'backward_time': '0.536', 'optim_time': '0.077', 'total_time': '0.948'}, GPU, memory: usage: 3.943 GB, peak: 18.291 GB, cache: 19.619 GB, cache_peak: 19.619 GB
+```
+指标解释：
+- `rank`：表示gpu id。
+- `epoch`,`step`,`total step`：表示当前epoch，step，总step。
+- `loss_avg_rank`：表示当前step，所有gpu平均loss。
+- `loss/ppl/acc_avg_epoch`：表示当前epoch周期，截止当前step数时，总平均loss/ppl/acc。epoch结束时的最后一个step表示epoch总平均loss/ppl/acc，推荐使用acc指标。
+- `lr`：当前step的学习率。
+- `[('loss_att', 0.259), ('acc', 0.825), ('loss_pre', 0.04), ('loss', 0.299), ('batch_size', 40)]`：表示当前gpu id的具体数据。
+- `total_time`：表示单个step总耗时。
+- `GPU, memory`：分别表示，模型使用/峰值显存，模型+缓存使用/峰值显存。
+
+##### tensorboard可视化
+```bash
+tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
+```
+浏览器中打开：http://localhost:6006/
+
+### 训练后模型测试
+
+
+#### 有configuration.json
+
+假定，训练模型路径为：./model_dir，如果该目录下有生成configuration.json，只需要将 [上述模型推理方法](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/README_zh.md#%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86) 中模型名字修改为模型路径即可
+
+例如：
+
+从shell推理
+```shell
+python -m funasr.bin.inference ++model="./model_dir" ++input="${input}" ++output_dir="${output_dir}"
+```
+从python推理
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="./model_dir")
+
+res = model.generate(input=wav_file)
+print(res)
+```
+
+#### 无configuration.json时
+
+如果模型路径中无configuration.json时，需要手动指定具体配置文件路径与模型路径
+
+```shell
+python -m funasr.bin.inference \
+--config-path "${local_path}" \
+--config-name "${config}" \
++init_param="${init_param}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"
+```
+
+参数介绍
+- `config-path`：实验中保存 `config.yaml` 的目录路径，可以从实验输出目录中查找。
+- `config-name`：配置文件名，一般为 `config.yaml`，支持yaml格式与json格式，例如 `config.json`
+- `init_param`：需要测试的模型参数，一般为`model.pt`，可以自己选择具体的模型文件
+- `tokenizer_conf.token_list`：词表文件路径，一般在 `config.yaml` 有指定，无需再手动指定，当 `config.yaml` 中路径不正确时，需要在此处手动指定。
+- `frontend_conf.cmvn_file`：wav提取fbank中用到的cmvn文件，一般在 `config.yaml` 有指定，无需再手动指定，当 `config.yaml` 中路径不正确时，需要在此处手动指定。
+
+其他参数同上，完整 [示例](https://github.com/alibaba-damo-academy/FunASR/blob/main/examples/industrial_data_pretraining/paraformer/infer_from_local.sh)
+
+
+<a name="模型导出与测试"></a>
+## 模型导出与测试
+### 从命令行导出
+```shell
+funasr-export ++model=paraformer ++quantize=false
+```
+
+### 从Python导出
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer")
+
+res = model.export(quantize=False)
+```
+
+### 测试ONNX
+```python
+# pip3 install -U funasr-onnx
+from funasr_onnx import Paraformer
+model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1, quantize=True)
+
+wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+result = model(wav_path)
+print(result)
+```
+
+更多例子请参考 [样例](https://github.com/alibaba-damo-academy/FunASR/tree/main/runtime/python/onnxruntime)
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/README.md
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/README.md
@@ -0,0 +1,14 @@
+# Branchformer Result
+
+## Training Config
+- Feature info: using raw speech, extracting 80 dims fbank online, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 0.001, batch_size 10000, 4 gpu(Tesla V100), acc_grad 1, 180 epochs
+- Train config: conf/branchformer_12e_6d_2048_256.yaml
+- LM config: LM was not used
+
+## Results (CER)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.15   |
+|    test     |  4.51   |
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/conf/branchformer_12e_6d_2048_256.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/conf/branchformer_12e_6d_2048_256.yaml
@@ -0,0 +1,122 @@
+# Example model configuration for the AISHELL Branchformer recipe.
+# Adjust any value below to suit your own setup.
+
+# The registered component names can be listed with:
+#   from funasr.register import tables
+#   tables.print()
+
+# NOTE(review): the file name says "12e" while encoder_conf.num_blocks is 24 --
+# confirm which one is intended.
+
+# ---- network architecture ----
+model: Branchformer
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1  # label-smoothing weight
+  length_normalized_loss: false
+
+# ---- encoder ----
+encoder: BranchformerEncoder
+encoder_conf:
+  output_size: 256
+  use_attn: true
+  attention_heads: 4
+  attention_layer_type: rel_selfattn
+  pos_enc_layer_type: rel_pos
+  rel_pos_type: latest
+  use_cgmlp: true
+  cgmlp_linear_units: 2048
+  cgmlp_conv_kernel: 31
+  use_linear_after_conv: false
+  gate_activation: identity
+  merge_method: concat
+  cgmlp_weight: 0.5  # only takes effect when merge_method is "fixed_ave"
+  attn_branch_drop_rate: 0.0  # only takes effect when merge_method is "learned_ave"
+  num_blocks: 24
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.1
+  input_layer: conv2d
+  stochastic_depth_rate: 0.0
+
+# ---- decoder ----
+decoder: TransformerDecoder
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
+
+# ---- frontend (feature extraction) ----
+frontend: WavFrontend
+frontend_conf:
+  fs: 16000
+  window: hamming
+  n_mels: 80
+  frame_length: 25
+  frame_shift: 10
+  dither: 0.0
+  lfr_m: 1
+  lfr_n: 1
+
+# ---- SpecAugment ----
+specaug: SpecAug
+specaug_conf:
+  apply_time_warp: true
+  time_warp_window: 5
+  time_warp_mode: bicubic
+  apply_freq_mask: true
+  freq_mask_width_range: [0, 30]
+  num_freq_mask: 2
+  apply_time_mask: true
+  time_mask_width_range: [0, 40]
+  num_time_mask: 2
+
+# ---- training loop ----
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 180
+  keep_nbest_models: 10
+  avg_keep_nbest_models_type: acc
+  log_interval: 50
+
+# ---- optimizer and learning-rate schedule ----
+optim: adam
+optim_conf:
+  lr: 0.001
+  weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 35000
+
+# ---- dataset and batching ----
+dataset: AudioDataset
+dataset_conf:
+  index_ds: IndexDSJsonl
+  batch_sampler: EspnetStyleBatchSampler
+  # batch_type is either "example" or "length"
+  batch_type: length
+  # batch_type=example: batch_size is the number of samples;
+  # batch_type=length:  batch_size is source_token_len + target_token_len
+  batch_size: 10000
+  # samples with source_token_len + target_token_len > max_token_length are filtered out
+  max_token_length: 2048
+  buffer_size: 1024
+  shuffle: true
+  num_workers: 4
+  preprocessor_speech: SpeechPreprocessSpeedPerturb
+  preprocessor_speech_conf:
+    speed_perturb:
+      - 0.9
+      - 1.0
+      - 1.1
+
+# ---- tokenizer ----
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: "<unk>"
+
+# ---- CTC ----
+ctc_conf:
+  dropout_rate: 0.0
+  ctc_type: builtin
+  reduce: true
+  ignore_nan_grad: true
+normalize: null
+
+# ---- decoding ----
+beam_size: 10
+decoding_ctc_weight: 0.4
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/demo_infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/demo_infer.sh
@@ -0,0 +1 @@
+../paraformer/demo_infer.sh
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/demo_train_or_finetune.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/demo_train_or_finetune.sh
@@ -0,0 +1 @@
+../paraformer/demo_train_or_finetune.sh
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/infer.sh
@@ -0,0 +1,12 @@
+
+
+python -m funasr.bin.inference \
+--config-path="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3" \
+--config-name="config.yaml" \
++init_param="/mnt/workspace/FunASR/examples/aishell/paraformer/exp/baseline_paraformer_conformer_12e_6d_2048_256_zh_char_exp3/model.pt.ep38" \
++tokenizer_conf.token_list="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/zh_token_list/char/tokens.txt" \
++frontend_conf.cmvn_file="/mnt/nfs/zhifu.gzf/data/AISHELL-1-feats/DATA/data/train/am.mvn" \
++input="/mnt/nfs/zhifu.gzf/data/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav" \
++output_dir="./outputs/debug" \
++device="cuda:0" \
+
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/local/aishell_data_prep.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/local/download_and_untar.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+#             2017  Xingyu Na
+# Apache 2.0
+
+# Download one part of the AISHELL corpus into <data-base> and extract it.
+#
+# Usage: download_and_untar.sh [--remove-archive] <data-base> <url-base> <corpus-part>
+#   <data-base>    existing directory that receives the archive and its contents
+#   <url-base>     URL prefix; the archive is fetched from <url-base>/<corpus-part>.tgz
+#   <corpus-part>  data_aishell or resource_aishell
+# A marker file <data-base>/<corpus-part>/.complete makes repeated runs a no-op.
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+  # Stop here instead of carrying on with missing arguments.
+  exit 1;
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+# The script changes directory below, so turn a relative <data-base> into an
+# absolute path; otherwise later "$data/..." references would be resolved
+# against the wrong directory.
+data=$(cd "$data" && pwd) || exit 1
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == "$x" ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f "$data/$part/.complete" ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+# An archive left over from an earlier run is reused only if its size matches
+# one of the known archive sizes; otherwise it is deleted and fetched again.
+if [ -f "$data/$part.tgz" ]; then
+  size=$(/bin/ls -l "$data/$part.tgz" | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ "$s" == "$size" ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm "$data/$part.tgz"
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f "$data/$part.tgz" ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd "$data" || exit 1
+  # NOTE(review): --no-check-certificate disables TLS verification of the
+  # download; kept as in the original script, consider removing it.
+  if ! wget --no-check-certificate "$full_url"; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd "$data" || exit 1
+
+if ! tar -xvzf "$part.tgz"; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+# data_aishell ships its audio as nested per-speaker archives under wav/.
+if [ "$part" == "data_aishell" ]; then
+  cd "$data/$part/wav" || exit 1
+  for wav in ./*.tar.gz; do
+    # Skip the literal pattern when the glob matches nothing.
+    [ -e "$wav" ] || continue
+    echo "Extracting wav from $wav"
+    if ! tar -zxf "$wav"; then
+      echo "$0: error un-tarring archive $data/$part/wav/$wav"
+      exit 1;
+    fi
+    rm "$wav"
+  done
+fi
+
+# Write the marker only once everything (including the nested archives) has
+# been extracted, so an interrupted run is not mistaken for a finished one.
+touch "$data/$part/.complete"
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm "$data/$part.tgz"
+fi
+
+exit 0;
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/run.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/run.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+
+
+CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+# general configuration
+feats_dir="../DATA" #feature output dictionary
+exp_dir=`pwd`
+lang=zh
+token_type=char
+stage=0
+stop_stage=5
+
+# feature configuration
+nj=32
+
+inference_device="cuda" #"cpu"
+inference_checkpoint="model.pt.avg10"
+inference_scp="wav.scp"
+inference_batch_size=1
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+workspace=`pwd`
+
+master_port=12345
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+config=branchformer_12e_6d_2048_256.yaml
+model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"
+
+
+
+# Stage -1: download and extract the two AISHELL archives into ${raw_data}.
+# With the default stage=0 this stage is skipped.
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    mkdir -p ${raw_data}
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+# Stage 0: build wav.scp/text for train, dev and test, normalise the
+# transcripts and convert each set to a jsonl file.
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+    for x in train dev test; do
+        # Keep the utterance id (field 1) and delete every space from the
+        # transcript (fields 2-), then re-tokenize with text2token.py.
+        cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+            > ${feats_dir}/data/${x}/text
+        utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+        mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+
+        # convert wav.scp text to jsonl
+        scp_file_list_arg="++scp_file_list='[\"${feats_dir}/data/${x}/wav.scp\",\"${feats_dir}/data/${x}/text\"]'"
+        python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
+        ++data_type_list='["source", "target"]' \
+        ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl \
+        ${scp_file_list_arg}
+    done
+fi
+
+# Stage 1: compute CMVN statistics over the training set.
+# NOTE(review): this stage names cmvn.json as its output, while training and
+# inference read am.mvn from the same directory -- presumably
+# compute_audio_cmvn.py writes that file as well; confirm.
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    python ../../../funasr/bin/compute_audio_cmvn.py \
+    --config-path "${workspace}/conf" \
+    --config-name "${config}" \
+    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json" \
+
+fi
+
+# token_list is defined outside the stage guard because the training and
+# inference stages need it too.
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+# Stage 2: write the token list: <blank>, <s>, </s>, then every distinct
+# token of the training text in sorted order, then <unk>.
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+# Placeholder: only prints the banner, no language model is trained here.
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+# Launches single-node training with torchrun, one process per GPU listed in
+# CUDA_VISIBLE_DEVICES. All output goes to a time-stamped log file inside the
+# experiment directory.
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "stage 4: ASR Training"
+
+  mkdir -p ${exp_dir}/exp/${model_dir}
+  current_time=$(date "+%Y-%m-%d_%H-%M")
+  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
+  echo "log_file: ${log_file}"
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+  # number of GPUs = number of comma-separated entries
+  gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  torchrun \
+  --nnodes 1 \
+  --nproc_per_node ${gpu_num} \
+  --master_port ${master_port} \
+  ../../../funasr/bin/train.py \
+  --config-path "${workspace}/conf" \
+  --config-name "${config}" \
+  ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+  ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
+  ++tokenizer_conf.token_list="${token_list}" \
+  ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+  ++output_dir="${exp_dir}/exp/${model_dir}" &> ${log_file}
+fi
+
+
+
+# Testing Stage
+# For every test set: split the key file into ${nj} parts, decode the parts in
+# parallel background jobs, merge the per-job outputs and compute the CER.
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  echo "stage 5: Inference"
+
+  if [ ${inference_device} == "cuda" ]; then
+      # one job per visible GPU
+      nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  else
+      # CPU decoding: give each of the ${nj} jobs the device id "-1"
+      inference_batch_size=1
+      CUDA_VISIBLE_DEVICES=""
+      for JOB in $(seq ${nj}); do
+          CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
+      done
+  fi
+
+  for dset in ${test_sets}; do
+
+    inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}"
+    _logdir="${inference_dir}/logdir"
+    echo "inference_dir: ${inference_dir}"
+
+    mkdir -p "${_logdir}"
+    data_dir="${feats_dir}/data/${dset}"
+    key_file=${data_dir}/${inference_scp}
+
+    # Split the key file into one keys.<JOB>.scp per job.
+    split_scps=
+    for JOB in $(seq "${nj}"); do
+        split_scps+=" ${_logdir}/keys.${JOB}.scp"
+    done
+    utils/split_scp.pl "${key_file}" ${split_scps}
+
+    # Turn the comma-separated device list into an array indexed by JOB-1.
+    gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+          id=$((JOB-1))
+          gpuid=${gpuid_list_array[$id]}
+
+          # The group runs in the background, so this export only affects this job.
+          export CUDA_VISIBLE_DEVICES=${gpuid}
+          python ../../../funasr/bin/inference.py \
+          --config-path="${exp_dir}/exp/${model_dir}" \
+          --config-name="config.yaml" \
+          ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \
+          ++tokenizer_conf.token_list="${token_list}" \
+          ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+          ++input="${_logdir}/keys.${JOB}.scp" \
+          ++output_dir="${inference_dir}/${JOB}" \
+          ++device="${inference_device}" \
+          ++ncpu=1 \
+          ++disable_log=true \
+          ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
+        }&
+
+    done
+    # Block until every background decoding job has finished.
+    wait
+
+    # Concatenate the per-job result files into one sorted file per kind.
+    mkdir -p ${inference_dir}/1best_recog
+    for f in token score text; do
+        # NOTE(review): ${JOB} here is the value left over from the loop above
+        # (the last job), so only that job's output is checked for existence.
+        if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then
+          for JOB in $(seq "${nj}"); do
+              cat "${inference_dir}/${JOB}/1best_recog/${f}"
+          done | sort -k1 >"${inference_dir}/1best_recog/${f}"
+        fi
+    done
+
+    # Post-process hypothesis and reference the same way, then score them.
+    echo "Computing WER ..."
+    python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc
+    python utils/postprocess_text_zh.py  ${data_dir}/text ${inference_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer
+    tail -n 3 ${inference_dir}/1best_recog/text.cer
+  done
+
+fi
--- a/modules/python/vendors/FunASR/examples/aishell/branchformer/utils
+++ b/modules/python/vendors/FunASR/examples/aishell/branchformer/utils
@@ -0,0 +1 @@
+../paraformer/utils
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/README.md
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/README.md
@@ -0,0 +1,16 @@
+
+# Conformer Result
+
+## Training Config
+- Feature info: 80-dim fbank, global CMVN, speed perturbation (0.9, 1.0, 1.1), SpecAugment
+- Train info: lr 5e-4, batch_size 25000, 2 GPUs (Tesla V100), acc_grad 1, 50 epochs
+- Train config: conf/conformer_12e_6d_2048_256.yaml
+- LM config: LM was not used
+- Model size: 46M
+
+## Results (CER)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.42   |
+|    test     |  4.87   |
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml
@@ -0,0 +1,110 @@
+# Example model configuration: Conformer encoder with a Transformer decoder.
+# Adjust any value below to match your own requirements.
+
+# To print the register_table, run in Python:
+#   from funasr.register import tables
+#   tables.print()
+
+# network architecture
+model: Conformer
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1  # label smoothing option
+  length_normalized_loss: false
+
+# encoder
+encoder: ConformerEncoder
+encoder_conf:
+  output_size: 256  # dimension of attention
+  attention_heads: 4
+  linear_units: 2048  # number of units of the position-wise feed forward
+  num_blocks: 12  # number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d  # encoder architecture type
+  normalize_before: true
+  pos_enc_layer_type: rel_pos
+  selfattention_layer_type: rel_selfattn
+  activation_type: swish
+  macaron_style: true
+  use_cnn_module: true
+  cnn_module_kernel: 15
+
+# decoder
+decoder: TransformerDecoder
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
+
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+  fs: 16000
+  window: hamming
+  n_mels: 80
+  frame_length: 25
+  frame_shift: 10
+  lfr_m: 1
+  lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+  apply_time_warp: true
+  time_warp_window: 5
+  time_warp_mode: bicubic
+  apply_freq_mask: true
+  freq_mask_width_range:
+    - 0
+    - 30
+  num_freq_mask: 2
+  apply_time_mask: true
+  time_mask_width_range:
+    - 0
+    - 40
+  num_time_mask: 2
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 150
+  keep_nbest_models: 10
+  log_interval: 50
+
+optim: adam
+optim_conf:
+  lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 30000
+
+dataset: AudioDataset
+dataset_conf:
+  index_ds: IndexDSJsonl
+  batch_sampler: EspnetStyleBatchSampler
+  batch_type: length  # example or length
+  batch_size: 25000  # number of samples if batch_type is example; source_token_len+target_token_len if length
+  max_token_length: 2048  # samples with source_token_len+target_token_len > max_token_length are filtered out
+  buffer_size: 1024
+  shuffle: true
+  num_workers: 4
+  preprocessor_speech: SpeechPreprocessSpeedPerturb
+  preprocessor_speech_conf:
+    speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+
+ctc_conf:
+  dropout_rate: 0.0
+  ctc_type: builtin
+  reduce: true
+  ignore_nan_grad: true
+normalize: null
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/conf/conformer_rwkv.yaml
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/conf/conformer_rwkv.yaml
@@ -0,0 +1,124 @@
+# Example model configuration: Conformer encoder with a TransformerRWKVDecoder.
+# Adjust any value below to match your own requirements.
+
+# To print the register_table, run in Python:
+#   from funasr.register import tables
+#   tables.print()
+
+# network architecture
+model: Conformer
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1  # label smoothing option
+  length_normalized_loss: false
+
+# encoder
+encoder: ConformerEncoder
+encoder_conf:
+  output_size: 256  # dimension of attention
+  attention_heads: 4
+  linear_units: 2048  # number of units of the position-wise feed forward
+  num_blocks: 12  # number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d  # encoder architecture type
+  normalize_before: true
+  pos_enc_layer_type: rel_pos
+  selfattention_layer_type: rel_selfattn
+  activation_type: swish
+  macaron_style: true
+  use_cnn_module: true
+  cnn_module_kernel: 15
+
+# decoder
+decoder: TransformerRWKVDecoder
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
+  input_layer: embed
+  rwkv_cfg:
+    n_embd: 256
+    dropout: 0
+    head_size_a: 64
+    ctx_len: 512
+    dim_att: 256  # same value as n_embd, i.e. ${decoder_conf.rwkv_cfg.n_embd}
+    dim_ffn: null
+    head_size_divisor: 4
+    n_layer: 6
+    pre_ffn: 0
+    ln0: false
+    ln1: false
+    init_rwkv: true
+
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+  fs: 16000
+  window: hamming
+  n_mels: 80
+  frame_length: 25
+  frame_shift: 10
+  lfr_m: 1
+  lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+  apply_time_warp: true
+  time_warp_window: 5
+  time_warp_mode: bicubic
+  apply_freq_mask: true
+  freq_mask_width_range:
+    - 0
+    - 30
+  num_freq_mask: 2
+  apply_time_mask: true
+  time_mask_width_range:
+    - 0
+    - 40
+  num_time_mask: 2
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 150
+  keep_nbest_models: 10
+  log_interval: 50
+
+optim: adam
+optim_conf:
+  lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 30000
+
+dataset: AudioDataset
+dataset_conf:
+  index_ds: IndexDSJsonl
+  batch_sampler: EspnetStyleBatchSampler
+  batch_type: length  # example or length
+  batch_size: 25000  # number of samples if batch_type is example; source_token_len+target_token_len if length
+  max_token_length: 2048  # samples with source_token_len+target_token_len > max_token_length are filtered out
+  buffer_size: 1024
+  shuffle: true
+  num_workers: 4
+  preprocessor_speech: SpeechPreprocessSpeedPerturb
+  preprocessor_speech_conf:
+    speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+
+ctc_conf:
+  dropout_rate: 0.0
+  ctc_type: builtin
+  reduce: true
+  ignore_nan_grad: true
+normalize: null
--- a/modules/python/vendors/FunASR/examples/aishell/conformer/demo_infer.sh
+++ b/modules/python/vendors/FunASR/examples/aishell/conformer/demo_infer.sh
@@ -0,0 +1 @@
+../paraformer/demo_infer.sh
--- a/Show More
+++ b/Show More