Ruby 3.4.9p82 (2026-03-11 revision 76cca827ab52ab1d346a728f068d5b8da3e2952b)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
41#include "probes.h"
42#include "ruby/encoding.h"
43#include "ruby/re.h"
44#include "ruby/util.h"
45#include "ruby_assert.h"
46#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
/* Shorthands for entry `no` of `regs->beg` / `regs->end`.  Both expand to an
 * access through a variable named `regs`, which must be in scope at the use
 * site. */
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
73#undef rb_str_buf_cat
74#undef rb_str_buf_cat2
75#undef rb_str_cat2
76#undef rb_str_cat_cstr
77#undef rb_fstring_cstr
78
81
82/* Flags of RString
83 *
84 * 0: STR_SHARED (equal to ELTS_SHARED)
85 * The string is shared. The buffer this string points to is owned by
86 * another string (the shared root).
87 * 1: RSTRING_NOEMBED
88 * The string is not embedded. When a string is embedded, the contents
89 * follow the header. When a string is not embedded, the contents are
90 * in a separately allocated buffer.
91 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
92 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
93 * It emits a deprecation warning when mutated for the first time.
94 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
95 * The string was allocated by the `Symbol#to_s` method.
96 * It emits a deprecation warning when mutated for the first time.
97 * 4: STR_PRECOMPUTED_HASH
98 * The string is embedded and has its precomputed hashcode stored
99 * after the terminator.
100 * 5: STR_SHARED_ROOT
101 * Other strings may point to the contents of this string. When this
102 * flag is set, STR_SHARED must not be set.
103 * 6: STR_BORROWED
104 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
105 * to be unshared by rb_str_tmp_frozen_release.
106 * 7: STR_TMPLOCK
107 * The pointer to the buffer is passed to a system call such as
108 * read(2). Any modification and realloc is prohibited.
109 * 8-9: ENC_CODERANGE
110 * Stores the coderange of the string.
111 * 10-16: ENCODING
112 * Stores the encoding of the string.
113 * 17: RSTRING_FSTR
114 * The string is a fstring. The string is deduplicated in the fstring
115 * table.
116 * 18: STR_NOFREE
117 * Do not free this string's buffer when the string is reclaimed
118 * by the garbage collector. Used for when the string buffer is a C
119 * string literal.
120 * 19: STR_FAKESTR
121 * The string is not allocated or managed by the garbage collector.
122 * Typically, the string object header (struct RString) is temporarily
123 * allocated on C stack.
124 */
125
/* NOTE(review): RUBY_MAX_CHAR_LEN is presumably the largest byte length of a
 * single encoded character -- confirm at its use sites. */
126#define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits; the "Flags of RString" comment earlier in this
 * file describes what each one means. */
127#define STR_PRECOMPUTED_HASH FL_USER4
128#define STR_SHARED_ROOT FL_USER5
129#define STR_BORROWED FL_USER6
130#define STR_TMPLOCK FL_USER7
131#define STR_NOFREE FL_USER18
132#define STR_FAKESTR FL_USER19
133
/* STR_SET_NOEMBED: sets STR_NOEMBED and clears the sharing-related flags
 * (STR_SHARED, STR_SHARED_ROOT, STR_BORROWED).
 * STR_SET_EMBED: clears STR_NOEMBED, STR_SHARED and STR_NOFREE. */
134#define STR_SET_NOEMBED(str) do {\
135 FL_SET((str), STR_NOEMBED);\
136 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
137} while (0)
138#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
139
/* STR_SET_LEN: stores `n` into the len field; it writes no terminator and
 * performs no capacity check. */
140#define STR_SET_LEN(str, n) do { \
141 RSTRING(str)->len = (n); \
142} while (0)
143
144static inline bool
145str_encindex_fastpath(int encindex)
146{
147 // The overwhelming majority of strings are in one of these 3 encodings.
148 switch (encindex) {
149 case ENCINDEX_ASCII_8BIT:
150 case ENCINDEX_UTF_8:
151 case ENCINDEX_US_ASCII:
152 return true;
153 default:
154 return false;
155 }
156}
157
158static inline bool
159str_enc_fastpath(VALUE str)
160{
161 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
162}
163
/* TERM_LEN: terminator width in bytes for `str` -- 1 for the fast-path
 * encodings, otherwise the minimum character length of the encoding.
 * TERM_FILL: writes `termlen` zero bytes at `ptr`.  The single-byte store
 * covers the common case; memset is used only when termlen > 1.  Both macro
 * arguments are evaluated exactly once. */
164#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
165#define TERM_FILL(ptr, termlen) do {\
166 char *const term_fill_ptr = (ptr);\
167 const int term_fill_len = (termlen);\
168 *term_fill_ptr = '\0';\
169 if (UNLIKELY(term_fill_len > 1))\
170 memset(term_fill_ptr, 0, term_fill_len);\
171} while (0)
172
/* RESIZE_CAPA: change the capacity of `str` to `capacity` bytes (terminator
 * excluded), deriving the terminator width from the string's encoding.
 *
 * RESIZE_CAPA_TERM: same, with an explicit terminator width.
 *  - Embedded string: nothing happens while capacity + termlen still fits in
 *    the slot; otherwise the contents move to a freshly allocated heap buffer
 *    and the string becomes non-embedded.
 *  - Heap string: the buffer is reallocated in place; the string must not be
 *    STR_SHARED.
 * The macro arguments are parenthesized at every use so that an expression
 * argument cannot re-associate with the surrounding operators. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
196
/* Makes `str` a sharing child of `shared_str`: records the root in
 * as.heap.aux.shared (through the write barrier), sets STR_SHARED on `str`
 * and STR_SHARED_ROOT on the root.  A root whose class is 0 is additionally
 * tagged STR_BORROWED.  Nothing is done for STR_FAKESTR strings.  The asserts
 * check that the child's pointer lies within the root's buffer. */
197#define STR_SET_SHARED(str, shared_str) do { \
198 if (!FL_TEST(str, STR_FAKESTR)) { \
199 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
200 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
201 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
202 FL_SET((str), STR_SHARED); \
203 FL_SET((shared_str), STR_SHARED_ROOT); \
204 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
205 FL_SET_RAW((shared_str), STR_BORROWED); \
206 } \
207} while (0)
208
/* STR_HEAP_PTR: the heap buffer pointer of a non-embedded string.
 * STR_HEAP_SIZE: capa plus the terminator width, i.e. the byte count the
 * reallocation in RESIZE_CAPA_TERM passes as the old buffer size. */
209#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
210#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
211/* TODO: include the terminator size in capa. */
212
/* STR_ENC_GET: the rb_encoding of `str`, via get_encoding(). */
213#define STR_ENC_GET(str) get_encoding(str)
214
/* SHARABLE_SUBSTRING_P: with SHARABLE_MIDDLE_SUBSTRING off (the default),
 * true only when beg + len reaches `end`; with it on, always true. */
215#if !defined SHARABLE_MIDDLE_SUBSTRING
216# define SHARABLE_MIDDLE_SUBSTRING 0
217#endif
218#if !SHARABLE_MIDDLE_SUBSTRING
219#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
220#else
221#define SHARABLE_SUBSTRING_P(beg, len, end) 1
222#endif
223
224
225static inline long
226str_embed_capa(VALUE str)
227{
228 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
229}
230
231bool
232rb_str_reembeddable_p(VALUE str)
233{
234 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
235}
236
237static inline size_t
238rb_str_embed_size(long capa)
239{
240 return offsetof(struct RString, as.embed.ary) + capa;
241}
242
243size_t
244rb_str_size_as_embedded(VALUE str)
245{
246 size_t real_size;
247 if (STR_EMBED_P(str)) {
248 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
249 }
250 /* if the string is not currently embedded, but it can be embedded, how
251 * much space would it require */
252 else if (rb_str_reembeddable_p(str)) {
253 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
254 }
255 else {
256 real_size = sizeof(struct RString);
257 }
258
259 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
260 real_size += sizeof(st_index_t);
261 }
262
263 return real_size;
264}
265
/* True when a string of `len` bytes plus a `termlen`-byte terminator fits in
 * an object size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);
    return rb_gc_size_allocatable_p(needed);
}
271
272static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
273static VALUE str_new_frozen(VALUE klass, VALUE orig);
274static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
275static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
276static VALUE str_new(VALUE klass, const char *ptr, long len);
277static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
278static inline void str_modifiable(VALUE str);
279static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
280static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
281
282static inline void
283str_make_independent(VALUE str)
284{
285 long len = RSTRING_LEN(str);
286 int termlen = TERM_LEN(str);
287 str_make_independent_expand((str), len, 0L, termlen);
288}
289
290static inline int str_dependent_p(VALUE str);
291
292void
293rb_str_make_independent(VALUE str)
294{
295 if (str_dependent_p(str)) {
296 str_make_independent(str);
297 }
298}
299
300void
301rb_str_make_embedded(VALUE str)
302{
303 RUBY_ASSERT(rb_str_reembeddable_p(str));
304 RUBY_ASSERT(!STR_EMBED_P(str));
305
306 char *buf = RSTRING(str)->as.heap.ptr;
307 long len = RSTRING(str)->len;
308
309 STR_SET_EMBED(str);
310 STR_SET_LEN(str, len);
311
312 if (len > 0) {
313 memcpy(RSTRING_PTR(str), buf, len);
314 ruby_xfree(buf);
315 }
316
317 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
318}
319
/* Debugging aid: prints a warning to stderr naming `func` as a function that
 * is about to return a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    static const char warning_format[] =
        "%s is returning NULL!! "
        "SIGSEGV is highly expected to follow immediately.\n"
        "If you could reproduce, attach your debugger here, "
        "and look at the passed string.\n";

    fprintf(stderr, warning_format, func);
}
329
330/* symbols for [up|down|swap]case/capitalize options */
331static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
332
333static rb_encoding *
334get_encoding(VALUE str)
335{
336 return rb_enc_from_index(ENCODING_GET(str));
337}
338
339static void
340mustnot_broken(VALUE str)
341{
342 if (is_broken_string(str)) {
343 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
344 }
345}
346
347static void
348mustnot_wchar(VALUE str)
349{
350 rb_encoding *enc = STR_ENC_GET(str);
351 if (rb_enc_mbminlen(enc) > 1) {
352 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
353 }
354}
355
356static int fstring_cmp(VALUE a, VALUE b);
357
358static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
359
360#if SIZEOF_LONG == SIZEOF_VOIDP
361#define PRECOMPUTED_FAKESTR_HASH 1
362#else
363#endif
364
#ifdef PRECOMPUTED_FAKESTR_HASH
/* Hash function of the fstring table.  For a fake string the hash was
 * already computed by register_fstring and parked in the capa field, so it
 * is read back from there; real strings go through rb_str_hash(). */
static st_index_t
fstring_hash(VALUE str)
{
    if (!FL_TEST_RAW(str, STR_FAKESTR)) {
        return rb_str_hash(str);
    }
    return (st_index_t)RSTRING(str)->as.heap.aux.capa;
}
#else
#define fstring_hash rb_str_hash
#endif
380
/* Hash-type descriptor for the fstring table, initialized positionally with
 * fstring_cmp followed by fstring_hash. */
381const struct st_hash_type rb_fstring_hash_type = {
382 fstring_cmp,
383 fstring_hash,
384};
385
/* True when `str` does not have FL_EXIVAR set and its class is exactly
 * rb_cString. */
386#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
387
388static inline st_index_t
389str_do_hash(VALUE str)
390{
391 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
392 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
393 if (e && !is_ascii_string(str)) {
394 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
395 }
396 return h;
397}
398
399static VALUE
400str_store_precomputed_hash(VALUE str, st_index_t hash)
401{
402 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
403 RUBY_ASSERT(STR_EMBED_P(str));
404
405#if RUBY_DEBUG
406 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
407 size_t free_bytes = str_embed_capa(str) - used_bytes;
408 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
409#endif
410
411 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
412
413 FL_SET(str, STR_PRECOMPUTED_HASH);
414
415 return str;
416}
417
419 VALUE fstr;
420 bool copy;
421 bool force_precompute_hash;
422};
423
/* st_update() callback that interns a string in the fstring table.
 *
 * `data` points to a struct fstr_update_arg.  When the key already exists,
 * the existing string is returned through arg->fstr (ST_STOP), unless it is
 * a garbage object awaiting sweep, in which case the entry is deleted and
 * arg->fstr is set to Qundef so that the caller retries.  When the key is
 * new, a frozen, bare String is produced (copied, made static, or re-created
 * as needed), flagged RSTRING_FSTR, and stored as both key and value. */
424static int
425fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
426{
427 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
428 VALUE str = (VALUE)*key;
429
430 if (existing) {
431 /* because of lazy sweep, str may be unmarked already and swept
432 * at next time */
433
434 if (rb_objspace_garbage_object_p(str)) {
435 arg->fstr = Qundef;
436 // When RSTRING_FSTR strings are swept, they call `st_delete`.
437 // To avoid a race condition if an equivalent string was inserted
438 // we must remove the flag immediately.
439 FL_UNSET_RAW(str, RSTRING_FSTR);
440 return ST_DELETE;
441 }
442
443 arg->fstr = str;
444 return ST_STOP;
445 }
446 else {
447 // Unless the string is empty or binary, its coderange has been precomputed.
448 int coderange = ENC_CODERANGE(str);
449
/* Fake strings are not GC-managed objects, so a real String has to be built
 * from them before anything is stored in the table. */
450 if (FL_TEST_RAW(str, STR_FAKESTR)) {
451 if (arg->copy) {
452 VALUE new_str;
453 long len = RSTRING_LEN(str);
454 long capa = len + sizeof(st_index_t);
455 int term_len = TERM_LEN(str);
456
/* Embedded copy with room reserved for the hash after the terminator. */
457 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
458 new_str = str_alloc_embed(rb_cString, capa + term_len);
459 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
460 STR_SET_LEN(new_str, RSTRING_LEN(str));
461 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
462 rb_enc_copy(new_str, str);
463 str_store_precomputed_hash(new_str, fstring_hash(str));
464 }
465 else {
466 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
467 rb_enc_copy(new_str, str);
468#ifdef PRECOMPUTED_FAKESTR_HASH
/* Opportunistic: keep the hash only when the copy happens to have room. */
469 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
470 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
471 }
472#endif
473 }
474 str = new_str;
475 }
476 else {
/* No copy requested: the new String points at the caller's bytes. */
477 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
478 RSTRING(str)->len,
479 ENCODING_GET(str));
480 }
481 OBJ_FREEZE(str);
482 }
483 else {
484 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
485 str = str_new_frozen(rb_cString, str);
486 }
487 if (STR_SHARED_P(str)) { /* str should not be shared */
488 /* shared substring */
489 str_make_independent(str);
491 }
492 if (!BARE_STRING_P(str)) {
493 str = str_new_frozen(rb_cString, str);
494 }
495 }
496
/* Re-apply the coderange captured from the original key. */
497 ENC_CODERANGE_SET(str, coderange);
498 RBASIC(str)->flags |= RSTRING_FSTR;
499
500 *key = *value = arg->fstr = str;
501 return ST_CONTINUE;
502 }
503}
504
/* Returns the interned ("fstring") counterpart of `str`.
 *
 * Strings already flagged RSTRING_FSTR are returned unchanged.  A non-bare
 * string (one with FL_EXIVAR set, or whose class is not rb_cString) is never
 * put in the table itself: it is frozen and returned as-is when embedded or
 * when it is a shared root; otherwise it is re-pointed at the interned
 * string's buffer, frozen and returned.  A bare string yields the table
 * entry.  Raises TypeError via Check_Type when `str` is not a T_STRING. */
505VALUE
506rb_fstring(VALUE str)
507{
508 VALUE fstr;
509 int bare;
510
511 Check_Type(str, T_STRING);
512
513 if (FL_TEST(str, RSTRING_FSTR))
514 return str;
515
516 bare = BARE_STRING_P(str);
517 if (!bare) {
518 if (STR_EMBED_P(str)) {
519 OBJ_FREEZE(str);
520 return str;
521 }
522
523 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
525 return str;
526 }
527 }
528
/* Resizing to the current length happens only for strings that are not
 * frozen, not STR_NOFREE and not chilled. */
529 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
530 rb_str_resize(str, RSTRING_LEN(str));
531
532 fstr = register_fstring(str, false, false);
533
534 if (!bare) {
535 str_replace_shared_without_enc(str, fstr);
536 OBJ_FREEZE(str);
537 return str;
538 }
539 return fstr;
540}
541
542static VALUE
543register_fstring(VALUE str, bool copy, bool force_precompute_hash)
544{
545 struct fstr_update_arg args = {
546 .copy = copy,
547 .force_precompute_hash = force_precompute_hash
548 };
549
550#if SIZEOF_VOIDP == SIZEOF_LONG
551 if (FL_TEST_RAW(str, STR_FAKESTR)) {
552 // if the string hasn't been interned, we'll need the hash twice, so we
553 // compute it once and store it in capa
554 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
555 }
556#endif
557
558 RB_VM_LOCK_ENTER();
559 {
560 st_table *frozen_strings = rb_vm_fstring_table();
561 do {
562 args.fstr = str;
563 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
564 } while (UNDEF_P(args.fstr));
565 }
566 RB_VM_LOCK_LEAVE();
567
568 RUBY_ASSERT(OBJ_FROZEN(args.fstr));
569 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
570 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
571 RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
572
573 return args.fstr;
574}
575
576static VALUE
577setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
578{
579 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
580
581 if (!name) {
583 name = "";
584 }
585
586 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
587
588 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
589 fake_str->len = len;
590 fake_str->as.heap.ptr = (char *)name;
591 fake_str->as.heap.aux.capa = len;
592 return (VALUE)fake_str;
593}
594
595/*
596 * set up a fake string which refers a static string literal.
597 */
598VALUE
599rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
600{
601 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
602}
603
604/*
605 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
606 * shared string which refers a static string literal. `ptr` must
607 * point a constant string.
608 */
609VALUE
610rb_fstring_new(const char *ptr, long len)
611{
612 struct RString fake_str = {RBASIC_INIT};
613 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
614}
615
616VALUE
617rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
618{
619 struct RString fake_str = {RBASIC_INIT};
620 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
621}
622
623VALUE
624rb_fstring_cstr(const char *ptr)
625{
626 return rb_fstring_new(ptr, strlen(ptr));
627}
628
629static int
630fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
631{
632 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
633 return ST_CONTINUE;
634}
635
636static int
637fstring_cmp(VALUE a, VALUE b)
638{
639 long alen, blen;
640 const char *aptr, *bptr;
641 RSTRING_GETMEM(a, aptr, alen);
642 RSTRING_GETMEM(b, bptr, blen);
643 return (alen != blen ||
644 ENCODING_GET(a) != ENCODING_GET(b) ||
645 memcmp(aptr, bptr, alen) != 0);
646}
647
648static inline bool
649single_byte_optimizable(VALUE str)
650{
651 int encindex = ENCODING_GET(str);
652 switch (encindex) {
653 case ENCINDEX_ASCII_8BIT:
654 case ENCINDEX_US_ASCII:
655 return true;
656 case ENCINDEX_UTF_8:
657 // For UTF-8 it's worth scanning the string coderange when unknown.
659 }
660 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
661 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
662 return true;
663 }
664
665 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
666 return true;
667 }
668
669 /* Conservative. Possibly single byte.
670 * "\xa1" in Shift_JIS for example. */
671 return false;
672}
673
675
/* Returns a pointer to the first byte in [p, e) whose high bit (0x80) is
 * set, or NULL when there is none.
 *
 * The bulk of the range is examined one uintptr_t word at a time against
 * NONASCII_MASK; bytes before the first aligned word (when unaligned access
 * is not allowed) and the tail shorter than a word are checked one by one
 * through the fall-through switch statements. */
676static inline const char *
677search_nonascii(const char *p, const char *e)
678{
679 const char *s, *t;
680
681#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
682# if SIZEOF_UINTPTR_T == 8
683# define NONASCII_MASK UINT64_C(0x8080808080808080)
684# elif SIZEOF_UINTPTR_T == 4
685# define NONASCII_MASK UINT32_C(0x80808080)
686# else
687# error "don't know what to do."
688# endif
689#else
690# if SIZEOF_UINTPTR_T == 8
691# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
692# elif SIZEOF_UINTPTR_T == 4
693# define NONASCII_MASK 0x80808080UL /* or...? */
694# else
695# error "don't know what to do."
696# endif
697#endif
698
699 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
700#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, checking the skipped bytes. */
701 if ((uintptr_t)p % SIZEOF_VOIDP) {
702 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
703 p += l;
704 switch (l) {
705 default: UNREACHABLE;
706#if SIZEOF_VOIDP > 4
707 case 7: if (p[-7]&0x80) return p-7;
708 case 6: if (p[-6]&0x80) return p-6;
709 case 5: if (p[-5]&0x80) return p-5;
710 case 4: if (p[-4]&0x80) return p-4;
711#endif
712 case 3: if (p[-3]&0x80) return p-3;
713 case 2: if (p[-2]&0x80) return p-2;
714 case 1: if (p[-1]&0x80) return p-1;
715 case 0: break;
716 }
717 }
718#endif
719#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
720#define aligned_ptr(value) \
721 __builtin_assume_aligned((value), sizeof(uintptr_t))
722#else
723#define aligned_ptr(value) (value)
724#endif
725 s = aligned_ptr(p);
726 t = (e - (SIZEOF_VOIDP-1));
727#undef aligned_ptr
/* Word loop: the word is loaded with memcpy, and the byte offset of the
 * first flagged byte is derived from a leading/trailing zero count. */
728 for (;s < t; s += sizeof(uintptr_t)) {
729 uintptr_t word;
730 memcpy(&word, s, sizeof(word));
731 if (word & NONASCII_MASK) {
732#ifdef WORDS_BIGENDIAN
733 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
734#else
735 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
736#endif
737 }
738 }
739 p = (const char *)s;
740 }
741
/* Tail: fewer than SIZEOF_VOIDP bytes remain between p and e. */
742 switch (e - p) {
743 default: UNREACHABLE;
744#if SIZEOF_VOIDP > 4
745 case 7: if (e[-7]&0x80) return e-7;
746 case 6: if (e[-6]&0x80) return e-6;
747 case 5: if (e[-5]&0x80) return e-5;
748 case 4: if (e[-4]&0x80) return e-4;
749#endif
750 case 3: if (e[-3]&0x80) return e-3;
751 case 2: if (e[-2]&0x80) return e-2;
752 case 1: if (e[-1]&0x80) return e-1;
753 case 0: return NULL;
754 }
755}
756
757static int
758coderange_scan(const char *p, long len, rb_encoding *enc)
759{
760 const char *e = p + len;
761
762 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
763 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
764 p = search_nonascii(p, e);
766 }
767
768 if (rb_enc_asciicompat(enc)) {
769 p = search_nonascii(p, e);
770 if (!p) return ENC_CODERANGE_7BIT;
771 for (;;) {
772 int ret = rb_enc_precise_mbclen(p, e, enc);
774 p += MBCLEN_CHARFOUND_LEN(ret);
775 if (p == e) break;
776 p = search_nonascii(p, e);
777 if (!p) break;
778 }
779 }
780 else {
781 while (p < e) {
782 int ret = rb_enc_precise_mbclen(p, e, enc);
784 p += MBCLEN_CHARFOUND_LEN(ret);
785 }
786 }
787 return ENC_CODERANGE_VALID;
788}
789
790long
791rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
792{
793 const char *p = s;
794
795 if (*cr == ENC_CODERANGE_BROKEN)
796 return e - s;
797
798 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
799 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
800 if (*cr == ENC_CODERANGE_VALID) return e - s;
801 p = search_nonascii(p, e);
803 return e - s;
804 }
805 else if (rb_enc_asciicompat(enc)) {
806 p = search_nonascii(p, e);
807 if (!p) {
808 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
809 return e - s;
810 }
811 for (;;) {
812 int ret = rb_enc_precise_mbclen(p, e, enc);
813 if (!MBCLEN_CHARFOUND_P(ret)) {
815 return p - s;
816 }
817 p += MBCLEN_CHARFOUND_LEN(ret);
818 if (p == e) break;
819 p = search_nonascii(p, e);
820 if (!p) break;
821 }
822 }
823 else {
824 while (p < e) {
825 int ret = rb_enc_precise_mbclen(p, e, enc);
826 if (!MBCLEN_CHARFOUND_P(ret)) {
828 return p - s;
829 }
830 p += MBCLEN_CHARFOUND_LEN(ret);
831 }
832 }
834 return e - s;
835}
836
837static inline void
838str_enc_copy(VALUE str1, VALUE str2)
839{
840 rb_enc_set_index(str1, ENCODING_GET(str2));
841}
842
843/* Like str_enc_copy, but does not check frozen status of str1.
844 * You should use this only if you're certain that str1 is not frozen. */
845static inline void
846str_enc_copy_direct(VALUE str1, VALUE str2)
847{
848 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
849 if (inlined_encoding == ENCODING_INLINE_MAX) {
850 rb_enc_set_index(str1, rb_enc_get_index(str2));
851 }
852 else {
853 ENCODING_SET_INLINED(str1, inlined_encoding);
854 }
855}
856
857static void
858rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
859{
860 /* this function is designed for copying encoding and coderange
861 * from src to new string "dest" which is made from the part of src.
862 */
863 str_enc_copy(dest, src);
864 if (RSTRING_LEN(dest) == 0) {
865 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
867 else
869 return;
870 }
871 switch (ENC_CODERANGE(src)) {
874 break;
876 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
877 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
879 else
881 break;
882 default:
883 break;
884 }
885}
886
887static void
888rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
889{
890 str_enc_copy(dest, src);
892}
893
894static int
895enc_coderange_scan(VALUE str, rb_encoding *enc)
896{
897 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
898}
899
900int
901rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
902{
903 return enc_coderange_scan(str, enc);
904}
905
906int
908{
909 int cr = ENC_CODERANGE(str);
910
911 if (cr == ENC_CODERANGE_UNKNOWN) {
912 cr = enc_coderange_scan(str, get_encoding(str));
913 ENC_CODERANGE_SET(str, cr);
914 }
915 return cr;
916}
917
918static inline bool
919rb_enc_str_asciicompat(VALUE str)
920{
921 int encindex = ENCODING_GET_INLINED(str);
922 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
923}
924
925int
927{
928 switch(ENC_CODERANGE(str)) {
930 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
932 return true;
933 default:
934 return false;
935 }
936}
937
938static inline void
939str_mod_check(VALUE s, const char *p, long len)
940{
941 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
942 rb_raise(rb_eRuntimeError, "string modified");
943 }
944}
945
946static size_t
947str_capacity(VALUE str, const int termlen)
948{
949 if (STR_EMBED_P(str)) {
950 return str_embed_capa(str) - termlen;
951 }
952 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
953 return RSTRING(str)->len;
954 }
955 else {
956 return RSTRING(str)->as.heap.aux.capa;
957 }
958}
959
960size_t
962{
963 return str_capacity(str, TERM_LEN(str));
964}
965
966static inline void
967must_not_null(const char *ptr)
968{
969 if (!ptr) {
970 rb_raise(rb_eArgError, "NULL pointer given");
971 }
972}
973
974static inline VALUE
975str_alloc_embed(VALUE klass, size_t capa)
976{
977 size_t size = rb_str_embed_size(capa);
978 RUBY_ASSERT(size > 0);
979 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
980
981 NEWOBJ_OF(str, struct RString, klass,
983
984 return (VALUE)str;
985}
986
987static inline VALUE
988str_alloc_heap(VALUE klass)
989{
990 NEWOBJ_OF(str, struct RString, klass,
991 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
992
993 return (VALUE)str;
994}
995
996static inline VALUE
997empty_str_alloc(VALUE klass)
998{
999 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1000 VALUE str = str_alloc_embed(klass, 0);
1001 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1003 return str;
1004}
1005
1006static VALUE
1007str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1008{
1009 VALUE str;
1010
1011 if (len < 0) {
1012 rb_raise(rb_eArgError, "negative string size (or size too big)");
1013 }
1014
1015 if (enc == NULL) {
1016 enc = rb_ascii8bit_encoding();
1017 }
1018
1019 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1020
1021 int termlen = rb_enc_mbminlen(enc);
1022
1023 if (STR_EMBEDDABLE_P(len, termlen)) {
1024 str = str_alloc_embed(klass, len + termlen);
1025 if (len == 0) {
1026 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1027 }
1028 }
1029 else {
1030 str = str_alloc_heap(klass);
1031 RSTRING(str)->as.heap.aux.capa = len;
1032 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1033 * integer overflow. If we can STATIC_ASSERT that, the following
1034 * mul_add_mul can be reverted to a simple ALLOC_N. */
1035 RSTRING(str)->as.heap.ptr =
1036 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1037 }
1038
1039 rb_enc_raw_set(str, enc);
1040
1041 if (ptr) {
1042 memcpy(RSTRING_PTR(str), ptr, len);
1043 }
1044
1045 STR_SET_LEN(str, len);
1046 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1047 return str;
1048}
1049
1050static VALUE
1051str_new(VALUE klass, const char *ptr, long len)
1052{
1053 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1054}
1055
1056VALUE
1057rb_str_new(const char *ptr, long len)
1058{
1059 return str_new(rb_cString, ptr, len);
1060}
1061
1062VALUE
1063rb_usascii_str_new(const char *ptr, long len)
1064{
1065 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1066}
1067
1068VALUE
1069rb_utf8_str_new(const char *ptr, long len)
1070{
1071 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1072}
1073
1074VALUE
1075rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1076{
1077 return str_enc_new(rb_cString, ptr, len, enc);
1078}
1079
1080VALUE
1081rb_str_new_cstr(const char *ptr)
1082{
1083 must_not_null(ptr);
1084 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1085 * memory regions, and that cannot be detected by the MSAN. Just
1086 * trust the programmer that the argument passed here is a sane C
1087 * string. */
1088 __msan_unpoison_string(ptr);
1089 return rb_str_new(ptr, strlen(ptr));
1090}
1091
1092VALUE
1094{
1096}
1097
1098VALUE
1099rb_utf8_str_new_cstr(const char *ptr)
1100{
1102}
1103
1104VALUE
1105rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
1106{
1107 must_not_null(ptr);
1108 if (rb_enc_mbminlen(enc) != 1) {
1109 rb_raise(rb_eArgError, "wchar encoding given");
1110 }
1111 return rb_enc_str_new(ptr, strlen(ptr), enc);
1112}
1113
1114static VALUE
1115str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1116{
1117 VALUE str;
1118
1119 if (len < 0) {
1120 rb_raise(rb_eArgError, "negative string size (or size too big)");
1121 }
1122
1123 if (!ptr) {
1124 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1125 }
1126 else {
1127 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1128 str = str_alloc_heap(klass);
1129 RSTRING(str)->len = len;
1130 RSTRING(str)->as.heap.ptr = (char *)ptr;
1131 RSTRING(str)->as.heap.aux.capa = len;
1132 RBASIC(str)->flags |= STR_NOFREE;
1133 rb_enc_associate_index(str, encindex);
1134 }
1135 return str;
1136}
1137
1138VALUE
1139rb_str_new_static(const char *ptr, long len)
1140{
1141 return str_new_static(rb_cString, ptr, len, 0);
1142}
1143
1144VALUE
1145rb_usascii_str_new_static(const char *ptr, long len)
1146{
1147 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1148}
1149
1150VALUE
1151rb_utf8_str_new_static(const char *ptr, long len)
1152{
1153 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1154}
1155
1156VALUE
1157rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1158{
1159 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1160}
1161
1162static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1163 rb_encoding *from, rb_encoding *to,
1164 int ecflags, VALUE ecopts);
1165
1166static inline bool
1167is_enc_ascii_string(VALUE str, rb_encoding *enc)
1168{
1169 int encidx = rb_enc_to_index(enc);
1170 if (rb_enc_get_index(str) == encidx)
1171 return is_ascii_string(str);
1172 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1173}
1174
1175VALUE
1176rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1177{
1178 long len;
1179 const char *ptr;
1180 VALUE newstr;
1181
1182 if (!to) return str;
1183 if (!from) from = rb_enc_get(str);
1184 if (from == to) return str;
1185 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1186 rb_is_ascii8bit_enc(to)) {
1187 if (STR_ENC_GET(str) != to) {
1188 str = rb_str_dup(str);
1189 rb_enc_associate(str, to);
1190 }
1191 return str;
1192 }
1193
1194 RSTRING_GETMEM(str, ptr, len);
1195 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1196 from, to, ecflags, ecopts);
1197 if (NIL_P(newstr)) {
1198 /* some error, return original */
1199 return str;
1200 }
1201 return newstr;
1202}
1203
1204VALUE
1205rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1206 rb_encoding *from, int ecflags, VALUE ecopts)
1207{
1208 long olen;
1209
1210 olen = RSTRING_LEN(newstr);
1211 if (ofs < -olen || olen < ofs)
1212 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1213 if (ofs < 0) ofs += olen;
1214 if (!from) {
1215 STR_SET_LEN(newstr, ofs);
1216 return rb_str_cat(newstr, ptr, len);
1217 }
1218
1219 rb_str_modify(newstr);
1220 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1221 rb_enc_get(newstr),
1222 ecflags, ecopts);
1223}
1224
1225VALUE
1226rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1227{
1228 STR_SET_LEN(str, 0);
1229 rb_enc_associate(str, enc);
1230 rb_str_cat(str, ptr, len);
1231 return str;
1232}
1233
1234static VALUE
1235str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1236 rb_encoding *from, rb_encoding *to,
1237 int ecflags, VALUE ecopts)
1238{
1239 rb_econv_t *ec;
1241 long olen;
1242 VALUE econv_wrapper;
1243 const unsigned char *start, *sp;
1244 unsigned char *dest, *dp;
1245 size_t converted_output = (size_t)ofs;
1246
1247 olen = rb_str_capacity(newstr);
1248
1249 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1250 RBASIC_CLEAR_CLASS(econv_wrapper);
1251 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1252 if (!ec) return Qnil;
1253 DATA_PTR(econv_wrapper) = ec;
1254
1255 sp = (unsigned char*)ptr;
1256 start = sp;
1257 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1258 (dp = dest + converted_output),
1259 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1261 /* destination buffer short */
1262 size_t converted_input = sp - start;
1263 size_t rest = len - converted_input;
1264 converted_output = dp - dest;
1265 rb_str_set_len(newstr, converted_output);
1266 if (converted_input && converted_output &&
1267 rest < (LONG_MAX / converted_output)) {
1268 rest = (rest * converted_output) / converted_input;
1269 }
1270 else {
1271 rest = olen;
1272 }
1273 olen += rest < 2 ? 2 : rest;
1274 rb_str_resize(newstr, olen);
1275 }
1276 DATA_PTR(econv_wrapper) = 0;
1277 RB_GC_GUARD(econv_wrapper);
1278 rb_econv_close(ec);
1279 switch (ret) {
1280 case econv_finished:
1281 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1282 rb_str_set_len(newstr, len);
1283 rb_enc_associate(newstr, to);
1284 return newstr;
1285
1286 default:
1287 return Qnil;
1288 }
1289}
1290
1291VALUE
1292rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1293{
1294 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1295}
1296
1297VALUE
1298rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1299{
1300 rb_encoding *ienc;
1301 VALUE str;
1302 const int eidx = rb_enc_to_index(eenc);
1303
1304 if (!ptr) {
1305 return rb_enc_str_new(ptr, len, eenc);
1306 }
1307
1308 /* ASCII-8BIT case, no conversion */
1309 if ((eidx == rb_ascii8bit_encindex()) ||
1310 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1311 return rb_str_new(ptr, len);
1312 }
1313 /* no default_internal or same encoding, no conversion */
1315 if (!ienc || eenc == ienc) {
1316 return rb_enc_str_new(ptr, len, eenc);
1317 }
1318 /* ASCII compatible, and ASCII only string, no conversion in
1319 * default_internal */
1320 if ((eidx == rb_ascii8bit_encindex()) ||
1321 (eidx == rb_usascii_encindex()) ||
1322 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1323 return rb_enc_str_new(ptr, len, ienc);
1324 }
1325 /* convert from the given encoding to default_internal */
1326 str = rb_enc_str_new(NULL, 0, ienc);
1327 /* when the conversion failed for some reason, just ignore the
1328 * default_internal and result in the given encoding as-is. */
1329 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1330 rb_str_initialize(str, ptr, len, eenc);
1331 }
1332 return str;
1333}
1334
1335VALUE
1336rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1337{
1338 int eidx = rb_enc_to_index(eenc);
1339 if (eidx == rb_usascii_encindex() &&
1340 !is_ascii_string(str)) {
1341 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1342 return str;
1343 }
1344 rb_enc_associate_index(str, eidx);
1345 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1346}
1347
1348VALUE
1349rb_external_str_new(const char *ptr, long len)
1350{
1352}
1353
1354VALUE
1356{
1358}
1359
1360VALUE
1361rb_locale_str_new(const char *ptr, long len)
1362{
1364}
1365
1366VALUE
1368{
1369 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1370}
1371
1372VALUE
1373rb_filesystem_str_new(const char *ptr, long len)
1374{
1376}
1377
1378VALUE
1380{
1381 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1382}
1383
1384VALUE
1389
1390VALUE
1395
1396VALUE
1397rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1398{
1399 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1400}
1401
1402static VALUE
1403str_replace_shared_without_enc(VALUE str2, VALUE str)
1404{
1405 const int termlen = TERM_LEN(str);
1406 char *ptr;
1407 long len;
1408
1409 RSTRING_GETMEM(str, ptr, len);
1410 if (str_embed_capa(str2) >= len + termlen) {
1411 char *ptr2 = RSTRING(str2)->as.embed.ary;
1412 STR_SET_EMBED(str2);
1413 memcpy(ptr2, RSTRING_PTR(str), len);
1414 TERM_FILL(ptr2+len, termlen);
1415 }
1416 else {
1417 VALUE root;
1418 if (STR_SHARED_P(str)) {
1419 root = RSTRING(str)->as.heap.aux.shared;
1420 RSTRING_GETMEM(str, ptr, len);
1421 }
1422 else {
1423 root = rb_str_new_frozen(str);
1424 RSTRING_GETMEM(root, ptr, len);
1425 }
1426 RUBY_ASSERT(OBJ_FROZEN(root));
1427
1428 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1429 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1430 rb_fatal("about to free a possible shared root");
1431 }
1432 char *ptr2 = STR_HEAP_PTR(str2);
1433 if (ptr2 != ptr) {
1434 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1435 }
1436 }
1437 FL_SET(str2, STR_NOEMBED);
1438 RSTRING(str2)->as.heap.ptr = ptr;
1439 STR_SET_SHARED(str2, root);
1440 }
1441
1442 STR_SET_LEN(str2, len);
1443
1444 return str2;
1445}
1446
1447static VALUE
1448str_replace_shared(VALUE str2, VALUE str)
1449{
1450 str_replace_shared_without_enc(str2, str);
1451 rb_enc_cr_str_exact_copy(str2, str);
1452 return str2;
1453}
1454
1455static VALUE
1456str_new_shared(VALUE klass, VALUE str)
1457{
1458 return str_replace_shared(str_alloc_heap(klass), str);
1459}
1460
1461VALUE
1463{
1464 return str_new_shared(rb_obj_class(str), str);
1465}
1466
1467VALUE
1469{
1470 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1471 return str_new_frozen(rb_obj_class(orig), orig);
1472}
1473
1474static VALUE
1475rb_str_new_frozen_String(VALUE orig)
1476{
1477 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1478 return str_new_frozen(rb_cString, orig);
1479}
1480
1481VALUE
1482rb_str_tmp_frozen_acquire(VALUE orig)
1483{
1484 if (OBJ_FROZEN_RAW(orig)) return orig;
1485 return str_new_frozen_buffer(0, orig, FALSE);
1486}
1487
1488VALUE
1489rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1490{
1491 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1492 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1493
1494 VALUE str = str_alloc_heap(0);
1495 OBJ_FREEZE(str);
1496 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1497 FL_SET(str, STR_SHARED_ROOT);
1498
1499 size_t capa = str_capacity(orig, TERM_LEN(orig));
1500
1501 /* If the string is embedded then we want to create a copy that is heap
1502 * allocated. If the string is shared then the shared root must be
1503 * embedded, so we want to create a copy. If the string is a shared root
1504 * then it must be embedded, so we want to create a copy. */
1505 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1506 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1507 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1508 }
1509 else {
1510 /* orig must be heap allocated and not shared, so we can safely transfer
1511 * the pointer to str. */
1512 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1513 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1514 RBASIC(orig)->flags &= ~STR_NOFREE;
1515 STR_SET_SHARED(orig, str);
1516 }
1517
1518 RSTRING(str)->len = RSTRING(orig)->len;
1519 RSTRING(str)->as.heap.aux.capa = capa;
1520
1521 return str;
1522}
1523
1524void
1525rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1526{
1527 if (RBASIC_CLASS(tmp) != 0)
1528 return;
1529
1530 if (STR_EMBED_P(tmp)) {
1532 }
1533 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1534 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1535 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1536
1537 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1538 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1539 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1540
1541 /* Unshare orig since the root (tmp) only has this one child. */
1542 FL_UNSET_RAW(orig, STR_SHARED);
1543 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1544 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1546
1547 /* Make tmp embedded and empty so it is safe for sweeping. */
1548 STR_SET_EMBED(tmp);
1549 STR_SET_LEN(tmp, 0);
1550 }
1551 }
1552}
1553
1554static VALUE
1555str_new_frozen(VALUE klass, VALUE orig)
1556{
1557 return str_new_frozen_buffer(klass, orig, TRUE);
1558}
1559
1560static VALUE
1561heap_str_make_shared(VALUE klass, VALUE orig)
1562{
1563 RUBY_ASSERT(!STR_EMBED_P(orig));
1564 RUBY_ASSERT(!STR_SHARED_P(orig));
1565
1566 VALUE str = str_alloc_heap(klass);
1567 STR_SET_LEN(str, RSTRING_LEN(orig));
1568 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1569 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1570 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1571 RBASIC(orig)->flags &= ~STR_NOFREE;
1572 STR_SET_SHARED(orig, str);
1573 if (klass == 0)
1574 FL_UNSET_RAW(str, STR_BORROWED);
1575 return str;
1576}
1577
1578static VALUE
1579str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1580{
1581 VALUE str;
1582
1583 long len = RSTRING_LEN(orig);
1584 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1585 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1586
1587 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1588 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1589 RUBY_ASSERT(STR_EMBED_P(str));
1590 }
1591 else {
1592 if (FL_TEST_RAW(orig, STR_SHARED)) {
1593 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1594 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1595 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1596 RUBY_ASSERT(ofs >= 0);
1597 RUBY_ASSERT(rest >= 0);
1598 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1600
1601 if ((ofs > 0) || (rest > 0) ||
1602 (klass != RBASIC(shared)->klass) ||
1603 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1604 str = str_new_shared(klass, shared);
1605 RUBY_ASSERT(!STR_EMBED_P(str));
1606 RSTRING(str)->as.heap.ptr += ofs;
1607 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1608 }
1609 else {
1610 if (RBASIC_CLASS(shared) == 0)
1611 FL_SET_RAW(shared, STR_BORROWED);
1612 return shared;
1613 }
1614 }
1615 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1616 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1617 STR_SET_EMBED(str);
1618 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1619 STR_SET_LEN(str, RSTRING_LEN(orig));
1620 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1621 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1622 }
1623 else {
1624 str = heap_str_make_shared(klass, orig);
1625 }
1626 }
1627
1628 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1629 OBJ_FREEZE(str);
1630 return str;
1631}
1632
1633VALUE
1634rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1635{
1636 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1637}
1638
1639static VALUE
1640str_new_empty_String(VALUE str)
1641{
1642 VALUE v = rb_str_new(0, 0);
1643 rb_enc_copy(v, str);
1644 return v;
1645}
1646
1647#define STR_BUF_MIN_SIZE 63
1648
1649VALUE
1651{
1652 if (STR_EMBEDDABLE_P(capa, 1)) {
1653 return str_alloc_embed(rb_cString, capa + 1);
1654 }
1655
1656 VALUE str = str_alloc_heap(rb_cString);
1657
1658 RSTRING(str)->as.heap.aux.capa = capa;
1659 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1660 RSTRING(str)->as.heap.ptr[0] = '\0';
1661
1662 return str;
1663}
1664
1665VALUE
1666rb_str_buf_new_cstr(const char *ptr)
1667{
1668 VALUE str;
1669 long len = strlen(ptr);
1670
1671 str = rb_str_buf_new(len);
1672 rb_str_buf_cat(str, ptr, len);
1673
1674 return str;
1675}
1676
1677VALUE
1679{
1680 return str_new(0, 0, len);
1681}
1682
1683void
1685{
1686 if (STR_EMBED_P(str)) {
1687 RB_DEBUG_COUNTER_INC(obj_str_embed);
1688 }
1689 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1690 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1691 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1692 }
1693 else {
1694 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1695 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1696 }
1697}
1698
1699size_t
1700rb_str_memsize(VALUE str)
1701{
1702 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1703 return STR_HEAP_SIZE(str);
1704 }
1705 else {
1706 return 0;
1707 }
1708}
1709
1710VALUE
1712{
1713 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1714}
1715
1716static inline void str_discard(VALUE str);
1717static void str_shared_replace(VALUE str, VALUE str2);
1718
1719void
1721{
1722 if (str != str2) str_shared_replace(str, str2);
1723}
1724
1725static void
1726str_shared_replace(VALUE str, VALUE str2)
1727{
1728 rb_encoding *enc;
1729 int cr;
1730 int termlen;
1731
1732 RUBY_ASSERT(str2 != str);
1733 enc = STR_ENC_GET(str2);
1734 cr = ENC_CODERANGE(str2);
1735 str_discard(str);
1736 termlen = rb_enc_mbminlen(enc);
1737
1738 STR_SET_LEN(str, RSTRING_LEN(str2));
1739
1740 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1741 STR_SET_EMBED(str);
1742 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1743 rb_enc_associate(str, enc);
1744 ENC_CODERANGE_SET(str, cr);
1745 }
1746 else {
1747 if (STR_EMBED_P(str2)) {
1748 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1749 long len = RSTRING_LEN(str2);
1750 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1751
1752 char *new_ptr = ALLOC_N(char, len + termlen);
1753 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1754 RSTRING(str2)->as.heap.ptr = new_ptr;
1755 STR_SET_LEN(str2, len);
1756 RSTRING(str2)->as.heap.aux.capa = len;
1757 STR_SET_NOEMBED(str2);
1758 }
1759
1760 STR_SET_NOEMBED(str);
1761 FL_UNSET(str, STR_SHARED);
1762 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1763
1764 if (FL_TEST(str2, STR_SHARED)) {
1765 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1766 STR_SET_SHARED(str, shared);
1767 }
1768 else {
1769 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1770 }
1771
1772 /* abandon str2 */
1773 STR_SET_EMBED(str2);
1774 RSTRING_PTR(str2)[0] = 0;
1775 STR_SET_LEN(str2, 0);
1776 rb_enc_associate(str, enc);
1777 ENC_CODERANGE_SET(str, cr);
1778 }
1779}
1780
1781VALUE
1782rb_obj_as_string(VALUE obj)
1783{
1784 VALUE str;
1785
1786 if (RB_TYPE_P(obj, T_STRING)) {
1787 return obj;
1788 }
1789 str = rb_funcall(obj, idTo_s, 0);
1790 return rb_obj_as_string_result(str, obj);
1791}
1792
1793VALUE
1794rb_obj_as_string_result(VALUE str, VALUE obj)
1795{
1796 if (!RB_TYPE_P(str, T_STRING))
1797 return rb_any_to_s(obj);
1798 return str;
1799}
1800
1801static VALUE
1802str_replace(VALUE str, VALUE str2)
1803{
1804 long len;
1805
1806 len = RSTRING_LEN(str2);
1807 if (STR_SHARED_P(str2)) {
1808 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1810 STR_SET_NOEMBED(str);
1811 STR_SET_LEN(str, len);
1812 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1813 STR_SET_SHARED(str, shared);
1814 rb_enc_cr_str_exact_copy(str, str2);
1815 }
1816 else {
1817 str_replace_shared(str, str2);
1818 }
1819
1820 return str;
1821}
1822
1823static inline VALUE
1824ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1825{
1826 size_t size = rb_str_embed_size(capa);
1827 RUBY_ASSERT(size > 0);
1828 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1829
1830 NEWOBJ_OF(str, struct RString, klass,
1832
1833 return (VALUE)str;
1834}
1835
1836static inline VALUE
1837ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1838{
1839 NEWOBJ_OF(str, struct RString, klass,
1840 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1841
1842 return (VALUE)str;
1843}
1844
1845static inline VALUE
1846str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1847{
1848 int encidx = 0;
1849 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1850 encidx = rb_enc_get_index(str);
1851 flags &= ~ENCODING_MASK;
1852 }
1853 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1854 if (encidx) rb_enc_associate_index(dup, encidx);
1855 return dup;
1856}
1857
1858static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1859
1860static inline VALUE
1861str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1862{
1863 VALUE flags = FL_TEST_RAW(str, flag_mask);
1864 long len = RSTRING_LEN(str);
1865
1866 RUBY_ASSERT(STR_EMBED_P(dup));
1867 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1868 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1869 STR_SET_LEN(dup, RSTRING_LEN(str));
1870 return str_duplicate_setup_encoding(str, dup, flags);
1871}
1872
1873static inline VALUE
1874str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1875{
1876 VALUE flags = FL_TEST_RAW(str, flag_mask);
1877 VALUE root = str;
1878 if (FL_TEST_RAW(str, STR_SHARED)) {
1879 root = RSTRING(str)->as.heap.aux.shared;
1880 }
1881 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1882 root = str = str_new_frozen(klass, str);
1883 flags = FL_TEST_RAW(str, flag_mask);
1884 }
1885 RUBY_ASSERT(!STR_SHARED_P(root));
1887
1888 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1889 FL_SET(root, STR_SHARED_ROOT);
1890 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1891 flags |= RSTRING_NOEMBED | STR_SHARED;
1892
1893 STR_SET_LEN(dup, RSTRING_LEN(str));
1894 return str_duplicate_setup_encoding(str, dup, flags);
1895}
1896
1897static inline VALUE
1898str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1899{
1900 if (STR_EMBED_P(str)) {
1901 return str_duplicate_setup_embed(klass, str, dup);
1902 }
1903 else {
1904 return str_duplicate_setup_heap(klass, str, dup);
1905 }
1906}
1907
1908static inline VALUE
1909str_duplicate(VALUE klass, VALUE str)
1910{
1911 VALUE dup;
1912 if (STR_EMBED_P(str)) {
1913 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1914 }
1915 else {
1916 dup = str_alloc_heap(klass);
1917 }
1918
1919 return str_duplicate_setup(klass, str, dup);
1920}
1921
1922VALUE
1924{
1925 return str_duplicate(rb_obj_class(str), str);
1926}
1927
1928/* :nodoc: */
1929VALUE
1930rb_str_dup_m(VALUE str)
1931{
1932 if (LIKELY(BARE_STRING_P(str))) {
1933 return str_duplicate(rb_obj_class(str), str);
1934 }
1935 else {
1936 return rb_obj_dup(str);
1937 }
1938}
1939
1940VALUE
1942{
1943 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1944 return str_duplicate(rb_cString, str);
1945}
1946
1947VALUE
1948rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1949{
1950 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1951 VALUE new_str, klass = rb_cString;
1952
1953 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1954 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1955 str_duplicate_setup_embed(klass, str, new_str);
1956 }
1957 else {
1958 new_str = ec_str_alloc_heap(ec, klass);
1959 str_duplicate_setup_heap(klass, str, new_str);
1960 }
1961 if (chilled) {
1962 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1963 }
1964 return new_str;
1965}
1966
1967VALUE
1968rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1969{
1970 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1971 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1972 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1973 FL_SET_RAW(str, STR_CHILLED_LITERAL);
1974 return rb_str_freeze(str);
1975}
1976
1977/*
1978 *
1979 * call-seq:
1980 * String.new(string = '', **opts) -> new_string
1981 *
1982 * :include: doc/string/new.rdoc
1983 *
1984 */
1985
1986static VALUE
1987rb_str_init(int argc, VALUE *argv, VALUE str)
1988{
1989 static ID keyword_ids[2];
1990 VALUE orig, opt, venc, vcapa;
1991 VALUE kwargs[2];
1992 rb_encoding *enc = 0;
1993 int n;
1994
1995 if (!keyword_ids[0]) {
1996 keyword_ids[0] = rb_id_encoding();
1997 CONST_ID(keyword_ids[1], "capacity");
1998 }
1999
2000 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2001 if (!NIL_P(opt)) {
2002 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2003 venc = kwargs[0];
2004 vcapa = kwargs[1];
2005 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2006 enc = rb_to_encoding(venc);
2007 }
2008 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2009 long capa = NUM2LONG(vcapa);
2010 long len = 0;
2011 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2012
2013 if (capa < STR_BUF_MIN_SIZE) {
2014 capa = STR_BUF_MIN_SIZE;
2015 }
2016 if (n == 1) {
2017 StringValue(orig);
2018 len = RSTRING_LEN(orig);
2019 if (capa < len) {
2020 capa = len;
2021 }
2022 if (orig == str) n = 0;
2023 }
2024 str_modifiable(str);
2025 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2026 /* make noembed always */
2027 const size_t size = (size_t)capa + termlen;
2028 const char *const old_ptr = RSTRING_PTR(str);
2029 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2030 char *new_ptr = ALLOC_N(char, size);
2031 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2032 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2033 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2034 RSTRING(str)->as.heap.ptr = new_ptr;
2035 }
2036 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2037 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2038 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2039 }
2040 STR_SET_LEN(str, len);
2041 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2042 if (n == 1) {
2043 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2044 rb_enc_cr_str_exact_copy(str, orig);
2045 }
2046 FL_SET(str, STR_NOEMBED);
2047 RSTRING(str)->as.heap.aux.capa = capa;
2048 }
2049 else if (n == 1) {
2050 rb_str_replace(str, orig);
2051 }
2052 if (enc) {
2053 rb_enc_associate(str, enc);
2055 }
2056 }
2057 else if (n == 1) {
2058 rb_str_replace(str, orig);
2059 }
2060 return str;
2061}
2062
2063/* :nodoc: */
2064static VALUE
2065rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2066{
2067 if (klass != rb_cString) {
2068 return rb_class_new_instance_pass_kw(argc, argv, klass);
2069 }
2070
2071 static ID keyword_ids[2];
2072 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2073 VALUE kwargs[2];
2074 rb_encoding *enc = NULL;
2075
2076 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2077 if (NIL_P(opt)) {
2078 return rb_class_new_instance_pass_kw(argc, argv, klass);
2079 }
2080
2081 keyword_ids[0] = rb_id_encoding();
2082 CONST_ID(keyword_ids[1], "capacity");
2083 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2084 encoding = kwargs[0];
2085 capacity = kwargs[1];
2086
2087 if (n == 1) {
2088 orig = StringValue(orig);
2089 }
2090 else {
2091 orig = Qnil;
2092 }
2093
2094 if (UNDEF_P(encoding)) {
2095 if (!NIL_P(orig)) {
2096 encoding = rb_obj_encoding(orig);
2097 }
2098 }
2099
2100 if (!UNDEF_P(encoding)) {
2101 enc = rb_to_encoding(encoding);
2102 }
2103
2104 // If capacity is nil, we're basically just duping `orig`.
2105 if (UNDEF_P(capacity)) {
2106 if (NIL_P(orig)) {
2107 VALUE empty_str = str_new(klass, "", 0);
2108 if (enc) {
2109 rb_enc_associate(empty_str, enc);
2110 }
2111 return empty_str;
2112 }
2113 VALUE copy = str_duplicate(klass, orig);
2114 rb_enc_associate(copy, enc);
2115 ENC_CODERANGE_CLEAR(copy);
2116 return copy;
2117 }
2118
2119 long capa = 0;
2120 capa = NUM2LONG(capacity);
2121 if (capa < 0) {
2122 capa = 0;
2123 }
2124
2125 if (!NIL_P(orig)) {
2126 long orig_capa = rb_str_capacity(orig);
2127 if (orig_capa > capa) {
2128 capa = orig_capa;
2129 }
2130 }
2131
2132 VALUE str = str_enc_new(klass, NULL, capa, enc);
2133 STR_SET_LEN(str, 0);
2134 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2135
2136 if (!NIL_P(orig)) {
2137 rb_str_buf_append(str, orig);
2138 }
2139
2140 return str;
2141}
2142
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes contained in one machine word.
 *
 * A byte is a leading byte when it looks like 0xxxxxxx or 11xxxxxx, i.e.
 * when it is NOT a 10xxxxxx continuation byte
 * (see https://en.wikipedia.org/wiki/UTF-8).  Per byte that is:
 *
 *   if (!(byte & 0x80))
 *     byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);  // bit6 tells whether the byte is leading
 *
 * The code below applies that test to every byte of the word at once and
 * then sums the per-byte results.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t bits = *s;

    /* After this, bit0 of each byte is 1 exactly for leading bytes. */
    bits = ((bits >> 6) | (~bits >> 7)) & (NONASCII_MASK >> 7);

    /* Sum the per-byte flags. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(bits);
#else
    bits += (bits >> 8);
    bits += (bits >> 16);
# if SIZEOF_VOIDP == 8
    bits += (bits >> 32);
# endif
    return (bits & 0xF);
#endif
}
#endif
2182
2183static inline long
2184enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2185{
2186 long c;
2187 const char *q;
2188
2189 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2190 long diff = (long)(e - p);
2191 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2192 }
2193#ifdef NONASCII_MASK
2194 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2195 uintptr_t len = 0;
2196 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2197 const uintptr_t *s, *t;
2198 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2199 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2200 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2201 while (p < (const char *)s) {
2202 if (is_utf8_lead_byte(*p)) len++;
2203 p++;
2204 }
2205 while (s < t) {
2206 len += count_utf8_lead_bytes_with_word(s);
2207 s++;
2208 }
2209 p = (const char *)s;
2210 }
2211 while (p < e) {
2212 if (is_utf8_lead_byte(*p)) len++;
2213 p++;
2214 }
2215 return (long)len;
2216 }
2217#endif
2218 else if (rb_enc_asciicompat(enc)) {
2219 c = 0;
2220 if (ENC_CODERANGE_CLEAN_P(cr)) {
2221 while (p < e) {
2222 if (ISASCII(*p)) {
2223 q = search_nonascii(p, e);
2224 if (!q)
2225 return c + (e - p);
2226 c += q - p;
2227 p = q;
2228 }
2229 p += rb_enc_fast_mbclen(p, e, enc);
2230 c++;
2231 }
2232 }
2233 else {
2234 while (p < e) {
2235 if (ISASCII(*p)) {
2236 q = search_nonascii(p, e);
2237 if (!q)
2238 return c + (e - p);
2239 c += q - p;
2240 p = q;
2241 }
2242 p += rb_enc_mbclen(p, e, enc);
2243 c++;
2244 }
2245 }
2246 return c;
2247 }
2248
2249 for (c=0; p<e; c++) {
2250 p += rb_enc_mbclen(p, e, enc);
2251 }
2252 return c;
2253}
2254
2255long
2256rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2257{
2258 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2259}
2260
2261/* To get strlen with cr
2262 * Note that given cr is not used.
2263 */
2264long
2265rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2266{
2267 long c;
2268 const char *q;
2269 int ret;
2270
2271 *cr = 0;
2272 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2273 long diff = (long)(e - p);
2274 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2275 }
2276 else if (rb_enc_asciicompat(enc)) {
2277 c = 0;
2278 while (p < e) {
2279 if (ISASCII(*p)) {
2280 q = search_nonascii(p, e);
2281 if (!q) {
2282 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2283 return c + (e - p);
2284 }
2285 c += q - p;
2286 p = q;
2287 }
2288 ret = rb_enc_precise_mbclen(p, e, enc);
2289 if (MBCLEN_CHARFOUND_P(ret)) {
2290 *cr |= ENC_CODERANGE_VALID;
2291 p += MBCLEN_CHARFOUND_LEN(ret);
2292 }
2293 else {
2295 p++;
2296 }
2297 c++;
2298 }
2299 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2300 return c;
2301 }
2302
2303 for (c=0; p<e; c++) {
2304 ret = rb_enc_precise_mbclen(p, e, enc);
2305 if (MBCLEN_CHARFOUND_P(ret)) {
2306 *cr |= ENC_CODERANGE_VALID;
2307 p += MBCLEN_CHARFOUND_LEN(ret);
2308 }
2309 else {
2311 if (p + rb_enc_mbminlen(enc) <= e)
2312 p += rb_enc_mbminlen(enc);
2313 else
2314 p = e;
2315 }
2316 }
2317 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2318 return c;
2319}
2320
2321/* enc must be str's enc or rb_enc_check(str, str2) */
2322static long
2323str_strlen(VALUE str, rb_encoding *enc)
2324{
2325 const char *p, *e;
2326 int cr;
2327
2328 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2329 if (!enc) enc = STR_ENC_GET(str);
2330 p = RSTRING_PTR(str);
2331 e = RSTRING_END(str);
2332 cr = ENC_CODERANGE(str);
2333
2334 if (cr == ENC_CODERANGE_UNKNOWN) {
2335 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2336 if (cr) ENC_CODERANGE_SET(str, cr);
2337 return n;
2338 }
2339 else {
2340 return enc_strlen(p, e, enc, cr);
2341 }
2342}
2343
2344long
2346{
2347 return str_strlen(str, NULL);
2348}
2349
2350/*
2351 * call-seq:
2352 * length -> integer
2353 *
2354 * :include: doc/string/length.rdoc
2355 *
2356 */
2357
2358VALUE
2360{
2361 return LONG2NUM(str_strlen(str, NULL));
2362}
2363
2364/*
2365 * call-seq:
2366 * bytesize -> integer
2367 *
2368 * :include: doc/string/bytesize.rdoc
2369 *
2370 */
2371
2372VALUE
2373rb_str_bytesize(VALUE str)
2374{
2375 return LONG2NUM(RSTRING_LEN(str));
2376}
2377
2378/*
2379 * call-seq:
2380 * empty? -> true or false
2381 *
2382 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2383 *
2384 * "hello".empty? # => false
2385 * " ".empty? # => false
2386 * "".empty? # => true
2387 *
2388 */
2389
2390static VALUE
2391rb_str_empty(VALUE str)
2392{
2393 return RBOOL(RSTRING_LEN(str) == 0);
2394}
2395
2396/*
2397 * call-seq:
2398 * string + other_string -> new_string
2399 *
2400 * Returns a new +String+ containing +other_string+ concatenated to +self+:
2401 *
2402 * "Hello from " + self.to_s # => "Hello from main"
2403 *
2404 */
2405
2406VALUE
2408{
2409 VALUE str3;
2410 rb_encoding *enc;
2411 char *ptr1, *ptr2, *ptr3;
2412 long len1, len2;
2413 int termlen;
2414
2415 StringValue(str2);
2416 enc = rb_enc_check_str(str1, str2);
2417 RSTRING_GETMEM(str1, ptr1, len1);
2418 RSTRING_GETMEM(str2, ptr2, len2);
2419 termlen = rb_enc_mbminlen(enc);
2420 if (len1 > LONG_MAX - len2) {
2421 rb_raise(rb_eArgError, "string size too big");
2422 }
2423 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2424 ptr3 = RSTRING_PTR(str3);
2425 memcpy(ptr3, ptr1, len1);
2426 memcpy(ptr3+len1, ptr2, len2);
2427 TERM_FILL(&ptr3[len1+len2], termlen);
2428
2429 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2431 RB_GC_GUARD(str1);
2432 RB_GC_GUARD(str2);
2433 return str3;
2434}
2435
2436/* A variant of rb_str_plus that does not raise but return Qundef instead. */
/* Non-raising front end for rb_str_plus().
 *
 * Returns Qundef when either encoding index is negative, when the two
 * encoding indexes differ, or when the combined byte length would exceed
 * LONG_MAX; otherwise returns rb_str_plus(str1, str2).
 *
 * NOTE(review): str2 is read with RSTRING_GETMEM without any conversion here,
 * so callers presumably pass two Strings -- confirm.  The embedded line
 * numbering also skips right after the opening brace, so statements may be
 * missing from this view. */
2437VALUE
2438rb_str_opt_plus(VALUE str1, VALUE str2)
2439{
2442 long len1, len2;
/* Only the lengths are used below; the pointers are declared MAYBE_UNUSED. */
2443 MAYBE_UNUSED(char) *ptr1, *ptr2;
2444 RSTRING_GETMEM(str1, ptr1, len1);
2445 RSTRING_GETMEM(str2, ptr2, len2);
2446 int enc1 = rb_enc_get_index(str1);
2447 int enc2 = rb_enc_get_index(str2);
2448
2449 if (enc1 < 0) {
2450 return Qundef;
2451 }
2452 else if (enc2 < 0) {
2453 return Qundef;
2454 }
2455 else if (enc1 != enc2) {
2456 return Qundef;
2457 }
/* Same overflow test that makes rb_str_plus raise "string size too big". */
2458 else if (len1 > LONG_MAX - len2) {
2459 return Qundef;
2460 }
2461 else {
2462 return rb_str_plus(str1, str2);
2463 }
2464
2465}
2466
2467/*
2468 * call-seq:
2469 * string * integer -> new_string
2470 *
2471 * Returns a new +String+ containing +integer+ copies of +self+:
2472 *
2473 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2474 * "Ho! " * 0 # => ""
2475 *
2476 */
2477
/* Builds a new string made of `times` copies of str.
 *
 * Raises ArgumentError for a negative count ("negative argument") and when
 * count * byte length would exceed LONG_MAX ("argument too big").
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
2478VALUE
2480{
2481 VALUE str2;
2482 long n, len;
2483 char *ptr2;
2484 int termlen;
2485
/* Fixnum 1: hand back a duplicate without converting `times`. */
2486 if (times == INT2FIX(1)) {
2487 return str_duplicate(rb_cString, str);
2488 }
/* Fixnum 0: an empty embedded string carrying str's encoding. */
2489 if (times == INT2FIX(0)) {
2490 str2 = str_alloc_embed(rb_cString, 0);
2491 rb_enc_copy(str2, str);
2492 return str2;
2493 }
2494 len = NUM2LONG(times);
2495 if (len < 0) {
2496 rb_raise(rb_eArgError, "negative argument");
2497 }
/* A single zero byte repeated: produce zero-filled storage directly
 * (memset / ZALLOC_N) instead of copying. */
2498 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2499 if (STR_EMBEDDABLE_P(len, 1)) {
2500 str2 = str_alloc_embed(rb_cString, len + 1);
2501 memset(RSTRING_PTR(str2), 0, len + 1);
2502 }
2503 else {
2504 str2 = str_alloc_heap(rb_cString);
2505 RSTRING(str2)->as.heap.aux.capa = len;
2506 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2507 }
2508 STR_SET_LEN(str2, len);
2509 rb_enc_copy(str2, str);
2510 return str2;
2511 }
/* Overflow guard for the multiplication below. */
2512 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2513 rb_raise(rb_eArgError, "argument too big");
2514 }
2515
/* From here on `len` is the result's byte length, not the repeat count. */
2516 len *= RSTRING_LEN(str);
2517 termlen = TERM_LEN(str);
2518 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2519 ptr2 = RSTRING_PTR(str2);
2520 if (len) {
/* Copy str once, then keep doubling the filled prefix while it fits in half
 * of the result; one final memcpy fills the remaining len-n bytes. */
2521 n = RSTRING_LEN(str);
2522 memcpy(ptr2, RSTRING_PTR(str), n);
2523 while (n <= len/2) {
2524 memcpy(ptr2 + n, ptr2, n);
2525 n *= 2;
2526 }
2527 memcpy(ptr2 + n, ptr2, len-n);
2528 }
2529 STR_SET_LEN(str2, len);
2530 TERM_FILL(&ptr2[len], termlen);
2531 rb_enc_cr_str_copy_for_substr(str2, str);
2532
2533 return str2;
2534}
2535
2536/*
2537 * call-seq:
2538 * string % object -> new_string
2539 *
2540 * Returns the result of formatting +object+ into the format specification +self+
2541 * (see Kernel#sprintf for formatting details):
2542 *
2543 * "%05d" % 123 # => "00123"
2544 *
2545 * If +self+ contains multiple substitutions, +object+ must be
2546 * an Array or Hash containing the values to be substituted:
2547 *
2548 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2549 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2550 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2551 *
2552 */
2553
2554static VALUE
2555rb_str_format_m(VALUE str, VALUE arg)
2556{
2557 VALUE tmp = rb_check_array_type(arg);
2558
2559 if (!NIL_P(tmp)) {
2560 VALUE result = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2561 RB_GC_GUARD(tmp);
2562 return result;
2563 }
2564 return rb_str_format(1, &arg, str);
2565}
2566
2567static inline void
2568rb_check_lockedtmp(VALUE str)
2569{
2570 if (FL_TEST(str, STR_TMPLOCK)) {
2571 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2572 }
2573}
2574
2575// If none of these flags are set, we know we have an modifiable string.
2576// If any is set, we need to do more detailed checks.
2577#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2578static inline void
2579str_modifiable(VALUE str)
2580{
2581 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2582 if (CHILLED_STRING_P(str)) {
2583 CHILLED_STRING_MUTATED(str);
2584 }
2585 rb_check_lockedtmp(str);
2586 rb_check_frozen(str);
2587 }
2588}
2589
2590static inline int
2591str_dependent_p(VALUE str)
2592{
2593 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2594 return FALSE;
2595 }
2596 else {
2597 return TRUE;
2598 }
2599}
2600
2601// If none of these flags are set, we know we have an independent string.
2602// If any is set, we need to do more detailed checks.
2603#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2604static inline int
2605str_independent(VALUE str)
2606{
2607 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2608 str_modifiable(str);
2609 return !str_dependent_p(str);
2610 }
2611 return TRUE;
2612}
2613
/* Gives str storage of its own with room for len + expand bytes plus a
 * termlen-byte terminator, keeping the first len bytes of its current
 * contents (len is clamped to the new capacity).
 *
 * If str is currently non-embedded and the embed area is large enough, the
 * bytes are moved into the embed area; otherwise a fresh heap buffer is
 * allocated and the STR_SHARED / STR_NOFREE flags are cleared. */
2614static void
2615str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2616{
2617 char *ptr;
2618 char *oldptr;
2619 long capa = len + expand;
2620
/* Only triggers when expand is negative. */
2621 if (len > capa) len = capa;
2622
2623 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
/* Read the heap pointer before STR_SET_EMBED switches the representation. */
2624 ptr = RSTRING(str)->as.heap.ptr;
2625 STR_SET_EMBED(str);
2626 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2627 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2628 STR_SET_LEN(str, len);
2629 return;
2630 }
2631
2632 ptr = ALLOC_N(char, (size_t)capa + termlen);
2633 oldptr = RSTRING_PTR(str);
2634 if (oldptr) {
2635 memcpy(ptr, oldptr, len);
2636 }
/* Free the old buffer only when STR_NOEMBED is set and neither STR_NOFREE
 * nor STR_SHARED is. */
2637 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2638 xfree(oldptr);
2639 }
2640 STR_SET_NOEMBED(str);
2641 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2642 TERM_FILL(ptr + len, termlen);
2643 RSTRING(str)->as.heap.ptr = ptr;
2644 STR_SET_LEN(str, len);
2645 RSTRING(str)->as.heap.aux.capa = capa;
2646}
2647
/* Prepares str for in-place modification: str_independent() performs the
 * modifiability checks, and a string that is not independent is made so
 * with str_make_independent().
 *
 * NOTE(review): the embedded line numbering skips just before the closing
 * brace, so a trailing statement may be missing from this view -- confirm
 * against the upstream file. */
2648void
2649rb_str_modify(VALUE str)
2650{
2651 if (!str_independent(str))
2652 str_make_independent(str);
2654}
2655
/* Prepares str for modification and reserves `expand` extra bytes of capacity.
 *
 * Raises ArgumentError for a negative `expand` ("negative expanding string
 * size") and when expand >= LONG_MAX - len ("string size too big").
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view, and the embedded numbering also skips just before
 * the closing brace -- confirm against the upstream file before editing. */
2656void
2658{
2659 int termlen = TERM_LEN(str);
2660 long len = RSTRING_LEN(str);
2661
2662 if (expand < 0) {
2663 rb_raise(rb_eArgError, "negative expanding string size");
2664 }
2665 if (expand >= LONG_MAX - len) {
2666 rb_raise(rb_eArgError, "string size too big");
2667 }
2668
/* A dependent string gets a private buffer already sized for the expansion;
 * an independent one is only resized when expand is positive. */
2669 if (!str_independent(str)) {
2670 str_make_independent_expand(str, len, expand, termlen);
2671 }
2672 else if (expand > 0) {
2673 RESIZE_CAPA_TERM(str, len + expand, termlen);
2674 }
2676}
2677
2678/* As rb_str_modify(), but don't clear coderange */
/* Makes str independent (after the str_independent() checks) when it is not
 * already.
 *
 * NOTE(review): the embedded line numbering skips both before and after the
 * "Force re-scan later" comment, so the statement that comment describes is
 * not present in this view -- confirm against the upstream file. */
2679static void
2680str_modify_keep_cr(VALUE str)
2681{
2682 if (!str_independent(str))
2683 str_make_independent(str);
2685 /* Force re-scan later */
2687}
2688
2689static inline void
2690str_discard(VALUE str)
2691{
2692 str_modifiable(str);
2693 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2694 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2695 RSTRING(str)->as.heap.ptr = 0;
2696 STR_SET_LEN(str, 0);
2697 }
2698}
2699
/* Checks that str has an ASCII-compatible encoding.
 *
 * Raises TypeError ("not encoding capable object") when rb_enc_get_index()
 * yields -1, and EncCompatError when the encoding is not ASCII compatible.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
2700void
2702{
2703 int encindex = rb_enc_get_index(str);
2704
2705 if (RB_UNLIKELY(encindex == -1)) {
2706 rb_raise(rb_eTypeError, "not encoding capable object");
2707 }
2708
/* Indexes accepted by str_encindex_fastpath() skip the encoding lookup. */
2709 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2710 return;
2711 }
2712
2713 rb_encoding *enc = rb_enc_from_index(encindex);
2714 if (!rb_enc_asciicompat(enc)) {
2715 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2716 }
2717}
2718
/* Returns *ptr as a String.  When *ptr is not a T_STRING it is converted
 * with rb_str_to_str() and the converted value is stored back through ptr.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
2719VALUE
2721{
2722 VALUE s = *ptr;
2723 if (!RB_TYPE_P(s, T_STRING)) {
2724 s = rb_str_to_str(s);
2725 *ptr = s;
2726 }
2727 return s;
2728}
2729
/* Coerces *ptr with rb_string_value() and returns RSTRING_PTR of the result.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
2730char *
2732{
2733 VALUE str = rb_string_value(ptr);
2734 return RSTRING_PTR(str);
2735}
2736
/* Returns 1 when the first +n+ bytes at +s+ are all zero (trivially so for
 * n <= 0), and 0 as soon as a non-zero byte is found. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2745
2746static const char *
2747str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2748{
2749 const char *e = s + len;
2750
2751 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2752 if (zero_filled(s, minlen)) return s;
2753 }
2754 return 0;
2755}
2756
2757static char *
2758str_fill_term(VALUE str, char *s, long len, int termlen)
2759{
2760 /* This function assumes that (capa + termlen) bytes of memory
2761 * is allocated, like many other functions in this file.
2762 */
2763 if (str_dependent_p(str)) {
2764 if (!zero_filled(s + len, termlen))
2765 str_make_independent_expand(str, len, 0L, termlen);
2766 }
2767 else {
2768 TERM_FILL(s + len, termlen);
2769 return s;
2770 }
2771 return RSTRING_PTR(str);
2772}
2773
2774void
2775rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2776{
2777 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2778 long len = RSTRING_LEN(str);
2779
2780 RUBY_ASSERT(capa >= len);
2781 if (capa - len < termlen) {
2782 rb_check_lockedtmp(str);
2783 str_make_independent_expand(str, len, 0L, termlen);
2784 }
2785 else if (str_dependent_p(str)) {
2786 if (termlen > oldtermlen)
2787 str_make_independent_expand(str, len, 0L, termlen);
2788 }
2789 else {
2790 if (!STR_EMBED_P(str)) {
2791 /* modify capa instead of realloc */
2792 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2793 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2794 }
2795 if (termlen > oldtermlen) {
2796 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2797 }
2798 }
2799
2800 return;
2801}
2802
2803static char *
2804str_null_check(VALUE str, int *w)
2805{
2806 char *s = RSTRING_PTR(str);
2807 long len = RSTRING_LEN(str);
2808 rb_encoding *enc = rb_enc_get(str);
2809 const int minlen = rb_enc_mbminlen(enc);
2810
2811 if (minlen > 1) {
2812 *w = 1;
2813 if (str_null_char(s, len, minlen, enc)) {
2814 return NULL;
2815 }
2816 return str_fill_term(str, s, len, minlen);
2817 }
2818 *w = 0;
2819 if (!s || memchr(s, 0, len)) {
2820 return NULL;
2821 }
2822 if (s[len]) {
2823 s = str_fill_term(str, s, len, minlen);
2824 }
2825 return s;
2826}
2827
2828char *
2829rb_str_to_cstr(VALUE str)
2830{
2831 int w;
2832 return str_null_check(str, &w);
2833}
2834
/* Coerces *ptr with rb_string_value() and returns the buffer produced by
 * str_null_check().
 *
 * Raises ArgumentError when str_null_check() returns NULL: "string contains
 * null char" if it set the wide flag, "string contains null byte" otherwise.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
2835char *
2837{
2838 VALUE str = rb_string_value(ptr);
2839 int w;
2840 char *s = str_null_check(str, &w);
2841 if (!s) {
2842 if (w) {
2843 rb_raise(rb_eArgError, "string contains null char");
2844 }
2845 rb_raise(rb_eArgError, "string contains null byte");
2846 }
2847 return s;
2848}
2849
2850char *
2851rb_str_fill_terminator(VALUE str, const int newminlen)
2852{
2853 char *s = RSTRING_PTR(str);
2854 long len = RSTRING_LEN(str);
2855 return str_fill_term(str, s, len, newminlen);
2856}
2857
/* Returns the result of rb_check_convert_type_with_id(str, T_STRING,
 * "String", idTo_str).
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
2858VALUE
2860{
2861 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2862 return str;
2863}
2864
2865/*
2866 * call-seq:
2867 * String.try_convert(object) -> object, new_string, or nil
2868 *
2869 * If +object+ is a +String+ object, returns +object+.
2870 *
2871 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2872 * calls <tt>object.to_str</tt> and returns the result.
2873 *
2874 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2875 *
2876 * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2877 */
2878static VALUE
2879rb_str_s_try_convert(VALUE dummy, VALUE str)
2880{
2881 return rb_check_string_type(str);
2882}
2883
/* Advances from p towards e by *nthp characters of encoding enc and returns
 * the resulting position (never beyond e).
 *
 * In the ASCII-compatible and generic branches *nthp is updated to the count
 * that could not be consumed; in the two fixed-width branches it is written
 * back unchanged. */
2884static char*
2885str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2886{
2887 long nth = *nthp;
/* Single-byte encoding: one character is one byte. */
2888 if (rb_enc_mbmaxlen(enc) == 1) {
2889 p += nth;
2890 }
/* Fixed-width encoding: plain multiplication. */
2891 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2892 p += nth * rb_enc_mbmaxlen(enc);
2893 }
2894 else if (rb_enc_asciicompat(enc)) {
2895 const char *p2, *e2;
2896 int n;
2897
2898 while (p < e && 0 < nth) {
/* e2 is the position nth bytes ahead; if that lies past e the request
 * cannot be satisfied and e is returned with the remaining count. */
2899 e2 = p + nth;
2900 if (e < e2) {
2901 *nthp = nth;
2902 return (char *)e;
2903 }
2904 if (ISASCII(*p)) {
/* Skip a run of ASCII bytes in one step, up to e2. */
2905 p2 = search_nonascii(p, e2);
2906 if (!p2) {
2907 nth -= e2 - p;
2908 *nthp = nth;
2909 return (char *)e2;
2910 }
2911 nth -= p2 - p;
2912 p = p2;
2913 }
/* Step over one (non-ASCII) character. */
2914 n = rb_enc_mbclen(p, e, enc);
2915 p += n;
2916 nth--;
2917 }
2918 *nthp = nth;
2919 if (nth != 0) {
2920 return (char *)e;
2921 }
2922 return (char *)p;
2923 }
2924 else {
/* Generic variable-width encoding: one character per iteration. */
2925 while (p < e && nth--) {
2926 p += rb_enc_mbclen(p, e, enc);
2927 }
2928 }
2929 if (p > e) p = e;
2930 *nthp = nth;
2931 return (char*)p;
2932}
2933
2934char*
2935rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2936{
2937 return str_nth_len(p, e, &nth, enc);
2938}
2939
2940static char*
2941str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2942{
2943 if (singlebyte)
2944 p += nth;
2945 else {
2946 p = str_nth_len(p, e, &nth, enc);
2947 }
2948 if (!p) return 0;
2949 if (p > e) p = e;
2950 return (char *)p;
2951}
2952
2953/* char offset to byte offset */
2954static long
2955str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2956{
2957 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2958 if (!pp) return e - p;
2959 return pp - p;
2960}
2961
2962long
2963rb_str_offset(VALUE str, long pos)
2964{
2965 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2966 STR_ENC_GET(str), single_byte_optimizable(str));
2967}
2968
2969#ifdef NONASCII_MASK
/* UTF-8 variant of the "advance by *nthp characters" walk: counts lead bytes
 * between p and e, returns the position reached, and writes the number of
 * characters still outstanding back to *nthp.
 * Compiled only when NONASCII_MASK is defined. */
2970static char *
2971str_utf8_nth(const char *p, const char *e, long *nthp)
2972{
2973 long nth = *nthp;
/* Word-at-a-time phase, used only when both the byte range and the
 * requested count exceed two pointer-sized words. */
2974 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2975 const uintptr_t *s, *t;
2976 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2977 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2978 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* Byte-wise until p reaches the aligned start. */
2979 while (p < (const char *)s) {
2980 if (is_utf8_lead_byte(*p)) nth--;
2981 p++;
2982 }
/* Whole words, for as long as at least one word's worth of characters
 * remains to be skipped. */
2983 do {
2984 nth -= count_utf8_lead_bytes_with_word(s);
2985 s++;
2986 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2987 p = (char *)s;
2988 }
/* Byte-wise tail: stop on the lead byte reached once nth hits zero. */
2989 while (p < e) {
2990 if (is_utf8_lead_byte(*p)) {
2991 if (nth == 0) break;
2992 nth--;
2993 }
2994 p++;
2995 }
2996 *nthp = nth;
2997 return (char *)p;
2998}
2999
/* Byte distance from +p+ to the position str_utf8_nth() reaches for +nth+
 * characters; the leftover count is discarded. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *reached = str_utf8_nth(p, e, &remaining);

    return reached - p;
}
3006#endif
3007
3008/* byte offset to char offset */
3009long
3010rb_str_sublen(VALUE str, long pos)
3011{
3012 if (single_byte_optimizable(str) || pos < 0)
3013 return pos;
3014 else {
3015 char *p = RSTRING_PTR(str);
3016 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3017 }
3018}
3019
/* Creates a new String for the byte range [beg, beg+len) of str.
 *
 * The range is copied when it is not sharable (SHARABLE_SUBSTRING_P) or when
 * it fits the embed capacity of the new object; otherwise the new string
 * shares str's buffer via str_replace_shared().  Callers must pass a range
 * inside str (asserted below). */
3020static VALUE
3021str_subseq(VALUE str, long beg, long len)
3022{
3023 VALUE str2;
3024
3025 RUBY_ASSERT(beg >= 0);
3026 RUBY_ASSERT(len >= 0);
3027 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3028
3029 const int termlen = TERM_LEN(str);
/* Not sharable: plain copy through rb_str_new(). */
3030 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3031 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3032 RB_GC_GUARD(str);
3033 return str2;
3034 }
3035
3036 str2 = str_alloc_heap(rb_cString);
/* Small enough to embed: switch the fresh object to embedded storage and copy. */
3037 if (str_embed_capa(str2) >= len + termlen) {
3038 char *ptr2 = RSTRING(str2)->as.embed.ary;
3039 STR_SET_EMBED(str2);
3040 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3041 TERM_FILL(ptr2+len, termlen);
3042
3043 STR_SET_LEN(str2, len);
3044 RB_GC_GUARD(str);
3045 }
3046 else {
/* Share the buffer, then move the start forward by beg and shorten the
 * length if needed; the code range is cleared on the new string. */
3047 str_replace_shared(str2, str);
3048 RUBY_ASSERT(!STR_EMBED_P(str2));
3049 ENC_CODERANGE_CLEAR(str2);
3050 RSTRING(str2)->as.heap.ptr += beg;
3051 if (RSTRING_LEN(str2) > len) {
3052 STR_SET_LEN(str2, len);
3053 }
3054 }
3055
3056 return str2;
3057}
3058
3059VALUE
3060rb_str_subseq(VALUE str, long beg, long len)
3061{
3062 VALUE str2 = str_subseq(str, beg, len);
3063 rb_enc_cr_str_copy_for_substr(str2, str);
3064 return str2;
3065}
3066
/* Resolves the character-based request (beg, *lenp) against str.
 *
 * Returns a pointer into str's buffer for the start of the selection and
 * stores the selection's length in *lenp, or returns 0 when the request is
 * out of range (negative length, start beyond the string, ...).  A negative
 * beg counts from the end of the string. */
3067char *
3068rb_str_subpos(VALUE str, long beg, long *lenp)
3069{
3070 long len = *lenp;
3071 long slen = -1L;
3072 const long blen = RSTRING_LEN(str);
3073 rb_encoding *enc = STR_ENC_GET(str);
3074 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3075
3076 if (len < 0) return 0;
/* Rejects a negative beg whose negation is still negative (LONG_MIN). */
3077 if (beg < 0 && -beg < 0) return 0;
3078 if (!blen) {
3079 len = 0;
3080 }
/* Single-byte strings: character and byte offsets coincide. */
3081 if (single_byte_optimizable(str)) {
3082 if (beg > blen) return 0;
3083 if (beg < 0) {
3084 beg += blen;
3085 if (beg < 0) return 0;
3086 }
3087 if (len > blen - beg)
3088 len = blen - beg;
3089 if (len < 0) return 0;
3090 p = s + beg;
3091 goto end;
3092 }
3093 if (beg < 0) {
3094 if (len > -beg) len = -beg;
/* Valid code range and a start close to the end: walk backwards from e
 * with rb_enc_prev_char() instead of counting the whole string. */
3095 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3096 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3097 beg = -beg;
3098 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3099 p = e;
3100 if (!p) return 0;
3101 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3102 if (!p) return 0;
3103 len = e - p;
3104 goto end;
3105 }
3106 else {
/* Otherwise turn beg into a non-negative index using the character length. */
3107 slen = str_strlen(str, enc);
3108 beg += slen;
3109 if (beg < 0) return 0;
3110 p = s + beg;
3111 if (len == 0) goto end;
3112 }
3113 }
3114 else if (beg > 0 && beg > blen) {
3115 return 0;
3116 }
3117 if (len == 0) {
3118 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3119 p = s + beg;
3120 }
3121#ifdef NONASCII_MASK
/* Valid UTF-8: use the lead-byte counting helpers. */
3122 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3123 enc == rb_utf8_encoding()) {
3124 p = str_utf8_nth(s, e, &beg);
3125 if (beg > 0) return 0;
3126 len = str_utf8_offset(p, e, len);
3127 }
3128#endif
/* Fixed-width encoding: scale by the character size and clamp to the end. */
3129 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3130 int char_sz = rb_enc_mbmaxlen(enc);
3131
3132 p = s + beg * char_sz;
3133 if (p > e) {
3134 return 0;
3135 }
3136 else if (len * char_sz > e - p)
3137 len = e - p;
3138 else
3139 len *= char_sz;
3140 }
/* General case: locate the start; reaching e means an empty selection,
 * or failure when characters were still outstanding. */
3141 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3142 if (beg > 0) return 0;
3143 len = 0;
3144 }
3145 else {
3146 len = str_offset(p, e, len, enc, 0);
3147 }
3148 end:
3149 *lenp = len;
3150 RB_GC_GUARD(str);
3151 return p;
3152}
3153
3154static VALUE str_substr(VALUE str, long beg, long len, int empty);
3155
3156VALUE
3157rb_str_substr(VALUE str, long beg, long len)
3158{
3159 return str_substr(str, beg, len, TRUE);
3160}
3161
3162VALUE
3163rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3164{
3165 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3166}
3167
3168static VALUE
3169str_substr(VALUE str, long beg, long len, int empty)
3170{
3171 char *p = rb_str_subpos(str, beg, &len);
3172
3173 if (!p) return Qnil;
3174 if (!len && !empty) return Qnil;
3175
3176 beg = p - RSTRING_PTR(str);
3177
3178 VALUE str2 = str_subseq(str, beg, len);
3179 rb_enc_cr_str_copy_for_substr(str2, str);
3180 return str2;
3181}
3182
3183/* :nodoc: */
/* Freezes str and returns it.  A chilled string first has STR_CHILLED
 * cleared; an already frozen string is returned as is; otherwise the string
 * is passed through rb_str_resize() with its current length before
 * rb_obj_freeze().
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
3184VALUE
3186{
3187 if (CHILLED_STRING_P(str)) {
3188 FL_UNSET_RAW(str, STR_CHILLED);
3189 }
3190
3191 if (OBJ_FROZEN(str)) return str;
3192 rb_str_resize(str, RSTRING_LEN(str));
3193 return rb_obj_freeze(str);
3194}
3195
3196/*
3197 * call-seq:
3198 * +string -> new_string or self
3199 *
3200 * Returns +self+ if +self+ is not frozen and can be mutated
3201 * without warning issuance.
3202 *
3203 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3204 */
3205static VALUE
3206str_uplus(VALUE str)
3207{
3208 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3209 return rb_str_dup(str);
3210 }
3211 else {
3212 return str;
3213 }
3214}
3215
3216/*
3217 * call-seq:
3218 * -string -> frozen_string
3219 * dedup -> frozen_string
3220 *
3221 * Returns a frozen, possibly pre-existing copy of the string.
3222 *
3223 * The returned +String+ will be deduplicated as long as it does not have
3224 * any instance variables set on it and is not a String subclass.
3225 *
3226 * Note that <tt>-string</tt> variant is more convenient for defining
3227 * constants:
3228 *
3229 * FILENAME = -'config/database.yml'
3230 *
3231 * while +dedup+ is better suitable for using the method in chains
3232 * of calculations:
3233 *
3234 * @url_list.concat(urls.map(&:dedup))
3235 *
3236 */
3237static VALUE
3238str_uminus(VALUE str)
3239{
3240 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3241 str = rb_str_dup(str);
3242 }
3243 return rb_fstring(str);
3244}
3245
/* NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this view; judging by
 * its arguments it presumably emits rb_str_dup_frozen(str) as an alias that
 * forwards to rb_str_new_frozen(str) -- confirm.  The #define then maps the
 * old name onto rb_str_new_frozen for the remainder of this file. */
3246RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3247#define rb_str_dup_frozen rb_str_new_frozen
3248
/* Sets STR_TMPLOCK on str and returns str.  Raises RuntimeError when the
 * flag is already set.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
3249VALUE
3251{
3252 if (FL_TEST(str, STR_TMPLOCK)) {
3253 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3254 }
3255 FL_SET(str, STR_TMPLOCK);
3256 return str;
3257}
3258
/* Clears STR_TMPLOCK on str and returns str.  Raises RuntimeError when the
 * flag is not set.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view -- confirm against the upstream file before editing. */
3259VALUE
3261{
3262 if (!FL_TEST(str, STR_TMPLOCK)) {
3263 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3264 }
3265 FL_UNSET(str, STR_TMPLOCK);
3266 return str;
3267}
3268
3269VALUE
3270rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3271{
3272 rb_str_locktmp(str);
3273 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3274}
3275
/* Sets the byte length of str to len without reallocating, then writes the
 * terminator after the new end.
 *
 * Raises RuntimeError for a shared string; calls rb_bug() when len is
 * negative or exceeds the current capacity.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view, and the embedded numbering skips one line in each
 * of three code-range branches below (after "Empty string does not contain
 * non-ASCII", after "May be valid now, by appended part." and after the
 * truncation comment), so the statements those comments describe are not
 * visible -- confirm against the upstream file before editing. */
3276void
3278{
3279 long capa;
3280 const int termlen = TERM_LEN(str);
3281
3282 str_modifiable(str);
3283 if (STR_SHARED_P(str)) {
3284 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3285 }
3286 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3287 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3288 }
3289
3290 int cr = ENC_CODERANGE(str);
3291 if (len == 0) {
3292 /* Empty string does not contain non-ASCII */
3294 }
3295 else if (cr == ENC_CODERANGE_UNKNOWN) {
3296 /* Leave unknown. */
3297 }
3298 else if (len > RSTRING_LEN(str)) {
3299 if (ENC_CODERANGE_CLEAN_P(cr)) {
3300 /* Update the coderange regarding the extended part. */
3301 const char *const prev_end = RSTRING_END(str);
3302 const char *const new_end = RSTRING_PTR(str) + len;
3303 rb_encoding *enc = rb_enc_get(str);
3304 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3305 ENC_CODERANGE_SET(str, cr);
3306 }
3307 else if (cr == ENC_CODERANGE_BROKEN) {
3308 /* May be valid now, by appended part. */
3310 }
3311 }
3312 else if (len < RSTRING_LEN(str)) {
3313 if (cr != ENC_CODERANGE_7BIT) {
3314 /* ASCII-only string is keeping after truncated. Valid
3315 * and broken may be invalid or valid, leave unknown. */
3317 }
3318 }
3319
3320 STR_SET_LEN(str, len);
3321 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3322}
3323
/* Resizes str to exactly len bytes and returns str.
 *
 * Raises ArgumentError for a negative len.  Depending on the current
 * representation the string stays embedded, moves from the heap into the
 * embed area, is made independent with the new size, or has its heap buffer
 * reallocated.
 *
 * NOTE(review): the `if (slen > len || ...)` block below has an empty body in
 * this view and the embedded numbering skips a line there, so its statement
 * is not visible -- confirm against the upstream file before editing. */
3324VALUE
3325rb_str_resize(VALUE str, long len)
3326{
3327 if (len < 0) {
3328 rb_raise(rb_eArgError, "negative string size (or size too big)");
3329 }
3330
3331 int independent = str_independent(str);
3332 long slen = RSTRING_LEN(str);
3333 const int termlen = TERM_LEN(str);
3334
3335 if (slen > len || (termlen != 1 && slen < len)) {
3337 }
3338
3339 {
3340 long capa;
3341 if (STR_EMBED_P(str)) {
/* Embedded: resize in place when the embed area suffices, otherwise move
 * to a private heap buffer. */
3342 if (len == slen) return str;
3343 if (str_embed_capa(str) >= len + termlen) {
3344 STR_SET_LEN(str, len);
3345 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3346 return str;
3347 }
3348 str_make_independent_expand(str, slen, len - slen, termlen);
3349 }
3350 else if (str_embed_capa(str) >= len + termlen) {
/* Heap string that now fits the embed area: copy the kept bytes over and
 * free the old buffer only if this string was independent. */
3351 char *ptr = STR_HEAP_PTR(str);
3352 STR_SET_EMBED(str);
3353 if (slen > len) slen = len;
3354 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3355 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3356 STR_SET_LEN(str, len);
3357 if (independent) ruby_xfree(ptr);
3358 return str;
3359 }
3360 else if (!independent) {
3361 if (len == slen) return str;
3362 str_make_independent_expand(str, slen, len - slen, termlen);
3363 }
/* Independent heap string: reallocate when growing past the capacity, or
 * when the slack exceeds min(len, 1024) bytes. */
3364 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3365 (capa - len) > (len < 1024 ? len : 1024)) {
3366 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3367 (size_t)len + termlen, STR_HEAP_SIZE(str));
3368 RSTRING(str)->as.heap.aux.capa = len;
3369 }
3370 else if (len == slen) return str;
3371 STR_SET_LEN(str, len);
3372 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3373 }
3374 return str;
3375}
3376
3377static void
3378str_ensure_available_capa(VALUE str, long len)
3379{
3380 str_modify_keep_cr(str);
3381
3382 const int termlen = TERM_LEN(str);
3383 long olen = RSTRING_LEN(str);
3384
3385 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3386 rb_raise(rb_eArgError, "string sizes too big");
3387 }
3388
3389 long total = olen + len;
3390 long capa = str_capacity(str, termlen);
3391
3392 if (capa < total) {
3393 if (total >= LONG_MAX / 2) {
3394 capa = total;
3395 }
3396 while (total > capa) {
3397 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3398 }
3399 RESIZE_CAPA_TERM(str, capa, termlen);
3400 }
3401}
3402
3403static VALUE
3404str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3405{
3406 if (keep_cr) {
3407 str_modify_keep_cr(str);
3408 }
3409 else {
3410 rb_str_modify(str);
3411 }
3412 if (len == 0) return 0;
3413
3414 long total, olen, off = -1;
3415 char *sptr;
3416 const int termlen = TERM_LEN(str);
3417
3418 RSTRING_GETMEM(str, sptr, olen);
3419 if (ptr >= sptr && ptr <= sptr + olen) {
3420 off = ptr - sptr;
3421 }
3422
3423 long capa = str_capacity(str, termlen);
3424
3425 if (olen > LONG_MAX - len) {
3426 rb_raise(rb_eArgError, "string sizes too big");
3427 }
3428 total = olen + len;
3429 if (capa < total) {
3430 if (total >= LONG_MAX / 2) {
3431 capa = total;
3432 }
3433 while (total > capa) {
3434 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3435 }
3436 RESIZE_CAPA_TERM(str, capa, termlen);
3437 sptr = RSTRING_PTR(str);
3438 }
3439 if (off != -1) {
3440 ptr = sptr + off;
3441 }
3442 memcpy(sptr + olen, ptr, len);
3443 STR_SET_LEN(str, total);
3444 TERM_FILL(sptr + total, termlen); /* sentinel */
3445
3446 return str;
3447}
3448
/* Shorthands for str_buf_cat4() with keep_cr == false.  Every macro argument
 * is parenthesized in the expansion so that expression arguments keep their
 * meaning.  str_buf_cat2 takes its length from rb_strlen_lit(ptr), so ptr is
 * presumably expected to be a string literal -- confirm. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3451
3452VALUE
3453rb_str_cat(VALUE str, const char *ptr, long len)
3454{
3455 if (len == 0) return str;
3456 if (len < 0) {
3457 rb_raise(rb_eArgError, "negative string size (or size too big)");
3458 }
3459 return str_buf_cat(str, ptr, len);
3460}
3461
3462VALUE
3463rb_str_cat_cstr(VALUE str, const char *ptr)
3464{
3465 must_not_null(ptr);
3466 return rb_str_buf_cat(str, ptr, strlen(ptr));
3467}
3468
/* Appends a single byte to str, which must be ASCII-8BIT or US-ASCII encoded
 * (asserted).  Writes in place when spare capacity exists and falls back to
 * str_buf_cat() otherwise.  Raises ArgumentError when the length cannot grow.
 *
 * NOTE(review): in the final code-range section the embedded numbering skips
 * one line in each branch of `if (ISASCII(byte))`, so the statements that set
 * the resulting code range are not visible in this view -- confirm against
 * the upstream file before editing. */
3469static void
3470rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3471{
3472 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3473
3474 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3475 if (UNLIKELY(!str_independent(str))) {
3476 str_make_independent(str);
3477 }
3478
3479 long string_length = -1;
3480 const int null_terminator_length = 1;
3481 char *sptr;
3482 RSTRING_GETMEM(str, sptr, string_length);
3483
3484 // Ensure the resulting string wouldn't be too long.
3485 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3486 rb_raise(rb_eArgError, "string sizes too big");
3487 }
3488
3489 long string_capacity = str_capacity(str, null_terminator_length);
3490
3491 // Get the code range before any modifications since those might clear the code range.
3492 int cr = ENC_CODERANGE(str);
3493
3494 // Check if the string has spare string_capacity to write the new byte.
3495 if (LIKELY(string_capacity >= string_length + 1)) {
3496 // In fast path we can write the new byte and note the string's new length.
3497 sptr[string_length] = byte;
3498 STR_SET_LEN(str, string_length + 1);
3499 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3500 }
3501 else {
3502 // If there's not enough string_capacity, make a call into the general string concatenation function.
3503 str_buf_cat(str, (char *)&byte, 1);
3504 }
3505
3506 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3507 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3508 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3509 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3510 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3511 if (ISASCII(byte)) {
3513 }
3514 else {
3516
3517 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3518 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3519 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3520 }
3521 }
3522 }
3523}
3524
/* NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this view; judging by
 * its arguments these lines presumably emit rb_str_buf_cat as an alias of
 * rb_str_cat, and rb_str_buf_cat2 / rb_str_cat2 as aliases of
 * rb_str_cat_cstr -- confirm. */
3525RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3526RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3527RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3528
/* Appends len bytes at ptr, tagged with encoding index ptr_encindex and code
 * range ptr_cr, to str, and updates str's encoding index and code range.
 *
 * When ptr_cr_ret is non-NULL it receives the code range determined for the
 * appended bytes (ptr_cr, possibly after a scan).
 *
 * Raises EncCompatError when the two encodings cannot be combined, and
 * ArgumentError for a negative len.
 *
 * NOTE(review): the embedded numbering skips one line after the final
 * rb_raise() and before the closing brace, so a trailing statement may be
 * missing from this view -- confirm against the upstream file. */
3529static VALUE
3530rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3531 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3532{
3533 int str_encindex = ENCODING_GET(str);
3534 int res_encindex;
3535 int str_cr, res_cr;
3536 rb_encoding *str_enc, *ptr_enc;
3537
/* An empty receiver is treated as 7bit regardless of its recorded range. */
3538 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3539
3540 if (str_encindex == ptr_encindex) {
/* Same encoding: scan the new bytes only if the receiver's range is known. */
3541 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3542 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3543 }
3544 }
3545 else {
3546 str_enc = rb_enc_from_index(str_encindex);
3547 ptr_enc = rb_enc_from_index(ptr_encindex);
/* Different encodings with at least one not ASCII compatible: only an empty
 * side is acceptable.  An empty receiver adopts the appended bytes' encoding
 * index, code range and terminator width. */
3548 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3549 if (len == 0)
3550 return str;
3551 if (RSTRING_LEN(str) == 0) {
3552 rb_str_buf_cat(str, ptr, len);
3553 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3554 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3555 return str;
3556 }
3557 goto incompatible;
3558 }
3559 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3560 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3561 }
3562 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3563 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3564 str_cr = rb_enc_str_coderange(str);
3565 }
3566 }
3567 }
3568 if (ptr_cr_ret)
3569 *ptr_cr_ret = ptr_cr;
3570
/* Different encodings are only compatible when at least one side is 7bit. */
3571 if (str_encindex != ptr_encindex &&
3572 str_cr != ENC_CODERANGE_7BIT &&
3573 ptr_cr != ENC_CODERANGE_7BIT) {
3574 str_enc = rb_enc_from_index(str_encindex);
3575 ptr_enc = rb_enc_from_index(ptr_encindex);
3576 goto incompatible;
3577 }
3578
/* Decide the resulting encoding index and code range from the two ranges. */
3579 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3580 res_encindex = str_encindex;
3581 res_cr = ENC_CODERANGE_UNKNOWN;
3582 }
3583 else if (str_cr == ENC_CODERANGE_7BIT) {
3584 if (ptr_cr == ENC_CODERANGE_7BIT) {
3585 res_encindex = str_encindex;
3586 res_cr = ENC_CODERANGE_7BIT;
3587 }
3588 else {
3589 res_encindex = ptr_encindex;
3590 res_cr = ptr_cr;
3591 }
3592 }
3593 else if (str_cr == ENC_CODERANGE_VALID) {
3594 res_encindex = str_encindex;
3595 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3596 res_cr = str_cr;
3597 else
3598 res_cr = ptr_cr;
3599 }
3600 else { /* str_cr == ENC_CODERANGE_BROKEN */
3601 res_encindex = str_encindex;
3602 res_cr = str_cr;
3603 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3604 }
3605
3606 if (len < 0) {
3607 rb_raise(rb_eArgError, "negative string size (or size too big)");
3608 }
3609 str_buf_cat(str, ptr, len);
3610 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3611 return str;
3612
3613 incompatible:
3614 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3615 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3617}
3618
3619VALUE
3620rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3621{
3622 return rb_enc_cr_str_buf_cat(str, ptr, len,
3623 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3624}
3625
3626VALUE
3627rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3628{
3629 /* ptr must reference NUL terminated ASCII string. */
3630 int encindex = ENCODING_GET(str);
3631 rb_encoding *enc = rb_enc_from_index(encindex);
3632 if (rb_enc_asciicompat(enc)) {
3633 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3634 encindex, ENC_CODERANGE_7BIT, 0);
3635 }
3636 else {
3637 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3638 while (*ptr) {
3639 unsigned int c = (unsigned char)*ptr;
3640 int len = rb_enc_codelen(c, enc);
3641 rb_enc_mbcput(c, buf, enc);
3642 rb_enc_cr_str_buf_cat(str, buf, len,
3643 encindex, ENC_CODERANGE_VALID, 0);
3644 ptr++;
3645 }
3646 return str;
3647 }
3648}
3649
/* Appends the contents of str2 to str and returns str.
 *
 * When str passes str_enc_fastpath(), a 7bit str2 is concatenated directly
 * with the receiver's code range kept (keep_cr == true).  Everything else
 * goes through rb_enc_cr_str_buf_cat(), after which the code range computed
 * for str2 is stored back on str2.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this view, and the embedded numbering skips one line inside
 * the switch just before the "If RHS is valid" comment -- presumably a second
 * `case` label -- confirm against the upstream file before editing. */
3650VALUE
3652{
3653 int str2_cr = rb_enc_str_coderange(str2);
3654
3655 if (str_enc_fastpath(str)) {
3656 switch (str2_cr) {
3657 case ENC_CODERANGE_7BIT:
3658 // If RHS is 7bit we can do simple concatenation
3659 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3660 RB_GC_GUARD(str2);
3661 return str;
3663 // If RHS is valid, we can do simple concatenation if encodings are the same
3664 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3665 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3666 int str_cr = ENC_CODERANGE(str);
3667 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3668 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3669 }
3670 RB_GC_GUARD(str2);
3671 return str;
3672 }
3673 }
3674 }
3675
3676 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3677 ENCODING_GET(str2), str2_cr, &str2_cr);
3678
3679 ENC_CODERANGE_SET(str2, str2_cr);
3680
3681 return str;
3682}
3683
3684VALUE
3686{
3687 StringValue(str2);
3688 return rb_str_buf_append(str, str2);
3689}
3690
3691VALUE
3692rb_str_concat_literals(size_t num, const VALUE *strary)
3693{
3694 VALUE str;
3695 size_t i, s = 0;
3696 unsigned long len = 1;
3697
3698 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3699 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3700
3701 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3702 str = rb_str_buf_new(len);
3703 str_enc_copy_direct(str, strary[0]);
3704
3705 for (i = s; i < num; ++i) {
3706 const VALUE v = strary[i];
3707 int encidx = ENCODING_GET(v);
3708
3709 rb_str_buf_append(str, v);
3710 if (encidx != ENCINDEX_US_ASCII) {
3711 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3712 rb_enc_set_index(str, encidx);
3713 }
3714 }
3715 return str;
3716}
3717
3718/*
3719 * call-seq:
3720 * concat(*objects) -> string
3721 *
3722 * Concatenates each object in +objects+ to +self+ and returns +self+:
3723 *
3724 * s = 'foo'
3725 * s.concat('bar', 'baz') # => "foobarbaz"
3726 * s # => "foobarbaz"
3727 *
3728 * For each given object +object+ that is an Integer,
3729 * the value is considered a codepoint and converted to a character before concatenation:
3730 *
3731 * s = 'foo'
3732 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3733 *
3734 * Related: String#<<, which takes a single argument.
3735 */
3736static VALUE
3737rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3738{
3739 str_modifiable(str);
3740
3741 if (argc == 1) {
3742 return rb_str_concat(str, argv[0]);
3743 }
3744 else if (argc > 1) {
3745 int i;
3746 VALUE arg_str = rb_str_tmp_new(0);
3747 rb_enc_copy(arg_str, str);
3748 for (i = 0; i < argc; i++) {
3749 rb_str_concat(arg_str, argv[i]);
3750 }
3751 rb_str_buf_append(str, arg_str);
3752 }
3753
3754 return str;
3755}
3756
3757/*
3758 * call-seq:
3759 * append_as_bytes(*objects) -> string
3760 *
3761 * Concatenates each object in +objects+ into +self+ without any encoding
3762 * validation or conversion and returns +self+:
3763 *
3764 * s = 'foo'
3765 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3766 * s.valid_encoding? # => false
3767 * s.append_as_bytes("\xAC 12")
3768 * s.valid_encoding? # => true
3769 *
3770 * For each given object +object+ that is an Integer,
3771 * the value is considered a Byte. If the Integer is bigger
3772 * than one byte, only the lower byte is considered, similar to String#setbyte:
3773 *
3774 * s = ""
3775 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3776 *
3777 * Related: String#<<, String#concat, which do an encoding aware concatenation.
3778 */
3779
3780VALUE
3781rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3782{
3783 long needed_capacity = 0;
3784 volatile VALUE t0;
3785 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3786
3787 for (int index = 0; index < argc; index++) {
3788 VALUE obj = argv[index];
3789 enum ruby_value_type type = types[index] = rb_type(obj);
3790 switch (type) {
3791 case T_FIXNUM:
3792 case T_BIGNUM:
3793 needed_capacity++;
3794 break;
3795 case T_STRING:
3796 needed_capacity += RSTRING_LEN(obj);
3797 break;
3798 default:
3799 rb_raise(
3801 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3802 rb_obj_class(obj)
3803 );
3804 break;
3805 }
3806 }
3807
3808 str_ensure_available_capa(str, needed_capacity);
3809 char *sptr = RSTRING_END(str);
3810
3811 for (int index = 0; index < argc; index++) {
3812 VALUE obj = argv[index];
3813 enum ruby_value_type type = types[index];
3814 switch (type) {
3815 case T_FIXNUM:
3816 case T_BIGNUM: {
3817 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3818 char byte = (char)(NUM2INT(obj) & 0xFF);
3819 *sptr = byte;
3820 sptr++;
3821 break;
3822 }
3823 case T_STRING: {
3824 const char *ptr;
3825 long len;
3826 RSTRING_GETMEM(obj, ptr, len);
3827 memcpy(sptr, ptr, len);
3828 sptr += len;
3829 break;
3830 }
3831 default:
3832 rb_bug("append_as_bytes arguments should have been validated");
3833 }
3834 }
3835
3836 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3837 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3838
3839 int cr = ENC_CODERANGE(str);
3840 switch (cr) {
3841 case ENC_CODERANGE_7BIT: {
3842 for (int index = 0; index < argc; index++) {
3843 VALUE obj = argv[index];
3844 enum ruby_value_type type = types[index];
3845 switch (type) {
3846 case T_FIXNUM:
3847 case T_BIGNUM: {
3848 if (!ISASCII(NUM2INT(obj))) {
3849 goto clear_cr;
3850 }
3851 break;
3852 }
3853 case T_STRING: {
3854 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3855 goto clear_cr;
3856 }
3857 break;
3858 }
3859 default:
3860 rb_bug("append_as_bytes arguments should have been validated");
3861 }
3862 }
3863 break;
3864 }
3866 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3867 goto keep_cr;
3868 }
3869 else {
3870 goto clear_cr;
3871 }
3872 break;
3873 default:
3874 goto clear_cr;
3875 break;
3876 }
3877
3878 RB_GC_GUARD(t0);
3879
3880 clear_cr:
3881 // If no fast path was hit, we clear the coderange.
3882 // append_as_bytes is predominently meant to be used in
3883 // buffering situation, hence it's likely the coderange
3884 // will never be scanned, so it's not worth spending time
3885 // precomputing the coderange except for simple and common
3886 // situations.
3888 keep_cr:
3889 return str;
3890}
3891
3892/*
3893 * call-seq:
3894 * string << object -> string
3895 *
3896 * Concatenates +object+ to +self+ and returns +self+:
3897 *
3898 * s = 'foo'
3899 * s << 'bar' # => "foobar"
3900 * s # => "foobar"
3901 *
3902 * If +object+ is an Integer,
3903 * the value is considered a codepoint and converted to a character before concatenation:
3904 *
3905 * s = 'foo'
3906 * s << 33 # => "foo!"
3907 *
3908 * If that codepoint is not representable in the encoding of
3909 * _string_, RangeError is raised.
3910 *
3911 * s = 'foo'
3912 * s.encoding # => <Encoding:UTF-8>
3913 * s << 0x00110000 # 1114112 out of char range (RangeError)
3914 * s = 'foo'.encode('EUC-JP')
3915 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3916 *
3917 * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3918 * is automatically promoted to ASCII-8BIT.
3919 *
3920 * s = 'foo'.encode('US-ASCII')
3921 * s << 0xff
3922 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3923 *
3924 * Related: String#concat, which takes multiple arguments.
3925 */
3926VALUE
3928{
3929 unsigned int code;
3930 rb_encoding *enc = STR_ENC_GET(str1);
3931 int encidx;
3932
3933 if (RB_INTEGER_TYPE_P(str2)) {
3934 if (rb_num_to_uint(str2, &code) == 0) {
3935 }
3936 else if (FIXNUM_P(str2)) {
3937 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3938 }
3939 else {
3940 rb_raise(rb_eRangeError, "bignum out of char range");
3941 }
3942 }
3943 else {
3944 return rb_str_append(str1, str2);
3945 }
3946
3947 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3948
3949 if (encidx >= 0) {
3950 rb_str_buf_cat_byte(str1, (unsigned char)code);
3951 }
3952 else {
3953 long pos = RSTRING_LEN(str1);
3954 int cr = ENC_CODERANGE(str1);
3955 int len;
3956 char *buf;
3957
3958 switch (len = rb_enc_codelen(code, enc)) {
3959 case ONIGERR_INVALID_CODE_POINT_VALUE:
3960 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3961 break;
3962 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3963 case 0:
3964 rb_raise(rb_eRangeError, "%u out of char range", code);
3965 break;
3966 }
3967 buf = ALLOCA_N(char, len + 1);
3968 rb_enc_mbcput(code, buf, enc);
3969 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3970 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3971 }
3972 rb_str_resize(str1, pos+len);
3973 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3974 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3976 }
3977 else if (cr == ENC_CODERANGE_BROKEN) {
3979 }
3980 ENC_CODERANGE_SET(str1, cr);
3981 }
3982 return str1;
3983}
3984
3985int
3986rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3987{
3988 int encidx = rb_enc_to_index(enc);
3989
3990 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3991 /* US-ASCII automatically extended to ASCII-8BIT */
3992 if (code > 0xFF) {
3993 rb_raise(rb_eRangeError, "%u out of char range", code);
3994 }
3995 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3996 return ENCINDEX_ASCII_8BIT;
3997 }
3998 return encidx;
3999 }
4000 else {
4001 return -1;
4002 }
4003}
4004
4005/*
4006 * call-seq:
4007 * prepend(*other_strings) -> string
4008 *
4009 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4010 *
4011 * s = 'foo'
4012 * s.prepend('bar', 'baz') # => "barbazfoo"
4013 * s # => "barbazfoo"
4014 *
4015 * Related: String#concat.
4016 */
4017
4018static VALUE
4019rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4020{
4021 str_modifiable(str);
4022
4023 if (argc == 1) {
4024 rb_str_update(str, 0L, 0L, argv[0]);
4025 }
4026 else if (argc > 1) {
4027 int i;
4028 VALUE arg_str = rb_str_tmp_new(0);
4029 rb_enc_copy(arg_str, str);
4030 for (i = 0; i < argc; i++) {
4031 rb_str_append(arg_str, argv[i]);
4032 }
4033 rb_str_update(str, 0L, 0L, arg_str);
4034 }
4035
4036 return str;
4037}
4038
4039st_index_t
4041{
4042 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4043 st_index_t precomputed_hash;
4044 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4045
4046 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4047 return precomputed_hash;
4048 }
4049
4050 return str_do_hash(str);
4051}
4052
4053int
4055{
4056 long len1, len2;
4057 const char *ptr1, *ptr2;
4058 RSTRING_GETMEM(str1, ptr1, len1);
4059 RSTRING_GETMEM(str2, ptr2, len2);
4060 return (len1 != len2 ||
4061 !rb_str_comparable(str1, str2) ||
4062 memcmp(ptr1, ptr2, len1) != 0);
4063}
4064
4065/*
4066 * call-seq:
4067 * hash -> integer
4068 *
4069 * Returns the integer hash value for +self+.
4070 * The value is based on the length, content and encoding of +self+.
4071 *
4072 * Related: Object#hash.
4073 */
4074
4075static VALUE
4076rb_str_hash_m(VALUE str)
4077{
4078 st_index_t hval = rb_str_hash(str);
4079 return ST2FIX(hval);
4080}
4081
/* Smaller of two values; note that each argument may be evaluated twice. */
#define lesser(a, b) (((a) > (b)) ? (b) : (a))
4083
4084int
4086{
4087 int idx1, idx2;
4088 int rc1, rc2;
4089
4090 if (RSTRING_LEN(str1) == 0) return TRUE;
4091 if (RSTRING_LEN(str2) == 0) return TRUE;
4092 idx1 = ENCODING_GET(str1);
4093 idx2 = ENCODING_GET(str2);
4094 if (idx1 == idx2) return TRUE;
4095 rc1 = rb_enc_str_coderange(str1);
4096 rc2 = rb_enc_str_coderange(str2);
4097 if (rc1 == ENC_CODERANGE_7BIT) {
4098 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4099 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4100 return TRUE;
4101 }
4102 if (rc2 == ENC_CODERANGE_7BIT) {
4103 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4104 return TRUE;
4105 }
4106 return FALSE;
4107}
4108
4109int
4111{
4112 long len1, len2;
4113 const char *ptr1, *ptr2;
4114 int retval;
4115
4116 if (str1 == str2) return 0;
4117 RSTRING_GETMEM(str1, ptr1, len1);
4118 RSTRING_GETMEM(str2, ptr2, len2);
4119 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4120 if (len1 == len2) {
4121 if (!rb_str_comparable(str1, str2)) {
4122 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4123 return 1;
4124 return -1;
4125 }
4126 return 0;
4127 }
4128 if (len1 > len2) return 1;
4129 return -1;
4130 }
4131 if (retval > 0) return 1;
4132 return -1;
4133}
4134
4135/*
4136 * call-seq:
4137 * string == object -> true or false
4138 * string === object -> true or false
4139 *
4140 * Returns +true+ if +object+ has the same length and content
4141 * as +self+; +false+ otherwise:
4142 *
4143 * s = 'foo'
4144 * s == 'foo' # => true
4145 * s == 'food' # => false
4146 * s == 'FOO' # => false
4147 *
4148 * Returns +false+ if the two strings' encodings are not compatible:
4149 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
4150 *
4151 * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4152 * two strings are compared using <code>object.==</code>.
4153 */
4154
4155VALUE
4157{
4158 if (str1 == str2) return Qtrue;
4159 if (!RB_TYPE_P(str2, T_STRING)) {
4160 if (!rb_respond_to(str2, idTo_str)) {
4161 return Qfalse;
4162 }
4163 return rb_equal(str2, str1);
4164 }
4165 return rb_str_eql_internal(str1, str2);
4166}
4167
4168/*
4169 * call-seq:
4170 * eql?(object) -> true or false
4171 *
4172 * Returns +true+ if +object+ has the same length and content
4173 * as +self+; +false+ otherwise:
4174 *
4175 * s = 'foo'
4176 * s.eql?('foo') # => true
4177 * s.eql?('food') # => false
4178 * s.eql?('FOO') # => false
4179 *
4180 * Returns +false+ if the two strings' encodings are not compatible:
4181 *
4182 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
4183 *
4184 */
4185
4186VALUE
4187rb_str_eql(VALUE str1, VALUE str2)
4188{
4189 if (str1 == str2) return Qtrue;
4190 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4191 return rb_str_eql_internal(str1, str2);
4192}
4193
4194/*
4195 * call-seq:
4196 * string <=> other_string -> -1, 0, 1, or nil
4197 *
4198 * Compares +self+ and +other_string+, returning:
4199 *
4200 * - -1 if +other_string+ is larger.
4201 * - 0 if the two are equal.
4202 * - 1 if +other_string+ is smaller.
4203 * - +nil+ if the two are incomparable.
4204 *
4205 * Examples:
4206 *
4207 * 'foo' <=> 'foo' # => 0
4208 * 'foo' <=> 'food' # => -1
4209 * 'food' <=> 'foo' # => 1
4210 * 'FOO' <=> 'foo' # => -1
4211 * 'foo' <=> 'FOO' # => 1
4212 * 'foo' <=> 1 # => nil
4213 *
4214 */
4215
4216static VALUE
4217rb_str_cmp_m(VALUE str1, VALUE str2)
4218{
4219 int result;
4220 VALUE s = rb_check_string_type(str2);
4221 if (NIL_P(s)) {
4222 return rb_invcmp(str1, str2);
4223 }
4224 result = rb_str_cmp(str1, s);
4225 return INT2FIX(result);
4226}
4227
4228static VALUE str_casecmp(VALUE str1, VALUE str2);
4229static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4230
4231/*
4232 * call-seq:
4233 * casecmp(other_string) -> -1, 0, 1, or nil
4234 *
4235 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4236 *
4237 * - -1 if <tt>other_string.downcase</tt> is larger.
4238 * - 0 if the two are equal.
4239 * - 1 if <tt>other_string.downcase</tt> is smaller.
4240 * - +nil+ if the two are incomparable.
4241 *
4242 * Examples:
4243 *
4244 * 'foo'.casecmp('foo') # => 0
4245 * 'foo'.casecmp('food') # => -1
4246 * 'food'.casecmp('foo') # => 1
4247 * 'FOO'.casecmp('foo') # => 0
4248 * 'foo'.casecmp('FOO') # => 0
4249 * 'foo'.casecmp(1) # => nil
4250 *
4251 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4252 *
4253 * Related: String#casecmp?.
4254 *
4255 */
4256
4257static VALUE
4258rb_str_casecmp(VALUE str1, VALUE str2)
4259{
4260 VALUE s = rb_check_string_type(str2);
4261 if (NIL_P(s)) {
4262 return Qnil;
4263 }
4264 return str_casecmp(str1, s);
4265}
4266
4267static VALUE
4268str_casecmp(VALUE str1, VALUE str2)
4269{
4270 long len;
4271 rb_encoding *enc;
4272 const char *p1, *p1end, *p2, *p2end;
4273
4274 enc = rb_enc_compatible(str1, str2);
4275 if (!enc) {
4276 return Qnil;
4277 }
4278
4279 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4280 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4281 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4282 while (p1 < p1end && p2 < p2end) {
4283 if (*p1 != *p2) {
4284 unsigned int c1 = TOLOWER(*p1 & 0xff);
4285 unsigned int c2 = TOLOWER(*p2 & 0xff);
4286 if (c1 != c2)
4287 return INT2FIX(c1 < c2 ? -1 : 1);
4288 }
4289 p1++;
4290 p2++;
4291 }
4292 }
4293 else {
4294 while (p1 < p1end && p2 < p2end) {
4295 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4296 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4297
4298 if (0 <= c1 && 0 <= c2) {
4299 c1 = TOLOWER(c1);
4300 c2 = TOLOWER(c2);
4301 if (c1 != c2)
4302 return INT2FIX(c1 < c2 ? -1 : 1);
4303 }
4304 else {
4305 int r;
4306 l1 = rb_enc_mbclen(p1, p1end, enc);
4307 l2 = rb_enc_mbclen(p2, p2end, enc);
4308 len = l1 < l2 ? l1 : l2;
4309 r = memcmp(p1, p2, len);
4310 if (r != 0)
4311 return INT2FIX(r < 0 ? -1 : 1);
4312 if (l1 != l2)
4313 return INT2FIX(l1 < l2 ? -1 : 1);
4314 }
4315 p1 += l1;
4316 p2 += l2;
4317 }
4318 }
4319 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4320 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4321 return INT2FIX(-1);
4322}
4323
4324/*
4325 * call-seq:
4326 * casecmp?(other_string) -> true, false, or nil
4327 *
4328 * Returns +true+ if +self+ and +other_string+ are equal after
4329 * Unicode case folding, otherwise +false+:
4330 *
4331 * 'foo'.casecmp?('foo') # => true
4332 * 'foo'.casecmp?('food') # => false
4333 * 'food'.casecmp?('foo') # => false
4334 * 'FOO'.casecmp?('foo') # => true
4335 * 'foo'.casecmp?('FOO') # => true
4336 *
4337 * Returns +nil+ if the two values are incomparable:
4338 *
4339 * 'foo'.casecmp?(1) # => nil
4340 *
4341 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4342 *
4343 * Related: String#casecmp.
4344 *
4345 */
4346
4347static VALUE
4348rb_str_casecmp_p(VALUE str1, VALUE str2)
4349{
4350 VALUE s = rb_check_string_type(str2);
4351 if (NIL_P(s)) {
4352 return Qnil;
4353 }
4354 return str_casecmp_p(str1, s);
4355}
4356
4357static VALUE
4358str_casecmp_p(VALUE str1, VALUE str2)
4359{
4360 rb_encoding *enc;
4361 VALUE folded_str1, folded_str2;
4362 VALUE fold_opt = sym_fold;
4363
4364 enc = rb_enc_compatible(str1, str2);
4365 if (!enc) {
4366 return Qnil;
4367 }
4368
4369 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4370 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4371
4372 return rb_str_eql(folded_str1, folded_str2);
4373}
4374
4375static long
4376strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4377 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4378{
4379 const char *search_start = str_ptr;
4380 long pos, search_len = str_len - offset;
4381
4382 for (;;) {
4383 const char *t;
4384 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4385 if (pos < 0) return pos;
4386 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4387 if (t == search_start + pos) break;
4388 search_len -= t - search_start;
4389 if (search_len <= 0) return -1;
4390 offset += t - search_start;
4391 search_start = t;
4392 }
4393 return pos + offset;
4394}
4395
4396/* found index in byte */
4397#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4398#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4399
4400static long
4401rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4402{
4403 const char *str_ptr, *str_ptr_end, *sub_ptr;
4404 long str_len, sub_len;
4405 rb_encoding *enc;
4406
4407 enc = rb_enc_check(str, sub);
4408 if (is_broken_string(sub)) return -1;
4409
4410 str_ptr = RSTRING_PTR(str);
4411 str_ptr_end = RSTRING_END(str);
4412 str_len = RSTRING_LEN(str);
4413 sub_ptr = RSTRING_PTR(sub);
4414 sub_len = RSTRING_LEN(sub);
4415
4416 if (str_len < sub_len) return -1;
4417
4418 if (offset != 0) {
4419 long str_len_char, sub_len_char;
4420 int single_byte = single_byte_optimizable(str);
4421 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4422 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4423 if (offset < 0) {
4424 offset += str_len_char;
4425 if (offset < 0) return -1;
4426 }
4427 if (str_len_char - offset < sub_len_char) return -1;
4428 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4429 str_ptr += offset;
4430 }
4431 if (sub_len == 0) return offset;
4432
4433 /* need proceed one character at a time */
4434 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4435}
4436
4437
4438/*
4439 * call-seq:
4440 * index(substring, offset = 0) -> integer or nil
4441 * index(regexp, offset = 0) -> integer or nil
4442 *
4443 * :include: doc/string/index.rdoc
4444 *
4445 */
4446
4447static VALUE
4448rb_str_index_m(int argc, VALUE *argv, VALUE str)
4449{
4450 VALUE sub;
4451 VALUE initpos;
4452 rb_encoding *enc = STR_ENC_GET(str);
4453 long pos;
4454
4455 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4456 long slen = str_strlen(str, enc); /* str's enc */
4457 pos = NUM2LONG(initpos);
4458 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4459 if (RB_TYPE_P(sub, T_REGEXP)) {
4461 }
4462 return Qnil;
4463 }
4464 }
4465 else {
4466 pos = 0;
4467 }
4468
4469 if (RB_TYPE_P(sub, T_REGEXP)) {
4470 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4471 enc, single_byte_optimizable(str));
4472
4473 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4474 VALUE match = rb_backref_get();
4475 struct re_registers *regs = RMATCH_REGS(match);
4476 pos = rb_str_sublen(str, BEG(0));
4477 return LONG2NUM(pos);
4478 }
4479 }
4480 else {
4481 StringValue(sub);
4482 pos = rb_str_index(str, sub, pos);
4483 if (pos >= 0) {
4484 pos = rb_str_sublen(str, pos);
4485 return LONG2NUM(pos);
4486 }
4487 }
4488 return Qnil;
4489}
4490
4491/* Ensure that the given pos is a valid character boundary.
4492 * Note that in this function, "character" means a code point
4493 * (Unicode scalar value), not a grapheme cluster.
4494 */
4495static void
4496str_ensure_byte_pos(VALUE str, long pos)
4497{
4498 if (!single_byte_optimizable(str)) {
4499 const char *s = RSTRING_PTR(str);
4500 const char *e = RSTRING_END(str);
4501 const char *p = s + pos;
4502 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4503 rb_raise(rb_eIndexError,
4504 "offset %ld does not land on character boundary", pos);
4505 }
4506 }
4507}
4508
4509/*
4510 * call-seq:
4511 * byteindex(substring, offset = 0) -> integer or nil
4512 * byteindex(regexp, offset = 0) -> integer or nil
4513 *
4514 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4515 * or +nil+ if none found:
4516 *
4517 * 'foo'.byteindex('f') # => 0
4518 * 'foo'.byteindex('o') # => 1
4519 * 'foo'.byteindex('oo') # => 1
4520 * 'foo'.byteindex('ooo') # => nil
4521 *
4522 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4523 * or +nil+ if none found:
4524 *
4525 * 'foo'.byteindex(/f/) # => 0
4526 * 'foo'.byteindex(/o/) # => 1
4527 * 'foo'.byteindex(/oo/) # => 1
4528 * 'foo'.byteindex(/ooo/) # => nil
4529 *
4530 * Integer argument +offset+, if given, specifies the byte-based position in the
4531 * string to begin the search:
4532 *
4533 * 'foo'.byteindex('o', 1) # => 1
4534 * 'foo'.byteindex('o', 2) # => 2
4535 * 'foo'.byteindex('o', 3) # => nil
4536 *
4537 * If +offset+ is negative, counts backward from the end of +self+:
4538 *
4539 * 'foo'.byteindex('o', -1) # => 2
4540 * 'foo'.byteindex('o', -2) # => 1
4541 * 'foo'.byteindex('o', -3) # => 1
4542 * 'foo'.byteindex('o', -4) # => nil
4543 *
4544 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4545 * raised.
4546 *
4547 * Related: String#index, String#byterindex.
4548 */
4549
4550static VALUE
4551rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4552{
4553 VALUE sub;
4554 VALUE initpos;
4555 long pos;
4556
4557 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4558 long slen = RSTRING_LEN(str);
4559 pos = NUM2LONG(initpos);
4560 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4561 if (RB_TYPE_P(sub, T_REGEXP)) {
4563 }
4564 return Qnil;
4565 }
4566 }
4567 else {
4568 pos = 0;
4569 }
4570
4571 str_ensure_byte_pos(str, pos);
4572
4573 if (RB_TYPE_P(sub, T_REGEXP)) {
4574 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4575 VALUE match = rb_backref_get();
4576 struct re_registers *regs = RMATCH_REGS(match);
4577 pos = BEG(0);
4578 return LONG2NUM(pos);
4579 }
4580 }
4581 else {
4582 StringValue(sub);
4583 pos = rb_str_byteindex(str, sub, pos);
4584 if (pos >= 0) return LONG2NUM(pos);
4585 }
4586 return Qnil;
4587}
4588
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when HAVE_MEMRCHR is not defined: scans the first
 * +search_len+ bytes of +search_str+ from the back and returns a pointer to
 * the last byte equal to +chr+, or a null pointer if there is none.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    long remaining;

    for (remaining = search_len; remaining > 0; remaining--) {
        const char *candidate = &search_str[remaining - 1];

        if ((unsigned char)*candidate == chr) {
            return (void *)candidate;
        }
    }

    return ((void *)0);
}
#endif
4601
4602static long
4603str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4604{
4605 char *hit, *adjusted;
4606 int c;
4607 long slen, searchlen;
4608 char *sbeg, *e, *t;
4609
4610 sbeg = RSTRING_PTR(str);
4611 slen = RSTRING_LEN(sub);
4612 if (slen == 0) return s - sbeg;
4613 e = RSTRING_END(str);
4614 t = RSTRING_PTR(sub);
4615 c = *t & 0xff;
4616 searchlen = s - sbeg + 1;
4617
4618 if (memcmp(s, t, slen) == 0) {
4619 return s - sbeg;
4620 }
4621
4622 do {
4623 hit = memrchr(sbeg, c, searchlen);
4624 if (!hit) break;
4625 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4626 if (hit != adjusted) {
4627 searchlen = adjusted - sbeg;
4628 continue;
4629 }
4630 if (memcmp(hit, t, slen) == 0)
4631 return hit - sbeg;
4632 searchlen = adjusted - sbeg;
4633 } while (searchlen > 0);
4634
4635 return -1;
4636}
4637
4638/* found index in byte */
4639static long
4640rb_str_rindex(VALUE str, VALUE sub, long pos)
4641{
4642 long len, slen;
4643 char *sbeg, *s;
4644 rb_encoding *enc;
4645 int singlebyte;
4646
4647 enc = rb_enc_check(str, sub);
4648 if (is_broken_string(sub)) return -1;
4649 singlebyte = single_byte_optimizable(str);
4650 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4651 slen = str_strlen(sub, enc); /* rb_enc_check */
4652
4653 /* substring longer than string */
4654 if (len < slen) return -1;
4655 if (len - pos < slen) pos = len - slen;
4656 if (len == 0) return pos;
4657
4658 sbeg = RSTRING_PTR(str);
4659
4660 if (pos == 0) {
4661 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4662 return 0;
4663 else
4664 return -1;
4665 }
4666
4667 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4668 return str_rindex(str, sub, s, enc);
4669}
4670
4671/*
4672 * call-seq:
4673 * rindex(substring, offset = self.length) -> integer or nil
4674 * rindex(regexp, offset = self.length) -> integer or nil
4675 *
4676 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4677 * or +nil+ if none found:
4678 *
4679 * 'foo'.rindex('f') # => 0
4680 * 'foo'.rindex('o') # => 2
4681 * 'foo'.rindex('oo') # => 1
4682 * 'foo'.rindex('ooo') # => nil
4683 *
4684 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4685 * or +nil+ if none found:
4686 *
4687 * 'foo'.rindex(/f/) # => 0
4688 * 'foo'.rindex(/o/) # => 2
4689 * 'foo'.rindex(/oo/) # => 1
4690 * 'foo'.rindex(/ooo/) # => nil
4691 *
4692 * The _last_ match means starting at the possible last position, not
4693 * the last of longest matches.
4694 *
4695 * 'foo'.rindex(/o+/) # => 2
4696 * $~ #=> #<MatchData "o">
4697 *
4698 * To get the last longest match, needs to combine with negative
4699 * lookbehind.
4700 *
4701 * 'foo'.rindex(/(?<!o)o+/) # => 1
4702 * $~ #=> #<MatchData "oo">
4703 *
4704 * Or String#index with negative lookforward.
4705 *
4706 * 'foo'.index(/o+(?!.*o)/) # => 1
4707 * $~ #=> #<MatchData "oo">
4708 *
4709 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4710 * string to _end_ the search:
4711 *
4712 * 'foo'.rindex('o', 0) # => nil
4713 * 'foo'.rindex('o', 1) # => 1
4714 * 'foo'.rindex('o', 2) # => 2
4715 * 'foo'.rindex('o', 3) # => 2
4716 *
4717 * If +offset+ is a negative Integer, the maximum starting position in the
4718 * string to _end_ the search is the sum of the string's length and +offset+:
4719 *
4720 * 'foo'.rindex('o', -1) # => 2
4721 * 'foo'.rindex('o', -2) # => 1
4722 * 'foo'.rindex('o', -3) # => nil
4723 * 'foo'.rindex('o', -4) # => nil
4724 *
4725 * Related: String#index.
4726 */
4727
4728static VALUE
4729rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4730{
4731 VALUE sub;
4732 VALUE initpos;
4733 rb_encoding *enc = STR_ENC_GET(str);
4734 long pos, len = str_strlen(str, enc); /* str's enc */
4735
4736 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4737 pos = NUM2LONG(initpos);
4738 if (pos < 0 && (pos += len) < 0) {
4739 if (RB_TYPE_P(sub, T_REGEXP)) {
4741 }
4742 return Qnil;
4743 }
4744 if (pos > len) pos = len;
4745 }
4746 else {
4747 pos = len;
4748 }
4749
4750 if (RB_TYPE_P(sub, T_REGEXP)) {
4751 /* enc = rb_enc_check(str, sub); */
4752 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4753 enc, single_byte_optimizable(str));
4754
4755 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4756 VALUE match = rb_backref_get();
4757 struct re_registers *regs = RMATCH_REGS(match);
4758 pos = rb_str_sublen(str, BEG(0));
4759 return LONG2NUM(pos);
4760 }
4761 }
4762 else {
4763 StringValue(sub);
4764 pos = rb_str_rindex(str, sub, pos);
4765 if (pos >= 0) {
4766 pos = rb_str_sublen(str, pos);
4767 return LONG2NUM(pos);
4768 }
4769 }
4770 return Qnil;
4771}
4772
4773static long
4774rb_str_byterindex(VALUE str, VALUE sub, long pos)
4775{
4776 long len, slen;
4777 char *sbeg, *s;
4778 rb_encoding *enc;
4779
4780 enc = rb_enc_check(str, sub);
4781 if (is_broken_string(sub)) return -1;
4782 len = RSTRING_LEN(str);
4783 slen = RSTRING_LEN(sub);
4784
4785 /* substring longer than string */
4786 if (len < slen) return -1;
4787 if (len - pos < slen) pos = len - slen;
4788 if (len == 0) return pos;
4789
4790 sbeg = RSTRING_PTR(str);
4791
4792 if (pos == 0) {
4793 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4794 return 0;
4795 else
4796 return -1;
4797 }
4798
4799 s = sbeg + pos;
4800 return str_rindex(str, sub, s, enc);
4801}
4802
4803
4804/*
4805 * call-seq:
4806 * byterindex(substring, offset = self.bytesize) -> integer or nil
4807 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4808 *
4809 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4810 * or +nil+ if none found:
4811 *
4812 * 'foo'.byterindex('f') # => 0
4813 * 'foo'.byterindex('o') # => 2
4814 * 'foo'.byterindex('oo') # => 1
4815 * 'foo'.byterindex('ooo') # => nil
4816 *
4817 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4818 * or +nil+ if none found:
4819 *
4820 * 'foo'.byterindex(/f/) # => 0
4821 * 'foo'.byterindex(/o/) # => 2
4822 * 'foo'.byterindex(/oo/) # => 1
4823 * 'foo'.byterindex(/ooo/) # => nil
4824 *
4825 * The _last_ match means starting at the possible last position, not
4826 * the last of longest matches.
4827 *
4828 * 'foo'.byterindex(/o+/) # => 2
4829 * $~ #=> #<MatchData "o">
4830 *
4831 * To get the last longest match, needs to combine with negative
4832 * lookbehind.
4833 *
4834 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4835 * $~ #=> #<MatchData "oo">
4836 *
4837 * Or String#byteindex with negative lookforward.
4838 *
4839 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4840 * $~ #=> #<MatchData "oo">
4841 *
4842 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4843 * string to _end_ the search:
4844 *
4845 * 'foo'.byterindex('o', 0) # => nil
4846 * 'foo'.byterindex('o', 1) # => 1
4847 * 'foo'.byterindex('o', 2) # => 2
4848 * 'foo'.byterindex('o', 3) # => 2
4849 *
4850 * If +offset+ is a negative Integer, the maximum starting position in the
4851 * string to _end_ the search is the sum of the string's length and +offset+:
4852 *
4853 * 'foo'.byterindex('o', -1) # => 2
4854 * 'foo'.byterindex('o', -2) # => 1
4855 * 'foo'.byterindex('o', -3) # => nil
4856 * 'foo'.byterindex('o', -4) # => nil
4857 *
4858 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4859 * raised.
4860 *
4861 * Related: String#byteindex.
4862 */
4863
4864static VALUE
4865rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4866{
4867 VALUE sub;
4868 VALUE initpos;
4869 long pos, len = RSTRING_LEN(str);
4870
4871 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4872 pos = NUM2LONG(initpos);
4873 if (pos < 0 && (pos += len) < 0) {
4874 if (RB_TYPE_P(sub, T_REGEXP)) {
4876 }
4877 return Qnil;
4878 }
4879 if (pos > len) pos = len;
4880 }
4881 else {
4882 pos = len;
4883 }
4884
4885 str_ensure_byte_pos(str, pos);
4886
4887 if (RB_TYPE_P(sub, T_REGEXP)) {
4888 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4889 VALUE match = rb_backref_get();
4890 struct re_registers *regs = RMATCH_REGS(match);
4891 pos = BEG(0);
4892 return LONG2NUM(pos);
4893 }
4894 }
4895 else {
4896 StringValue(sub);
4897 pos = rb_str_byterindex(str, sub, pos);
4898 if (pos >= 0) return LONG2NUM(pos);
4899 }
4900 return Qnil;
4901}
4902
4903/*
4904 * call-seq:
4905 * string =~ regexp -> integer or nil
4906 * string =~ object -> integer or nil
4907 *
4908 * Returns the Integer index of the first substring that matches
4909 * the given +regexp+, or +nil+ if no match found:
4910 *
4911 * 'foo' =~ /f/ # => 0
4912 * 'foo' =~ /o/ # => 1
4913 * 'foo' =~ /x/ # => nil
4914 *
4915 * Note: also updates Regexp@Global+Variables.
4916 *
4917 * If the given +object+ is not a Regexp, returns the value
4918 * returned by <tt>object =~ self</tt>.
4919 *
4920 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4921 * (see Regexp#=~):
4922 *
4923 * number= nil
4924 * "no. 9" =~ /(?<number>\d+)/
4925 * number # => nil (not assigned)
4926 * /(?<number>\d+)/ =~ "no. 9"
4927 * number #=> "9"
4928 *
4929 */
4930
4931static VALUE
4932rb_str_match(VALUE x, VALUE y)
4933{
4934 switch (OBJ_BUILTIN_TYPE(y)) {
4935 case T_STRING:
4936 rb_raise(rb_eTypeError, "type mismatch: String given");
4937
4938 case T_REGEXP:
4939 return rb_reg_match(y, x);
4940
4941 default:
4942 return rb_funcall(y, idEqTilde, 1, x);
4943 }
4944}
4945
4946
4947static VALUE get_pat(VALUE);
4948
4949
4950/*
4951 * call-seq:
4952 * match(pattern, offset = 0) -> matchdata or nil
4953 * match(pattern, offset = 0) {|matchdata| ... } -> object
4954 *
4955 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4956 *
4957 * Note: also updates Regexp@Global+Variables.
4958 *
4959 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4960 * regexp = Regexp.new(pattern)
4961 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4962 * (see Regexp#match):
 *      matchdata = regexp.match(self)
4964 *
4965 * With no block given, returns the computed +matchdata+:
4966 *
4967 * 'foo'.match('f') # => #<MatchData "f">
4968 * 'foo'.match('o') # => #<MatchData "o">
4969 * 'foo'.match('x') # => nil
4970 *
4971 * If Integer argument +offset+ is given, the search begins at index +offset+:
4972 *
4973 * 'foo'.match('f', 1) # => nil
4974 * 'foo'.match('o', 1) # => #<MatchData "o">
4975 *
4976 * With a block given, calls the block with the computed +matchdata+
4977 * and returns the block's return value:
4978 *
4979 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4980 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4981 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4982 *
4983 */
4984
4985static VALUE
4986rb_str_match_m(int argc, VALUE *argv, VALUE str)
4987{
4988 VALUE re, result;
4989 if (argc < 1)
4990 rb_check_arity(argc, 1, 2);
4991 re = argv[0];
4992 argv[0] = str;
4993 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4994 if (!NIL_P(result) && rb_block_given_p()) {
4995 return rb_yield(result);
4996 }
4997 return result;
4998}
4999
5000/*
5001 * call-seq:
5002 * match?(pattern, offset = 0) -> true or false
5003 *
5004 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5005 *
5006 * Note: does not update Regexp@Global+Variables.
5007 *
5008 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5009 * regexp = Regexp.new(pattern)
5010 *
 *  Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5012 * +false+ otherwise:
5013 *
5014 * 'foo'.match?(/o/) # => true
5015 * 'foo'.match?('o') # => true
5016 * 'foo'.match?(/x/) # => false
5017 *
5018 * If Integer argument +offset+ is given, the search begins at index +offset+:
5019 * 'foo'.match?('f', 1) # => false
5020 * 'foo'.match?('o', 1) # => true
5021 *
5022 */
5023
5024static VALUE
5025rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5026{
5027 VALUE re;
5028 rb_check_arity(argc, 1, 2);
5029 re = get_pat(argv[0]);
5030 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5031}
5032
/* Outcome of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* the bytes are not (or did not become) a valid character */
    NEIGHBOR_FOUND    = 1,  /* a neighbour of the same byte length was written in place */
    NEIGHBOR_WRAPPED  = 2   /* no neighbour of this length exists; the value wrapped around */
};
5038
5039static enum neighbor_char
5040enc_succ_char(char *p, long len, rb_encoding *enc)
5041{
5042 long i;
5043 int l;
5044
5045 if (rb_enc_mbminlen(enc) > 1) {
5046 /* wchar, trivial case */
5047 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5048 if (!MBCLEN_CHARFOUND_P(r)) {
5049 return NEIGHBOR_NOT_CHAR;
5050 }
5051 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5052 l = rb_enc_code_to_mbclen(c, enc);
5053 if (!l) return NEIGHBOR_NOT_CHAR;
5054 if (l != len) return NEIGHBOR_WRAPPED;
5055 rb_enc_mbcput(c, p, enc);
5056 r = rb_enc_precise_mbclen(p, p + len, enc);
5057 if (!MBCLEN_CHARFOUND_P(r)) {
5058 return NEIGHBOR_NOT_CHAR;
5059 }
5060 return NEIGHBOR_FOUND;
5061 }
5062 while (1) {
5063 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5064 p[i] = '\0';
5065 if (i < 0)
5066 return NEIGHBOR_WRAPPED;
5067 ++((unsigned char*)p)[i];
5068 l = rb_enc_precise_mbclen(p, p+len, enc);
5069 if (MBCLEN_CHARFOUND_P(l)) {
5070 l = MBCLEN_CHARFOUND_LEN(l);
5071 if (l == len) {
5072 return NEIGHBOR_FOUND;
5073 }
5074 else {
5075 memset(p+l, 0xff, len-l);
5076 }
5077 }
5078 if (MBCLEN_INVALID_P(l) && i < len-1) {
5079 long len2;
5080 int l2;
5081 for (len2 = len-1; 0 < len2; len2--) {
5082 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5083 if (!MBCLEN_INVALID_P(l2))
5084 break;
5085 }
5086 memset(p+len2+1, 0xff, len-(len2+1));
5087 }
5088 }
5089}
5090
5091static enum neighbor_char
5092enc_pred_char(char *p, long len, rb_encoding *enc)
5093{
5094 long i;
5095 int l;
5096 if (rb_enc_mbminlen(enc) > 1) {
5097 /* wchar, trivial case */
5098 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5099 if (!MBCLEN_CHARFOUND_P(r)) {
5100 return NEIGHBOR_NOT_CHAR;
5101 }
5102 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5103 if (!c) return NEIGHBOR_NOT_CHAR;
5104 --c;
5105 l = rb_enc_code_to_mbclen(c, enc);
5106 if (!l) return NEIGHBOR_NOT_CHAR;
5107 if (l != len) return NEIGHBOR_WRAPPED;
5108 rb_enc_mbcput(c, p, enc);
5109 r = rb_enc_precise_mbclen(p, p + len, enc);
5110 if (!MBCLEN_CHARFOUND_P(r)) {
5111 return NEIGHBOR_NOT_CHAR;
5112 }
5113 return NEIGHBOR_FOUND;
5114 }
5115 while (1) {
5116 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5117 p[i] = '\xff';
5118 if (i < 0)
5119 return NEIGHBOR_WRAPPED;
5120 --((unsigned char*)p)[i];
5121 l = rb_enc_precise_mbclen(p, p+len, enc);
5122 if (MBCLEN_CHARFOUND_P(l)) {
5123 l = MBCLEN_CHARFOUND_LEN(l);
5124 if (l == len) {
5125 return NEIGHBOR_FOUND;
5126 }
5127 else {
5128 memset(p+l, 0, len-l);
5129 }
5130 }
5131 if (MBCLEN_INVALID_P(l) && i < len-1) {
5132 long len2;
5133 int l2;
5134 for (len2 = len-1; 0 < len2; len2--) {
5135 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5136 if (!MBCLEN_INVALID_P(l2))
5137 break;
5138 }
5139 memset(p+len2+1, 0, len-(len2+1));
5140 }
5141 }
5142}
5143
5144/*
5145 overwrite +p+ by succeeding letter in +enc+ and returns
5146 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5147 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5148 assuming each ranges are successive, and mbclen
5149 never change in each ranges.
5150 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5151 character.
5152 */
5153static enum neighbor_char
5154enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5155{
5156 enum neighbor_char ret;
5157 unsigned int c;
5158 int ctype;
5159 int range;
5160 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5161
5162 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5163 int try;
5164 const int max_gaps = 1;
5165
5166 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5167 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5168 ctype = ONIGENC_CTYPE_DIGIT;
5169 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5170 ctype = ONIGENC_CTYPE_ALPHA;
5171 else
5172 return NEIGHBOR_NOT_CHAR;
5173
5174 MEMCPY(save, p, char, len);
5175 for (try = 0; try <= max_gaps; ++try) {
5176 ret = enc_succ_char(p, len, enc);
5177 if (ret == NEIGHBOR_FOUND) {
5178 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5179 if (rb_enc_isctype(c, ctype, enc))
5180 return NEIGHBOR_FOUND;
5181 }
5182 }
5183 MEMCPY(p, save, char, len);
5184 range = 1;
5185 while (1) {
5186 MEMCPY(save, p, char, len);
5187 ret = enc_pred_char(p, len, enc);
5188 if (ret == NEIGHBOR_FOUND) {
5189 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5190 if (!rb_enc_isctype(c, ctype, enc)) {
5191 MEMCPY(p, save, char, len);
5192 break;
5193 }
5194 }
5195 else {
5196 MEMCPY(p, save, char, len);
5197 break;
5198 }
5199 range++;
5200 }
5201 if (range == 1) {
5202 return NEIGHBOR_NOT_CHAR;
5203 }
5204
5205 if (ctype != ONIGENC_CTYPE_DIGIT) {
5206 MEMCPY(carry, p, char, len);
5207 return NEIGHBOR_WRAPPED;
5208 }
5209
5210 MEMCPY(carry, p, char, len);
5211 enc_succ_char(carry, len, enc);
5212 return NEIGHBOR_WRAPPED;
5213}
5214
5215
5216static VALUE str_succ(VALUE str);
5217
5218/*
5219 * call-seq:
5220 * succ -> new_str
5221 *
5222 * Returns the successor to +self+. The successor is calculated by
5223 * incrementing characters.
5224 *
 *  The first character to be incremented is the rightmost alphanumeric,
 *  or, if there are no alphanumerics, the rightmost character:
5227 *
5228 * 'THX1138'.succ # => "THX1139"
5229 * '<<koala>>'.succ # => "<<koalb>>"
5230 * '***'.succ # => '**+'
5231 *
5232 * The successor to a digit is another digit, "carrying" to the next-left
5233 * character for a "rollover" from 9 to 0, and prepending another digit
5234 * if necessary:
5235 *
5236 * '00'.succ # => "01"
5237 * '09'.succ # => "10"
5238 * '99'.succ # => "100"
5239 *
5240 * The successor to a letter is another letter of the same case,
5241 * carrying to the next-left character for a rollover,
5242 * and prepending another same-case letter if necessary:
5243 *
5244 * 'aa'.succ # => "ab"
5245 * 'az'.succ # => "ba"
5246 * 'zz'.succ # => "aaa"
5247 * 'AA'.succ # => "AB"
5248 * 'AZ'.succ # => "BA"
5249 * 'ZZ'.succ # => "AAA"
5250 *
5251 * The successor to a non-alphanumeric character is the next character
5252 * in the underlying character set's collating sequence,
5253 * carrying to the next-left character for a rollover,
5254 * and prepending another character if necessary:
5255 *
5256 * s = 0.chr * 3
5257 * s # => "\x00\x00\x00"
5258 * s.succ # => "\x00\x00\x01"
5259 * s = 255.chr * 3
5260 * s # => "\xFF\xFF\xFF"
5261 * s.succ # => "\x01\x00\x00\x00"
5262 *
5263 * Carrying can occur between and among mixtures of alphanumeric characters:
5264 *
5265 * s = 'zz99zz99'
5266 * s.succ # => "aaa00aa00"
5267 * s = '99zz99zz'
5268 * s.succ # => "100aa00aa"
5269 *
5270 * The successor to an empty +String+ is a new empty +String+:
5271 *
5272 * ''.succ # => ""
5273 *
5274 */
5275
5276VALUE
5278{
5279 VALUE str;
5280 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5281 rb_enc_cr_str_copy_for_substr(str, orig);
5282 return str_succ(str);
5283}
5284
5285static VALUE
5286str_succ(VALUE str)
5287{
5288 rb_encoding *enc;
5289 char *sbeg, *s, *e, *last_alnum = 0;
5290 int found_alnum = 0;
5291 long l, slen;
5292 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5293 long carry_pos = 0, carry_len = 1;
5294 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5295
5296 slen = RSTRING_LEN(str);
5297 if (slen == 0) return str;
5298
5299 enc = STR_ENC_GET(str);
5300 sbeg = RSTRING_PTR(str);
5301 s = e = sbeg + slen;
5302
5303 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5304 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5305 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5306 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5307 break;
5308 }
5309 }
5310 l = rb_enc_precise_mbclen(s, e, enc);
5311 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5312 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5313 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5314 switch (neighbor) {
5315 case NEIGHBOR_NOT_CHAR:
5316 continue;
5317 case NEIGHBOR_FOUND:
5318 return str;
5319 case NEIGHBOR_WRAPPED:
5320 last_alnum = s;
5321 break;
5322 }
5323 found_alnum = 1;
5324 carry_pos = s - sbeg;
5325 carry_len = l;
5326 }
5327 if (!found_alnum) { /* str contains no alnum */
5328 s = e;
5329 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5330 enum neighbor_char neighbor;
5331 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5332 l = rb_enc_precise_mbclen(s, e, enc);
5333 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5334 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5335 MEMCPY(tmp, s, char, l);
5336 neighbor = enc_succ_char(tmp, l, enc);
5337 switch (neighbor) {
5338 case NEIGHBOR_FOUND:
5339 MEMCPY(s, tmp, char, l);
5340 return str;
5341 break;
5342 case NEIGHBOR_WRAPPED:
5343 MEMCPY(s, tmp, char, l);
5344 break;
5345 case NEIGHBOR_NOT_CHAR:
5346 break;
5347 }
5348 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5349 /* wrapped to \0...\0. search next valid char. */
5350 enc_succ_char(s, l, enc);
5351 }
5352 if (!rb_enc_asciicompat(enc)) {
5353 MEMCPY(carry, s, char, l);
5354 carry_len = l;
5355 }
5356 carry_pos = s - sbeg;
5357 }
5359 }
5360 RESIZE_CAPA(str, slen + carry_len);
5361 sbeg = RSTRING_PTR(str);
5362 s = sbeg + carry_pos;
5363 memmove(s + carry_len, s, slen - carry_pos);
5364 memmove(s, carry, carry_len);
5365 slen += carry_len;
5366 STR_SET_LEN(str, slen);
5367 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5369 return str;
5370}
5371
5372
5373/*
5374 * call-seq:
5375 * succ! -> self
5376 *
5377 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5378 */
5379
5380static VALUE
5381rb_str_succ_bang(VALUE str)
5382{
5383 rb_str_modify(str);
5384 str_succ(str);
5385 return str;
5386}
5387
/*
 * Returns 1 if each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (trivially so when +len+ is not positive), 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5397
5398static int
5399str_upto_i(VALUE str, VALUE arg)
5400{
5401 rb_yield(str);
5402 return 0;
5403}
5404
5405/*
5406 * call-seq:
5407 * upto(other_string, exclusive = false) {|string| ... } -> self
5408 * upto(other_string, exclusive = false) -> new_enumerator
5409 *
5410 * With a block given, calls the block with each +String+ value
5411 * returned by successive calls to String#succ;
5412 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5413 * the sequence terminates when value +other_string+ is reached;
5414 * returns +self+:
5415 *
5416 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5417 * Output:
5418 *
5419 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5420 *
5421 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5422 *
5423 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5424 *
5425 * Output:
5426 *
5427 * a8 a9 b0 b1 b2 b3 b4 b5
5428 *
5429 * If +other_string+ would not be reached, does not call the block:
5430 *
5431 * '25'.upto('5') {|s| fail s }
5432 * 'aa'.upto('a') {|s| fail s }
5433 *
5434 * With no block given, returns a new Enumerator:
5435 *
5436 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5437 *
5438 */
5439
5440static VALUE
5441rb_str_upto(int argc, VALUE *argv, VALUE beg)
5442{
5443 VALUE end, exclusive;
5444
5445 rb_scan_args(argc, argv, "11", &end, &exclusive);
5446 RETURN_ENUMERATOR(beg, argc, argv);
5447 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5448}
5449
5450VALUE
5451rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5452{
5453 VALUE current, after_end;
5454 ID succ;
5455 int n, ascii;
5456 rb_encoding *enc;
5457
5458 CONST_ID(succ, "succ");
5459 StringValue(end);
5460 enc = rb_enc_check(beg, end);
5461 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5462 /* single character */
5463 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5464 char c = RSTRING_PTR(beg)[0];
5465 char e = RSTRING_PTR(end)[0];
5466
5467 if (c > e || (excl && c == e)) return beg;
5468 for (;;) {
5469 VALUE str = rb_enc_str_new(&c, 1, enc);
5471 if ((*each)(str, arg)) break;
5472 if (!excl && c == e) break;
5473 c++;
5474 if (excl && c == e) break;
5475 }
5476 return beg;
5477 }
5478 /* both edges are all digits */
5479 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5480 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5481 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5482 VALUE b, e;
5483 int width;
5484
5485 width = RSTRING_LENINT(beg);
5486 b = rb_str_to_inum(beg, 10, FALSE);
5487 e = rb_str_to_inum(end, 10, FALSE);
5488 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5489 long bi = FIX2LONG(b);
5490 long ei = FIX2LONG(e);
5491 rb_encoding *usascii = rb_usascii_encoding();
5492
5493 while (bi <= ei) {
5494 if (excl && bi == ei) break;
5495 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5496 bi++;
5497 }
5498 }
5499 else {
5500 ID op = excl ? '<' : idLE;
5501 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5502
5503 args[0] = INT2FIX(width);
5504 while (rb_funcall(b, op, 1, e)) {
5505 args[1] = b;
5506 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5507 b = rb_funcallv(b, succ, 0, 0);
5508 }
5509 }
5510 return beg;
5511 }
5512 /* normal case */
5513 n = rb_str_cmp(beg, end);
5514 if (n > 0 || (excl && n == 0)) return beg;
5515
5516 after_end = rb_funcallv(end, succ, 0, 0);
5517 current = str_duplicate(rb_cString, beg);
5518 while (!rb_str_equal(current, after_end)) {
5519 VALUE next = Qnil;
5520 if (excl || !rb_str_equal(current, end))
5521 next = rb_funcallv(current, succ, 0, 0);
5522 if ((*each)(current, arg)) break;
5523 if (NIL_P(next)) break;
5524 current = next;
5525 StringValue(current);
5526 if (excl && rb_str_equal(current, end)) break;
5527 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5528 break;
5529 }
5530
5531 return beg;
5532}
5533
5534VALUE
5535rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5536{
5537 VALUE current;
5538 ID succ;
5539
5540 CONST_ID(succ, "succ");
5541 /* both edges are all digits */
5542 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5543 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5544 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5545 int width = RSTRING_LENINT(beg);
5546 b = rb_str_to_inum(beg, 10, FALSE);
5547 if (FIXNUM_P(b)) {
5548 long bi = FIX2LONG(b);
5549 rb_encoding *usascii = rb_usascii_encoding();
5550
5551 while (FIXABLE(bi)) {
5552 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5553 bi++;
5554 }
5555 b = LONG2NUM(bi);
5556 }
5557 args[0] = INT2FIX(width);
5558 while (1) {
5559 args[1] = b;
5560 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5561 b = rb_funcallv(b, succ, 0, 0);
5562 }
5563 }
5564 /* normal case */
5565 current = str_duplicate(rb_cString, beg);
5566 while (1) {
5567 VALUE next = rb_funcallv(current, succ, 0, 0);
5568 if ((*each)(current, arg)) break;
5569 current = next;
5570 StringValue(current);
5571 if (RSTRING_LEN(current) == 0)
5572 break;
5573 }
5574
5575 return beg;
5576}
5577
5578static int
5579include_range_i(VALUE str, VALUE arg)
5580{
5581 VALUE *argp = (VALUE *)arg;
5582 if (!rb_equal(str, *argp)) return 0;
5583 *argp = Qnil;
5584 return 1;
5585}
5586
5587VALUE
5588rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5589{
5590 beg = rb_str_new_frozen(beg);
5591 StringValue(end);
5592 end = rb_str_new_frozen(end);
5593 if (NIL_P(val)) return Qfalse;
5594 val = rb_check_string_type(val);
5595 if (NIL_P(val)) return Qfalse;
5596 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5597 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5598 rb_enc_asciicompat(STR_ENC_GET(val))) {
5599 const char *bp = RSTRING_PTR(beg);
5600 const char *ep = RSTRING_PTR(end);
5601 const char *vp = RSTRING_PTR(val);
5602 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5603 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5604 return Qfalse;
5605 else {
5606 char b = *bp;
5607 char e = *ep;
5608 char v = *vp;
5609
5610 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5611 if (b <= v && v < e) return Qtrue;
5612 return RBOOL(!RTEST(exclusive) && v == e);
5613 }
5614 }
5615 }
5616#if 0
5617 /* both edges are all digits */
5618 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5619 all_digits_p(bp, RSTRING_LEN(beg)) &&
5620 all_digits_p(ep, RSTRING_LEN(end))) {
5621 /* TODO */
5622 }
5623#endif
5624 }
5625 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5626
5627 return RBOOL(NIL_P(val));
5628}
5629
5630static VALUE
5631rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5632{
5633 if (rb_reg_search(re, str, 0, 0) >= 0) {
5634 VALUE match = rb_backref_get();
5635 int nth = rb_reg_backref_number(match, backref);
5636 return rb_reg_nth_match(nth, match);
5637 }
5638 return Qnil;
5639}
5640
5641static VALUE
5642rb_str_aref(VALUE str, VALUE indx)
5643{
5644 long idx;
5645
5646 if (FIXNUM_P(indx)) {
5647 idx = FIX2LONG(indx);
5648 }
5649 else if (RB_TYPE_P(indx, T_REGEXP)) {
5650 return rb_str_subpat(str, indx, INT2FIX(0));
5651 }
5652 else if (RB_TYPE_P(indx, T_STRING)) {
5653 if (rb_str_index(str, indx, 0) != -1)
5654 return str_duplicate(rb_cString, indx);
5655 return Qnil;
5656 }
5657 else {
5658 /* check if indx is Range */
5659 long beg, len = str_strlen(str, NULL);
5660 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5661 case Qfalse:
5662 break;
5663 case Qnil:
5664 return Qnil;
5665 default:
5666 return rb_str_substr(str, beg, len);
5667 }
5668 idx = NUM2LONG(indx);
5669 }
5670
5671 return str_substr(str, idx, 1, FALSE);
5672}
5673
5674
5675/*
5676 * call-seq:
5677 * string[index] -> new_string or nil
5678 * string[start, length] -> new_string or nil
5679 * string[range] -> new_string or nil
5680 * string[regexp, capture = 0] -> new_string or nil
5681 * string[substring] -> new_string or nil
5682 *
5683 * Returns the substring of +self+ specified by the arguments.
5684 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5685 *
5686 *
5687 */
5688
5689static VALUE
5690rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5691{
5692 if (argc == 2) {
5693 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5694 return rb_str_subpat(str, argv[0], argv[1]);
5695 }
5696 else {
5697 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5698 }
5699 }
5700 rb_check_arity(argc, 1, 2);
5701 return rb_str_aref(str, argv[0]);
5702}
5703
5704VALUE
5706{
5707 char *ptr = RSTRING_PTR(str);
5708 long olen = RSTRING_LEN(str), nlen;
5709
5710 str_modifiable(str);
5711 if (len > olen) len = olen;
5712 nlen = olen - len;
5713 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5714 char *oldptr = ptr;
5715 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5716 STR_SET_EMBED(str);
5717 ptr = RSTRING(str)->as.embed.ary;
5718 memmove(ptr, oldptr + len, nlen);
5719 if (fl == STR_NOEMBED) xfree(oldptr);
5720 }
5721 else {
5722 if (!STR_SHARED_P(str)) {
5723 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5724 rb_enc_cr_str_exact_copy(shared, str);
5725 OBJ_FREEZE(shared);
5726 }
5727 ptr = RSTRING(str)->as.heap.ptr += len;
5728 }
5729 STR_SET_LEN(str, nlen);
5730
5731 if (!SHARABLE_MIDDLE_SUBSTRING) {
5732 TERM_FILL(ptr + nlen, TERM_LEN(str));
5733 }
5735 return str;
5736}
5737
5738static void
5739rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5740{
5741 char *sptr;
5742 long slen;
5743 int cr;
5744
5745 if (beg == 0 && vlen == 0) {
5746 rb_str_drop_bytes(str, len);
5747 return;
5748 }
5749
5750 str_modify_keep_cr(str);
5751 RSTRING_GETMEM(str, sptr, slen);
5752 if (len < vlen) {
5753 /* expand string */
5754 RESIZE_CAPA(str, slen + vlen - len);
5755 sptr = RSTRING_PTR(str);
5756 }
5757
5759 cr = rb_enc_str_coderange(val);
5760 else
5762
5763 if (vlen != len) {
5764 memmove(sptr + beg + vlen,
5765 sptr + beg + len,
5766 slen - (beg + len));
5767 }
5768 if (vlen < beg && len < 0) {
5769 MEMZERO(sptr + slen, char, -len);
5770 }
5771 if (vlen > 0) {
5772 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5773 }
5774 slen += vlen - len;
5775 STR_SET_LEN(str, slen);
5776 TERM_FILL(&sptr[slen], TERM_LEN(str));
5777 ENC_CODERANGE_SET(str, cr);
5778}
5779
5780static inline void
5781rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5782{
5783 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5784}
5785
5786void
5787rb_str_update(VALUE str, long beg, long len, VALUE val)
5788{
5789 long slen;
5790 char *p, *e;
5791 rb_encoding *enc;
5792 int singlebyte = single_byte_optimizable(str);
5793 int cr;
5794
5795 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5796
5797 StringValue(val);
5798 enc = rb_enc_check(str, val);
5799 slen = str_strlen(str, enc); /* rb_enc_check */
5800
5801 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5802 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5803 }
5804 if (beg < 0) {
5805 beg += slen;
5806 }
5807 RUBY_ASSERT(beg >= 0);
5808 RUBY_ASSERT(beg <= slen);
5809
5810 if (len > slen - beg) {
5811 len = slen - beg;
5812 }
5813 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5814 if (!p) p = RSTRING_END(str);
5815 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5816 if (!e) e = RSTRING_END(str);
5817 /* error check */
5818 beg = p - RSTRING_PTR(str); /* physical position */
5819 len = e - p; /* physical length */
5820 rb_str_update_0(str, beg, len, val);
5821 rb_enc_associate(str, enc);
5823 if (cr != ENC_CODERANGE_BROKEN)
5824 ENC_CODERANGE_SET(str, cr);
5825}
5826
5827static void
5828rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5829{
5830 int nth;
5831 VALUE match;
5832 long start, end, len;
5833 rb_encoding *enc;
5834 struct re_registers *regs;
5835
5836 if (rb_reg_search(re, str, 0, 0) < 0) {
5837 rb_raise(rb_eIndexError, "regexp not matched");
5838 }
5839 match = rb_backref_get();
5840 nth = rb_reg_backref_number(match, backref);
5841 regs = RMATCH_REGS(match);
5842 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5843 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5844 }
5845 if (nth < 0) {
5846 nth += regs->num_regs;
5847 }
5848
5849 start = BEG(nth);
5850 if (start == -1) {
5851 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5852 }
5853 end = END(nth);
5854 len = end - start;
5855 StringValue(val);
5856 enc = rb_enc_check_str(str, val);
5857 rb_str_update_0(str, start, len, val);
5858 rb_enc_associate(str, enc);
5859}
5860
5861static VALUE
5862rb_str_aset(VALUE str, VALUE indx, VALUE val)
5863{
5864 long idx, beg;
5865
5866 switch (TYPE(indx)) {
5867 case T_REGEXP:
5868 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5869 return val;
5870
5871 case T_STRING:
5872 beg = rb_str_index(str, indx, 0);
5873 if (beg < 0) {
5874 rb_raise(rb_eIndexError, "string not matched");
5875 }
5876 beg = rb_str_sublen(str, beg);
5877 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5878 return val;
5879
5880 default:
5881 /* check if indx is Range */
5882 {
5883 long beg, len;
5884 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5885 rb_str_update(str, beg, len, val);
5886 return val;
5887 }
5888 }
5889 /* FALLTHROUGH */
5890
5891 case T_FIXNUM:
5892 idx = NUM2LONG(indx);
5893 rb_str_update(str, idx, 1, val);
5894 return val;
5895 }
5896}
5897
5898/*
5899 * call-seq:
5900 * string[index] = new_string
5901 * string[start, length] = new_string
5902 * string[range] = new_string
5903 * string[regexp, capture = 0] = new_string
5904 * string[substring] = new_string
5905 *
5906 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5907 * See {String Slices}[rdoc-ref:String@String+Slices].
5908 *
5909 * A few examples:
5910 *
5911 * s = 'foo'
5912 * s[2] = 'rtune' # => "rtune"
5913 * s # => "fortune"
5914 * s[1, 5] = 'init' # => "init"
5915 * s # => "finite"
5916 * s[3..4] = 'al' # => "al"
5917 * s # => "finale"
5918 * s[/e$/] = 'ly' # => "ly"
5919 * s # => "finally"
5920 * s['lly'] = 'ncial' # => "ncial"
5921 * s # => "financial"
5922 *
5923 */
5924
5925static VALUE
5926rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5927{
5928 if (argc == 3) {
5929 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5930 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5931 }
5932 else {
5933 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5934 }
5935 return argv[2];
5936 }
5937 rb_check_arity(argc, 2, 3);
5938 return rb_str_aset(str, argv[0], argv[1]);
5939}
5940
5941/*
5942 * call-seq:
5943 * insert(index, other_string) -> self
5944 *
5945 * Inserts the given +other_string+ into +self+; returns +self+.
5946 *
5947 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5948 *
5949 * 'foo'.insert(1, 'bar') # => "fbaroo"
5950 *
5951 * If the Integer +index+ is negative, counts backward from the end of +self+
5952 * and inserts +other_string+ at offset <tt>index+1</tt>
5953 * (that is, _after_ <tt>self[index]</tt>):
5954 *
5955 * 'foo'.insert(-2, 'bar') # => "fobaro"
5956 *
5957 */
5958
5959static VALUE
5960rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5961{
5962 long pos = NUM2LONG(idx);
5963
5964 if (pos == -1) {
5965 return rb_str_append(str, str2);
5966 }
5967 else if (pos < 0) {
5968 pos++;
5969 }
5970 rb_str_update(str, pos, 0, str2);
5971 return str;
5972}
5973
5974
5975/*
5976 * call-seq:
5977 * slice!(index) -> new_string or nil
5978 * slice!(start, length) -> new_string or nil
5979 * slice!(range) -> new_string or nil
5980 * slice!(regexp, capture = 0) -> new_string or nil
5981 * slice!(substring) -> new_string or nil
5982 *
5983 * Removes and returns the substring of +self+ specified by the arguments.
5984 * See {String Slices}[rdoc-ref:String@String+Slices].
5985 *
5986 * A few examples:
5987 *
5988 * string = "This is a string"
5989 * string.slice!(2) #=> "i"
5990 * string.slice!(3..6) #=> " is "
5991 * string.slice!(/s.*t/) #=> "sa st"
5992 * string.slice!("r") #=> "r"
5993 * string #=> "Thing"
5994 *
5995 */
5996
5997static VALUE
5998rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5999{
6000 VALUE result = Qnil;
6001 VALUE indx;
6002 long beg, len = 1;
6003 char *p;
6004
6005 rb_check_arity(argc, 1, 2);
6006 str_modify_keep_cr(str);
6007 indx = argv[0];
6008 if (RB_TYPE_P(indx, T_REGEXP)) {
6009 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6010 VALUE match = rb_backref_get();
6011 struct re_registers *regs = RMATCH_REGS(match);
6012 int nth = 0;
6013 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6014 if ((nth += regs->num_regs) <= 0) return Qnil;
6015 }
6016 else if (nth >= regs->num_regs) return Qnil;
6017 beg = BEG(nth);
6018 len = END(nth) - beg;
6019 goto subseq;
6020 }
6021 else if (argc == 2) {
6022 beg = NUM2LONG(indx);
6023 len = NUM2LONG(argv[1]);
6024 goto num_index;
6025 }
6026 else if (FIXNUM_P(indx)) {
6027 beg = FIX2LONG(indx);
6028 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6029 if (!len) return Qnil;
6030 beg = p - RSTRING_PTR(str);
6031 goto subseq;
6032 }
6033 else if (RB_TYPE_P(indx, T_STRING)) {
6034 beg = rb_str_index(str, indx, 0);
6035 if (beg == -1) return Qnil;
6036 len = RSTRING_LEN(indx);
6037 result = str_duplicate(rb_cString, indx);
6038 goto squash;
6039 }
6040 else {
6041 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6042 case Qnil:
6043 return Qnil;
6044 case Qfalse:
6045 beg = NUM2LONG(indx);
6046 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6047 if (!len) return Qnil;
6048 beg = p - RSTRING_PTR(str);
6049 goto subseq;
6050 default:
6051 goto num_index;
6052 }
6053 }
6054
6055 num_index:
6056 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6057 beg = p - RSTRING_PTR(str);
6058
6059 subseq:
6060 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6061 rb_enc_cr_str_copy_for_substr(result, str);
6062
6063 squash:
6064 if (len > 0) {
6065 if (beg == 0) {
6066 rb_str_drop_bytes(str, len);
6067 }
6068 else {
6069 char *sptr = RSTRING_PTR(str);
6070 long slen = RSTRING_LEN(str);
6071 if (beg + len > slen) /* pathological check */
6072 len = slen - beg;
6073 memmove(sptr + beg,
6074 sptr + beg + len,
6075 slen - (beg + len));
6076 slen -= len;
6077 STR_SET_LEN(str, slen);
6078 TERM_FILL(&sptr[slen], TERM_LEN(str));
6079 }
6080 }
6081 return result;
6082}
6083
6084static VALUE
6085get_pat(VALUE pat)
6086{
6087 VALUE val;
6088
6089 switch (OBJ_BUILTIN_TYPE(pat)) {
6090 case T_REGEXP:
6091 return pat;
6092
6093 case T_STRING:
6094 break;
6095
6096 default:
6097 val = rb_check_string_type(pat);
6098 if (NIL_P(val)) {
6099 Check_Type(pat, T_REGEXP);
6100 }
6101 pat = val;
6102 }
6103
6104 return rb_reg_regcomp(pat);
6105}
6106
6107static VALUE
6108get_pat_quoted(VALUE pat, int check)
6109{
6110 VALUE val;
6111
6112 switch (OBJ_BUILTIN_TYPE(pat)) {
6113 case T_REGEXP:
6114 return pat;
6115
6116 case T_STRING:
6117 break;
6118
6119 default:
6120 val = rb_check_string_type(pat);
6121 if (NIL_P(val)) {
6122 Check_Type(pat, T_REGEXP);
6123 }
6124 pat = val;
6125 }
6126 if (check && is_broken_string(pat)) {
6127 rb_exc_raise(rb_reg_check_preprocess(pat));
6128 }
6129 return pat;
6130}
6131
6132static long
6133rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6134{
6135 if (BUILTIN_TYPE(pat) == T_STRING) {
6136 pos = rb_str_byteindex(str, pat, pos);
6137 if (set_backref_str) {
6138 if (pos >= 0) {
6139 str = rb_str_new_frozen_String(str);
6140 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6141 }
6142 else {
6144 }
6145 }
6146 return pos;
6147 }
6148 else {
6149 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6150 }
6151}
6152
6153
6154/*
6155 * call-seq:
6156 * sub!(pattern, replacement) -> self or nil
6157 * sub!(pattern) {|match| ... } -> self or nil
6158 *
6159 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6160 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6161 *
6162 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6163 *
6164 * Related: String#sub, String#gsub, String#gsub!.
6165 *
6166 */
6167
6168static VALUE
6169rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6170{
6171 VALUE pat, repl, hash = Qnil;
6172 int iter = 0;
6173 long plen;
6174 int min_arity = rb_block_given_p() ? 1 : 2;
6175 long beg;
6176
6177 rb_check_arity(argc, min_arity, 2);
6178 if (argc == 1) {
6179 iter = 1;
6180 }
6181 else {
6182 repl = argv[1];
6183 hash = rb_check_hash_type(argv[1]);
6184 if (NIL_P(hash)) {
6185 StringValue(repl);
6186 }
6187 }
6188
6189 pat = get_pat_quoted(argv[0], 1);
6190
6191 str_modifiable(str);
6192 beg = rb_pat_search(pat, str, 0, 1);
6193 if (beg >= 0) {
6194 rb_encoding *enc;
6195 int cr = ENC_CODERANGE(str);
6196 long beg0, end0;
6197 VALUE match, match0 = Qnil;
6198 struct re_registers *regs;
6199 char *p, *rp;
6200 long len, rlen;
6201
6202 match = rb_backref_get();
6203 regs = RMATCH_REGS(match);
6204 if (RB_TYPE_P(pat, T_STRING)) {
6205 beg0 = beg;
6206 end0 = beg0 + RSTRING_LEN(pat);
6207 match0 = pat;
6208 }
6209 else {
6210 beg0 = BEG(0);
6211 end0 = END(0);
6212 if (iter) match0 = rb_reg_nth_match(0, match);
6213 }
6214
6215 if (iter || !NIL_P(hash)) {
6216 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6217
6218 if (iter) {
6219 repl = rb_obj_as_string(rb_yield(match0));
6220 }
6221 else {
6222 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6223 repl = rb_obj_as_string(repl);
6224 }
6225 str_mod_check(str, p, len);
6226 rb_check_frozen(str);
6227 }
6228 else {
6229 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6230 }
6231
6232 enc = rb_enc_compatible(str, repl);
6233 if (!enc) {
6234 rb_encoding *str_enc = STR_ENC_GET(str);
6235 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6236 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6237 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6238 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6239 rb_enc_inspect_name(str_enc),
6240 rb_enc_inspect_name(STR_ENC_GET(repl)));
6241 }
6242 enc = STR_ENC_GET(repl);
6243 }
6244 rb_str_modify(str);
6245 rb_enc_associate(str, enc);
6247 int cr2 = ENC_CODERANGE(repl);
6248 if (cr2 == ENC_CODERANGE_BROKEN ||
6249 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6251 else
6252 cr = cr2;
6253 }
6254 plen = end0 - beg0;
6255 rlen = RSTRING_LEN(repl);
6256 len = RSTRING_LEN(str);
6257 if (rlen > plen) {
6258 RESIZE_CAPA(str, len + rlen - plen);
6259 }
6260 p = RSTRING_PTR(str);
6261 if (rlen != plen) {
6262 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6263 }
6264 rp = RSTRING_PTR(repl);
6265 memmove(p + beg0, rp, rlen);
6266 len += rlen - plen;
6267 STR_SET_LEN(str, len);
6268 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6269 ENC_CODERANGE_SET(str, cr);
6270
6271 RB_GC_GUARD(match);
6272
6273 return str;
6274 }
6275 return Qnil;
6276}
6277
6278
6279/*
6280 * call-seq:
6281 * sub(pattern, replacement) -> new_string
6282 * sub(pattern) {|match| ... } -> new_string
6283 *
6284 * Returns a copy of +self+ with only the first occurrence
6285 * (not all occurrences) of the given +pattern+ replaced.
6286 *
6287 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6288 *
6289 * Related: String#sub!, String#gsub, String#gsub!.
6290 *
6291 */
6292
6293static VALUE
6294rb_str_sub(int argc, VALUE *argv, VALUE str)
6295{
6296 str = str_duplicate(rb_cString, str);
6297 rb_str_sub_bang(argc, argv, str);
6298 return str;
6299}
6300
6301static VALUE
6302str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6303{
6304 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
6305 long beg, beg0, end0;
6306 long offset, blen, slen, len, last;
6307 enum {STR, ITER, MAP} mode = STR;
6308 char *sp, *cp;
6309 int need_backref = -1;
6310 rb_encoding *str_enc;
6311
6312 switch (argc) {
6313 case 1:
6314 RETURN_ENUMERATOR(str, argc, argv);
6315 mode = ITER;
6316 break;
6317 case 2:
6318 repl = argv[1];
6319 hash = rb_check_hash_type(argv[1]);
6320 if (NIL_P(hash)) {
6321 StringValue(repl);
6322 }
6323 else {
6324 mode = MAP;
6325 }
6326 break;
6327 default:
6328 rb_error_arity(argc, 1, 2);
6329 }
6330
6331 pat = get_pat_quoted(argv[0], 1);
6332 beg = rb_pat_search(pat, str, 0, need_backref);
6333 if (beg < 0) {
6334 if (bang) return Qnil; /* no match, no substitution */
6335 return str_duplicate(rb_cString, str);
6336 }
6337
6338 offset = 0;
6339 blen = RSTRING_LEN(str) + 30; /* len + margin */
6340 dest = rb_str_buf_new(blen);
6341 sp = RSTRING_PTR(str);
6342 slen = RSTRING_LEN(str);
6343 cp = sp;
6344 str_enc = STR_ENC_GET(str);
6345 rb_enc_associate(dest, str_enc);
6346 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6347
6348 do {
6349 VALUE match = rb_backref_get();
6350 struct re_registers *regs = RMATCH_REGS(match);
6351 if (RB_TYPE_P(pat, T_STRING)) {
6352 beg0 = beg;
6353 end0 = beg0 + RSTRING_LEN(pat);
6354 match0 = pat;
6355 }
6356 else {
6357 beg0 = BEG(0);
6358 end0 = END(0);
6359 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6360 }
6361
6362 if (mode) {
6363 if (mode == ITER) {
6364 val = rb_obj_as_string(rb_yield(match0));
6365 }
6366 else {
6367 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6368 val = rb_obj_as_string(val);
6369 }
6370 str_mod_check(str, sp, slen);
6371 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6372 rb_raise(rb_eRuntimeError, "block should not cheat");
6373 }
6374 }
6375 else if (need_backref) {
6376 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6377 if (need_backref < 0) {
6378 need_backref = val != repl;
6379 }
6380 }
6381 else {
6382 val = repl;
6383 }
6384
6385 len = beg0 - offset; /* copy pre-match substr */
6386 if (len) {
6387 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6388 }
6389
6390 rb_str_buf_append(dest, val);
6391
6392 last = offset;
6393 offset = end0;
6394 if (beg0 == end0) {
6395 /*
6396 * Always consume at least one character of the input string
6397 * in order to prevent infinite loops.
6398 */
6399 if (RSTRING_LEN(str) <= end0) break;
6400 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6401 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6402 offset = end0 + len;
6403 }
6404 cp = RSTRING_PTR(str) + offset;
6405 if (offset > RSTRING_LEN(str)) break;
6406 beg = rb_pat_search(pat, str, offset, need_backref);
6407
6408 RB_GC_GUARD(match);
6409 } while (beg >= 0);
6410 if (RSTRING_LEN(str) > offset) {
6411 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6412 }
6413 rb_pat_search(pat, str, last, 1);
6414 if (bang) {
6415 str_shared_replace(str, dest);
6416 }
6417 else {
6418 str = dest;
6419 }
6420
6421 return str;
6422}
6423
6424
6425/*
6426 * call-seq:
6427 * gsub!(pattern, replacement) -> self or nil
6428 * gsub!(pattern) {|match| ... } -> self or nil
6429 * gsub!(pattern) -> an_enumerator
6430 *
6431 * Performs the specified substring replacement(s) on +self+;
6432 * returns +self+ if any replacement occurred, +nil+ otherwise.
6433 *
6434 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6435 *
6436 * Returns an Enumerator if no +replacement+ and no block given.
6437 *
6438 * Related: String#sub, String#gsub, String#sub!.
6439 *
6440 */
6441
6442static VALUE
6443rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6444{
6445 str_modify_keep_cr(str);
6446 return str_gsub(argc, argv, str, 1);
6447}
6448
6449
6450/*
6451 * call-seq:
6452 * gsub(pattern, replacement) -> new_string
6453 * gsub(pattern) {|match| ... } -> new_string
6454 * gsub(pattern) -> enumerator
6455 *
6456 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6457 *
6458 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6459 *
6460 * Returns an Enumerator if no +replacement+ and no block given.
6461 *
6462 * Related: String#sub, String#sub!, String#gsub!.
6463 *
6464 */
6465
6466static VALUE
6467rb_str_gsub(int argc, VALUE *argv, VALUE str)
6468{
6469 return str_gsub(argc, argv, str, 0);
6470}
6471
6472
6473/*
6474 * call-seq:
6475 * replace(other_string) -> self
6476 *
6477 * Replaces the contents of +self+ with the contents of +other_string+:
6478 *
6479 * s = 'foo' # => "foo"
6480 * s.replace('bar') # => "bar"
6481 *
6482 */
6483
6484VALUE
6486{
6487 str_modifiable(str);
6488 if (str == str2) return str;
6489
6490 StringValue(str2);
6491 str_discard(str);
6492 return str_replace(str, str2);
6493}
6494
6495/*
6496 * call-seq:
6497 * clear -> self
6498 *
6499 * Removes the contents of +self+:
6500 *
6501 * s = 'foo' # => "foo"
6502 * s.clear # => ""
6503 *
6504 */
6505
6506static VALUE
6507rb_str_clear(VALUE str)
6508{
6509 str_discard(str);
6510 STR_SET_EMBED(str);
6511 STR_SET_LEN(str, 0);
6512 RSTRING_PTR(str)[0] = 0;
6513 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6515 else
6517 return str;
6518}
6519
6520/*
6521 * call-seq:
6522 * chr -> string
6523 *
6524 * Returns a string containing the first character of +self+:
6525 *
6526 * s = 'foo' # => "foo"
6527 * s.chr # => "f"
6528 *
6529 */
6530
6531static VALUE
6532rb_str_chr(VALUE str)
6533{
6534 return rb_str_substr(str, 0, 1);
6535}
6536
6537/*
6538 * call-seq:
6539 * getbyte(index) -> integer or nil
6540 *
6541 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6542 *
6543 * s = 'abcde' # => "abcde"
6544 * s.getbyte(0) # => 97
6545 * s.getbyte(-1) # => 101
6546 * s.getbyte(5) # => nil
6547 *
6548 * Related: String#setbyte.
6549 */
6550VALUE
6551rb_str_getbyte(VALUE str, VALUE index)
6552{
6553 long pos = NUM2LONG(index);
6554
6555 if (pos < 0)
6556 pos += RSTRING_LEN(str);
6557 if (pos < 0 || RSTRING_LEN(str) <= pos)
6558 return Qnil;
6559
6560 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6561}
6562
6563/*
6564 * call-seq:
6565 * setbyte(index, integer) -> integer
6566 *
6567 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6568 *
6569 * s = 'abcde' # => "abcde"
6570 * s.setbyte(0, 98) # => 98
6571 * s # => "bbcde"
6572 *
6573 * Related: String#getbyte.
6574 */
6575VALUE
6576rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6577{
6578 long pos = NUM2LONG(index);
6579 long len = RSTRING_LEN(str);
6580 char *ptr, *head, *left = 0;
6581 rb_encoding *enc;
6582 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6583
6584 if (pos < -len || len <= pos)
6585 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6586 if (pos < 0)
6587 pos += len;
6588
6589 VALUE v = rb_to_int(value);
6590 VALUE w = rb_int_and(v, INT2FIX(0xff));
6591 char byte = (char)(NUM2INT(w) & 0xFF);
6592
6593 if (!str_independent(str))
6594 str_make_independent(str);
6595 enc = STR_ENC_GET(str);
6596 head = RSTRING_PTR(str);
6597 ptr = &head[pos];
6598 if (!STR_EMBED_P(str)) {
6599 cr = ENC_CODERANGE(str);
6600 switch (cr) {
6601 case ENC_CODERANGE_7BIT:
6602 left = ptr;
6603 *ptr = byte;
6604 if (ISASCII(byte)) goto end;
6605 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6606 if (!MBCLEN_CHARFOUND_P(nlen))
6608 else
6610 goto end;
6612 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6613 width = rb_enc_precise_mbclen(left, head+len, enc);
6614 *ptr = byte;
6615 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6616 if (!MBCLEN_CHARFOUND_P(nlen))
6618 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6620 goto end;
6621 }
6622 }
6624 *ptr = byte;
6625
6626 end:
6627 return value;
6628}
6629
6630static VALUE
6631str_byte_substr(VALUE str, long beg, long len, int empty)
6632{
6633 long n = RSTRING_LEN(str);
6634
6635 if (beg > n || len < 0) return Qnil;
6636 if (beg < 0) {
6637 beg += n;
6638 if (beg < 0) return Qnil;
6639 }
6640 if (len > n - beg)
6641 len = n - beg;
6642 if (len <= 0) {
6643 if (!empty) return Qnil;
6644 len = 0;
6645 }
6646
6647 VALUE str2 = str_subseq(str, beg, len);
6648
6649 str_enc_copy_direct(str2, str);
6650
6651 if (RSTRING_LEN(str2) == 0) {
6652 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6654 else
6656 }
6657 else {
6658 switch (ENC_CODERANGE(str)) {
6659 case ENC_CODERANGE_7BIT:
6661 break;
6662 default:
6664 break;
6665 }
6666 }
6667
6668 return str2;
6669}
6670
6671VALUE
6672rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6673{
6674 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6675}
6676
6677static VALUE
6678str_byte_aref(VALUE str, VALUE indx)
6679{
6680 long idx;
6681 if (FIXNUM_P(indx)) {
6682 idx = FIX2LONG(indx);
6683 }
6684 else {
6685 /* check if indx is Range */
6686 long beg, len = RSTRING_LEN(str);
6687
6688 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6689 case Qfalse:
6690 break;
6691 case Qnil:
6692 return Qnil;
6693 default:
6694 return str_byte_substr(str, beg, len, TRUE);
6695 }
6696
6697 idx = NUM2LONG(indx);
6698 }
6699 return str_byte_substr(str, idx, 1, FALSE);
6700}
6701
6702/*
6703 * call-seq:
6704 * byteslice(index, length = 1) -> string or nil
6705 * byteslice(range) -> string or nil
6706 *
6707 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6708 *
6709 * With integer arguments +index+ and +length+ given,
6710 * returns the substring beginning at the given +index+
6711 * of the given +length+ (if possible),
6712 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6713 *
6714 * s = '0123456789' # => "0123456789"
6715 * s.byteslice(2) # => "2"
6716 * s.byteslice(200) # => nil
6717 * s.byteslice(4, 3) # => "456"
6718 * s.byteslice(4, 30) # => "456789"
6719 * s.byteslice(4, -1) # => nil
6720 * s.byteslice(40, 2) # => nil
6721 *
6722 * In either case above, counts backwards from the end of +self+
6723 * if +index+ is negative:
6724 *
6725 * s = '0123456789' # => "0123456789"
6726 * s.byteslice(-4) # => "6"
6727 * s.byteslice(-4, 3) # => "678"
6728 *
6729 * With Range argument +range+ given, returns
6730 * <tt>byteslice(range.begin, range.size)</tt>:
6731 *
6732 * s = '0123456789' # => "0123456789"
6733 * s.byteslice(4..6) # => "456"
6734 * s.byteslice(-6..-4) # => "456"
6735 * s.byteslice(5..2) # => "" # range.size is zero.
6736 * s.byteslice(40..42) # => nil
6737 *
6738 * In all cases, a returned string has the same encoding as +self+:
6739 *
6740 * s.encoding # => #<Encoding:UTF-8>
6741 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6742 *
6743 */
6744
6745static VALUE
6746rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6747{
6748 if (argc == 2) {
6749 long beg = NUM2LONG(argv[0]);
6750 long len = NUM2LONG(argv[1]);
6751 return str_byte_substr(str, beg, len, TRUE);
6752 }
6753 rb_check_arity(argc, 1, 2);
6754 return str_byte_aref(str, argv[0]);
6755}
6756
6757static void
6758str_check_beg_len(VALUE str, long *beg, long *len)
6759{
6760 long end, slen = RSTRING_LEN(str);
6761
6762 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6763 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6764 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6765 }
6766 if (*beg < 0) {
6767 *beg += slen;
6768 }
6769 RUBY_ASSERT(*beg >= 0);
6770 RUBY_ASSERT(*beg <= slen);
6771
6772 if (*len > slen - *beg) {
6773 *len = slen - *beg;
6774 }
6775 end = *beg + *len;
6776 str_ensure_byte_pos(str, *beg);
6777 str_ensure_byte_pos(str, end);
6778}
6779
6780/*
6781 * call-seq:
6782 * bytesplice(index, length, str) -> string
6783 * bytesplice(index, length, str, str_index, str_length) -> string
6784 * bytesplice(range, str) -> string
6785 * bytesplice(range, str, str_range) -> string
6786 *
6787 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6788 * The portion of the string affected is determined using
6789 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6790 * If the replacement string is not the same length as the text it is replacing,
6791 * the string will be adjusted accordingly.
6792 *
6793 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6794 *
6795 * The form that take an Integer will raise an IndexError if the value is out
6796 * of range; the Range form will raise a RangeError.
6797 * If the beginning or ending offset does not land on character (codepoint)
6798 * boundary, an IndexError will be raised.
6799 */
6800
6801static VALUE
6802rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6803{
6804 long beg, len, vbeg, vlen;
6805 VALUE val;
6806 int cr;
6807
6808 rb_check_arity(argc, 2, 5);
6809 if (!(argc == 2 || argc == 3 || argc == 5)) {
6810 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6811 }
6812 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6813 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6814 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6815 rb_builtin_class_name(argv[0]));
6816 }
6817 val = argv[1];
6818 StringValue(val);
6819 if (argc == 2) {
6820 /* bytesplice(range, str) */
6821 vbeg = 0;
6822 vlen = RSTRING_LEN(val);
6823 }
6824 else {
6825 /* bytesplice(range, str, str_range) */
6826 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6827 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6828 rb_builtin_class_name(argv[2]));
6829 }
6830 }
6831 }
6832 else {
6833 beg = NUM2LONG(argv[0]);
6834 len = NUM2LONG(argv[1]);
6835 val = argv[2];
6836 StringValue(val);
6837 if (argc == 3) {
6838 /* bytesplice(index, length, str) */
6839 vbeg = 0;
6840 vlen = RSTRING_LEN(val);
6841 }
6842 else {
6843 /* bytesplice(index, length, str, str_index, str_length) */
6844 vbeg = NUM2LONG(argv[3]);
6845 vlen = NUM2LONG(argv[4]);
6846 }
6847 }
6848 str_check_beg_len(str, &beg, &len);
6849 str_check_beg_len(val, &vbeg, &vlen);
6850 str_modify_keep_cr(str);
6851
6852 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6853 rb_enc_associate(str, rb_enc_check(str, val));
6854 }
6855
6856 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6858 if (cr != ENC_CODERANGE_BROKEN)
6859 ENC_CODERANGE_SET(str, cr);
6860 return str;
6861}
6862
6863/*
6864 * call-seq:
6865 * reverse -> string
6866 *
6867 * Returns a new string with the characters from +self+ in reverse order.
6868 *
6869 * 'stressed'.reverse # => "desserts"
6870 *
6871 */
6872
6873static VALUE
6874rb_str_reverse(VALUE str)
6875{
6876 rb_encoding *enc;
6877 VALUE rev;
6878 char *s, *e, *p;
6879 int cr;
6880
6881 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6882 enc = STR_ENC_GET(str);
6883 rev = rb_str_new(0, RSTRING_LEN(str));
6884 s = RSTRING_PTR(str); e = RSTRING_END(str);
6885 p = RSTRING_END(rev);
6886 cr = ENC_CODERANGE(str);
6887
6888 if (RSTRING_LEN(str) > 1) {
6889 if (single_byte_optimizable(str)) {
6890 while (s < e) {
6891 *--p = *s++;
6892 }
6893 }
6894 else if (cr == ENC_CODERANGE_VALID) {
6895 while (s < e) {
6896 int clen = rb_enc_fast_mbclen(s, e, enc);
6897
6898 p -= clen;
6899 memcpy(p, s, clen);
6900 s += clen;
6901 }
6902 }
6903 else {
6904 cr = rb_enc_asciicompat(enc) ?
6906 while (s < e) {
6907 int clen = rb_enc_mbclen(s, e, enc);
6908
6909 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6910 p -= clen;
6911 memcpy(p, s, clen);
6912 s += clen;
6913 }
6914 }
6915 }
6916 STR_SET_LEN(rev, RSTRING_LEN(str));
6917 str_enc_copy_direct(rev, str);
6918 ENC_CODERANGE_SET(rev, cr);
6919
6920 return rev;
6921}
6922
6923
6924/*
6925 * call-seq:
6926 * reverse! -> self
6927 *
6928 * Returns +self+ with its characters reversed:
6929 *
6930 * s = 'stressed'
6931 * s.reverse! # => "desserts"
6932 * s # => "desserts"
6933 *
6934 */
6935
6936static VALUE
6937rb_str_reverse_bang(VALUE str)
6938{
6939 if (RSTRING_LEN(str) > 1) {
6940 if (single_byte_optimizable(str)) {
6941 char *s, *e, c;
6942
6943 str_modify_keep_cr(str);
6944 s = RSTRING_PTR(str);
6945 e = RSTRING_END(str) - 1;
6946 while (s < e) {
6947 c = *s;
6948 *s++ = *e;
6949 *e-- = c;
6950 }
6951 }
6952 else {
6953 str_shared_replace(str, rb_str_reverse(str));
6954 }
6955 }
6956 else {
6957 str_modify_keep_cr(str);
6958 }
6959 return str;
6960}
6961
6962
6963/*
6964 * call-seq:
6965 * include?(other_string) -> true or false
6966 *
6967 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6968 *
6969 * s = 'foo'
6970 * s.include?('f') # => true
6971 * s.include?('fo') # => true
6972 * s.include?('food') # => false
6973 *
6974 */
6975
6976VALUE
6977rb_str_include(VALUE str, VALUE arg)
6978{
6979 long i;
6980
6981 StringValue(arg);
6982 i = rb_str_index(str, arg, 0);
6983
6984 return RBOOL(i != -1);
6985}
6986
6987
6988/*
6989 * call-seq:
6990 * to_i(base = 10) -> integer
6991 *
6992 * Returns the result of interpreting leading characters in +self+
6993 * as an integer in the given +base+ (which must be in (0, 2..36)):
6994 *
6995 * '123456'.to_i # => 123456
6996 * '123def'.to_i(16) # => 1195503
6997 *
6998 * With +base+ zero, string +object+ may contain leading characters
6999 * to specify the actual base:
7000 *
7001 * '123def'.to_i(0) # => 123
7002 * '0123def'.to_i(0) # => 83
7003 * '0b123def'.to_i(0) # => 1
7004 * '0o123def'.to_i(0) # => 83
7005 * '0d123def'.to_i(0) # => 123
7006 * '0x123def'.to_i(0) # => 1195503
7007 *
7008 * Characters past a leading valid number (in the given +base+) are ignored:
7009 *
7010 * '12.345'.to_i # => 12
7011 * '12345'.to_i(2) # => 1
7012 *
7013 * Returns zero if there is no leading valid number:
7014 *
7015 * 'abcdef'.to_i # => 0
7016 * '2'.to_i(2) # => 0
7017 *
7018 */
7019
7020static VALUE
7021rb_str_to_i(int argc, VALUE *argv, VALUE str)
7022{
7023 int base = 10;
7024
7025 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7026 rb_raise(rb_eArgError, "invalid radix %d", base);
7027 }
7028 return rb_str_to_inum(str, base, FALSE);
7029}
7030
7031
7032/*
7033 * call-seq:
7034 * to_f -> float
7035 *
7036 * Returns the result of interpreting leading characters in +self+ as a Float:
7037 *
7038 * '3.14159'.to_f # => 3.14159
7039 * '1.234e-2'.to_f # => 0.01234
7040 *
7041 * Characters past a leading valid number (in the given +base+) are ignored:
7042 *
7043 * '3.14 (pi to two places)'.to_f # => 3.14
7044 *
7045 * Returns zero if there is no leading valid number:
7046 *
7047 * 'abcdef'.to_f # => 0.0
7048 *
7049 */
7050
7051static VALUE
7052rb_str_to_f(VALUE str)
7053{
7054 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7055}
7056
7057
7058/*
7059 * call-seq:
7060 * to_s -> self or string
7061 *
7062 * Returns +self+ if +self+ is a +String+,
7063 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7064 */
7065
7066static VALUE
7067rb_str_to_s(VALUE str)
7068{
7069 if (rb_obj_class(str) != rb_cString) {
7070 return str_duplicate(rb_cString, str);
7071 }
7072 return str;
7073}
7074
#if 0
/* Compiled out.  Encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7086
7087#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7088
7089int
7090rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7091{
7092 char buf[CHAR_ESC_LEN + 1];
7093 int l;
7094
7095#if SIZEOF_INT > 4
7096 c &= 0xffffffff;
7097#endif
7098 if (unicode_p) {
7099 if (c < 0x7F && ISPRINT(c)) {
7100 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7101 }
7102 else if (c < 0x10000) {
7103 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7104 }
7105 else {
7106 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7107 }
7108 }
7109 else {
7110 if (c < 0x100) {
7111 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7112 }
7113 else {
7114 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7115 }
7116 }
7117 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7118 rb_str_buf_cat(result, buf, l);
7119 return l;
7120}
7121
/*
 * Returns the backslash escape sequence used for control character +c+
 * as a static string, or NULL when +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].escape;
        }
    }
    return NULL;
}
7139
7140VALUE
7141rb_str_escape(VALUE str)
7142{
7143 int encidx = ENCODING_GET(str);
7144 rb_encoding *enc = rb_enc_from_index(encidx);
7145 const char *p = RSTRING_PTR(str);
7146 const char *pend = RSTRING_END(str);
7147 const char *prev = p;
7148 char buf[CHAR_ESC_LEN + 1];
7149 VALUE result = rb_str_buf_new(0);
7150 int unicode_p = rb_enc_unicode_p(enc);
7151 int asciicompat = rb_enc_asciicompat(enc);
7152
7153 while (p < pend) {
7154 unsigned int c;
7155 const char *cc;
7156 int n = rb_enc_precise_mbclen(p, pend, enc);
7157 if (!MBCLEN_CHARFOUND_P(n)) {
7158 if (p > prev) str_buf_cat(result, prev, p - prev);
7159 n = rb_enc_mbminlen(enc);
7160 if (pend < p + n)
7161 n = (int)(pend - p);
7162 while (n--) {
7163 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7164 str_buf_cat(result, buf, strlen(buf));
7165 prev = ++p;
7166 }
7167 continue;
7168 }
7169 n = MBCLEN_CHARFOUND_LEN(n);
7170 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7171 p += n;
7172 cc = ruby_escaped_char(c);
7173 if (cc) {
7174 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7175 str_buf_cat(result, cc, strlen(cc));
7176 prev = p;
7177 }
7178 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7179 }
7180 else {
7181 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7182 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7183 prev = p;
7184 }
7185 }
7186 if (p > prev) str_buf_cat(result, prev, p - prev);
7188
7189 return result;
7190}
7191
7192/*
7193 * call-seq:
7194 * inspect -> string
7195 *
7196 * Returns a printable version of +self+, enclosed in double-quotes,
7197 * and with special characters escaped:
7198 *
7199 * s = "foo\tbar\tbaz\n"
7200 * s.inspect
7201 * # => "\"foo\\tbar\\tbaz\\n\""
7202 *
7203 */
7204
7205VALUE
7207{
7208 int encidx = ENCODING_GET(str);
7209 rb_encoding *enc = rb_enc_from_index(encidx);
7210 const char *p, *pend, *prev;
7211 char buf[CHAR_ESC_LEN + 1];
7212 VALUE result = rb_str_buf_new(0);
7213 rb_encoding *resenc = rb_default_internal_encoding();
7214 int unicode_p = rb_enc_unicode_p(enc);
7215 int asciicompat = rb_enc_asciicompat(enc);
7216
7217 if (resenc == NULL) resenc = rb_default_external_encoding();
7218 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7219 rb_enc_associate(result, resenc);
7220 str_buf_cat2(result, "\"");
7221
7222 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7223 prev = p;
7224 while (p < pend) {
7225 unsigned int c, cc;
7226 int n;
7227
7228 n = rb_enc_precise_mbclen(p, pend, enc);
7229 if (!MBCLEN_CHARFOUND_P(n)) {
7230 if (p > prev) str_buf_cat(result, prev, p - prev);
7231 n = rb_enc_mbminlen(enc);
7232 if (pend < p + n)
7233 n = (int)(pend - p);
7234 while (n--) {
7235 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7236 str_buf_cat(result, buf, strlen(buf));
7237 prev = ++p;
7238 }
7239 continue;
7240 }
7241 n = MBCLEN_CHARFOUND_LEN(n);
7242 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7243 p += n;
7244 if ((asciicompat || unicode_p) &&
7245 (c == '"'|| c == '\\' ||
7246 (c == '#' &&
7247 p < pend &&
7248 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7249 (cc = rb_enc_codepoint(p,pend,enc),
7250 (cc == '$' || cc == '@' || cc == '{'))))) {
7251 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7252 str_buf_cat2(result, "\\");
7253 if (asciicompat || enc == resenc) {
7254 prev = p - n;
7255 continue;
7256 }
7257 }
7258 switch (c) {
7259 case '\n': cc = 'n'; break;
7260 case '\r': cc = 'r'; break;
7261 case '\t': cc = 't'; break;
7262 case '\f': cc = 'f'; break;
7263 case '\013': cc = 'v'; break;
7264 case '\010': cc = 'b'; break;
7265 case '\007': cc = 'a'; break;
7266 case 033: cc = 'e'; break;
7267 default: cc = 0; break;
7268 }
7269 if (cc) {
7270 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7271 buf[0] = '\\';
7272 buf[1] = (char)cc;
7273 str_buf_cat(result, buf, 2);
7274 prev = p;
7275 continue;
7276 }
7277 /* The special casing of 0x85 (NEXT_LINE) here is because
7278 * Oniguruma historically treats it as printable, but it
7279 * doesn't match the print POSIX bracket class or character
7280 * property in regexps.
7281 *
7282 * See Ruby Bug #16842 for details:
7283 * https://bugs.ruby-lang.org/issues/16842
7284 */
7285 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7286 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7287 continue;
7288 }
7289 else {
7290 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7291 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7292 prev = p;
7293 continue;
7294 }
7295 }
7296 if (p > prev) str_buf_cat(result, prev, p - prev);
7297 str_buf_cat2(result, "\"");
7298
7299 return result;
7300}
7301
/* True when p lies before e and the byte at p is '$', '@' or '{'. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7303
7304/*
7305 * call-seq:
7306 * dump -> string
7307 *
7308 * Returns a printable version of +self+, enclosed in double-quotes,
7309 * with special characters escaped, and with non-printing characters
7310 * replaced by hexadecimal notation:
7311 *
7312 * "hello \n ''".dump # => "\"hello \\n ''\""
7313 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7314 *
7315 * Related: String#undump (inverse of String#dump).
7316 *
7317 */
7318
7319VALUE
7321{
7322 int encidx = rb_enc_get_index(str);
7323 rb_encoding *enc = rb_enc_from_index(encidx);
7324 long len;
7325 const char *p, *pend;
7326 char *q, *qend;
7327 VALUE result;
7328 int u8 = (encidx == rb_utf8_encindex());
7329 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7330
7331 len = 2; /* "" */
7332 if (!rb_enc_asciicompat(enc)) {
7333 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7334 len += strlen(enc->name);
7335 }
7336
7337 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7338 while (p < pend) {
7339 int clen;
7340 unsigned char c = *p++;
7341
7342 switch (c) {
7343 case '"': case '\\':
7344 case '\n': case '\r':
7345 case '\t': case '\f':
7346 case '\013': case '\010': case '\007': case '\033':
7347 clen = 2;
7348 break;
7349
7350 case '#':
7351 clen = IS_EVSTR(p, pend) ? 2 : 1;
7352 break;
7353
7354 default:
7355 if (ISPRINT(c)) {
7356 clen = 1;
7357 }
7358 else {
7359 if (u8 && c > 0x7F) { /* \u notation */
7360 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7361 if (MBCLEN_CHARFOUND_P(n)) {
7362 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7363 if (cc <= 0xFFFF)
7364 clen = 6; /* \uXXXX */
7365 else if (cc <= 0xFFFFF)
7366 clen = 9; /* \u{XXXXX} */
7367 else
7368 clen = 10; /* \u{XXXXXX} */
7369 p += MBCLEN_CHARFOUND_LEN(n)-1;
7370 break;
7371 }
7372 }
7373 clen = 4; /* \xNN */
7374 }
7375 break;
7376 }
7377
7378 if (clen > LONG_MAX - len) {
7379 rb_raise(rb_eRuntimeError, "string size too big");
7380 }
7381 len += clen;
7382 }
7383
7384 result = rb_str_new(0, len);
7385 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7386 q = RSTRING_PTR(result); qend = q + len + 1;
7387
7388 *q++ = '"';
7389 while (p < pend) {
7390 unsigned char c = *p++;
7391
7392 if (c == '"' || c == '\\') {
7393 *q++ = '\\';
7394 *q++ = c;
7395 }
7396 else if (c == '#') {
7397 if (IS_EVSTR(p, pend)) *q++ = '\\';
7398 *q++ = '#';
7399 }
7400 else if (c == '\n') {
7401 *q++ = '\\';
7402 *q++ = 'n';
7403 }
7404 else if (c == '\r') {
7405 *q++ = '\\';
7406 *q++ = 'r';
7407 }
7408 else if (c == '\t') {
7409 *q++ = '\\';
7410 *q++ = 't';
7411 }
7412 else if (c == '\f') {
7413 *q++ = '\\';
7414 *q++ = 'f';
7415 }
7416 else if (c == '\013') {
7417 *q++ = '\\';
7418 *q++ = 'v';
7419 }
7420 else if (c == '\010') {
7421 *q++ = '\\';
7422 *q++ = 'b';
7423 }
7424 else if (c == '\007') {
7425 *q++ = '\\';
7426 *q++ = 'a';
7427 }
7428 else if (c == '\033') {
7429 *q++ = '\\';
7430 *q++ = 'e';
7431 }
7432 else if (ISPRINT(c)) {
7433 *q++ = c;
7434 }
7435 else {
7436 *q++ = '\\';
7437 if (u8) {
7438 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7439 if (MBCLEN_CHARFOUND_P(n)) {
7440 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7441 p += n;
7442 if (cc <= 0xFFFF)
7443 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7444 else
7445 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7446 q += strlen(q);
7447 continue;
7448 }
7449 }
7450 snprintf(q, qend-q, "x%02X", c);
7451 q += 3;
7452 }
7453 }
7454 *q++ = '"';
7455 *q = '\0';
7456 if (!rb_enc_asciicompat(enc)) {
7457 snprintf(q, qend-q, nonascii_suffix, enc->name);
7458 encidx = rb_ascii8bit_encindex();
7459 }
7460 /* result from dump is ASCII */
7461 rb_enc_associate_index(result, encidx);
7463 return result;
7464}
7465
/*
 * Maps the letter of a one-character backslash escape (the 'n' of "\n")
 * to the control character it stands for.
 *
 * c: one of 'n', 'r', 't', 'f', 'v', 'b', 'a', 'e'.
 *
 * Returns the control character, or -1 when c is not one of those letters.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* never fall off the end of a non-void function */
        return -1;
    }
}
7489
7490static void
7491undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7492{
7493 const char *s = *ss;
7494 unsigned int c;
7495 int codelen;
7496 size_t hexlen;
7497 unsigned char buf[6];
7498 static rb_encoding *enc_utf8 = NULL;
7499
7500 switch (*s) {
7501 case '\\':
7502 case '"':
7503 case '#':
7504 rb_str_cat(undumped, s, 1); /* cat itself */
7505 s++;
7506 break;
7507 case 'n':
7508 case 'r':
7509 case 't':
7510 case 'f':
7511 case 'v':
7512 case 'b':
7513 case 'a':
7514 case 'e':
7515 *buf = unescape_ascii(*s);
7516 rb_str_cat(undumped, (char *)buf, 1);
7517 s++;
7518 break;
7519 case 'u':
7520 if (*binary) {
7521 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7522 }
7523 *utf8 = true;
7524 if (++s >= s_end) {
7525 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7526 }
7527 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7528 if (*penc != enc_utf8) {
7529 *penc = enc_utf8;
7530 rb_enc_associate(undumped, enc_utf8);
7531 }
7532 if (*s == '{') { /* handle \u{...} form */
7533 s++;
7534 for (;;) {
7535 if (s >= s_end) {
7536 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7537 }
7538 if (*s == '}') {
7539 s++;
7540 break;
7541 }
7542 if (ISSPACE(*s)) {
7543 s++;
7544 continue;
7545 }
7546 c = scan_hex(s, s_end-s, &hexlen);
7547 if (hexlen == 0 || hexlen > 6) {
7548 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7549 }
7550 if (c > 0x10ffff) {
7551 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7552 }
7553 if (0xd800 <= c && c <= 0xdfff) {
7554 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7555 }
7556 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7557 rb_str_cat(undumped, (char *)buf, codelen);
7558 s += hexlen;
7559 }
7560 }
7561 else { /* handle \uXXXX form */
7562 c = scan_hex(s, 4, &hexlen);
7563 if (hexlen != 4) {
7564 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7565 }
7566 if (0xd800 <= c && c <= 0xdfff) {
7567 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7568 }
7569 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7570 rb_str_cat(undumped, (char *)buf, codelen);
7571 s += hexlen;
7572 }
7573 break;
7574 case 'x':
7575 if (*utf8) {
7576 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7577 }
7578 *binary = true;
7579 if (++s >= s_end) {
7580 rb_raise(rb_eRuntimeError, "invalid hex escape");
7581 }
7582 *buf = scan_hex(s, 2, &hexlen);
7583 if (hexlen != 2) {
7584 rb_raise(rb_eRuntimeError, "invalid hex escape");
7585 }
7586 rb_str_cat(undumped, (char *)buf, 1);
7587 s += hexlen;
7588 break;
7589 default:
7590 rb_str_cat(undumped, s-1, 2);
7591 s++;
7592 }
7593
7594 *ss = s;
7595}
7596
7597static VALUE rb_str_is_ascii_only_p(VALUE str);
7598
7599/*
7600 * call-seq:
7601 * undump -> string
7602 *
7603 * Returns an unescaped version of +self+:
7604 *
7605 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7606 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7607 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7608 * s_undumped == s_orig # => true
7609 *
7610 * Related: String#dump (inverse of String#undump).
7611 *
7612 */
7613
7614static VALUE
7615str_undump(VALUE str)
7616{
7617 const char *s = RSTRING_PTR(str);
7618 const char *s_end = RSTRING_END(str);
7619 rb_encoding *enc = rb_enc_get(str);
7620 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7621 bool utf8 = false;
7622 bool binary = false;
7623 int w;
7624
7626 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7627 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7628 }
7629 if (!str_null_check(str, &w)) {
7630 rb_raise(rb_eRuntimeError, "string contains null byte");
7631 }
7632 if (RSTRING_LEN(str) < 2) goto invalid_format;
7633 if (*s != '"') goto invalid_format;
7634
7635 /* strip '"' at the start */
7636 s++;
7637
7638 for (;;) {
7639 if (s >= s_end) {
7640 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7641 }
7642
7643 if (*s == '"') {
7644 /* epilogue */
7645 s++;
7646 if (s == s_end) {
7647 /* ascii compatible dumped string */
7648 break;
7649 }
7650 else {
7651 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7652 static const char dup_suffix[] = ".dup";
7653 const char *encname;
7654 int encidx;
7655 ptrdiff_t size;
7656
7657 /* check separately for strings dumped by older versions */
7658 size = sizeof(dup_suffix) - 1;
7659 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7660
7661 size = sizeof(force_encoding_suffix) - 1;
7662 if (s_end - s <= size) goto invalid_format;
7663 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7664 s += size;
7665
7666 if (utf8) {
7667 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7668 }
7669
7670 encname = s;
7671 s = memchr(s, '"', s_end-s);
7672 size = s - encname;
7673 if (!s) goto invalid_format;
7674 if (s_end - s != 2) goto invalid_format;
7675 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7676
7677 encidx = rb_enc_find_index2(encname, (long)size);
7678 if (encidx < 0) {
7679 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7680 }
7681 rb_enc_associate_index(undumped, encidx);
7682 }
7683 break;
7684 }
7685
7686 if (*s == '\\') {
7687 s++;
7688 if (s >= s_end) {
7689 rb_raise(rb_eRuntimeError, "invalid escape");
7690 }
7691 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7692 }
7693 else {
7694 rb_str_cat(undumped, s++, 1);
7695 }
7696 }
7697
7698 RB_GC_GUARD(str);
7699
7700 return undumped;
7701invalid_format:
7702 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7703}
7704
7705static void
7706rb_str_check_dummy_enc(rb_encoding *enc)
7707{
7708 if (rb_enc_dummy_p(enc)) {
7709 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7710 rb_enc_name(enc));
7711 }
7712}
7713
7714static rb_encoding *
7715str_true_enc(VALUE str)
7716{
7717 rb_encoding *enc = STR_ENC_GET(str);
7718 rb_str_check_dummy_enc(enc);
7719 return enc;
7720}
7721
7722static OnigCaseFoldType
7723check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7724{
7725 if (argc==0)
7726 return flags;
7727 if (argc>2)
7728 rb_raise(rb_eArgError, "too many options");
7729 if (argv[0]==sym_turkic) {
7730 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7731 if (argc==2) {
7732 if (argv[1]==sym_lithuanian)
7733 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7734 else
7735 rb_raise(rb_eArgError, "invalid second option");
7736 }
7737 }
7738 else if (argv[0]==sym_lithuanian) {
7739 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7740 if (argc==2) {
7741 if (argv[1]==sym_turkic)
7742 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7743 else
7744 rb_raise(rb_eArgError, "invalid second option");
7745 }
7746 }
7747 else if (argc>1)
7748 rb_raise(rb_eArgError, "too many options");
7749 else if (argv[0]==sym_ascii)
7750 flags |= ONIGENC_CASE_ASCII_ONLY;
7751 else if (argv[0]==sym_fold) {
7752 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7753 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7754 else
7755 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7756 }
7757 else
7758 rb_raise(rb_eArgError, "invalid option");
7759 return flags;
7760}
7761
7762static inline bool
7763case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7764{
7765 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7766 return true;
7767 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7768}
7769
7770/* 16 should be long enough to absorb any kind of single character length increase */
7771#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7772#ifndef CASEMAP_DEBUG
7773# define CASEMAP_DEBUG 0
7774#endif
7775
7776struct mapping_buffer;
7777typedef struct mapping_buffer {
7778 size_t capa;
7779 size_t used;
7780 struct mapping_buffer *next;
7781 OnigUChar space[FLEX_ARY_LEN];
7783
7784static void
7785mapping_buffer_free(void *p)
7786{
7787 mapping_buffer *previous_buffer;
7788 mapping_buffer *current_buffer = p;
7789 while (current_buffer) {
7790 previous_buffer = current_buffer;
7791 current_buffer = current_buffer->next;
7792 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7793 }
7794}
7795
7796static const rb_data_type_t mapping_buffer_type = {
7797 "mapping_buffer",
7798 {0, mapping_buffer_free,},
7799 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7800};
7801
7802static VALUE
7803rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7804{
7805 VALUE target;
7806
7807 const OnigUChar *source_current, *source_end;
7808 int target_length = 0;
7809 VALUE buffer_anchor;
7810 mapping_buffer *current_buffer = 0;
7811 mapping_buffer **pre_buffer;
7812 size_t buffer_count = 0;
7813 int buffer_length_or_invalid;
7814
7815 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7816
7817 source_current = (OnigUChar*)RSTRING_PTR(source);
7818 source_end = (OnigUChar*)RSTRING_END(source);
7819
7820 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7821 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7822 while (source_current < source_end) {
7823 /* increase multiplier using buffer count to converge quickly */
7824 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7825 if (CASEMAP_DEBUG) {
7826 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7827 }
7828 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7829 *pre_buffer = current_buffer;
7830 pre_buffer = &current_buffer->next;
7831 current_buffer->next = NULL;
7832 current_buffer->capa = capa;
7833 buffer_length_or_invalid = enc->case_map(flags,
7834 &source_current, source_end,
7835 current_buffer->space,
7836 current_buffer->space+current_buffer->capa,
7837 enc);
7838 if (buffer_length_or_invalid < 0) {
7839 current_buffer = DATA_PTR(buffer_anchor);
7840 DATA_PTR(buffer_anchor) = 0;
7841 mapping_buffer_free(current_buffer);
7842 rb_raise(rb_eArgError, "input string invalid");
7843 }
7844 target_length += current_buffer->used = buffer_length_or_invalid;
7845 }
7846 if (CASEMAP_DEBUG) {
7847 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7848 }
7849
7850 if (buffer_count==1) {
7851 target = rb_str_new((const char*)current_buffer->space, target_length);
7852 }
7853 else {
7854 char *target_current;
7855
7856 target = rb_str_new(0, target_length);
7857 target_current = RSTRING_PTR(target);
7858 current_buffer = DATA_PTR(buffer_anchor);
7859 while (current_buffer) {
7860 memcpy(target_current, current_buffer->space, current_buffer->used);
7861 target_current += current_buffer->used;
7862 current_buffer = current_buffer->next;
7863 }
7864 }
7865 current_buffer = DATA_PTR(buffer_anchor);
7866 DATA_PTR(buffer_anchor) = 0;
7867 mapping_buffer_free(current_buffer);
7868
7869 RB_GC_GUARD(buffer_anchor);
7870
7871 /* TODO: check about string terminator character */
7872 str_enc_copy_direct(target, source);
7873 /*ENC_CODERANGE_SET(mapped, cr);*/
7874
7875 return target;
7876}
7877
7878static VALUE
7879rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7880{
7881 const OnigUChar *source_current, *source_end;
7882 OnigUChar *target_current, *target_end;
7883 long old_length = RSTRING_LEN(source);
7884 int length_or_invalid;
7885
7886 if (old_length == 0) return Qnil;
7887
7888 source_current = (OnigUChar*)RSTRING_PTR(source);
7889 source_end = (OnigUChar*)RSTRING_END(source);
7890 if (source == target) {
7891 target_current = (OnigUChar*)source_current;
7892 target_end = (OnigUChar*)source_end;
7893 }
7894 else {
7895 target_current = (OnigUChar*)RSTRING_PTR(target);
7896 target_end = (OnigUChar*)RSTRING_END(target);
7897 }
7898
7899 length_or_invalid = onigenc_ascii_only_case_map(flags,
7900 &source_current, source_end,
7901 target_current, target_end, enc);
7902 if (length_or_invalid < 0)
7903 rb_raise(rb_eArgError, "input string invalid");
7904 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7905 fprintf(stderr, "problem with rb_str_ascii_casemap"
7906 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7907 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7908 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7909 }
7910
7911 str_enc_copy(target, source);
7912
7913 return target;
7914}
7915
7916static bool
7917upcase_single(VALUE str)
7918{
7919 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7920 bool modified = false;
7921
7922 while (s < send) {
7923 unsigned int c = *(unsigned char*)s;
7924
7925 if ('a' <= c && c <= 'z') {
7926 *s = 'A' + (c - 'a');
7927 modified = true;
7928 }
7929 s++;
7930 }
7931 return modified;
7932}
7933
7934/*
7935 * call-seq:
7936 * upcase!(*options) -> self or nil
7937 *
7938 * Upcases the characters in +self+;
7939 * returns +self+ if any changes were made, +nil+ otherwise:
7940 *
7941 * s = 'Hello World!' # => "Hello World!"
7942 * s.upcase! # => "HELLO WORLD!"
7943 * s # => "HELLO WORLD!"
7944 * s.upcase! # => nil
7945 *
7946 * The casing may be affected by the given +options+;
7947 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7948 *
7949 * Related: String#upcase, String#downcase, String#downcase!.
7950 *
7951 */
7952
7953static VALUE
7954rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7955{
7956 rb_encoding *enc;
7957 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7958
7959 flags = check_case_options(argc, argv, flags);
7960 str_modify_keep_cr(str);
7961 enc = str_true_enc(str);
7962 if (case_option_single_p(flags, enc, str)) {
7963 if (upcase_single(str))
7964 flags |= ONIGENC_CASE_MODIFIED;
7965 }
7966 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7967 rb_str_ascii_casemap(str, str, &flags, enc);
7968 else
7969 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7970
7971 if (ONIGENC_CASE_MODIFIED&flags) return str;
7972 return Qnil;
7973}
7974
7975
7976/*
7977 * call-seq:
7978 * upcase(*options) -> string
7979 *
7980 * Returns a string containing the upcased characters in +self+:
7981 *
7982 * s = 'Hello World!' # => "Hello World!"
7983 * s.upcase # => "HELLO WORLD!"
7984 *
7985 * The casing may be affected by the given +options+;
7986 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7987 *
7988 * Related: String#upcase!, String#downcase, String#downcase!.
7989 *
7990 */
7991
7992static VALUE
7993rb_str_upcase(int argc, VALUE *argv, VALUE str)
7994{
7995 rb_encoding *enc;
7996 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7997 VALUE ret;
7998
7999 flags = check_case_options(argc, argv, flags);
8000 enc = str_true_enc(str);
8001 if (case_option_single_p(flags, enc, str)) {
8002 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8003 str_enc_copy_direct(ret, str);
8004 upcase_single(ret);
8005 }
8006 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8007 ret = rb_str_new(0, RSTRING_LEN(str));
8008 rb_str_ascii_casemap(str, ret, &flags, enc);
8009 }
8010 else {
8011 ret = rb_str_casemap(str, &flags, enc);
8012 }
8013
8014 return ret;
8015}
8016
8017static bool
8018downcase_single(VALUE str)
8019{
8020 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8021 bool modified = false;
8022
8023 while (s < send) {
8024 unsigned int c = *(unsigned char*)s;
8025
8026 if ('A' <= c && c <= 'Z') {
8027 *s = 'a' + (c - 'A');
8028 modified = true;
8029 }
8030 s++;
8031 }
8032
8033 return modified;
8034}
8035
8036/*
8037 * call-seq:
8038 * downcase!(*options) -> self or nil
8039 *
8040 * Downcases the characters in +self+;
8041 * returns +self+ if any changes were made, +nil+ otherwise:
8042 *
8043 * s = 'Hello World!' # => "Hello World!"
8044 * s.downcase! # => "hello world!"
8045 * s # => "hello world!"
8046 * s.downcase! # => nil
8047 *
8048 * The casing may be affected by the given +options+;
8049 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8050 *
8051 * Related: String#downcase, String#upcase, String#upcase!.
8052 *
8053 */
8054
8055static VALUE
8056rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8057{
8058 rb_encoding *enc;
8059 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8060
8061 flags = check_case_options(argc, argv, flags);
8062 str_modify_keep_cr(str);
8063 enc = str_true_enc(str);
8064 if (case_option_single_p(flags, enc, str)) {
8065 if (downcase_single(str))
8066 flags |= ONIGENC_CASE_MODIFIED;
8067 }
8068 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8069 rb_str_ascii_casemap(str, str, &flags, enc);
8070 else
8071 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8072
8073 if (ONIGENC_CASE_MODIFIED&flags) return str;
8074 return Qnil;
8075}
8076
8077
8078/*
8079 * call-seq:
8080 * downcase(*options) -> string
8081 *
8082 * Returns a string containing the downcased characters in +self+:
8083 *
8084 * s = 'Hello World!' # => "Hello World!"
8085 * s.downcase # => "hello world!"
8086 *
8087 * The casing may be affected by the given +options+;
8088 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8089 *
8090 * Related: String#downcase!, String#upcase, String#upcase!.
8091 *
8092 */
8093
8094static VALUE
8095rb_str_downcase(int argc, VALUE *argv, VALUE str)
8096{
8097 rb_encoding *enc;
8098 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8099 VALUE ret;
8100
8101 flags = check_case_options(argc, argv, flags);
8102 enc = str_true_enc(str);
8103 if (case_option_single_p(flags, enc, str)) {
8104 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8105 str_enc_copy_direct(ret, str);
8106 downcase_single(ret);
8107 }
8108 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8109 ret = rb_str_new(0, RSTRING_LEN(str));
8110 rb_str_ascii_casemap(str, ret, &flags, enc);
8111 }
8112 else {
8113 ret = rb_str_casemap(str, &flags, enc);
8114 }
8115
8116 return ret;
8117}
8118
8119
8120/*
8121 * call-seq:
8122 * capitalize!(*options) -> self or nil
8123 *
8124 * Upcases the first character in +self+;
8125 * downcases the remaining characters;
8126 * returns +self+ if any changes were made, +nil+ otherwise:
8127 *
8128 * s = 'hello World!' # => "hello World!"
8129 * s.capitalize! # => "Hello world!"
8130 * s # => "Hello world!"
8131 * s.capitalize! # => nil
8132 *
8133 * The casing may be affected by the given +options+;
8134 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8135 *
8136 * Related: String#capitalize.
8137 *
8138 */
8139
8140static VALUE
8141rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8142{
8143 rb_encoding *enc;
8144 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8145
8146 flags = check_case_options(argc, argv, flags);
8147 str_modify_keep_cr(str);
8148 enc = str_true_enc(str);
8149 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8150 if (flags&ONIGENC_CASE_ASCII_ONLY)
8151 rb_str_ascii_casemap(str, str, &flags, enc);
8152 else
8153 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8154
8155 if (ONIGENC_CASE_MODIFIED&flags) return str;
8156 return Qnil;
8157}
8158
8159
8160/*
8161 * call-seq:
8162 * capitalize(*options) -> string
8163 *
8164 * Returns a string containing the characters in +self+;
8165 * the first character is upcased;
8166 * the remaining characters are downcased:
8167 *
8168 * s = 'hello World!' # => "hello World!"
8169 * s.capitalize # => "Hello world!"
8170 *
8171 * The casing may be affected by the given +options+;
8172 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8173 *
8174 * Related: String#capitalize!.
8175 *
8176 */
8177
8178static VALUE
8179rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8180{
8181 rb_encoding *enc;
8182 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8183 VALUE ret;
8184
8185 flags = check_case_options(argc, argv, flags);
8186 enc = str_true_enc(str);
8187 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8188 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8189 ret = rb_str_new(0, RSTRING_LEN(str));
8190 rb_str_ascii_casemap(str, ret, &flags, enc);
8191 }
8192 else {
8193 ret = rb_str_casemap(str, &flags, enc);
8194 }
8195 return ret;
8196}
8197
8198
8199/*
8200 * call-seq:
8201 * swapcase!(*options) -> self or nil
8202 *
8203 * Upcases each lowercase character in +self+;
8204 * downcases uppercase character;
8205 * returns +self+ if any changes were made, +nil+ otherwise:
8206 *
8207 * s = 'Hello World!' # => "Hello World!"
8208 * s.swapcase! # => "hELLO wORLD!"
8209 * s # => "hELLO wORLD!"
8210 * ''.swapcase! # => nil
8211 *
8212 * The casing may be affected by the given +options+;
8213 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8214 *
8215 * Related: String#swapcase.
8216 *
8217 */
8218
8219static VALUE
8220rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8221{
8222 rb_encoding *enc;
8223 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8224
8225 flags = check_case_options(argc, argv, flags);
8226 str_modify_keep_cr(str);
8227 enc = str_true_enc(str);
8228 if (flags&ONIGENC_CASE_ASCII_ONLY)
8229 rb_str_ascii_casemap(str, str, &flags, enc);
8230 else
8231 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8232
8233 if (ONIGENC_CASE_MODIFIED&flags) return str;
8234 return Qnil;
8235}
8236
8237
8238/*
8239 * call-seq:
8240 * swapcase(*options) -> string
8241 *
8242 * Returns a string containing the characters in +self+, with cases reversed;
8243 * each uppercase character is downcased;
8244 * each lowercase character is upcased:
8245 *
8246 * s = 'Hello World!' # => "Hello World!"
8247 * s.swapcase # => "hELLO wORLD!"
8248 *
8249 * The casing may be affected by the given +options+;
8250 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8251 *
8252 * Related: String#swapcase!.
8253 *
8254 */
8255
8256static VALUE
8257rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8258{
8259 rb_encoding *enc;
8260 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8261 VALUE ret;
8262
8263 flags = check_case_options(argc, argv, flags);
8264 enc = str_true_enc(str);
8265 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8266 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8267 ret = rb_str_new(0, RSTRING_LEN(str));
8268 rb_str_ascii_casemap(str, ret, &flags, enc);
8269 }
8270 else {
8271 ret = rb_str_casemap(str, &flags, enc);
8272 }
8273 return ret;
8274}
8275
typedef unsigned char *USTR;

/* Cursor over a transliteration pattern, advanced by trnext(). */
struct tr {
    int gen;            /* nonzero while a range "a-z" is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the pattern */
    char *pend;         /* end of the pattern */
};
8283
8284static unsigned int
8285trnext(struct tr *t, rb_encoding *enc)
8286{
8287 int n;
8288
8289 for (;;) {
8290 nextpart:
8291 if (!t->gen) {
8292 if (t->p == t->pend) return -1;
8293 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8294 t->p += n;
8295 }
8296 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8297 t->p += n;
8298 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8299 t->p += n;
8300 if (t->p < t->pend) {
8301 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8302 t->p += n;
8303 if (t->now > c) {
8304 if (t->now < 0x80 && c < 0x80) {
8305 rb_raise(rb_eArgError,
8306 "invalid range \"%c-%c\" in string transliteration",
8307 t->now, c);
8308 }
8309 else {
8310 rb_raise(rb_eArgError, "invalid range in string transliteration");
8311 }
8312 continue; /* not reached */
8313 }
8314 else if (t->now < c) {
8315 t->gen = 1;
8316 t->max = c;
8317 }
8318 }
8319 }
8320 return t->now;
8321 }
8322 else {
8323 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8324 if (t->now == t->max) {
8325 t->gen = 0;
8326 goto nextpart;
8327 }
8328 }
8329 if (t->now < t->max) {
8330 return t->now;
8331 }
8332 else {
8333 t->gen = 0;
8334 return t->max;
8335 }
8336 }
8337 }
8338}
8339
8340static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8341
8342static VALUE
8343tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8344{
8345 const unsigned int errc = -1;
8346 unsigned int trans[256];
8347 rb_encoding *enc, *e1, *e2;
8348 struct tr trsrc, trrepl;
8349 int cflag = 0;
8350 unsigned int c, c0, last = 0;
8351 int modify = 0, i, l;
8352 unsigned char *s, *send;
8353 VALUE hash = 0;
8354 int singlebyte = single_byte_optimizable(str);
8355 int termlen;
8356 int cr;
8357
8358#define CHECK_IF_ASCII(c) \
8359 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8360 (cr = ENC_CODERANGE_VALID) : 0)
8361
8362 StringValue(src);
8363 StringValue(repl);
8364 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8365 if (RSTRING_LEN(repl) == 0) {
8366 return rb_str_delete_bang(1, &src, str);
8367 }
8368
8369 cr = ENC_CODERANGE(str);
8370 e1 = rb_enc_check(str, src);
8371 e2 = rb_enc_check(str, repl);
8372 if (e1 == e2) {
8373 enc = e1;
8374 }
8375 else {
8376 enc = rb_enc_check(src, repl);
8377 }
8378 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8379 if (RSTRING_LEN(src) > 1 &&
8380 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8381 trsrc.p + l < trsrc.pend) {
8382 cflag = 1;
8383 trsrc.p += l;
8384 }
8385 trrepl.p = RSTRING_PTR(repl);
8386 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8387 trsrc.gen = trrepl.gen = 0;
8388 trsrc.now = trrepl.now = 0;
8389 trsrc.max = trrepl.max = 0;
8390
8391 if (cflag) {
8392 for (i=0; i<256; i++) {
8393 trans[i] = 1;
8394 }
8395 while ((c = trnext(&trsrc, enc)) != errc) {
8396 if (c < 256) {
8397 trans[c] = errc;
8398 }
8399 else {
8400 if (!hash) hash = rb_hash_new();
8401 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8402 }
8403 }
8404 while ((c = trnext(&trrepl, enc)) != errc)
8405 /* retrieve last replacer */;
8406 last = trrepl.now;
8407 for (i=0; i<256; i++) {
8408 if (trans[i] != errc) {
8409 trans[i] = last;
8410 }
8411 }
8412 }
8413 else {
8414 unsigned int r;
8415
8416 for (i=0; i<256; i++) {
8417 trans[i] = errc;
8418 }
8419 while ((c = trnext(&trsrc, enc)) != errc) {
8420 r = trnext(&trrepl, enc);
8421 if (r == errc) r = trrepl.now;
8422 if (c < 256) {
8423 trans[c] = r;
8424 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8425 }
8426 else {
8427 if (!hash) hash = rb_hash_new();
8428 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8429 }
8430 }
8431 }
8432
8433 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8434 cr = ENC_CODERANGE_7BIT;
8435 str_modify_keep_cr(str);
8436 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8437 termlen = rb_enc_mbminlen(enc);
8438 if (sflag) {
8439 int clen, tlen;
8440 long offset, max = RSTRING_LEN(str);
8441 unsigned int save = -1;
8442 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8443
8444 while (s < send) {
8445 int may_modify = 0;
8446
8447 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8448 if (!MBCLEN_CHARFOUND_P(r)) {
8449 xfree(buf);
8450 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8451 }
8452 clen = MBCLEN_CHARFOUND_LEN(r);
8453 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8454
8455 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8456
8457 s += clen;
8458 if (c < 256) {
8459 c = trans[c];
8460 }
8461 else if (hash) {
8462 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8463 if (NIL_P(tmp)) {
8464 if (cflag) c = last;
8465 else c = errc;
8466 }
8467 else if (cflag) c = errc;
8468 else c = NUM2INT(tmp);
8469 }
8470 else {
8471 c = errc;
8472 }
8473 if (c != (unsigned int)-1) {
8474 if (save == c) {
8475 CHECK_IF_ASCII(c);
8476 continue;
8477 }
8478 save = c;
8479 tlen = rb_enc_codelen(c, enc);
8480 modify = 1;
8481 }
8482 else {
8483 save = -1;
8484 c = c0;
8485 if (enc != e1) may_modify = 1;
8486 }
8487 if ((offset = t - buf) + tlen > max) {
8488 size_t MAYBE_UNUSED(old) = max + termlen;
8489 max = offset + tlen + (send - s);
8490 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8491 t = buf + offset;
8492 }
8493 rb_enc_mbcput(c, t, enc);
8494 if (may_modify && memcmp(s, t, tlen) != 0) {
8495 modify = 1;
8496 }
8497 CHECK_IF_ASCII(c);
8498 t += tlen;
8499 }
8500 if (!STR_EMBED_P(str)) {
8501 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8502 }
8503 TERM_FILL((char *)t, termlen);
8504 RSTRING(str)->as.heap.ptr = (char *)buf;
8505 STR_SET_LEN(str, t - buf);
8506 STR_SET_NOEMBED(str);
8507 RSTRING(str)->as.heap.aux.capa = max;
8508 }
8509 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8510 while (s < send) {
8511 c = (unsigned char)*s;
8512 if (trans[c] != errc) {
8513 if (!cflag) {
8514 c = trans[c];
8515 *s = c;
8516 modify = 1;
8517 }
8518 else {
8519 *s = last;
8520 modify = 1;
8521 }
8522 }
8523 CHECK_IF_ASCII(c);
8524 s++;
8525 }
8526 }
8527 else {
8528 int clen, tlen;
8529 long offset, max = (long)((send - s) * 1.2);
8530 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8531
8532 while (s < send) {
8533 int may_modify = 0;
8534
8535 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8536 if (!MBCLEN_CHARFOUND_P(r)) {
8537 xfree(buf);
8538 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8539 }
8540 clen = MBCLEN_CHARFOUND_LEN(r);
8541 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8542
8543 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8544
8545 if (c < 256) {
8546 c = trans[c];
8547 }
8548 else if (hash) {
8549 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8550 if (NIL_P(tmp)) {
8551 if (cflag) c = last;
8552 else c = errc;
8553 }
8554 else if (cflag) c = errc;
8555 else c = NUM2INT(tmp);
8556 }
8557 else {
8558 c = cflag ? last : errc;
8559 }
8560 if (c != errc) {
8561 tlen = rb_enc_codelen(c, enc);
8562 modify = 1;
8563 }
8564 else {
8565 c = c0;
8566 if (enc != e1) may_modify = 1;
8567 }
8568 if ((offset = t - buf) + tlen > max) {
8569 size_t MAYBE_UNUSED(old) = max + termlen;
8570 max = offset + tlen + (long)((send - s) * 1.2);
8571 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8572 t = buf + offset;
8573 }
8574 if (s != t) {
8575 rb_enc_mbcput(c, t, enc);
8576 if (may_modify && memcmp(s, t, tlen) != 0) {
8577 modify = 1;
8578 }
8579 }
8580 CHECK_IF_ASCII(c);
8581 s += clen;
8582 t += tlen;
8583 }
8584 if (!STR_EMBED_P(str)) {
8585 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8586 }
8587 TERM_FILL((char *)t, termlen);
8588 RSTRING(str)->as.heap.ptr = (char *)buf;
8589 STR_SET_LEN(str, t - buf);
8590 STR_SET_NOEMBED(str);
8591 RSTRING(str)->as.heap.aux.capa = max;
8592 }
8593
8594 if (modify) {
8595 if (cr != ENC_CODERANGE_BROKEN)
8596 ENC_CODERANGE_SET(str, cr);
8597 rb_enc_associate(str, enc);
8598 return str;
8599 }
8600 return Qnil;
8601}
8602
8603
8604/*
8605 * call-seq:
8606 * tr!(selector, replacements) -> self or nil
8607 *
8608 * Like String#tr, but modifies +self+ in place.
8609 * Returns +self+ if any changes were made, +nil+ otherwise.
8610 *
8611 */
8612
8613static VALUE
8614rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8615{
8616 return tr_trans(str, src, repl, 0);
8617}
8618
8619
8620/*
8621 * call-seq:
8622 * tr(selector, replacements) -> new_string
8623 *
8624 * Returns a copy of +self+ with each character specified by string +selector+
8625 * translated to the corresponding character in string +replacements+.
8626 * The correspondence is _positional_:
8627 *
8628 * - Each occurrence of the first character specified by +selector+
8629 * is translated to the first character in +replacements+.
8630 * - Each occurrence of the second character specified by +selector+
8631 * is translated to the second character in +replacements+.
8632 * - And so on.
8633 *
8634 * Example:
8635 *
8636 * 'hello'.tr('el', 'ip') #=> "hippo"
8637 *
8638 * If +replacements+ is shorter than +selector+,
8639 * it is implicitly padded with its own last character:
8640 *
8641 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8642 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8643 *
8644 * Arguments +selector+ and +replacements+ must be valid character selectors
8645 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8646 * and may use any of its valid forms, including negation, ranges, and escaping:
8647 *
8648 * # Negation.
8649 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8650 * # Ranges.
8651 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8652 * # Escapes.
8653 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8654 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8655 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8656 *
8657 */
8658
8659static VALUE
8660rb_str_tr(VALUE str, VALUE src, VALUE repl)
8661{
8662 str = str_duplicate(rb_cString, str);
8663 tr_trans(str, src, repl, 0);
8664 return str;
8665}
8666
8667#define TR_TABLE_MAX (UCHAR_MAX+1)
8668#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8669static void
8670tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8671 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8672{
8673 const unsigned int errc = -1;
8674 char buf[TR_TABLE_MAX];
8675 struct tr tr;
8676 unsigned int c;
8677 VALUE table = 0, ptable = 0;
8678 int i, l, cflag = 0;
8679
8680 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8681 tr.gen = tr.now = tr.max = 0;
8682
8683 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8684 cflag = 1;
8685 tr.p += l;
8686 }
8687 if (first) {
8688 for (i=0; i<TR_TABLE_MAX; i++) {
8689 stable[i] = 1;
8690 }
8691 stable[TR_TABLE_MAX] = cflag;
8692 }
8693 else if (stable[TR_TABLE_MAX] && !cflag) {
8694 stable[TR_TABLE_MAX] = 0;
8695 }
8696 for (i=0; i<TR_TABLE_MAX; i++) {
8697 buf[i] = cflag;
8698 }
8699
8700 while ((c = trnext(&tr, enc)) != errc) {
8701 if (c < TR_TABLE_MAX) {
8702 buf[(unsigned char)c] = !cflag;
8703 }
8704 else {
8705 VALUE key = UINT2NUM(c);
8706
8707 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8708 if (cflag) {
8709 ptable = *ctablep;
8710 table = ptable ? ptable : rb_hash_new();
8711 *ctablep = table;
8712 }
8713 else {
8714 table = rb_hash_new();
8715 ptable = *tablep;
8716 *tablep = table;
8717 }
8718 }
8719 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8720 rb_hash_aset(table, key, Qtrue);
8721 }
8722 }
8723 }
8724 for (i=0; i<TR_TABLE_MAX; i++) {
8725 stable[i] = stable[i] && buf[i];
8726 }
8727 if (!table && !cflag) {
8728 *tablep = 0;
8729 }
8730}
8731
8732
8733static int
8734tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8735{
8736 if (c < TR_TABLE_MAX) {
8737 return table[c] != 0;
8738 }
8739 else {
8740 VALUE v = UINT2NUM(c);
8741
8742 if (del) {
8743 if (!NIL_P(rb_hash_lookup(del, v)) &&
8744 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8745 return TRUE;
8746 }
8747 }
8748 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8749 return FALSE;
8750 }
8751 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8752 }
8753}
8754
8755/*
8756 * call-seq:
8757 * delete!(*selectors) -> self or nil
8758 *
8759 * Like String#delete, but modifies +self+ in place.
8760 * Returns +self+ if any changes were made, +nil+ otherwise.
8761 *
8762 */
8763
8764static VALUE
8765rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8766{
8767 char squeez[TR_TABLE_SIZE];
8768 rb_encoding *enc = 0;
8769 char *s, *send, *t;
8770 VALUE del = 0, nodel = 0;
8771 int modify = 0;
8772 int i, ascompat, cr;
8773
8774 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8776 for (i=0; i<argc; i++) {
8777 VALUE s = argv[i];
8778
8779 StringValue(s);
8780 enc = rb_enc_check(str, s);
8781 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8782 }
8783
8784 str_modify_keep_cr(str);
8785 ascompat = rb_enc_asciicompat(enc);
8786 s = t = RSTRING_PTR(str);
8787 send = RSTRING_END(str);
8788 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8789 while (s < send) {
8790 unsigned int c;
8791 int clen;
8792
8793 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8794 if (squeez[c]) {
8795 modify = 1;
8796 }
8797 else {
8798 if (t != s) *t = c;
8799 t++;
8800 }
8801 s++;
8802 }
8803 else {
8804 c = rb_enc_codepoint_len(s, send, &clen, enc);
8805
8806 if (tr_find(c, squeez, del, nodel)) {
8807 modify = 1;
8808 }
8809 else {
8810 if (t != s) rb_enc_mbcput(c, t, enc);
8811 t += clen;
8813 }
8814 s += clen;
8815 }
8816 }
8817 TERM_FILL(t, TERM_LEN(str));
8818 STR_SET_LEN(str, t - RSTRING_PTR(str));
8819 ENC_CODERANGE_SET(str, cr);
8820
8821 if (modify) return str;
8822 return Qnil;
8823}
8824
8825
8826/*
8827 * call-seq:
8828 * delete(*selectors) -> new_string
8829 *
8830 * Returns a copy of +self+ with characters specified by +selectors+ removed
8831 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8832 *
8833 * "hello".delete "l","lo" #=> "heo"
8834 * "hello".delete "lo" #=> "he"
8835 * "hello".delete "aeiou", "^e" #=> "hell"
8836 * "hello".delete "ej-m" #=> "ho"
8837 *
8838 */
8839
8840static VALUE
8841rb_str_delete(int argc, VALUE *argv, VALUE str)
8842{
8843 str = str_duplicate(rb_cString, str);
8844 rb_str_delete_bang(argc, argv, str);
8845 return str;
8846}
8847
8848
8849/*
8850 * call-seq:
8851 * squeeze!(*selectors) -> self or nil
8852 *
8853 * Like String#squeeze, but modifies +self+ in place.
8854 * Returns +self+ if any changes were made, +nil+ otherwise.
8855 */
8856
8857static VALUE
8858rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8859{
8860 char squeez[TR_TABLE_SIZE];
8861 rb_encoding *enc = 0;
8862 VALUE del = 0, nodel = 0;
8863 unsigned char *s, *send, *t;
8864 int i, modify = 0;
8865 int ascompat, singlebyte = single_byte_optimizable(str);
8866 unsigned int save;
8867
8868 if (argc == 0) {
8869 enc = STR_ENC_GET(str);
8870 }
8871 else {
8872 for (i=0; i<argc; i++) {
8873 VALUE s = argv[i];
8874
8875 StringValue(s);
8876 enc = rb_enc_check(str, s);
8877 if (singlebyte && !single_byte_optimizable(s))
8878 singlebyte = 0;
8879 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8880 }
8881 }
8882
8883 str_modify_keep_cr(str);
8884 s = t = (unsigned char *)RSTRING_PTR(str);
8885 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8886 send = (unsigned char *)RSTRING_END(str);
8887 save = -1;
8888 ascompat = rb_enc_asciicompat(enc);
8889
8890 if (singlebyte) {
8891 while (s < send) {
8892 unsigned int c = *s++;
8893 if (c != save || (argc > 0 && !squeez[c])) {
8894 *t++ = save = c;
8895 }
8896 }
8897 }
8898 else {
8899 while (s < send) {
8900 unsigned int c;
8901 int clen;
8902
8903 if (ascompat && (c = *s) < 0x80) {
8904 if (c != save || (argc > 0 && !squeez[c])) {
8905 *t++ = save = c;
8906 }
8907 s++;
8908 }
8909 else {
8910 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8911
8912 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8913 if (t != s) rb_enc_mbcput(c, t, enc);
8914 save = c;
8915 t += clen;
8916 }
8917 s += clen;
8918 }
8919 }
8920 }
8921
8922 TERM_FILL((char *)t, TERM_LEN(str));
8923 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8924 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8925 modify = 1;
8926 }
8927
8928 if (modify) return str;
8929 return Qnil;
8930}
8931
8932
8933/*
8934 * call-seq:
8935 * squeeze(*selectors) -> new_string
8936 *
8937 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8938 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8939 *
8940 * "Squeezed" means that each multiple-character run of a selected character
8941 * is squeezed down to a single character;
8942 * with no arguments given, squeezes all characters:
8943 *
8944 * "yellow moon".squeeze #=> "yelow mon"
8945 * "  now   is  the".squeeze(" ") #=> " now is the"
8946 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8947 *
8948 */
8949
8950static VALUE
8951rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8952{
8953 str = str_duplicate(rb_cString, str);
8954 rb_str_squeeze_bang(argc, argv, str);
8955 return str;
8956}
8957
8958
8959/*
8960 * call-seq:
8961 * tr_s!(selector, replacements) -> self or nil
8962 *
8963 * Like String#tr_s, but modifies +self+ in place.
8964 * Returns +self+ if any changes were made, +nil+ otherwise.
8965 *
8966 * Related: String#squeeze!.
8967 */
8968
8969static VALUE
8970rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8971{
8972 return tr_trans(str, src, repl, 1);
8973}
8974
8975
8976/*
8977 * call-seq:
8978 * tr_s(selector, replacements) -> string
8979 *
8980 * Like String#tr, but also squeezes the modified portions of the translated string;
8981 * returns a new string (translated and squeezed).
8982 *
8983 * 'hello'.tr_s('l', 'r') #=> "hero"
8984 * 'hello'.tr_s('el', '-') #=> "h-o"
8985 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8986 *
8987 * Related: String#squeeze.
8988 *
8989 */
8990
8991static VALUE
8992rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8993{
8994 str = str_duplicate(rb_cString, str);
8995 tr_trans(str, src, repl, 1);
8996 return str;
8997}
8998
8999
9000/*
9001 * call-seq:
9002 * count(*selectors) -> integer
9003 *
9004 * Returns the total number of characters in +self+
9005 * that are specified by the given +selectors+
9006 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9007 *
9008 * a = "hello world"
9009 * a.count "lo" #=> 5
9010 * a.count "lo", "o" #=> 2
9011 * a.count "hello", "^l" #=> 4
9012 * a.count "ej-m" #=> 4
9013 *
9014 * "hello^world".count "\\^aeiou" #=> 4
9015 * "hello-world".count "a\\-eo" #=> 4
9016 *
9017 * c = "hello world\\r\\n"
9018 * c.count "\\" #=> 2
9019 * c.count "\\A" #=> 0
9020 * c.count "X-\\w" #=> 3
9021 */
9022
9023static VALUE
9024rb_str_count(int argc, VALUE *argv, VALUE str)
9025{
9026 char table[TR_TABLE_SIZE];
9027 rb_encoding *enc = 0;
9028 VALUE del = 0, nodel = 0, tstr;
9029 char *s, *send;
9030 int i;
9031 int ascompat;
9032 size_t n = 0;
9033
9035
9036 tstr = argv[0];
9037 StringValue(tstr);
9038 enc = rb_enc_check(str, tstr);
9039 if (argc == 1) {
9040 const char *ptstr;
9041 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9042 (ptstr = RSTRING_PTR(tstr),
9043 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9044 !is_broken_string(str)) {
9045 int clen;
9046 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9047
9048 s = RSTRING_PTR(str);
9049 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9050 send = RSTRING_END(str);
9051 while (s < send) {
9052 if (*(unsigned char*)s++ == c) n++;
9053 }
9054 return SIZET2NUM(n);
9055 }
9056 }
9057
9058 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9059 for (i=1; i<argc; i++) {
9060 tstr = argv[i];
9061 StringValue(tstr);
9062 enc = rb_enc_check(str, tstr);
9063 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9064 }
9065
9066 s = RSTRING_PTR(str);
9067 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9068 send = RSTRING_END(str);
9069 ascompat = rb_enc_asciicompat(enc);
9070 while (s < send) {
9071 unsigned int c;
9072
9073 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9074 if (table[c]) {
9075 n++;
9076 }
9077 s++;
9078 }
9079 else {
9080 int clen;
9081 c = rb_enc_codepoint_len(s, send, &clen, enc);
9082 if (tr_find(c, table, del, nodel)) {
9083 n++;
9084 }
9085 s += clen;
9086 }
9087 }
9088
9089 return SIZET2NUM(n);
9090}
9091
9092static VALUE
9093rb_fs_check(VALUE val)
9094{
9095 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9096 val = rb_check_string_type(val);
9097 if (NIL_P(val)) return 0;
9098 }
9099 return val;
9100}
9101
/*
 * Byte-indexed lookup table: non-zero for the six ASCII whitespace bytes
 * (TAB, LF, VT, FF, CR and SPACE), zero for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* \t */
    [0x0a] = 1, /* \n */
    [0x0b] = 1, /* \v */
    [0x0c] = 1, /* \f */
    [0x0d] = 1, /* \r */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9122
9123static long
9124split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9125{
9126 if (empty_count >= 0 && len == 0) {
9127 return empty_count + 1;
9128 }
9129 if (empty_count > 0) {
9130 /* make different substrings */
9131 if (result) {
9132 do {
9133 rb_ary_push(result, str_new_empty_String(str));
9134 } while (--empty_count > 0);
9135 }
9136 else {
9137 do {
9138 rb_yield(str_new_empty_String(str));
9139 } while (--empty_count > 0);
9140 }
9141 }
9142 str = rb_str_subseq(str, beg, len);
9143 if (result) {
9144 rb_ary_push(result, str);
9145 }
9146 else {
9147 rb_yield(str);
9148 }
9149 return empty_count;
9150}
9151
9152typedef enum {
9153 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9154} split_type_t;
9155
9156static split_type_t
9157literal_split_pattern(VALUE spat, split_type_t default_type)
9158{
9159 rb_encoding *enc = STR_ENC_GET(spat);
9160 const char *ptr;
9161 long len;
9162 RSTRING_GETMEM(spat, ptr, len);
9163 if (len == 0) {
9164 /* Special case - split into chars */
9165 return SPLIT_TYPE_CHARS;
9166 }
9167 else if (rb_enc_asciicompat(enc)) {
9168 if (len == 1 && ptr[0] == ' ') {
9169 return SPLIT_TYPE_AWK;
9170 }
9171 }
9172 else {
9173 int l;
9174 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9175 return SPLIT_TYPE_AWK;
9176 }
9177 }
9178 return default_type;
9179}
9180
9181/*
9182 * call-seq:
9183 * split(field_sep = $;, limit = 0) -> array
9184 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9185 *
9186 * :include: doc/string/split.rdoc
9187 *
9188 */
9189
9190static VALUE
9191rb_str_split_m(int argc, VALUE *argv, VALUE str)
9192{
9193 rb_encoding *enc;
9194 VALUE spat;
9195 VALUE limit;
9196 split_type_t split_type;
9197 long beg, end, i = 0, empty_count = -1;
9198 int lim = 0;
9199 VALUE result, tmp;
9200
9201 result = rb_block_given_p() ? Qfalse : Qnil;
9202 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9203 lim = NUM2INT(limit);
9204 if (lim <= 0) limit = Qnil;
9205 else if (lim == 1) {
9206 if (RSTRING_LEN(str) == 0)
9207 return result ? rb_ary_new2(0) : str;
9208 tmp = str_duplicate(rb_cString, str);
9209 if (!result) {
9210 rb_yield(tmp);
9211 return str;
9212 }
9213 return rb_ary_new3(1, tmp);
9214 }
9215 i = 1;
9216 }
9217 if (NIL_P(limit) && !lim) empty_count = 0;
9218
9219 enc = STR_ENC_GET(str);
9220 split_type = SPLIT_TYPE_REGEXP;
9221 if (!NIL_P(spat)) {
9222 spat = get_pat_quoted(spat, 0);
9223 }
9224 else if (NIL_P(spat = rb_fs)) {
9225 split_type = SPLIT_TYPE_AWK;
9226 }
9227 else if (!(spat = rb_fs_check(spat))) {
9228 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9229 }
9230 else {
9231 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9232 }
9233 if (split_type != SPLIT_TYPE_AWK) {
9234 switch (BUILTIN_TYPE(spat)) {
9235 case T_REGEXP:
9236 rb_reg_options(spat); /* check if uninitialized */
9237 tmp = RREGEXP_SRC(spat);
9238 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9239 if (split_type == SPLIT_TYPE_AWK) {
9240 spat = tmp;
9241 split_type = SPLIT_TYPE_STRING;
9242 }
9243 break;
9244
9245 case T_STRING:
9246 mustnot_broken(spat);
9247 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9248 break;
9249
9250 default:
9252 }
9253 }
9254
9255#define SPLIT_STR(beg, len) ( \
9256 empty_count = split_string(result, str, beg, len, empty_count), \
9257 str_mod_check(str, str_start, str_len))
9258
9259 beg = 0;
9260 char *ptr = RSTRING_PTR(str);
9261 char *const str_start = ptr;
9262 const long str_len = RSTRING_LEN(str);
9263 char *const eptr = str_start + str_len;
9264 if (split_type == SPLIT_TYPE_AWK) {
9265 char *bptr = ptr;
9266 int skip = 1;
9267 unsigned int c;
9268
9269 if (result) result = rb_ary_new();
9270 end = beg;
9271 if (is_ascii_string(str)) {
9272 while (ptr < eptr) {
9273 c = (unsigned char)*ptr++;
9274 if (skip) {
9275 if (ascii_isspace(c)) {
9276 beg = ptr - bptr;
9277 }
9278 else {
9279 end = ptr - bptr;
9280 skip = 0;
9281 if (!NIL_P(limit) && lim <= i) break;
9282 }
9283 }
9284 else if (ascii_isspace(c)) {
9285 SPLIT_STR(beg, end-beg);
9286 skip = 1;
9287 beg = ptr - bptr;
9288 if (!NIL_P(limit)) ++i;
9289 }
9290 else {
9291 end = ptr - bptr;
9292 }
9293 }
9294 }
9295 else {
9296 while (ptr < eptr) {
9297 int n;
9298
9299 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9300 ptr += n;
9301 if (skip) {
9302 if (rb_isspace(c)) {
9303 beg = ptr - bptr;
9304 }
9305 else {
9306 end = ptr - bptr;
9307 skip = 0;
9308 if (!NIL_P(limit) && lim <= i) break;
9309 }
9310 }
9311 else if (rb_isspace(c)) {
9312 SPLIT_STR(beg, end-beg);
9313 skip = 1;
9314 beg = ptr - bptr;
9315 if (!NIL_P(limit)) ++i;
9316 }
9317 else {
9318 end = ptr - bptr;
9319 }
9320 }
9321 }
9322 }
9323 else if (split_type == SPLIT_TYPE_STRING) {
9324 char *substr_start = ptr;
9325 char *sptr = RSTRING_PTR(spat);
9326 long slen = RSTRING_LEN(spat);
9327
9328 if (result) result = rb_ary_new();
9329 mustnot_broken(str);
9330 enc = rb_enc_check(str, spat);
9331 while (ptr < eptr &&
9332 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9333 /* Check we are at the start of a char */
9334 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9335 if (t != ptr + end) {
9336 ptr = t;
9337 continue;
9338 }
9339 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9340 str_mod_check(spat, sptr, slen);
9341 ptr += end + slen;
9342 substr_start = ptr;
9343 if (!NIL_P(limit) && lim <= ++i) break;
9344 }
9345 beg = ptr - str_start;
9346 }
9347 else if (split_type == SPLIT_TYPE_CHARS) {
9348 int n;
9349
9350 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9351 mustnot_broken(str);
9352 enc = rb_enc_get(str);
9353 while (ptr < eptr &&
9354 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9355 SPLIT_STR(ptr - str_start, n);
9356 ptr += n;
9357 if (!NIL_P(limit) && lim <= ++i) break;
9358 }
9359 beg = ptr - str_start;
9360 }
9361 else {
9362 if (result) result = rb_ary_new();
9363 long len = RSTRING_LEN(str);
9364 long start = beg;
9365 long idx;
9366 int last_null = 0;
9367 struct re_registers *regs;
9368 VALUE match = 0;
9369
9370 for (; rb_reg_search(spat, str, start, 0) >= 0;
9371 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9372 match = rb_backref_get();
9373 if (!result) rb_match_busy(match);
9374 regs = RMATCH_REGS(match);
9375 end = BEG(0);
9376 if (start == end && BEG(0) == END(0)) {
9377 if (!ptr) {
9378 SPLIT_STR(0, 0);
9379 break;
9380 }
9381 else if (last_null == 1) {
9382 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9383 beg = start;
9384 }
9385 else {
9386 if (start == len)
9387 start++;
9388 else
9389 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9390 last_null = 1;
9391 continue;
9392 }
9393 }
9394 else {
9395 SPLIT_STR(beg, end-beg);
9396 beg = start = END(0);
9397 }
9398 last_null = 0;
9399
9400 for (idx=1; idx < regs->num_regs; idx++) {
9401 if (BEG(idx) == -1) continue;
9402 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9403 }
9404 if (!NIL_P(limit) && lim <= ++i) break;
9405 }
9406 if (match) rb_match_unbusy(match);
9407 }
9408 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9409 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9410 }
9411
9412 return result ? result : str;
9413}
9414
9415VALUE
9416rb_str_split(VALUE str, const char *sep0)
9417{
9418 VALUE sep;
9419
9420 StringValue(str);
9421 sep = rb_str_new_cstr(sep0);
9422 return rb_str_split_m(1, &sep, str);
9423}
9424
9425#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9426
9427static inline int
9428enumerator_element(VALUE ary, VALUE e)
9429{
9430 if (ary) {
9431 rb_ary_push(ary, e);
9432 return 0;
9433 }
9434 else {
9435 rb_yield(e);
9436 return 1;
9437 }
9438}
9439
9440#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9441
9442static const char *
9443chomp_newline(const char *p, const char *e, rb_encoding *enc)
9444{
9445 const char *prev = rb_enc_prev_char(p, e, e, enc);
9446 if (rb_enc_is_newline(prev, e, enc)) {
9447 e = prev;
9448 prev = rb_enc_prev_char(p, e, e, enc);
9449 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9450 e = prev;
9451 }
9452 return e;
9453}
9454
9455static VALUE
9456get_rs(void)
9457{
9458 VALUE rs = rb_rs;
9459 if (!NIL_P(rs) &&
9460 (!RB_TYPE_P(rs, T_STRING) ||
9461 RSTRING_LEN(rs) != 1 ||
9462 RSTRING_PTR(rs)[0] != '\n')) {
9463 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9464 }
9465 return rs;
9466}
9467
9468#define rb_rs get_rs()
9469
9470static VALUE
9471rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9472{
9473 rb_encoding *enc;
9474 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9475 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9476 long pos, len, rslen;
9477 int rsnewline = 0;
9478
9479 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9480 rs = rb_rs;
9481 if (!NIL_P(opts)) {
9482 static ID keywords[1];
9483 if (!keywords[0]) {
9484 keywords[0] = rb_intern_const("chomp");
9485 }
9486 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9487 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9488 }
9489
9490 if (NIL_P(rs)) {
9491 if (!ENUM_ELEM(ary, str)) {
9492 return ary;
9493 }
9494 else {
9495 return orig;
9496 }
9497 }
9498
9499 if (!RSTRING_LEN(str)) goto end;
9500 str = rb_str_new_frozen(str);
9501 ptr = subptr = RSTRING_PTR(str);
9502 pend = RSTRING_END(str);
9503 len = RSTRING_LEN(str);
9504 StringValue(rs);
9505 rslen = RSTRING_LEN(rs);
9506
9507 if (rs == rb_default_rs)
9508 enc = rb_enc_get(str);
9509 else
9510 enc = rb_enc_check(str, rs);
9511
9512 if (rslen == 0) {
9513 /* paragraph mode */
9514 int n;
9515 const char *eol = NULL;
9516 subend = subptr;
9517 while (subend < pend) {
9518 long chomp_rslen = 0;
9519 do {
9520 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9521 n = 0;
9522 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9523 if (rb_enc_is_newline(subend + n, pend, enc)) {
9524 if (eol == subend) break;
9525 subend += rslen;
9526 if (subptr) {
9527 eol = subend;
9528 chomp_rslen = -rslen;
9529 }
9530 }
9531 else {
9532 if (!subptr) subptr = subend;
9533 subend += rslen;
9534 }
9535 rslen = 0;
9536 } while (subend < pend);
9537 if (!subptr) break;
9538 if (rslen == 0) chomp_rslen = 0;
9539 line = rb_str_subseq(str, subptr - ptr,
9540 subend - subptr + (chomp ? chomp_rslen : rslen));
9541 if (ENUM_ELEM(ary, line)) {
9542 str_mod_check(str, ptr, len);
9543 }
9544 subptr = eol = NULL;
9545 }
9546 goto end;
9547 }
9548 else {
9549 rsptr = RSTRING_PTR(rs);
9550 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9551 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9552 rsnewline = 1;
9553 }
9554 }
9555
9556 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9557 rs = rb_str_new(rsptr, rslen);
9558 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9559 rsptr = RSTRING_PTR(rs);
9560 rslen = RSTRING_LEN(rs);
9561 }
9562
9563 while (subptr < pend) {
9564 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9565 if (pos < 0) break;
9566 hit = subptr + pos;
9567 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9568 if (hit != adjusted) {
9569 subptr = adjusted;
9570 continue;
9571 }
9572 subend = hit += rslen;
9573 if (chomp) {
9574 if (rsnewline) {
9575 subend = chomp_newline(subptr, subend, enc);
9576 }
9577 else {
9578 subend -= rslen;
9579 }
9580 }
9581 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9582 if (ENUM_ELEM(ary, line)) {
9583 str_mod_check(str, ptr, len);
9584 }
9585 subptr = hit;
9586 }
9587
9588 if (subptr != pend) {
9589 if (chomp) {
9590 if (rsnewline) {
9591 pend = chomp_newline(subptr, pend, enc);
9592 }
9593 else if (pend - subptr >= rslen &&
9594 memcmp(pend - rslen, rsptr, rslen) == 0) {
9595 pend -= rslen;
9596 }
9597 }
9598 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9599 ENUM_ELEM(ary, line);
9600 RB_GC_GUARD(str);
9601 }
9602
9603 end:
9604 if (ary)
9605 return ary;
9606 else
9607 return orig;
9608}
9609
9610/*
9611 * call-seq:
9612 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9613 * each_line(line_sep = $/, chomp: false) -> enumerator
9614 *
9615 * :include: doc/string/each_line.rdoc
9616 *
9617 */
9618
9619static VALUE
9620rb_str_each_line(int argc, VALUE *argv, VALUE str)
9621{
9622 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9623 return rb_str_enumerate_lines(argc, argv, str, 0);
9624}
9625
9626/*
9627 * call-seq:
9628 * lines(line_sep = $/, chomp: false) -> array_of_strings
9629 *
9630 * Forms substrings ("lines") of +self+ according to the given arguments
9631 * (see String#each_line for details); returns the lines in an array.
9632 *
9633 */
9634
9635static VALUE
9636rb_str_lines(int argc, VALUE *argv, VALUE str)
9637{
9638 VALUE ary = WANTARRAY("lines", 0);
9639 return rb_str_enumerate_lines(argc, argv, str, ary);
9640}
9641
9642static VALUE
9643rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9644{
9645 return LONG2FIX(RSTRING_LEN(str));
9646}
9647
9648static VALUE
9649rb_str_enumerate_bytes(VALUE str, VALUE ary)
9650{
9651 long i;
9652
9653 for (i=0; i<RSTRING_LEN(str); i++) {
9654 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9655 }
9656 if (ary)
9657 return ary;
9658 else
9659 return str;
9660}
9661
9662/*
9663 * call-seq:
9664 * each_byte {|byte| ... } -> self
9665 * each_byte -> enumerator
9666 *
9667 * :include: doc/string/each_byte.rdoc
9668 *
9669 */
9670
9671static VALUE
9672rb_str_each_byte(VALUE str)
9673{
9674 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9675 return rb_str_enumerate_bytes(str, 0);
9676}
9677
9678/*
9679 * call-seq:
9680 * bytes -> array_of_bytes
9681 *
9682 * :include: doc/string/bytes.rdoc
9683 *
9684 */
9685
9686static VALUE
9687rb_str_bytes(VALUE str)
9688{
9689 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9690 return rb_str_enumerate_bytes(str, ary);
9691}
9692
9693static VALUE
9694rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9695{
9696 return rb_str_length(str);
9697}
9698
9699static VALUE
9700rb_str_enumerate_chars(VALUE str, VALUE ary)
9701{
9702 VALUE orig = str;
9703 long i, len, n;
9704 const char *ptr;
9705 rb_encoding *enc;
9706
9707 str = rb_str_new_frozen(str);
9708 ptr = RSTRING_PTR(str);
9709 len = RSTRING_LEN(str);
9710 enc = rb_enc_get(str);
9711
9713 for (i = 0; i < len; i += n) {
9714 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9715 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9716 }
9717 }
9718 else {
9719 for (i = 0; i < len; i += n) {
9720 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9721 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9722 }
9723 }
9724 RB_GC_GUARD(str);
9725 if (ary)
9726 return ary;
9727 else
9728 return orig;
9729}
9730
9731/*
9732 * call-seq:
9733 * each_char {|c| ... } -> self
9734 * each_char -> enumerator
9735 *
9736 * :include: doc/string/each_char.rdoc
9737 *
9738 */
9739
9740static VALUE
9741rb_str_each_char(VALUE str)
9742{
9743 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9744 return rb_str_enumerate_chars(str, 0);
9745}
9746
9747/*
9748 * call-seq:
9749 * chars -> array_of_characters
9750 *
9751 * :include: doc/string/chars.rdoc
9752 *
9753 */
9754
9755static VALUE
9756rb_str_chars(VALUE str)
9757{
9758 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9759 return rb_str_enumerate_chars(str, ary);
9760}
9761
9762static VALUE
9763rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9764{
9765 VALUE orig = str;
9766 int n;
9767 unsigned int c;
9768 const char *ptr, *end;
9769 rb_encoding *enc;
9770
9771 if (single_byte_optimizable(str))
9772 return rb_str_enumerate_bytes(str, ary);
9773
9774 str = rb_str_new_frozen(str);
9775 ptr = RSTRING_PTR(str);
9776 end = RSTRING_END(str);
9777 enc = STR_ENC_GET(str);
9778
9779 while (ptr < end) {
9780 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9781 ENUM_ELEM(ary, UINT2NUM(c));
9782 ptr += n;
9783 }
9784 RB_GC_GUARD(str);
9785 if (ary)
9786 return ary;
9787 else
9788 return orig;
9789}
9790
9791/*
9792 * call-seq:
9793 * each_codepoint {|integer| ... } -> self
9794 * each_codepoint -> enumerator
9795 *
9796 * :include: doc/string/each_codepoint.rdoc
9797 *
9798 */
9799
9800static VALUE
9801rb_str_each_codepoint(VALUE str)
9802{
9803 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9804 return rb_str_enumerate_codepoints(str, 0);
9805}
9806
9807/*
9808 * call-seq:
9809 * codepoints -> array_of_integers
9810 *
9811 * :include: doc/string/codepoints.rdoc
9812 *
9813 */
9814
9815static VALUE
9816rb_str_codepoints(VALUE str)
9817{
9818 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9819 return rb_str_enumerate_codepoints(str, ary);
9820}
9821
9822static regex_t *
9823get_reg_grapheme_cluster(rb_encoding *enc)
9824{
9825 int encidx = rb_enc_to_index(enc);
9826
9827 const OnigUChar source_ascii[] = "\\X";
9828 const OnigUChar *source = source_ascii;
9829 size_t source_len = sizeof(source_ascii) - 1;
9830
9831 switch (encidx) {
9832#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9833#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9834#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9835#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9836#define CASE_UTF(e) \
9837 case ENCINDEX_UTF_##e: { \
9838 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9839 source = source_UTF_##e; \
9840 source_len = sizeof(source_UTF_##e); \
9841 break; \
9842 }
9843 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9844#undef CASE_UTF
9845#undef CHARS_16BE
9846#undef CHARS_16LE
9847#undef CHARS_32BE
9848#undef CHARS_32LE
9849 }
9850
9851 regex_t *reg_grapheme_cluster;
9852 OnigErrorInfo einfo;
9853 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9854 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9855 if (r) {
9856 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9857 onig_error_code_to_str(message, r, &einfo);
9858 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9859 }
9860
9861 return reg_grapheme_cluster;
9862}
9863
9864static regex_t *
9865get_cached_reg_grapheme_cluster(rb_encoding *enc)
9866{
9867 int encidx = rb_enc_to_index(enc);
9868 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9869
9870 if (encidx == rb_utf8_encindex()) {
9871 if (!reg_grapheme_cluster_utf8) {
9872 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9873 }
9874
9875 return reg_grapheme_cluster_utf8;
9876 }
9877
9878 return NULL;
9879}
9880
9881static VALUE
9882rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9883{
9884 size_t grapheme_cluster_count = 0;
9885 rb_encoding *enc = get_encoding(str);
9886 const char *ptr, *end;
9887
9888 if (!rb_enc_unicode_p(enc)) {
9889 return rb_str_length(str);
9890 }
9891
9892 bool cached_reg_grapheme_cluster = true;
9893 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9894 if (!reg_grapheme_cluster) {
9895 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9896 cached_reg_grapheme_cluster = false;
9897 }
9898
9899 ptr = RSTRING_PTR(str);
9900 end = RSTRING_END(str);
9901
9902 while (ptr < end) {
9903 OnigPosition len = onig_match(reg_grapheme_cluster,
9904 (const OnigUChar *)ptr, (const OnigUChar *)end,
9905 (const OnigUChar *)ptr, NULL, 0);
9906 if (len <= 0) break;
9907 grapheme_cluster_count++;
9908 ptr += len;
9909 }
9910
9911 if (!cached_reg_grapheme_cluster) {
9912 onig_free(reg_grapheme_cluster);
9913 }
9914
9915 return SIZET2NUM(grapheme_cluster_count);
9916}
9917
9918static VALUE
9919rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9920{
9921 VALUE orig = str;
9922 rb_encoding *enc = get_encoding(str);
9923 const char *ptr0, *ptr, *end;
9924
9925 if (!rb_enc_unicode_p(enc)) {
9926 return rb_str_enumerate_chars(str, ary);
9927 }
9928
9929 if (!ary) str = rb_str_new_frozen(str);
9930
9931 bool cached_reg_grapheme_cluster = true;
9932 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9933 if (!reg_grapheme_cluster) {
9934 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9935 cached_reg_grapheme_cluster = false;
9936 }
9937
9938 ptr0 = ptr = RSTRING_PTR(str);
9939 end = RSTRING_END(str);
9940
9941 while (ptr < end) {
9942 OnigPosition len = onig_match(reg_grapheme_cluster,
9943 (const OnigUChar *)ptr, (const OnigUChar *)end,
9944 (const OnigUChar *)ptr, NULL, 0);
9945 if (len <= 0) break;
9946 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9947 ptr += len;
9948 }
9949
9950 if (!cached_reg_grapheme_cluster) {
9951 onig_free(reg_grapheme_cluster);
9952 }
9953
9954 RB_GC_GUARD(str);
9955 if (ary)
9956 return ary;
9957 else
9958 return orig;
9959}
9960
9961/*
9962 * call-seq:
9963 * each_grapheme_cluster {|gc| ... } -> self
9964 * each_grapheme_cluster -> enumerator
9965 *
9966 * :include: doc/string/each_grapheme_cluster.rdoc
9967 *
9968 */
9969
9970static VALUE
9971rb_str_each_grapheme_cluster(VALUE str)
9972{
9973 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9974 return rb_str_enumerate_grapheme_clusters(str, 0);
9975}
9976
9977/*
9978 * call-seq:
9979 * grapheme_clusters -> array_of_grapheme_clusters
9980 *
9981 * :include: doc/string/grapheme_clusters.rdoc
9982 *
9983 */
9984
9985static VALUE
9986rb_str_grapheme_clusters(VALUE str)
9987{
9988 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9989 return rb_str_enumerate_grapheme_clusters(str, ary);
9990}
9991
9992static long
9993chopped_length(VALUE str)
9994{
9995 rb_encoding *enc = STR_ENC_GET(str);
9996 const char *p, *p2, *beg, *end;
9997
9998 beg = RSTRING_PTR(str);
9999 end = beg + RSTRING_LEN(str);
10000 if (beg >= end) return 0;
10001 p = rb_enc_prev_char(beg, end, end, enc);
10002 if (!p) return 0;
10003 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10004 p2 = rb_enc_prev_char(beg, p, end, enc);
10005 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10006 }
10007 return p - beg;
10008}
10009
10010/*
10011 * call-seq:
10012 * chop! -> self or nil
10013 *
10014 * Like String#chop, but modifies +self+ in place;
10015 * returns +nil+ if +self+ is empty, +self+ otherwise.
10016 *
10017 * Related: String#chomp!.
10018 */
10019
10020static VALUE
10021rb_str_chop_bang(VALUE str)
10022{
10023 str_modify_keep_cr(str);
10024 if (RSTRING_LEN(str) > 0) {
10025 long len;
10026 len = chopped_length(str);
10027 STR_SET_LEN(str, len);
10028 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10029 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10031 }
10032 return str;
10033 }
10034 return Qnil;
10035}
10036
10037
10038/*
10039 * call-seq:
10040 * chop -> new_string
10041 *
10042 * :include: doc/string/chop.rdoc
10043 *
10044 */
10045
10046static VALUE
10047rb_str_chop(VALUE str)
10048{
10049 return rb_str_subseq(str, 0, chopped_length(str));
10050}
10051
10052static long
10053smart_chomp(VALUE str, const char *e, const char *p)
10054{
10055 rb_encoding *enc = rb_enc_get(str);
10056 if (rb_enc_mbminlen(enc) > 1) {
10057 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10058 if (rb_enc_is_newline(pp, e, enc)) {
10059 e = pp;
10060 }
10061 pp = e - rb_enc_mbminlen(enc);
10062 if (pp >= p) {
10063 pp = rb_enc_left_char_head(p, pp, e, enc);
10064 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10065 e = pp;
10066 }
10067 }
10068 }
10069 else {
10070 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10071 case '\n':
10072 if (--e > p && *(e-1) == '\r') {
10073 --e;
10074 }
10075 break;
10076 case '\r':
10077 --e;
10078 break;
10079 }
10080 }
10081 return e - p;
10082}
10083
10084static long
10085chompped_length(VALUE str, VALUE rs)
10086{
10087 rb_encoding *enc;
10088 int newline;
10089 char *pp, *e, *rsptr;
10090 long rslen;
10091 char *const p = RSTRING_PTR(str);
10092 long len = RSTRING_LEN(str);
10093
10094 if (len == 0) return 0;
10095 e = p + len;
10096 if (rs == rb_default_rs) {
10097 return smart_chomp(str, e, p);
10098 }
10099
10100 enc = rb_enc_get(str);
10101 RSTRING_GETMEM(rs, rsptr, rslen);
10102 if (rslen == 0) {
10103 if (rb_enc_mbminlen(enc) > 1) {
10104 while (e > p) {
10105 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10106 if (!rb_enc_is_newline(pp, e, enc)) break;
10107 e = pp;
10108 pp -= rb_enc_mbminlen(enc);
10109 if (pp >= p) {
10110 pp = rb_enc_left_char_head(p, pp, e, enc);
10111 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10112 e = pp;
10113 }
10114 }
10115 }
10116 }
10117 else {
10118 while (e > p && *(e-1) == '\n') {
10119 --e;
10120 if (e > p && *(e-1) == '\r')
10121 --e;
10122 }
10123 }
10124 return e - p;
10125 }
10126 if (rslen > len) return len;
10127
10128 enc = rb_enc_get(rs);
10129 newline = rsptr[rslen-1];
10130 if (rslen == rb_enc_mbminlen(enc)) {
10131 if (rslen == 1) {
10132 if (newline == '\n')
10133 return smart_chomp(str, e, p);
10134 }
10135 else {
10136 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10137 return smart_chomp(str, e, p);
10138 }
10139 }
10140
10141 enc = rb_enc_check(str, rs);
10142 if (is_broken_string(rs)) {
10143 return len;
10144 }
10145 pp = e - rslen;
10146 if (p[len-1] == newline &&
10147 (rslen <= 1 ||
10148 memcmp(rsptr, pp, rslen) == 0)) {
10149 if (at_char_boundary(p, pp, e, enc))
10150 return len - rslen;
10151 RB_GC_GUARD(rs);
10152 }
10153 return len;
10154}
10155
10161static VALUE
10162chomp_rs(int argc, const VALUE *argv)
10163{
10164 rb_check_arity(argc, 0, 1);
10165 if (argc > 0) {
10166 VALUE rs = argv[0];
10167 if (!NIL_P(rs)) StringValue(rs);
10168 return rs;
10169 }
10170 else {
10171 return rb_rs;
10172 }
10173}
10174
10175VALUE
10176rb_str_chomp_string(VALUE str, VALUE rs)
10177{
10178 long olen = RSTRING_LEN(str);
10179 long len = chompped_length(str, rs);
10180 if (len >= olen) return Qnil;
10181 str_modify_keep_cr(str);
10182 STR_SET_LEN(str, len);
10183 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10184 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10186 }
10187 return str;
10188}
10189
10190/*
10191 * call-seq:
10192 * chomp!(line_sep = $/) -> self or nil
10193 *
10194 * Like String#chomp, but modifies +self+ in place;
10195 * returns +nil+ if no modification made, +self+ otherwise.
10196 *
10197 */
10198
10199static VALUE
10200rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10201{
10202 VALUE rs;
10203 str_modifiable(str);
10204 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10205 rs = chomp_rs(argc, argv);
10206 if (NIL_P(rs)) return Qnil;
10207 return rb_str_chomp_string(str, rs);
10208}
10209
10210
10211/*
10212 * call-seq:
10213 * chomp(line_sep = $/) -> new_string
10214 *
10215 * :include: doc/string/chomp.rdoc
10216 *
10217 */
10218
10219static VALUE
10220rb_str_chomp(int argc, VALUE *argv, VALUE str)
10221{
10222 VALUE rs = chomp_rs(argc, argv);
10223 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10224 return rb_str_subseq(str, 0, chompped_length(str, rs));
10225}
10226
10227static long
10228lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10229{
10230 const char *const start = s;
10231
10232 if (!s || s >= e) return 0;
10233
10234 /* remove spaces at head */
10235 if (single_byte_optimizable(str)) {
10236 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10237 }
10238 else {
10239 while (s < e) {
10240 int n;
10241 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10242
10243 if (cc && !rb_isspace(cc)) break;
10244 s += n;
10245 }
10246 }
10247 return s - start;
10248}
10249
/*
 *  call-seq:
 *    lstrip! -> self or nil
 *
 *  Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#rstrip!, String#strip!.
 */
10259
10260static VALUE
10261rb_str_lstrip_bang(VALUE str)
10262{
10263 rb_encoding *enc;
10264 char *start, *s;
10265 long olen, loffset;
10266
10267 str_modify_keep_cr(str);
10268 enc = STR_ENC_GET(str);
10269 RSTRING_GETMEM(str, start, olen);
10270 loffset = lstrip_offset(str, start, start+olen, enc);
10271 if (loffset > 0) {
10272 long len = olen-loffset;
10273 s = start + loffset;
10274 memmove(start, s, len);
10275 STR_SET_LEN(str, len);
10276 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10277 return str;
10278 }
10279 return Qnil;
10280}
10281
10282
10283/*
10284 * call-seq:
10285 * lstrip -> new_string
10286 *
10287 * Returns a copy of +self+ with leading whitespace removed;
10288 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10289 *
10290 * whitespace = "\x00\t\n\v\f\r "
10291 * s = whitespace + 'abc' + whitespace
10292 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10293 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10294 *
10295 * Related: String#rstrip, String#strip.
10296 */
10297
10298static VALUE
10299rb_str_lstrip(VALUE str)
10300{
10301 char *start;
10302 long len, loffset;
10303 RSTRING_GETMEM(str, start, len);
10304 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10305 if (loffset <= 0) return str_duplicate(rb_cString, str);
10306 return rb_str_subseq(str, loffset, len - loffset);
10307}
10308
10309static long
10310rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10311{
10312 const char *t;
10313
10314 rb_str_check_dummy_enc(enc);
10316 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10317 }
10318 if (!s || s >= e) return 0;
10319 t = e;
10320
10321 /* remove trailing spaces or '\0's */
10322 if (single_byte_optimizable(str)) {
10323 unsigned char c;
10324 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10325 }
10326 else {
10327 char *tp;
10328
10329 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10330 unsigned int c = rb_enc_codepoint(tp, e, enc);
10331 if (c && !rb_isspace(c)) break;
10332 t = tp;
10333 }
10334 }
10335 return e - t;
10336}
10337
/*
 *  call-seq:
 *    rstrip! -> self or nil
 *
 *  Like String#rstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#strip!.
 */
10347
10348static VALUE
10349rb_str_rstrip_bang(VALUE str)
10350{
10351 rb_encoding *enc;
10352 char *start;
10353 long olen, roffset;
10354
10355 str_modify_keep_cr(str);
10356 enc = STR_ENC_GET(str);
10357 RSTRING_GETMEM(str, start, olen);
10358 roffset = rstrip_offset(str, start, start+olen, enc);
10359 if (roffset > 0) {
10360 long len = olen - roffset;
10361
10362 STR_SET_LEN(str, len);
10363 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10364 return str;
10365 }
10366 return Qnil;
10367}
10368
10369
10370/*
10371 * call-seq:
10372 * rstrip -> new_string
10373 *
10374 * Returns a copy of the receiver with trailing whitespace removed;
10375 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10376 *
10377 * whitespace = "\x00\t\n\v\f\r "
10378 * s = whitespace + 'abc' + whitespace
10379 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10380 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10381 *
10382 * Related: String#lstrip, String#strip.
10383 */
10384
10385static VALUE
10386rb_str_rstrip(VALUE str)
10387{
10388 rb_encoding *enc;
10389 char *start;
10390 long olen, roffset;
10391
10392 enc = STR_ENC_GET(str);
10393 RSTRING_GETMEM(str, start, olen);
10394 roffset = rstrip_offset(str, start, start+olen, enc);
10395
10396 if (roffset <= 0) return str_duplicate(rb_cString, str);
10397 return rb_str_subseq(str, 0, olen-roffset);
10398}
10399
10400
/*
 *  call-seq:
 *    strip! -> self or nil
 *
 *  Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#rstrip!.
 */
10410
10411static VALUE
10412rb_str_strip_bang(VALUE str)
10413{
10414 char *start;
10415 long olen, loffset, roffset;
10416 rb_encoding *enc;
10417
10418 str_modify_keep_cr(str);
10419 enc = STR_ENC_GET(str);
10420 RSTRING_GETMEM(str, start, olen);
10421 loffset = lstrip_offset(str, start, start+olen, enc);
10422 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10423
10424 if (loffset > 0 || roffset > 0) {
10425 long len = olen-roffset;
10426 if (loffset > 0) {
10427 len -= loffset;
10428 memmove(start, start + loffset, len);
10429 }
10430 STR_SET_LEN(str, len);
10431 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10432 return str;
10433 }
10434 return Qnil;
10435}
10436
10437
10438/*
10439 * call-seq:
10440 * strip -> new_string
10441 *
10442 * Returns a copy of the receiver with leading and trailing whitespace removed;
10443 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10444 *
10445 * whitespace = "\x00\t\n\v\f\r "
10446 * s = whitespace + 'abc' + whitespace
10447 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10448 * s.strip # => "abc"
10449 *
10450 * Related: String#lstrip, String#rstrip.
10451 */
10452
10453static VALUE
10454rb_str_strip(VALUE str)
10455{
10456 char *start;
10457 long olen, loffset, roffset;
10458 rb_encoding *enc = STR_ENC_GET(str);
10459
10460 RSTRING_GETMEM(str, start, olen);
10461 loffset = lstrip_offset(str, start, start+olen, enc);
10462 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10463
10464 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10465 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10466}
10467
10468static VALUE
10469scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10470{
10471 VALUE result = Qnil;
10472 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10473 if (pos >= 0) {
10474 VALUE match;
10475 struct re_registers *regs;
10476 if (BUILTIN_TYPE(pat) == T_STRING) {
10477 regs = NULL;
10478 end = pos + RSTRING_LEN(pat);
10479 }
10480 else {
10481 match = rb_backref_get();
10482 regs = RMATCH_REGS(match);
10483 pos = BEG(0);
10484 end = END(0);
10485 }
10486
10487 if (pos == end) {
10488 rb_encoding *enc = STR_ENC_GET(str);
10489 /*
10490 * Always consume at least one character of the input string
10491 */
10492 if (RSTRING_LEN(str) > end)
10493 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10494 RSTRING_END(str), enc);
10495 else
10496 *start = end + 1;
10497 }
10498 else {
10499 *start = end;
10500 }
10501
10502 if (!regs || regs->num_regs == 1) {
10503 result = rb_str_subseq(str, pos, end - pos);
10504 return result;
10505 }
10506 else {
10507 result = rb_ary_new2(regs->num_regs);
10508 for (int i = 1; i < regs->num_regs; i++) {
10509 VALUE s = Qnil;
10510 if (BEG(i) >= 0) {
10511 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10512 }
10513
10514 rb_ary_push(result, s);
10515 }
10516 }
10517
10518 RB_GC_GUARD(match);
10519 }
10520
10521 return result;
10522}
10523
10524
10525/*
10526 * call-seq:
10527 * scan(string_or_regexp) -> array
10528 * scan(string_or_regexp) {|matches| ... } -> self
10529 *
10530 * Matches a pattern against +self+; the pattern is:
10531 *
10532 * - +string_or_regexp+ itself, if it is a Regexp.
10533 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10534 *
10535 * Iterates through +self+, generating a collection of matching results:
10536 *
10537 * - If the pattern contains no groups, each result is the
10538 * matched string, <code>$&</code>.
10539 * - If the pattern contains groups, each result is an array
10540 * containing one entry per group.
10541 *
10542 * With no block given, returns an array of the results:
10543 *
10544 * s = 'cruel world'
10545 * s.scan(/\w+/) # => ["cruel", "world"]
10546 * s.scan(/.../) # => ["cru", "el ", "wor"]
10547 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10548 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10549 *
10550 * With a block given, calls the block with each result; returns +self+:
10551 *
10552 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10553 * print "\n"
10554 * s.scan(/(.)(.)/) {|x,y| print y, x }
10555 * print "\n"
10556 *
10557 * Output:
10558 *
10559 * <<cruel>> <<world>>
10560 * rceu lowlr
10561 *
10562 */
10563
10564static VALUE
10565rb_str_scan(VALUE str, VALUE pat)
10566{
10567 VALUE result;
10568 long start = 0;
10569 long last = -1, prev = 0;
10570 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10571
10572 pat = get_pat_quoted(pat, 1);
10573 mustnot_broken(str);
10574 if (!rb_block_given_p()) {
10575 VALUE ary = rb_ary_new();
10576
10577 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10578 last = prev;
10579 prev = start;
10580 rb_ary_push(ary, result);
10581 }
10582 if (last >= 0) rb_pat_search(pat, str, last, 1);
10583 else rb_backref_set(Qnil);
10584 return ary;
10585 }
10586
10587 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10588 last = prev;
10589 prev = start;
10590 rb_yield(result);
10591 str_mod_check(str, p, len);
10592 }
10593 if (last >= 0) rb_pat_search(pat, str, last, 1);
10594 return str;
10595}
10596
10597
10598/*
10599 * call-seq:
10600 * hex -> integer
10601 *
10602 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10603 * (with an optional sign and an optional <code>0x</code>) and returns the
10604 * corresponding number;
10605 * returns zero if there is no such leading substring:
10606 *
10607 * '0x0a'.hex # => 10
10608 * '-1234'.hex # => -4660
10609 * '0'.hex # => 0
10610 * 'non-numeric'.hex # => 0
10611 *
10612 * Related: String#oct.
10613 *
10614 */
10615
10616static VALUE
10617rb_str_hex(VALUE str)
10618{
10619 return rb_str_to_inum(str, 16, FALSE);
10620}
10621
10622
10623/*
10624 * call-seq:
10625 * oct -> integer
10626 *
10627 * Interprets the leading substring of +self+ as a string of octal digits
10628 * (with an optional sign) and returns the corresponding number;
10629 * returns zero if there is no such leading substring:
10630 *
10631 * '123'.oct # => 83
10632 * '-377'.oct # => -255
10633 * '0377non-numeric'.oct # => 255
10634 * 'non-numeric'.oct # => 0
10635 *
10636 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10637 * see Kernel#Integer.
10638 *
10639 * Related: String#hex.
10640 *
10641 */
10642
10643static VALUE
10644rb_str_oct(VALUE str)
10645{
10646 return rb_str_to_inum(str, -8, FALSE);
10647}
10648
10649#ifndef HAVE_CRYPT_R
10650# include "ruby/thread_native.h"
10651# include "ruby/atomic.h"
10652
10653static struct {
10654 rb_nativethread_lock_t lock;
10655} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10656
10657static void
10658crypt_mutex_initialize(void)
10659{
10660}
10661#endif
10662
10663/*
10664 * call-seq:
10665 * crypt(salt_str) -> new_string
10666 *
10667 * Returns the string generated by calling <code>crypt(3)</code>
10668 * standard library function with <code>str</code> and
10669 * <code>salt_str</code>, in this order, as its arguments. Please do
10670 * not use this method any longer. It is legacy; provided only for
10671 * backward compatibility with ruby scripts in earlier days. It is
10672 * bad to use in contemporary programs for several reasons:
10673 *
10674 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10675 * run. The generated string lacks data portability.
10676 *
10677 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10678 * (i.e. silently ends up in unexpected results).
10679 *
10680 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10681 * thread safe.
10682 *
10683 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10684 * very very weak. According to its manpage, Linux's traditional
10685 * <code>crypt(3)</code> output has only 2**56 variations; too
10686 * easy to brute force today. And this is the default behaviour.
10687 *
10688 * * In order to make things robust some OSes implement so-called
10689 * "modular" usage. To go through, you have to do a complex
10690 * build-up of the <code>salt_str</code> parameter, by hand.
10691 * Failure in generation of a proper salt string tends not to
10692 * yield any errors; typos in parameters are normally not
10693 * detectable.
10694 *
10695 * * For instance, in the following example, the second invocation
10696 * of String#crypt is wrong; it has a typo in "round=" (lacks
10697 * "s"). However the call does not fail and something unexpected
10698 * is generated.
10699 *
10700 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10701 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10702 *
10703 * * Even in the "modular" mode, some hash functions are considered
10704 * archaic and no longer recommended at all; for instance module
10705 * <code>$1$</code> is officially abandoned by its author: see
10706 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10707 * instance module <code>$3$</code> is considered completely
10708 * broken: see the manpage of FreeBSD.
10709 *
10710 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10711 * written above, <code>crypt(3)</code> on Mac OS never fails.
10712 * This means even if you build up a proper salt string it
10713 * generates a traditional DES hash anyways, and there is no way
10714 * for you to be aware of.
10715 *
10716 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10717 *
10718 * If for some reason you cannot migrate to other secure contemporary
10719 * password hashing algorithms, install the string-crypt gem and
10720 * <code>require 'string/crypt'</code> to continue using it.
10721 */
10722
10723static VALUE
10724rb_str_crypt(VALUE str, VALUE salt)
10725{
10726#ifdef HAVE_CRYPT_R
10727 VALUE databuf;
10728 struct crypt_data *data;
10729# define CRYPT_END() ALLOCV_END(databuf)
10730#else
10731 extern char *crypt(const char *, const char *);
10732# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10733#endif
10734 VALUE result;
10735 const char *s, *saltp;
10736 char *res;
10737#ifdef BROKEN_CRYPT
10738 char salt_8bit_clean[3];
10739#endif
10740
10741 StringValue(salt);
10742 mustnot_wchar(str);
10743 mustnot_wchar(salt);
10744 s = StringValueCStr(str);
10745 saltp = RSTRING_PTR(salt);
10746 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10747 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10748 }
10749
10750#ifdef BROKEN_CRYPT
10751 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10752 salt_8bit_clean[0] = saltp[0] & 0x7f;
10753 salt_8bit_clean[1] = saltp[1] & 0x7f;
10754 salt_8bit_clean[2] = '\0';
10755 saltp = salt_8bit_clean;
10756 }
10757#endif
10758#ifdef HAVE_CRYPT_R
10759 data = ALLOCV(databuf, sizeof(struct crypt_data));
10760# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10761 data->initialized = 0;
10762# endif
10763 res = crypt_r(s, saltp, data);
10764#else
10765 crypt_mutex_initialize();
10766 rb_nativethread_lock_lock(&crypt_mutex.lock);
10767 res = crypt(s, saltp);
10768#endif
10769 if (!res) {
10770 int err = errno;
10771 CRYPT_END();
10772 rb_syserr_fail(err, "crypt");
10773 }
10774 result = rb_str_new_cstr(res);
10775 CRYPT_END();
10776 return result;
10777}
10778
10779
10780/*
10781 * call-seq:
10782 * ord -> integer
10783 *
10784 * :include: doc/string/ord.rdoc
10785 *
10786 */
10787
10788static VALUE
10789rb_str_ord(VALUE s)
10790{
10791 unsigned int c;
10792
10793 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10794 return UINT2NUM(c);
10795}
10796/*
10797 * call-seq:
10798 * sum(n = 16) -> integer
10799 *
10800 * :include: doc/string/sum.rdoc
10801 *
10802 */
10803
10804static VALUE
10805rb_str_sum(int argc, VALUE *argv, VALUE str)
10806{
10807 int bits = 16;
10808 char *ptr, *p, *pend;
10809 long len;
10810 VALUE sum = INT2FIX(0);
10811 unsigned long sum0 = 0;
10812
10813 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10814 bits = 0;
10815 }
10816 ptr = p = RSTRING_PTR(str);
10817 len = RSTRING_LEN(str);
10818 pend = p + len;
10819
10820 while (p < pend) {
10821 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10822 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10823 str_mod_check(str, ptr, len);
10824 sum0 = 0;
10825 }
10826 sum0 += (unsigned char)*p;
10827 p++;
10828 }
10829
10830 if (bits == 0) {
10831 if (sum0) {
10832 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10833 }
10834 }
10835 else {
10836 if (sum == INT2FIX(0)) {
10837 if (bits < (int)sizeof(long)*CHAR_BIT) {
10838 sum0 &= (((unsigned long)1)<<bits)-1;
10839 }
10840 sum = LONG2FIX(sum0);
10841 }
10842 else {
10843 VALUE mod;
10844
10845 if (sum0) {
10846 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10847 }
10848
10849 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10850 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10851 sum = rb_funcall(sum, '&', 1, mod);
10852 }
10853 }
10854 return sum;
10855}
10856
10857static VALUE
10858rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10859{
10860 rb_encoding *enc;
10861 VALUE w;
10862 long width, len, flen = 1, fclen = 1;
10863 VALUE res;
10864 char *p;
10865 const char *f = " ";
10866 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10867 VALUE pad;
10868 int singlebyte = 1, cr;
10869 int termlen;
10870
10871 rb_scan_args(argc, argv, "11", &w, &pad);
10872 enc = STR_ENC_GET(str);
10873 termlen = rb_enc_mbminlen(enc);
10874 width = NUM2LONG(w);
10875 if (argc == 2) {
10876 StringValue(pad);
10877 enc = rb_enc_check(str, pad);
10878 f = RSTRING_PTR(pad);
10879 flen = RSTRING_LEN(pad);
10880 fclen = str_strlen(pad, enc); /* rb_enc_check */
10881 singlebyte = single_byte_optimizable(pad);
10882 if (flen == 0 || fclen == 0) {
10883 rb_raise(rb_eArgError, "zero width padding");
10884 }
10885 }
10886 len = str_strlen(str, enc); /* rb_enc_check */
10887 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10888 n = width - len;
10889 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10890 rlen = n - llen;
10891 cr = ENC_CODERANGE(str);
10892 if (flen > 1) {
10893 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10894 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10895 }
10896 size = RSTRING_LEN(str);
10897 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10898 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10899 (len += llen2 + rlen2) >= LONG_MAX - size) {
10900 rb_raise(rb_eArgError, "argument too big");
10901 }
10902 len += size;
10903 res = str_enc_new(rb_cString, 0, len, enc);
10904 p = RSTRING_PTR(res);
10905 if (flen <= 1) {
10906 memset(p, *f, llen);
10907 p += llen;
10908 }
10909 else {
10910 while (llen >= fclen) {
10911 memcpy(p,f,flen);
10912 p += flen;
10913 llen -= fclen;
10914 }
10915 if (llen > 0) {
10916 memcpy(p, f, llen2);
10917 p += llen2;
10918 }
10919 }
10920 memcpy(p, RSTRING_PTR(str), size);
10921 p += size;
10922 if (flen <= 1) {
10923 memset(p, *f, rlen);
10924 p += rlen;
10925 }
10926 else {
10927 while (rlen >= fclen) {
10928 memcpy(p,f,flen);
10929 p += flen;
10930 rlen -= fclen;
10931 }
10932 if (rlen > 0) {
10933 memcpy(p, f, rlen2);
10934 p += rlen2;
10935 }
10936 }
10937 TERM_FILL(p, termlen);
10938 STR_SET_LEN(res, p-RSTRING_PTR(res));
10939
10940 if (argc == 2)
10941 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10942 if (cr != ENC_CODERANGE_BROKEN)
10943 ENC_CODERANGE_SET(res, cr);
10944
10945 RB_GC_GUARD(pad);
10946 return res;
10947}
10948
10949
10950/*
10951 * call-seq:
10952 * ljust(size, pad_string = ' ') -> new_string
10953 *
10954 * :include: doc/string/ljust.rdoc
10955 *
10956 * Related: String#rjust, String#center.
10957 *
10958 */
10959
10960static VALUE
10961rb_str_ljust(int argc, VALUE *argv, VALUE str)
10962{
10963 return rb_str_justify(argc, argv, str, 'l');
10964}
10965
10966/*
10967 * call-seq:
10968 * rjust(size, pad_string = ' ') -> new_string
10969 *
10970 * :include: doc/string/rjust.rdoc
10971 *
10972 * Related: String#ljust, String#center.
10973 *
10974 */
10975
10976static VALUE
10977rb_str_rjust(int argc, VALUE *argv, VALUE str)
10978{
10979 return rb_str_justify(argc, argv, str, 'r');
10980}
10981
10982
10983/*
10984 * call-seq:
10985 * center(size, pad_string = ' ') -> new_string
10986 *
10987 * :include: doc/string/center.rdoc
10988 *
10989 * Related: String#ljust, String#rjust.
10990 *
10991 */
10992
10993static VALUE
10994rb_str_center(int argc, VALUE *argv, VALUE str)
10995{
10996 return rb_str_justify(argc, argv, str, 'c');
10997}
10998
10999/*
11000 * call-seq:
11001 * partition(string_or_regexp) -> [head, match, tail]
11002 *
11003 * :include: doc/string/partition.rdoc
11004 *
11005 */
11006
11007static VALUE
11008rb_str_partition(VALUE str, VALUE sep)
11009{
11010 long pos;
11011
11012 sep = get_pat_quoted(sep, 0);
11013 if (RB_TYPE_P(sep, T_REGEXP)) {
11014 if (rb_reg_search(sep, str, 0, 0) < 0) {
11015 goto failed;
11016 }
11017 VALUE match = rb_backref_get();
11018 struct re_registers *regs = RMATCH_REGS(match);
11019
11020 pos = BEG(0);
11021 sep = rb_str_subseq(str, pos, END(0) - pos);
11022 }
11023 else {
11024 pos = rb_str_index(str, sep, 0);
11025 if (pos < 0) goto failed;
11026 }
11027 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11028 sep,
11029 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11030 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11031
11032 failed:
11033 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11034}
11035
11036/*
11037 * call-seq:
11038 * rpartition(sep) -> [head, match, tail]
11039 *
11040 * :include: doc/string/rpartition.rdoc
11041 *
11042 */
11043
11044static VALUE
11045rb_str_rpartition(VALUE str, VALUE sep)
11046{
11047 long pos = RSTRING_LEN(str);
11048
11049 sep = get_pat_quoted(sep, 0);
11050 if (RB_TYPE_P(sep, T_REGEXP)) {
11051 if (rb_reg_search(sep, str, pos, 1) < 0) {
11052 goto failed;
11053 }
11054 VALUE match = rb_backref_get();
11055 struct re_registers *regs = RMATCH_REGS(match);
11056
11057 pos = BEG(0);
11058 sep = rb_str_subseq(str, pos, END(0) - pos);
11059 }
11060 else {
11061 pos = rb_str_sublen(str, pos);
11062 pos = rb_str_rindex(str, sep, pos);
11063 if (pos < 0) {
11064 goto failed;
11065 }
11066 }
11067
11068 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11069 sep,
11070 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11071 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11072 failed:
11073 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11074}
11075
11076/*
11077 * call-seq:
11078 * start_with?(*string_or_regexp) -> true or false
11079 *
11080 * :include: doc/string/start_with_p.rdoc
11081 *
11082 */
11083
11084static VALUE
11085rb_str_start_with(int argc, VALUE *argv, VALUE str)
11086{
11087 int i;
11088
11089 for (i=0; i<argc; i++) {
11090 VALUE tmp = argv[i];
11091 if (RB_TYPE_P(tmp, T_REGEXP)) {
11092 if (rb_reg_start_with_p(tmp, str))
11093 return Qtrue;
11094 }
11095 else {
11096 const char *p, *s, *e;
11097 long slen, tlen;
11098 rb_encoding *enc;
11099
11100 StringValue(tmp);
11101 enc = rb_enc_check(str, tmp);
11102 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11103 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11104 p = RSTRING_PTR(str);
11105 e = p + slen;
11106 s = p + tlen;
11107 if (!at_char_right_boundary(p, s, e, enc))
11108 continue;
11109 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11110 return Qtrue;
11111 }
11112 }
11113 return Qfalse;
11114}
11115
11116/*
11117 * call-seq:
11118 * end_with?(*strings) -> true or false
11119 *
11120 * :include: doc/string/end_with_p.rdoc
11121 *
11122 */
11123
11124static VALUE
11125rb_str_end_with(int argc, VALUE *argv, VALUE str)
11126{
11127 int i;
11128
11129 for (i=0; i<argc; i++) {
11130 VALUE tmp = argv[i];
11131 const char *p, *s, *e;
11132 long slen, tlen;
11133 rb_encoding *enc;
11134
11135 StringValue(tmp);
11136 enc = rb_enc_check(str, tmp);
11137 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11138 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11139 p = RSTRING_PTR(str);
11140 e = p + slen;
11141 s = e - tlen;
11142 if (!at_char_boundary(p, s, e, enc))
11143 continue;
11144 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11145 return Qtrue;
11146 }
11147 return Qfalse;
11148}
11149
11159static long
11160deleted_prefix_length(VALUE str, VALUE prefix)
11161{
11162 const char *strptr, *prefixptr;
11163 long olen, prefixlen;
11164 rb_encoding *enc = rb_enc_get(str);
11165
11166 StringValue(prefix);
11167
11168 if (!is_broken_string(prefix) ||
11169 !rb_enc_asciicompat(enc) ||
11170 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11171 enc = rb_enc_check(str, prefix);
11172 }
11173
11174 /* return 0 if not start with prefix */
11175 prefixlen = RSTRING_LEN(prefix);
11176 if (prefixlen <= 0) return 0;
11177 olen = RSTRING_LEN(str);
11178 if (olen < prefixlen) return 0;
11179 strptr = RSTRING_PTR(str);
11180 prefixptr = RSTRING_PTR(prefix);
11181 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11182 if (is_broken_string(prefix)) {
11183 if (!is_broken_string(str)) {
11184 /* prefix in a valid string cannot be broken */
11185 return 0;
11186 }
11187 const char *strend = strptr + olen;
11188 const char *after_prefix = strptr + prefixlen;
11189 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11190 /* prefix does not end at char-boundary */
11191 return 0;
11192 }
11193 }
11194 /* prefix part in `str` also should be valid. */
11195
11196 return prefixlen;
11197}
11198
11199/*
11200 * call-seq:
11201 * delete_prefix!(prefix) -> self or nil
11202 *
11203 * Like String#delete_prefix, except that +self+ is modified in place.
11204 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11205 *
11206 */
11207
11208static VALUE
11209rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11210{
11211 long prefixlen;
11212 str_modify_keep_cr(str);
11213
11214 prefixlen = deleted_prefix_length(str, prefix);
11215 if (prefixlen <= 0) return Qnil;
11216
11217 return rb_str_drop_bytes(str, prefixlen);
11218}
11219
11220/*
11221 * call-seq:
11222 * delete_prefix(prefix) -> new_string
11223 *
11224 * :include: doc/string/delete_prefix.rdoc
11225 *
11226 */
11227
11228static VALUE
11229rb_str_delete_prefix(VALUE str, VALUE prefix)
11230{
11231 long prefixlen;
11232
11233 prefixlen = deleted_prefix_length(str, prefix);
11234 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11235
11236 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11237}
11238
11248static long
11249deleted_suffix_length(VALUE str, VALUE suffix)
11250{
11251 const char *strptr, *suffixptr;
11252 long olen, suffixlen;
11253 rb_encoding *enc;
11254
11255 StringValue(suffix);
11256 if (is_broken_string(suffix)) return 0;
11257 enc = rb_enc_check(str, suffix);
11258
11259 /* return 0 if not start with suffix */
11260 suffixlen = RSTRING_LEN(suffix);
11261 if (suffixlen <= 0) return 0;
11262 olen = RSTRING_LEN(str);
11263 if (olen < suffixlen) return 0;
11264 strptr = RSTRING_PTR(str);
11265 suffixptr = RSTRING_PTR(suffix);
11266 const char *strend = strptr + olen;
11267 const char *before_suffix = strend - suffixlen;
11268 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11269 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11270
11271 return suffixlen;
11272}
11273
11274/*
11275 * call-seq:
11276 * delete_suffix!(suffix) -> self or nil
11277 *
11278 * Like String#delete_suffix, except that +self+ is modified in place.
11279 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11280 *
11281 */
11282
11283static VALUE
11284rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11285{
11286 long olen, suffixlen, len;
11287 str_modifiable(str);
11288
11289 suffixlen = deleted_suffix_length(str, suffix);
11290 if (suffixlen <= 0) return Qnil;
11291
11292 olen = RSTRING_LEN(str);
11293 str_modify_keep_cr(str);
11294 len = olen - suffixlen;
11295 STR_SET_LEN(str, len);
11296 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11297 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11299 }
11300 return str;
11301}
11302
11303/*
11304 * call-seq:
11305 * delete_suffix(suffix) -> new_string
11306 *
11307 * :include: doc/string/delete_suffix.rdoc
11308 *
11309 */
11310
11311static VALUE
11312rb_str_delete_suffix(VALUE str, VALUE suffix)
11313{
11314 long suffixlen;
11315
11316 suffixlen = deleted_suffix_length(str, suffix);
11317 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11318
11319 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11320}
11321
11322void
11323rb_str_setter(VALUE val, ID id, VALUE *var)
11324{
11325 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11326 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11327 }
11328 *var = val;
11329}
11330
11331static void
11332rb_fs_setter(VALUE val, ID id, VALUE *var)
11333{
11334 val = rb_fs_check(val);
11335 if (!val) {
11336 rb_raise(rb_eTypeError,
11337 "value of %"PRIsVALUE" must be String or Regexp",
11338 rb_id2str(id));
11339 }
11340 if (!NIL_P(val)) {
11341 rb_warn_deprecated("'$;'", NULL);
11342 }
11343 *var = val;
11344}
11345
11346
11347/*
11348 * call-seq:
11349 * force_encoding(encoding) -> self
11350 *
11351 * :include: doc/string/force_encoding.rdoc
11352 *
11353 */
11354
11355static VALUE
11356rb_str_force_encoding(VALUE str, VALUE enc)
11357{
11358 str_modifiable(str);
11359
11360 rb_encoding *encoding = rb_to_encoding(enc);
11361 int idx = rb_enc_to_index(encoding);
11362
11363 // If the encoding is unchanged, we do nothing.
11364 if (ENCODING_GET(str) == idx) {
11365 return str;
11366 }
11367
11368 rb_enc_associate_index(str, idx);
11369
11370 // If the coderange was 7bit and the new encoding is ASCII-compatible
11371 // we can keep the coderange.
11372 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11373 return str;
11374 }
11375
11377 return str;
11378}
11379
11380/*
11381 * call-seq:
11382 * b -> string
11383 *
11384 * :include: doc/string/b.rdoc
11385 *
11386 */
11387
11388static VALUE
11389rb_str_b(VALUE str)
11390{
11391 VALUE str2;
11392 if (STR_EMBED_P(str)) {
11393 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11394 }
11395 else {
11396 str2 = str_alloc_heap(rb_cString);
11397 }
11398 str_replace_shared_without_enc(str2, str);
11399
11400 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11401 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11402 // If we know the receiver's code range then we know the result's code range.
11403 int cr = ENC_CODERANGE(str);
11404 switch (cr) {
11405 case ENC_CODERANGE_7BIT:
11407 break;
11411 break;
11412 default:
11413 ENC_CODERANGE_CLEAR(str2);
11414 break;
11415 }
11416 }
11417
11418 return str2;
11419}
11420
11421/*
11422 * call-seq:
11423 * valid_encoding? -> true or false
11424 *
11425 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11426 *
11427 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11428 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
11429 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
11430 */
11431
11432static VALUE
11433rb_str_valid_encoding_p(VALUE str)
11434{
11435 int cr = rb_enc_str_coderange(str);
11436
11437 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11438}
11439
11440/*
11441 * call-seq:
11442 * ascii_only? -> true or false
11443 *
11444 * Returns +true+ if +self+ contains only ASCII characters,
11445 * +false+ otherwise:
11446 *
11447 * 'abc'.ascii_only? # => true
11448 * "abc\u{6666}".ascii_only? # => false
11449 *
11450 */
11451
11452static VALUE
11453rb_str_is_ascii_only_p(VALUE str)
11454{
11455 int cr = rb_enc_str_coderange(str);
11456
11457 return RBOOL(cr == ENC_CODERANGE_7BIT);
11458}
11459
11460VALUE
11462{
11463 static const char ellipsis[] = "...";
11464 const long ellipsislen = sizeof(ellipsis) - 1;
11465 rb_encoding *const enc = rb_enc_get(str);
11466 const long blen = RSTRING_LEN(str);
11467 const char *const p = RSTRING_PTR(str), *e = p + blen;
11468 VALUE estr, ret = 0;
11469
11470 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11471 if (len * rb_enc_mbminlen(enc) >= blen ||
11472 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11473 ret = str;
11474 }
11475 else if (len <= ellipsislen ||
11476 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11477 if (rb_enc_asciicompat(enc)) {
11478 ret = rb_str_new(ellipsis, len);
11479 rb_enc_associate(ret, enc);
11480 }
11481 else {
11482 estr = rb_usascii_str_new(ellipsis, len);
11483 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11484 }
11485 }
11486 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11487 rb_str_cat(ret, ellipsis, ellipsislen);
11488 }
11489 else {
11490 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11491 rb_enc_from_encoding(enc), 0, Qnil);
11492 rb_str_append(ret, estr);
11493 }
11494 return ret;
11495}
11496
11497static VALUE
11498str_compat_and_valid(VALUE str, rb_encoding *enc)
11499{
11500 int cr;
11501 str = StringValue(str);
11502 cr = rb_enc_str_coderange(str);
11503 if (cr == ENC_CODERANGE_BROKEN) {
11504 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11505 }
11506 else {
11507 rb_encoding *e = STR_ENC_GET(str);
11508 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11509 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11510 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11511 }
11512 }
11513 return str;
11514}
11515
11516static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11517
11518VALUE
11520{
11521 rb_encoding *enc = STR_ENC_GET(str);
11522 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11523}
11524
11525VALUE
11526rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11527{
11528 int cr = ENC_CODERANGE_UNKNOWN;
11529 if (enc == STR_ENC_GET(str)) {
11530 /* cached coderange makes sense only when enc equals the
11531 * actual encoding of str */
11532 cr = ENC_CODERANGE(str);
11533 }
11534 return enc_str_scrub(enc, str, repl, cr);
11535}
11536
11537static VALUE
11538enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11539{
11540 int encidx;
11541 VALUE buf = Qnil;
11542 const char *rep, *p, *e, *p1, *sp;
11543 long replen = -1;
11544 long slen;
11545
11546 if (rb_block_given_p()) {
11547 if (!NIL_P(repl))
11548 rb_raise(rb_eArgError, "both of block and replacement given");
11549 replen = 0;
11550 }
11551
11552 if (ENC_CODERANGE_CLEAN_P(cr))
11553 return Qnil;
11554
11555 if (!NIL_P(repl)) {
11556 repl = str_compat_and_valid(repl, enc);
11557 }
11558
11559 if (rb_enc_dummy_p(enc)) {
11560 return Qnil;
11561 }
11562 encidx = rb_enc_to_index(enc);
11563
11564#define DEFAULT_REPLACE_CHAR(str) do { \
11565 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11566 rep = replace; replen = (int)sizeof(replace); \
11567 } while (0)
11568
11569 slen = RSTRING_LEN(str);
11570 p = RSTRING_PTR(str);
11571 e = RSTRING_END(str);
11572 p1 = p;
11573 sp = p;
11574
11575 if (rb_enc_asciicompat(enc)) {
11576 int rep7bit_p;
11577 if (!replen) {
11578 rep = NULL;
11579 rep7bit_p = FALSE;
11580 }
11581 else if (!NIL_P(repl)) {
11582 rep = RSTRING_PTR(repl);
11583 replen = RSTRING_LEN(repl);
11584 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11585 }
11586 else if (encidx == rb_utf8_encindex()) {
11587 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11588 rep7bit_p = FALSE;
11589 }
11590 else {
11591 DEFAULT_REPLACE_CHAR("?");
11592 rep7bit_p = TRUE;
11593 }
11594 cr = ENC_CODERANGE_7BIT;
11595
11596 p = search_nonascii(p, e);
11597 if (!p) {
11598 p = e;
11599 }
11600 while (p < e) {
11601 int ret = rb_enc_precise_mbclen(p, e, enc);
11602 if (MBCLEN_NEEDMORE_P(ret)) {
11603 break;
11604 }
11605 else if (MBCLEN_CHARFOUND_P(ret)) {
11607 p += MBCLEN_CHARFOUND_LEN(ret);
11608 }
11609 else if (MBCLEN_INVALID_P(ret)) {
11610 /*
11611 * p1~p: valid ascii/multibyte chars
11612 * p ~e: invalid bytes + unknown bytes
11613 */
11614 long clen = rb_enc_mbmaxlen(enc);
11615 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11616 if (p > p1) {
11617 rb_str_buf_cat(buf, p1, p - p1);
11618 }
11619
11620 if (e - p < clen) clen = e - p;
11621 if (clen <= 2) {
11622 clen = 1;
11623 }
11624 else {
11625 const char *q = p;
11626 clen--;
11627 for (; clen > 1; clen--) {
11628 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11629 if (MBCLEN_NEEDMORE_P(ret)) break;
11630 if (MBCLEN_INVALID_P(ret)) continue;
11632 }
11633 }
11634 if (rep) {
11635 rb_str_buf_cat(buf, rep, replen);
11636 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11637 }
11638 else {
11639 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11640 str_mod_check(str, sp, slen);
11641 repl = str_compat_and_valid(repl, enc);
11642 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11645 }
11646 p += clen;
11647 p1 = p;
11648 p = search_nonascii(p, e);
11649 if (!p) {
11650 p = e;
11651 break;
11652 }
11653 }
11654 else {
11656 }
11657 }
11658 if (NIL_P(buf)) {
11659 if (p == e) {
11660 ENC_CODERANGE_SET(str, cr);
11661 return Qnil;
11662 }
11663 buf = rb_str_buf_new(RSTRING_LEN(str));
11664 }
11665 if (p1 < p) {
11666 rb_str_buf_cat(buf, p1, p - p1);
11667 }
11668 if (p < e) {
11669 if (rep) {
11670 rb_str_buf_cat(buf, rep, replen);
11671 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11672 }
11673 else {
11674 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11675 str_mod_check(str, sp, slen);
11676 repl = str_compat_and_valid(repl, enc);
11677 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11680 }
11681 }
11682 }
11683 else {
11684 /* ASCII incompatible */
11685 long mbminlen = rb_enc_mbminlen(enc);
11686 if (!replen) {
11687 rep = NULL;
11688 }
11689 else if (!NIL_P(repl)) {
11690 rep = RSTRING_PTR(repl);
11691 replen = RSTRING_LEN(repl);
11692 }
11693 else if (encidx == ENCINDEX_UTF_16BE) {
11694 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11695 }
11696 else if (encidx == ENCINDEX_UTF_16LE) {
11697 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11698 }
11699 else if (encidx == ENCINDEX_UTF_32BE) {
11700 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11701 }
11702 else if (encidx == ENCINDEX_UTF_32LE) {
11703 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11704 }
11705 else {
11706 DEFAULT_REPLACE_CHAR("?");
11707 }
11708
11709 while (p < e) {
11710 int ret = rb_enc_precise_mbclen(p, e, enc);
11711 if (MBCLEN_NEEDMORE_P(ret)) {
11712 break;
11713 }
11714 else if (MBCLEN_CHARFOUND_P(ret)) {
11715 p += MBCLEN_CHARFOUND_LEN(ret);
11716 }
11717 else if (MBCLEN_INVALID_P(ret)) {
11718 const char *q = p;
11719 long clen = rb_enc_mbmaxlen(enc);
11720 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11721 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11722
11723 if (e - p < clen) clen = e - p;
11724 if (clen <= mbminlen * 2) {
11725 clen = mbminlen;
11726 }
11727 else {
11728 clen -= mbminlen;
11729 for (; clen > mbminlen; clen-=mbminlen) {
11730 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11731 if (MBCLEN_NEEDMORE_P(ret)) break;
11732 if (MBCLEN_INVALID_P(ret)) continue;
11734 }
11735 }
11736 if (rep) {
11737 rb_str_buf_cat(buf, rep, replen);
11738 }
11739 else {
11740 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11741 str_mod_check(str, sp, slen);
11742 repl = str_compat_and_valid(repl, enc);
11743 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11744 }
11745 p += clen;
11746 p1 = p;
11747 }
11748 else {
11750 }
11751 }
11752 if (NIL_P(buf)) {
11753 if (p == e) {
11755 return Qnil;
11756 }
11757 buf = rb_str_buf_new(RSTRING_LEN(str));
11758 }
11759 if (p1 < p) {
11760 rb_str_buf_cat(buf, p1, p - p1);
11761 }
11762 if (p < e) {
11763 if (rep) {
11764 rb_str_buf_cat(buf, rep, replen);
11765 }
11766 else {
11767 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11768 str_mod_check(str, sp, slen);
11769 repl = str_compat_and_valid(repl, enc);
11770 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11771 }
11772 }
11774 }
11775 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11776 return buf;
11777}
11778
11779/*
11780 * call-seq:
11781 * scrub(replacement_string = default_replacement) -> new_string
11782 * scrub{|bytes| ... } -> new_string
11783 *
11784 * :include: doc/string/scrub.rdoc
11785 *
11786 */
11787static VALUE
11788str_scrub(int argc, VALUE *argv, VALUE str)
11789{
11790 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11791 VALUE new = rb_str_scrub(str, repl);
11792 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11793}
11794
11795/*
11796 * call-seq:
11797 * scrub! -> self
11798 * scrub!(replacement_string = default_replacement) -> self
11799 * scrub!{|bytes| ... } -> self
11800 *
11801 * Like String#scrub, except that any replacements are made in +self+.
11802 *
11803 */
11804static VALUE
11805str_scrub_bang(int argc, VALUE *argv, VALUE str)
11806{
11807 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11808 VALUE new = rb_str_scrub(str, repl);
11809 if (!NIL_P(new)) rb_str_replace(str, new);
11810 return str;
11811}
11812
11813static ID id_normalize;
11814static ID id_normalized_p;
11815static VALUE mUnicodeNormalize;
11816
11817static VALUE
11818unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11819{
11820 static int UnicodeNormalizeRequired = 0;
11821 VALUE argv2[2];
11822
11823 if (!UnicodeNormalizeRequired) {
11824 rb_require("unicode_normalize/normalize.rb");
11825 UnicodeNormalizeRequired = 1;
11826 }
11827 argv2[0] = str;
11828 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11829 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11830}
11831
11832/*
11833 * call-seq:
11834 * unicode_normalize(form = :nfc) -> string
11835 *
11836 * Returns a copy of +self+ with
11837 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11838 *
11839 * Argument +form+ must be one of the following symbols
11840 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11841 *
11842 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11843 * - +:nfd+: Canonical decomposition.
11844 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11845 * - +:nfkd+: Compatibility decomposition.
11846 *
11847 * The encoding of +self+ must be one of:
11848 *
11849 * - Encoding::UTF_8
11850 * - Encoding::UTF_16BE
11851 * - Encoding::UTF_16LE
11852 * - Encoding::UTF_32BE
11853 * - Encoding::UTF_32LE
11854 * - Encoding::GB18030
11855 * - Encoding::UCS_2BE
11856 * - Encoding::UCS_4BE
11857 *
11858 * Examples:
11859 *
11860 * "a\u0300".unicode_normalize # => "\u00E0"
11861 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
11862 *
11863 * Related: String#unicode_normalize!, String#unicode_normalized?.
11864 */
11865static VALUE
11866rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11867{
11868 return unicode_normalize_common(argc, argv, str, id_normalize);
11869}
11870
11871/*
11872 * call-seq:
11873 * unicode_normalize!(form = :nfc) -> self
11874 *
11875 * Like String#unicode_normalize, except that the normalization
11876 * is performed on +self+.
11877 *
11878 * Related: String#unicode_normalized?.
11879 *
11880 */
11881static VALUE
11882rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11883{
11884 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11885}
11886
11887/* call-seq:
11888 * unicode_normalized?(form = :nfc) -> true or false
11889 *
11890 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11891 * +false+ otherwise.
11892 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11893 *
11894 * Examples:
11895 *
11896 * "a\u0300".unicode_normalized? # => false
11897 * "a\u0300".unicode_normalized?(:nfd) # => true
11898 * "\u00E0".unicode_normalized? # => true
11899 * "\u00E0".unicode_normalized?(:nfd) # => false
11900 *
11901 *
11902 * Raises an exception if +self+ is not in a Unicode encoding:
11903 *
11904 * s = "\xE0".force_encoding('ISO-8859-1')
11905 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11906 *
11907 * Related: String#unicode_normalize, String#unicode_normalize!.
11908 *
11909 */
11910static VALUE
11911rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11912{
11913 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11914}
11915
11916/**********************************************************************
11917 * Document-class: Symbol
11918 *
11919 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11920 *
11921 * You can create a +Symbol+ object explicitly with:
11922 *
11923 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11924 *
11925 * The same +Symbol+ object will be
11926 * created for a given name or string for the duration of a program's
11927 * execution, regardless of the context or meaning of that name. Thus
11928 * if <code>Fred</code> is a constant in one context, a method in
11929 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11930 * will be the same object in all three contexts.
11931 *
11932 * module One
11933 * class Fred
11934 * end
11935 * $f1 = :Fred
11936 * end
11937 * module Two
11938 * Fred = 1
11939 * $f2 = :Fred
11940 * end
11941 * def Fred()
11942 * end
11943 * $f3 = :Fred
11944 * $f1.object_id #=> 2514190
11945 * $f2.object_id #=> 2514190
11946 * $f3.object_id #=> 2514190
11947 *
11948 * Constant, method, and variable names are returned as symbols:
11949 *
11950 * module One
11951 * Two = 2
11952 * def three; 3 end
11953 * @four = 4
11954 * @@five = 5
11955 * $six = 6
11956 * end
11957 * seven = 7
11958 *
11959 * One.constants
11960 * # => [:Two]
11961 * One.instance_methods(true)
11962 * # => [:three]
11963 * One.instance_variables
11964 * # => [:@four]
11965 * One.class_variables
11966 * # => [:@@five]
11967 * global_variables.grep(/six/)
11968 * # => [:$six]
11969 * local_variables
11970 * # => [:seven]
11971 *
11972 * A +Symbol+ object differs from a String object in that
11973 * a +Symbol+ object represents an identifier, while a String object
11974 * represents text or data.
11975 *
11976 * == What's Here
11977 *
11978 * First, what's elsewhere. \Class +Symbol+:
11979 *
11980 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11981 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11982 *
11983 * Here, class +Symbol+ provides methods that are useful for:
11984 *
11985 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11986 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11987 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11988 *
11989 * === Methods for Querying
11990 *
11991 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11992 * - #=~: Returns the index of the first substring in symbol that matches a
11993 * given Regexp or other object; returns +nil+ if no match is found.
11994 * - #[], #slice : Returns a substring of symbol
11995 * determined by a given index, start/length, or range, or string.
11996 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11997 * - #encoding: Returns the Encoding object that represents the encoding
11998 * of symbol.
11999 * - #end_with?: Returns +true+ if symbol ends with
12000 * any of the given strings.
12001 * - #match: Returns a MatchData object if symbol
12002 * matches a given Regexp; +nil+ otherwise.
12003 * - #match?: Returns +true+ if symbol
12004 * matches a given Regexp; +false+ otherwise.
12005 * - #length, #size: Returns the number of characters in symbol.
12006 * - #start_with?: Returns +true+ if symbol starts with
12007 * any of the given strings.
12008 *
12009 * === Methods for Comparing
12010 *
12011 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12012 * or larger than symbol.
12013 * - #==, #===: Returns +true+ if a given symbol has the same content and
12014 * encoding.
12015 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12016 * symbol is smaller than, equal to, or larger than symbol.
12017 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12018 * after Unicode case folding; +false+ otherwise.
12019 *
12020 * === Methods for Converting
12021 *
12022 * - #capitalize: Returns symbol with the first character upcased
12023 * and all other characters downcased.
12024 * - #downcase: Returns symbol with all characters downcased.
12025 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12026 * - #name: Returns the frozen string corresponding to symbol.
12027 * - #succ, #next: Returns the symbol that is the successor to symbol.
12028 * - #swapcase: Returns symbol with all upcase characters downcased
12029 * and all downcase characters upcased.
12030 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12031 * - #to_s, #id2name: Returns the string corresponding to +self+.
12032 * - #to_sym, #intern: Returns +self+.
12033 * - #upcase: Returns symbol with all characters upcased.
12034 *
12035 */
12036
12037
12038/*
12039 * call-seq:
12040 * symbol == object -> true or false
12041 *
12042 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12043 */
12044
12045#define sym_equal rb_obj_equal
12046
12047static int
12048sym_printable(const char *s, const char *send, rb_encoding *enc)
12049{
12050 while (s < send) {
12051 int n;
12052 int c = rb_enc_precise_mbclen(s, send, enc);
12053
12054 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12055 n = MBCLEN_CHARFOUND_LEN(c);
12056 c = rb_enc_mbc_to_codepoint(s, send, enc);
12057 if (!rb_enc_isprint(c, enc)) return FALSE;
12058 s += n;
12059 }
12060 return TRUE;
12061}
12062
12063int
12064rb_str_symname_p(VALUE sym)
12065{
12066 rb_encoding *enc;
12067 const char *ptr;
12068 long len;
12069 rb_encoding *resenc = rb_default_internal_encoding();
12070
12071 if (resenc == NULL) resenc = rb_default_external_encoding();
12072 enc = STR_ENC_GET(sym);
12073 ptr = RSTRING_PTR(sym);
12074 len = RSTRING_LEN(sym);
12075 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12076 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12077 return FALSE;
12078 }
12079 return TRUE;
12080}
12081
12082VALUE
12083rb_str_quote_unprintable(VALUE str)
12084{
12085 rb_encoding *enc;
12086 const char *ptr;
12087 long len;
12088 rb_encoding *resenc;
12089
12090 Check_Type(str, T_STRING);
12092 if (resenc == NULL) resenc = rb_default_external_encoding();
12093 enc = STR_ENC_GET(str);
12094 ptr = RSTRING_PTR(str);
12095 len = RSTRING_LEN(str);
12096 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12097 !sym_printable(ptr, ptr + len, enc)) {
12098 return rb_str_escape(str);
12099 }
12100 return str;
12101}
12102
12103VALUE
12104rb_id_quote_unprintable(ID id)
12105{
12106 VALUE str = rb_id2str(id);
12107 if (!rb_str_symname_p(str)) {
12108 return rb_str_escape(str);
12109 }
12110 return str;
12111}
12112
12113/*
12114 * call-seq:
12115 * inspect -> string
12116 *
12117 * Returns a string representation of +self+ (including the leading colon):
12118 *
12119 * :foo.inspect # => ":foo"
12120 *
12121 * Related: Symbol#to_s, Symbol#name.
12122 *
12123 */
12124
12125static VALUE
12126sym_inspect(VALUE sym)
12127{
12128 VALUE str = rb_sym2str(sym);
12129 const char *ptr;
12130 long len;
12131 char *dest;
12132
12133 if (!rb_str_symname_p(str)) {
12134 str = rb_str_inspect(str);
12135 len = RSTRING_LEN(str);
12136 rb_str_resize(str, len + 1);
12137 dest = RSTRING_PTR(str);
12138 memmove(dest + 1, dest, len);
12139 }
12140 else {
12141 rb_encoding *enc = STR_ENC_GET(str);
12142 VALUE orig_str = str;
12143
12144 len = RSTRING_LEN(orig_str);
12145 str = rb_enc_str_new(0, len + 1, enc);
12146
12147 // Get data pointer after allocation
12148 ptr = RSTRING_PTR(orig_str);
12149 dest = RSTRING_PTR(str);
12150 memcpy(dest + 1, ptr, len);
12151
12152 RB_GC_GUARD(orig_str);
12153 }
12154 dest[0] = ':';
12155
12157
12158 return str;
12159}
12160
12161VALUE
12163{
12164 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12165 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12166 return str;
12167}
12168
12169VALUE
12170rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12171{
12172 VALUE obj;
12173
12174 if (argc < 1) {
12175 rb_raise(rb_eArgError, "no receiver given");
12176 }
12177 obj = argv[0];
12178 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12179}
12180
12181/*
12182 * call-seq:
12183 * succ
12184 *
12185 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12186 *
12187 * :foo.succ # => :fop
12188 *
12189 * Related: String#succ.
12190 */
12191
12192static VALUE
12193sym_succ(VALUE sym)
12194{
12195 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12196}
12197
12198/*
12199 * call-seq:
12200 * symbol <=> object -> -1, 0, +1, or nil
12201 *
12202 * If +object+ is a symbol,
12203 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12204 *
12205 * :bar <=> :foo # => -1
12206 * :foo <=> :foo # => 0
12207 * :foo <=> :bar # => 1
12208 *
12209 * Otherwise, returns +nil+:
12210 *
12211 * :foo <=> 'bar' # => nil
12212 *
12213 * Related: String#<=>.
12214 */
12215
12216static VALUE
12217sym_cmp(VALUE sym, VALUE other)
12218{
12219 if (!SYMBOL_P(other)) {
12220 return Qnil;
12221 }
12222 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12223}
12224
12225/*
12226 * call-seq:
12227 * casecmp(object) -> -1, 0, 1, or nil
12228 *
12229 * :include: doc/symbol/casecmp.rdoc
12230 *
12231 */
12232
12233static VALUE
12234sym_casecmp(VALUE sym, VALUE other)
12235{
12236 if (!SYMBOL_P(other)) {
12237 return Qnil;
12238 }
12239 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12240}
12241
12242/*
12243 * call-seq:
12244 * casecmp?(object) -> true, false, or nil
12245 *
12246 * :include: doc/symbol/casecmp_p.rdoc
12247 *
12248 */
12249
12250static VALUE
12251sym_casecmp_p(VALUE sym, VALUE other)
12252{
12253 if (!SYMBOL_P(other)) {
12254 return Qnil;
12255 }
12256 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12257}
12258
12259/*
12260 * call-seq:
12261 * symbol =~ object -> integer or nil
12262 *
12263 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12264 * including possible updates to global variables;
12265 * see String#=~.
12266 *
12267 */
12268
12269static VALUE
12270sym_match(VALUE sym, VALUE other)
12271{
12272 return rb_str_match(rb_sym2str(sym), other);
12273}
12274
12275/*
12276 * call-seq:
12277 * match(pattern, offset = 0) -> matchdata or nil
12278 * match(pattern, offset = 0) {|matchdata| } -> object
12279 *
12280 * Equivalent to <tt>self.to_s.match</tt>,
12281 * including possible updates to global variables;
12282 * see String#match.
12283 *
12284 */
12285
12286static VALUE
12287sym_match_m(int argc, VALUE *argv, VALUE sym)
12288{
12289 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12290}
12291
12292/*
12293 * call-seq:
12294 * match?(pattern, offset) -> true or false
12295 *
12296 * Equivalent to <tt>sym.to_s.match?</tt>;
12297 * see String#match.
12298 *
12299 */
12300
12301static VALUE
12302sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12303{
12304 return rb_str_match_m_p(argc, argv, sym);
12305}
12306
12307/*
12308 * call-seq:
12309 * symbol[index] -> string or nil
12310 * symbol[start, length] -> string or nil
12311 * symbol[range] -> string or nil
12312 * symbol[regexp, capture = 0] -> string or nil
12313 * symbol[substring] -> string or nil
12314 *
12315 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12316 *
12317 */
12318
12319static VALUE
12320sym_aref(int argc, VALUE *argv, VALUE sym)
12321{
12322 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12323}
12324
12325/*
12326 * call-seq:
12327 * length -> integer
12328 *
12329 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12330 */
12331
12332static VALUE
12333sym_length(VALUE sym)
12334{
12335 return rb_str_length(rb_sym2str(sym));
12336}
12337
12338/*
12339 * call-seq:
12340 * empty? -> true or false
12341 *
12342 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12343 *
12344 */
12345
12346static VALUE
12347sym_empty(VALUE sym)
12348{
12349 return rb_str_empty(rb_sym2str(sym));
12350}
12351
12352/*
12353 * call-seq:
12354 * upcase(*options) -> symbol
12355 *
12356 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12357 *
12358 * See String#upcase.
12359 *
12360 */
12361
12362static VALUE
12363sym_upcase(int argc, VALUE *argv, VALUE sym)
12364{
12365 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12366}
12367
12368/*
12369 * call-seq:
12370 * downcase(*options) -> symbol
12371 *
12372 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12373 *
12374 * See String#downcase.
12375 *
12376 * Related: Symbol#upcase.
12377 *
12378 */
12379
12380static VALUE
12381sym_downcase(int argc, VALUE *argv, VALUE sym)
12382{
12383 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12384}
12385
12386/*
12387 * call-seq:
12388 * capitalize(*options) -> symbol
12389 *
12390 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12391 *
12392 * See String#capitalize.
12393 *
12394 */
12395
12396static VALUE
12397sym_capitalize(int argc, VALUE *argv, VALUE sym)
12398{
12399 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12400}
12401
12402/*
12403 * call-seq:
12404 * swapcase(*options) -> symbol
12405 *
12406 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12407 *
12408 * See String#swapcase.
12409 *
12410 */
12411
12412static VALUE
12413sym_swapcase(int argc, VALUE *argv, VALUE sym)
12414{
12415 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12416}
12417
12418/*
12419 * call-seq:
12420 * start_with?(*string_or_regexp) -> true or false
12421 *
12422 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12423 *
12424 */
12425
12426static VALUE
12427sym_start_with(int argc, VALUE *argv, VALUE sym)
12428{
12429 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12430}
12431
12432/*
12433 * call-seq:
12434 * end_with?(*strings) -> true or false
12435 *
12436 *
12437 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12438 *
12439 */
12440
12441static VALUE
12442sym_end_with(int argc, VALUE *argv, VALUE sym)
12443{
12444 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12445}
12446
12447/*
12448 * call-seq:
12449 * encoding -> encoding
12450 *
12451 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12452 *
12453 */
12454
12455static VALUE
12456sym_encoding(VALUE sym)
12457{
12458 return rb_obj_encoding(rb_sym2str(sym));
12459}
12460
12461static VALUE
12462string_for_symbol(VALUE name)
12463{
12464 if (!RB_TYPE_P(name, T_STRING)) {
12465 VALUE tmp = rb_check_string_type(name);
12466 if (NIL_P(tmp)) {
12467 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12468 name);
12469 }
12470 name = tmp;
12471 }
12472 return name;
12473}
12474
12475ID
12477{
12478 if (SYMBOL_P(name)) {
12479 return SYM2ID(name);
12480 }
12481 name = string_for_symbol(name);
12482 return rb_intern_str(name);
12483}
12484
12485VALUE
12487{
12488 if (SYMBOL_P(name)) {
12489 return name;
12490 }
12491 name = string_for_symbol(name);
12492 return rb_str_intern(name);
12493}
12494
12495/*
12496 * call-seq:
12497 * Symbol.all_symbols -> array_of_symbols
12498 *
12499 * Returns an array of all symbols currently in Ruby's symbol table:
12500 *
12501 * Symbol.all_symbols.size # => 9334
12502 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12503 *
12504 */
12505
12506static VALUE
12507sym_all_symbols(VALUE _)
12508{
12509 return rb_sym_all_symbols();
12510}
12511
12512VALUE
12513rb_str_to_interned_str(VALUE str)
12514{
12515 return rb_fstring(str);
12516}
12517
12518VALUE
12519rb_interned_str(const char *ptr, long len)
12520{
12521 struct RString fake_str = {RBASIC_INIT};
12522 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12523}
12524
12525VALUE
12526rb_interned_str_cstr(const char *ptr)
12527{
12528 return rb_interned_str(ptr, strlen(ptr));
12529}
12530
12531VALUE
12532rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12533{
12534 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12535 rb_enc_autoload(enc);
12536 }
12537
12538 struct RString fake_str = {RBASIC_INIT};
12539 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12540}
12541
12542VALUE
12543rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12544{
12545 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12546 rb_enc_autoload(enc);
12547 }
12548
12549 struct RString fake_str = {RBASIC_INIT};
12550 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12551}
12552
12553VALUE
12554rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
12555{
12556 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12557}
12558
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * When the inlined encoding index of +str+ is ASCII-8BIT and the codepoint
 * is at least 0 and below 0xff, the byte is appended directly with
 * rb_str_buf_cat_byte().  Every other case goes through rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        /* The codepoint is only converted once the encoding check passed. */
        const ssize_t byte = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(0 <= byte && byte < 0xff)) {
            rb_str_buf_cat_byte(str, (char) byte);
            return;
        }
    }

    /* General path for everything the fast path did not handle. */
    rb_str_concat(str, codepoint);
}
#endif
12575
12576void
12577Init_String(void)
12578{
12579 rb_cString = rb_define_class("String", rb_cObject);
12580 RUBY_ASSERT(rb_vm_fstring_table());
12581 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12583 rb_define_alloc_func(rb_cString, empty_str_alloc);
12584 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12585 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12586 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12587 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12588 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12591 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12592 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12593 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12594 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12597 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12598 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12599 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12600 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12603 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12604 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12605 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12606 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12607 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12609 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12611 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12612 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12613 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12614 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12615 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12616 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12618 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12619 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12620 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12621 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12622 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12623 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12624 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12625 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12627 rb_define_method(rb_cString, "+@", str_uplus, 0);
12628 rb_define_method(rb_cString, "-@", str_uminus, 0);
12629 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12630 rb_define_alias(rb_cString, "dedup", "-@");
12631
12632 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12633 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12634 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12635 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12638 rb_define_method(rb_cString, "undump", str_undump, 0);
12639
12640 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12641 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12642 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12643 sym_fold = ID2SYM(rb_intern_const("fold"));
12644
12645 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12646 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12647 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12648 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12649
12650 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12651 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12652 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12653 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12654
12655 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12656 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12657 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12658 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12659 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12660 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12661 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12662 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12663 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12664 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12665 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12666 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12668 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12669 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12670 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12671 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12672 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12673
12674 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12675 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12676 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12677
12678 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12679
12680 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12681 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12682 rb_define_method(rb_cString, "center", rb_str_center, -1);
12683
12684 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12685 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12686 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12687 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12688 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12689 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12690 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12691 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12692 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12693
12694 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12695 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12696 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12697 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12698 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12699 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12700 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12701 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12702 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12703
12704 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12705 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12706 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12707 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12708 rb_define_method(rb_cString, "count", rb_str_count, -1);
12709
12710 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12711 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12712 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12713 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12714
12715 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12716 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12717 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12718 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12719 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12720
12721 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12722
12723 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12724 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12725
12726 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12727 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12728
12729 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12730 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12731 rb_define_method(rb_cString, "b", rb_str_b, 0);
12732 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12733 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12734
12735 /* define UnicodeNormalize module here so that we don't have to look it up */
12736 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12737 id_normalize = rb_intern_const("normalize");
12738 id_normalized_p = rb_intern_const("normalized?");
12739
12740 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12741 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12742 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12743
12744 rb_fs = Qnil;
12745 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12746 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12747 rb_gc_register_address(&rb_fs);
12748
12749 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12753 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12754
12755 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12756 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12757 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12758 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12759 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12760 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12761
12762 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12763 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12764 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12765 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12766
12767 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12768 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12769 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12770 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12771 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12772 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12773 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12774
12775 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12776 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12777 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12778 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12779
12780 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12781 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12782
12783 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12784}
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2350
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2171
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:2640
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:937
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2429
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:126
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3877
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2097
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords, if any, to the #initialize method.
Definition object.c:2115
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1272
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3508
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:576
VALUE rb_cSymbol
Symbol class.
Definition string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1260
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3192
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition encoding.c:1475
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT, a.k.a. binary.
Definition encoding.c:1463
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition encoding.c:1537
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition encoding.c:1676
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition encoding.c:1481
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding ...
Definition encoding.c:1469
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition encoding.c:1589
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition encoding.c:1523
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition encoding.c:1487
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding it...
Definition encoding.c:1493
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1292
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:907
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1157
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2935
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1176
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12532
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2256
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3620
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1105
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1397
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1298
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:926
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12554
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:791
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:415
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1099
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition array.c:741
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition hash.c:1477
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:674
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1835
VALUE rb_sym_all_symbols(void)
Collects every single bit of the symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1043
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1841
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1892
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4198
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3695
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1905
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1684
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1462
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2407
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3685
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1373
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12162
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2479
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1349
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1678
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2963
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5277
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4054
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3060
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11461
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1770
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1720
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1139
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:961
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1468
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1923
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4040
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3453
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2345
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1941
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6485
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3068
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12526
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1379
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3651
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3010
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4156
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3277
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7206
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2701
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12519
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4110
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3927
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4085
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3627
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3185
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5787
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11519
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1634
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2859
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3157
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3260
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1151
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2657
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7320
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1361
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1650
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2359
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5705
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9416
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1145
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:879
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1939
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:1956
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2973
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1297
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:971
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12486
ID rb_to_id(VALUE str)
Definition string.c:12476
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1844
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3479
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4442
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:153
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1391
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2836
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2720
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:367
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1385
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2731
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1711
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition rstring.h:381
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
struct rb_data_type_struct rb_data_type_t
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:197
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1425
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:75
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024::@365170260060164113275356137374160141226332013204 aux
Auxiliary info.
struct RString::@157025041137035241047331270155043025061071337053::@153056146250355212360325351117351053336274231135 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
union RString::@157025041137035241047331270155043025061071337053 as
String's specific fields.
struct RString::@157025041137035241047331270155043025061071337053::@157067065136062356112324002106172053054013023024 heap
Strings that use separated memory region for contents use this pattern.
Definition string.c:8278
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:300
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113